Commit: Add Java and Kotlin API for sense voice (#1164)
Committed by GitHub

Showing 16 changed files, with 601 additions and 2 deletions.
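At a glance, the new Java API added in this commit is used as follows. This is a minimal sketch condensed from the NonStreamingDecodeFileSenseVoice.java example added below; the class name SenseVoiceQuickStart is only illustrative, and the model paths are the ones the example scripts download.

```java
// Hypothetical quick-start class, condensed from the example files in this commit.
import com.k2fsa.sherpa.onnx.*;

public class SenseVoiceQuickStart {
  public static void main(String[] args) {
    String dir = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17";

    // Configure the SenseVoice model and wrap it in an offline model config.
    OfflineSenseVoiceModelConfig senseVoice =
        OfflineSenseVoiceModelConfig.builder().setModel(dir + "/model.int8.onnx").build();

    OfflineModelConfig modelConfig =
        OfflineModelConfig.builder()
            .setSenseVoice(senseVoice)
            .setTokens(dir + "/tokens.txt")
            .build();

    OfflineRecognizerConfig config =
        OfflineRecognizerConfig.builder()
            .setOfflineModelConfig(modelConfig)
            .setDecodingMethod("greedy_search")
            .build();

    // Decode one wave file with the non-streaming recognizer.
    OfflineRecognizer recognizer = new OfflineRecognizer(config);
    OfflineStream stream = recognizer.createStream();

    WaveReader reader = new WaveReader(dir + "/test_wavs/zh.wav");
    stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
    recognizer.decode(stream);
    System.out.println(recognizer.getResult(stream).getText());

    stream.release();
    recognizer.release();
  }
}
```

The full example files, the scripts that download the models, and the Kotlin/JNI plumbing follow.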
GitHub Actions workflow running the Java API example tests (two hunks):

```diff
@@ -114,6 +114,16 @@ jobs:
           ./run-kws-from-file.sh
           rm -rf sherpa-onnx-*

+      - name: Run java test (VAD + Non-streaming SenseVoice)
+        shell: bash
+        run: |
+          cd ./java-api-examples
+          ./run-vad-non-streaming-sense-voice.sh
+          rm *.onnx
+          ls -lh *.wav
+          rm *.wav
+          rm -rf sherpa-onnx-*
+
       - name: Run java test (VAD + Non-streaming Paraformer)
         shell: bash
         run: |
@@ -193,6 +203,10 @@ jobs:
         shell: bash
         run: |
           cd ./java-api-examples
+
+          ./run-non-streaming-decode-file-sense-voice.sh
+          rm -rf sherpa-onnx-sense-voice-*
+
           ./run-inverse-text-normalization-paraformer.sh

           ./run-non-streaming-decode-file-paraformer.sh
```
New file: java-api-examples/NonStreamingDecodeFileSenseVoice.java

```java
// Copyright 2024 Xiaomi Corporation

// This file shows how to use an offline SenseVoice model,
// i.e., non-streaming SenseVoice model,
// to decode files.
import com.k2fsa.sherpa.onnx.*;

public class NonStreamingDecodeFileSenseVoice {
  public static void main(String[] args) {
    // please refer to
    // https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
    // to download model files
    String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
    String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";

    String waveFilename = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav";

    WaveReader reader = new WaveReader(waveFilename);

    OfflineSenseVoiceModelConfig senseVoice =
        OfflineSenseVoiceModelConfig.builder().setModel(model).build();

    OfflineModelConfig modelConfig =
        OfflineModelConfig.builder()
            .setSenseVoice(senseVoice)
            .setTokens(tokens)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    OfflineRecognizerConfig config =
        OfflineRecognizerConfig.builder()
            .setOfflineModelConfig(modelConfig)
            .setDecodingMethod("greedy_search")
            .build();

    OfflineRecognizer recognizer = new OfflineRecognizer(config);
    OfflineStream stream = recognizer.createStream();
    stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());

    recognizer.decode(stream);

    String text = recognizer.getResult(stream).getText();

    System.out.printf("filename:%s\nresult:%s\n", waveFilename, text);

    stream.release();
    recognizer.release();
  }
}
```
README for java-api-examples (three hunks):

````diff
@@ -18,6 +18,7 @@ This directory contains examples for the JAVA API of sherpa-onnx.

 ```bash
 ./run-non-streaming-decode-file-paraformer.sh
+./run-non-streaming-decode-file-sense-voice.sh
 ./run-non-streaming-decode-file-transducer.sh
 ./run-non-streaming-decode-file-whisper.sh
 ./run-non-streaming-decode-file-nemo.sh
@@ -64,6 +65,12 @@ The punctuation model supports both English and Chinese.
 ./run-vad-from-mic.sh
 ```

+## VAD with a microphone + Non-streaming SenseVoice for speech recognition
+
+```bash
+./run-vad-from-mic-non-streaming-sense-voice.sh
+```
+
 ## VAD with a microphone + Non-streaming Paraformer for speech recognition

 ```bash
@@ -82,6 +89,12 @@ The punctuation model supports both English and Chinese.
 ./run-vad-remove-slience.sh
 ```

+## VAD + Non-streaming SenseVoice for speech recognition
+
+```bash
+./run-vad-non-streaming-sense-voice.sh
+```
+
 ## VAD + Non-streaming Paraformer for speech recognition

 ```bash
````
New file: java-api-examples/VadFromMicWithNonStreamingSenseVoice.java

```java
// Copyright 2024 Xiaomi Corporation

// This file shows how to use a silero_vad model with a non-streaming
// SenseVoice model for speech recognition.

import com.k2fsa.sherpa.onnx.*;
import javax.sound.sampled.*;

public class VadFromMicWithNonStreamingSenseVoice {
  private static final int sampleRate = 16000;
  private static final int windowSize = 512;

  public static Vad createVad() {
    // please download ./silero_vad.onnx from
    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
    String model = "./silero_vad.onnx";
    SileroVadModelConfig sileroVad =
        SileroVadModelConfig.builder()
            .setModel(model)
            .setThreshold(0.5f)
            .setMinSilenceDuration(0.25f)
            .setMinSpeechDuration(0.5f)
            .setWindowSize(windowSize)
            .build();

    VadModelConfig config =
        VadModelConfig.builder()
            .setSileroVadModelConfig(sileroVad)
            .setSampleRate(sampleRate)
            .setNumThreads(1)
            .setDebug(true)
            .setProvider("cpu")
            .build();

    return new Vad(config);
  }

  public static OfflineRecognizer createOfflineRecognizer() {
    // please refer to
    // https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
    // to download model files
    String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
    String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";

    OfflineSenseVoiceModelConfig senseVoice =
        OfflineSenseVoiceModelConfig.builder().setModel(model).build();

    OfflineModelConfig modelConfig =
        OfflineModelConfig.builder()
            .setSenseVoice(senseVoice)
            .setTokens(tokens)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    OfflineRecognizerConfig config =
        OfflineRecognizerConfig.builder()
            .setOfflineModelConfig(modelConfig)
            .setDecodingMethod("greedy_search")
            .build();

    return new OfflineRecognizer(config);
  }

  public static void main(String[] args) {
    Vad vad = createVad();
    OfflineRecognizer recognizer = createOfflineRecognizer();

    // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
    // Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
    AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);

    // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
    DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
    TargetDataLine targetDataLine;
    try {
      targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
      targetDataLine.open(format);
      targetDataLine.start();
    } catch (LineUnavailableException e) {
      System.out.println("Failed to open target data line: " + e.getMessage());
      vad.release();
      recognizer.release();
      return;
    }

    boolean printed = false;
    byte[] buffer = new byte[windowSize * 2];
    float[] samples = new float[windowSize];

    System.out.println("Started. Please speak");
    boolean running = true;
    while (targetDataLine.isOpen() && running) {
      int n = targetDataLine.read(buffer, 0, buffer.length);
      if (n <= 0) {
        System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
        continue;
      }
      for (int i = 0; i != windowSize; ++i) {
        short low = buffer[2 * i];
        short high = buffer[2 * i + 1];
        int s = (high << 8) + low;
        samples[i] = (float) s / 32768;
      }

      vad.acceptWaveform(samples);
      if (vad.isSpeechDetected() && !printed) {
        System.out.println("Detected speech");
        printed = true;
      }

      if (!vad.isSpeechDetected()) {
        printed = false;
      }

      while (!vad.empty()) {
        SpeechSegment segment = vad.front();
        float startTime = segment.getStart() / (float) sampleRate;
        float duration = segment.getSamples().length / (float) sampleRate;

        OfflineStream stream = recognizer.createStream();
        stream.acceptWaveform(segment.getSamples(), sampleRate);
        recognizer.decode(stream);
        String text = recognizer.getResult(stream).getText();
        stream.release();

        if (!text.isEmpty()) {
          System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
        }

        if (text.contains("退出程序")) {
          running = false;
        }

        vad.pop();
      }
    }

    vad.release();
    recognizer.release();
  }
}
```
New file: java-api-examples/VadNonStreamingSenseVoice.java

```java
// Copyright 2024 Xiaomi Corporation

// This file shows how to use a silero_vad model with a non-streaming SenseVoiceModel
// for speech recognition.

import com.k2fsa.sherpa.onnx.*;
import java.util.Arrays;

public class VadNonStreamingSenseVoice {
  public static Vad createVad() {
    // please download ./silero_vad.onnx from
    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
    String model = "./silero_vad.onnx";
    SileroVadModelConfig sileroVad =
        SileroVadModelConfig.builder()
            .setModel(model)
            .setThreshold(0.5f)
            .setMinSilenceDuration(0.25f)
            .setMinSpeechDuration(0.5f)
            .setWindowSize(512)
            .build();

    VadModelConfig config =
        VadModelConfig.builder()
            .setSileroVadModelConfig(sileroVad)
            .setSampleRate(16000)
            .setNumThreads(1)
            .setDebug(true)
            .setProvider("cpu")
            .build();

    return new Vad(config);
  }

  public static OfflineRecognizer createOfflineRecognizer() {
    // please refer to
    // https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
    // to download model files
    String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
    String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";

    OfflineSenseVoiceModelConfig senseVoice =
        OfflineSenseVoiceModelConfig.builder().setModel(model).build();

    OfflineModelConfig modelConfig =
        OfflineModelConfig.builder()
            .setSenseVoice(senseVoice)
            .setTokens(tokens)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    OfflineRecognizerConfig config =
        OfflineRecognizerConfig.builder()
            .setOfflineModelConfig(modelConfig)
            .setDecodingMethod("greedy_search")
            .build();

    return new OfflineRecognizer(config);
  }

  public static void main(String[] args) {
    Vad vad = createVad();
    OfflineRecognizer recognizer = createOfflineRecognizer();

    // You can download the test file from
    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
    String testWaveFilename = "./lei-jun-test.wav";
    WaveReader reader = new WaveReader(testWaveFilename);

    int numSamples = reader.getSamples().length;
    int numIter = numSamples / 512;

    for (int i = 0; i != numIter; ++i) {
      int start = i * 512;
      int end = start + 512;
      float[] samples = Arrays.copyOfRange(reader.getSamples(), start, end);
      vad.acceptWaveform(samples);
      if (vad.isSpeechDetected()) {
        while (!vad.empty()) {
          SpeechSegment segment = vad.front();
          float startTime = segment.getStart() / 16000.0f;
          float duration = segment.getSamples().length / 16000.0f;

          OfflineStream stream = recognizer.createStream();
          stream.acceptWaveform(segment.getSamples(), 16000);
          recognizer.decode(stream);
          String text = recognizer.getResult(stream).getText();
          stream.release();

          if (!text.isEmpty()) {
            System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
          }

          vad.pop();
        }
      }
    }

    vad.flush();
    while (!vad.empty()) {
      SpeechSegment segment = vad.front();
      float startTime = segment.getStart() / 16000.0f;
      float duration = segment.getSamples().length / 16000.0f;

      OfflineStream stream = recognizer.createStream();
      stream.acceptWaveform(segment.getSamples(), 16000);
      recognizer.decode(stream);
      String text = recognizer.getResult(stream).getText();
      stream.release();

      if (!text.isEmpty()) {
        System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
      }

      vad.pop();
    }

    vad.release();
    recognizer.release();
  }
}
```
New file: java-api-examples/run-non-streaming-decode-file-sense-voice.sh

```bash
#!/usr/bin/env bash

set -ex

if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

java \
  -Djava.library.path=$PWD/../build/lib \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingDecodeFileSenseVoice.java
```
New file: java-api-examples/run-vad-from-mic-non-streaming-sense-voice.sh

```bash
#!/usr/bin/env bash

set -ex

if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

java \
  -Djava.library.path=$PWD/../build/lib \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadFromMicWithNonStreamingSenseVoice.java
```
New file: java-api-examples/run-vad-non-streaming-sense-voice.sh

```bash
#!/usr/bin/env bash

set -ex

if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

java \
  -Djava.library.path=$PWD/../build/lib \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadNonStreamingSenseVoice.java
```
Shell test script (adds the SenseVoice model download to testOfflineAsr):

```diff
@@ -167,6 +167,12 @@ function testSpokenLanguageIdentification() {
 }

 function testOfflineAsr() {
+  if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+    tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+    rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  fi
+
   if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
     curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
     tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
```
Kotlin test program (adds SenseVoice as model type 15):

```diff
@@ -1,7 +1,7 @@
 package com.k2fsa.sherpa.onnx

 fun main() {
-    val types = arrayOf(0, 2, 5, 6)
+    val types = arrayOf(0, 2, 5, 6, 15)
     for (type in types) {
         test(type)
     }
@@ -15,6 +15,7 @@ fun test(type: Int) {
         2 -> "./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav"
         5 -> "./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav"
         6 -> "./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/8k.wav"
+        15 -> "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav"
         else -> null
     }

```
Python helper listing ASR models (get_models; adds the SenseVoice model as index 15):

```diff
@@ -90,6 +90,23 @@ def get_models():
             """,
         ),
         Model(
+            model_name="sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+            idx=15,
+            lang="zh_en_ko_ja_yue",
+            short_name="sense_voice",
+            cmd="""
+            pushd $model_name
+
+            rm -rfv test_wavs
+            rm -fv model.onnx
+            rm -fv *.py
+
+            ls -lh
+
+            popd
+            """,
+        ),
+        Model(
             model_name="sherpa-onnx-paraformer-zh-small-2024-03-09",
             idx=14,
             lang="zh",
```
Java API Makefile (compiles the new config class into sherpa-onnx.jar):

```diff
@@ -27,6 +27,7 @@ java_files += OfflineTransducerModelConfig.java
 java_files += OfflineParaformerModelConfig.java
 java_files += OfflineWhisperModelConfig.java
 java_files += OfflineNemoEncDecCtcModelConfig.java
+java_files += OfflineSenseVoiceModelConfig.java
 java_files += OfflineModelConfig.java
 java_files += OfflineRecognizerConfig.java
 java_files += OfflineRecognizerResult.java
```
OfflineModelConfig.java (adds the senseVoice field, getter, and builder setter):

```diff
@@ -7,6 +7,7 @@ public class OfflineModelConfig {
   private final OfflineParaformerModelConfig paraformer;
   private final OfflineWhisperModelConfig whisper;
   private final OfflineNemoEncDecCtcModelConfig nemo;
+  private final OfflineSenseVoiceModelConfig senseVoice;
   private final String teleSpeech;
   private final String tokens;
   private final int numThreads;
@@ -22,6 +23,7 @@ public class OfflineModelConfig {
     this.paraformer = builder.paraformer;
     this.whisper = builder.whisper;
     this.nemo = builder.nemo;
+    this.senseVoice = builder.senseVoice;
     this.teleSpeech = builder.teleSpeech;
     this.tokens = builder.tokens;
     this.numThreads = builder.numThreads;
@@ -48,6 +50,10 @@ public class OfflineModelConfig {
     return whisper;
   }

+  public OfflineSenseVoiceModelConfig getSenseVoice() {
+    return senseVoice;
+  }
+
   public String getTokens() {
     return tokens;
   }
@@ -85,6 +91,7 @@ public class OfflineModelConfig {
     private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build();
     private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build();
     private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
+    private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build();
     private String teleSpeech = "";
     private String tokens = "";
     private int numThreads = 1;
@@ -113,7 +120,6 @@ public class OfflineModelConfig {
       return this;
     }

-
     public Builder setTeleSpeech(String teleSpeech) {
       this.teleSpeech = teleSpeech;
       return this;
@@ -124,6 +130,11 @@ public class OfflineModelConfig {
       return this;
     }

+    public Builder setSenseVoice(OfflineSenseVoiceModelConfig senseVoice) {
+      this.senseVoice = senseVoice;
+      return this;
+    }
+
     public Builder setTokens(String tokens) {
       this.tokens = tokens;
       return this;
```
New file: OfflineSenseVoiceModelConfig.java (package com.k2fsa.sherpa.onnx)

```java
// Copyright 2024 Xiaomi Corporation

package com.k2fsa.sherpa.onnx;

public class OfflineSenseVoiceModelConfig {
  private final String model;
  private final String language;
  private final boolean useInverseTextNormalization;

  private OfflineSenseVoiceModelConfig(Builder builder) {
    this.model = builder.model;
    this.language = builder.language;
    this.useInverseTextNormalization = builder.useInverseTextNormalization;
  }

  public static Builder builder() {
    return new Builder();
  }

  public String getModel() {
    return model;
  }

  public String getLanguage() {
    return language;
  }

  public boolean getUseInverseTextNormalization() {
    return useInverseTextNormalization;
  }

  public static class Builder {
    private String model = "";
    private String language = "";
    private boolean useInverseTextNormalization = true;

    public OfflineSenseVoiceModelConfig build() {
      return new OfflineSenseVoiceModelConfig(this);
    }

    public Builder setModel(String model) {
      this.model = model;
      return this;
    }

    public Builder setLanguage(String language) {
      this.language = language;
      return this;
    }

    public Builder setInverseTextNormalization(boolean useInverseTextNormalization) {
      this.useInverseTextNormalization = useInverseTextNormalization;
      return this;
    }
  }
}
```
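The example files in this commit leave the builder's language hint and inverse-text-normalization flag at their defaults. A hedged sketch of setting them explicitly (the "zh" value is only an illustration, not something used in this commit):

```java
// Illustrative only: exercising the optional setters of OfflineSenseVoiceModelConfig.
// "zh" is a hypothetical language hint; the examples above keep the default "".
OfflineSenseVoiceModelConfig senseVoice =
    OfflineSenseVoiceModelConfig.builder()
        .setModel("./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx")
        .setLanguage("zh") // optional; empty string by default
        .setInverseTextNormalization(true) // matches the Builder default above
        .build();
```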
JNI binding (C++) that reads the SenseVoice fields from the Java config:

```diff
@@ -171,6 +171,31 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
   ans.model_config.whisper.tail_paddings =
       env->GetIntField(whisper_config, fid);

+  // sense voice
+  fid = env->GetFieldID(model_config_cls, "senseVoice",
+                        "Lcom/k2fsa/sherpa/onnx/OfflineSenseVoiceModelConfig;");
+  jobject sense_voice_config = env->GetObjectField(model_config, fid);
+  jclass sense_voice_config_cls = env->GetObjectClass(sense_voice_config);
+
+  fid = env->GetFieldID(sense_voice_config_cls, "model", "Ljava/lang/String;");
+  s = (jstring)env->GetObjectField(sense_voice_config, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.model_config.sense_voice.model = p;
+  env->ReleaseStringUTFChars(s, p);
+
+  fid =
+      env->GetFieldID(sense_voice_config_cls, "language", "Ljava/lang/String;");
+  s = (jstring)env->GetObjectField(sense_voice_config, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.model_config.sense_voice.language = p;
+  env->ReleaseStringUTFChars(s, p);
+
+  fid = env->GetFieldID(sense_voice_config_cls, "useInverseTextNormalization",
+                        "Z");
+  ans.model_config.sense_voice.use_itn =
+      env->GetBooleanField(sense_voice_config, fid);
+
+  // nemo
   fid = env->GetFieldID(
       model_config_cls, "nemo",
       "Lcom/k2fsa/sherpa/onnx/OfflineNemoEncDecCtcModelConfig;");
```
Kotlin API (adds the OfflineSenseVoiceModelConfig data class and model type 15):

```diff
@@ -30,11 +30,18 @@ data class OfflineWhisperModelConfig(
     var tailPaddings: Int = 1000, // Padding added at the end of the samples
 )

+data class OfflineSenseVoiceModelConfig(
+    var model: String = "",
+    var language: String = "",
+    var useInverseTextNormalization: Boolean = true,
+)
+
 data class OfflineModelConfig(
     var transducer: OfflineTransducerModelConfig = OfflineTransducerModelConfig(),
     var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(),
     var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(),
     var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
+    var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(),
     var teleSpeech: String = "",
     var numThreads: Int = 1,
     var debug: Boolean = false,
@@ -321,6 +328,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
                 modelType = "paraformer",
             )
         }
+
+        15 -> {
+            val modelDir = "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17"
+            return OfflineModelConfig(
+                senseVoice = OfflineSenseVoiceModelConfig(
+                    model = "$modelDir/model.int8.onnx",
+                ),
+                tokens = "$modelDir/tokens.txt",
+            )
+        }
     }
     return null
 }
```