Committed by
GitHub
Add VAD + Non-streaming ASR + microphone examples for Java API (#1046)
Showing 7 changed files with 389 additions and 406 deletions
| @@ -63,6 +63,18 @@ The punctuation model supports both English and Chinese. | @@ -63,6 +63,18 @@ The punctuation model supports both English and Chinese. | ||
| 63 | ./run-vad-from-mic.sh | 63 | ./run-vad-from-mic.sh |
| 64 | ``` | 64 | ``` |
| 65 | 65 | ||
| 66 | +## VAD with a microphone + Non-streaming Paraformer for speech recognition | ||
| 67 | + | ||
| 68 | +```bash | ||
| 69 | +./run-vad-from-mic-non-streaming-paraformer.sh | ||
| 70 | +``` | ||
| 71 | + | ||
| 72 | +## VAD with a microphone + Non-streaming Whisper tiny.en for speech recognition | ||
| 73 | + | ||
| 74 | +```bash | ||
| 75 | +./run-vad-from-mic-non-streaming-whisper.sh | ||
| 76 | +``` | ||
| 77 | + | ||
| 66 | ## VAD (Remove silence) | 78 | ## VAD (Remove silence) |
| 67 | 79 | ||
| 68 | ```bash | 80 | ```bash |
| 1 | +// Copyright 2024 Xiaomi Corporation | ||
| 2 | + | ||
| 3 | +// This file shows how to use a silero_vad model with a non-streaming Paraformer | ||
| 4 | +// for speech recognition. | ||
| 5 | + | ||
| 6 | +import com.k2fsa.sherpa.onnx.*; | ||
| 7 | +import javax.sound.sampled.*; | ||
| 8 | + | ||
| 9 | +public class VadFromMicWithNonStreamingParaformer { | ||
| 10 | + private static final int sampleRate = 16000; | ||
| 11 | + private static final int windowSize = 512; | ||
| 12 | + | ||
| 13 | + public static Vad createVad() { | ||
| 14 | + // please download ./silero_vad.onnx from | ||
| 15 | + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 16 | + String model = "./silero_vad.onnx"; | ||
| 17 | + SileroVadModelConfig sileroVad = | ||
| 18 | + SileroVadModelConfig.builder() | ||
| 19 | + .setModel(model) | ||
| 20 | + .setThreshold(0.5f) | ||
| 21 | + .setMinSilenceDuration(0.25f) | ||
| 22 | + .setMinSpeechDuration(0.5f) | ||
| 23 | + .setWindowSize(windowSize) | ||
| 24 | + .build(); | ||
| 25 | + | ||
| 26 | + VadModelConfig config = | ||
| 27 | + VadModelConfig.builder() | ||
| 28 | + .setSileroVadModelConfig(sileroVad) | ||
| 29 | + .setSampleRate(sampleRate) | ||
| 30 | + .setNumThreads(1) | ||
| 31 | + .setDebug(true) | ||
| 32 | + .setProvider("cpu") | ||
| 33 | + .build(); | ||
| 34 | + | ||
| 35 | + return new Vad(config); | ||
| 36 | + } | ||
| 37 | + | ||
| 38 | + public static OfflineRecognizer createOfflineRecognizer() { | ||
| 39 | + // please refer to | ||
| 40 | + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english | ||
| 41 | + // to download model files | ||
| 42 | + String model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx"; | ||
| 43 | + String tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt"; | ||
| 44 | + | ||
| 45 | + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst | ||
| 46 | + String ruleFsts = "./itn_zh_number.fst"; | ||
| 47 | + | ||
| 48 | + OfflineParaformerModelConfig paraformer = | ||
| 49 | + OfflineParaformerModelConfig.builder().setModel(model).build(); | ||
| 50 | + | ||
| 51 | + OfflineModelConfig modelConfig = | ||
| 52 | + OfflineModelConfig.builder() | ||
| 53 | + .setParaformer(paraformer) | ||
| 54 | + .setTokens(tokens) | ||
| 55 | + .setNumThreads(1) | ||
| 56 | + .setDebug(true) | ||
| 57 | + .build(); | ||
| 58 | + | ||
| 59 | + OfflineRecognizerConfig config = | ||
| 60 | + OfflineRecognizerConfig.builder() | ||
| 61 | + .setOfflineModelConfig(modelConfig) | ||
| 62 | + .setDecodingMethod("greedy_search") | ||
| 63 | + .setRuleFsts(ruleFsts) | ||
| 64 | + .build(); | ||
| 65 | + | ||
| 66 | + return new OfflineRecognizer(config); | ||
| 67 | + } | ||
| 68 | + | ||
| 69 | + public static void main(String[] args) { | ||
| 70 | + Vad vad = createVad(); | ||
| 71 | + OfflineRecognizer recognizer = createOfflineRecognizer(); | ||
| 72 | + | ||
| 73 | + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html | ||
| 74 | + // Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian | ||
| 75 | + AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false); | ||
| 76 | + | ||
| 77 | + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int- | ||
| 78 | + DataLine.Info info = new DataLine.Info(TargetDataLine.class, format); | ||
| 79 | + TargetDataLine targetDataLine; | ||
| 80 | + try { | ||
| 81 | + targetDataLine = (TargetDataLine) AudioSystem.getLine(info); | ||
| 82 | + targetDataLine.open(format); | ||
| 83 | + targetDataLine.start(); | ||
| 84 | + } catch (LineUnavailableException e) { | ||
| 85 | + System.out.println("Failed to open target data line: " + e.getMessage()); | ||
| 86 | + vad.release(); | ||
| 87 | + recognizer.release(); | ||
| 88 | + return; | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + boolean printed = false; | ||
| 92 | + byte[] buffer = new byte[windowSize * 2]; | ||
| 93 | + float[] samples = new float[windowSize]; | ||
| 94 | + | ||
| 95 | + System.out.println("Started. Please speak"); | ||
| 96 | + boolean running = true; | ||
| 97 | + while (targetDataLine.isOpen() && running) { | ||
| 98 | + int n = targetDataLine.read(buffer, 0, buffer.length); | ||
| 99 | + if (n <= 0) { | ||
| 100 | + System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length); | ||
| 101 | + continue; | ||
| 102 | + } | ||
| 103 | + for (int i = 0; i != windowSize; ++i) { | ||
| 104 | + short low = buffer[2 * i]; | ||
| 105 | + short high = buffer[2 * i + 1]; | ||
| 106 | + int s = (high << 8) + low; | ||
| 107 | + samples[i] = (float) s / 32768; | ||
| 108 | + } | ||
| 109 | + | ||
| 110 | + vad.acceptWaveform(samples); | ||
| 111 | + if (vad.isSpeechDetected() && !printed) { | ||
| 112 | + System.out.println("Detected speech"); | ||
| 113 | + printed = true; | ||
| 114 | + } | ||
| 115 | + | ||
| 116 | + if (!vad.isSpeechDetected()) { | ||
| 117 | + printed = false; | ||
| 118 | + } | ||
| 119 | + | ||
| 120 | + while (!vad.empty()) { | ||
| 121 | + SpeechSegment segment = vad.front(); | ||
| 122 | + float startTime = segment.getStart() / (float) sampleRate; | ||
| 123 | + float duration = segment.getSamples().length / (float) sampleRate; | ||
| 124 | + | ||
| 125 | + OfflineStream stream = recognizer.createStream(); | ||
| 126 | + stream.acceptWaveform(segment.getSamples(), sampleRate); | ||
| 127 | + recognizer.decode(stream); | ||
| 128 | + String text = recognizer.getResult(stream).getText(); | ||
| 129 | + stream.release(); | ||
| 130 | + | ||
| 131 | + if (!text.isEmpty()) { | ||
| 132 | + System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text); | ||
| 133 | + } | ||
| 134 | + | ||
| 135 | + if (text.contains("退出程序")) { | ||
| 136 | + running = false; | ||
| 137 | + } | ||
| 138 | + | ||
| 139 | + vad.pop(); | ||
| 140 | + } | ||
| 141 | + } | ||
| 142 | + | ||
| 143 | + vad.release(); | ||
| 144 | + recognizer.release(); | ||
| 145 | + } | ||
| 146 | +} |
| 1 | +// Copyright 2024 Xiaomi Corporation | ||
| 2 | + | ||
| 3 | +// This file shows how to use a silero_vad model with a non-streaming Whisper tiny.en | ||
| 4 | +// for speech recognition. | ||
| 5 | + | ||
| 6 | +import com.k2fsa.sherpa.onnx.*; | ||
| 7 | +import javax.sound.sampled.*; | ||
| 8 | + | ||
| 9 | +public class VadFromMicNonStreamingWhisper { | ||
| 10 | + private static final int sampleRate = 16000; | ||
| 11 | + private static final int windowSize = 512; | ||
| 12 | + | ||
| 13 | + public static Vad createVad() { | ||
| 14 | + // please download ./silero_vad.onnx from | ||
| 15 | + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 16 | + String model = "./silero_vad.onnx"; | ||
| 17 | + SileroVadModelConfig sileroVad = | ||
| 18 | + SileroVadModelConfig.builder() | ||
| 19 | + .setModel(model) | ||
| 20 | + .setThreshold(0.5f) | ||
| 21 | + .setMinSilenceDuration(0.25f) | ||
| 22 | + .setMinSpeechDuration(0.5f) | ||
| 23 | + .setWindowSize(windowSize) | ||
| 24 | + .build(); | ||
| 25 | + | ||
| 26 | + VadModelConfig config = | ||
| 27 | + VadModelConfig.builder() | ||
| 28 | + .setSileroVadModelConfig(sileroVad) | ||
| 29 | + .setSampleRate(sampleRate) | ||
| 30 | + .setNumThreads(1) | ||
| 31 | + .setDebug(true) | ||
| 32 | + .setProvider("cpu") | ||
| 33 | + .build(); | ||
| 34 | + | ||
| 35 | + return new Vad(config); | ||
| 36 | + } | ||
| 37 | + | ||
| 38 | + public static OfflineRecognizer createOfflineRecognizer() { | ||
| 39 | + // please refer to | ||
| 40 | + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html | ||
| 41 | + // to download model files | ||
| 42 | + String encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"; | ||
| 43 | + String decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"; | ||
| 44 | + String tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"; | ||
| 45 | + | ||
| 46 | + OfflineWhisperModelConfig whisper = | ||
| 47 | + OfflineWhisperModelConfig.builder().setEncoder(encoder).setDecoder(decoder).build(); | ||
| 48 | + | ||
| 49 | + OfflineModelConfig modelConfig = | ||
| 50 | + OfflineModelConfig.builder() | ||
| 51 | + .setWhisper(whisper) | ||
| 52 | + .setTokens(tokens) | ||
| 53 | + .setNumThreads(1) | ||
| 54 | + .setDebug(true) | ||
| 55 | + .build(); | ||
| 56 | + | ||
| 57 | + OfflineRecognizerConfig config = | ||
| 58 | + OfflineRecognizerConfig.builder() | ||
| 59 | + .setOfflineModelConfig(modelConfig) | ||
| 60 | + .setDecodingMethod("greedy_search") | ||
| 61 | + .build(); | ||
| 62 | + | ||
| 63 | + return new OfflineRecognizer(config); | ||
| 64 | + } | ||
| 65 | + | ||
| 66 | + public static void main(String[] args) { | ||
| 67 | + Vad vad = createVad(); | ||
| 68 | + OfflineRecognizer recognizer = createOfflineRecognizer(); | ||
| 69 | + | ||
| 70 | + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html | ||
| 71 | + // Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian | ||
| 72 | + AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false); | ||
| 73 | + | ||
| 74 | + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int- | ||
| 75 | + DataLine.Info info = new DataLine.Info(TargetDataLine.class, format); | ||
| 76 | + TargetDataLine targetDataLine; | ||
| 77 | + try { | ||
| 78 | + targetDataLine = (TargetDataLine) AudioSystem.getLine(info); | ||
| 79 | + targetDataLine.open(format); | ||
| 80 | + targetDataLine.start(); | ||
| 81 | + } catch (LineUnavailableException e) { | ||
| 82 | + System.out.println("Failed to open target data line: " + e.getMessage()); | ||
| 83 | + vad.release(); | ||
| 84 | + recognizer.release(); | ||
| 85 | + return; | ||
| 86 | + } | ||
| 87 | + | ||
| 88 | + boolean printed = false; | ||
| 89 | + byte[] buffer = new byte[windowSize * 2]; | ||
| 90 | + float[] samples = new float[windowSize]; | ||
| 91 | + | ||
| 92 | + System.out.println("Started. Please speak"); | ||
| 93 | + boolean running = true; | ||
| 94 | + while (targetDataLine.isOpen() && running) { | ||
| 95 | + int n = targetDataLine.read(buffer, 0, buffer.length); | ||
| 96 | + if (n <= 0) { | ||
| 97 | + System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length); | ||
| 98 | + continue; | ||
| 99 | + } | ||
| 100 | + for (int i = 0; i != windowSize; ++i) { | ||
| 101 | + short low = buffer[2 * i]; | ||
| 102 | + short high = buffer[2 * i + 1]; | ||
| 103 | + int s = (high << 8) + low; | ||
| 104 | + samples[i] = (float) s / 32768; | ||
| 105 | + } | ||
| 106 | + | ||
| 107 | + vad.acceptWaveform(samples); | ||
| 108 | + if (vad.isSpeechDetected() && !printed) { | ||
| 109 | + System.out.println("Detected speech"); | ||
| 110 | + printed = true; | ||
| 111 | + } | ||
| 112 | + | ||
| 113 | + if (!vad.isSpeechDetected()) { | ||
| 114 | + printed = false; | ||
| 115 | + } | ||
| 116 | + | ||
| 117 | + while (!vad.empty()) { | ||
| 118 | + SpeechSegment segment = vad.front(); | ||
| 119 | + float startTime = segment.getStart() / (float) sampleRate; | ||
| 120 | + float duration = segment.getSamples().length / (float) sampleRate; | ||
| 121 | + | ||
| 122 | + OfflineStream stream = recognizer.createStream(); | ||
| 123 | + stream.acceptWaveform(segment.getSamples(), sampleRate); | ||
| 124 | + recognizer.decode(stream); | ||
| 125 | + String text = recognizer.getResult(stream).getText(); | ||
| 126 | + stream.release(); | ||
| 127 | + | ||
| 128 | + if (!text.isEmpty()) { | ||
| 129 | + System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text); | ||
| 130 | + } | ||
| 131 | + | ||
| 132 | + if (text.contains("exit the program")) { | ||
| 133 | + running = false; | ||
| 134 | + } | ||
| 135 | + | ||
| 136 | + vad.pop(); | ||
| 137 | + } | ||
| 138 | + } | ||
| 139 | + | ||
| 140 | + vad.release(); | ||
| 141 | + recognizer.release(); | ||
| 142 | + } | ||
| 143 | +} |
#!/usr/bin/env bash
# Builds (if needed) the sherpa-onnx JNI library and Java API jar, downloads the
# required models, and runs the VAD + non-streaming Paraformer microphone example.

set -ex

# Build the JNI shared library (.dylib on macOS, .so on Linux) if it is missing.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Download the silero VAD model if it is missing.
if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

# Download and unpack the Paraformer ASR model if it is missing.
if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2

  tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
  rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
fi

# Download the inverse-text-normalization rules for Chinese numbers if missing.
if [ ! -f ./itn_zh_number.fst ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
fi

# Run the example with the java single-file source launcher (requires JDK 11+).
java \
  -Djava.library.path=$PWD/../build/lib \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadFromMicWithNonStreamingParaformer.java
#!/usr/bin/env bash
# Builds (if needed) the sherpa-onnx JNI library and Java API jar, downloads the
# required models, and runs the VAD + non-streaming Whisper microphone example.

set -ex

# Build the JNI shared library (.dylib on macOS, .so on Linux) if it is missing.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Download the silero VAD model if it is missing.
if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

# Download and unpack the Whisper tiny.en model if it is missing.
if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2

  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

# Run the example with the java single-file source launcher (requires JDK 11+).
# BUG FIX: the script previously launched ./VadFromMicWithNonStreamingWhisper.java,
# but the example's public class is VadFromMicNonStreamingWhisper; the single-file
# launcher requires the file name to match the public class name.
java \
  -Djava.library.path=$PWD/../build/lib \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadFromMicNonStreamingWhisper.java
java-api-examples/src/DecodeFile.java
已删除
100644 → 0
| 1 | -/* | ||
| 2 | - * // Copyright 2022-2023 by zhaoming | ||
| 3 | - */ | ||
| 4 | -/* | ||
| 5 | -Config modelconfig.cfg | ||
| 6 | - sample_rate=16000 | ||
| 7 | - feature_dim=80 | ||
| 8 | - rule1_min_trailing_silence=2.4 | ||
| 9 | - rule2_min_trailing_silence=1.2 | ||
| 10 | - rule3_min_utterance_length=20 | ||
| 11 | - encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx | ||
| 12 | - decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx | ||
| 13 | - joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx | ||
| 14 | - tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt | ||
| 15 | - num_threads=4 | ||
| 16 | - enable_endpoint_detection=false | ||
| 17 | - decoding_method=greedy_search | ||
| 18 | - max_active_paths=4 | ||
| 19 | -*/ | ||
| 20 | - | ||
| 21 | -import com.k2fsa.sherpa.onnx.OnlineRecognizer; | ||
| 22 | -import com.k2fsa.sherpa.onnx.OnlineStream; | ||
| 23 | -import java.io.*; | ||
| 24 | -import java.nio.charset.StandardCharsets; | ||
| 25 | - | ||
| 26 | -public class DecodeFile { | ||
| 27 | - OnlineRecognizer rcgOjb; | ||
| 28 | - OnlineStream streamObj; | ||
| 29 | - String wavfilename; | ||
| 30 | - | ||
| 31 | - public DecodeFile(String fileName) { | ||
| 32 | - wavfilename = fileName; | ||
| 33 | - } | ||
| 34 | - | ||
| 35 | - public void initModelWithPara() { | ||
| 36 | - try { | ||
| 37 | - String modelDir = | ||
| 38 | - "/sherpa-onnx/build_old/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20"; | ||
| 39 | - String encoder = modelDir + "/encoder-epoch-99-avg-1.onnx"; | ||
| 40 | - String decoder = modelDir + "/decoder-epoch-99-avg-1.onnx"; | ||
| 41 | - String joiner = modelDir + "/joiner-epoch-99-avg-1.onnx"; | ||
| 42 | - String tokens = modelDir + "/tokens.txt"; | ||
| 43 | - int numThreads = 4; | ||
| 44 | - int sampleRate = 16000; | ||
| 45 | - int featureDim = 80; | ||
| 46 | - boolean enableEndpointDetection = false; | ||
| 47 | - float rule1MinTrailingSilence = 2.4F; | ||
| 48 | - float rule2MinTrailingSilence = 1.2F; | ||
| 49 | - float rule3MinUtteranceLength = 20F; | ||
| 50 | - String decodingMethod = "greedy_search"; | ||
| 51 | - int maxActivePaths = 4; | ||
| 52 | - String hotwordsFile = ""; | ||
| 53 | - float hotwordsScore = 1.5F; | ||
| 54 | - String lm_model = ""; | ||
| 55 | - float lm_scale = 0.5F; | ||
| 56 | - String modelType = "zipformer"; | ||
| 57 | - rcgOjb = | ||
| 58 | - new OnlineRecognizer( | ||
| 59 | - tokens, | ||
| 60 | - encoder, | ||
| 61 | - decoder, | ||
| 62 | - joiner, | ||
| 63 | - numThreads, | ||
| 64 | - sampleRate, | ||
| 65 | - featureDim, | ||
| 66 | - enableEndpointDetection, | ||
| 67 | - rule1MinTrailingSilence, | ||
| 68 | - rule2MinTrailingSilence, | ||
| 69 | - rule3MinUtteranceLength, | ||
| 70 | - decodingMethod, | ||
| 71 | - lm_model, | ||
| 72 | - lm_scale, | ||
| 73 | - maxActivePaths, | ||
| 74 | - hotwordsFile, | ||
| 75 | - hotwordsScore, | ||
| 76 | - modelType); | ||
| 77 | - streamObj = rcgOjb.createStream(); | ||
| 78 | - } catch (Exception e) { | ||
| 79 | - System.err.println(e); | ||
| 80 | - e.printStackTrace(); | ||
| 81 | - } | ||
| 82 | - } | ||
| 83 | - | ||
| 84 | - public void initModelWithCfg(String cfgFile) { | ||
| 85 | - try { | ||
| 86 | - // you should set setCfgPath() before running this | ||
| 87 | - rcgOjb = new OnlineRecognizer(cfgFile); | ||
| 88 | - streamObj = rcgOjb.createStream(); | ||
| 89 | - } catch (Exception e) { | ||
| 90 | - System.err.println(e); | ||
| 91 | - e.printStackTrace(); | ||
| 92 | - } | ||
| 93 | - } | ||
| 94 | - | ||
| 95 | - public void simpleExample() { | ||
| 96 | - try { | ||
| 97 | - float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file | ||
| 98 | - streamObj.acceptWaveform(buffer); // feed stream with data | ||
| 99 | - streamObj.inputFinished(); // tell engine you done with all data | ||
| 100 | - OnlineStream ssObj[] = new OnlineStream[1]; | ||
| 101 | - while (rcgOjb.isReady(streamObj)) { // engine is ready for unprocessed data | ||
| 102 | - ssObj[0] = streamObj; | ||
| 103 | - rcgOjb.decodeStreams(ssObj); // decode for multiple stream | ||
| 104 | - // rcgOjb.DecodeStream(streamObj); // decode for single stream | ||
| 105 | - } | ||
| 106 | - | ||
| 107 | - String recText = "simple:" + rcgOjb.getResult(streamObj) + "\n"; | ||
| 108 | - byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8); | ||
| 109 | - System.out.println(new String(utf8Data)); | ||
| 110 | - rcgOjb.reSet(streamObj); | ||
| 111 | - rcgOjb.releaseStream(streamObj); // release stream | ||
| 112 | - rcgOjb.release(); // release recognizer | ||
| 113 | - | ||
| 114 | - } catch (Exception e) { | ||
| 115 | - System.err.println(e); | ||
| 116 | - e.printStackTrace(); | ||
| 117 | - } | ||
| 118 | - } | ||
| 119 | - | ||
| 120 | - public void streamExample() { | ||
| 121 | - try { | ||
| 122 | - float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file | ||
| 123 | - float[] chunk = new float[1600]; // //each time read 1600(0.1s) data | ||
| 124 | - int chunkIndex = 0; | ||
| 125 | - for (int i = 0; i < buffer.length; i++) // total wav length loop | ||
| 126 | - { | ||
| 127 | - chunk[chunkIndex] = buffer[i]; | ||
| 128 | - chunkIndex++; | ||
| 129 | - if (chunkIndex >= 1600 || i == (buffer.length - 1)) { | ||
| 130 | - chunkIndex = 0; | ||
| 131 | - streamObj.acceptWaveform(chunk); // feed chunk | ||
| 132 | - if (rcgOjb.isReady(streamObj)) { | ||
| 133 | - rcgOjb.decodeStream(streamObj); | ||
| 134 | - } | ||
| 135 | - String testDate = rcgOjb.getResult(streamObj); | ||
| 136 | - byte[] utf8Data = testDate.getBytes(StandardCharsets.UTF_8); | ||
| 137 | - | ||
| 138 | - if (utf8Data.length > 0) { | ||
| 139 | - System.out.println(Float.valueOf((float) i / 16000) + ":" + new String(utf8Data)); | ||
| 140 | - } | ||
| 141 | - } | ||
| 142 | - } | ||
| 143 | - streamObj.inputFinished(); | ||
| 144 | - while (rcgOjb.isReady(streamObj)) { | ||
| 145 | - rcgOjb.decodeStream(streamObj); | ||
| 146 | - } | ||
| 147 | - | ||
| 148 | - String recText = "stream:" + rcgOjb.getResult(streamObj) + "\n"; | ||
| 149 | - byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8); | ||
| 150 | - System.out.println(new String(utf8Data)); | ||
| 151 | - rcgOjb.reSet(streamObj); | ||
| 152 | - rcgOjb.releaseStream(streamObj); // release stream | ||
| 153 | - rcgOjb.release(); // release recognizer | ||
| 154 | - | ||
| 155 | - } catch (Exception e) { | ||
| 156 | - System.err.println(e); | ||
| 157 | - e.printStackTrace(); | ||
| 158 | - } | ||
| 159 | - } | ||
| 160 | - | ||
| 161 | - public static void main(String[] args) { | ||
| 162 | - try { | ||
| 163 | - String appDir = System.getProperty("user.dir"); | ||
| 164 | - System.out.println("appdir=" + appDir); | ||
| 165 | - String fileName = appDir + "/" + args[0]; | ||
| 166 | - String cfgPath = appDir + "/modeltest.cfg"; | ||
| 167 | - String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so"; | ||
| 168 | - OnlineRecognizer.setSoPath(soPath); | ||
| 169 | - DecodeFile rcgDemo = new DecodeFile(fileName); | ||
| 170 | - | ||
| 171 | - // ***************** */ | ||
| 172 | - rcgDemo.initModelWithCfg(cfgPath); | ||
| 173 | - rcgDemo.streamExample(); | ||
| 174 | - // **************** */ | ||
| 175 | - rcgDemo.initModelWithCfg(cfgPath); | ||
| 176 | - rcgDemo.simpleExample(); | ||
| 177 | - | ||
| 178 | - } catch (Exception e) { | ||
| 179 | - System.err.println(e); | ||
| 180 | - e.printStackTrace(); | ||
| 181 | - } | ||
| 182 | - } | ||
| 183 | -} |
java-api-examples/src/DecodeMic.java
已删除
100755 → 0
| 1 | -/* | ||
| 2 | - * // Copyright 2022-2023 by zhaoming | ||
| 3 | - */ | ||
| 4 | -/* | ||
| 5 | -Real-time speech recognition from a microphone with com.k2fsa.sherpa.onnx Java API | ||
| 6 | - | ||
| 7 | -example for cfgFile modelconfig.cfg | ||
| 8 | - sample_rate=16000 | ||
| 9 | - feature_dim=80 | ||
| 10 | - rule1_min_trailing_silence=2.4 | ||
| 11 | - rule2_min_trailing_silence=1.2 | ||
| 12 | - rule3_min_utterance_length=20 | ||
| 13 | - encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx | ||
| 14 | - decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx | ||
| 15 | - joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx | ||
| 16 | - tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt | ||
| 17 | - num_threads=4 | ||
| 18 | - enable_endpoint_detection=true | ||
| 19 | - decoding_method=greedy_search | ||
| 20 | - max_active_paths=4 | ||
| 21 | - | ||
| 22 | -*/ | ||
| 23 | -import com.k2fsa.sherpa.onnx.OnlineRecognizer; | ||
| 24 | -import com.k2fsa.sherpa.onnx.OnlineStream; | ||
| 25 | -import java.io.*; | ||
| 26 | -import java.nio.ByteBuffer; | ||
| 27 | -import java.nio.ByteOrder; | ||
| 28 | -import java.nio.ShortBuffer; | ||
| 29 | -import java.nio.charset.StandardCharsets; | ||
| 30 | -import javax.sound.sampled.AudioFormat; | ||
| 31 | -import javax.sound.sampled.AudioSystem; | ||
| 32 | -import javax.sound.sampled.DataLine; | ||
| 33 | -import javax.sound.sampled.TargetDataLine; | ||
| 34 | - | ||
| 35 | -/** Microphone Example */ | ||
| 36 | -public class DecodeMic { | ||
| 37 | - MicRcgThread micRcgThread = null; // thread handle | ||
| 38 | - | ||
| 39 | - OnlineRecognizer rcgOjb; // the recognizer | ||
| 40 | - | ||
| 41 | - OnlineStream streamObj; // the stream | ||
| 42 | - | ||
| 43 | - public DecodeMic() { | ||
| 44 | - | ||
| 45 | - micRcgThread = new MicRcgThread(); // create a new instance for MicRcgThread | ||
| 46 | - } | ||
| 47 | - | ||
| 48 | - public void open() { | ||
| 49 | - micRcgThread.start(); // start to capture microphone data | ||
| 50 | - } | ||
| 51 | - | ||
| 52 | - public void close() { | ||
| 53 | - micRcgThread.stop(); // close capture | ||
| 54 | - } | ||
| 55 | - | ||
| 56 | - /** init asr engine with config file */ | ||
| 57 | - public void initModelWithCfg(String cfgFile) { | ||
| 58 | - try { | ||
| 59 | - | ||
| 60 | - // set setSoPath() before running this | ||
| 61 | - rcgOjb = new OnlineRecognizer(cfgFile); | ||
| 62 | - | ||
| 63 | - streamObj = rcgOjb.createStream(); // create a stream for asr engine to feed data | ||
| 64 | - } catch (Exception e) { | ||
| 65 | - System.err.println(e); | ||
| 66 | - e.printStackTrace(); | ||
| 67 | - } | ||
| 68 | - } | ||
| 69 | - | ||
| 70 | - /** read data from mic and feed to asr engine */ | ||
| 71 | - class MicRcgThread implements Runnable { | ||
| 72 | - | ||
| 73 | - TargetDataLine capline; // line for capture mic data | ||
| 74 | - | ||
| 75 | - Thread thread; // this thread | ||
| 76 | - int segmentId = 0; // record the segment id when detect endpoint | ||
| 77 | - String preText = ""; // decoded text | ||
| 78 | - | ||
| 79 | - public MicRcgThread() {} | ||
| 80 | - | ||
| 81 | - public void start() { | ||
| 82 | - | ||
| 83 | - thread = new Thread(this); | ||
| 84 | - | ||
| 85 | - thread.start(); // start thread | ||
| 86 | - } | ||
| 87 | - | ||
| 88 | - public void stop() { | ||
| 89 | - capline.stop(); | ||
| 90 | - capline.close(); | ||
| 91 | - capline = null; | ||
| 92 | - thread = null; | ||
| 93 | - } | ||
| 94 | - | ||
| 95 | - /** feed captured microphone data to asr */ | ||
| 96 | - public void decodeSample(byte[] samplebytes) { | ||
| 97 | - try { | ||
| 98 | - ByteBuffer byteBuf = ByteBuffer.wrap(samplebytes); // create a bytebuf for samples | ||
| 99 | - byteBuf.order(ByteOrder.LITTLE_ENDIAN); // set bytebuf to little endian | ||
| 100 | - ShortBuffer shortBuf = byteBuf.asShortBuffer(); // covert to short type | ||
| 101 | - short[] arrShort = new short[shortBuf.capacity()]; // array for copy short data | ||
| 102 | - float[] arrFloat = new float[shortBuf.capacity()]; // array for copy float data | ||
| 103 | - shortBuf.get(arrShort); // put date to arrShort | ||
| 104 | - | ||
| 105 | - for (int i = 0; i < arrShort.length; i++) { | ||
| 106 | - arrFloat[i] = arrShort[i] / 32768f; // loop to covert short data to float -1 to 1 | ||
| 107 | - } | ||
| 108 | - streamObj.acceptWaveform(arrFloat); // feed asr engine with float data | ||
| 109 | - while (rcgOjb.isReady(streamObj)) { // if engine is ready for unprocessed data | ||
| 110 | - | ||
| 111 | - rcgOjb.decodeStream(streamObj); // decode for this stream | ||
| 112 | - } | ||
| 113 | - boolean isEndpoint = | ||
| 114 | - rcgOjb.isEndpoint( | ||
| 115 | - streamObj); // endpoint check, make sure enable_endpoint_detection=true in config | ||
| 116 | - // file | ||
| 117 | - String nowText = rcgOjb.getResult(streamObj); // get asr result | ||
| 118 | - String recText = ""; | ||
| 119 | - byte[] utf8Data; // for covert text to utf8 | ||
| 120 | - if (isEndpoint && nowText.length() > 0) { | ||
| 121 | - rcgOjb.reSet(streamObj); // reSet stream when detect endpoint | ||
| 122 | - segmentId++; | ||
| 123 | - preText = nowText; | ||
| 124 | - recText = "text(seg_" + String.valueOf(segmentId) + "):" + nowText + "\n"; | ||
| 125 | - utf8Data = recText.getBytes(StandardCharsets.UTF_8); | ||
| 126 | - System.out.println(new String(utf8Data)); | ||
| 127 | - } | ||
| 128 | - | ||
| 129 | - if (!nowText.equals(preText)) { // if preText not equal nowtext | ||
| 130 | - preText = nowText; | ||
| 131 | - recText = nowText + "\n"; | ||
| 132 | - utf8Data = recText.getBytes(StandardCharsets.UTF_8); | ||
| 133 | - System.out.println(new String(utf8Data)); | ||
| 134 | - } | ||
| 135 | - } catch (Exception e) { | ||
| 136 | - System.err.println(e); | ||
| 137 | - e.printStackTrace(); | ||
| 138 | - } | ||
| 139 | - } | ||
| 140 | - | ||
| 141 | - /** run mic capture thread */ | ||
| 142 | - public void run() { | ||
| 143 | - System.out.println("Started! Please speak..."); | ||
| 144 | - | ||
| 145 | - AudioFormat.Encoding encoding = AudioFormat.Encoding.PCM_SIGNED; // the pcm format | ||
| 146 | - float rate = 16000.0f; // using 16 kHz | ||
| 147 | - int channels = 1; // single channel | ||
| 148 | - int sampleSize = 16; // sampleSize 16bit | ||
| 149 | - boolean isBigEndian = false; // using little endian | ||
| 150 | - | ||
| 151 | - AudioFormat format = | ||
| 152 | - new AudioFormat( | ||
| 153 | - encoding, rate, sampleSize, channels, (sampleSize / 8) * channels, rate, isBigEndian); | ||
| 154 | - | ||
| 155 | - DataLine.Info info = new DataLine.Info(TargetDataLine.class, format); | ||
| 156 | - | ||
| 157 | - // check system support such data format | ||
| 158 | - if (!AudioSystem.isLineSupported(info)) { | ||
| 159 | - System.out.println(info + " not supported."); | ||
| 160 | - return; | ||
| 161 | - } | ||
| 162 | - | ||
| 163 | - // open a line for capture. | ||
| 164 | - | ||
| 165 | - try { | ||
| 166 | - capline = (TargetDataLine) AudioSystem.getLine(info); | ||
| 167 | - capline.open(format, capline.getBufferSize()); | ||
| 168 | - } catch (Exception ex) { | ||
| 169 | - System.out.println(ex); | ||
| 170 | - return; | ||
| 171 | - } | ||
| 172 | - | ||
| 173 | - // the buf size for mic captured each time | ||
| 174 | - int bufferLengthInBytes = capline.getBufferSize() / 8 * format.getFrameSize(); | ||
| 175 | - byte[] micData = new byte[bufferLengthInBytes]; | ||
| 176 | - int numBytesRead; | ||
| 177 | - | ||
| 178 | - capline.start(); // start to capture mic data | ||
| 179 | - | ||
| 180 | - while (thread != null) { | ||
| 181 | - // read data from line | ||
| 182 | - if ((numBytesRead = capline.read(micData, 0, bufferLengthInBytes)) == -1) { | ||
| 183 | - break; | ||
| 184 | - } | ||
| 185 | - | ||
| 186 | - decodeSample(micData); // decode mic data | ||
| 187 | - } | ||
| 188 | - | ||
| 189 | - // stop and close | ||
| 190 | - | ||
| 191 | - try { | ||
| 192 | - if (capline != null) { | ||
| 193 | - capline.stop(); | ||
| 194 | - capline.close(); | ||
| 195 | - capline = null; | ||
| 196 | - } | ||
| 197 | - | ||
| 198 | - } catch (Exception ex) { | ||
| 199 | - System.err.println(ex); | ||
| 200 | - } | ||
| 201 | - } | ||
| 202 | - } // End class DecodeMic | ||
| 203 | - | ||
| 204 | - public static void main(String s[]) { | ||
| 205 | - try { | ||
| 206 | - String appDir = System.getProperty("user.dir"); | ||
| 207 | - System.out.println("appdir=" + appDir); | ||
| 208 | - String cfgPath = appDir + "/modelconfig.cfg"; | ||
| 209 | - String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so"; | ||
| 210 | - OnlineRecognizer.setSoPath(soPath); // set so. lib for OnlineRecognizer | ||
| 211 | - | ||
| 212 | - DecodeMic decodeEx = new DecodeMic(); | ||
| 213 | - decodeEx.initModelWithCfg(cfgPath); // init asr engine | ||
| 214 | - decodeEx.open(); // open thread for mic | ||
| 215 | - System.out.print("Press Enter to EXIT!\n"); | ||
| 216 | - char i = (char) System.in.read(); | ||
| 217 | - decodeEx.close(); | ||
| 218 | - } catch (Exception e) { | ||
| 219 | - System.err.println(e); | ||
| 220 | - e.printStackTrace(); | ||
| 221 | - } | ||
| 222 | - } | ||
| 223 | -} |
Please register or sign in to comment.