Fangjun Kuang
Committed by GitHub

Add VAD + Non-streaming ASR + microphone examples for Java API (#1046)

@@ -63,6 +63,18 @@ The punctuation model supports both English and Chinese.
63 ./run-vad-from-mic.sh
64 ```
65
  66 +## VAD with a microphone + Non-streaming Paraformer for speech recognition
  67 +
  68 +```bash
  69 +./run-vad-from-mic-non-streaming-paraformer.sh
  70 +```
  71 +
  72 +## VAD with a microphone + Non-streaming Whisper tiny.en for speech recognition
  73 +
  74 +```bash
  75 +./run-vad-from-mic-non-streaming-whisper.sh
  76 +```
  77 +
66 78 ## VAD (Remove silence)
67 79
68 80 ```bash
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +// This file shows how to use a silero_vad model with a non-streaming Paraformer
  4 +// for speech recognition.
  5 +
  6 +import com.k2fsa.sherpa.onnx.*;
  7 +import javax.sound.sampled.*;
  8 +
  9 +public class VadFromMicWithNonStreamingParaformer {
  10 + private static final int sampleRate = 16000;
  11 + private static final int windowSize = 512;
  12 +
  13 + public static Vad createVad() {
  14 + // please download ./silero_vad.onnx from
  15 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  16 + String model = "./silero_vad.onnx";
  17 + SileroVadModelConfig sileroVad =
  18 + SileroVadModelConfig.builder()
  19 + .setModel(model)
  20 + .setThreshold(0.5f)
  21 + .setMinSilenceDuration(0.25f)
  22 + .setMinSpeechDuration(0.5f)
  23 + .setWindowSize(windowSize)
  24 + .build();
  25 +
  26 + VadModelConfig config =
  27 + VadModelConfig.builder()
  28 + .setSileroVadModelConfig(sileroVad)
  29 + .setSampleRate(sampleRate)
  30 + .setNumThreads(1)
  31 + .setDebug(true)
  32 + .setProvider("cpu")
  33 + .build();
  34 +
  35 + return new Vad(config);
  36 + }
  37 +
  38 + public static OfflineRecognizer createOfflineRecognizer() {
  39 + // please refer to
  40 + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english
  41 + // to download model files
  42 + String model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx";
  43 + String tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt";
  44 +
  45 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
  46 + String ruleFsts = "./itn_zh_number.fst";
  47 +
  48 + OfflineParaformerModelConfig paraformer =
  49 + OfflineParaformerModelConfig.builder().setModel(model).build();
  50 +
  51 + OfflineModelConfig modelConfig =
  52 + OfflineModelConfig.builder()
  53 + .setParaformer(paraformer)
  54 + .setTokens(tokens)
  55 + .setNumThreads(1)
  56 + .setDebug(true)
  57 + .build();
  58 +
  59 + OfflineRecognizerConfig config =
  60 + OfflineRecognizerConfig.builder()
  61 + .setOfflineModelConfig(modelConfig)
  62 + .setDecodingMethod("greedy_search")
  63 + .setRuleFsts(ruleFsts)
  64 + .build();
  65 +
  66 + return new OfflineRecognizer(config);
  67 + }
  68 +
  69 + public static void main(String[] args) {
  70 + Vad vad = createVad();
  71 + OfflineRecognizer recognizer = createOfflineRecognizer();
  72 +
  73 + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
  74 + // Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
  75 + AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
  76 +
  77 + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
  78 + DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
  79 + TargetDataLine targetDataLine;
  80 + try {
  81 + targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
  82 + targetDataLine.open(format);
  83 + targetDataLine.start();
  84 + } catch (LineUnavailableException e) {
  85 + System.out.println("Failed to open target data line: " + e.getMessage());
  86 + vad.release();
  87 + recognizer.release();
  88 + return;
  89 + }
  90 +
  91 + boolean printed = false;
  92 + byte[] buffer = new byte[windowSize * 2];
  93 + float[] samples = new float[windowSize];
  94 +
  95 + System.out.println("Started. Please speak");
  96 + boolean running = true;
  97 + while (targetDataLine.isOpen() && running) {
  98 + int n = targetDataLine.read(buffer, 0, buffer.length);
  99 + if (n <= 0) {
  100 + System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
  101 + continue;
  102 + }
  103 + for (int i = 0; i != windowSize; ++i) {
  104 + short low = buffer[2 * i];
  105 + short high = buffer[2 * i + 1];
  106 + int s = (high << 8) + low;
  107 + samples[i] = (float) s / 32768;
  108 + }
  109 +
  110 + vad.acceptWaveform(samples);
  111 + if (vad.isSpeechDetected() && !printed) {
  112 + System.out.println("Detected speech");
  113 + printed = true;
  114 + }
  115 +
  116 + if (!vad.isSpeechDetected()) {
  117 + printed = false;
  118 + }
  119 +
  120 + while (!vad.empty()) {
  121 + SpeechSegment segment = vad.front();
  122 + float startTime = segment.getStart() / (float) sampleRate;
  123 + float duration = segment.getSamples().length / (float) sampleRate;
  124 +
  125 + OfflineStream stream = recognizer.createStream();
  126 + stream.acceptWaveform(segment.getSamples(), sampleRate);
  127 + recognizer.decode(stream);
  128 + String text = recognizer.getResult(stream).getText();
  129 + stream.release();
  130 +
  131 + if (!text.isEmpty()) {
  132 + System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
  133 + }
  134 +
  135 + if (text.contains("退出程序")) {
  136 + running = false;
  137 + }
  138 +
  139 + vad.pop();
  140 + }
  141 + }
  142 +
  143 + vad.release();
  144 + recognizer.release();
  145 + }
  146 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +// This file shows how to use a silero_vad model with a non-streaming Whisper tiny.en
  4 +// for speech recognition.
  5 +
  6 +import com.k2fsa.sherpa.onnx.*;
  7 +import javax.sound.sampled.*;
  8 +
  9 +public class VadFromMicNonStreamingWhisper {
  10 + private static final int sampleRate = 16000;
  11 + private static final int windowSize = 512;
  12 +
  13 + public static Vad createVad() {
  14 + // please download ./silero_vad.onnx from
  15 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  16 + String model = "./silero_vad.onnx";
  17 + SileroVadModelConfig sileroVad =
  18 + SileroVadModelConfig.builder()
  19 + .setModel(model)
  20 + .setThreshold(0.5f)
  21 + .setMinSilenceDuration(0.25f)
  22 + .setMinSpeechDuration(0.5f)
  23 + .setWindowSize(windowSize)
  24 + .build();
  25 +
  26 + VadModelConfig config =
  27 + VadModelConfig.builder()
  28 + .setSileroVadModelConfig(sileroVad)
  29 + .setSampleRate(sampleRate)
  30 + .setNumThreads(1)
  31 + .setDebug(true)
  32 + .setProvider("cpu")
  33 + .build();
  34 +
  35 + return new Vad(config);
  36 + }
  37 +
  38 + public static OfflineRecognizer createOfflineRecognizer() {
  39 + // please refer to
  40 + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html
  41 + // to download model files
  42 + String encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx";
  43 + String decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx";
  44 + String tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt";
  45 +
  46 + OfflineWhisperModelConfig whisper =
  47 + OfflineWhisperModelConfig.builder().setEncoder(encoder).setDecoder(decoder).build();
  48 +
  49 + OfflineModelConfig modelConfig =
  50 + OfflineModelConfig.builder()
  51 + .setWhisper(whisper)
  52 + .setTokens(tokens)
  53 + .setNumThreads(1)
  54 + .setDebug(true)
  55 + .build();
  56 +
  57 + OfflineRecognizerConfig config =
  58 + OfflineRecognizerConfig.builder()
  59 + .setOfflineModelConfig(modelConfig)
  60 + .setDecodingMethod("greedy_search")
  61 + .build();
  62 +
  63 + return new OfflineRecognizer(config);
  64 + }
  65 +
  66 + public static void main(String[] args) {
  67 + Vad vad = createVad();
  68 + OfflineRecognizer recognizer = createOfflineRecognizer();
  69 +
  70 + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
  71 + // Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
  72 + AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
  73 +
  74 + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
  75 + DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
  76 + TargetDataLine targetDataLine;
  77 + try {
  78 + targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
  79 + targetDataLine.open(format);
  80 + targetDataLine.start();
  81 + } catch (LineUnavailableException e) {
  82 + System.out.println("Failed to open target data line: " + e.getMessage());
  83 + vad.release();
  84 + recognizer.release();
  85 + return;
  86 + }
  87 +
  88 + boolean printed = false;
  89 + byte[] buffer = new byte[windowSize * 2];
  90 + float[] samples = new float[windowSize];
  91 +
  92 + System.out.println("Started. Please speak");
  93 + boolean running = true;
  94 + while (targetDataLine.isOpen() && running) {
  95 + int n = targetDataLine.read(buffer, 0, buffer.length);
  96 + if (n <= 0) {
  97 + System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
  98 + continue;
  99 + }
  100 + for (int i = 0; i != windowSize; ++i) {
  101 + short low = buffer[2 * i];
  102 + short high = buffer[2 * i + 1];
  103 + int s = (high << 8) + low;
  104 + samples[i] = (float) s / 32768;
  105 + }
  106 +
  107 + vad.acceptWaveform(samples);
  108 + if (vad.isSpeechDetected() && !printed) {
  109 + System.out.println("Detected speech");
  110 + printed = true;
  111 + }
  112 +
  113 + if (!vad.isSpeechDetected()) {
  114 + printed = false;
  115 + }
  116 +
  117 + while (!vad.empty()) {
  118 + SpeechSegment segment = vad.front();
  119 + float startTime = segment.getStart() / (float) sampleRate;
  120 + float duration = segment.getSamples().length / (float) sampleRate;
  121 +
  122 + OfflineStream stream = recognizer.createStream();
  123 + stream.acceptWaveform(segment.getSamples(), sampleRate);
  124 + recognizer.decode(stream);
  125 + String text = recognizer.getResult(stream).getText();
  126 + stream.release();
  127 +
  128 + if (!text.isEmpty()) {
  129 + System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
  130 + }
  131 +
  132 + if (text.contains("exit the program")) {
  133 + running = false;
  134 + }
  135 +
  136 + vad.pop();
  137 + }
  138 + }
  139 +
  140 + vad.release();
  141 + recognizer.release();
  142 + }
  143 +}
#!/usr/bin/env bash
# Builds the sherpa-onnx JNI library and the Java API jar if they are missing,
# downloads the silero VAD model, the Paraformer model, and the Chinese-number
# ITN rule FST, then runs the VAD + non-streaming Paraformer microphone
# example via single-file source launch.

set -ex

# Build the JNI shared library (.dylib on macOS, .so on Linux) if needed.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if needed.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Download the silero VAD model if needed.
if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

# Download and unpack the Paraformer model if needed.
if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2

  tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
  rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
fi

# Download the inverse-text-normalization rule FST if needed.
if [ ! -f ./itn_zh_number.fst ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
fi

# Quote "$PWD" so paths containing spaces do not break the JVM arguments.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadFromMicWithNonStreamingParaformer.java
#!/usr/bin/env bash
# Builds the sherpa-onnx JNI library and the Java API jar if they are missing,
# downloads the silero VAD model and the Whisper tiny.en model, then runs the
# VAD + non-streaming Whisper microphone example via single-file source
# launch.

set -ex

# Build the JNI shared library (.dylib on macOS, .so on Linux) if needed.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if needed.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Download the silero VAD model if needed.
if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

# Download and unpack the Whisper tiny.en model if needed.
if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2

  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

# Quote "$PWD" so paths containing spaces do not break the JVM arguments.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./VadFromMicWithNonStreamingWhisper.java
1 -/*  
2 - * // Copyright 2022-2023 by zhaoming  
3 - */  
4 -/*  
5 -Config modelconfig.cfg  
6 - sample_rate=16000  
7 - feature_dim=80  
8 - rule1_min_trailing_silence=2.4  
9 - rule2_min_trailing_silence=1.2  
10 - rule3_min_utterance_length=20  
11 - encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx  
12 - decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx  
13 - joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx  
14 - tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt  
15 - num_threads=4  
16 - enable_endpoint_detection=false  
17 - decoding_method=greedy_search  
18 - max_active_paths=4  
19 -*/  
20 -  
21 -import com.k2fsa.sherpa.onnx.OnlineRecognizer;  
22 -import com.k2fsa.sherpa.onnx.OnlineStream;  
23 -import java.io.*;  
24 -import java.nio.charset.StandardCharsets;  
25 -  
/**
 * Decodes a single wav file with the streaming (online) sherpa-onnx
 * recognizer, showing two usage patterns: feeding the whole file at once
 * (simpleExample) and feeding it chunk by chunk as if it were live audio
 * (streamExample).
 */
public class DecodeFile {
  OnlineRecognizer rcgOjb; // the online (streaming) recognizer
  OnlineStream streamObj; // the stream that receives audio samples
  String wavfilename; // path of the wav file to decode

  public DecodeFile(String fileName) {
    wavfilename = fileName;
  }

  /**
   * Initializes the recognizer by passing every model parameter explicitly
   * through the long OnlineRecognizer constructor. Model paths are
   * hard-coded; adjust them to your local layout.
   */
  public void initModelWithPara() {
    try {
      String modelDir =
          "/sherpa-onnx/build_old/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";
      String encoder = modelDir + "/encoder-epoch-99-avg-1.onnx";
      String decoder = modelDir + "/decoder-epoch-99-avg-1.onnx";
      String joiner = modelDir + "/joiner-epoch-99-avg-1.onnx";
      String tokens = modelDir + "/tokens.txt";
      int numThreads = 4;
      int sampleRate = 16000;
      int featureDim = 80;
      boolean enableEndpointDetection = false;
      float rule1MinTrailingSilence = 2.4F;
      float rule2MinTrailingSilence = 1.2F;
      float rule3MinUtteranceLength = 20F;
      String decodingMethod = "greedy_search";
      int maxActivePaths = 4;
      String hotwordsFile = ""; // empty: hotword boosting disabled
      float hotwordsScore = 1.5F;
      String lm_model = ""; // empty: no external language model
      float lm_scale = 0.5F;
      String modelType = "zipformer";
      // Argument order must match the OnlineRecognizer constructor exactly.
      rcgOjb =
          new OnlineRecognizer(
              tokens,
              encoder,
              decoder,
              joiner,
              numThreads,
              sampleRate,
              featureDim,
              enableEndpointDetection,
              rule1MinTrailingSilence,
              rule2MinTrailingSilence,
              rule3MinUtteranceLength,
              decodingMethod,
              lm_model,
              lm_scale,
              maxActivePaths,
              hotwordsFile,
              hotwordsScore,
              modelType);
      streamObj = rcgOjb.createStream();
    } catch (Exception e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }

  /**
   * Initializes the recognizer from a key=value config file (see the header
   * comment for an example). setSoPath() must have been called first so the
   * JNI library can be loaded.
   */
  public void initModelWithCfg(String cfgFile) {
    try {
      // you should set setCfgPath() before running this
      rcgOjb = new OnlineRecognizer(cfgFile);
      streamObj = rcgOjb.createStream();
    } catch (Exception e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }

  /**
   * Feeds the whole wav file at once, drains the decoder, prints the final
   * result, then releases the stream and recognizer.
   */
  public void simpleExample() {
    try {
      float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file
      streamObj.acceptWaveform(buffer); // feed stream with data
      streamObj.inputFinished(); // tell engine you done with all data
      OnlineStream ssObj[] = new OnlineStream[1];
      while (rcgOjb.isReady(streamObj)) { // engine is ready for unprocessed data
        ssObj[0] = streamObj;
        rcgOjb.decodeStreams(ssObj); // decode for multiple stream
        // rcgOjb.DecodeStream(streamObj); // decode for single stream
      }

      String recText = "simple:" + rcgOjb.getResult(streamObj) + "\n";
      byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8);
      System.out.println(new String(utf8Data));
      rcgOjb.reSet(streamObj);
      rcgOjb.releaseStream(streamObj); // release stream
      rcgOjb.release(); // release recognizer

    } catch (Exception e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }

  /**
   * Feeds the wav file in 1600-sample (0.1 s at 16 kHz) chunks to simulate
   * streaming input, printing partial results as they appear, then drains
   * the decoder and prints the final result.
   */
  public void streamExample() {
    try {
      float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file
      float[] chunk = new float[1600]; // //each time read 1600(0.1s) data
      int chunkIndex = 0;
      for (int i = 0; i < buffer.length; i++) // total wav length loop
      {
        chunk[chunkIndex] = buffer[i];
        chunkIndex++;
        // Flush the chunk when it is full or the file is exhausted.
        if (chunkIndex >= 1600 || i == (buffer.length - 1)) {
          chunkIndex = 0;
          streamObj.acceptWaveform(chunk); // feed chunk
          if (rcgOjb.isReady(streamObj)) {
            rcgOjb.decodeStream(streamObj);
          }
          String testDate = rcgOjb.getResult(streamObj);
          byte[] utf8Data = testDate.getBytes(StandardCharsets.UTF_8);

          if (utf8Data.length > 0) {
            // Prefix each partial result with its position in seconds.
            System.out.println(Float.valueOf((float) i / 16000) + ":" + new String(utf8Data));
          }
        }
      }
      streamObj.inputFinished();
      while (rcgOjb.isReady(streamObj)) {
        rcgOjb.decodeStream(streamObj);
      }

      String recText = "stream:" + rcgOjb.getResult(streamObj) + "\n";
      byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8);
      System.out.println(new String(utf8Data));
      rcgOjb.reSet(streamObj);
      rcgOjb.releaseStream(streamObj); // release stream
      rcgOjb.release(); // release recognizer

    } catch (Exception e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }

  /**
   * Entry point: args[0] is the wav file path relative to the working
   * directory. Loads the JNI library, then runs both examples. Note each
   * example releases the recognizer, so the model is re-initialized in
   * between.
   */
  public static void main(String[] args) {
    try {
      String appDir = System.getProperty("user.dir");
      System.out.println("appdir=" + appDir);
      String fileName = appDir + "/" + args[0];
      String cfgPath = appDir + "/modeltest.cfg";
      String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so";
      OnlineRecognizer.setSoPath(soPath);
      DecodeFile rcgDemo = new DecodeFile(fileName);

      // ***************** */
      rcgDemo.initModelWithCfg(cfgPath);
      rcgDemo.streamExample();
      // **************** */
      rcgDemo.initModelWithCfg(cfgPath);
      rcgDemo.simpleExample();

    } catch (Exception e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }
}
1 -/*  
2 - * // Copyright 2022-2023 by zhaoming  
3 - */  
4 -/*  
5 -Real-time speech recognition from a microphone with com.k2fsa.sherpa.onnx Java API  
6 -  
7 -example for cfgFile modelconfig.cfg  
8 - sample_rate=16000  
9 - feature_dim=80  
10 - rule1_min_trailing_silence=2.4  
11 - rule2_min_trailing_silence=1.2  
12 - rule3_min_utterance_length=20  
13 - encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx  
14 - decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx  
15 - joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx  
16 - tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt  
17 - num_threads=4  
18 - enable_endpoint_detection=true  
19 - decoding_method=greedy_search  
20 - max_active_paths=4  
21 -  
22 -*/  
23 -import com.k2fsa.sherpa.onnx.OnlineRecognizer;  
24 -import com.k2fsa.sherpa.onnx.OnlineStream;  
25 -import java.io.*;  
26 -import java.nio.ByteBuffer;  
27 -import java.nio.ByteOrder;  
28 -import java.nio.ShortBuffer;  
29 -import java.nio.charset.StandardCharsets;  
30 -import javax.sound.sampled.AudioFormat;  
31 -import javax.sound.sampled.AudioSystem;  
32 -import javax.sound.sampled.DataLine;  
33 -import javax.sound.sampled.TargetDataLine;  
34 -  
35 -/** Microphone Example */  
public class DecodeMic {
  MicRcgThread micRcgThread = null; // background thread that captures and decodes mic audio

  OnlineRecognizer rcgOjb; // the streaming recognizer

  OnlineStream streamObj; // the stream fed with captured samples

  public DecodeMic() {

    micRcgThread = new MicRcgThread(); // create a new instance for MicRcgThread
  }

  /** Starts the microphone capture thread. */
  public void open() {
    micRcgThread.start(); // start to capture microphone data
  }

  /** Stops capture and closes the audio line. */
  public void close() {
    micRcgThread.stop(); // close capture
  }

  /**
   * Initializes the ASR engine from a key=value config file (see the header
   * comment for an example). setSoPath() must have been called first so the
   * JNI library can be loaded.
   */
  public void initModelWithCfg(String cfgFile) {
    try {

      // set setSoPath() before running this
      rcgOjb = new OnlineRecognizer(cfgFile);

      streamObj = rcgOjb.createStream(); // create a stream for asr engine to feed data
    } catch (Exception e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }

  /**
   * Capture thread: reads raw PCM from the microphone and feeds it to the
   * ASR engine, printing partial and endpoint-segmented results.
   */
  class MicRcgThread implements Runnable {

    TargetDataLine capline; // line for capturing mic data

    Thread thread; // the capture thread; set to null to request shutdown
    int segmentId = 0; // segment counter, incremented at each detected endpoint
    String preText = ""; // last printed text, used to avoid duplicate output

    public MicRcgThread() {}

    public void start() {

      thread = new Thread(this);

      thread.start(); // start thread
    }

    public void stop() {
      // Setting thread to null makes the run() loop exit.
      capline.stop();
      capline.close();
      capline = null;
      thread = null;
    }

    /**
     * Converts captured 16-bit little-endian PCM bytes to floats in [-1, 1),
     * feeds them to the engine, and prints results. When an endpoint is
     * detected the stream is reset and the text is printed with a segment id.
     */
    public void decodeSample(byte[] samplebytes) {
      try {
        ByteBuffer byteBuf = ByteBuffer.wrap(samplebytes); // create a bytebuf for samples
        byteBuf.order(ByteOrder.LITTLE_ENDIAN); // set bytebuf to little endian
        ShortBuffer shortBuf = byteBuf.asShortBuffer(); // convert to short type
        short[] arrShort = new short[shortBuf.capacity()]; // array for short data
        float[] arrFloat = new float[shortBuf.capacity()]; // array for float data
        shortBuf.get(arrShort); // copy data to arrShort

        for (int i = 0; i < arrShort.length; i++) {
          arrFloat[i] = arrShort[i] / 32768f; // convert short data to float in [-1, 1)
        }
        streamObj.acceptWaveform(arrFloat); // feed asr engine with float data
        while (rcgOjb.isReady(streamObj)) { // if engine is ready for unprocessed data

          rcgOjb.decodeStream(streamObj); // decode for this stream
        }
        boolean isEndpoint =
            rcgOjb.isEndpoint(
                streamObj); // endpoint check, make sure enable_endpoint_detection=true in config
        // file
        String nowText = rcgOjb.getResult(streamObj); // get asr result
        String recText = "";
        byte[] utf8Data; // for converting text to utf8
        if (isEndpoint && nowText.length() > 0) {
          rcgOjb.reSet(streamObj); // reSet stream when endpoint is detected
          segmentId++;
          preText = nowText;
          recText = "text(seg_" + String.valueOf(segmentId) + "):" + nowText + "\n";
          utf8Data = recText.getBytes(StandardCharsets.UTF_8);
          System.out.println(new String(utf8Data));
        }

        // Print partial results only when the text has changed.
        if (!nowText.equals(preText)) { // if preText not equal nowtext
          preText = nowText;
          recText = nowText + "\n";
          utf8Data = recText.getBytes(StandardCharsets.UTF_8);
          System.out.println(new String(utf8Data));
        }
      } catch (Exception e) {
        System.err.println(e);
        e.printStackTrace();
      }
    }

    /**
     * Thread body: opens a 16 kHz, 16-bit, mono, little-endian capture line
     * and loops reading buffers until stop() clears the thread reference.
     */
    public void run() {
      System.out.println("Started! Please speak...");

      AudioFormat.Encoding encoding = AudioFormat.Encoding.PCM_SIGNED; // the pcm format
      float rate = 16000.0f; // using 16 kHz
      int channels = 1; // single channel
      int sampleSize = 16; // sampleSize 16bit
      boolean isBigEndian = false; // using little endian

      AudioFormat format =
          new AudioFormat(
              encoding, rate, sampleSize, channels, (sampleSize / 8) * channels, rate, isBigEndian);

      DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);

      // check the system supports such a data format
      if (!AudioSystem.isLineSupported(info)) {
        System.out.println(info + " not supported.");
        return;
      }

      // open a line for capture.

      try {
        capline = (TargetDataLine) AudioSystem.getLine(info);
        capline.open(format, capline.getBufferSize());
      } catch (Exception ex) {
        System.out.println(ex);
        return;
      }

      // the buf size for mic capture each time (1/8 of the line buffer)
      int bufferLengthInBytes = capline.getBufferSize() / 8 * format.getFrameSize();
      byte[] micData = new byte[bufferLengthInBytes];
      int numBytesRead;

      capline.start(); // start to capture mic data

      while (thread != null) {
        // read data from line; -1 means the line reached end of stream
        if ((numBytesRead = capline.read(micData, 0, bufferLengthInBytes)) == -1) {
          break;
        }

        decodeSample(micData); // decode mic data
      }

      // stop and close

      try {
        if (capline != null) {
          capline.stop();
          capline.close();
          capline = null;
        }

      } catch (Exception ex) {
        System.err.println(ex);
      }
    }
  } // End class MicRcgThread

  /**
   * Entry point: loads the JNI library, initializes the engine from
   * ./modelconfig.cfg, starts microphone capture, and stops when the user
   * presses Enter.
   */
  public static void main(String s[]) {
    try {
      String appDir = System.getProperty("user.dir");
      System.out.println("appdir=" + appDir);
      String cfgPath = appDir + "/modelconfig.cfg";
      String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so";
      OnlineRecognizer.setSoPath(soPath); // set so. lib for OnlineRecognizer

      DecodeMic decodeEx = new DecodeMic();
      decodeEx.initModelWithCfg(cfgPath); // init asr engine
      decodeEx.open(); // open thread for mic
      System.out.print("Press Enter to EXIT!\n");
      char i = (char) System.in.read();
      decodeEx.close();
    } catch (Exception e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }
}