Fangjun Kuang
Committed by GitHub

Add VAD + Non-streaming ASR + microphone examples for Java API (#1046)

... ... @@ -63,6 +63,18 @@ The punctuation model supports both English and Chinese.
./run-vad-from-mic.sh
```
## VAD with a microphone + Non-streaming Paraformer for speech recognition
```bash
./run-vad-from-mic-non-streaming-paraformer.sh
```
## VAD with a microphone + Non-streaming Whisper tiny.en for speech recognition
```bash
./run-vad-from-mic-non-streaming-whisper.sh
```
## VAD (Remove silence)
```bash
... ...
// Copyright 2024 Xiaomi Corporation
// This file shows how to use a silero_vad model with a non-streaming Paraformer
// for speech recognition.
import com.k2fsa.sherpa.onnx.*;
import javax.sound.sampled.*;
public class VadFromMicWithNonStreamingParaformer {
private static final int sampleRate = 16000;
private static final int windowSize = 512;
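// 512 samples at 16 kHz is 32 ms of audio per VAD window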
public static Vad createVad() {
// please download ./silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
String model = "./silero_vad.onnx";
SileroVadModelConfig sileroVad =
SileroVadModelConfig.builder()
.setModel(model)
.setThreshold(0.5f)
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(windowSize)
.build();
VadModelConfig config =
VadModelConfig.builder()
.setSileroVadModelConfig(sileroVad)
.setSampleRate(sampleRate)
.setNumThreads(1)
.setDebug(true)
.setProvider("cpu")
.build();
return new Vad(config);
}
public static OfflineRecognizer createOfflineRecognizer() {
// please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english
// to download model files
String model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx";
String tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt";
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
String ruleFsts = "./itn_zh_number.fst";
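// the rule fst applies inverse text normalization, e.g. turning spoken Chinese numbers into digits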
OfflineParaformerModelConfig paraformer =
OfflineParaformerModelConfig.builder().setModel(model).build();
OfflineModelConfig modelConfig =
OfflineModelConfig.builder()
.setParaformer(paraformer)
.setTokens(tokens)
.setNumThreads(1)
.setDebug(true)
.build();
OfflineRecognizerConfig config =
OfflineRecognizerConfig.builder()
.setOfflineModelConfig(modelConfig)
.setDecodingMethod("greedy_search")
.setRuleFsts(ruleFsts)
.build();
return new OfflineRecognizer(config);
}
public static void main(String[] args) {
Vad vad = createVad();
OfflineRecognizer recognizer = createOfflineRecognizer();
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
// Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
TargetDataLine targetDataLine;
try {
targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
targetDataLine.open(format);
targetDataLine.start();
} catch (LineUnavailableException e) {
System.out.println("Failed to open target data line: " + e.getMessage());
vad.release();
recognizer.release();
return;
}
boolean printed = false;
byte[] buffer = new byte[windowSize * 2];
float[] samples = new float[windowSize];
System.out.println("Started. Please speak");
boolean running = true;
while (targetDataLine.isOpen() && running) {
int n = targetDataLine.read(buffer, 0, buffer.length);
if (n <= 0) {
System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
continue;
}
for (int i = 0; i != windowSize; ++i) {
int low = buffer[2 * i] & 0xff; // mask to avoid sign-extending the low byte
int high = buffer[2 * i + 1]; // the high byte keeps its sign
int s = (high << 8) | low;
samples[i] = (float) s / 32768;
}
vad.acceptWaveform(samples);
if (vad.isSpeechDetected() && !printed) {
System.out.println("Detected speech");
printed = true;
}
if (!vad.isSpeechDetected()) {
printed = false;
}
while (!vad.empty()) {
SpeechSegment segment = vad.front();
float startTime = segment.getStart() / (float) sampleRate;
float duration = segment.getSamples().length / (float) sampleRate;
OfflineStream stream = recognizer.createStream();
stream.acceptWaveform(segment.getSamples(), sampleRate);
recognizer.decode(stream);
String text = recognizer.getResult(stream).getText();
stream.release();
if (!text.isEmpty()) {
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
}
if (text.contains("退出程序")) {
running = false;
}
vad.pop();
}
}
vad.release();
recognizer.release();
}
}
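The read loop above assembles each little-endian 16-bit sample by hand. For reference, here is a minimal sketch (not part of this change; the helper name Pcm16ToFloat is hypothetical) of the same conversion done with java.nio, which is also how the DecodeMic example further below handles it:

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.ShortBuffer;

// Pcm16ToFloat is an illustrative helper, not part of the sherpa-onnx API.
class Pcm16ToFloat {
  // Convert n little-endian 16-bit PCM samples to floats in [-1, 1).
  static float[] convert(byte[] buffer, int n) {
    ShortBuffer shorts =
        ByteBuffer.wrap(buffer, 0, n * 2).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer();
    float[] samples = new float[n];
    for (int i = 0; i < n; i++) {
      samples[i] = shorts.get(i) / 32768f;
    }
    return samples;
  }
}
```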
... ...
// Copyright 2024 Xiaomi Corporation
// This file shows how to use a silero_vad model with a non-streaming Whisper tiny.en
// for speech recognition.
import com.k2fsa.sherpa.onnx.*;
import javax.sound.sampled.*;
public class VadFromMicWithNonStreamingWhisper {
private static final int sampleRate = 16000;
private static final int windowSize = 512;
public static Vad createVad() {
// please download ./silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
String model = "./silero_vad.onnx";
SileroVadModelConfig sileroVad =
SileroVadModelConfig.builder()
.setModel(model)
.setThreshold(0.5f)
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(windowSize)
.build();
VadModelConfig config =
VadModelConfig.builder()
.setSileroVadModelConfig(sileroVad)
.setSampleRate(sampleRate)
.setNumThreads(1)
.setDebug(true)
.setProvider("cpu")
.build();
return new Vad(config);
}
public static OfflineRecognizer createOfflineRecognizer() {
// please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html
// to download model files
String encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx";
String decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx";
String tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt";
OfflineWhisperModelConfig whisper =
OfflineWhisperModelConfig.builder().setEncoder(encoder).setDecoder(decoder).build();
OfflineModelConfig modelConfig =
OfflineModelConfig.builder()
.setWhisper(whisper)
.setTokens(tokens)
.setNumThreads(1)
.setDebug(true)
.build();
OfflineRecognizerConfig config =
OfflineRecognizerConfig.builder()
.setOfflineModelConfig(modelConfig)
.setDecodingMethod("greedy_search")
.build();
return new OfflineRecognizer(config);
}
public static void main(String[] args) {
Vad vad = createVad();
OfflineRecognizer recognizer = createOfflineRecognizer();
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
// Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
TargetDataLine targetDataLine;
try {
targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
targetDataLine.open(format);
targetDataLine.start();
} catch (LineUnavailableException e) {
System.out.println("Failed to open target data line: " + e.getMessage());
vad.release();
recognizer.release();
return;
}
boolean printed = false;
byte[] buffer = new byte[windowSize * 2];
float[] samples = new float[windowSize];
System.out.println("Started. Please speak");
boolean running = true;
while (targetDataLine.isOpen() && running) {
int n = targetDataLine.read(buffer, 0, buffer.length);
if (n <= 0) {
System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
continue;
}
for (int i = 0; i != windowSize; ++i) {
int low = buffer[2 * i] & 0xff; // mask to avoid sign-extending the low byte
int high = buffer[2 * i + 1]; // the high byte keeps its sign
int s = (high << 8) | low;
samples[i] = (float) s / 32768;
}
vad.acceptWaveform(samples);
if (vad.isSpeechDetected() && !printed) {
System.out.println("Detected speech");
printed = true;
}
if (!vad.isSpeechDetected()) {
printed = false;
}
while (!vad.empty()) {
SpeechSegment segment = vad.front();
float startTime = segment.getStart() / (float) sampleRate;
float duration = segment.getSamples().length / (float) sampleRate;
OfflineStream stream = recognizer.createStream();
stream.acceptWaveform(segment.getSamples(), sampleRate);
recognizer.decode(stream);
String text = recognizer.getResult(stream).getText();
stream.release();
if (!text.isEmpty()) {
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
}
if (text.contains("exit the program")) {
running = false;
}
vad.pop();
}
}
vad.release();
recognizer.release();
}
}
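Neither example above is tied to the microphone: once you have 16 kHz mono float samples, the OfflineRecognizer decodes them directly. A minimal sketch under these assumptions: a 16 kHz, 16-bit, mono PCM file named test.wav, the source files compiled together so createOfflineRecognizer() from the Whisper example is reachable, and the class name DecodeWavSketch chosen for illustration:

```java
import com.k2fsa.sherpa.onnx.*;
import javax.sound.sampled.*;
import java.io.File;

// DecodeWavSketch is an illustrative sketch, not part of this change.
public class DecodeWavSketch {
  public static void main(String[] args) throws Exception {
    // reuse the recognizer factory from the Whisper example above
    OfflineRecognizer recognizer = VadFromMicWithNonStreamingWhisper.createOfflineRecognizer();
    // getAudioInputStream skips the WAV header and yields raw PCM frames
    AudioInputStream ais = AudioSystem.getAudioInputStream(new File("test.wav"));
    byte[] bytes = ais.readAllBytes();
    float[] samples = new float[bytes.length / 2];
    for (int i = 0; i < samples.length; i++) {
      // little-endian 16-bit PCM -> float in [-1, 1)
      samples[i] = ((bytes[2 * i + 1] << 8) | (bytes[2 * i] & 0xff)) / 32768f;
    }
    OfflineStream stream = recognizer.createStream();
    stream.acceptWaveform(samples, 16000);
    recognizer.decode(stream);
    System.out.println(recognizer.getResult(stream).getText());
    stream.release();
    recognizer.release();
  }
}
```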
... ...
#!/usr/bin/env bash
set -ex
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
mkdir -p ../build
pushd ../build
cmake \
-DSHERPA_ONNX_ENABLE_PYTHON=OFF \
-DSHERPA_ONNX_ENABLE_TESTS=OFF \
-DSHERPA_ONNX_ENABLE_CHECK=OFF \
-DBUILD_SHARED_LIBS=ON \
-DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
-DSHERPA_ONNX_ENABLE_JNI=ON \
..
make -j4
ls -lh lib
popd
fi
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
pushd ../sherpa-onnx/java-api
make
popd
fi
if [ ! -f ./silero_vad.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi
if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
fi
if [ ! -f ./itn_zh_number.fst ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
fi
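# Note: passing a .java source file to "java" uses the single-file source launcher, which requires JDK 11 or later.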
java \
-Djava.library.path=$PWD/../build/lib \
-cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
./VadFromMicWithNonStreamingParaformer.java
... ...
#!/usr/bin/env bash
set -ex
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
mkdir -p ../build
pushd ../build
cmake \
-DSHERPA_ONNX_ENABLE_PYTHON=OFF \
-DSHERPA_ONNX_ENABLE_TESTS=OFF \
-DSHERPA_ONNX_ENABLE_CHECK=OFF \
-DBUILD_SHARED_LIBS=ON \
-DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
-DSHERPA_ONNX_ENABLE_JNI=ON \
..
make -j4
ls -lh lib
popd
fi
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
pushd ../sherpa-onnx/java-api
make
popd
fi
if [ ! -f ./silero_vad.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi
if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi
java \
-Djava.library.path=$PWD/../build/lib \
-cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
./VadFromMicWithNonStreamingWhisper.java
... ...
// Copyright 2022-2023 by zhaoming
/*
Example config file (modeltest.cfg, loaded in main() below):
sample_rate=16000
feature_dim=80
rule1_min_trailing_silence=2.4
rule2_min_trailing_silence=1.2
rule3_min_utterance_length=20
encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx
decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx
joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx
tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt
num_threads=4
enable_endpoint_detection=false
decoding_method=greedy_search
max_active_paths=4
*/
import com.k2fsa.sherpa.onnx.OnlineRecognizer;
import com.k2fsa.sherpa.onnx.OnlineStream;
import java.io.*;
import java.nio.charset.StandardCharsets;
public class DecodeFile {
OnlineRecognizer rcgOjb;
OnlineStream streamObj;
String wavfilename;
public DecodeFile(String fileName) {
wavfilename = fileName;
}
public void initModelWithPara() {
try {
String modelDir =
"/sherpa-onnx/build_old/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20";
String encoder = modelDir + "/encoder-epoch-99-avg-1.onnx";
String decoder = modelDir + "/decoder-epoch-99-avg-1.onnx";
String joiner = modelDir + "/joiner-epoch-99-avg-1.onnx";
String tokens = modelDir + "/tokens.txt";
int numThreads = 4;
int sampleRate = 16000;
int featureDim = 80;
boolean enableEndpointDetection = false;
float rule1MinTrailingSilence = 2.4F;
float rule2MinTrailingSilence = 1.2F;
float rule3MinUtteranceLength = 20F;
String decodingMethod = "greedy_search";
int maxActivePaths = 4;
String hotwordsFile = "";
float hotwordsScore = 1.5F;
String lmModel = "";
float lmScale = 0.5F;
String modelType = "zipformer";
rcgOjb =
new OnlineRecognizer(
tokens,
encoder,
decoder,
joiner,
numThreads,
sampleRate,
featureDim,
enableEndpointDetection,
rule1MinTrailingSilence,
rule2MinTrailingSilence,
rule3MinUtteranceLength,
decodingMethod,
lmModel,
lmScale,
maxActivePaths,
hotwordsFile,
hotwordsScore,
modelType);
streamObj = rcgOjb.createStream();
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
public void initModelWithCfg(String cfgFile) {
try {
// call OnlineRecognizer.setSoPath() before constructing the recognizer
rcgOjb = new OnlineRecognizer(cfgFile);
streamObj = rcgOjb.createStream();
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
public void simpleExample() {
try {
float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file
streamObj.acceptWaveform(buffer); // feed stream with data
streamObj.inputFinished(); // tell the engine we are done feeding data
OnlineStream[] ssObj = new OnlineStream[1];
while (rcgOjb.isReady(streamObj)) { // while the engine has unprocessed data
ssObj[0] = streamObj;
rcgOjb.decodeStreams(ssObj); // batch decode of multiple streams
// rcgOjb.decodeStream(streamObj); // or decode a single stream
}
String recText = "simple:" + rcgOjb.getResult(streamObj) + "\n";
byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8);
System.out.println(new String(utf8Data));
rcgOjb.reSet(streamObj);
rcgOjb.releaseStream(streamObj); // release stream
rcgOjb.release(); // release recognizer
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
public void streamExample() {
try {
float[] buffer = rcgOjb.readWavFile(wavfilename); // read data from file
float[] chunk = new float[1600]; // feed 1600 samples (0.1 s at 16 kHz) at a time
int chunkIndex = 0;
for (int i = 0; i < buffer.length; i++) { // loop over the whole wav
chunk[chunkIndex] = buffer[i];
chunkIndex++;
if (chunkIndex >= 1600 || i == (buffer.length - 1)) {
// feed only the filled part; the final chunk may be shorter than 1600 samples
streamObj.acceptWaveform(java.util.Arrays.copyOf(chunk, chunkIndex));
chunkIndex = 0;
if (rcgOjb.isReady(streamObj)) {
rcgOjb.decodeStream(streamObj);
}
String testData = rcgOjb.getResult(streamObj);
byte[] utf8Data = testData.getBytes(StandardCharsets.UTF_8);
if (utf8Data.length > 0) {
System.out.println((float) i / 16000 + ":" + new String(utf8Data));
}
}
}
streamObj.inputFinished();
while (rcgOjb.isReady(streamObj)) {
rcgOjb.decodeStream(streamObj);
}
String recText = "stream:" + rcgOjb.getResult(streamObj) + "\n";
byte[] utf8Data = recText.getBytes(StandardCharsets.UTF_8);
System.out.println(new String(utf8Data));
rcgOjb.reSet(streamObj);
rcgOjb.releaseStream(streamObj); // release stream
rcgOjb.release(); // release recognizer
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
public static void main(String[] args) {
try {
String appDir = System.getProperty("user.dir");
System.out.println("appdir=" + appDir);
String fileName = appDir + "/" + args[0];
String cfgPath = appDir + "/modeltest.cfg";
String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so";
OnlineRecognizer.setSoPath(soPath);
DecodeFile rcgDemo = new DecodeFile(fileName);
// streaming (chunk-by-chunk) example
rcgDemo.initModelWithCfg(cfgPath);
rcgDemo.streamExample();
// simple (whole-file) example
rcgDemo.initModelWithCfg(cfgPath);
rcgDemo.simpleExample();
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
}
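The cfg files consumed by initModelWithCfg() are plain key=value lines, so java.util.Properties can parse them for a quick sanity check before the recognizer is constructed. A minimal sketch; the file name modelconfig.cfg, the key list, and the class name CheckCfgSketch are assumptions based on the comment blocks in these examples:

```java
import java.io.File;
import java.io.FileReader;
import java.util.Properties;

// CheckCfgSketch is an illustrative sketch, not part of these examples.
public class CheckCfgSketch {
  public static void main(String[] args) throws Exception {
    Properties cfg = new Properties();
    try (FileReader reader = new FileReader("modelconfig.cfg")) {
      cfg.load(reader); // Properties understands key=value lines
    }
    // verify that the model files named in the config actually exist
    for (String key : new String[] {"encoder", "decoder", "joiner", "tokens"}) {
      String path = cfg.getProperty(key, "");
      if (!new File(path).isFile()) {
        System.err.println("missing " + key + ": " + path);
      }
    }
  }
}
```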
// Copyright 2022-2023 by zhaoming
/*
Real-time speech recognition from a microphone with the com.k2fsa.sherpa.onnx Java API.
Example config file modelconfig.cfg:
sample_rate=16000
feature_dim=80
rule1_min_trailing_silence=2.4
rule2_min_trailing_silence=1.2
rule3_min_utterance_length=20
encoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx
decoder=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx
joiner=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx
tokens=/sherpa-onnx/build/bin/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt
num_threads=4
enable_endpoint_detection=true
decoding_method=greedy_search
max_active_paths=4
*/
import com.k2fsa.sherpa.onnx.OnlineRecognizer;
import com.k2fsa.sherpa.onnx.OnlineStream;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.ShortBuffer;
import java.nio.charset.StandardCharsets;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.TargetDataLine;
/** Microphone Example */
public class DecodeMic {
MicRcgThread micRcgThread = null; // thread handle
OnlineRecognizer rcgOjb; // the recognizer
OnlineStream streamObj; // the stream
public DecodeMic() {
micRcgThread = new MicRcgThread(); // create a new instance for MicRcgThread
}
public void open() {
micRcgThread.start(); // start to capture microphone data
}
public void close() {
micRcgThread.stop(); // close capture
}
/** init asr engine with config file */
public void initModelWithCfg(String cfgFile) {
try {
// call OnlineRecognizer.setSoPath() before constructing the recognizer
rcgOjb = new OnlineRecognizer(cfgFile);
streamObj = rcgOjb.createStream(); // create a stream for asr engine to feed data
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
/** read data from mic and feed to asr engine */
class MicRcgThread implements Runnable {
TargetDataLine capline; // line for capturing mic data
Thread thread; // the capture thread
int segmentId = 0; // segment id, incremented at each detected endpoint
String preText = ""; // previously decoded text
public MicRcgThread() {}
public void start() {
thread = new Thread(this);
thread.start(); // start thread
}
public void stop() {
capline.stop();
capline.close();
capline = null;
thread = null;
}
/** feed captured microphone data to asr */
public void decodeSample(byte[] samplebytes) {
try {
ByteBuffer byteBuf = ByteBuffer.wrap(samplebytes); // wrap the raw bytes
byteBuf.order(ByteOrder.LITTLE_ENDIAN); // the capture format is little endian
ShortBuffer shortBuf = byteBuf.asShortBuffer(); // view the bytes as 16-bit samples
short[] arrShort = new short[shortBuf.capacity()]; // holds the short samples
float[] arrFloat = new float[shortBuf.capacity()]; // holds the converted float samples
shortBuf.get(arrShort); // copy the samples into arrShort
for (int i = 0; i < arrShort.length; i++) {
arrFloat[i] = arrShort[i] / 32768f; // convert each short to a float in [-1, 1)
}
streamObj.acceptWaveform(arrFloat); // feed asr engine with float data
while (rcgOjb.isReady(streamObj)) { // while the engine has unprocessed data
rcgOjb.decodeStream(streamObj); // decode for this stream
}
// endpoint check; requires enable_endpoint_detection=true in the config file
boolean isEndpoint = rcgOjb.isEndpoint(streamObj);
String nowText = rcgOjb.getResult(streamObj); // get asr result
String recText = "";
byte[] utf8Data; // result text encoded as UTF-8
if (isEndpoint && nowText.length() > 0) {
rcgOjb.reSet(streamObj); // reset the stream at each detected endpoint
segmentId++;
preText = nowText;
recText = "text(seg_" + String.valueOf(segmentId) + "):" + nowText + "\n";
utf8Data = recText.getBytes(StandardCharsets.UTF_8);
System.out.println(new String(utf8Data));
}
if (!nowText.equals(preText)) { // print the partial result whenever the text changes
preText = nowText;
recText = nowText + "\n";
utf8Data = recText.getBytes(StandardCharsets.UTF_8);
System.out.println(new String(utf8Data));
}
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
/** capture loop: read data from the mic and decode it */
public void run() {
System.out.println("Started! Please speak...");
AudioFormat.Encoding encoding = AudioFormat.Encoding.PCM_SIGNED; // signed PCM
float rate = 16000.0f; // 16 kHz
int channels = 1; // mono
int sampleSize = 16; // 16-bit samples
boolean isBigEndian = false; // little endian
AudioFormat format =
new AudioFormat(
encoding, rate, sampleSize, channels, (sampleSize / 8) * channels, rate, isBigEndian);
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
// check that the system supports this data format
if (!AudioSystem.isLineSupported(info)) {
System.out.println(info + " not supported.");
return;
}
// open a line for capture.
try {
capline = (TargetDataLine) AudioSystem.getLine(info);
capline.open(format, capline.getBufferSize());
} catch (Exception ex) {
System.out.println(ex);
return;
}
// number of bytes to read from the mic on each iteration
int bufferLengthInBytes = capline.getBufferSize() / 8 * format.getFrameSize();
byte[] micData = new byte[bufferLengthInBytes];
int numBytesRead;
capline.start(); // start to capture mic data
while (thread != null) {
// read data from line
if ((numBytesRead = capline.read(micData, 0, bufferLengthInBytes)) == -1) {
break;
}
decodeSample(micData); // decode mic data
}
// stop and close
try {
if (capline != null) {
capline.stop();
capline.close();
capline = null;
}
} catch (Exception ex) {
System.err.println(ex);
}
}
} // end of inner class MicRcgThread
public static void main(String[] args) {
try {
String appDir = System.getProperty("user.dir");
System.out.println("appdir=" + appDir);
String cfgPath = appDir + "/modelconfig.cfg";
String soPath = appDir + "/../build/lib/libsherpa-onnx-jni.so";
OnlineRecognizer.setSoPath(soPath); // set the JNI shared library path for OnlineRecognizer
DecodeMic decodeEx = new DecodeMic();
decodeEx.initModelWithCfg(cfgPath); // init asr engine
decodeEx.open(); // open thread for mic
System.out.print("Press Enter to EXIT!\n");
System.in.read(); // block until the user presses Enter
decodeEx.close();
} catch (Exception e) {
System.err.println(e);
e.printStackTrace();
}
}
}