Fangjun Kuang
Committed by GitHub

Add Java and Kotlin API for sense voice (#1164)

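The new Java classes are exercised by the java-api-examples added below; condensed into one place, the intended usage looks roughly like the following sketch. The class name SenseVoiceQuickStart is only illustrative, and the model directory is the SenseVoice release package referenced throughout this commit.

// Illustrative sketch condensed from NonStreamingDecodeFileSenseVoice.java below;
// the model directory comes from the sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
// release package used by the examples in this commit.
import com.k2fsa.sherpa.onnx.*;

public class SenseVoiceQuickStart {
  public static void main(String[] args) {
    String dir = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17";

    OfflineSenseVoiceModelConfig senseVoice =
        OfflineSenseVoiceModelConfig.builder().setModel(dir + "/model.int8.onnx").build();

    OfflineModelConfig modelConfig =
        OfflineModelConfig.builder()
            .setSenseVoice(senseVoice)
            .setTokens(dir + "/tokens.txt")
            .build();

    OfflineRecognizerConfig config =
        OfflineRecognizerConfig.builder()
            .setOfflineModelConfig(modelConfig)
            .setDecodingMethod("greedy_search")
            .build();

    OfflineRecognizer recognizer = new OfflineRecognizer(config);
    WaveReader reader = new WaveReader(dir + "/test_wavs/zh.wav");

    OfflineStream stream = recognizer.createStream();
    stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
    recognizer.decode(stream);
    System.out.println(recognizer.getResult(stream).getText());

    stream.release();
    recognizer.release();
  }
}

The Kotlin side exposes the same options through the OfflineSenseVoiceModelConfig data class added to the Kotlin API further down, selected as model type 15 in the Kotlin test.
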
@@ -114,6 +114,16 @@ jobs:
           ./run-kws-from-file.sh
           rm -rf sherpa-onnx-*
 
+      - name: Run java test (VAD + Non-streaming SenseVoice)
+        shell: bash
+        run: |
+          cd ./java-api-examples
+          ./run-vad-non-streaming-sense-voice.sh
+          rm *.onnx
+          ls -lh *.wav
+          rm *.wav
+          rm -rf sherpa-onnx-*
+
       - name: Run java test (VAD + Non-streaming Paraformer)
         shell: bash
         run: |
@@ -193,6 +203,10 @@ jobs:
         shell: bash
         run: |
           cd ./java-api-examples
+
+          ./run-non-streaming-decode-file-sense-voice.sh
+          rm -rf sherpa-onnx-sense-voice-*
+
           ./run-inverse-text-normalization-paraformer.sh
 
           ./run-non-streaming-decode-file-paraformer.sh
+// Copyright 2024 Xiaomi Corporation
+
+// This file shows how to use an offline SenseVoice model,
+// i.e., non-streaming SenseVoice model,
+// to decode files.
+import com.k2fsa.sherpa.onnx.*;
+
+public class NonStreamingDecodeFileSenseVoice {
+  public static void main(String[] args) {
+    // please refer to
+    // https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
+    // to download model files
+    String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
+    String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
+
+    String waveFilename = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav";
+
+    WaveReader reader = new WaveReader(waveFilename);
+
+    OfflineSenseVoiceModelConfig senseVoice =
+        OfflineSenseVoiceModelConfig.builder().setModel(model).build();
+
+    OfflineModelConfig modelConfig =
+        OfflineModelConfig.builder()
+            .setSenseVoice(senseVoice)
+            .setTokens(tokens)
+            .setNumThreads(1)
+            .setDebug(true)
+            .build();
+
+    OfflineRecognizerConfig config =
+        OfflineRecognizerConfig.builder()
+            .setOfflineModelConfig(modelConfig)
+            .setDecodingMethod("greedy_search")
+            .build();
+
+    OfflineRecognizer recognizer = new OfflineRecognizer(config);
+    OfflineStream stream = recognizer.createStream();
+    stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
+
+    recognizer.decode(stream);
+
+    String text = recognizer.getResult(stream).getText();
+
+    System.out.printf("filename:%s\nresult:%s\n", waveFilename, text);
+
+    stream.release();
+    recognizer.release();
+  }
+}
@@ -18,6 +18,7 @@ This directory contains examples for the JAVA API of sherpa-onnx.
 
 ```bash
 ./run-non-streaming-decode-file-paraformer.sh
+./run-non-streaming-decode-file-sense-voice.sh
 ./run-non-streaming-decode-file-transducer.sh
 ./run-non-streaming-decode-file-whisper.sh
 ./run-non-streaming-decode-file-nemo.sh
@@ -64,6 +65,12 @@ The punctuation model supports both English and Chinese.
 ./run-vad-from-mic.sh
 ```
 
+## VAD with a microphone + Non-streaming SenseVoice for speech recognition
+
+```bash
+./run-vad-from-mic-non-streaming-sense-voice.sh
+```
+
 ## VAD with a microphone + Non-streaming Paraformer for speech recognition
 
 ```bash
@@ -82,6 +89,12 @@ The punctuation model supports both English and Chinese.
 ./run-vad-remove-slience.sh
 ```
 
+## VAD + Non-streaming SenseVoice for speech recognition
+
+```bash
+./run-vad-non-streaming-sense-voice.sh
+```
+
 ## VAD + Non-streaming Paraformer for speech recognition
 
 ```bash
+// Copyright 2024 Xiaomi Corporation
+
+// This file shows how to use a silero_vad model with a non-streaming
+// SenseVoice model for speech recognition.
+
+import com.k2fsa.sherpa.onnx.*;
+import javax.sound.sampled.*;
+
+public class VadFromMicWithNonStreamingSenseVoice {
+  private static final int sampleRate = 16000;
+  private static final int windowSize = 512;
+
+  public static Vad createVad() {
+    // please download ./silero_vad.onnx from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+    String model = "./silero_vad.onnx";
+    SileroVadModelConfig sileroVad =
+        SileroVadModelConfig.builder()
+            .setModel(model)
+            .setThreshold(0.5f)
+            .setMinSilenceDuration(0.25f)
+            .setMinSpeechDuration(0.5f)
+            .setWindowSize(windowSize)
+            .build();
+
+    VadModelConfig config =
+        VadModelConfig.builder()
+            .setSileroVadModelConfig(sileroVad)
+            .setSampleRate(sampleRate)
+            .setNumThreads(1)
+            .setDebug(true)
+            .setProvider("cpu")
+            .build();
+
+    return new Vad(config);
+  }
+
+  public static OfflineRecognizer createOfflineRecognizer() {
+    // please refer to
+    // https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
+    // to download model files
+    String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
+    String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
+
+    OfflineSenseVoiceModelConfig senseVoice =
+        OfflineSenseVoiceModelConfig.builder().setModel(model).build();
+
+    OfflineModelConfig modelConfig =
+        OfflineModelConfig.builder()
+            .setSenseVoice(senseVoice)
+            .setTokens(tokens)
+            .setNumThreads(1)
+            .setDebug(true)
+            .build();
+
+    OfflineRecognizerConfig config =
+        OfflineRecognizerConfig.builder()
+            .setOfflineModelConfig(modelConfig)
+            .setDecodingMethod("greedy_search")
+            .build();
+
+    return new OfflineRecognizer(config);
+  }
+
+  public static void main(String[] args) {
+    Vad vad = createVad();
+    OfflineRecognizer recognizer = createOfflineRecognizer();
+
+    // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
+    // Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
+    AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
+
+    // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
+    DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
+    TargetDataLine targetDataLine;
+    try {
+      targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
+      targetDataLine.open(format);
+      targetDataLine.start();
+    } catch (LineUnavailableException e) {
+      System.out.println("Failed to open target data line: " + e.getMessage());
+      vad.release();
+      recognizer.release();
+      return;
+    }
+
+    boolean printed = false;
+    byte[] buffer = new byte[windowSize * 2];
+    float[] samples = new float[windowSize];
+
+    System.out.println("Started. Please speak");
+    boolean running = true;
+    while (targetDataLine.isOpen() && running) {
+      int n = targetDataLine.read(buffer, 0, buffer.length);
+      if (n <= 0) {
+        System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
+        continue;
+      }
+      for (int i = 0; i != windowSize; ++i) {
+        short low = buffer[2 * i];
+        short high = buffer[2 * i + 1];
+        // Mask the low byte so its sign extension does not corrupt the combined
+        // little-endian 16-bit sample.
+        int s = (high << 8) + (low & 0xFF);
+        samples[i] = (float) s / 32768;
+      }
+
+      vad.acceptWaveform(samples);
+      if (vad.isSpeechDetected() && !printed) {
+        System.out.println("Detected speech");
+        printed = true;
+      }
+
+      if (!vad.isSpeechDetected()) {
+        printed = false;
+      }
+
+      while (!vad.empty()) {
+        SpeechSegment segment = vad.front();
+        float startTime = segment.getStart() / (float) sampleRate;
+        float duration = segment.getSamples().length / (float) sampleRate;
+
+        OfflineStream stream = recognizer.createStream();
+        stream.acceptWaveform(segment.getSamples(), sampleRate);
+        recognizer.decode(stream);
+        String text = recognizer.getResult(stream).getText();
+        stream.release();
+
+        if (!text.isEmpty()) {
+          System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
+        }
+
+        // Stop when the recognized text contains "退出程序" ("exit the program").
+        if (text.contains("退出程序")) {
+          running = false;
+        }
+
+        vad.pop();
+      }
+    }
+
+    vad.release();
+    recognizer.release();
+  }
+}
+// Copyright 2024 Xiaomi Corporation
+
+// This file shows how to use a silero_vad model with a non-streaming
+// SenseVoice model for speech recognition.
+
+import com.k2fsa.sherpa.onnx.*;
+import java.util.Arrays;
+
+public class VadNonStreamingSenseVoice {
+  public static Vad createVad() {
+    // please download ./silero_vad.onnx from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+    String model = "./silero_vad.onnx";
+    SileroVadModelConfig sileroVad =
+        SileroVadModelConfig.builder()
+            .setModel(model)
+            .setThreshold(0.5f)
+            .setMinSilenceDuration(0.25f)
+            .setMinSpeechDuration(0.5f)
+            .setWindowSize(512)
+            .build();
+
+    VadModelConfig config =
+        VadModelConfig.builder()
+            .setSileroVadModelConfig(sileroVad)
+            .setSampleRate(16000)
+            .setNumThreads(1)
+            .setDebug(true)
+            .setProvider("cpu")
+            .build();
+
+    return new Vad(config);
+  }
+
+  public static OfflineRecognizer createOfflineRecognizer() {
+    // please refer to
+    // https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
+    // to download model files
+    String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
+    String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
+
+    OfflineSenseVoiceModelConfig senseVoice =
+        OfflineSenseVoiceModelConfig.builder().setModel(model).build();
+
+    OfflineModelConfig modelConfig =
+        OfflineModelConfig.builder()
+            .setSenseVoice(senseVoice)
+            .setTokens(tokens)
+            .setNumThreads(1)
+            .setDebug(true)
+            .build();
+
+    OfflineRecognizerConfig config =
+        OfflineRecognizerConfig.builder()
+            .setOfflineModelConfig(modelConfig)
+            .setDecodingMethod("greedy_search")
+            .build();
+
+    return new OfflineRecognizer(config);
+  }
+
+  public static void main(String[] args) {
+    Vad vad = createVad();
+    OfflineRecognizer recognizer = createOfflineRecognizer();
+
+    // You can download the test file from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+    String testWaveFilename = "./lei-jun-test.wav";
+    WaveReader reader = new WaveReader(testWaveFilename);
+
+    int numSamples = reader.getSamples().length;
+    int numIter = numSamples / 512;
+
+    for (int i = 0; i != numIter; ++i) {
+      int start = i * 512;
+      int end = start + 512;
+      float[] samples = Arrays.copyOfRange(reader.getSamples(), start, end);
+      vad.acceptWaveform(samples);
+      if (vad.isSpeechDetected()) {
+        while (!vad.empty()) {
+          SpeechSegment segment = vad.front();
+          float startTime = segment.getStart() / 16000.0f;
+          float duration = segment.getSamples().length / 16000.0f;
+
+          OfflineStream stream = recognizer.createStream();
+          stream.acceptWaveform(segment.getSamples(), 16000);
+          recognizer.decode(stream);
+          String text = recognizer.getResult(stream).getText();
+          stream.release();
+
+          if (!text.isEmpty()) {
+            System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
+          }
+
+          vad.pop();
+        }
+      }
+    }
+
+    vad.flush();
+    while (!vad.empty()) {
+      SpeechSegment segment = vad.front();
+      float startTime = segment.getStart() / 16000.0f;
+      float duration = segment.getSamples().length / 16000.0f;
+
+      OfflineStream stream = recognizer.createStream();
+      stream.acceptWaveform(segment.getSamples(), 16000);
+      recognizer.decode(stream);
+      String text = recognizer.getResult(stream).getText();
+      stream.release();
+
+      if (!text.isEmpty()) {
+        System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
+      }
+
+      vad.pop();
+    }
+
+    vad.release();
+    recognizer.release();
+  }
+}
+#!/usr/bin/env bash
+
+set -ex
+
+if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
+  mkdir -p ../build
+  pushd ../build
+  cmake \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    -DSHERPA_ONNX_ENABLE_JNI=ON \
+    ..
+
+  make -j4
+  ls -lh lib
+  popd
+fi
+
+if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
+  pushd ../sherpa-onnx/java-api
+  make
+  popd
+fi
+
+if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+fi
+
+java \
+  -Djava.library.path=$PWD/../build/lib \
+  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
+  NonStreamingDecodeFileSenseVoice.java
+#!/usr/bin/env bash
+
+set -ex
+
+if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
+  mkdir -p ../build
+  pushd ../build
+  cmake \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    -DSHERPA_ONNX_ENABLE_JNI=ON \
+    ..
+
+  make -j4
+  ls -lh lib
+  popd
+fi
+
+if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
+  pushd ../sherpa-onnx/java-api
+  make
+  popd
+fi
+
+if [ ! -f ./silero_vad.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+fi
+
+java \
+  -Djava.library.path=$PWD/../build/lib \
+  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
+  ./VadFromMicWithNonStreamingSenseVoice.java
+#!/usr/bin/env bash
+
+set -ex
+
+if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
+  mkdir -p ../build
+  pushd ../build
+  cmake \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    -DSHERPA_ONNX_ENABLE_JNI=ON \
+    ..
+
+  make -j4
+  ls -lh lib
+  popd
+fi
+
+if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
+  pushd ../sherpa-onnx/java-api
+  make
+  popd
+fi
+
+if [ ! -f ./silero_vad.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+fi
+
+java \
+  -Djava.library.path=$PWD/../build/lib \
+  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
+  ./VadNonStreamingSenseVoice.java
@@ -167,6 +167,12 @@ function testSpokenLanguageIdentification() {
 }
 
 function testOfflineAsr() {
+  if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+    tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+    rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  fi
+
   if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
     curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
     tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
 package com.k2fsa.sherpa.onnx
 
 fun main() {
-    val types = arrayOf(0, 2, 5, 6)
+    val types = arrayOf(0, 2, 5, 6, 15)
     for (type in types) {
         test(type)
     }
@@ -15,6 +15,7 @@ fun test(type: Int) {
         2 -> "./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav"
         5 -> "./sherpa-onnx-zipformer-multi-zh-hans-2023-9-2/test_wavs/1.wav"
         6 -> "./sherpa-onnx-nemo-ctc-en-citrinet-512/test_wavs/8k.wav"
+        15 -> "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav"
         else -> null
     }
 
@@ -90,6 +90,23 @@ def get_models():
            """,
        ),
        Model(
+           model_name="sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
+           idx=15,
+           lang="zh_en_ko_ja_yue",
+           short_name="sense_voice",
+           cmd="""
+           pushd $model_name
+
+           rm -rfv test_wavs
+           rm -fv model.onnx
+           rm -fv *.py
+
+           ls -lh
+
+           popd
+           """,
+       ),
+       Model(
            model_name="sherpa-onnx-paraformer-zh-small-2024-03-09",
            idx=14,
            lang="zh",
@@ -27,6 +27,7 @@ java_files += OfflineTransducerModelConfig.java
 java_files += OfflineParaformerModelConfig.java
 java_files += OfflineWhisperModelConfig.java
 java_files += OfflineNemoEncDecCtcModelConfig.java
+java_files += OfflineSenseVoiceModelConfig.java
 java_files += OfflineModelConfig.java
 java_files += OfflineRecognizerConfig.java
 java_files += OfflineRecognizerResult.java
@@ -7,6 +7,7 @@ public class OfflineModelConfig {
   private final OfflineParaformerModelConfig paraformer;
   private final OfflineWhisperModelConfig whisper;
   private final OfflineNemoEncDecCtcModelConfig nemo;
+  private final OfflineSenseVoiceModelConfig senseVoice;
   private final String teleSpeech;
   private final String tokens;
   private final int numThreads;
@@ -22,6 +23,7 @@ public class OfflineModelConfig {
     this.paraformer = builder.paraformer;
     this.whisper = builder.whisper;
     this.nemo = builder.nemo;
+    this.senseVoice = builder.senseVoice;
     this.teleSpeech = builder.teleSpeech;
     this.tokens = builder.tokens;
     this.numThreads = builder.numThreads;
@@ -48,6 +50,10 @@ public class OfflineModelConfig {
     return whisper;
   }
 
+  public OfflineSenseVoiceModelConfig getSenseVoice() {
+    return senseVoice;
+  }
+
   public String getTokens() {
     return tokens;
   }
@@ -85,6 +91,7 @@ public class OfflineModelConfig {
     private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build();
     private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build();
     private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
+    private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build();
     private String teleSpeech = "";
     private String tokens = "";
     private int numThreads = 1;
@@ -113,7 +120,6 @@ public class OfflineModelConfig {
       return this;
     }
 
-
     public Builder setTeleSpeech(String teleSpeech) {
       this.teleSpeech = teleSpeech;
       return this;
@@ -124,6 +130,11 @@ public class OfflineModelConfig {
       return this;
     }
 
+    public Builder setSenseVoice(OfflineSenseVoiceModelConfig senseVoice) {
+      this.senseVoice = senseVoice;
+      return this;
+    }
+
     public Builder setTokens(String tokens) {
       this.tokens = tokens;
       return this;
+// Copyright 2024 Xiaomi Corporation
+
+package com.k2fsa.sherpa.onnx;
+
+public class OfflineSenseVoiceModelConfig {
+  private final String model;
+  private final String language;
+  private final boolean useInverseTextNormalization;
+
+  private OfflineSenseVoiceModelConfig(Builder builder) {
+    this.model = builder.model;
+    this.language = builder.language;
+    this.useInverseTextNormalization = builder.useInverseTextNormalization;
+  }
+
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  public String getModel() {
+    return model;
+  }
+
+  public String getLanguage() {
+    return language;
+  }
+
+  public boolean getUseInverseTextNormalization() {
+    return useInverseTextNormalization;
+  }
+
+  public static class Builder {
+    private String model = "";
+    private String language = "";
+    private boolean useInverseTextNormalization = true;
+
+    public OfflineSenseVoiceModelConfig build() {
+      return new OfflineSenseVoiceModelConfig(this);
+    }
+
+    public Builder setModel(String model) {
+      this.model = model;
+      return this;
+    }
+
+    public Builder setLanguage(String language) {
+      this.language = language;
+      return this;
+    }
+
+    public Builder setInverseTextNormalization(boolean useInverseTextNormalization) {
+      this.useInverseTextNormalization = useInverseTextNormalization;
+      return this;
+    }
+  }
+}
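
For reference, the optional fields above can also be set through the same builder; the examples in this commit only call setModel(), so the values below, in particular the "auto" language, are illustrative assumptions rather than anything this commit requires:

// Sketch only: setLanguage() and setInverseTextNormalization() are optional.
// "auto" is an assumed value here; an empty language string keeps the default
// behavior used by the bundled examples.
OfflineSenseVoiceModelConfig senseVoice =
    OfflineSenseVoiceModelConfig.builder()
        .setModel("./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx")
        .setLanguage("auto")
        .setInverseTextNormalization(true)
        .build();
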
@@ -171,6 +171,31 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
   ans.model_config.whisper.tail_paddings =
       env->GetIntField(whisper_config, fid);
 
+  // sense voice
+  fid = env->GetFieldID(model_config_cls, "senseVoice",
+                        "Lcom/k2fsa/sherpa/onnx/OfflineSenseVoiceModelConfig;");
+  jobject sense_voice_config = env->GetObjectField(model_config, fid);
+  jclass sense_voice_config_cls = env->GetObjectClass(sense_voice_config);
+
+  fid = env->GetFieldID(sense_voice_config_cls, "model", "Ljava/lang/String;");
+  s = (jstring)env->GetObjectField(sense_voice_config, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.model_config.sense_voice.model = p;
+  env->ReleaseStringUTFChars(s, p);
+
+  fid =
+      env->GetFieldID(sense_voice_config_cls, "language", "Ljava/lang/String;");
+  s = (jstring)env->GetObjectField(sense_voice_config, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.model_config.sense_voice.language = p;
+  env->ReleaseStringUTFChars(s, p);
+
+  fid = env->GetFieldID(sense_voice_config_cls, "useInverseTextNormalization",
+                        "Z");
+  ans.model_config.sense_voice.use_itn =
+      env->GetBooleanField(sense_voice_config, fid);
+
+  // nemo
   fid = env->GetFieldID(
       model_config_cls, "nemo",
       "Lcom/k2fsa/sherpa/onnx/OfflineNemoEncDecCtcModelConfig;");
@@ -30,11 +30,18 @@ data class OfflineWhisperModelConfig(
     var tailPaddings: Int = 1000, // Padding added at the end of the samples
 )
 
+data class OfflineSenseVoiceModelConfig(
+    var model: String = "",
+    var language: String = "",
+    var useInverseTextNormalization: Boolean = true,
+)
+
 data class OfflineModelConfig(
     var transducer: OfflineTransducerModelConfig = OfflineTransducerModelConfig(),
     var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(),
     var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(),
     var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
+    var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(),
     var teleSpeech: String = "",
     var numThreads: Int = 1,
     var debug: Boolean = false,
@@ -321,6 +328,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
                 modelType = "paraformer",
             )
         }
+
+        15 -> {
+            val modelDir = "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17"
+            return OfflineModelConfig(
+                senseVoice = OfflineSenseVoiceModelConfig(
+                    model = "$modelDir/model.int8.onnx",
+                ),
+                tokens = "$modelDir/tokens.txt",
+            )
+        }
     }
     return null
 }