Committed by
GitHub
Support non-streaming zipformer CTC ASR models (#2340)
This PR adds support for non-streaming Zipformer CTC ASR models across multiple language bindings, WebAssembly, examples, and CI workflows. - Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs - Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js - Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models Model doc is available at https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
正在显示
71 个修改的文件
包含
2121 行增加
和
68 行删除
| @@ -6,6 +6,10 @@ cd dart-api-examples | @@ -6,6 +6,10 @@ cd dart-api-examples | ||
| 6 | 6 | ||
| 7 | pushd non-streaming-asr | 7 | pushd non-streaming-asr |
| 8 | 8 | ||
| 9 | +echo '----------Zipformer CTC----------' | ||
| 10 | +./run-zipformer-ctc.sh | ||
| 11 | +rm -rf sherpa-onnx-* | ||
| 12 | + | ||
| 9 | echo '----------SenseVoice----------' | 13 | echo '----------SenseVoice----------' |
| 10 | ./run-sense-voice-with-hr.sh | 14 | ./run-sense-voice-with-hr.sh |
| 11 | ./run-sense-voice.sh | 15 | ./run-sense-voice.sh |
| @@ -114,6 +118,10 @@ popd | @@ -114,6 +118,10 @@ popd | ||
| 114 | 118 | ||
| 115 | pushd vad-with-non-streaming-asr | 119 | pushd vad-with-non-streaming-asr |
| 116 | 120 | ||
| 121 | +echo '----------Zipformer CTC----------' | ||
| 122 | +./run-zipformer-ctc.sh | ||
| 123 | +rm -rf sherpa-onnx-* | ||
| 124 | + | ||
| 117 | echo '----------Dolphin CTC----------' | 125 | echo '----------Dolphin CTC----------' |
| 118 | ./run-dolphin-ctc.sh | 126 | ./run-dolphin-ctc.sh |
| 119 | rm -rf sherpa-onnx-* | 127 | rm -rf sherpa-onnx-* |
| @@ -6,43 +6,11 @@ cd ./version-test | @@ -6,43 +6,11 @@ cd ./version-test | ||
| 6 | ./run.sh | 6 | ./run.sh |
| 7 | ls -lh | 7 | ls -lh |
| 8 | 8 | ||
| 9 | -cd ../speech-enhancement-gtcrn | ||
| 10 | -./run.sh | ||
| 11 | -ls -lh | ||
| 12 | - | ||
| 13 | -cd ../kokoro-tts | ||
| 14 | -./run-kokoro.sh | ||
| 15 | -ls -lh | ||
| 16 | - | ||
| 17 | -cd ../offline-tts | ||
| 18 | -./run-matcha-zh.sh | ||
| 19 | -ls -lh *.wav | ||
| 20 | -./run-matcha-en.sh | ||
| 21 | -ls -lh *.wav | ||
| 22 | -./run-aishell3.sh | ||
| 23 | -ls -lh *.wav | ||
| 24 | -./run-piper.sh | ||
| 25 | -ls -lh *.wav | ||
| 26 | -./run-hf-fanchen.sh | ||
| 27 | -ls -lh *.wav | ||
| 28 | -ls -lh | ||
| 29 | - | ||
| 30 | -pushd ../.. | ||
| 31 | - | ||
| 32 | -mkdir tts | ||
| 33 | - | ||
| 34 | -cp -v dotnet-examples/kokoro-tts/*.wav ./tts | ||
| 35 | -cp -v dotnet-examples/offline-tts/*.wav ./tts | ||
| 36 | -popd | ||
| 37 | - | ||
| 38 | -cd ../offline-speaker-diarization | ||
| 39 | -./run.sh | ||
| 40 | -rm -rfv *.onnx | ||
| 41 | -rm -fv *.wav | ||
| 42 | -rm -rfv sherpa-onnx-pyannote-* | ||
| 43 | - | ||
| 44 | cd ../offline-decode-files | 9 | cd ../offline-decode-files |
| 45 | 10 | ||
| 11 | +./run-zipformer-ctc.sh | ||
| 12 | +rm -rf sherpa-onnx-* | ||
| 13 | + | ||
| 46 | ./run-dolphin-ctc.sh | 14 | ./run-dolphin-ctc.sh |
| 47 | rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 | 15 | rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 |
| 48 | 16 | ||
| @@ -82,6 +50,41 @@ rm -rf sherpa-onnx-* | @@ -82,6 +50,41 @@ rm -rf sherpa-onnx-* | ||
| 82 | ./run-tdnn-yesno.sh | 50 | ./run-tdnn-yesno.sh |
| 83 | rm -rf sherpa-onnx-* | 51 | rm -rf sherpa-onnx-* |
| 84 | 52 | ||
| 53 | +cd ../speech-enhancement-gtcrn | ||
| 54 | +./run.sh | ||
| 55 | +ls -lh | ||
| 56 | + | ||
| 57 | +cd ../kokoro-tts | ||
| 58 | +./run-kokoro.sh | ||
| 59 | +ls -lh | ||
| 60 | + | ||
| 61 | +cd ../offline-tts | ||
| 62 | +./run-matcha-zh.sh | ||
| 63 | +ls -lh *.wav | ||
| 64 | +./run-matcha-en.sh | ||
| 65 | +ls -lh *.wav | ||
| 66 | +./run-aishell3.sh | ||
| 67 | +ls -lh *.wav | ||
| 68 | +./run-piper.sh | ||
| 69 | +ls -lh *.wav | ||
| 70 | +./run-hf-fanchen.sh | ||
| 71 | +ls -lh *.wav | ||
| 72 | +ls -lh | ||
| 73 | + | ||
| 74 | +pushd ../.. | ||
| 75 | + | ||
| 76 | +mkdir tts | ||
| 77 | + | ||
| 78 | +cp -v dotnet-examples/kokoro-tts/*.wav ./tts | ||
| 79 | +cp -v dotnet-examples/offline-tts/*.wav ./tts | ||
| 80 | +popd | ||
| 81 | + | ||
| 82 | +cd ../offline-speaker-diarization | ||
| 83 | +./run.sh | ||
| 84 | +rm -rfv *.onnx | ||
| 85 | +rm -fv *.wav | ||
| 86 | +rm -rfv sherpa-onnx-pyannote-* | ||
| 87 | + | ||
| 85 | cd ../keyword-spotting-from-files | 88 | cd ../keyword-spotting-from-files |
| 86 | ./run.sh | 89 | ./run.sh |
| 87 | 90 | ||
| @@ -115,5 +118,3 @@ rm -rf sherpa-onnx-* | @@ -115,5 +118,3 @@ rm -rf sherpa-onnx-* | ||
| 115 | cd ../spoken-language-identification | 118 | cd ../spoken-language-identification |
| 116 | ./run.sh | 119 | ./run.sh |
| 117 | rm -rf sherpa-onnx-* | 120 | rm -rf sherpa-onnx-* |
| 118 | - | ||
| 119 | - |
| @@ -10,6 +10,15 @@ arch=$(node -p "require('os').arch()") | @@ -10,6 +10,15 @@ arch=$(node -p "require('os').arch()") | ||
| 10 | platform=$(node -p "require('os').platform()") | 10 | platform=$(node -p "require('os').platform()") |
| 11 | node_version=$(node -p "process.versions.node.split('.')[0]") | 11 | node_version=$(node -p "process.versions.node.split('.')[0]") |
| 12 | 12 | ||
| 13 | +echo "----------non-streaming ASR Zipformer CTC----------" | ||
| 14 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 15 | + | ||
| 16 | +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 17 | +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 18 | + | ||
| 19 | +node ./test_asr_non_streaming_zipformer_ctc.js | ||
| 20 | +rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 | ||
| 21 | + | ||
| 13 | echo "----------non-streaming ASR NeMo parakeet tdt----------" | 22 | echo "----------non-streaming ASR NeMo parakeet tdt----------" |
| 14 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 | 23 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 |
| 15 | tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 | 24 | tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 |
| @@ -9,6 +9,15 @@ git status | @@ -9,6 +9,15 @@ git status | ||
| 9 | ls -lh | 9 | ls -lh |
| 10 | ls -lh node_modules | 10 | ls -lh node_modules |
| 11 | 11 | ||
| 12 | +# asr with offline zipformer ctc | ||
| 13 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 14 | + | ||
| 15 | +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 16 | +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 17 | + | ||
| 18 | +node ./test-offline-zipformer-ctc.js | ||
| 19 | +rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 | ||
| 20 | + | ||
| 12 | # asr with offline dolphin ctc | 21 | # asr with offline dolphin ctc |
| 13 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 | 22 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 |
| 14 | tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 | 23 | tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 |
| @@ -9,6 +9,9 @@ ls -lh | @@ -9,6 +9,9 @@ ls -lh | ||
| 9 | 9 | ||
| 10 | ./run-test-version.sh | 10 | ./run-test-version.sh |
| 11 | 11 | ||
| 12 | +./run-zipformer-ctc-asr.sh | ||
| 13 | +rm -rf sherpa-onnx-zipformer-* | ||
| 14 | + | ||
| 12 | ./run-decode-file-sense-voice-with-hr.sh | 15 | ./run-decode-file-sense-voice-with-hr.sh |
| 13 | rm -rf sherpa-onnx-sense-voice-* | 16 | rm -rf sherpa-onnx-sense-voice-* |
| 14 | rm -rf dict lexicon.txt replace.fst test-hr.wav | 17 | rm -rf dict lexicon.txt replace.fst test-hr.wav |
| @@ -89,6 +89,7 @@ jobs: | @@ -89,6 +89,7 @@ jobs: | ||
| 89 | make -j4 install | 89 | make -j4 install |
| 90 | 90 | ||
| 91 | cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin | 91 | cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin |
| 92 | + cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin | ||
| 92 | 93 | ||
| 93 | rm -rf install/lib/pkgconfig | 94 | rm -rf install/lib/pkgconfig |
| 94 | rm -fv install/lib/cargs.h | 95 | rm -fv install/lib/cargs.h |
| @@ -135,6 +136,7 @@ jobs: | @@ -135,6 +136,7 @@ jobs: | ||
| 135 | make -j4 install | 136 | make -j4 install |
| 136 | 137 | ||
| 137 | cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin | 138 | cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin |
| 139 | + cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin | ||
| 138 | 140 | ||
| 139 | rm -rf install/lib/pkgconfig | 141 | rm -rf install/lib/pkgconfig |
| 140 | rm -fv install/lib/cargs.h | 142 | rm -fv install/lib/cargs.h |
| @@ -90,6 +90,7 @@ jobs: | @@ -90,6 +90,7 @@ jobs: | ||
| 90 | make install | 90 | make install |
| 91 | 91 | ||
| 92 | cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin | 92 | cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin |
| 93 | + cp bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin | ||
| 93 | 94 | ||
| 94 | ls -lh install/lib | 95 | ls -lh install/lib |
| 95 | 96 |
| @@ -37,7 +37,7 @@ jobs: | @@ -37,7 +37,7 @@ jobs: | ||
| 37 | strategy: | 37 | strategy: |
| 38 | fail-fast: false | 38 | fail-fast: false |
| 39 | matrix: | 39 | matrix: |
| 40 | - os: [ubuntu-latest, macos-latest, macos-13, windows-latest] | 40 | + os: [ubuntu-latest, macos-latest, macos-13, windows-latest, ubuntu-22.04-arm] |
| 41 | 41 | ||
| 42 | steps: | 42 | steps: |
| 43 | - uses: actions/checkout@v4 | 43 | - uses: actions/checkout@v4 |
| @@ -56,7 +56,7 @@ jobs: | @@ -56,7 +56,7 @@ jobs: | ||
| 56 | key: ${{ matrix.os }} | 56 | key: ${{ matrix.os }} |
| 57 | 57 | ||
| 58 | - name: Install Free pascal compiler (ubuntu) | 58 | - name: Install Free pascal compiler (ubuntu) |
| 59 | - if: matrix.os == 'ubuntu-latest' | 59 | + if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-22.04-arm' |
| 60 | shell: bash | 60 | shell: bash |
| 61 | run: | | 61 | run: | |
| 62 | sudo apt-get update | 62 | sudo apt-get update |
| @@ -156,6 +156,10 @@ jobs: | @@ -156,6 +156,10 @@ jobs: | ||
| 156 | 156 | ||
| 157 | pushd non-streaming-asr | 157 | pushd non-streaming-asr |
| 158 | 158 | ||
| 159 | + ./run-zipformer-ctc.sh | ||
| 160 | + rm -rf sherpa-onnx-* | ||
| 161 | + echo "---" | ||
| 162 | + | ||
| 159 | ./run-dolphin-ctc.sh | 163 | ./run-dolphin-ctc.sh |
| 160 | rm -rf sherpa-onnx-* | 164 | rm -rf sherpa-onnx-* |
| 161 | echo "---" | 165 | echo "---" |
| @@ -264,9 +268,12 @@ jobs: | @@ -264,9 +268,12 @@ jobs: | ||
| 264 | 268 | ||
| 265 | cd ./pascal-api-examples | 269 | cd ./pascal-api-examples |
| 266 | 270 | ||
| 267 | - | ||
| 268 | pushd vad-with-non-streaming-asr | 271 | pushd vad-with-non-streaming-asr |
| 269 | 272 | ||
| 273 | + time ./run-vad-with-zipformer-ctc.sh | ||
| 274 | + rm -rf sherpa-onnx-* | ||
| 275 | + echo "---" | ||
| 276 | + | ||
| 270 | time ./run-vad-with-dolphin-ctc.sh | 277 | time ./run-vad-with-dolphin-ctc.sh |
| 271 | rm -rf sherpa-onnx-* | 278 | rm -rf sherpa-onnx-* |
| 272 | echo "---" | 279 | echo "---" |
| @@ -165,6 +165,9 @@ jobs: | @@ -165,6 +165,9 @@ jobs: | ||
| 165 | run: | | 165 | run: | |
| 166 | cd ./java-api-examples | 166 | cd ./java-api-examples |
| 167 | 167 | ||
| 168 | + ./run-non-streaming-decode-file-zipformer-ctc.sh | ||
| 169 | + rm -rf sherpa-onnx-zipformer-ctc-* | ||
| 170 | + | ||
| 168 | ./run-non-streaming-decode-file-dolphin-ctc.sh | 171 | ./run-non-streaming-decode-file-dolphin-ctc.sh |
| 169 | rm -rf sherpa-onnx-dolphin-* | 172 | rm -rf sherpa-onnx-dolphin-* |
| 170 | 173 |
| @@ -184,6 +184,10 @@ jobs: | @@ -184,6 +184,10 @@ jobs: | ||
| 184 | go build | 184 | go build |
| 185 | ls -lh | 185 | ls -lh |
| 186 | 186 | ||
| 187 | + echo "Test Zipformer CTC" | ||
| 188 | + ./run-zipformer-ctc.sh | ||
| 189 | + rm -rf sherpa-onnx-zipformer-* | ||
| 190 | + | ||
| 187 | echo "Test SenseVoice ctc" | 191 | echo "Test SenseVoice ctc" |
| 188 | ./run-sense-voice-small-with-hr.sh | 192 | ./run-sense-voice-small-with-hr.sh |
| 189 | ./run-sense-voice-small.sh | 193 | ./run-sense-voice-small.sh |
| @@ -19,12 +19,36 @@ jobs: | @@ -19,12 +19,36 @@ jobs: | ||
| 19 | fail-fast: false | 19 | fail-fast: false |
| 20 | matrix: | 20 | matrix: |
| 21 | os: [ubuntu-latest] | 21 | os: [ubuntu-latest] |
| 22 | - python-version: ["3.8"] | 22 | + python-version: ["3.10"] |
| 23 | 23 | ||
| 24 | steps: | 24 | steps: |
| 25 | - uses: actions/checkout@v4 | 25 | - uses: actions/checkout@v4 |
| 26 | 26 | ||
| 27 | + - name: Zipformer CTC (non-streaming) | ||
| 28 | + shell: bash | ||
| 29 | + run: | | ||
| 30 | + git lfs install | ||
| 31 | + names=( | ||
| 32 | + sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 | ||
| 33 | + sherpa-onnx-zipformer-ctc-zh-2025-07-03 | ||
| 34 | + sherpa-onnx-zipformer-ctc-zh-fp16-2025-07-03 | ||
| 35 | + ) | ||
| 36 | + for name in ${names[@]}; do | ||
| 37 | + git clone https://huggingface.co/csukuangfj/$name | ||
| 38 | + pushd $name | ||
| 39 | + git lfs pull | ||
| 40 | + rm -rf .git | ||
| 41 | + rm -rfv .gitattributes | ||
| 42 | + ls -lh | ||
| 43 | + popd | ||
| 44 | + | ||
| 45 | + tar cjfv $name.tar.bz2 $name | ||
| 46 | + rm -rf $name | ||
| 47 | + ls -lh *.tar.bz2 | ||
| 48 | + done | ||
| 49 | + | ||
| 27 | - name: Vietnamese (zipformer) | 50 | - name: Vietnamese (zipformer) |
| 51 | + if: false | ||
| 28 | shell: bash | 52 | shell: bash |
| 29 | run: | | 53 | run: | |
| 30 | rm -rf models | 54 | rm -rf models |
| @@ -76,6 +100,7 @@ jobs: | @@ -76,6 +100,7 @@ jobs: | ||
| 76 | mv models/* . | 100 | mv models/* . |
| 77 | 101 | ||
| 78 | - name: Publish to huggingface (Vietnamese zipformer) | 102 | - name: Publish to huggingface (Vietnamese zipformer) |
| 103 | + if: false | ||
| 79 | env: | 104 | env: |
| 80 | HF_TOKEN: ${{ secrets.HF_TOKEN }} | 105 | HF_TOKEN: ${{ secrets.HF_TOKEN }} |
| 81 | uses: nick-fields/retry@v3 | 106 | uses: nick-fields/retry@v3 |
| @@ -114,6 +114,7 @@ We also have spaces built using WebAssembly. They are listed below: | @@ -114,6 +114,7 @@ We also have spaces built using WebAssembly. They are listed below: | ||
| 114 | |Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]| | 114 | |Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]| |
| 115 | |Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]| | 115 | |Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]| |
| 116 | |Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]| | 116 | |Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]| |
| 117 | +|VAD + speech recognition (Chinese) with [Zipformer CTC](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|[Click me][wasm-hf-vad-asr-zh-zipformer-ctc-07-03]| [地址][wasm-ms-vad-asr-zh-zipformer-ctc-07-03]| | ||
| 117 | |VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]| | 118 | |VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]| |
| 118 | |VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]| | 119 | |VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]| |
| 119 | |VAD + speech recognition (English) with [Moonshine tiny][Moonshine tiny]|[Click me][wasm-hf-vad-asr-en-moonshine-tiny-en]| [地址][wasm-ms-vad-asr-en-moonshine-tiny-en]| | 120 | |VAD + speech recognition (English) with [Moonshine tiny][Moonshine tiny]|[Click me][wasm-hf-vad-asr-en-moonshine-tiny-en]| [地址][wasm-ms-vad-asr-en-moonshine-tiny-en]| |
| @@ -141,6 +142,7 @@ We also have spaces built using WebAssembly. They are listed below: | @@ -141,6 +142,7 @@ We also have spaces built using WebAssembly. They are listed below: | ||
| 141 | |----------------------------------------|------------------------------------|-----------------------------------| | 142 | |----------------------------------------|------------------------------------|-----------------------------------| |
| 142 | | Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]| | 143 | | Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]| |
| 143 | | Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] | | 144 | | Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] | |
| 145 | +| Simulated-streaming speech recognition | [Address][apk-simula-streaming-asr]| [点此][apk-simula-streaming-asr-cn]| | ||
| 144 | | Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | | 146 | | Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | |
| 145 | | Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | | 147 | | Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | |
| 146 | | VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | | 148 | | VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | |
| @@ -250,8 +252,10 @@ for more models. The following table lists only **SOME** of them. | @@ -250,8 +252,10 @@ for more models. The following table lists only **SOME** of them. | ||
| 250 | 252 | ||
| 251 | |Name | Supported Languages| Description| | 253 | |Name | Supported Languages| Description| |
| 252 | |-----|-----|----| | 254 | |-----|-----|----| |
| 255 | +|[sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english)| English | It is converted from <https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2>| | ||
| 253 | |[Whisper tiny.en](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2)|English| See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html)| | 256 | |[Whisper tiny.en](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2)|English| See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html)| |
| 254 | |[Moonshine tiny][Moonshine tiny]|English|See [also](https://github.com/usefulsensors/moonshine)| | 257 | |[Moonshine tiny][Moonshine tiny]|English|See [also](https://github.com/usefulsensors/moonshine)| |
| 258 | +|[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|Chinese| A Zipformer CTC model| | ||
| 255 | |[sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17][sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17]|Chinese, Cantonese, English, Korean, Japanese| 支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html)| | 259 | |[sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17][sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17]|Chinese, Cantonese, English, Korean, Japanese| 支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html)| |
| 256 | |[sherpa-onnx-paraformer-zh-2024-03-09][sherpa-onnx-paraformer-zh-2024-03-09]|Chinese, English| 也支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2024-03-09-chinese-english)| | 260 | |[sherpa-onnx-paraformer-zh-2024-03-09][sherpa-onnx-paraformer-zh-2024-03-09]|Chinese, English| 也支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2024-03-09-chinese-english)| |
| 257 | |[sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01][sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01]|Japanese|See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-japanese)| | 261 | |[sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01][sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01]|Japanese|See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-japanese)| |
| @@ -413,6 +417,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. | @@ -413,6 +417,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. | ||
| 413 | [wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en | 417 | [wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en |
| 414 | [wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en | 418 | [wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en |
| 415 | [SenseVoice]: https://github.com/FunAudioLLM/SenseVoice | 419 | [SenseVoice]: https://github.com/FunAudioLLM/SenseVoice |
| 420 | +[wasm-hf-vad-asr-zh-zipformer-ctc-07-03]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc | ||
| 421 | +[wasm-ms-vad-asr-zh-zipformer-ctc-07-03]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc/summary | ||
| 416 | [wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice | 422 | [wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice |
| 417 | [wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice | 423 | [wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice |
| 418 | [wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny | 424 | [wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny |
| @@ -423,20 +429,20 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. | @@ -423,20 +429,20 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. | ||
| 423 | [wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech | 429 | [wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech |
| 424 | [wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech | 430 | [wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech |
| 425 | [wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech | 431 | [wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech |
| 426 | -[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf | 432 | +[reazonspeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf |
| 427 | [wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer | 433 | [wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer |
| 428 | [wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer | 434 | [wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer |
| 429 | -[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2 | 435 | +[gigaspeech2]: https://github.com/speechcolab/gigaspeech2 |
| 430 | [wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer | 436 | [wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer |
| 431 | [wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer | 437 | [wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer |
| 432 | -[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR | 438 | +[telespeech-asr]: https://github.com/tele-ai/telespeech-asr |
| 433 | [wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech | 439 | [wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech |
| 434 | [wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech | 440 | [wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech |
| 435 | [wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer | 441 | [wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer |
| 436 | [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer | 442 | [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer |
| 437 | [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small | 443 | [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small |
| 438 | [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small | 444 | [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small |
| 439 | -[Dolphin]: https://github.com/DataoceanAI/Dolphin | 445 | +[dolphin]: https://github.com/dataoceanai/dolphin |
| 440 | [wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc | 446 | [wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc |
| 441 | [wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc | 447 | [wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc |
| 442 | 448 | ||
| @@ -450,6 +456,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. | @@ -450,6 +456,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. | ||
| 450 | [apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html | 456 | [apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html |
| 451 | [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html | 457 | [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html |
| 452 | [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html | 458 | [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html |
| 459 | +[apk-simula-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr.html | ||
| 460 | +[apk-simula-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr-cn.html | ||
| 453 | [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html | 461 | [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html |
| 454 | [apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html | 462 | [apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html |
| 455 | [apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html | 463 | [apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html |
| @@ -45,6 +45,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | @@ -45,6 +45,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | ||
| 45 | sherpa-onnx-cxx-api | 45 | sherpa-onnx-cxx-api |
| 46 | portaudio_static | 46 | portaudio_static |
| 47 | ) | 47 | ) |
| 48 | + | ||
| 49 | + add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api | ||
| 50 | + ./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc | ||
| 51 | + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc | ||
| 52 | + ) | ||
| 53 | + target_link_libraries(zipformer-ctc-simulate-streaming-microphone-cxx-api | ||
| 54 | + sherpa-onnx-cxx-api | ||
| 55 | + portaudio_static | ||
| 56 | + ) | ||
| 48 | endif() | 57 | endif() |
| 49 | 58 | ||
| 50 | if(SHERPA_ONNX_HAS_ALSA) | 59 | if(SHERPA_ONNX_HAS_ALSA) |
| @@ -57,10 +66,21 @@ if(SHERPA_ONNX_HAS_ALSA) | @@ -57,10 +66,21 @@ if(SHERPA_ONNX_HAS_ALSA) | ||
| 57 | portaudio_static | 66 | portaudio_static |
| 58 | ) | 67 | ) |
| 59 | 68 | ||
| 69 | + add_executable(zipformer-ctc-simulate-streaming-alsa-cxx-api | ||
| 70 | + ./zipformer-ctc-simulate-streaming-alsa-cxx-api.cc | ||
| 71 | + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc | ||
| 72 | + ) | ||
| 73 | + target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api | ||
| 74 | + sherpa-onnx-cxx-api | ||
| 75 | + portaudio_static | ||
| 76 | + ) | ||
| 77 | + | ||
| 60 | if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) | 78 | if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) |
| 61 | target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) | 79 | target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) |
| 80 | + target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) | ||
| 62 | else() | 81 | else() |
| 63 | target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound) | 82 | target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound) |
| 83 | + target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api asound) | ||
| 64 | endif() | 84 | endif() |
| 65 | endif() | 85 | endif() |
| 66 | 86 |
| 1 | +// cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc | ||
| 2 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 3 | + | ||
| 4 | +// | ||
| 5 | +// This file demonstrates how to use zipformer CTC with sherpa-onnx's C++ API | ||
| 6 | +// for streaming speech recognition from a microphone. | ||
| 7 | +// | ||
| 8 | +// clang-format off | ||
| 9 | +// | ||
| 10 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 11 | +// | ||
| 12 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 13 | +// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 14 | +// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 15 | +// | ||
| 16 | +// clang-format on | ||
| 17 | + | ||
| 18 | +#include <signal.h> | ||
| 19 | +#include <stdio.h> | ||
| 20 | +#include <stdlib.h> | ||
| 21 | + | ||
| 22 | +#include <chrono> // NOLINT | ||
| 23 | +#include <condition_variable> // NOLINT | ||
| 24 | +#include <iostream> | ||
| 25 | +#include <mutex> // NOLINT | ||
| 26 | +#include <queue> | ||
| 27 | +#include <thread> // NOLINT | ||
| 28 | +#include <vector> | ||
| 29 | + | ||
| 30 | +#include "sherpa-display.h" // NOLINT | ||
| 31 | +#include "sherpa-onnx/c-api/cxx-api.h" | ||
| 32 | +#include "sherpa-onnx/csrc/alsa.h" | ||
| 33 | + | ||
| 34 | +std::queue<std::vector<float>> samples_queue; | ||
| 35 | +std::condition_variable condition_variable; | ||
| 36 | +std::mutex mutex; | ||
| 37 | +bool stop = false; | ||
| 38 | + | ||
| 39 | +static void Handler(int32_t /*sig*/) { | ||
| 40 | + stop = true; | ||
| 41 | + condition_variable.notify_one(); | ||
| 42 | + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); | ||
| 43 | +} | ||
| 44 | + | ||
| 45 | +static void RecordCallback(sherpa_onnx::Alsa *alsa) { | ||
| 46 | + int32_t chunk = 0.1 * alsa->GetActualSampleRate(); | ||
| 47 | + while (!stop) { | ||
| 48 | + std::vector<float> samples = alsa->Read(chunk); | ||
| 49 | + | ||
| 50 | + std::lock_guard<std::mutex> lock(mutex); | ||
| 51 | + samples_queue.emplace(std::move(samples)); | ||
| 52 | + condition_variable.notify_one(); | ||
| 53 | + } | ||
| 54 | +} | ||
| 55 | + | ||
| 56 | +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() { | ||
| 57 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 58 | + VadModelConfig config; | ||
| 59 | + config.silero_vad.model = "./silero_vad.onnx"; | ||
| 60 | + config.silero_vad.threshold = 0.5; | ||
| 61 | + config.silero_vad.min_silence_duration = 0.1; | ||
| 62 | + config.silero_vad.min_speech_duration = 0.25; | ||
| 63 | + config.silero_vad.max_speech_duration = 8; | ||
| 64 | + config.sample_rate = 16000; | ||
| 65 | + config.debug = false; | ||
| 66 | + | ||
| 67 | + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20); | ||
| 68 | + if (!vad.Get()) { | ||
| 69 | + std::cerr << "Failed to create VAD. Please check your config\n"; | ||
| 70 | + exit(-1); | ||
| 71 | + } | ||
| 72 | + | ||
| 73 | + return vad; | ||
| 74 | +} | ||
| 75 | + | ||
| 76 | +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() { | ||
| 77 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 78 | + OfflineRecognizerConfig config; | ||
| 79 | + | ||
| 80 | + config.model_config.zipformer_ctc.model = | ||
| 81 | + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"; | ||
| 82 | + config.model_config.tokens = | ||
| 83 | + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"; | ||
| 84 | + | ||
| 85 | + config.model_config.num_threads = 2; | ||
| 86 | + config.model_config.debug = false; | ||
| 87 | + | ||
| 88 | + std::cout << "Loading model\n"; | ||
| 89 | + OfflineRecognizer recognizer = OfflineRecognizer::Create(config); | ||
| 90 | + if (!recognizer.Get()) { | ||
| 91 | + std::cerr << "Please check your config\n"; | ||
| 92 | + exit(-1); | ||
| 93 | + } | ||
| 94 | + std::cout << "Loading model done\n"; | ||
| 95 | + return recognizer; | ||
| 96 | +} | ||
| 97 | + | ||
| 98 | +int32_t main(int32_t argc, const char *argv[]) { | ||
| 99 | + const char *kUsageMessage = R"usage( | ||
| 100 | +Usage: | ||
| 101 | + | ||
| 102 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 103 | + | ||
| 104 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 105 | +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 106 | +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 107 | + | ||
| 108 | +./zipformer-ctc-simulate-streaming-alsa-cxx-api device_name | ||
| 109 | + | ||
| 110 | +The device name specifies which microphone to use in case there are several | ||
| 111 | +on your system. You can use | ||
| 112 | + | ||
| 113 | + arecord -l | ||
| 114 | + | ||
| 115 | +to find all available microphones on your computer. For instance, if it outputs | ||
| 116 | + | ||
| 117 | +**** List of CAPTURE Hardware Devices **** | ||
| 118 | +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 119 | + Subdevices: 1/1 | ||
| 120 | + Subdevice #0: subdevice #0 | ||
| 121 | + | ||
| 122 | +and if you want to select card 3 and device 0 on that card, please use: | ||
| 123 | + | ||
| 124 | + plughw:3,0 | ||
| 125 | + | ||
| 126 | +as the device_name. | ||
| 127 | +)usage"; | ||
| 128 | + | ||
| 129 | + if (argc != 2) { | ||
| 130 | + fprintf(stderr, "%s\n", kUsageMessage); | ||
| 131 | + return -1; | ||
| 132 | + } | ||
| 133 | + | ||
| 134 | + signal(SIGINT, Handler); | ||
| 135 | + | ||
| 136 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 137 | + | ||
| 138 | + auto vad = CreateVad(); | ||
| 139 | + auto recognizer = CreateOfflineRecognizer(); | ||
| 140 | + | ||
| 141 | + int32_t expected_sample_rate = 16000; | ||
| 142 | + | ||
| 143 | + std::string device_name = argv[1]; | ||
| 144 | + sherpa_onnx::Alsa alsa(device_name.c_str()); | ||
| 145 | + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); | ||
| 146 | + | ||
| 147 | + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { | ||
| 148 | + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), | ||
| 149 | + expected_sample_rate); | ||
| 150 | + exit(-1); | ||
| 151 | + } | ||
| 152 | + | ||
| 153 | + int32_t window_size = 512; // samples, please don't change | ||
| 154 | + | ||
| 155 | + int32_t offset = 0; | ||
| 156 | + std::vector<float> buffer; | ||
| 157 | + bool speech_started = false; | ||
| 158 | + | ||
| 159 | + auto started_time = std::chrono::steady_clock::now(); | ||
| 160 | + | ||
| 161 | + SherpaDisplay display; | ||
| 162 | + | ||
| 163 | + std::thread record_thread(RecordCallback, &alsa); | ||
| 164 | + | ||
| 165 | + std::cout << "Started! Please speak\n"; | ||
| 166 | + | ||
| 167 | + while (!stop) { | ||
| 168 | + { | ||
| 169 | + std::unique_lock<std::mutex> lock(mutex); | ||
| 170 | + while (samples_queue.empty() && !stop) { | ||
| 171 | + condition_variable.wait(lock); | ||
| 172 | + } | ||
| 173 | + | ||
| 174 | + const auto &s = samples_queue.front(); | ||
| 175 | + buffer.insert(buffer.end(), s.begin(), s.end()); | ||
| 176 | + | ||
| 177 | + samples_queue.pop(); | ||
| 178 | + } | ||
| 179 | + | ||
| 180 | + for (; offset + window_size < buffer.size(); offset += window_size) { | ||
| 181 | + vad.AcceptWaveform(buffer.data() + offset, window_size); | ||
| 182 | + if (!speech_started && vad.IsDetected()) { | ||
| 183 | + speech_started = true; | ||
| 184 | + started_time = std::chrono::steady_clock::now(); | ||
| 185 | + } | ||
| 186 | + } | ||
| 187 | + if (!speech_started) { | ||
| 188 | + if (buffer.size() > 10 * window_size) { | ||
| 189 | + offset -= buffer.size() - 10 * window_size; | ||
| 190 | + buffer = {buffer.end() - 10 * window_size, buffer.end()}; | ||
| 191 | + } | ||
| 192 | + } | ||
| 193 | + | ||
| 194 | + auto current_time = std::chrono::steady_clock::now(); | ||
| 195 | + const float elapsed_seconds = | ||
| 196 | + std::chrono::duration_cast<std::chrono::milliseconds>(current_time - | ||
| 197 | + started_time) | ||
| 198 | + .count() / | ||
| 199 | + 1000.; | ||
| 200 | + | ||
| 201 | + if (speech_started && elapsed_seconds > 0.2) { | ||
| 202 | + OfflineStream stream = recognizer.CreateStream(); | ||
| 203 | + stream.AcceptWaveform(expected_sample_rate, buffer.data(), buffer.size()); | ||
| 204 | + | ||
| 205 | + recognizer.Decode(&stream); | ||
| 206 | + | ||
| 207 | + OfflineRecognizerResult result = recognizer.GetResult(&stream); | ||
| 208 | + display.UpdateText(result.text); | ||
| 209 | + display.Display(); | ||
| 210 | + | ||
| 211 | + started_time = std::chrono::steady_clock::now(); | ||
| 212 | + } | ||
| 213 | + | ||
| 214 | + while (!vad.IsEmpty()) { | ||
| 215 | + auto segment = vad.Front(); | ||
| 216 | + | ||
| 217 | + vad.Pop(); | ||
| 218 | + | ||
| 219 | + OfflineStream stream = recognizer.CreateStream(); | ||
| 220 | + stream.AcceptWaveform(expected_sample_rate, segment.samples.data(), | ||
| 221 | + segment.samples.size()); | ||
| 222 | + | ||
| 223 | + recognizer.Decode(&stream); | ||
| 224 | + | ||
| 225 | + OfflineRecognizerResult result = recognizer.GetResult(&stream); | ||
| 226 | + | ||
| 227 | + display.UpdateText(result.text); | ||
| 228 | + display.FinalizeCurrentSentence(); | ||
| 229 | + display.Display(); | ||
| 230 | + | ||
| 231 | + buffer.clear(); | ||
| 232 | + offset = 0; | ||
| 233 | + speech_started = false; | ||
| 234 | + } | ||
| 235 | + } | ||
| 236 | + | ||
| 237 | + record_thread.join(); | ||
| 238 | + | ||
| 239 | + return 0; | ||
| 240 | +} |
| 1 | +// cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc | ||
| 2 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 3 | + | ||
| 4 | +// | ||
| 5 | +// This file demonstrates how to use Zipformer CTC with sherpa-onnx's C++ API | ||
| 6 | +// for streaming speech recognition from a microphone. | ||
| 7 | +// | ||
| 8 | +// clang-format off | ||
| 9 | +// | ||
| 10 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 11 | +// | ||
| 12 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 13 | +// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 14 | +// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 15 | +// | ||
| 16 | +// clang-format on | ||
| 17 | + | ||
| 18 | +#include <signal.h> | ||
| 19 | +#include <stdio.h> | ||
| 20 | +#include <stdlib.h> | ||
| 21 | + | ||
| 22 | +#include <chrono> // NOLINT | ||
| 23 | +#include <condition_variable> // NOLINT | ||
| 24 | +#include <iostream> | ||
| 25 | +#include <mutex> // NOLINT | ||
| 26 | +#include <queue> | ||
| 27 | +#include <vector> | ||
| 28 | + | ||
| 29 | +#include "portaudio.h" // NOLINT | ||
| 30 | +#include "sherpa-display.h" // NOLINT | ||
| 31 | +#include "sherpa-onnx/c-api/cxx-api.h" | ||
| 32 | +#include "sherpa-onnx/csrc/microphone.h" | ||
| 33 | + | ||
| 34 | +std::queue<std::vector<float>> samples_queue; | ||
| 35 | +std::condition_variable condition_variable; | ||
| 36 | +std::mutex mutex; | ||
| 37 | +bool stop = false; | ||
| 38 | + | ||
| 39 | +static void Handler(int32_t /*sig*/) { | ||
| 40 | + stop = true; | ||
| 41 | + condition_variable.notify_one(); | ||
| 42 | + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); | ||
| 43 | +} | ||
| 44 | + | ||
| 45 | +static int32_t RecordCallback(const void *input_buffer, | ||
| 46 | + void * /*output_buffer*/, | ||
| 47 | + unsigned long frames_per_buffer, // NOLINT | ||
| 48 | + const PaStreamCallbackTimeInfo * /*time_info*/, | ||
| 49 | + PaStreamCallbackFlags /*status_flags*/, | ||
| 50 | + void * /*user_data*/) { | ||
| 51 | + std::lock_guard<std::mutex> lock(mutex); | ||
| 52 | + samples_queue.emplace( | ||
| 53 | + reinterpret_cast<const float *>(input_buffer), | ||
| 54 | + reinterpret_cast<const float *>(input_buffer) + frames_per_buffer); | ||
| 55 | + condition_variable.notify_one(); | ||
| 56 | + | ||
| 57 | + return stop ? paComplete : paContinue; | ||
| 58 | +} | ||
| 59 | + | ||
| 60 | +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() { | ||
| 61 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 62 | + VadModelConfig config; | ||
| 63 | + config.silero_vad.model = "./silero_vad.onnx"; | ||
| 64 | + config.silero_vad.threshold = 0.5; | ||
| 65 | + config.silero_vad.min_silence_duration = 0.1; | ||
| 66 | + config.silero_vad.min_speech_duration = 0.25; | ||
| 67 | + config.silero_vad.max_speech_duration = 8; | ||
| 68 | + config.sample_rate = 16000; | ||
| 69 | + config.debug = false; | ||
| 70 | + | ||
| 71 | + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20); | ||
| 72 | + if (!vad.Get()) { | ||
| 73 | + std::cerr << "Failed to create VAD. Please check your config\n"; | ||
| 74 | + exit(-1); | ||
| 75 | + } | ||
| 76 | + | ||
| 77 | + return vad; | ||
| 78 | +} | ||
| 79 | + | ||
| 80 | +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() { | ||
| 81 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 82 | + OfflineRecognizerConfig config; | ||
| 83 | + | ||
| 84 | + config.model_config.zipformer_ctc.model = | ||
| 85 | + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"; | ||
| 86 | + config.model_config.tokens = | ||
| 87 | + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"; | ||
| 88 | + | ||
| 89 | + config.model_config.num_threads = 2; | ||
| 90 | + config.model_config.debug = false; | ||
| 91 | + | ||
| 92 | + std::cout << "Loading model\n"; | ||
| 93 | + OfflineRecognizer recognizer = OfflineRecognizer::Create(config); | ||
| 94 | + if (!recognizer.Get()) { | ||
| 95 | + std::cerr << "Please check your config\n"; | ||
| 96 | + exit(-1); | ||
| 97 | + } | ||
| 98 | + std::cout << "Loading model done\n"; | ||
| 99 | + return recognizer; | ||
| 100 | +} | ||
| 101 | + | ||
| 102 | +int32_t main() { | ||
| 103 | + signal(SIGINT, Handler); | ||
| 104 | + | ||
| 105 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 106 | + | ||
| 107 | + auto vad = CreateVad(); | ||
| 108 | + auto recognizer = CreateOfflineRecognizer(); | ||
| 109 | + | ||
| 110 | + sherpa_onnx::Microphone mic; | ||
| 111 | + | ||
| 112 | + PaDeviceIndex num_devices = Pa_GetDeviceCount(); | ||
| 113 | + if (num_devices == 0) { | ||
| 114 | + std::cerr << " If you are using Linux, please try " | ||
| 115 | + "./build/bin/zipformer-ctc-simulate-streaming-alsa-cxx-api\n"; | ||
| 116 | + return -1; | ||
| 117 | + } | ||
| 118 | + | ||
| 119 | + int32_t device_index = Pa_GetDefaultInputDevice(); | ||
| 120 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 121 | + if (pDeviceIndex) { | ||
| 122 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 123 | + device_index = atoi(pDeviceIndex); | ||
| 124 | + } | ||
| 125 | + mic.PrintDevices(device_index); | ||
| 126 | + | ||
| 127 | + float mic_sample_rate = 16000; | ||
| 128 | + const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 129 | + if (sample_rate_str) { | ||
| 130 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 131 | + mic_sample_rate = atof(sample_rate_str); | ||
| 132 | + } | ||
| 133 | + float sample_rate = 16000; | ||
| 134 | + LinearResampler resampler; | ||
| 135 | + if (mic_sample_rate != sample_rate) { | ||
| 136 | + float min_freq = std::min(mic_sample_rate, sample_rate); | ||
| 137 | + float lowpass_cutoff = 0.99 * 0.5 * min_freq; | ||
| 138 | + | ||
| 139 | + int32_t lowpass_filter_width = 6; | ||
| 140 | + resampler = LinearResampler::Create(mic_sample_rate, sample_rate, | ||
| 141 | + lowpass_cutoff, lowpass_filter_width); | ||
| 142 | + } | ||
| 143 | + if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, | ||
| 144 | + nullptr) == false) { | ||
| 145 | + std::cerr << "Failed to open microphone device\n"; | ||
| 146 | + return -1; | ||
| 147 | + } | ||
| 148 | + | ||
| 149 | + int32_t window_size = 512; // samples, please don't change | ||
| 150 | + | ||
| 151 | + int32_t offset = 0; | ||
| 152 | + std::vector<float> buffer; | ||
| 153 | + bool speech_started = false; | ||
| 154 | + | ||
| 155 | + auto started_time = std::chrono::steady_clock::now(); | ||
| 156 | + | ||
| 157 | + SherpaDisplay display; | ||
| 158 | + | ||
| 159 | + std::cout << "Started! Please speak\n"; | ||
| 160 | + | ||
| 161 | + while (!stop) { | ||
| 162 | + { | ||
| 163 | + std::unique_lock<std::mutex> lock(mutex); | ||
| 164 | + while (samples_queue.empty() && !stop) { | ||
| 165 | + condition_variable.wait(lock); | ||
| 166 | + } | ||
| 167 | + | ||
| 168 | + const auto &s = samples_queue.front(); | ||
| 169 | + if (!resampler.Get()) { | ||
| 170 | + buffer.insert(buffer.end(), s.begin(), s.end()); | ||
| 171 | + } else { | ||
| 172 | + auto resampled = resampler.Resample(s.data(), s.size(), false); | ||
| 173 | + buffer.insert(buffer.end(), resampled.begin(), resampled.end()); | ||
| 174 | + } | ||
| 175 | + | ||
| 176 | + samples_queue.pop(); | ||
| 177 | + } | ||
| 178 | + | ||
| 179 | + for (; offset + window_size < buffer.size(); offset += window_size) { | ||
| 180 | + vad.AcceptWaveform(buffer.data() + offset, window_size); | ||
| 181 | + if (!speech_started && vad.IsDetected()) { | ||
| 182 | + speech_started = true; | ||
| 183 | + started_time = std::chrono::steady_clock::now(); | ||
| 184 | + } | ||
| 185 | + } | ||
| 186 | + if (!speech_started) { | ||
| 187 | + if (buffer.size() > 10 * window_size) { | ||
| 188 | + offset -= buffer.size() - 10 * window_size; | ||
| 189 | + buffer = {buffer.end() - 10 * window_size, buffer.end()}; | ||
| 190 | + } | ||
| 191 | + } | ||
| 192 | + | ||
| 193 | + auto current_time = std::chrono::steady_clock::now(); | ||
| 194 | + const float elapsed_seconds = | ||
| 195 | + std::chrono::duration_cast<std::chrono::milliseconds>(current_time - | ||
| 196 | + started_time) | ||
| 197 | + .count() / | ||
| 198 | + 1000.; | ||
| 199 | + | ||
| 200 | + if (speech_started && elapsed_seconds > 0.2) { | ||
| 201 | + OfflineStream stream = recognizer.CreateStream(); | ||
| 202 | + stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size()); | ||
| 203 | + | ||
| 204 | + recognizer.Decode(&stream); | ||
| 205 | + | ||
| 206 | + OfflineRecognizerResult result = recognizer.GetResult(&stream); | ||
| 207 | + display.UpdateText(result.text); | ||
| 208 | + display.Display(); | ||
| 209 | + | ||
| 210 | + started_time = std::chrono::steady_clock::now(); | ||
| 211 | + } | ||
| 212 | + | ||
| 213 | + while (!vad.IsEmpty()) { | ||
| 214 | + auto segment = vad.Front(); | ||
| 215 | + | ||
| 216 | + vad.Pop(); | ||
| 217 | + | ||
| 218 | + OfflineStream stream = recognizer.CreateStream(); | ||
| 219 | + stream.AcceptWaveform(sample_rate, segment.samples.data(), | ||
| 220 | + segment.samples.size()); | ||
| 221 | + | ||
| 222 | + recognizer.Decode(&stream); | ||
| 223 | + | ||
| 224 | + OfflineRecognizerResult result = recognizer.GetResult(&stream); | ||
| 225 | + | ||
| 226 | + display.UpdateText(result.text); | ||
| 227 | + display.FinalizeCurrentSentence(); | ||
| 228 | + display.Display(); | ||
| 229 | + | ||
| 230 | + buffer.clear(); | ||
| 231 | + offset = 0; | ||
| 232 | + speech_started = false; | ||
| 233 | + } | ||
| 234 | + } | ||
| 235 | + | ||
| 236 | + return 0; | ||
| 237 | +} |
| 1 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 2 | +import 'dart:io'; | ||
| 3 | + | ||
| 4 | +import 'package:args/args.dart'; | ||
| 5 | +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; | ||
| 6 | + | ||
| 7 | +import './init.dart'; | ||
| 8 | + | ||
/// Decodes one wav file with a non-streaming Zipformer CTC model and prints
/// the recognized text.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('model', help: 'Path to the Zipformer CTC model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  final model = res['model'] as String?;
  final tokens = res['tokens'] as String?;
  final inputWav = res['input-wav'] as String?;

  // All three options are required.
  if (model == null || tokens == null || inputWav == null) {
    print(parser.usage);
    exit(1);
  }

  final recognizer = sherpa_onnx.OfflineRecognizer(
    sherpa_onnx.OfflineRecognizerConfig(
      model: sherpa_onnx.OfflineModelConfig(
        zipformerCtc: sherpa_onnx.OfflineZipformerCtcModelConfig(model: model),
        tokens: tokens,
        debug: true,
        numThreads: 1,
      ),
    ),
  );

  final wave = sherpa_onnx.readWave(inputWav);

  final stream = recognizer.createStream()
    ..acceptWaveform(samples: wave.samples, sampleRate: wave.sampleRate);
  recognizer.decode(stream);

  print(recognizer.getResult(stream).text);

  stream.free();
  recognizer.free();
}
#!/usr/bin/env bash
# Download the Zipformer CTC model (if needed) and transcribe a bundled wav.

set -ex

dart pub get

model_dir=sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03

# Fetch and unpack the model only when it is not present yet.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2

  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

dart run \
  ./bin/zipformer-ctc.dart \
  --model ./$model_dir/model.int8.onnx \
  --tokens ./$model_dir/tokens.txt \
  --input-wav ./$model_dir/test_wavs/0.wav
| 1 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 2 | +import 'dart:io'; | ||
| 3 | +import 'dart:typed_data'; | ||
| 4 | + | ||
| 5 | +import 'package:args/args.dart'; | ||
| 6 | +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; | ||
| 7 | + | ||
| 8 | +import './init.dart'; | ||
| 9 | + | ||
/// Decodes every speech segment currently queued in [vad] with [recognizer],
/// printing '<start> -- <end> : <text>' for each, and pops segments as it
/// goes. Extracted because the original code duplicated this loop verbatim
/// in two places (inside the feed loop and after flush).
void _decodePendingSegments(sherpa_onnx.VoiceActivityDetector vad,
    sherpa_onnx.OfflineRecognizer recognizer, int sampleRate) {
  while (!vad.isEmpty()) {
    final samples = vad.front().samples;
    final startTime = vad.front().start.toDouble() / sampleRate;
    final endTime = startTime + samples.length.toDouble() / sampleRate;

    final stream = recognizer.createStream();
    stream.acceptWaveform(samples: samples, sampleRate: sampleRate);
    recognizer.decode(stream);

    final result = recognizer.getResult(stream);
    stream.free();
    print(
        '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');

    vad.pop();
  }
}

/// Runs silero VAD over a 16 kHz wav file and transcribes each detected
/// speech segment with a non-streaming Zipformer CTC model.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('model', help: 'Path to the Zipformer CTC model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // create VAD
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
    maxSpeechDuration: 5.0,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // create offline recognizer
  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final zipformerCtc = sherpa_onnx.OfflineZipformerCtcModelConfig(model: model);

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    zipformerCtc: zipformerCtc,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // Feed the wav to the VAD one window at a time; any trailing partial
  // window is handled by flush() below.
  int numSamples = waveData.samples.length;
  int numIter = numSamples ~/ vadConfig.sileroVad.windowSize;

  for (int i = 0; i != numIter; ++i) {
    int start = i * vadConfig.sileroVad.windowSize;
    vad.acceptWaveform(Float32List.sublistView(
        waveData.samples, start, start + vadConfig.sileroVad.windowSize));

    _decodePendingSegments(vad, recognizer, waveData.sampleRate);
  }

  // Force out the last (possibly still-open) segment.
  vad.flush();

  _decodePendingSegments(vad, recognizer, waveData.sampleRate);

  vad.free();

  recognizer.free();
}
#!/usr/bin/env bash
# Download the Zipformer CTC model, silero VAD, and a test wav (if needed),
# then run VAD + non-streaming ASR over the wav.

set -ex

dart pub get

model_dir=sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03

# Fetch and unpack the model only when it is not present yet.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2

  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

dart run \
  ./bin/zipformer-ctc.dart \
  --silero-vad ./silero_vad.onnx \
  --model ./$model_dir/model.int8.onnx \
  --tokens ./$model_dir/tokens.txt \
  --input-wav ./lei-jun-test.wav
| @@ -75,6 +75,9 @@ class OfflineDecodeFiles | @@ -75,6 +75,9 @@ class OfflineDecodeFiles | ||
| 75 | [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] | 75 | [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] |
| 76 | public string NeMoCtc { get; set; } = string.Empty; | 76 | public string NeMoCtc { get; set; } = string.Empty; |
| 77 | 77 | ||
| 78 | + [Option("zipformer-ctc", Required = false, HelpText = "Path to model.onnx. Used only for Zipformer CTC models")] | ||
| 79 | + public string ZipformerCtc { get; set; } = string.Empty; | ||
| 80 | + | ||
| 78 | [Option("dolphin-model", Required = false, Default = "", HelpText = "Path to dolphin ctc model")] | 81 | [Option("dolphin-model", Required = false, Default = "", HelpText = "Path to dolphin ctc model")] |
| 79 | public string DolphinModel { get; set; } = string.Empty; | 82 | public string DolphinModel { get; set; } = string.Empty; |
| 80 | 83 | ||
| @@ -240,6 +243,10 @@ to download pre-trained Tdnn models. | @@ -240,6 +243,10 @@ to download pre-trained Tdnn models. | ||
| 240 | { | 243 | { |
| 241 | config.ModelConfig.Dolphin.Model = options.DolphinModel; | 244 | config.ModelConfig.Dolphin.Model = options.DolphinModel; |
| 242 | } | 245 | } |
| 246 | + else if (!string.IsNullOrEmpty(options.ZipformerCtc)) | ||
| 247 | + { | ||
| 248 | + config.ModelConfig.ZipformerCtc.Model = options.ZipformerCtc; | ||
| 249 | + } | ||
| 243 | else if (!string.IsNullOrEmpty(options.TeleSpeechCtc)) | 250 | else if (!string.IsNullOrEmpty(options.TeleSpeechCtc)) |
| 244 | { | 251 | { |
| 245 | config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc; | 252 | config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc; |
#!/usr/bin/env bash
# Download the Zipformer CTC model (if needed) and decode its bundled test
# wavs with the C# example.

set -ex

model_dir=sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03

# Fetch and unpack the model only when it is not present yet.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2

  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

dotnet run \
  --tokens=./$model_dir/tokens.txt \
  --zipformer-ctc=./$model_dir/model.int8.onnx \
  --num-threads=1 \
  --files ./$model_dir/test_wavs/0.wav \
  ./$model_dir/test_wavs/1.wav \
  ./$model_dir/test_wavs/8k.wav
| @@ -104,6 +104,27 @@ class OfflineDolphinModelConfig { | @@ -104,6 +104,27 @@ class OfflineDolphinModelConfig { | ||
| 104 | final String model; | 104 | final String model; |
| 105 | } | 105 | } |
| 106 | 106 | ||
/// Configuration for a non-streaming Zipformer CTC ASR model.
class OfflineZipformerCtcModelConfig {
  const OfflineZipformerCtcModelConfig({this.model = ''});

  /// Builds a config from JSON; a missing or null 'model' key maps to ''.
  factory OfflineZipformerCtcModelConfig.fromJson(Map<String, dynamic> json) =>
      OfflineZipformerCtcModelConfig(
        model: json['model'] as String? ?? '',
      );

  @override
  String toString() => 'OfflineZipformerCtcModelConfig(model: $model)';

  Map<String, dynamic> toJson() => {
        'model': model,
      };

  // Path to the Zipformer CTC onnx model file.
  final String model;
}
| 127 | + | ||
| 107 | class OfflineWhisperModelConfig { | 128 | class OfflineWhisperModelConfig { |
| 108 | const OfflineWhisperModelConfig( | 129 | const OfflineWhisperModelConfig( |
| 109 | {this.encoder = '', | 130 | {this.encoder = '', |
| @@ -288,6 +309,7 @@ class OfflineModelConfig { | @@ -288,6 +309,7 @@ class OfflineModelConfig { | ||
| 288 | this.moonshine = const OfflineMoonshineModelConfig(), | 309 | this.moonshine = const OfflineMoonshineModelConfig(), |
| 289 | this.fireRedAsr = const OfflineFireRedAsrModelConfig(), | 310 | this.fireRedAsr = const OfflineFireRedAsrModelConfig(), |
| 290 | this.dolphin = const OfflineDolphinModelConfig(), | 311 | this.dolphin = const OfflineDolphinModelConfig(), |
| 312 | + this.zipformerCtc = const OfflineZipformerCtcModelConfig(), | ||
| 291 | required this.tokens, | 313 | required this.tokens, |
| 292 | this.numThreads = 1, | 314 | this.numThreads = 1, |
| 293 | this.debug = true, | 315 | this.debug = true, |
| @@ -336,6 +358,10 @@ class OfflineModelConfig { | @@ -336,6 +358,10 @@ class OfflineModelConfig { | ||
| 336 | ? OfflineDolphinModelConfig.fromJson( | 358 | ? OfflineDolphinModelConfig.fromJson( |
| 337 | json['dolphin'] as Map<String, dynamic>) | 359 | json['dolphin'] as Map<String, dynamic>) |
| 338 | : const OfflineDolphinModelConfig(), | 360 | : const OfflineDolphinModelConfig(), |
| 361 | + zipformerCtc: json['zipformerCtc'] != null | ||
| 362 | + ? OfflineZipformerCtcModelConfig.fromJson( | ||
| 363 | + json['zipformerCtc'] as Map<String, dynamic>) | ||
| 364 | + : const OfflineZipformerCtcModelConfig(), | ||
| 339 | tokens: json['tokens'] as String, | 365 | tokens: json['tokens'] as String, |
| 340 | numThreads: json['numThreads'] as int? ?? 1, | 366 | numThreads: json['numThreads'] as int? ?? 1, |
| 341 | debug: json['debug'] as bool? ?? true, | 367 | debug: json['debug'] as bool? ?? true, |
| @@ -349,7 +375,7 @@ class OfflineModelConfig { | @@ -349,7 +375,7 @@ class OfflineModelConfig { | ||
| 349 | 375 | ||
| 350 | @override | 376 | @override |
| 351 | String toString() { | 377 | String toString() { |
| 352 | - return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; | 378 | + return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, zipformerCtc: $zipformerCtc, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; |
| 353 | } | 379 | } |
| 354 | 380 | ||
| 355 | Map<String, dynamic> toJson() => { | 381 | Map<String, dynamic> toJson() => { |
| @@ -362,6 +388,7 @@ class OfflineModelConfig { | @@ -362,6 +388,7 @@ class OfflineModelConfig { | ||
| 362 | 'moonshine': moonshine.toJson(), | 388 | 'moonshine': moonshine.toJson(), |
| 363 | 'fireRedAsr': fireRedAsr.toJson(), | 389 | 'fireRedAsr': fireRedAsr.toJson(), |
| 364 | 'dolphin': dolphin.toJson(), | 390 | 'dolphin': dolphin.toJson(), |
| 391 | + 'zipformerCtc': zipformerCtc.toJson(), | ||
| 365 | 'tokens': tokens, | 392 | 'tokens': tokens, |
| 366 | 'numThreads': numThreads, | 393 | 'numThreads': numThreads, |
| 367 | 'debug': debug, | 394 | 'debug': debug, |
| @@ -381,6 +408,7 @@ class OfflineModelConfig { | @@ -381,6 +408,7 @@ class OfflineModelConfig { | ||
| 381 | final OfflineMoonshineModelConfig moonshine; | 408 | final OfflineMoonshineModelConfig moonshine; |
| 382 | final OfflineFireRedAsrModelConfig fireRedAsr; | 409 | final OfflineFireRedAsrModelConfig fireRedAsr; |
| 383 | final OfflineDolphinModelConfig dolphin; | 410 | final OfflineDolphinModelConfig dolphin; |
| 411 | + final OfflineZipformerCtcModelConfig zipformerCtc; | ||
| 384 | 412 | ||
| 385 | final String tokens; | 413 | final String tokens; |
| 386 | final int numThreads; | 414 | final int numThreads; |
| @@ -578,6 +606,8 @@ class OfflineRecognizer { | @@ -578,6 +606,8 @@ class OfflineRecognizer { | ||
| 578 | config.model.fireRedAsr.decoder.toNativeUtf8(); | 606 | config.model.fireRedAsr.decoder.toNativeUtf8(); |
| 579 | 607 | ||
| 580 | c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8(); | 608 | c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8(); |
| 609 | + c.ref.model.zipformerCtc.model = | ||
| 610 | + config.model.zipformerCtc.model.toNativeUtf8(); | ||
| 581 | 611 | ||
| 582 | c.ref.model.tokens = config.model.tokens.toNativeUtf8(); | 612 | c.ref.model.tokens = config.model.tokens.toNativeUtf8(); |
| 583 | 613 | ||
| @@ -623,6 +653,7 @@ class OfflineRecognizer { | @@ -623,6 +653,7 @@ class OfflineRecognizer { | ||
| 623 | calloc.free(c.ref.model.modelType); | 653 | calloc.free(c.ref.model.modelType); |
| 624 | calloc.free(c.ref.model.provider); | 654 | calloc.free(c.ref.model.provider); |
| 625 | calloc.free(c.ref.model.tokens); | 655 | calloc.free(c.ref.model.tokens); |
| 656 | + calloc.free(c.ref.model.zipformerCtc.model); | ||
| 626 | calloc.free(c.ref.model.dolphin.model); | 657 | calloc.free(c.ref.model.dolphin.model); |
| 627 | calloc.free(c.ref.model.fireRedAsr.decoder); | 658 | calloc.free(c.ref.model.fireRedAsr.decoder); |
| 628 | calloc.free(c.ref.model.fireRedAsr.encoder); | 659 | calloc.free(c.ref.model.fireRedAsr.encoder); |
| @@ -266,6 +266,10 @@ final class SherpaOnnxOfflineDolphinModelConfig extends Struct { | @@ -266,6 +266,10 @@ final class SherpaOnnxOfflineDolphinModelConfig extends Struct { | ||
| 266 | external Pointer<Utf8> model; | 266 | external Pointer<Utf8> model; |
| 267 | } | 267 | } |
| 268 | 268 | ||
| 269 | +final class SherpaOnnxOfflineZipformerCtcModelConfig extends Struct { | ||
| 270 | + external Pointer<Utf8> model; | ||
| 271 | +} | ||
| 272 | + | ||
| 269 | final class SherpaOnnxOfflineWhisperModelConfig extends Struct { | 273 | final class SherpaOnnxOfflineWhisperModelConfig extends Struct { |
| 270 | external Pointer<Utf8> encoder; | 274 | external Pointer<Utf8> encoder; |
| 271 | external Pointer<Utf8> decoder; | 275 | external Pointer<Utf8> decoder; |
| @@ -333,6 +337,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct { | @@ -333,6 +337,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct { | ||
| 333 | external SherpaOnnxOfflineMoonshineModelConfig moonshine; | 337 | external SherpaOnnxOfflineMoonshineModelConfig moonshine; |
| 334 | external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; | 338 | external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; |
| 335 | external SherpaOnnxOfflineDolphinModelConfig dolphin; | 339 | external SherpaOnnxOfflineDolphinModelConfig dolphin; |
| 340 | + external SherpaOnnxOfflineZipformerCtcModelConfig zipformerCtc; | ||
| 336 | } | 341 | } |
| 337 | 342 | ||
| 338 | final class SherpaOnnxOfflineRecognizerConfig extends Struct { | 343 | final class SherpaOnnxOfflineRecognizerConfig extends Struct { |
| @@ -28,6 +28,8 @@ func main() { | @@ -28,6 +28,8 @@ func main() { | ||
| 28 | 28 | ||
| 29 | flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model") | 29 | flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model") |
| 30 | 30 | ||
| 31 | + flag.StringVar(&config.ModelConfig.ZipformerCtc.Model, "zipformer-ctc", "", "Path to the Zipformer CTC model") | ||
| 32 | + | ||
| 31 | flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model") | 33 | flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model") |
| 32 | 34 | ||
| 33 | flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model") | 35 | flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model") |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then | ||
| 6 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 7 | + | ||
| 8 | + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 9 | + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 10 | +fi | ||
| 11 | + | ||
| 12 | +go mod tidy | ||
| 13 | +go build | ||
| 14 | + | ||
| 15 | +./non-streaming-decode-files \ | ||
| 16 | + --zipformer-ctc ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \ | ||
| 17 | + --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \ | ||
| 18 | + --debug 0 \ | ||
| 19 | + ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav |
| @@ -15,6 +15,7 @@ export { Samples, | @@ -15,6 +15,7 @@ export { Samples, | ||
| 15 | OfflineTdnnModelConfig, | 15 | OfflineTdnnModelConfig, |
| 16 | OfflineSenseVoiceModelConfig, | 16 | OfflineSenseVoiceModelConfig, |
| 17 | OfflineMoonshineModelConfig, | 17 | OfflineMoonshineModelConfig, |
| 18 | + OfflineZipformerCtcModelConfig, | ||
| 18 | OfflineModelConfig, | 19 | OfflineModelConfig, |
| 19 | OfflineLMConfig, | 20 | OfflineLMConfig, |
| 20 | OfflineRecognizerConfig, | 21 | OfflineRecognizerConfig, |
| @@ -45,7 +45,23 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig( | @@ -45,7 +45,23 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig( | ||
| 45 | return c; | 45 | return c; |
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | -static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinfig( | 48 | +static SherpaOnnxOfflineZipformerCtcModelConfig |
| 49 | +GetOfflineZipformerCtcModelConfig(Napi::Object obj) { | ||
| 50 | + SherpaOnnxOfflineZipformerCtcModelConfig c; | ||
| 51 | + memset(&c, 0, sizeof(c)); | ||
| 52 | + | ||
| 53 | + if (!obj.Has("zipformerCtc") || !obj.Get("zipformerCtc").IsObject()) { | ||
| 54 | + return c; | ||
| 55 | + } | ||
| 56 | + | ||
| 57 | + Napi::Object o = obj.Get("zipformerCtc").As<Napi::Object>(); | ||
| 58 | + | ||
| 59 | + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model); | ||
| 60 | + | ||
| 61 | + return c; | ||
| 62 | +} | ||
| 63 | + | ||
| 64 | +static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinModelConfig( | ||
| 49 | Napi::Object obj) { | 65 | Napi::Object obj) { |
| 50 | SherpaOnnxOfflineDolphinModelConfig c; | 66 | SherpaOnnxOfflineDolphinModelConfig c; |
| 51 | memset(&c, 0, sizeof(c)); | 67 | memset(&c, 0, sizeof(c)); |
| @@ -185,7 +201,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { | @@ -185,7 +201,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { | ||
| 185 | c.sense_voice = GetOfflineSenseVoiceModelConfig(o); | 201 | c.sense_voice = GetOfflineSenseVoiceModelConfig(o); |
| 186 | c.moonshine = GetOfflineMoonshineModelConfig(o); | 202 | c.moonshine = GetOfflineMoonshineModelConfig(o); |
| 187 | c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o); | 203 | c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o); |
| 188 | - c.dolphin = GetOfflineDolphinfig(o); | 204 | + c.dolphin = GetOfflineDolphinModelConfig(o); |
| 205 | + c.zipformer_ctc = GetOfflineZipformerCtcModelConfig(o); | ||
| 189 | 206 | ||
| 190 | SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); | 207 | SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); |
| 191 | SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); | 208 | SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); |
| @@ -312,6 +329,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { | @@ -312,6 +329,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { | ||
| 312 | SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder); | 329 | SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder); |
| 313 | 330 | ||
| 314 | SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model); | 331 | SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model); |
| 332 | + SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model); | ||
| 315 | 333 | ||
| 316 | SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); | 334 | SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); |
| 317 | SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); | 335 | SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); |
| @@ -55,6 +55,10 @@ export class OfflineDolphinModelConfig { | @@ -55,6 +55,10 @@ export class OfflineDolphinModelConfig { | ||
| 55 | public model: string = ''; | 55 | public model: string = ''; |
| 56 | } | 56 | } |
| 57 | 57 | ||
| 58 | +export class OfflineZipformerCtcModelConfig { | ||
| 59 | + public model: string = ''; | ||
| 60 | +} | ||
| 61 | + | ||
| 58 | export class OfflineWhisperModelConfig { | 62 | export class OfflineWhisperModelConfig { |
| 59 | public encoder: string = ''; | 63 | public encoder: string = ''; |
| 60 | public decoder: string = ''; | 64 | public decoder: string = ''; |
| @@ -97,6 +101,7 @@ export class OfflineModelConfig { | @@ -97,6 +101,7 @@ export class OfflineModelConfig { | ||
| 97 | public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig(); | 101 | public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig(); |
| 98 | public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig(); | 102 | public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig(); |
| 99 | public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig(); | 103 | public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig(); |
| 104 | + public zipformerCtc: OfflineZipformerCtcModelConfig = new OfflineZipformerCtcModelConfig(); | ||
| 100 | } | 105 | } |
| 101 | 106 | ||
| 102 | export class OfflineLMConfig { | 107 | export class OfflineLMConfig { |
| 1 | +// Copyright 2025 Xiaomi Corporation | ||
| 2 | + | ||
| 3 | +// This file shows how to use an offline Zipformer CTC model, | ||
| 4 | +// i.e., non-streaming Zipformer CTC model, | ||
| 5 | +// to decode files. | ||
| 6 | +import com.k2fsa.sherpa.onnx.*; | ||
| 7 | + | ||
| 8 | +public class NonStreamingDecodeFileZipformerCtc { | ||
| 9 | + public static void main(String[] args) { | ||
| 10 | + // please refer to | ||
| 11 | + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 12 | + // to download model files | ||
| 13 | + String model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"; | ||
| 14 | + String tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"; | ||
| 15 | + | ||
| 16 | + String waveFilename = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"; | ||
| 17 | + | ||
| 18 | + WaveReader reader = new WaveReader(waveFilename); | ||
| 19 | + | ||
| 20 | + OfflineZipformerCtcModelConfig zipformerCtc = | ||
| 21 | + OfflineZipformerCtcModelConfig.builder().setModel(model).build(); | ||
| 22 | + | ||
| 23 | + OfflineModelConfig modelConfig = | ||
| 24 | + OfflineModelConfig.builder() | ||
| 25 | + .setZipformerCtc(zipformerCtc) | ||
| 26 | + .setTokens(tokens) | ||
| 27 | + .setNumThreads(1) | ||
| 28 | + .setDebug(true) | ||
| 29 | + .build(); | ||
| 30 | + | ||
| 31 | + OfflineRecognizerConfig config = | ||
| 32 | + OfflineRecognizerConfig.builder() | ||
| 33 | + .setOfflineModelConfig(modelConfig) | ||
| 34 | + .setDecodingMethod("greedy_search") | ||
| 35 | + .build(); | ||
| 36 | + | ||
| 37 | + OfflineRecognizer recognizer = new OfflineRecognizer(config); | ||
| 38 | + OfflineStream stream = recognizer.createStream(); | ||
| 39 | + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); | ||
| 40 | + | ||
| 41 | + recognizer.decode(stream); | ||
| 42 | + | ||
| 43 | + String text = recognizer.getResult(stream).getText(); | ||
| 44 | + | ||
| 45 | + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); | ||
| 46 | + | ||
| 47 | + stream.release(); | ||
| 48 | + recognizer.release(); | ||
| 49 | + } | ||
| 50 | +} |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then | ||
| 6 | + mkdir -p ../build | ||
| 7 | + pushd ../build | ||
| 8 | + cmake \ | ||
| 9 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 10 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 11 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 12 | + -DBUILD_SHARED_LIBS=ON \ | ||
| 13 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 14 | + -DSHERPA_ONNX_ENABLE_JNI=ON \ | ||
| 15 | + .. | ||
| 16 | + | ||
| 17 | + make -j4 | ||
| 18 | + ls -lh lib | ||
| 19 | + popd | ||
| 20 | +fi | ||
| 21 | + | ||
| 22 | +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then | ||
| 23 | + pushd ../sherpa-onnx/java-api | ||
| 24 | + make | ||
| 25 | + popd | ||
| 26 | +fi | ||
| 27 | + | ||
| 28 | +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then | ||
| 29 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 30 | + | ||
| 31 | + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 32 | + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 33 | +fi | ||
| 34 | + | ||
| 35 | +java \ | ||
| 36 | + -Djava.library.path=$PWD/../build/lib \ | ||
| 37 | + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ | ||
| 38 | + NonStreamingDecodeFileZipformerCtc.java |
| @@ -253,6 +253,13 @@ function testOfflineAsr() { | @@ -253,6 +253,13 @@ function testOfflineAsr() { | ||
| 253 | rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 | 253 | rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 |
| 254 | fi | 254 | fi |
| 255 | 255 | ||
| 256 | + if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then | ||
| 257 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 258 | + | ||
| 259 | + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 260 | + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 261 | + fi | ||
| 262 | + | ||
| 256 | out_filename=test_offline_asr.jar | 263 | out_filename=test_offline_asr.jar |
| 257 | kotlinc-jvm -include-runtime -d $out_filename \ | 264 | kotlinc-jvm -include-runtime -d $out_filename \ |
| 258 | test_offline_asr.kt \ | 265 | test_offline_asr.kt \ |
| 1 | package com.k2fsa.sherpa.onnx | 1 | package com.k2fsa.sherpa.onnx |
| 2 | 2 | ||
| 3 | fun main() { | 3 | fun main() { |
| 4 | - val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25) | 4 | + val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25, 31) |
| 5 | for (type in types) { | 5 | for (type in types) { |
| 6 | test(type) | 6 | test(type) |
| 7 | } | 7 | } |
| @@ -19,6 +19,7 @@ fun test(type: Int) { | @@ -19,6 +19,7 @@ fun test(type: Int) { | ||
| 19 | 21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav" | 19 | 21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav" |
| 20 | 24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav" | 20 | 24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav" |
| 21 | 25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav" | 21 | 25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav" |
| 22 | + 31 -> "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav" | ||
| 22 | else -> null | 23 | else -> null |
| 23 | } | 24 | } |
| 24 | 25 |
| @@ -123,6 +123,7 @@ The following tables list the examples in this folder. | @@ -123,6 +123,7 @@ The following tables list the examples in this folder. | ||
| 123 | |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| | 123 | |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| |
| 124 | |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| | 124 | |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| |
| 125 | |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| | 125 | |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| |
| 126 | +|[./test_asr_non_streaming_zipformer_ctc.js](./test_asr_non_streaming_zipformer_ctc.js)|Non-streaming speech recognition from a file using a Zipformer CTC model with greedy search| | ||
| 126 | |[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search| | 127 | |[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search| |
| 127 | |[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search| | 128 | |[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search|
| 128 | |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| | 129 | |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |
| @@ -137,6 +138,7 @@ The following tables list the examples in this folder. | @@ -137,6 +138,7 @@ The following tables list the examples in this folder. | ||
| 137 | |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)| | 138 | |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)| |
| 138 | |[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)| | 139 | |[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)| |
| 139 | |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| | 140 | |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| |
| 141 | +|[./test_vad_asr_non_streaming_zipformer_ctc_microphone.js](./test_vad_asr_non_streaming_zipformer_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer CTC model with greedy search| | ||
| 140 | |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| | 142 | |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |
| 141 | |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| | 143 | |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| |
| 142 | 144 | ||
| @@ -372,6 +374,21 @@ rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 | @@ -372,6 +374,21 @@ rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 | ||
| 372 | node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js | 374 | node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js |
| 373 | ``` | 375 | ``` |
| 374 | 376 | ||
| 377 | +### Non-streaming speech recognition with Zipformer CTC models | ||
| 378 | + | ||
| 379 | +```bash | ||
| 380 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 381 | + | ||
| 382 | +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 383 | +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 384 | + | ||
| 385 | +node ./test_asr_non_streaming_zipformer_ctc.js | ||
| 386 | + | ||
| 387 | +# To run VAD + non-streaming ASR with Paraformer using a microphone | ||
| 388 | +npm install naudiodon2 | ||
| 389 | +node ./test_vad_asr_non_streaming_zipformer_ctc_microphone.js | ||
| 390 | +``` | ||
| 391 | + | ||
| 375 | ### Non-streaming speech recognition with NeMo CTC models | 392 | ### Non-streaming speech recognition with NeMo CTC models |
| 376 | 393 | ||
| 377 | ```bash | 394 | ```bash |
| 1 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 2 | +const sherpa_onnx = require('sherpa-onnx-node'); | ||
| 3 | + | ||
| 4 | +// Please download test files from | ||
| 5 | +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 6 | +const config = { | ||
| 7 | + 'featConfig': { | ||
| 8 | + 'sampleRate': 16000, | ||
| 9 | + 'featureDim': 80, | ||
| 10 | + }, | ||
| 11 | + 'modelConfig': { | ||
| 12 | + 'zipformerCtc': { | ||
| 13 | + 'model': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx', | ||
| 14 | + }, | ||
| 15 | + 'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt', | ||
| 16 | + 'numThreads': 2, | ||
| 17 | + 'provider': 'cpu', | ||
| 18 | + 'debug': 1, | ||
| 19 | + } | ||
| 20 | +}; | ||
| 21 | + | ||
| 22 | +const waveFilename = | ||
| 23 | + './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav'; | ||
| 24 | + | ||
| 25 | +const recognizer = new sherpa_onnx.OfflineRecognizer(config); | ||
| 26 | +console.log('Started') | ||
| 27 | +let start = Date.now(); | ||
| 28 | +const stream = recognizer.createStream(); | ||
| 29 | +const wave = sherpa_onnx.readWave(waveFilename); | ||
| 30 | +stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); | ||
| 31 | + | ||
| 32 | +recognizer.decode(stream); | ||
| 33 | +result = recognizer.getResult(stream) | ||
| 34 | +let stop = Date.now(); | ||
| 35 | +console.log('Done') | ||
| 36 | + | ||
| 37 | +const elapsed_seconds = (stop - start) / 1000; | ||
| 38 | +const duration = wave.samples.length / wave.sampleRate; | ||
| 39 | +const real_time_factor = elapsed_seconds / duration; | ||
| 40 | +console.log('Wave duration', duration.toFixed(3), 'seconds') | ||
| 41 | +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') | ||
| 42 | +console.log( | ||
| 43 | + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, | ||
| 44 | + real_time_factor.toFixed(3)) | ||
| 45 | +console.log(waveFilename) | ||
| 46 | +console.log('result\n', result) |
| 1 | +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) | ||
| 2 | +// | ||
| 3 | +const portAudio = require('naudiodon2'); | ||
| 4 | +// console.log(portAudio.getDevices()); | ||
| 5 | + | ||
| 6 | +const sherpa_onnx = require('sherpa-onnx-node'); | ||
| 7 | + | ||
| 8 | +function createRecognizer() { | ||
| 9 | + // Please download test files from | ||
| 10 | + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 11 | + const config = { | ||
| 12 | + 'featConfig': { | ||
| 13 | + 'sampleRate': 16000, | ||
| 14 | + 'featureDim': 80, | ||
| 15 | + }, | ||
| 16 | + 'modelConfig': { | ||
| 17 | + 'zipformerCtc': { | ||
| 18 | + 'model': | ||
| 19 | + './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx', | ||
| 20 | + }, | ||
| 21 | + 'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt', | ||
| 22 | + 'numThreads': 2, | ||
| 23 | + 'provider': 'cpu', | ||
| 24 | + 'debug': 1, | ||
| 25 | + } | ||
| 26 | + }; | ||
| 27 | + | ||
| 28 | + return new sherpa_onnx.OfflineRecognizer(config); | ||
| 29 | +} | ||
| 30 | + | ||
| 31 | +function createVad() { | ||
| 32 | + // please download silero_vad.onnx from | ||
| 33 | + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 34 | + const config = { | ||
| 35 | + sileroVad: { | ||
| 36 | + model: './silero_vad.onnx', | ||
| 37 | + threshold: 0.5, | ||
| 38 | + minSpeechDuration: 0.25, | ||
| 39 | + minSilenceDuration: 0.5, | ||
| 40 | + windowSize: 512, | ||
| 41 | + }, | ||
| 42 | + sampleRate: 16000, | ||
| 43 | + debug: true, | ||
| 44 | + numThreads: 1, | ||
| 45 | + }; | ||
| 46 | + | ||
| 47 | + const bufferSizeInSeconds = 60; | ||
| 48 | + | ||
| 49 | + return new sherpa_onnx.Vad(config, bufferSizeInSeconds); | ||
| 50 | +} | ||
| 51 | + | ||
| 52 | +const recognizer = createRecognizer(); | ||
| 53 | +const vad = createVad(); | ||
| 54 | + | ||
| 55 | +const bufferSizeInSeconds = 30; | ||
| 56 | +const buffer = | ||
| 57 | + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); | ||
| 58 | + | ||
| 59 | +const ai = new portAudio.AudioIO({ | ||
| 60 | + inOptions: { | ||
| 61 | + channelCount: 1, | ||
| 62 | + closeOnError: true, // Close the stream if an audio error is detected, if | ||
| 63 | + // set false then just log the error | ||
| 64 | + deviceId: -1, // Use -1 or omit the deviceId to select the default device | ||
| 65 | + sampleFormat: portAudio.SampleFormatFloat32, | ||
| 66 | + sampleRate: vad.config.sampleRate | ||
| 67 | + } | ||
| 68 | +}); | ||
| 69 | + | ||
| 70 | +let printed = false; | ||
| 71 | +let index = 0; | ||
| 72 | +ai.on('data', data => { | ||
| 73 | + const windowSize = vad.config.sileroVad.windowSize; | ||
| 74 | + buffer.push(new Float32Array(data.buffer)); | ||
| 75 | + while (buffer.size() > windowSize) { | ||
| 76 | + const samples = buffer.get(buffer.head(), windowSize); | ||
| 77 | + buffer.pop(windowSize); | ||
| 78 | + vad.acceptWaveform(samples); | ||
| 79 | + } | ||
| 80 | + | ||
| 81 | + while (!vad.isEmpty()) { | ||
| 82 | + const segment = vad.front(); | ||
| 83 | + vad.pop(); | ||
| 84 | + const stream = recognizer.createStream(); | ||
| 85 | + stream.acceptWaveform({ | ||
| 86 | + samples: segment.samples, | ||
| 87 | + sampleRate: recognizer.config.featConfig.sampleRate | ||
| 88 | + }); | ||
| 89 | + recognizer.decode(stream); | ||
| 90 | + const r = recognizer.getResult(stream); | ||
| 91 | + if (r.text.length > 0) { | ||
| 92 | + const text = r.text.toLowerCase().trim(); | ||
| 93 | + console.log(`${index}: ${text}`); | ||
| 94 | + | ||
| 95 | + const filename = `${index}-${text}-${ | ||
| 96 | + new Date() | ||
| 97 | + .toLocaleTimeString('en-US', {hour12: false}) | ||
| 98 | + .split(' ')[0]}.wav`; | ||
| 99 | + sherpa_onnx.writeWave( | ||
| 100 | + filename, | ||
| 101 | + {samples: segment.samples, sampleRate: vad.config.sampleRate}); | ||
| 102 | + | ||
| 103 | + index += 1; | ||
| 104 | + } | ||
| 105 | + } | ||
| 106 | +}); | ||
| 107 | + | ||
| 108 | +ai.start(); | ||
| 109 | +console.log('Started! Please speak') |
| @@ -154,6 +154,23 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 | @@ -154,6 +154,23 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 | ||
| 154 | node ./test-offline-dolphin-ctc.js | 154 | node ./test-offline-dolphin-ctc.js |
| 155 | ``` | 155 | ``` |
| 156 | 156 | ||
| 157 | +## ./test-offline-zipformer-ctc.js | ||
| 158 | + | ||
| 159 | +[./test-offline-zipformer-ctc.js](./test-offline-zipformer-ctc.js) demonstrates | ||
| 160 | +how to decode a file with a Zipformer CTC model. In the code we use | ||
| 161 | +[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese). | ||
| 162 | + | ||
| 163 | +You can use the following command to run it: | ||
| 164 | + | ||
| 165 | +```bash | ||
| 166 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 167 | + | ||
| 168 | +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 169 | +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 170 | + | ||
| 171 | +node ./test-offline-zipformer-ctc.js | ||
| 172 | +``` | ||
| 173 | + | ||
| 157 | ## ./test-offline-nemo-ctc.js | 174 | ## ./test-offline-nemo-ctc.js |
| 158 | 175 | ||
| 159 | [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates | 176 | [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates |
| 1 | +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) | ||
| 2 | +// | ||
| 3 | +const fs = require('fs'); | ||
| 4 | +const {Readable} = require('stream'); | ||
| 5 | +const wav = require('wav'); | ||
| 6 | + | ||
| 7 | +const sherpa_onnx = require('sherpa-onnx'); | ||
| 8 | + | ||
| 9 | +function createOfflineRecognizer() { | ||
| 10 | + let config = { | ||
| 11 | + modelConfig: { | ||
| 12 | + zipformerCtc: { | ||
| 13 | + model: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx', | ||
| 14 | + }, | ||
| 15 | + tokens: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt', | ||
| 16 | + } | ||
| 17 | + }; | ||
| 18 | + | ||
| 19 | + return sherpa_onnx.createOfflineRecognizer(config); | ||
| 20 | +} | ||
| 21 | + | ||
| 22 | +const recognizer = createOfflineRecognizer(); | ||
| 23 | +const stream = recognizer.createStream(); | ||
| 24 | + | ||
| 25 | +const waveFilename = | ||
| 26 | + './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav'; | ||
| 27 | +const wave = sherpa_onnx.readWave(waveFilename); | ||
| 28 | +stream.acceptWaveform(wave.sampleRate, wave.samples); | ||
| 29 | + | ||
| 30 | +recognizer.decode(stream); | ||
| 31 | +const text = recognizer.getResult(stream).text; | ||
| 32 | +console.log(text); | ||
| 33 | + | ||
| 34 | +stream.free(); | ||
| 35 | +recognizer.free(); |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) | ||
| 6 | +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) | ||
| 7 | + | ||
| 8 | +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" | ||
| 9 | + | ||
| 10 | +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then | ||
| 11 | + mkdir -p ../../build | ||
| 12 | + pushd ../../build | ||
| 13 | + cmake \ | ||
| 14 | + -DCMAKE_INSTALL_PREFIX=./install \ | ||
| 15 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 16 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 17 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 18 | + -DBUILD_SHARED_LIBS=ON \ | ||
| 19 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 20 | + .. | ||
| 21 | + | ||
| 22 | + cmake --build . --target install --config Release | ||
| 23 | + ls -lh lib | ||
| 24 | + popd | ||
| 25 | +fi | ||
| 26 | + | ||
| 27 | +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then | ||
| 28 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 29 | + | ||
| 30 | + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 31 | + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 32 | +fi | ||
| 33 | + | ||
| 34 | +fpc \ | ||
| 35 | + -dSHERPA_ONNX_USE_SHARED_LIBS \ | ||
| 36 | + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ | ||
| 37 | + -Fl$SHERPA_ONNX_DIR/build/install/lib \ | ||
| 38 | + ./zipformer_ctc.pas | ||
| 39 | + | ||
| 40 | +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH | ||
| 41 | +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 42 | + | ||
| 43 | +./zipformer_ctc |
| 1 | +{ Copyright (c) 2025 Xiaomi Corporation } | ||
| 2 | + | ||
| 3 | +{ | ||
| 4 | +This file shows how to use a non-streaming Zipformer CTC model | ||
| 5 | +to decode files. | ||
| 6 | + | ||
| 7 | +You can download the model files from | ||
| 8 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 9 | +} | ||
| 10 | + | ||
| 11 | +program zipformer_ctc; | ||
| 12 | + | ||
| 13 | +{$mode objfpc} | ||
| 14 | + | ||
| 15 | +uses | ||
| 16 | + sherpa_onnx, | ||
| 17 | + DateUtils, | ||
| 18 | + SysUtils; | ||
| 19 | + | ||
| 20 | +var | ||
| 21 | + Wave: TSherpaOnnxWave; | ||
| 22 | + WaveFilename: AnsiString; | ||
| 23 | + | ||
| 24 | + Config: TSherpaOnnxOfflineRecognizerConfig; | ||
| 25 | + Recognizer: TSherpaOnnxOfflineRecognizer; | ||
| 26 | + Stream: TSherpaOnnxOfflineStream; | ||
| 27 | + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; | ||
| 28 | + | ||
| 29 | + Start: TDateTime; | ||
| 30 | + Stop: TDateTime; | ||
| 31 | + | ||
| 32 | + Elapsed: Single; | ||
| 33 | + Duration: Single; | ||
| 34 | + RealTimeFactor: Single; | ||
| 35 | +begin | ||
| 36 | + Initialize(Config); | ||
| 37 | + | ||
| 38 | + Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx'; | ||
| 39 | + Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt'; | ||
| 40 | + Config.ModelConfig.Provider := 'cpu'; | ||
| 41 | + Config.ModelConfig.NumThreads := 1; | ||
| 42 | + Config.ModelConfig.Debug := False; | ||
| 43 | + | ||
| 44 | + WaveFilename := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav'; | ||
| 45 | + | ||
| 46 | + Wave := SherpaOnnxReadWave(WaveFilename); | ||
| 47 | + | ||
| 48 | + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); | ||
| 49 | + Stream := Recognizer.CreateStream(); | ||
| 50 | + Start := Now; | ||
| 51 | + | ||
| 52 | + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); | ||
| 53 | + Recognizer.Decode(Stream); | ||
| 54 | + | ||
| 55 | + RecognitionResult := Recognizer.GetResult(Stream); | ||
| 56 | + | ||
| 57 | + Stop := Now; | ||
| 58 | + | ||
| 59 | + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; | ||
| 60 | + Duration := Length(Wave.Samples) / Wave.SampleRate; | ||
| 61 | + RealTimeFactor := Elapsed / Duration; | ||
| 62 | + | ||
| 63 | + WriteLn(RecognitionResult.ToString); | ||
| 64 | + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); | ||
| 65 | + WriteLn(Format('Elapsed %.3f s', [Elapsed])); | ||
| 66 | + WriteLn(Format('Wave duration %.3f s', [Duration])); | ||
| 67 | + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); | ||
| 68 | + | ||
| 69 | + {Free resources to avoid memory leak. | ||
| 70 | + | ||
| 71 | + Note: You don't need to invoke them for this simple script. | ||
| 72 | + However, you have to invoke them in your own large/complex project. | ||
| 73 | + } | ||
| 74 | + FreeAndNil(Stream); | ||
| 75 | + FreeAndNil(Recognizer); | ||
| 76 | +end. |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) | ||
| 6 | +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) | ||
| 7 | + | ||
| 8 | +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" | ||
| 9 | + | ||
| 10 | +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then | ||
| 11 | + mkdir -p ../../build | ||
| 12 | + pushd ../../build | ||
| 13 | + cmake \ | ||
| 14 | + -DCMAKE_INSTALL_PREFIX=./install \ | ||
| 15 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 16 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 17 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 18 | + -DBUILD_SHARED_LIBS=ON \ | ||
| 19 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 20 | + .. | ||
| 21 | + | ||
| 22 | + cmake --build . --target install --config Release | ||
| 23 | + popd | ||
| 24 | +fi | ||
| 25 | + | ||
| 26 | +if [[ ! -f ./silero_vad.onnx ]]; then | ||
| 27 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 28 | +fi | ||
| 29 | + | ||
| 30 | +if [ ! -f ./lei-jun-test.wav ]; then | ||
| 31 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav | ||
| 32 | +fi | ||
| 33 | + | ||
| 34 | +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then | ||
| 35 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 36 | + | ||
| 37 | + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 38 | + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 39 | +fi | ||
| 40 | + | ||
| 41 | +fpc \ | ||
| 42 | + -dSHERPA_ONNX_USE_SHARED_LIBS \ | ||
| 43 | + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ | ||
| 44 | + -Fl$SHERPA_ONNX_DIR/build/install/lib \ | ||
| 45 | + ./vad_with_zipformer_ctc.pas | ||
| 46 | + | ||
| 47 | +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH | ||
| 48 | +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 49 | + | ||
| 50 | +./vad_with_zipformer_ctc |
| 1 | +{ Copyright (c) 2025 Xiaomi Corporation } | ||
| 2 | + | ||
| 3 | +{ | ||
| 4 | +This file shows how to use a non-streaming Zipformer CTC model | ||
| 5 | +with silero VAD to decode files. | ||
| 6 | + | ||
| 7 | +You can download the model files from | ||
| 8 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 9 | +} | ||
| 10 | + | ||
| 11 | +program vad_with_zipformer_ctc; | ||
| 12 | + | ||
| 13 | +{$mode objfpc} | ||
| 14 | + | ||
| 15 | +uses | ||
| 16 | + sherpa_onnx, | ||
| 17 | + SysUtils; | ||
| 18 | + | ||
| 19 | +function CreateVad(): TSherpaOnnxVoiceActivityDetector; | ||
| 20 | +var | ||
| 21 | + Config: TSherpaOnnxVadModelConfig; | ||
| 22 | + | ||
| 23 | + SampleRate: Integer; | ||
| 24 | + WindowSize: Integer; | ||
| 25 | +begin | ||
| 26 | + Initialize(Config); | ||
| 27 | + | ||
| 28 | + SampleRate := 16000; {Please don't change it unless you know the details} | ||
| 29 | + WindowSize := 512; {Please don't change it unless you know the details} | ||
| 30 | + | ||
| 31 | + Config.SileroVad.Model := './silero_vad.onnx'; | ||
| 32 | + Config.SileroVad.MinSpeechDuration := 0.5; | ||
| 33 | + Config.SileroVad.MinSilenceDuration := 0.5; | ||
| 34 | + Config.SileroVad.Threshold := 0.5; | ||
| 35 | + Config.SileroVad.WindowSize := WindowSize; | ||
| 36 | + Config.NumThreads:= 1; | ||
| 37 | + Config.Debug:= True; | ||
| 38 | + Config.Provider:= 'cpu'; | ||
| 39 | + Config.SampleRate := SampleRate; | ||
| 40 | + | ||
| 41 | + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30); | ||
| 42 | +end; | ||
| 43 | + | ||
| 44 | +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer; | ||
| 45 | +var | ||
| 46 | + Config: TSherpaOnnxOfflineRecognizerConfig; | ||
| 47 | +begin | ||
| 48 | + Initialize(Config); | ||
| 49 | + | ||
| 50 | + Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx'; | ||
| 51 | + Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt'; | ||
| 52 | + Config.ModelConfig.Provider := 'cpu'; | ||
| 53 | + Config.ModelConfig.NumThreads := 1; | ||
| 54 | + Config.ModelConfig.Debug := False; | ||
| 55 | + | ||
| 56 | + Result := TSherpaOnnxOfflineRecognizer.Create(Config); | ||
| 57 | +end; | ||
| 58 | + | ||
| 59 | +var | ||
| 60 | + Wave: TSherpaOnnxWave; | ||
| 61 | + | ||
| 62 | + Recognizer: TSherpaOnnxOfflineRecognizer; | ||
| 63 | + Vad: TSherpaOnnxVoiceActivityDetector; | ||
| 64 | + | ||
| 65 | + Offset: Integer; | ||
| 66 | + WindowSize: Integer; | ||
| 67 | + SpeechSegment: TSherpaOnnxSpeechSegment; | ||
| 68 | + | ||
| 69 | + Start: Single; | ||
| 70 | + Duration: Single; | ||
| 71 | + | ||
| 72 | + Stream: TSherpaOnnxOfflineStream; | ||
| 73 | + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; | ||
| 74 | +begin | ||
| 75 | + Vad := CreateVad(); | ||
| 76 | + Recognizer := CreateOfflineRecognizer(); | ||
| 77 | + | ||
| 78 | + Wave := SherpaOnnxReadWave('./lei-jun-test.wav'); | ||
| 79 | + if Wave.SampleRate <> Vad.Config.SampleRate then | ||
| 80 | + begin | ||
| 81 | + WriteLn(Format('Expected sample rate: %d. Given: %d', | ||
| 82 | + [Vad.Config.SampleRate, Wave.SampleRate])); | ||
| 83 | + | ||
| 84 | + Exit; | ||
| 85 | + end; | ||
| 86 | + | ||
| 87 | + WindowSize := Vad.Config.SileroVad.WindowSize; | ||
| 88 | + Offset := 0; | ||
| 89 | + while Offset + WindowSize <= Length(Wave.Samples) do | ||
| 90 | + begin | ||
| 91 | + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize); | ||
| 92 | + Offset += WindowSize; | ||
| 93 | + | ||
| 94 | + while not Vad.IsEmpty do | ||
| 95 | + begin | ||
| 96 | + SpeechSegment := Vad.Front(); | ||
| 97 | + Vad.Pop(); | ||
| 98 | + Stream := Recognizer.CreateStream(); | ||
| 99 | + | ||
| 100 | + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); | ||
| 101 | + Recognizer.Decode(Stream); | ||
| 102 | + RecognitionResult := Recognizer.GetResult(Stream); | ||
| 103 | + | ||
| 104 | + Start := SpeechSegment.Start / Wave.SampleRate; | ||
| 105 | + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; | ||
| 106 | + WriteLn(Format('%.3f -- %.3f %s', | ||
| 107 | + [Start, Start + Duration, RecognitionResult.Text])); | ||
| 108 | + | ||
| 109 | + FreeAndNil(Stream); | ||
| 110 | + end; | ||
| 111 | + end; | ||
| 112 | + | ||
| 113 | + Vad.Flush; | ||
| 114 | + | ||
| 115 | + while not Vad.IsEmpty do | ||
| 116 | + begin | ||
| 117 | + SpeechSegment := Vad.Front(); | ||
| 118 | + Vad.Pop(); | ||
| 119 | + Stream := Recognizer.CreateStream(); | ||
| 120 | + | ||
| 121 | + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); | ||
| 122 | + Recognizer.Decode(Stream); | ||
| 123 | + RecognitionResult := Recognizer.GetResult(Stream); | ||
| 124 | + | ||
| 125 | + Start := SpeechSegment.Start / Wave.SampleRate; | ||
| 126 | + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; | ||
| 127 | + WriteLn(Format('%.3f -- %.3f %s', | ||
| 128 | + [Start, Start + Duration, RecognitionResult.Text])); | ||
| 129 | + | ||
| 130 | + FreeAndNil(Stream); | ||
| 131 | + end; | ||
| 132 | + | ||
| 133 | + FreeAndNil(Recognizer); | ||
| 134 | + FreeAndNil(Vad); | ||
| 135 | +end. |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +This file shows how to use a non-streaming zipformer CTC model from icefall | ||
| 5 | +to decode files. | ||
| 6 | + | ||
| 7 | +Please download model files from | ||
| 8 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 9 | + | ||
| 10 | +""" | ||
| 11 | + | ||
| 12 | +from pathlib import Path | ||
| 13 | + | ||
| 14 | +import sherpa_onnx | ||
| 15 | +import soundfile as sf | ||
| 16 | + | ||
| 17 | + | ||
| 18 | +def create_recognizer(): | ||
| 19 | + model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx" | ||
| 20 | + tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt" | ||
| 21 | + test_wav = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav" | ||
| 22 | + | ||
| 23 | + if not Path(model).is_file() or not Path(test_wav).is_file(): | ||
| 24 | + raise ValueError( | ||
| 25 | + """Please download model files from | ||
| 26 | + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 27 | + """ | ||
| 28 | + ) | ||
| 29 | + return ( | ||
| 30 | + sherpa_onnx.OfflineRecognizer.from_zipformer_ctc( | ||
| 31 | + model=model, | ||
| 32 | + tokens=tokens, | ||
| 33 | + debug=True, | ||
| 34 | + ), | ||
| 35 | + test_wav, | ||
| 36 | + ) | ||
| 37 | + | ||
| 38 | + | ||
| 39 | +def main(): | ||
| 40 | + recognizer, wave_filename = create_recognizer() | ||
| 41 | + | ||
| 42 | + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) | ||
| 43 | + audio = audio[:, 0] # only use the first channel | ||
| 44 | + | ||
| 45 | + # audio is a 1-D float32 numpy array normalized to the range [-1, 1] | ||
| 46 | + # sample_rate does not need to be 16000 Hz | ||
| 47 | + | ||
| 48 | + stream = recognizer.create_stream() | ||
| 49 | + stream.accept_waveform(sample_rate, audio) | ||
| 50 | + recognizer.decode_stream(stream) | ||
| 51 | + print(wave_filename) | ||
| 52 | + print(stream.result) | ||
| 53 | + | ||
| 54 | + | ||
| 55 | +if __name__ == "__main__": | ||
| 56 | + main() |
| @@ -344,7 +344,7 @@ def get_models(): | @@ -344,7 +344,7 @@ def get_models(): | ||
| 344 | """, | 344 | """, |
| 345 | ), | 345 | ), |
| 346 | Model( | 346 | Model( |
| 347 | - model_name="sherpa-onnx-streaming-zipformer-ctc-fp16-zh-2025-06-30", | 347 | + model_name="sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30", |
| 348 | idx=19, | 348 | idx=19, |
| 349 | lang="zh", | 349 | lang="zh", |
| 350 | short_name="large_zipformer_fp16", | 350 | short_name="large_zipformer_fp16", |
| @@ -363,6 +363,26 @@ def get_models(): | @@ -363,6 +363,26 @@ def get_models(): | ||
| 363 | popd | 363 | popd |
| 364 | """, | 364 | """, |
| 365 | ), | 365 | ), |
| 366 | + Model( | ||
| 367 | + model_name="sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30", | ||
| 368 | + idx=20, | ||
| 369 | + lang="zh", | ||
| 370 | + short_name="large_zipformer_int8", | ||
| 371 | + rule_fsts="itn_zh_number.fst", | ||
| 372 | + cmd=""" | ||
| 373 | + if [ ! -f itn_zh_number.fst ]; then | ||
| 374 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst | ||
| 375 | + fi | ||
| 376 | + pushd $model_name | ||
| 377 | + rm -fv bpe.model | ||
| 378 | + | ||
| 379 | + rm -rf test_wavs | ||
| 380 | + | ||
| 381 | + ls -lh | ||
| 382 | + | ||
| 383 | + popd | ||
| 384 | + """, | ||
| 385 | + ), | ||
| 366 | ] | 386 | ] |
| 367 | 387 | ||
| 368 | return models | 388 | return models |
| @@ -551,6 +551,23 @@ def get_models(): | @@ -551,6 +551,23 @@ def get_models(): | ||
| 551 | popd | 551 | popd |
| 552 | """, | 552 | """, |
| 553 | ), | 553 | ), |
| 554 | + Model( | ||
| 555 | + model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03", | ||
| 556 | + idx=31, | ||
| 557 | + lang="zh", | ||
| 558 | + lang2="Chinese", | ||
| 559 | + short_name="zipformer_2025_07_03", | ||
| 560 | + cmd=""" | ||
| 561 | + pushd $model_name | ||
| 562 | + | ||
| 563 | + rm -rfv test_wavs | ||
| 564 | + rm -rfv bbpe.model | ||
| 565 | + | ||
| 566 | + ls -lh | ||
| 567 | + | ||
| 568 | + popd | ||
| 569 | + """, | ||
| 570 | + ), | ||
| 554 | ] | 571 | ] |
| 555 | return models | 572 | return models |
| 556 | 573 |
| @@ -27,6 +27,7 @@ namespace SherpaOnnx | @@ -27,6 +27,7 @@ namespace SherpaOnnx | ||
| 27 | Moonshine = new OfflineMoonshineModelConfig(); | 27 | Moonshine = new OfflineMoonshineModelConfig(); |
| 28 | FireRedAsr = new OfflineFireRedAsrModelConfig(); | 28 | FireRedAsr = new OfflineFireRedAsrModelConfig(); |
| 29 | Dolphin = new OfflineDolphinModelConfig(); | 29 | Dolphin = new OfflineDolphinModelConfig(); |
| 30 | + ZipformerCtc = new OfflineZipformerCtcModelConfig(); | ||
| 30 | } | 31 | } |
| 31 | public OfflineTransducerModelConfig Transducer; | 32 | public OfflineTransducerModelConfig Transducer; |
| 32 | public OfflineParaformerModelConfig Paraformer; | 33 | public OfflineParaformerModelConfig Paraformer; |
| @@ -60,5 +61,6 @@ namespace SherpaOnnx | @@ -60,5 +61,6 @@ namespace SherpaOnnx | ||
| 60 | public OfflineMoonshineModelConfig Moonshine; | 61 | public OfflineMoonshineModelConfig Moonshine; |
| 61 | public OfflineFireRedAsrModelConfig FireRedAsr; | 62 | public OfflineFireRedAsrModelConfig FireRedAsr; |
| 62 | public OfflineDolphinModelConfig Dolphin; | 63 | public OfflineDolphinModelConfig Dolphin; |
| 64 | + public OfflineZipformerCtcModelConfig ZipformerCtc; | ||
| 63 | } | 65 | } |
| 64 | } | 66 | } |
| 1 | +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) | ||
| 2 | + | ||
| 3 | +using System.Runtime.InteropServices; | ||
| 4 | + | ||
| 5 | +namespace SherpaOnnx | ||
| 6 | +{ | ||
| 7 | + | ||
| 8 | + [StructLayout(LayoutKind.Sequential)] | ||
| 9 | + public struct OfflineZipformerCtcModelConfig | ||
| 10 | + { | ||
| 11 | + public OfflineZipformerCtcModelConfig() | ||
| 12 | + { | ||
| 13 | + Model = ""; | ||
| 14 | + } | ||
| 15 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 16 | + public string Model; | ||
| 17 | + } | ||
| 18 | +} |
| 1 | +../../../../go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh |
| @@ -398,6 +398,10 @@ type OfflineNemoEncDecCtcModelConfig struct { | @@ -398,6 +398,10 @@ type OfflineNemoEncDecCtcModelConfig struct { | ||
| 398 | Model string // Path to the model, e.g., model.onnx or model.int8.onnx | 398 | Model string // Path to the model, e.g., model.onnx or model.int8.onnx |
| 399 | } | 399 | } |
| 400 | 400 | ||
| 401 | +type OfflineZipformerCtcModelConfig struct { | ||
| 402 | + Model string // Path to the model, e.g., model.onnx or model.int8.onnx | ||
| 403 | +} | ||
| 404 | + | ||
| 401 | type OfflineDolphinModelConfig struct { | 405 | type OfflineDolphinModelConfig struct { |
| 402 | Model string // Path to the model, e.g., model.onnx or model.int8.onnx | 406 | Model string // Path to the model, e.g., model.onnx or model.int8.onnx |
| 403 | } | 407 | } |
| @@ -439,16 +443,17 @@ type OfflineLMConfig struct { | @@ -439,16 +443,17 @@ type OfflineLMConfig struct { | ||
| 439 | } | 443 | } |
| 440 | 444 | ||
| 441 | type OfflineModelConfig struct { | 445 | type OfflineModelConfig struct { |
| 442 | - Transducer OfflineTransducerModelConfig | ||
| 443 | - Paraformer OfflineParaformerModelConfig | ||
| 444 | - NemoCTC OfflineNemoEncDecCtcModelConfig | ||
| 445 | - Whisper OfflineWhisperModelConfig | ||
| 446 | - Tdnn OfflineTdnnModelConfig | ||
| 447 | - SenseVoice OfflineSenseVoiceModelConfig | ||
| 448 | - Moonshine OfflineMoonshineModelConfig | ||
| 449 | - FireRedAsr OfflineFireRedAsrModelConfig | ||
| 450 | - Dolphin OfflineDolphinModelConfig | ||
| 451 | - Tokens string // Path to tokens.txt | 446 | + Transducer OfflineTransducerModelConfig |
| 447 | + Paraformer OfflineParaformerModelConfig | ||
| 448 | + NemoCTC OfflineNemoEncDecCtcModelConfig | ||
| 449 | + Whisper OfflineWhisperModelConfig | ||
| 450 | + Tdnn OfflineTdnnModelConfig | ||
| 451 | + SenseVoice OfflineSenseVoiceModelConfig | ||
| 452 | + Moonshine OfflineMoonshineModelConfig | ||
| 453 | + FireRedAsr OfflineFireRedAsrModelConfig | ||
| 454 | + Dolphin OfflineDolphinModelConfig | ||
| 455 | + ZipformerCtc OfflineZipformerCtcModelConfig | ||
| 456 | + Tokens string // Path to tokens.txt | ||
| 452 | 457 | ||
| 453 | // Number of threads to use for neural network computation | 458 | // Number of threads to use for neural network computation |
| 454 | NumThreads int | 459 | NumThreads int |
| @@ -540,6 +545,7 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher | @@ -540,6 +545,7 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher | ||
| 540 | c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder) | 545 | c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder) |
| 541 | 546 | ||
| 542 | c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model) | 547 | c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model) |
| 548 | + c.model_config.zipformer_ctc.model = C.CString(config.ModelConfig.ZipformerCtc.Model) | ||
| 543 | 549 | ||
| 544 | c.model_config.tokens = C.CString(config.ModelConfig.Tokens) | 550 | c.model_config.tokens = C.CString(config.ModelConfig.Tokens) |
| 545 | 551 | ||
| @@ -653,11 +659,22 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig) | @@ -653,11 +659,22 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig) | ||
| 653 | C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder)) | 659 | C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder)) |
| 654 | c.model_config.fire_red_asr.encoder = nil | 660 | c.model_config.fire_red_asr.encoder = nil |
| 655 | } | 661 | } |
| 662 | + | ||
| 656 | if c.model_config.fire_red_asr.decoder != nil { | 663 | if c.model_config.fire_red_asr.decoder != nil { |
| 657 | C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder)) | 664 | C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder)) |
| 658 | c.model_config.fire_red_asr.decoder = nil | 665 | c.model_config.fire_red_asr.decoder = nil |
| 659 | } | 666 | } |
| 660 | 667 | ||
| 668 | + if c.model_config.dolphin.model != nil { | ||
| 669 | + C.free(unsafe.Pointer(c.model_config.dolphin.model)) | ||
| 670 | + c.model_config.dolphin.model = nil | ||
| 671 | + } | ||
| 672 | + | ||
| 673 | + if c.model_config.zipformer_ctc.model != nil { | ||
| 674 | + C.free(unsafe.Pointer(c.model_config.zipformer_ctc.model)) | ||
| 675 | + c.model_config.zipformer_ctc.model = nil | ||
| 676 | + } | ||
| 677 | + | ||
| 661 | if c.model_config.tokens != nil { | 678 | if c.model_config.tokens != nil { |
| 662 | C.free(unsafe.Pointer(c.model_config.tokens)) | 679 | C.free(unsafe.Pointer(c.model_config.tokens)) |
| 663 | c.model_config.tokens = nil | 680 | c.model_config.tokens = nil |
| @@ -212,6 +212,21 @@ def get_models(): | @@ -212,6 +212,21 @@ def get_models(): | ||
| 212 | git diff | 212 | git diff |
| 213 | """, | 213 | """, |
| 214 | ), | 214 | ), |
| 215 | + Model( | ||
| 216 | + model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03", | ||
| 217 | + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc", | ||
| 218 | + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc", | ||
| 219 | + short_name="vad-asr-zh-zipformer-ctc", | ||
| 220 | + cmd=""" | ||
| 221 | + pushd $model_name | ||
| 222 | + mv model.int8.onnx ../zipformer-ctc.onnx | ||
| 223 | + mv tokens.txt ../ | ||
| 224 | + popd | ||
| 225 | + rm -rf $model_name | ||
| 226 | + sed -i.bak 's/Zipformer/Zipformer CTC supporting Chinese 中文/g' ../index.html | ||
| 227 | + git diff | ||
| 228 | + """, | ||
| 229 | + ), | ||
| 215 | ] | 230 | ] |
| 216 | return models | 231 | return models |
| 217 | 232 |
| @@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig( | @@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig( | ||
| 484 | recognizer_config.model_config.dolphin.model = | 484 | recognizer_config.model_config.dolphin.model = |
| 485 | SHERPA_ONNX_OR(config->model_config.dolphin.model, ""); | 485 | SHERPA_ONNX_OR(config->model_config.dolphin.model, ""); |
| 486 | 486 | ||
| 487 | + recognizer_config.model_config.zipformer_ctc.model = | ||
| 488 | + SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, ""); | ||
| 489 | + | ||
| 487 | recognizer_config.lm_config.model = | 490 | recognizer_config.lm_config.model = |
| 488 | SHERPA_ONNX_OR(config->lm_config.model, ""); | 491 | SHERPA_ONNX_OR(config->lm_config.model, ""); |
| 489 | recognizer_config.lm_config.scale = | 492 | recognizer_config.lm_config.scale = |
| @@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig { | @@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig { | ||
| 451 | const char *model; | 451 | const char *model; |
| 452 | } SherpaOnnxOfflineDolphinModelConfig; | 452 | } SherpaOnnxOfflineDolphinModelConfig; |
| 453 | 453 | ||
| 454 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineZipformerCtcModelConfig { | ||
| 455 | + const char *model; | ||
| 456 | +} SherpaOnnxOfflineZipformerCtcModelConfig; | ||
| 457 | + | ||
| 454 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { | 458 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { |
| 455 | SherpaOnnxOfflineTransducerModelConfig transducer; | 459 | SherpaOnnxOfflineTransducerModelConfig transducer; |
| 456 | SherpaOnnxOfflineParaformerModelConfig paraformer; | 460 | SherpaOnnxOfflineParaformerModelConfig paraformer; |
| @@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { | @@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { | ||
| 474 | SherpaOnnxOfflineMoonshineModelConfig moonshine; | 478 | SherpaOnnxOfflineMoonshineModelConfig moonshine; |
| 475 | SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr; | 479 | SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr; |
| 476 | SherpaOnnxOfflineDolphinModelConfig dolphin; | 480 | SherpaOnnxOfflineDolphinModelConfig dolphin; |
| 481 | + SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc; | ||
| 477 | } SherpaOnnxOfflineModelConfig; | 482 | } SherpaOnnxOfflineModelConfig; |
| 478 | 483 | ||
| 479 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { | 484 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { |
| @@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create( | @@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create( | ||
| 252 | 252 | ||
| 253 | c.model_config.dolphin.model = config.model_config.dolphin.model.c_str(); | 253 | c.model_config.dolphin.model = config.model_config.dolphin.model.c_str(); |
| 254 | 254 | ||
| 255 | + c.model_config.zipformer_ctc.model = | ||
| 256 | + config.model_config.zipformer_ctc.model.c_str(); | ||
| 257 | + | ||
| 255 | c.lm_config.model = config.lm_config.model.c_str(); | 258 | c.lm_config.model = config.lm_config.model.c_str(); |
| 256 | c.lm_config.scale = config.lm_config.scale; | 259 | c.lm_config.scale = config.lm_config.scale; |
| 257 | 260 |
| @@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig { | @@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig { | ||
| 241 | std::string model; | 241 | std::string model; |
| 242 | }; | 242 | }; |
| 243 | 243 | ||
| 244 | +struct SHERPA_ONNX_API OfflineZipformerCtcModelConfig { | ||
| 245 | + std::string model; | ||
| 246 | +}; | ||
| 247 | + | ||
| 244 | struct SHERPA_ONNX_API OfflineMoonshineModelConfig { | 248 | struct SHERPA_ONNX_API OfflineMoonshineModelConfig { |
| 245 | std::string preprocessor; | 249 | std::string preprocessor; |
| 246 | std::string encoder; | 250 | std::string encoder; |
| @@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig { | @@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig { | ||
| 267 | OfflineMoonshineModelConfig moonshine; | 271 | OfflineMoonshineModelConfig moonshine; |
| 268 | OfflineFireRedAsrModelConfig fire_red_asr; | 272 | OfflineFireRedAsrModelConfig fire_red_asr; |
| 269 | OfflineDolphinModelConfig dolphin; | 273 | OfflineDolphinModelConfig dolphin; |
| 274 | + OfflineZipformerCtcModelConfig zipformer_ctc; | ||
| 270 | }; | 275 | }; |
| 271 | 276 | ||
| 272 | struct SHERPA_ONNX_API OfflineLMConfig { | 277 | struct SHERPA_ONNX_API OfflineLMConfig { |
| @@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | @@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | ||
| 113 | const OfflineModelConfig &config) { | 113 | const OfflineModelConfig &config) { |
| 114 | if (!config.dolphin.model.empty()) { | 114 | if (!config.dolphin.model.empty()) { |
| 115 | return std::make_unique<OfflineDolphinModel>(config); | 115 | return std::make_unique<OfflineDolphinModel>(config); |
| 116 | + } else if (!config.nemo_ctc.model.empty()) { | ||
| 117 | + return std::make_unique<OfflineNemoEncDecCtcModel>(config); | ||
| 118 | + } else if (!config.tdnn.model.empty()) { | ||
| 119 | + return std::make_unique<OfflineTdnnCtcModel>(config); | ||
| 120 | + } else if (!config.zipformer_ctc.model.empty()) { | ||
| 121 | + return std::make_unique<OfflineZipformerCtcModel>(config); | ||
| 122 | + } else if (!config.wenet_ctc.model.empty()) { | ||
| 123 | + return std::make_unique<OfflineWenetCtcModel>(config); | ||
| 124 | + } else if (!config.telespeech_ctc.empty()) { | ||
| 125 | + return std::make_unique<OfflineTeleSpeechCtcModel>(config); | ||
| 116 | } | 126 | } |
| 117 | 127 | ||
| 118 | // TODO(fangjun): Refactor it. We don't need to use model_type here | 128 | // TODO(fangjun): Refactor it. We don't need to use model_type here |
| @@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | @@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | ||
| 167 | Manager *mgr, const OfflineModelConfig &config) { | 177 | Manager *mgr, const OfflineModelConfig &config) { |
| 168 | if (!config.dolphin.model.empty()) { | 178 | if (!config.dolphin.model.empty()) { |
| 169 | return std::make_unique<OfflineDolphinModel>(mgr, config); | 179 | return std::make_unique<OfflineDolphinModel>(mgr, config); |
| 180 | + } else if (!config.nemo_ctc.model.empty()) { | ||
| 181 | + return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config); | ||
| 182 | + } else if (!config.tdnn.model.empty()) { | ||
| 183 | + return std::make_unique<OfflineTdnnCtcModel>(mgr, config); | ||
| 184 | + } else if (!config.zipformer_ctc.model.empty()) { | ||
| 185 | + return std::make_unique<OfflineZipformerCtcModel>(mgr, config); | ||
| 186 | + } else if (!config.wenet_ctc.model.empty()) { | ||
| 187 | + return std::make_unique<OfflineWenetCtcModel>(mgr, config); | ||
| 188 | + } else if (!config.telespeech_ctc.empty()) { | ||
| 189 | + return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config); | ||
| 170 | } | 190 | } |
| 171 | 191 | ||
| 172 | // TODO(fangjun): Refactor it. We don't need to use model_type here | 192 | // TODO(fangjun): Refactor it. We don't need to use model_type here |
| @@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java | @@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java | ||
| 33 | java_files += OfflineFireRedAsrModelConfig.java | 33 | java_files += OfflineFireRedAsrModelConfig.java |
| 34 | java_files += OfflineMoonshineModelConfig.java | 34 | java_files += OfflineMoonshineModelConfig.java |
| 35 | java_files += OfflineNemoEncDecCtcModelConfig.java | 35 | java_files += OfflineNemoEncDecCtcModelConfig.java |
| 36 | +java_files += OfflineZipformerCtcModelConfig.java | ||
| 36 | java_files += OfflineSenseVoiceModelConfig.java | 37 | java_files += OfflineSenseVoiceModelConfig.java |
| 37 | java_files += OfflineDolphinModelConfig.java | 38 | java_files += OfflineDolphinModelConfig.java |
| 38 | java_files += OfflineModelConfig.java | 39 | java_files += OfflineModelConfig.java |
| @@ -11,6 +11,7 @@ public class OfflineModelConfig { | @@ -11,6 +11,7 @@ public class OfflineModelConfig { | ||
| 11 | private final OfflineNemoEncDecCtcModelConfig nemo; | 11 | private final OfflineNemoEncDecCtcModelConfig nemo; |
| 12 | private final OfflineSenseVoiceModelConfig senseVoice; | 12 | private final OfflineSenseVoiceModelConfig senseVoice; |
| 13 | private final OfflineDolphinModelConfig dolphin; | 13 | private final OfflineDolphinModelConfig dolphin; |
| 14 | + private final OfflineZipformerCtcModelConfig zipformerCtc; | ||
| 14 | private final String teleSpeech; | 15 | private final String teleSpeech; |
| 15 | private final String tokens; | 16 | private final String tokens; |
| 16 | private final int numThreads; | 17 | private final int numThreads; |
| @@ -28,6 +29,7 @@ public class OfflineModelConfig { | @@ -28,6 +29,7 @@ public class OfflineModelConfig { | ||
| 28 | this.fireRedAsr = builder.fireRedAsr; | 29 | this.fireRedAsr = builder.fireRedAsr; |
| 29 | this.moonshine = builder.moonshine; | 30 | this.moonshine = builder.moonshine; |
| 30 | this.nemo = builder.nemo; | 31 | this.nemo = builder.nemo; |
| 32 | + this.zipformerCtc = builder.zipformerCtc; | ||
| 31 | this.senseVoice = builder.senseVoice; | 33 | this.senseVoice = builder.senseVoice; |
| 32 | this.dolphin = builder.dolphin; | 34 | this.dolphin = builder.dolphin; |
| 33 | this.teleSpeech = builder.teleSpeech; | 35 | this.teleSpeech = builder.teleSpeech; |
| @@ -52,7 +54,7 @@ public class OfflineModelConfig { | @@ -52,7 +54,7 @@ public class OfflineModelConfig { | ||
| 52 | return transducer; | 54 | return transducer; |
| 53 | } | 55 | } |
| 54 | 56 | ||
| 55 | - public OfflineWhisperModelConfig getZipformer2Ctc() { | 57 | + public OfflineWhisperModelConfig getWhisper() { |
| 56 | return whisper; | 58 | return whisper; |
| 57 | } | 59 | } |
| 58 | 60 | ||
| @@ -68,6 +70,14 @@ public class OfflineModelConfig { | @@ -68,6 +70,14 @@ public class OfflineModelConfig { | ||
| 68 | return dolphin; | 70 | return dolphin; |
| 69 | } | 71 | } |
| 70 | 72 | ||
| 73 | + public OfflineNemoEncDecCtcModelConfig getNemo() { | ||
| 74 | + return nemo; | ||
| 75 | + } | ||
| 76 | + | ||
| 77 | + public OfflineZipformerCtcModelConfig getZipformerCtc() { | ||
| 78 | + return zipformerCtc; | ||
| 79 | + } | ||
| 80 | + | ||
| 71 | public String getTokens() { | 81 | public String getTokens() { |
| 72 | return tokens; | 82 | return tokens; |
| 73 | } | 83 | } |
| @@ -109,6 +119,7 @@ public class OfflineModelConfig { | @@ -109,6 +119,7 @@ public class OfflineModelConfig { | ||
| 109 | private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); | 119 | private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); |
| 110 | private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build(); | 120 | private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build(); |
| 111 | private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build(); | 121 | private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build(); |
| 122 | + private OfflineZipformerCtcModelConfig zipformerCtc = OfflineZipformerCtcModelConfig.builder().build(); | ||
| 112 | private String teleSpeech = ""; | 123 | private String teleSpeech = ""; |
| 113 | private String tokens = ""; | 124 | private String tokens = ""; |
| 114 | private int numThreads = 1; | 125 | private int numThreads = 1; |
| @@ -142,6 +153,11 @@ public class OfflineModelConfig { | @@ -142,6 +153,11 @@ public class OfflineModelConfig { | ||
| 142 | return this; | 153 | return this; |
| 143 | } | 154 | } |
| 144 | 155 | ||
| 156 | + public Builder setZipformerCtc(OfflineZipformerCtcModelConfig zipformerCtc) { | ||
| 157 | + this.zipformerCtc = zipformerCtc; | ||
| 158 | + return this; | ||
| 159 | + } | ||
| 160 | + | ||
| 145 | public Builder setTeleSpeech(String teleSpeech) { | 161 | public Builder setTeleSpeech(String teleSpeech) { |
| 146 | this.teleSpeech = teleSpeech; | 162 | this.teleSpeech = teleSpeech; |
| 147 | return this; | 163 | return this; |
| 1 | +// Copyright 2025 Xiaomi Corporation | ||
| 2 | + | ||
| 3 | +package com.k2fsa.sherpa.onnx; | ||
| 4 | + | ||
| 5 | +public class OfflineZipformerCtcModelConfig { | ||
| 6 | + private final String model; | ||
| 7 | + | ||
| 8 | + private OfflineZipformerCtcModelConfig(Builder builder) { | ||
| 9 | + this.model = builder.model; | ||
| 10 | + } | ||
| 11 | + | ||
| 12 | + public static Builder builder() { | ||
| 13 | + return new Builder(); | ||
| 14 | + } | ||
| 15 | + | ||
| 16 | + public String getModel() { | ||
| 17 | + return model; | ||
| 18 | + } | ||
| 19 | + | ||
| 20 | + public static class Builder { | ||
| 21 | + private String model = ""; | ||
| 22 | + | ||
| 23 | + public OfflineZipformerCtcModelConfig build() { | ||
| 24 | + return new OfflineZipformerCtcModelConfig(this); | ||
| 25 | + } | ||
| 26 | + | ||
| 27 | + public Builder setModel(String model) { | ||
| 28 | + this.model = model; | ||
| 29 | + return this; | ||
| 30 | + } | ||
| 31 | + } | ||
| 32 | +} |
| @@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { | @@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { | ||
| 269 | ans.model_config.nemo_ctc.model = p; | 269 | ans.model_config.nemo_ctc.model = p; |
| 270 | env->ReleaseStringUTFChars(s, p); | 270 | env->ReleaseStringUTFChars(s, p); |
| 271 | 271 | ||
| 272 | + // zipformer ctc | ||
| 273 | + fid = | ||
| 274 | + env->GetFieldID(model_config_cls, "zipformerCtc", | ||
| 275 | + "Lcom/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig;"); | ||
| 276 | + jobject zipformer_ctc_config = env->GetObjectField(model_config, fid); | ||
| 277 | + jclass zipformer_ctc_config_cls = env->GetObjectClass(zipformer_ctc_config); | ||
| 278 | + | ||
| 279 | + fid = | ||
| 280 | + env->GetFieldID(zipformer_ctc_config_cls, "model", "Ljava/lang/String;"); | ||
| 281 | + | ||
| 282 | + s = (jstring)env->GetObjectField(zipformer_ctc_config, fid); | ||
| 283 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 284 | + ans.model_config.zipformer_ctc.model = p; | ||
| 285 | + env->ReleaseStringUTFChars(s, p); | ||
| 286 | + | ||
| 272 | // dolphin | 287 | // dolphin |
| 273 | fid = env->GetFieldID(model_config_cls, "dolphin", | 288 | fid = env->GetFieldID(model_config_cls, "dolphin", |
| 274 | "Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;"); | 289 | "Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;"); |
| @@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig( | @@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig( | ||
| 29 | var model: String = "", | 29 | var model: String = "", |
| 30 | ) | 30 | ) |
| 31 | 31 | ||
| 32 | +data class OfflineZipformerCtcModelConfig( | ||
| 33 | + var model: String = "", | ||
| 34 | +) | ||
| 35 | + | ||
| 32 | data class OfflineWhisperModelConfig( | 36 | data class OfflineWhisperModelConfig( |
| 33 | var encoder: String = "", | 37 | var encoder: String = "", |
| 34 | var decoder: String = "", | 38 | var decoder: String = "", |
| @@ -64,6 +68,7 @@ data class OfflineModelConfig( | @@ -64,6 +68,7 @@ data class OfflineModelConfig( | ||
| 64 | var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), | 68 | var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), |
| 65 | var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(), | 69 | var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(), |
| 66 | var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(), | 70 | var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(), |
| 71 | + var zipformerCtc: OfflineZipformerCtcModelConfig = OfflineZipformerCtcModelConfig(), | ||
| 67 | var teleSpeech: String = "", | 72 | var teleSpeech: String = "", |
| 68 | var numThreads: Int = 1, | 73 | var numThreads: Int = 1, |
| 69 | var debug: Boolean = false, | 74 | var debug: Boolean = false, |
| @@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? { | @@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? { | ||
| 559 | modelType = "nemo_transducer", | 564 | modelType = "nemo_transducer", |
| 560 | ) | 565 | ) |
| 561 | } | 566 | } |
| 567 | + | ||
| 568 | + 31 -> { | ||
| 569 | + val modelDir = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03" | ||
| 570 | + return OfflineModelConfig( | ||
| 571 | + zipformerCtc = OfflineZipformerCtcModelConfig( | ||
| 572 | + model = "$modelDir/model.int8.onnx", | ||
| 573 | + ), | ||
| 574 | + tokens = "$modelDir/tokens.txt", | ||
| 575 | + ) | ||
| 576 | + } | ||
| 562 | } | 577 | } |
| 563 | return null | 578 | return null |
| 564 | } | 579 | } |
| @@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? { | @@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? { | ||
| 412 | model = "$modelDir/model.onnx", | 412 | model = "$modelDir/model.onnx", |
| 413 | ), | 413 | ), |
| 414 | tokens = "$modelDir/tokens.txt", | 414 | tokens = "$modelDir/tokens.txt", |
| 415 | + modelType = "zipformer2", | ||
| 415 | ) | 416 | ) |
| 416 | } | 417 | } |
| 417 | 418 | ||
| @@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? { | @@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? { | ||
| 422 | model = "$modelDir/model.fp16.onnx", | 423 | model = "$modelDir/model.fp16.onnx", |
| 423 | ), | 424 | ), |
| 424 | tokens = "$modelDir/tokens.txt", | 425 | tokens = "$modelDir/tokens.txt", |
| 426 | + modelType = "zipformer2", | ||
| 425 | ) | 427 | ) |
| 426 | } | 428 | } |
| 427 | 429 |
| @@ -284,6 +284,11 @@ type | @@ -284,6 +284,11 @@ type | ||
| 284 | function ToString: AnsiString; | 284 | function ToString: AnsiString; |
| 285 | end; | 285 | end; |
| 286 | 286 | ||
| 287 | + TSherpaOnnxOfflineZipformerCtcModelConfig = record | ||
| 288 | + Model: AnsiString; | ||
| 289 | + function ToString: AnsiString; | ||
| 290 | + end; | ||
| 291 | + | ||
| 287 | TSherpaOnnxOfflineWhisperModelConfig = record | 292 | TSherpaOnnxOfflineWhisperModelConfig = record |
| 288 | Encoder: AnsiString; | 293 | Encoder: AnsiString; |
| 289 | Decoder: AnsiString; | 294 | Decoder: AnsiString; |
| @@ -346,6 +351,7 @@ type | @@ -346,6 +351,7 @@ type | ||
| 346 | Moonshine: TSherpaOnnxOfflineMoonshineModelConfig; | 351 | Moonshine: TSherpaOnnxOfflineMoonshineModelConfig; |
| 347 | FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig; | 352 | FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig; |
| 348 | Dolphin: TSherpaOnnxOfflineDolphinModelConfig; | 353 | Dolphin: TSherpaOnnxOfflineDolphinModelConfig; |
| 354 | + ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig; | ||
| 349 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); | 355 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); |
| 350 | function ToString: AnsiString; | 356 | function ToString: AnsiString; |
| 351 | end; | 357 | end; |
| @@ -726,6 +732,9 @@ type | @@ -726,6 +732,9 @@ type | ||
| 726 | SherpaOnnxOfflineDolphinModelConfig = record | 732 | SherpaOnnxOfflineDolphinModelConfig = record |
| 727 | Model: PAnsiChar; | 733 | Model: PAnsiChar; |
| 728 | end; | 734 | end; |
| 735 | + SherpaOnnxOfflineZipformerCtcModelConfig = record | ||
| 736 | + Model: PAnsiChar; | ||
| 737 | + end; | ||
| 729 | SherpaOnnxOfflineWhisperModelConfig = record | 738 | SherpaOnnxOfflineWhisperModelConfig = record |
| 730 | Encoder: PAnsiChar; | 739 | Encoder: PAnsiChar; |
| 731 | Decoder: PAnsiChar; | 740 | Decoder: PAnsiChar; |
| @@ -773,6 +782,7 @@ type | @@ -773,6 +782,7 @@ type | ||
| 773 | Moonshine: SherpaOnnxOfflineMoonshineModelConfig; | 782 | Moonshine: SherpaOnnxOfflineMoonshineModelConfig; |
| 774 | FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig; | 783 | FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig; |
| 775 | Dolphin: SherpaOnnxOfflineDolphinModelConfig; | 784 | Dolphin: SherpaOnnxOfflineDolphinModelConfig; |
| 785 | + ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig; | ||
| 776 | end; | 786 | end; |
| 777 | 787 | ||
| 778 | SherpaOnnxOfflineRecognizerConfig = record | 788 | SherpaOnnxOfflineRecognizerConfig = record |
| @@ -1536,6 +1546,12 @@ begin | @@ -1536,6 +1546,12 @@ begin | ||
| 1536 | [Self.Model]); | 1546 | [Self.Model]); |
| 1537 | end; | 1547 | end; |
| 1538 | 1548 | ||
| 1549 | +function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString; | ||
| 1550 | +begin | ||
| 1551 | + Result := Format('TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)', | ||
| 1552 | + [Self.Model]); | ||
| 1553 | +end; | ||
| 1554 | + | ||
| 1539 | function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString; | 1555 | function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString; |
| 1540 | begin | 1556 | begin |
| 1541 | Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' + | 1557 | Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' + |
| @@ -1610,14 +1626,15 @@ begin | @@ -1610,14 +1626,15 @@ begin | ||
| 1610 | 'SenseVoice := %s, ' + | 1626 | 'SenseVoice := %s, ' + |
| 1611 | 'Moonshine := %s, ' + | 1627 | 'Moonshine := %s, ' + |
| 1612 | 'FireRedAsr := %s, ' + | 1628 | 'FireRedAsr := %s, ' + |
| 1613 | - 'Dolphin := %s' + | 1629 | + 'Dolphin := %s, ' + |
| 1630 | + 'ZipformerCtc := %s' + | ||
| 1614 | ')', | 1631 | ')', |
| 1615 | [Self.Transducer.ToString, Self.Paraformer.ToString, | 1632 | [Self.Transducer.ToString, Self.Paraformer.ToString, |
| 1616 | Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, | 1633 | Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, |
| 1617 | Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, | 1634 | Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, |
| 1618 | Self.ModelType, Self.ModelingUnit, Self.BpeVocab, | 1635 | Self.ModelType, Self.ModelingUnit, Self.BpeVocab, |
| 1619 | Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString, | 1636 | Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString, |
| 1620 | - Self.FireRedAsr.ToString, Self.Dolphin.ToString | 1637 | + Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString |
| 1621 | ]); | 1638 | ]); |
| 1622 | end; | 1639 | end; |
| 1623 | 1640 | ||
| @@ -1688,6 +1705,7 @@ begin | @@ -1688,6 +1705,7 @@ begin | ||
| 1688 | C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder); | 1705 | C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder); |
| 1689 | 1706 | ||
| 1690 | C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model); | 1707 | C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model); |
| 1708 | + C.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model); | ||
| 1691 | 1709 | ||
| 1692 | C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); | 1710 | C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); |
| 1693 | C.LMConfig.Scale := Config.LMConfig.Scale; | 1711 | C.LMConfig.Scale := Config.LMConfig.Scale; |
| @@ -528,6 +528,87 @@ class OfflineRecognizer(object): | @@ -528,6 +528,87 @@ class OfflineRecognizer(object): | ||
| 528 | return self | 528 | return self |
| 529 | 529 | ||
| 530 | @classmethod | 530 | @classmethod |
| 531 | + def from_zipformer_ctc( | ||
| 532 | + cls, | ||
| 533 | + model: str, | ||
| 534 | + tokens: str, | ||
| 535 | + num_threads: int = 1, | ||
| 536 | + sample_rate: int = 16000, | ||
| 537 | + feature_dim: int = 80, | ||
| 538 | + decoding_method: str = "greedy_search", | ||
| 539 | + debug: bool = False, | ||
| 540 | + provider: str = "cpu", | ||
| 541 | + rule_fsts: str = "", | ||
| 542 | + rule_fars: str = "", | ||
| 543 | + hr_dict_dir: str = "", | ||
| 544 | + hr_rule_fsts: str = "", | ||
| 545 | + hr_lexicon: str = "", | ||
| 546 | + ): | ||
| 547 | + """ | ||
| 548 | + Please refer to | ||
| 549 | + `<https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/index.html>`_ | ||
| 550 | + to download pre-trained models for different languages, e.g., Chinese, | ||
| 551 | + English, etc. | ||
| 552 | + | ||
| 553 | + Args: | ||
| 554 | + model: | ||
| 555 | + Path to ``model.onnx``. | ||
| 556 | + tokens: | ||
| 557 | + Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two | ||
| 558 | + columns:: | ||
| 559 | + | ||
| 560 | + symbol integer_id | ||
| 561 | + | ||
| 562 | + num_threads: | ||
| 563 | + Number of threads for neural network computation. | ||
| 564 | + sample_rate: | ||
| 565 | + Sample rate of the training data used to train the model. | ||
| 566 | + feature_dim: | ||
| 567 | + Dimension of the feature used to train the model. | ||
| 568 | + decoding_method: | ||
| 569 | + Valid values are greedy_search. | ||
| 570 | + debug: | ||
| 571 | + True to show debug messages. | ||
| 572 | + provider: | ||
| 573 | + onnxruntime execution providers. Valid values are: cpu, cuda, coreml. | ||
| 574 | + rule_fsts: | ||
| 575 | + If not empty, it specifies fsts for inverse text normalization. | ||
| 576 | + If there are multiple fsts, they are separated by a comma. | ||
| 577 | + rule_fars: | ||
| 578 | + If not empty, it specifies fst archives for inverse text normalization. | ||
| 579 | + If there are multiple archives, they are separated by a comma. | ||
| 580 | + """ | ||
| 581 | + self = cls.__new__(cls) | ||
| 582 | + model_config = OfflineModelConfig( | ||
| 583 | + zipformer_ctc=OfflineZipformerCtcModelConfig(model=model), | ||
| 584 | + tokens=tokens, | ||
| 585 | + num_threads=num_threads, | ||
| 586 | + debug=debug, | ||
| 587 | + provider=provider, | ||
| 588 | + ) | ||
| 589 | + | ||
| 590 | + feat_config = FeatureExtractorConfig( | ||
| 591 | + sampling_rate=sample_rate, | ||
| 592 | + feature_dim=feature_dim, | ||
| 593 | + ) | ||
| 594 | + | ||
| 595 | + recognizer_config = OfflineRecognizerConfig( | ||
| 596 | + feat_config=feat_config, | ||
| 597 | + model_config=model_config, | ||
| 598 | + decoding_method=decoding_method, | ||
| 599 | + rule_fsts=rule_fsts, | ||
| 600 | + rule_fars=rule_fars, | ||
| 601 | + hr=HomophoneReplacerConfig( | ||
| 602 | + dict_dir=hr_dict_dir, | ||
| 603 | + lexicon=hr_lexicon, | ||
| 604 | + rule_fsts=hr_rule_fsts, | ||
| 605 | + ), | ||
| 606 | + ) | ||
| 607 | + self.recognizer = _Recognizer(recognizer_config) | ||
| 608 | + self.config = recognizer_config | ||
| 609 | + return self | ||
| 610 | + | ||
| 611 | + @classmethod | ||
| 531 | def from_nemo_ctc( | 612 | def from_nemo_ctc( |
| 532 | cls, | 613 | cls, |
| 533 | model: str, | 614 | model: str, |
| @@ -346,6 +346,14 @@ func sherpaOnnxOfflineParaformerModelConfig( | @@ -346,6 +346,14 @@ func sherpaOnnxOfflineParaformerModelConfig( | ||
| 346 | ) | 346 | ) |
| 347 | } | 347 | } |
| 348 | 348 | ||
| 349 | +func sherpaOnnxOfflineZipformerCtcModelConfig( | ||
| 350 | + model: String = "" | ||
| 351 | +) -> SherpaOnnxOfflineZipformerCtcModelConfig { | ||
| 352 | + return SherpaOnnxOfflineZipformerCtcModelConfig( | ||
| 353 | + model: toCPointer(model) | ||
| 354 | + ) | ||
| 355 | +} | ||
| 356 | + | ||
| 349 | func sherpaOnnxOfflineNemoEncDecCtcModelConfig( | 357 | func sherpaOnnxOfflineNemoEncDecCtcModelConfig( |
| 350 | model: String = "" | 358 | model: String = "" |
| 351 | ) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig { | 359 | ) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig { |
| @@ -449,7 +457,9 @@ func sherpaOnnxOfflineModelConfig( | @@ -449,7 +457,9 @@ func sherpaOnnxOfflineModelConfig( | ||
| 449 | senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(), | 457 | senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(), |
| 450 | moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(), | 458 | moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(), |
| 451 | fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(), | 459 | fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(), |
| 452 | - dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig() | 460 | + dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(), |
| 461 | + zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig = | ||
| 462 | + sherpaOnnxOfflineZipformerCtcModelConfig() | ||
| 453 | ) -> SherpaOnnxOfflineModelConfig { | 463 | ) -> SherpaOnnxOfflineModelConfig { |
| 454 | return SherpaOnnxOfflineModelConfig( | 464 | return SherpaOnnxOfflineModelConfig( |
| 455 | transducer: transducer, | 465 | transducer: transducer, |
| @@ -468,7 +478,8 @@ func sherpaOnnxOfflineModelConfig( | @@ -468,7 +478,8 @@ func sherpaOnnxOfflineModelConfig( | ||
| 468 | sense_voice: senseVoice, | 478 | sense_voice: senseVoice, |
| 469 | moonshine: moonshine, | 479 | moonshine: moonshine, |
| 470 | fire_red_asr: fireRedAsr, | 480 | fire_red_asr: fireRedAsr, |
| 471 | - dolphin: dolphin | 481 | + dolphin: dolphin, |
| 482 | + zipformer_ctc: zipformerCtc | ||
| 472 | ) | 483 | ) |
| 473 | } | 484 | } |
| 474 | 485 |
swift-api-examples/run-zipformer-ctc-asr.sh
0 → 100755
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -d ../build-swift-macos ]; then | ||
| 6 | + echo "Please run ../build-swift-macos.sh first!" | ||
| 7 | + exit 1 | ||
| 8 | +fi | ||
| 9 | + | ||
| 10 | +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then | ||
| 11 | + echo "Please download the pre-trained model for testing." | ||
| 12 | + echo "You can refer to" | ||
| 13 | + echo "" | ||
| 14 | + echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese" | ||
| 15 | + echo "" | ||
| 16 | + echo "for help" | ||
| 17 | + | ||
| 18 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 19 | + | ||
| 20 | + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 21 | + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 | ||
| 22 | + ls -lh sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 | ||
| 23 | +fi | ||
| 24 | + | ||
| 25 | +if [ ! -e ./zipformer-ctc-asr ]; then | ||
| 26 | + # Note: We use -lc++ to link against libc++ instead of libstdc++ | ||
| 27 | + swiftc \ | ||
| 28 | + -lc++ \ | ||
| 29 | + -I ../build-swift-macos/install/include \ | ||
| 30 | + -import-objc-header ./SherpaOnnx-Bridging-Header.h \ | ||
| 31 | + ./zipformer-ctc-asr.swift ./SherpaOnnx.swift \ | ||
| 32 | + -L ../build-swift-macos/install/lib/ \ | ||
| 33 | + -l sherpa-onnx \ | ||
| 34 | + -l onnxruntime \ | ||
| 35 | + -o zipformer-ctc-asr | ||
| 36 | + | ||
| 37 | + strip zipformer-ctc-asr | ||
| 38 | +else | ||
| 39 | + echo "./zipformer-ctc-asr exists - skip building" | ||
| 40 | +fi | ||
| 41 | + | ||
| 42 | +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH | ||
| 43 | +./zipformer-ctc-asr |
swift-api-examples/zipformer-ctc-asr.swift
0 → 100644
| 1 | +import AVFoundation | ||
| 2 | + | ||
| 3 | +extension AudioBuffer { | ||
| 4 | + func array() -> [Float] { | ||
| 5 | + return Array(UnsafeBufferPointer(self)) | ||
| 6 | + } | ||
| 7 | +} | ||
| 8 | + | ||
| 9 | +extension AVAudioPCMBuffer { | ||
| 10 | + func array() -> [Float] { | ||
| 11 | + return self.audioBufferList.pointee.mBuffers.array() | ||
| 12 | + } | ||
| 13 | +} | ||
| 14 | + | ||
| 15 | +func run() { | ||
| 16 | + let model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx" | ||
| 17 | + let tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt" | ||
| 18 | + | ||
| 19 | + let zipformerCtc = sherpaOnnxOfflineZipformerCtcModelConfig( | ||
| 20 | + model: model | ||
| 21 | + ) | ||
| 22 | + | ||
| 23 | + let modelConfig = sherpaOnnxOfflineModelConfig( | ||
| 24 | + tokens: tokens, | ||
| 25 | + debug: 0, | ||
| 26 | + zipformerCtc: zipformerCtc | ||
| 27 | + ) | ||
| 28 | + | ||
| 29 | + let featConfig = sherpaOnnxFeatureConfig( | ||
| 30 | + sampleRate: 16000, | ||
| 31 | + featureDim: 80 | ||
| 32 | + ) | ||
| 33 | + var config = sherpaOnnxOfflineRecognizerConfig( | ||
| 34 | + featConfig: featConfig, | ||
| 35 | + modelConfig: modelConfig | ||
| 36 | + ) | ||
| 37 | + | ||
| 38 | + let recognizer = SherpaOnnxOfflineRecognizer(config: &config) | ||
| 39 | + | ||
| 40 | + let filePath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav" | ||
| 41 | + let fileURL: NSURL = NSURL(fileURLWithPath: filePath) | ||
| 42 | + let audioFile = try! AVAudioFile(forReading: fileURL as URL) | ||
| 43 | + | ||
| 44 | + let audioFormat = audioFile.processingFormat | ||
| 45 | + assert(audioFormat.channelCount == 1) | ||
| 46 | + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32) | ||
| 47 | + | ||
| 48 | + let audioFrameCount = UInt32(audioFile.length) | ||
| 49 | + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount) | ||
| 50 | + | ||
| 51 | + try! audioFile.read(into: audioFileBuffer!) | ||
| 52 | + let array: [Float]! = audioFileBuffer?.array() | ||
| 53 | + let result = recognizer.decode(samples: array, sampleRate: Int(audioFormat.sampleRate)) | ||
| 54 | + print("\nresult is:\n\(result.text)") | ||
| 55 | + if result.timestamps.count != 0 { | ||
| 56 | + print("\ntimestamps is:\n\(result.timestamps)") | ||
| 57 | + } | ||
| 58 | + | ||
| 59 | +} | ||
| 60 | + | ||
| 61 | +@main | ||
| 62 | +struct App { | ||
| 63 | + static func main() { | ||
| 64 | + run() | ||
| 65 | + } | ||
| 66 | +} |
| @@ -43,6 +43,10 @@ function freeConfig(config, Module) { | @@ -43,6 +43,10 @@ function freeConfig(config, Module) { | ||
| 43 | freeConfig(config.dolphin, Module) | 43 | freeConfig(config.dolphin, Module) |
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | + if ('zipformerCtc' in config) { | ||
| 47 | + freeConfig(config.zipformerCtc, Module) | ||
| 48 | + } | ||
| 49 | + | ||
| 46 | if ('moonshine' in config) { | 50 | if ('moonshine' in config) { |
| 47 | freeConfig(config.moonshine, Module) | 51 | freeConfig(config.moonshine, Module) |
| 48 | } | 52 | } |
| @@ -627,6 +631,23 @@ function initSherpaOnnxOfflineDolphinModelConfig(config, Module) { | @@ -627,6 +631,23 @@ function initSherpaOnnxOfflineDolphinModelConfig(config, Module) { | ||
| 627 | } | 631 | } |
| 628 | } | 632 | } |
| 629 | 633 | ||
| 634 | +function initSherpaOnnxOfflineZipformerCtcModelConfig(config, Module) { | ||
| 635 | + const n = Module.lengthBytesUTF8(config.model || '') + 1; | ||
| 636 | + | ||
| 637 | + const buffer = Module._malloc(n); | ||
| 638 | + | ||
| 639 | + const len = 1 * 4; // 1 pointer | ||
| 640 | + const ptr = Module._malloc(len); | ||
| 641 | + | ||
| 642 | + Module.stringToUTF8(config.model || '', buffer, n); | ||
| 643 | + | ||
| 644 | + Module.setValue(ptr, buffer, 'i8*'); | ||
| 645 | + | ||
| 646 | + return { | ||
| 647 | + buffer: buffer, ptr: ptr, len: len, | ||
| 648 | + } | ||
| 649 | +} | ||
| 650 | + | ||
| 630 | function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { | 651 | function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { |
| 631 | const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; | 652 | const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; |
| 632 | const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; | 653 | const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; |
| @@ -840,6 +861,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | @@ -840,6 +861,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | ||
| 840 | }; | 861 | }; |
| 841 | } | 862 | } |
| 842 | 863 | ||
| 864 | + if (!('zipformerCtc' in config)) { | ||
| 865 | + config.zipformerCtc = { | ||
| 866 | + model: '', | ||
| 867 | + }; | ||
| 868 | + } | ||
| 869 | + | ||
| 843 | if (!('whisper' in config)) { | 870 | if (!('whisper' in config)) { |
| 844 | config.whisper = { | 871 | config.whisper = { |
| 845 | encoder: '', | 872 | encoder: '', |
| @@ -906,9 +933,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | @@ -906,9 +933,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | ||
| 906 | const dolphin = | 933 | const dolphin = |
| 907 | initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module); | 934 | initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module); |
| 908 | 935 | ||
| 936 | + const zipformerCtc = | ||
| 937 | + initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module); | ||
| 938 | + | ||
| 909 | const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + | 939 | const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + |
| 910 | tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len + | 940 | tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len + |
| 911 | - dolphin.len; | 941 | + dolphin.len + zipformerCtc.len; |
| 912 | 942 | ||
| 913 | const ptr = Module._malloc(len); | 943 | const ptr = Module._malloc(len); |
| 914 | 944 | ||
| @@ -1010,11 +1040,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | @@ -1010,11 +1040,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | ||
| 1010 | Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset); | 1040 | Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset); |
| 1011 | offset += dolphin.len; | 1041 | offset += dolphin.len; |
| 1012 | 1042 | ||
| 1043 | + Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset); | ||
| 1044 | + offset += zipformerCtc.len; | ||
| 1045 | + | ||
| 1013 | return { | 1046 | return { |
| 1014 | buffer: buffer, ptr: ptr, len: len, transducer: transducer, | 1047 | buffer: buffer, ptr: ptr, len: len, transducer: transducer, |
| 1015 | paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, | 1048 | paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, |
| 1016 | senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr, | 1049 | senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr, |
| 1017 | - dolphin: dolphin | 1050 | + dolphin: dolphin, zipformerCtc: zipformerCtc |
| 1018 | } | 1051 | } |
| 1019 | } | 1052 | } |
| 1020 | 1053 |
| @@ -13,6 +13,7 @@ extern "C" { | @@ -13,6 +13,7 @@ extern "C" { | ||
| 13 | static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, ""); | 13 | static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, ""); |
| 14 | static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); | 14 | static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); |
| 15 | 15 | ||
| 16 | +static_assert(sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) == 4, ""); | ||
| 16 | static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, ""); | 17 | static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, ""); |
| 17 | static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); | 18 | static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); |
| 18 | static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); | 19 | static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); |
| @@ -31,7 +32,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == | @@ -31,7 +32,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == | ||
| 31 | sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) + | 32 | sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) + |
| 32 | sizeof(SherpaOnnxOfflineMoonshineModelConfig) + | 33 | sizeof(SherpaOnnxOfflineMoonshineModelConfig) + |
| 33 | sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) + | 34 | sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) + |
| 34 | - sizeof(SherpaOnnxOfflineDolphinModelConfig), | 35 | + sizeof(SherpaOnnxOfflineDolphinModelConfig) + |
| 36 | + sizeof(SherpaOnnxOfflineZipformerCtcModelConfig), | ||
| 35 | 37 | ||
| 36 | ""); | 38 | ""); |
| 37 | static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); | 39 | static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); |
| @@ -77,6 +79,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { | @@ -77,6 +79,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { | ||
| 77 | auto moonshine = &model_config->moonshine; | 79 | auto moonshine = &model_config->moonshine; |
| 78 | auto fire_red_asr = &model_config->fire_red_asr; | 80 | auto fire_red_asr = &model_config->fire_red_asr; |
| 79 | auto dolphin = &model_config->dolphin; | 81 | auto dolphin = &model_config->dolphin; |
| 82 | + auto zipformer_ctc = &model_config->zipformer_ctc; | ||
| 80 | 83 | ||
| 81 | fprintf(stdout, "----------offline transducer model config----------\n"); | 84 | fprintf(stdout, "----------offline transducer model config----------\n"); |
| 82 | fprintf(stdout, "encoder: %s\n", transducer->encoder); | 85 | fprintf(stdout, "encoder: %s\n", transducer->encoder); |
| @@ -117,6 +120,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { | @@ -117,6 +120,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { | ||
| 117 | fprintf(stdout, "----------offline Dolphin model config----------\n"); | 120 | fprintf(stdout, "----------offline Dolphin model config----------\n"); |
| 118 | fprintf(stdout, "model: %s\n", dolphin->model); | 121 | fprintf(stdout, "model: %s\n", dolphin->model); |
| 119 | 122 | ||
| 123 | + fprintf(stdout, "----------offline zipformer ctc model config----------\n"); | ||
| 124 | + fprintf(stdout, "model: %s\n", zipformer_ctc->model); | ||
| 125 | + | ||
| 120 | fprintf(stdout, "tokens: %s\n", model_config->tokens); | 126 | fprintf(stdout, "tokens: %s\n", model_config->tokens); |
| 121 | fprintf(stdout, "num_threads: %d\n", model_config->num_threads); | 127 | fprintf(stdout, "num_threads: %d\n", model_config->num_threads); |
| 122 | fprintf(stdout, "provider: %s\n", model_config->provider); | 128 | fprintf(stdout, "provider: %s\n", model_config->provider); |
| @@ -117,6 +117,10 @@ function initOfflineRecognizer() { | @@ -117,6 +117,10 @@ function initOfflineRecognizer() { | ||
| 117 | }; | 117 | }; |
| 118 | } else if (fileExists('dolphin.onnx')) { | 118 | } else if (fileExists('dolphin.onnx')) { |
| 119 | config.modelConfig.dolphin = {model: './dolphin.onnx'}; | 119 | config.modelConfig.dolphin = {model: './dolphin.onnx'}; |
| 120 | + } else if (fileExists('zipformer-ctc.onnx')) { | ||
| 121 | + // you need to rename model.int8.onnx from zipformer CTC to | ||
| 122 | + // zipformer-ctc.onnx | ||
| 123 | + config.modelConfig.zipformerCtc = {model: './zipformer-ctc.onnx'}; | ||
| 120 | } else { | 124 | } else { |
| 121 | console.log('Please specify a model.'); | 125 | console.log('Please specify a model.'); |
| 122 | alert('Please specify a model.'); | 126 | alert('Please specify a model.'); |
-
请注册或登录后发表评论