Fangjun Kuang
Committed by GitHub

Support non-streaming zipformer CTC ASR models (#2340)

This PR adds support for non-streaming Zipformer CTC ASR models across 
multiple language bindings, WebAssembly, examples, and CI workflows.

- Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs
- Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js
- Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models

Model doc is available at
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
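
For a quick look at how the new config is wired up, here is a minimal C++ sketch based on the cxx-api examples added in this PR. It assumes you have downloaded and unpacked the sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 model referenced in the CI scripts below, and it uses the `Wave`/`ReadWave` helpers from the same cxx-api as the repo's other examples:

```cpp
// Minimal sketch: decode one wav file with the new non-streaming
// Zipformer CTC config (paths follow the released model archive).
#include <iostream>

#include "sherpa-onnx/c-api/cxx-api.h"

int32_t main() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  OfflineRecognizerConfig config;
  // The new field introduced by this PR:
  config.model_config.zipformer_ctc.model =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
  config.model_config.tokens =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
  config.model_config.num_threads = 2;

  OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  if (!recognizer.Get()) {
    std::cerr << "Please check your config\n";
    return -1;
  }

  // ReadWave() returns float samples normalized to [-1, 1]
  // plus the sample rate of the file.
  Wave wave = ReadWave(
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav");

  OfflineStream stream = recognizer.CreateStream();
  stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
                        wave.samples.size());
  recognizer.Decode(&stream);

  std::cout << recognizer.GetResult(&stream).text << "\n";
  return 0;
}
```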
Showing 71 changed files with 2,121 additions and 68 deletions.
@@ -6,6 +6,10 @@ cd dart-api-examples
 
 pushd non-streaming-asr
 
+echo '----------Zipformer CTC----------'
+./run-zipformer-ctc.sh
+rm -rf sherpa-onnx-*
+
 echo '----------SenseVoice----------'
 ./run-sense-voice-with-hr.sh
 ./run-sense-voice.sh
@@ -114,6 +118,10 @@ popd
 
 pushd vad-with-non-streaming-asr
 
+echo '----------Zipformer CTC----------'
+./run-zipformer-ctc.sh
+rm -rf sherpa-onnx-*
+
 echo '----------Dolphin CTC----------'
 ./run-dolphin-ctc.sh
 rm -rf sherpa-onnx-*
@@ -6,43 +6,11 @@ cd ./version-test
 ./run.sh
 ls -lh
 
-cd ../speech-enhancement-gtcrn
-./run.sh
-ls -lh
-
-cd ../kokoro-tts
-./run-kokoro.sh
-ls -lh
-
-cd ../offline-tts
-./run-matcha-zh.sh
-ls -lh *.wav
-./run-matcha-en.sh
-ls -lh *.wav
-./run-aishell3.sh
-ls -lh *.wav
-./run-piper.sh
-ls -lh *.wav
-./run-hf-fanchen.sh
-ls -lh *.wav
-ls -lh
-
-pushd ../..
-
-mkdir tts
-
-cp -v dotnet-examples/kokoro-tts/*.wav ./tts
-cp -v dotnet-examples/offline-tts/*.wav ./tts
-popd
-
-cd ../offline-speaker-diarization
-./run.sh
-rm -rfv *.onnx
-rm -fv *.wav
-rm -rfv sherpa-onnx-pyannote-*
-
 cd ../offline-decode-files
 
+./run-zipformer-ctc.sh
+rm -rf sherpa-onnx-*
+
 ./run-dolphin-ctc.sh
 rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
 
@@ -82,6 +50,41 @@ rm -rf sherpa-onnx-*
 ./run-tdnn-yesno.sh
 rm -rf sherpa-onnx-*
 
+cd ../speech-enhancement-gtcrn
+./run.sh
+ls -lh
+
+cd ../kokoro-tts
+./run-kokoro.sh
+ls -lh
+
+cd ../offline-tts
+./run-matcha-zh.sh
+ls -lh *.wav
+./run-matcha-en.sh
+ls -lh *.wav
+./run-aishell3.sh
+ls -lh *.wav
+./run-piper.sh
+ls -lh *.wav
+./run-hf-fanchen.sh
+ls -lh *.wav
+ls -lh
+
+pushd ../..
+
+mkdir tts
+
+cp -v dotnet-examples/kokoro-tts/*.wav ./tts
+cp -v dotnet-examples/offline-tts/*.wav ./tts
+popd
+
+cd ../offline-speaker-diarization
+./run.sh
+rm -rfv *.onnx
+rm -fv *.wav
+rm -rfv sherpa-onnx-pyannote-*
+
 cd ../keyword-spotting-from-files
 ./run.sh
 
@@ -115,5 +118,3 @@ rm -rf sherpa-onnx-*
 cd ../spoken-language-identification
 ./run.sh
 rm -rf sherpa-onnx-*
-
-
@@ -10,6 +10,15 @@ arch=$(node -p "require('os').arch()")
 platform=$(node -p "require('os').platform()")
 node_version=$(node -p "process.versions.node.split('.')[0]")
 
+echo "----------non-streaming ASR Zipformer CTC----------"
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+node ./test_asr_non_streaming_zipformer_ctc.js
+rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
+
 echo "----------non-streaming ASR NeMo parakeet tdt----------"
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
 tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
@@ -9,6 +9,15 @@ git status
 ls -lh
 ls -lh node_modules
 
+# asr with offline zipformer ctc
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+node ./test-offline-zipformer-ctc.js
+rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
+
 # asr with offline dolphin ctc
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
 tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
@@ -9,6 +9,9 @@ ls -lh
 
 ./run-test-version.sh
 
+./run-zipformer-ctc-asr.sh
+rm -rf sherpa-onnx-zipformer-*
+
 ./run-decode-file-sense-voice-with-hr.sh
 rm -rf sherpa-onnx-sense-voice-*
 rm -rf dict lexicon.txt replace.fst test-hr.wav
@@ -89,6 +89,7 @@ jobs:
           make -j4 install
 
           cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
+          cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
 
           rm -rf install/lib/pkgconfig
           rm -fv install/lib/cargs.h
@@ -135,6 +136,7 @@ jobs:
           make -j4 install
 
           cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
+          cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
 
           rm -rf install/lib/pkgconfig
           rm -fv install/lib/cargs.h
@@ -90,6 +90,7 @@ jobs:
           make install
 
           cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
+          cp bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
 
           ls -lh install/lib
 
@@ -37,7 +37,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, macos-latest, macos-13, windows-latest]
+        os: [ubuntu-latest, macos-latest, macos-13, windows-latest, ubuntu-22.04-arm]
 
     steps:
       - uses: actions/checkout@v4
@@ -56,7 +56,7 @@ jobs:
          key: ${{ matrix.os }}
 
      - name: Install Free pascal compiler (ubuntu)
-        if: matrix.os == 'ubuntu-latest'
+        if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-22.04-arm'
        shell: bash
        run: |
          sudo apt-get update
@@ -156,6 +156,10 @@
 
           pushd non-streaming-asr
 
+          ./run-zipformer-ctc.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
           ./run-dolphin-ctc.sh
           rm -rf sherpa-onnx-*
           echo "---"
@@ -264,9 +268,12 @@
 
           cd ./pascal-api-examples
 
-
           pushd vad-with-non-streaming-asr
 
+          time ./run-vad-with-zipformer-ctc.sh
+          rm -rf sherpa-onnx-*
+          echo "---"
+
           time ./run-vad-with-dolphin-ctc.sh
           rm -rf sherpa-onnx-*
           echo "---"
@@ -165,6 +165,9 @@ jobs:
        run: |
          cd ./java-api-examples
 
+          ./run-non-streaming-decode-file-zipformer-ctc.sh
+          rm -rf sherpa-onnx-zipformer-ctc-*
+
          ./run-non-streaming-decode-file-dolphin-ctc.sh
          rm -rf sherpa-onnx-dolphin-*
 
@@ -184,6 +184,10 @@ jobs:
           go build
           ls -lh
 
+          echo "Test Zipformer CTC"
+          ./run-zipformer-ctc.sh
+          rm -rf sherpa-onnx-zipformer-*
+
           echo "Test SenseVoice ctc"
           ./run-sense-voice-small-with-hr.sh
           ./run-sense-voice-small.sh
@@ -19,12 +19,36 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.8"]
+        python-version: ["3.10"]
 
     steps:
       - uses: actions/checkout@v4
 
+      - name: Zipformer CTC (non-streaming)
+        shell: bash
+        run: |
+          git lfs install
+          names=(
+            sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
+            sherpa-onnx-zipformer-ctc-zh-2025-07-03
+            sherpa-onnx-zipformer-ctc-zh-fp16-2025-07-03
+          )
+          for name in ${names[@]}; do
+            git clone https://huggingface.co/csukuangfj/$name
+            pushd $name
+            git lfs pull
+            rm -rf .git
+            rm -rfv .gitattributes
+            ls -lh
+            popd
+
+            tar cjfv $name.tar.bz2 $name
+            rm -rf $name
+            ls -lh *.tar.bz2
+          done
+
       - name: Vietnamese (zipformer)
+        if: false
         shell: bash
         run: |
           rm -rf models
@@ -76,6 +100,7 @@ jobs:
           mv models/* .
 
       - name: Publish to huggingface (Vietnamese zipformer)
+        if: false
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
@@ -114,6 +114,7 @@ We also have spaces built using WebAssembly. They are listed below:
 |Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]|
 |Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]|
 |Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]|
+|VAD + speech recognition (Chinese) with [Zipformer CTC](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|[Click me][wasm-hf-vad-asr-zh-zipformer-ctc-07-03]| [地址][wasm-ms-vad-asr-zh-zipformer-ctc-07-03]|
 |VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]|
 |VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]|
 |VAD + speech recognition (English) with [Moonshine tiny][Moonshine tiny]|[Click me][wasm-hf-vad-asr-en-moonshine-tiny-en]| [地址][wasm-ms-vad-asr-en-moonshine-tiny-en]|
@@ -141,6 +142,7 @@ We also have spaces built using WebAssembly. They are listed below:
 |----------------------------------------|------------------------------------|-----------------------------------|
 | Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]|
 | Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] |
+| Simulated-streaming speech recognition | [Address][apk-simula-streaming-asr]| [点此][apk-simula-streaming-asr-cn]|
 | Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] |
 | Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] |
 | VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] |
@@ -250,8 +252,10 @@ for more models. The following table lists only **SOME** of them.
 
 |Name | Supported Languages| Description|
 |-----|-----|----|
+|[sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english)| English | It is converted from <https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2>|
 |[Whisper tiny.en](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2)|English| See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html)|
 |[Moonshine tiny][Moonshine tiny]|English|See [also](https://github.com/usefulsensors/moonshine)|
+|[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|Chinese| A Zipformer CTC model|
 |[sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17][sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17]|Chinese, Cantonese, English, Korean, Japanese| 支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html)|
 |[sherpa-onnx-paraformer-zh-2024-03-09][sherpa-onnx-paraformer-zh-2024-03-09]|Chinese, English| 也支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2024-03-09-chinese-english)|
 |[sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01][sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01]|Japanese|See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-japanese)|
@@ -413,6 +417,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
 [wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en
 [wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en
 [SenseVoice]: https://github.com/FunAudioLLM/SenseVoice
+[wasm-hf-vad-asr-zh-zipformer-ctc-07-03]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc
+[wasm-ms-vad-asr-zh-zipformer-ctc-07-03]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc/summary
 [wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice
 [wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice
 [wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny
@@ -423,20 +429,20 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
 [wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech
 [wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
 [wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
-[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
+[reazonspeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
 [wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
 [wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
-[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2
+[gigaspeech2]: https://github.com/speechcolab/gigaspeech2
 [wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer
 [wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer
-[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR
+[telespeech-asr]: https://github.com/tele-ai/telespeech-asr
 [wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
 [wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
 [wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
 [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
 [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
 [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
-[Dolphin]: https://github.com/DataoceanAI/Dolphin
+[dolphin]: https://github.com/dataoceanai/dolphin
 [wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
 [wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
 
@@ -450,6 +456,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
 [apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html
 [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html
 [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html
+[apk-simula-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr.html
+[apk-simula-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr-cn.html
 [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html
 [apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html
 [apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html
@@ -45,6 +45,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
     sherpa-onnx-cxx-api
     portaudio_static
   )
+
+  add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api
+    ./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
+  )
+  target_link_libraries(zipformer-ctc-simulate-streaming-microphone-cxx-api
+    sherpa-onnx-cxx-api
+    portaudio_static
+  )
 endif()
 
 if(SHERPA_ONNX_HAS_ALSA)
@@ -57,10 +66,21 @@ if(SHERPA_ONNX_HAS_ALSA)
     portaudio_static
   )
 
+  add_executable(zipformer-ctc-simulate-streaming-alsa-cxx-api
+    ./zipformer-ctc-simulate-streaming-alsa-cxx-api.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc
+  )
+  target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api
+    sherpa-onnx-cxx-api
+    portaudio_static
+  )
+
   if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
     target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
+    target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
   else()
     target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound)
+    target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api asound)
   endif()
 endif()
 
+// cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc
+// Copyright (c) 2025 Xiaomi Corporation
+
+//
+// This file demonstrates how to use zipformer CTC with sherpa-onnx's C++ API
+// for streaming speech recognition from a microphone.
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+//
+// clang-format on
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
+#include <iostream>
+#include <mutex>  // NOLINT
+#include <queue>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "sherpa-display.h"  // NOLINT
+#include "sherpa-onnx/c-api/cxx-api.h"
+#include "sherpa-onnx/csrc/alsa.h"
+
+std::queue<std::vector<float>> samples_queue;
+std::condition_variable condition_variable;
+std::mutex mutex;
+bool stop = false;
+
+static void Handler(int32_t /*sig*/) {
+  stop = true;
+  condition_variable.notify_one();
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
+}
+
+static void RecordCallback(sherpa_onnx::Alsa *alsa) {
+  int32_t chunk = 0.1 * alsa->GetActualSampleRate();
+  while (!stop) {
+    std::vector<float> samples = alsa->Read(chunk);
+
+    std::lock_guard<std::mutex> lock(mutex);
+    samples_queue.emplace(std::move(samples));
+    condition_variable.notify_one();
+  }
+}
+
+static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  VadModelConfig config;
+  config.silero_vad.model = "./silero_vad.onnx";
+  config.silero_vad.threshold = 0.5;
+  config.silero_vad.min_silence_duration = 0.1;
+  config.silero_vad.min_speech_duration = 0.25;
+  config.silero_vad.max_speech_duration = 8;
+  config.sample_rate = 16000;
+  config.debug = false;
+
+  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
+  if (!vad.Get()) {
+    std::cerr << "Failed to create VAD. Please check your config\n";
+    exit(-1);
+  }
+
+  return vad;
+}
+
+static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  OfflineRecognizerConfig config;
+
+  config.model_config.zipformer_ctc.model =
+      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
+  config.model_config.tokens =
+      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
+
+  config.model_config.num_threads = 2;
+  config.model_config.debug = false;
+
+  std::cout << "Loading model\n";
+  OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
+  if (!recognizer.Get()) {
+    std::cerr << "Please check your config\n";
+    exit(-1);
+  }
+  std::cout << "Loading model done\n";
+  return recognizer;
+}
+
+int32_t main(int32_t argc, const char *argv[]) {
+  const char *kUsageMessage = R"usage(
+Usage:
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+./zipformer-ctc-simulate-streaming-alsa-cxx-api device_name
+
+The device name specifies which microphone to use in case there are several
+on your system. You can use
+
+  arecord -l
+
+to find all available microphones on your computer. For instance, if it outputs
+
+**** List of CAPTURE Hardware Devices ****
+card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+
+and if you want to select card 3 and device 0 on that card, please use:
+
+  plughw:3,0
+
+as the device_name.
+)usage";
+
+  if (argc != 2) {
+    fprintf(stderr, "%s\n", kUsageMessage);
+    return -1;
+  }
+
+  signal(SIGINT, Handler);
+
+  using namespace sherpa_onnx::cxx;  // NOLINT
+
+  auto vad = CreateVad();
+  auto recognizer = CreateOfflineRecognizer();
+
+  int32_t expected_sample_rate = 16000;
+
+  std::string device_name = argv[1];
+  sherpa_onnx::Alsa alsa(device_name.c_str());
+  fprintf(stderr, "Use recording device: %s\n", device_name.c_str());
+
+  if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
+    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
+            expected_sample_rate);
+    exit(-1);
+  }
+
+  int32_t window_size = 512;  // samples, please don't change
+
+  int32_t offset = 0;
+  std::vector<float> buffer;
+  bool speech_started = false;
+
+  auto started_time = std::chrono::steady_clock::now();
+
+  SherpaDisplay display;
+
+  std::thread record_thread(RecordCallback, &alsa);
+
+  std::cout << "Started! Please speak\n";
+
+  while (!stop) {
+    {
+      std::unique_lock<std::mutex> lock(mutex);
+      while (samples_queue.empty() && !stop) {
+        condition_variable.wait(lock);
+      }
+
+      const auto &s = samples_queue.front();
+      buffer.insert(buffer.end(), s.begin(), s.end());
+
+      samples_queue.pop();
+    }
+
+    for (; offset + window_size < buffer.size(); offset += window_size) {
+      vad.AcceptWaveform(buffer.data() + offset, window_size);
+      if (!speech_started && vad.IsDetected()) {
+        speech_started = true;
+        started_time = std::chrono::steady_clock::now();
+      }
+    }
+    if (!speech_started) {
+      if (buffer.size() > 10 * window_size) {
+        offset -= buffer.size() - 10 * window_size;
+        buffer = {buffer.end() - 10 * window_size, buffer.end()};
+      }
+    }
+
+    auto current_time = std::chrono::steady_clock::now();
+    const float elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
+                                                              started_time)
+            .count() /
+        1000.;
+
+    if (speech_started && elapsed_seconds > 0.2) {
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(expected_sample_rate, buffer.data(),
+                            buffer.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+      display.UpdateText(result.text);
+      display.Display();
+
+      started_time = std::chrono::steady_clock::now();
+    }
+
+    while (!vad.IsEmpty()) {
+      auto segment = vad.Front();
+
+      vad.Pop();
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(expected_sample_rate, segment.samples.data(),
+                            segment.samples.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+
+      display.UpdateText(result.text);
+      display.FinalizeCurrentSentence();
+      display.Display();
+
+      buffer.clear();
+      offset = 0;
+      speech_started = false;
+    }
+  }
+
+  record_thread.join();
+
+  return 0;
+}
+// cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
+// Copyright (c) 2025 Xiaomi Corporation
+
+//
+// This file demonstrates how to use Zipformer CTC with sherpa-onnx's C++ API
+// for streaming speech recognition from a microphone.
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+//
+// clang-format on
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
+#include <iostream>
+#include <mutex>  // NOLINT
+#include <queue>
+#include <vector>
+
+#include "portaudio.h"       // NOLINT
+#include "sherpa-display.h"  // NOLINT
+#include "sherpa-onnx/c-api/cxx-api.h"
+#include "sherpa-onnx/csrc/microphone.h"
+
+std::queue<std::vector<float>> samples_queue;
+std::condition_variable condition_variable;
+std::mutex mutex;
+bool stop = false;
+
+static void Handler(int32_t /*sig*/) {
+  stop = true;
+  condition_variable.notify_one();
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
+}
+
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void * /*user_data*/) {
+  std::lock_guard<std::mutex> lock(mutex);
+  samples_queue.emplace(
+      reinterpret_cast<const float *>(input_buffer),
+      reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
+  condition_variable.notify_one();
+
+  return stop ? paComplete : paContinue;
+}
+
+static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  VadModelConfig config;
+  config.silero_vad.model = "./silero_vad.onnx";
+  config.silero_vad.threshold = 0.5;
+  config.silero_vad.min_silence_duration = 0.1;
+  config.silero_vad.min_speech_duration = 0.25;
+  config.silero_vad.max_speech_duration = 8;
+  config.sample_rate = 16000;
+  config.debug = false;
+
+  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
+  if (!vad.Get()) {
+    std::cerr << "Failed to create VAD. Please check your config\n";
+    exit(-1);
+  }
+
+  return vad;
+}
+
+static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+  OfflineRecognizerConfig config;
+
+  config.model_config.zipformer_ctc.model =
+      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
+  config.model_config.tokens =
+      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
+
+  config.model_config.num_threads = 2;
+  config.model_config.debug = false;
+
+  std::cout << "Loading model\n";
+  OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
+  if (!recognizer.Get()) {
+    std::cerr << "Please check your config\n";
+    exit(-1);
+  }
+  std::cout << "Loading model done\n";
+  return recognizer;
+}
+
+int32_t main() {
+  signal(SIGINT, Handler);
+
+  using namespace sherpa_onnx::cxx;  // NOLINT
+
+  auto vad = CreateVad();
+  auto recognizer = CreateOfflineRecognizer();
+
+  sherpa_onnx::Microphone mic;
+
+  PaDeviceIndex num_devices = Pa_GetDeviceCount();
+  if (num_devices == 0) {
+    std::cerr << " If you are using Linux, please try "
+                 "./build/bin/zipformer-ctc-simulate-streaming-alsa-cxx-api\n";
+    return -1;
+  }
+
+  int32_t device_index = Pa_GetDefaultInputDevice();
+  const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
+  if (pDeviceIndex) {
+    fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
+    device_index = atoi(pDeviceIndex);
+  }
+  mic.PrintDevices(device_index);
+
+  float mic_sample_rate = 16000;
+  const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
+  if (sample_rate_str) {
+    mic_sample_rate = atof(sample_rate_str);
+    fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
+  }
+  float sample_rate = 16000;
+  LinearResampler resampler;
+  if (mic_sample_rate != sample_rate) {
+    float min_freq = std::min(mic_sample_rate, sample_rate);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+
+    int32_t lowpass_filter_width = 6;
+    resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
+                                        lowpass_cutoff, lowpass_filter_width);
+  }
+  if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
+                     nullptr) == false) {
+    std::cerr << "Failed to open microphone device\n";
+    return -1;
+  }
+
+  int32_t window_size = 512;  // samples, please don't change
+
+  int32_t offset = 0;
+  std::vector<float> buffer;
+  bool speech_started = false;
+
+  auto started_time = std::chrono::steady_clock::now();
+
+  SherpaDisplay display;
+
+  std::cout << "Started! Please speak\n";
+
+  while (!stop) {
+    {
+      std::unique_lock<std::mutex> lock(mutex);
+      while (samples_queue.empty() && !stop) {
+        condition_variable.wait(lock);
+      }
+
+      const auto &s = samples_queue.front();
+      if (!resampler.Get()) {
+        buffer.insert(buffer.end(), s.begin(), s.end());
+      } else {
+        auto resampled = resampler.Resample(s.data(), s.size(), false);
+        buffer.insert(buffer.end(), resampled.begin(), resampled.end());
+      }
+
+      samples_queue.pop();
+    }
+
+    for (; offset + window_size < buffer.size(); offset += window_size) {
+      vad.AcceptWaveform(buffer.data() + offset, window_size);
+      if (!speech_started && vad.IsDetected()) {
+        speech_started = true;
+        started_time = std::chrono::steady_clock::now();
+      }
+    }
+    if (!speech_started) {
+      if (buffer.size() > 10 * window_size) {
+        offset -= buffer.size() - 10 * window_size;
+        buffer = {buffer.end() - 10 * window_size, buffer.end()};
+      }
+    }
+
+    auto current_time = std::chrono::steady_clock::now();
+    const float elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
+                                                              started_time)
+            .count() /
+        1000.;
+
+    if (speech_started && elapsed_seconds > 0.2) {
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+      display.UpdateText(result.text);
+      display.Display();
+
+      started_time = std::chrono::steady_clock::now();
+    }
+
+    while (!vad.IsEmpty()) {
+      auto segment = vad.Front();
+
+      vad.Pop();
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sample_rate, segment.samples.data(),
+                            segment.samples.size());
+
+      recognizer.Decode(&stream);
+
+      OfflineRecognizerResult result = recognizer.GetResult(&stream);
+
+      display.UpdateText(result.text);
+      display.FinalizeCurrentSentence();
+      display.Display();
+
+      buffer.clear();
+      offset = 0;
+      speech_started = false;
+    }
+  }
+
+  return 0;
+}
+// Copyright (c) 2025 Xiaomi Corporation
+import 'dart:io';
+
+import 'package:args/args.dart';
+import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+
+import './init.dart';
+
+void main(List<String> arguments) async {
+  await initSherpaOnnx();
+
+  final parser = ArgParser()
+    ..addOption('model', help: 'Path to the Zipformer CTC model')
+    ..addOption('tokens', help: 'Path to tokens.txt')
+    ..addOption('input-wav', help: 'Path to input.wav to transcribe');
+
+  final res = parser.parse(arguments);
+  if (res['model'] == null ||
+      res['tokens'] == null ||
+      res['input-wav'] == null) {
+    print(parser.usage);
+    exit(1);
+  }
+
+  final model = res['model'] as String;
+  final tokens = res['tokens'] as String;
+  final inputWav = res['input-wav'] as String;
+
+  final zipformerCtc = sherpa_onnx.OfflineZipformerCtcModelConfig(model: model);
+
+  final modelConfig = sherpa_onnx.OfflineModelConfig(
+    zipformerCtc: zipformerCtc,
+    tokens: tokens,
+    debug: true,
+    numThreads: 1,
+  );
+  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
+  final recognizer = sherpa_onnx.OfflineRecognizer(config);
+
+  final waveData = sherpa_onnx.readWave(inputWav);
+  final stream = recognizer.createStream();
+
+  stream.acceptWaveform(
+      samples: waveData.samples, sampleRate: waveData.sampleRate);
+  recognizer.decode(stream);
+
+  final result = recognizer.getResult(stream);
+  print(result.text);
+
+  stream.free();
+  recognizer.free();
+}
+#!/usr/bin/env bash
+
+set -ex
+
+dart pub get
+
+if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+fi
+
+dart run \
+  ./bin/zipformer-ctc.dart \
+  --model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
+  --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
+  --input-wav ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav
+// Copyright (c) 2025 Xiaomi Corporation
+import 'dart:io';
+import 'dart:typed_data';
+
+import 'package:args/args.dart';
+import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+
+import './init.dart';
+
+void main(List<String> arguments) async {
+  await initSherpaOnnx();
+
+  final parser = ArgParser()
+    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
+    ..addOption('model', help: 'Path to the Zipformer CTC model')
+    ..addOption('tokens', help: 'Path to tokens.txt')
+    ..addOption('input-wav', help: 'Path to input.wav to transcribe');
+
+  final res = parser.parse(arguments);
+  if (res['silero-vad'] == null ||
+      res['model'] == null ||
+      res['tokens'] == null ||
+      res['input-wav'] == null) {
+    print(parser.usage);
+    exit(1);
+  }
+
+  // create VAD
+  final sileroVad = res['silero-vad'] as String;
+
+  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
+    model: sileroVad,
+    minSilenceDuration: 0.25,
+    minSpeechDuration: 0.5,
+    maxSpeechDuration: 5.0,
+  );
+
+  final vadConfig = sherpa_onnx.VadModelConfig(
+    sileroVad: sileroVadConfig,
+    numThreads: 1,
+    debug: true,
+  );
+
+  final vad = sherpa_onnx.VoiceActivityDetector(
+      config: vadConfig, bufferSizeInSeconds: 10);
+
+  // create offline recognizer
+  final model = res['model'] as String;
+  final tokens = res['tokens'] as String;
+  final inputWav = res['input-wav'] as String;
+
+  final zipformerCtc = sherpa_onnx.OfflineZipformerCtcModelConfig(model: model);
+
+  final modelConfig = sherpa_onnx.OfflineModelConfig(
+    zipformerCtc: zipformerCtc,
+    tokens: tokens,
+    debug: true,
+    numThreads: 1,
+  );
+  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
+  final recognizer = sherpa_onnx.OfflineRecognizer(config);
+
+  final waveData = sherpa_onnx.readWave(inputWav);
+  if (waveData.sampleRate != 16000) {
+    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
+    exit(1);
+  }
+
+  int numSamples = waveData.samples.length;
+  int numIter = numSamples ~/ vadConfig.sileroVad.windowSize;
+
+  for (int i = 0; i != numIter; ++i) {
+    int start = i * vadConfig.sileroVad.windowSize;
+    vad.acceptWaveform(Float32List.sublistView(
+        waveData.samples, start, start + vadConfig.sileroVad.windowSize));
+
+    while (!vad.isEmpty()) {
+      final samples = vad.front().samples;
+      final startTime = vad.front().start.toDouble() / waveData.sampleRate;
+      final endTime =
+          startTime + samples.length.toDouble() / waveData.sampleRate;
+
+      final stream = recognizer.createStream();
+      stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate);
+      recognizer.decode(stream);
+
+      final result = recognizer.getResult(stream);
+      stream.free();
+      print(
+          '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');
+
+      vad.pop();
+    }
+  }
+
+  vad.flush();
+
+  while (!vad.isEmpty()) {
+    final samples = vad.front().samples;
+    final startTime = vad.front().start.toDouble() / waveData.sampleRate;
+    final endTime = startTime + samples.length.toDouble() / waveData.sampleRate;
+
+    final stream = recognizer.createStream();
+    stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate);
+    recognizer.decode(stream);
+
+    final result = recognizer.getResult(stream);
+    stream.free();
+    print(
+        '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');
+
+    vad.pop();
+  }
+
+  vad.free();
+
+  recognizer.free();
+}
+#!/usr/bin/env bash
+
+set -ex
+
+dart pub get
+
+if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+dart run \
+  ./bin/zipformer-ctc.dart \
+  --silero-vad ./silero_vad.onnx \
+  --model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
+  --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
+  --input-wav ./lei-jun-test.wav
@@ -75,6 +75,9 @@ class OfflineDecodeFiles
     [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")]
     public string NeMoCtc { get; set; } = string.Empty;
 
+    [Option("zipformer-ctc", Required = false, HelpText = "Path to model.onnx. Used only for Zipformer CTC models")]
+    public string ZipformerCtc { get; set; } = string.Empty;
+
     [Option("dolphin-model", Required = false, Default = "", HelpText = "Path to dolphin ctc model")]
     public string DolphinModel { get; set; } = string.Empty;
 
@@ -240,6 +243,10 @@ to download pre-trained Tdnn models.
      {
        config.ModelConfig.Dolphin.Model = options.DolphinModel;
      }
+      else if (!string.IsNullOrEmpty(options.ZipformerCtc))
+      {
+        config.ModelConfig.ZipformerCtc.Model = options.ZipformerCtc;
+      }
      else if (!string.IsNullOrEmpty(options.TeleSpeechCtc))
      {
        config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc;
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+fi
+
+dotnet run \
+  --tokens=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
+  --zipformer-ctc=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
+  --num-threads=1 \
+  --files ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav \
+  ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/1.wav \
+  ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/8k.wav
@@ -104,6 +104,27 @@ class OfflineDolphinModelConfig {
   final String model;
 }
 
+class OfflineZipformerCtcModelConfig {
+  const OfflineZipformerCtcModelConfig({this.model = ''});
+
+  factory OfflineZipformerCtcModelConfig.fromJson(Map<String, dynamic> json) {
+    return OfflineZipformerCtcModelConfig(
+      model: json['model'] as String? ?? '',
+    );
+  }
+
+  @override
+  String toString() {
+    return 'OfflineZipformerCtcModelConfig(model: $model)';
+  }
+
+  Map<String, dynamic> toJson() => {
+        'model': model,
+      };
+
+  final String model;
+}
+
 class OfflineWhisperModelConfig {
   const OfflineWhisperModelConfig(
       {this.encoder = '',
@@ -288,6 +309,7 @@ class OfflineModelConfig {
     this.moonshine = const OfflineMoonshineModelConfig(),
     this.fireRedAsr = const OfflineFireRedAsrModelConfig(),
     this.dolphin = const OfflineDolphinModelConfig(),
+    this.zipformerCtc = const OfflineZipformerCtcModelConfig(),
     required this.tokens,
     this.numThreads = 1,
     this.debug = true,
@@ -336,6 +358,10 @@ class OfflineModelConfig {
           ? OfflineDolphinModelConfig.fromJson(
              json['dolphin'] as Map<String, dynamic>)
          : const OfflineDolphinModelConfig(),
+      zipformerCtc: json['zipformerCtc'] != null
+          ? OfflineZipformerCtcModelConfig.fromJson(
+              json['zipformerCtc'] as Map<String, dynamic>)
+          : const OfflineZipformerCtcModelConfig(),
       tokens: json['tokens'] as String,
       numThreads: json['numThreads'] as int? ?? 1,
       debug: json['debug'] as bool? ?? true,
@@ -349,7 +375,7 @@ class OfflineModelConfig {
 
   @override
   String toString() {
-    return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
+    return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, zipformerCtc: $zipformerCtc, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
   }
 
   Map<String, dynamic> toJson() => {
@@ -362,6 +388,7 @@ class OfflineModelConfig {
         'moonshine': moonshine.toJson(),
         'fireRedAsr': fireRedAsr.toJson(),
         'dolphin': dolphin.toJson(),
+        'zipformerCtc': zipformerCtc.toJson(),
         'tokens': tokens,
         'numThreads': numThreads,
         'debug': debug,
@@ -381,6 +408,7 @@ class OfflineModelConfig {
   final OfflineMoonshineModelConfig moonshine;
   final OfflineFireRedAsrModelConfig fireRedAsr;
   final OfflineDolphinModelConfig dolphin;
+  final OfflineZipformerCtcModelConfig zipformerCtc;
 
   final String tokens;
   final int numThreads;
@@ -578,6 +606,8 @@ class OfflineRecognizer {
         config.model.fireRedAsr.decoder.toNativeUtf8();
 
     c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8();
+    c.ref.model.zipformerCtc.model =
+        config.model.zipformerCtc.model.toNativeUtf8();
 
     c.ref.model.tokens = config.model.tokens.toNativeUtf8();
 
@@ -623,6 +653,7 @@ class OfflineRecognizer {
623 calloc.free(c.ref.model.modelType); 653 calloc.free(c.ref.model.modelType);
624 calloc.free(c.ref.model.provider); 654 calloc.free(c.ref.model.provider);
625 calloc.free(c.ref.model.tokens); 655 calloc.free(c.ref.model.tokens);
  656 + calloc.free(c.ref.model.zipformerCtc.model);
626 calloc.free(c.ref.model.dolphin.model); 657 calloc.free(c.ref.model.dolphin.model);
627 calloc.free(c.ref.model.fireRedAsr.decoder); 658 calloc.free(c.ref.model.fireRedAsr.decoder);
628 calloc.free(c.ref.model.fireRedAsr.encoder); 659 calloc.free(c.ref.model.fireRedAsr.encoder);
@@ -266,6 +266,10 @@ final class SherpaOnnxOfflineDolphinModelConfig extends Struct { @@ -266,6 +266,10 @@ final class SherpaOnnxOfflineDolphinModelConfig extends Struct {
266 external Pointer<Utf8> model; 266 external Pointer<Utf8> model;
267 } 267 }
268 268
  269 +final class SherpaOnnxOfflineZipformerCtcModelConfig extends Struct {
  270 + external Pointer<Utf8> model;
  271 +}
  272 +
269 final class SherpaOnnxOfflineWhisperModelConfig extends Struct { 273 final class SherpaOnnxOfflineWhisperModelConfig extends Struct {
270 external Pointer<Utf8> encoder; 274 external Pointer<Utf8> encoder;
271 external Pointer<Utf8> decoder; 275 external Pointer<Utf8> decoder;
@@ -333,6 +337,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct { @@ -333,6 +337,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct {
333 external SherpaOnnxOfflineMoonshineModelConfig moonshine; 337 external SherpaOnnxOfflineMoonshineModelConfig moonshine;
334 external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; 338 external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr;
335 external SherpaOnnxOfflineDolphinModelConfig dolphin; 339 external SherpaOnnxOfflineDolphinModelConfig dolphin;
  340 + external SherpaOnnxOfflineZipformerCtcModelConfig zipformerCtc;
336 } 341 }
337 342
338 final class SherpaOnnxOfflineRecognizerConfig extends Struct { 343 final class SherpaOnnxOfflineRecognizerConfig extends Struct {
@@ -28,6 +28,8 @@ func main() { @@ -28,6 +28,8 @@ func main() {
28 28
29 flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model") 29 flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model")
30 30
  31 + flag.StringVar(&config.ModelConfig.ZipformerCtc.Model, "zipformer-ctc", "", "Path to the Zipformer CTC model")
  32 +
31 flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model") 33 flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model")
32 34
33 flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model") 35 flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model")
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  7 +
  8 + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  9 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  10 +fi
  11 +
  12 +go mod tidy
  13 +go build
  14 +
  15 +./non-streaming-decode-files \
  16 + --zipformer-ctc ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
  17 + --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
  18 + --debug 0 \
  19 + ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav
@@ -15,6 +15,7 @@ export { Samples, @@ -15,6 +15,7 @@ export { Samples,
15 OfflineTdnnModelConfig, 15 OfflineTdnnModelConfig,
16 OfflineSenseVoiceModelConfig, 16 OfflineSenseVoiceModelConfig,
17 OfflineMoonshineModelConfig, 17 OfflineMoonshineModelConfig,
  18 + OfflineZipformerCtcModelConfig,
18 OfflineModelConfig, 19 OfflineModelConfig,
19 OfflineLMConfig, 20 OfflineLMConfig,
20 OfflineRecognizerConfig, 21 OfflineRecognizerConfig,
@@ -45,7 +45,23 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig( @@ -45,7 +45,23 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig(
45 return c; 45 return c;
46 } 46 }
47 47
48 -static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinfig( 48 +static SherpaOnnxOfflineZipformerCtcModelConfig
  49 +GetOfflineZipformerCtcModelConfig(Napi::Object obj) {
  50 + SherpaOnnxOfflineZipformerCtcModelConfig c;
  51 + memset(&c, 0, sizeof(c));
  52 +
  53 + if (!obj.Has("zipformerCtc") || !obj.Get("zipformerCtc").IsObject()) {
  54 + return c;
  55 + }
  56 +
  57 + Napi::Object o = obj.Get("zipformerCtc").As<Napi::Object>();
  58 +
  59 + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  60 +
  61 + return c;
  62 +}
  63 +
  64 +static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinModelConfig(
49 Napi::Object obj) { 65 Napi::Object obj) {
50 SherpaOnnxOfflineDolphinModelConfig c; 66 SherpaOnnxOfflineDolphinModelConfig c;
51 memset(&c, 0, sizeof(c)); 67 memset(&c, 0, sizeof(c));
@@ -185,7 +201,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { @@ -185,7 +201,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
185 c.sense_voice = GetOfflineSenseVoiceModelConfig(o); 201 c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
186 c.moonshine = GetOfflineMoonshineModelConfig(o); 202 c.moonshine = GetOfflineMoonshineModelConfig(o);
187 c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o); 203 c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o);
188 - c.dolphin = GetOfflineDolphinfig(o); 204 + c.dolphin = GetOfflineDolphinModelConfig(o);
  205 + c.zipformer_ctc = GetOfflineZipformerCtcModelConfig(o);
189 206
190 SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); 207 SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
191 SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); 208 SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -312,6 +329,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { @@ -312,6 +329,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
312 SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder); 329 SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder);
313 330
314 SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model); 331 SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model);
  332 + SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model);
315 333
316 SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); 334 SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
317 SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); 335 SHERPA_ONNX_DELETE_C_STR(c.model_config.provider);
@@ -55,6 +55,10 @@ export class OfflineDolphinModelConfig { @@ -55,6 +55,10 @@ export class OfflineDolphinModelConfig {
55 public model: string = ''; 55 public model: string = '';
56 } 56 }
57 57
  58 +export class OfflineZipformerCtcModelConfig {
  59 + public model: string = '';
  60 +}
  61 +
58 export class OfflineWhisperModelConfig { 62 export class OfflineWhisperModelConfig {
59 public encoder: string = ''; 63 public encoder: string = '';
60 public decoder: string = ''; 64 public decoder: string = '';
@@ -97,6 +101,7 @@ export class OfflineModelConfig { @@ -97,6 +101,7 @@ export class OfflineModelConfig {
97 public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig(); 101 public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig();
98 public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig(); 102 public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig();
99 public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig(); 103 public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig();
  104 + public zipformerCtc: OfflineZipformerCtcModelConfig = new OfflineZipformerCtcModelConfig();
100 } 105 }
101 106
102 export class OfflineLMConfig { 107 export class OfflineLMConfig {
  1 +// Copyright 2025 Xiaomi Corporation
  2 +
  3 +// This file shows how to use an offline Zipformer CTC model,
  4 +// i.e., non-streaming Zipformer CTC model,
  5 +// to decode files.
  6 +import com.k2fsa.sherpa.onnx.*;
  7 +
  8 +public class NonStreamingDecodeFileZipformerCtc {
  9 + public static void main(String[] args) {
  10 + // please refer to
  11 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  12 + // to download model files
  13 + String model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
  14 + String tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
  15 +
  16 + String waveFilename = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav";
  17 +
  18 + WaveReader reader = new WaveReader(waveFilename);
  19 +
  20 + OfflineZipformerCtcModelConfig zipformerCtc =
  21 + OfflineZipformerCtcModelConfig.builder().setModel(model).build();
  22 +
  23 + OfflineModelConfig modelConfig =
  24 + OfflineModelConfig.builder()
  25 + .setZipformerCtc(zipformerCtc)
  26 + .setTokens(tokens)
  27 + .setNumThreads(1)
  28 + .setDebug(true)
  29 + .build();
  30 +
  31 + OfflineRecognizerConfig config =
  32 + OfflineRecognizerConfig.builder()
  33 + .setOfflineModelConfig(modelConfig)
  34 + .setDecodingMethod("greedy_search")
  35 + .build();
  36 +
  37 + OfflineRecognizer recognizer = new OfflineRecognizer(config);
  38 + OfflineStream stream = recognizer.createStream();
  39 + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
  40 +
  41 + recognizer.decode(stream);
  42 +
  43 + String text = recognizer.getResult(stream).getText();
  44 +
  45 + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text);
  46 +
  47 + stream.release();
  48 + recognizer.release();
  49 + }
  50 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  6 + mkdir -p ../build
  7 + pushd ../build
  8 + cmake \
  9 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  10 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  11 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  12 + -DBUILD_SHARED_LIBS=ON \
  13 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  14 + -DSHERPA_ONNX_ENABLE_JNI=ON \
  15 + ..
  16 +
  17 + make -j4
  18 + ls -lh lib
  19 + popd
  20 +fi
  21 +
  22 +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  23 + pushd ../sherpa-onnx/java-api
  24 + make
  25 + popd
  26 +fi
  27 +
  28 +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  29 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  30 +
  31 + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  32 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  33 +fi
  34 +
  35 +java \
  36 + -Djava.library.path=$PWD/../build/lib \
  37 + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  38 + NonStreamingDecodeFileZipformerCtc.java
@@ -253,6 +253,13 @@ function testOfflineAsr() { @@ -253,6 +253,13 @@ function testOfflineAsr() {
253 rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 253 rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2
254 fi 254 fi
255 255
  256 + if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then
  257 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  258 +
  259 + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  260 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  261 + fi
  262 +
256 out_filename=test_offline_asr.jar 263 out_filename=test_offline_asr.jar
257 kotlinc-jvm -include-runtime -d $out_filename \ 264 kotlinc-jvm -include-runtime -d $out_filename \
258 test_offline_asr.kt \ 265 test_offline_asr.kt \
1 package com.k2fsa.sherpa.onnx 1 package com.k2fsa.sherpa.onnx
2 2
3 fun main() { 3 fun main() {
4 - val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25) 4 + val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25, 31)
5 for (type in types) { 5 for (type in types) {
6 test(type) 6 test(type)
7 } 7 }
@@ -19,6 +19,7 @@ fun test(type: Int) { @@ -19,6 +19,7 @@ fun test(type: Int) {
19 21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav" 19 21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav"
20 24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav" 20 24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav"
21 25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav" 21 25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav"
  22 + 31 -> "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"
22 else -> null 23 else -> null
23 } 24 }
24 25
@@ -123,6 +123,7 @@ The following tables list the examples in this folder. @@ -123,6 +123,7 @@ The following tables list the examples in this folder.
123 |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| 123 |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)|
124 |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| 124 |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)|
125 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| 125 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
  126 +|[./test_asr_non_streaming_zipformer_ctc.js](./test_asr_non_streaming_zipformer_ctc.js)|Non-streaming speech recognition from a file using a Zipformer CTC model with greedy search|
126 |[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search| 127 |[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search|
127 |[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search| 128 |[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search|
128 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| 129 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
@@ -137,6 +138,7 @@ The following tables list the examples in this folder. @@ -137,6 +138,7 @@ The following tables list the examples in this folder.
137 |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)| 138 |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
138 |[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)| 139 |[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)|
139 |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| 140 |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
  141 +|[./test_vad_asr_non_streaming_zipformer_ctc_microphone.js](./test_vad_asr_non_streaming_zipformer_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer CTC model with greedy search|
140 |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| 142 |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
141 |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| 143 |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
142 144
@@ -372,6 +374,21 @@ rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 @@ -372,6 +374,21 @@ rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
372 node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js 374 node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js
373 ``` 375 ```
374 376
  377 +### Non-streaming speech recognition with Zipformer CTC models
  378 +
  379 +```bash
  380 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  381 +
  382 +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  383 +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  384 +
  385 +node ./test_asr_non_streaming_zipformer_ctc.js
  386 +
  387 +# To run VAD + non-streaming ASR with Zipformer CTC using a microphone
  388 +npm install naudiodon2
  389 +node ./test_vad_asr_non_streaming_zipformer_ctc_microphone.js
  390 +```
  391 +
375 ### Non-streaming speech recognition with NeMo CTC models 392 ### Non-streaming speech recognition with NeMo CTC models
376 393
377 ```bash 394 ```bash
  1 +// Copyright (c) 2025 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx-node');
  3 +
  4 +// Please download test files from
  5 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  6 +const config = {
  7 + 'featConfig': {
  8 + 'sampleRate': 16000,
  9 + 'featureDim': 80,
  10 + },
  11 + 'modelConfig': {
  12 + 'zipformerCtc': {
  13 + 'model': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
  14 + },
  15 + 'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
  16 + 'numThreads': 2,
  17 + 'provider': 'cpu',
  18 + 'debug': 1,
  19 + }
  20 +};
  21 +
  22 +const waveFilename =
  23 + './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
  24 +
  25 +const recognizer = new sherpa_onnx.OfflineRecognizer(config);
  26 +console.log('Started')
  27 +let start = Date.now();
  28 +const stream = recognizer.createStream();
  29 +const wave = sherpa_onnx.readWave(waveFilename);
  30 +stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
  31 +
  32 +recognizer.decode(stream);
  33 +const result = recognizer.getResult(stream);
  34 +let stop = Date.now();
  35 +console.log('Done')
  36 +
  37 +const elapsed_seconds = (stop - start) / 1000;
  38 +const duration = wave.samples.length / wave.sampleRate;
  39 +const real_time_factor = elapsed_seconds / duration;
  40 +console.log('Wave duration', duration.toFixed(3), 'seconds')
  41 +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
  42 +console.log(
  43 + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
  44 + real_time_factor.toFixed(3))
  45 +console.log(waveFilename)
  46 +console.log('result\n', result)
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +//
  3 +const portAudio = require('naudiodon2');
  4 +// console.log(portAudio.getDevices());
  5 +
  6 +const sherpa_onnx = require('sherpa-onnx-node');
  7 +
  8 +function createRecognizer() {
  9 + // Please download test files from
  10 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  11 + const config = {
  12 + 'featConfig': {
  13 + 'sampleRate': 16000,
  14 + 'featureDim': 80,
  15 + },
  16 + 'modelConfig': {
  17 + 'zipformerCtc': {
  18 + 'model':
  19 + './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
  20 + },
  21 + 'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
  22 + 'numThreads': 2,
  23 + 'provider': 'cpu',
  24 + 'debug': 1,
  25 + }
  26 + };
  27 +
  28 + return new sherpa_onnx.OfflineRecognizer(config);
  29 +}
  30 +
  31 +function createVad() {
  32 + // please download silero_vad.onnx from
  33 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  34 + const config = {
  35 + sileroVad: {
  36 + model: './silero_vad.onnx',
  37 + threshold: 0.5,
  38 + minSpeechDuration: 0.25,
  39 + minSilenceDuration: 0.5,
  40 + windowSize: 512,
  41 + },
  42 + sampleRate: 16000,
  43 + debug: true,
  44 + numThreads: 1,
  45 + };
  46 +
  47 + const bufferSizeInSeconds = 60;
  48 +
  49 + return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
  50 +}
  51 +
  52 +const recognizer = createRecognizer();
  53 +const vad = createVad();
  54 +
  55 +const bufferSizeInSeconds = 30;
  56 +const buffer =
  57 + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
  58 +
  59 +const ai = new portAudio.AudioIO({
  60 + inOptions: {
  61 + channelCount: 1,
  62 + closeOnError: true, // Close the stream if an audio error is detected;
  63 + // if set to false, errors are only logged
  64 + deviceId: -1, // Use -1 or omit the deviceId to select the default device
  65 + sampleFormat: portAudio.SampleFormatFloat32,
  66 + sampleRate: vad.config.sampleRate
  67 + }
  68 +});
  69 +
  70 +let printed = false;
  71 +let index = 0;
  72 +ai.on('data', data => {
  73 + const windowSize = vad.config.sileroVad.windowSize;
  74 + buffer.push(new Float32Array(data.buffer));
  75 + while (buffer.size() > windowSize) {
  76 + const samples = buffer.get(buffer.head(), windowSize);
  77 + buffer.pop(windowSize);
  78 + vad.acceptWaveform(samples);
  79 + }
  80 +
  81 + while (!vad.isEmpty()) {
  82 + const segment = vad.front();
  83 + vad.pop();
  84 + const stream = recognizer.createStream();
  85 + stream.acceptWaveform({
  86 + samples: segment.samples,
  87 + sampleRate: recognizer.config.featConfig.sampleRate
  88 + });
  89 + recognizer.decode(stream);
  90 + const r = recognizer.getResult(stream);
  91 + if (r.text.length > 0) {
  92 + const text = r.text.toLowerCase().trim();
  93 + console.log(`${index}: ${text}`);
  94 +
  95 + const filename = `${index}-${text}-${
  96 + new Date()
  97 + .toLocaleTimeString('en-US', {hour12: false})
  98 + .split(' ')[0]}.wav`;
  99 + sherpa_onnx.writeWave(
  100 + filename,
  101 + {samples: segment.samples, sampleRate: vad.config.sampleRate});
  102 +
  103 + index += 1;
  104 + }
  105 + }
  106 +});
  107 +
  108 +ai.start();
  109 +console.log('Started! Please speak')
@@ -154,6 +154,23 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 @@ -154,6 +154,23 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
154 node ./test-offline-dolphin-ctc.js 154 node ./test-offline-dolphin-ctc.js
155 ``` 155 ```
156 156
  157 +## ./test-offline-zipformer-ctc.js
  158 +
  159 +[./test-offline-zipformer-ctc.js](./test-offline-zipformer-ctc.js) demonstrates
  160 +how to decode a file with a Zipformer CTC model. In the code we use
  161 +[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese).
  162 +
  163 +You can use the following command to run it:
  164 +
  165 +```bash
  166 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  167 +
  168 +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  169 +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  170 +
  171 +node ./test-offline-zipformer-ctc.js
  172 +```
  173 +
157 ## ./test-offline-nemo-ctc.js 174 ## ./test-offline-nemo-ctc.js
158 175
159 [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates 176 [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +//
  3 +const fs = require('fs');
  4 +const {Readable} = require('stream');
  5 +const wav = require('wav');
  6 +
  7 +const sherpa_onnx = require('sherpa-onnx');
  8 +
  9 +function createOfflineRecognizer() {
  10 + let config = {
  11 + modelConfig: {
  12 + zipformerCtc: {
  13 + model: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
  14 + },
  15 + tokens: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
  16 + }
  17 + };
  18 +
  19 + return sherpa_onnx.createOfflineRecognizer(config);
  20 +}
  21 +
  22 +const recognizer = createOfflineRecognizer();
  23 +const stream = recognizer.createStream();
  24 +
  25 +const waveFilename =
  26 + './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
  27 +const wave = sherpa_onnx.readWave(waveFilename);
  28 +stream.acceptWaveform(wave.sampleRate, wave.samples);
  29 +
  30 +recognizer.decode(stream);
  31 +const text = recognizer.getResult(stream).text;
  32 +console.log(text);
  33 +
  34 +stream.free();
  35 +recognizer.free();
@@ -9,3 +9,4 @@ sense_voice @@ -9,3 +9,4 @@ sense_voice
9 telespeech_ctc 9 telespeech_ctc
10 moonshine 10 moonshine
11 dolphin_ctc 11 dolphin_ctc
  12 +zipformer_ctc
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + ls -lh lib
  24 + popd
  25 +fi
  26 +
  27 +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  28 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  29 +
  30 + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  31 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  32 +fi
  33 +
  34 +fpc \
  35 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  36 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  37 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  38 + ./zipformer_ctc.pas
  39 +
  40 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  41 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  42 +
  43 +./zipformer_ctc
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +
  3 +{
  4 +This file shows how to use a non-streaming Zipformer CTC model
  5 +to decode files.
  6 +
  7 +You can download the model files from
  8 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  9 +}
  10 +
  11 +program zipformer_ctc;
  12 +
  13 +{$mode objfpc}
  14 +
  15 +uses
  16 + sherpa_onnx,
  17 + DateUtils,
  18 + SysUtils;
  19 +
  20 +var
  21 + Wave: TSherpaOnnxWave;
  22 + WaveFilename: AnsiString;
  23 +
  24 + Config: TSherpaOnnxOfflineRecognizerConfig;
  25 + Recognizer: TSherpaOnnxOfflineRecognizer;
  26 + Stream: TSherpaOnnxOfflineStream;
  27 + RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  28 +
  29 + Start: TDateTime;
  30 + Stop: TDateTime;
  31 +
  32 + Elapsed: Single;
  33 + Duration: Single;
  34 + RealTimeFactor: Single;
  35 +begin
  36 + Initialize(Config);
  37 +
  38 + Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';
  39 + Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
  40 + Config.ModelConfig.Provider := 'cpu';
  41 + Config.ModelConfig.NumThreads := 1;
  42 + Config.ModelConfig.Debug := False;
  43 +
  44 + WaveFilename := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
  45 +
  46 + Wave := SherpaOnnxReadWave(WaveFilename);
  47 +
  48 + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  49 + Stream := Recognizer.CreateStream();
  50 + Start := Now;
  51 +
  52 + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
  53 + Recognizer.Decode(Stream);
  54 +
  55 + RecognitionResult := Recognizer.GetResult(Stream);
  56 +
  57 + Stop := Now;
  58 +
  59 + Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
  60 + Duration := Length(Wave.Samples) / Wave.SampleRate;
  61 + RealTimeFactor := Elapsed / Duration;
  62 +
  63 + WriteLn(RecognitionResult.ToString);
  64 + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
  65 + WriteLn(Format('Elapsed %.3f s', [Elapsed]));
  66 + WriteLn(Format('Wave duration %.3f s', [Duration]));
  67 + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
  68 +
  69 + {Free resources to avoid memory leaks.
  70 +
  71 + Note: You don't need to invoke them for this simple script.
  72 + However, you should invoke them in your own larger projects.
  73 + }
  74 + FreeAndNil(Stream);
  75 + FreeAndNil(Recognizer);
  76 +end.
@@ -2,3 +2,5 @@ @@ -2,3 +2,5 @@
2 vad_with_whisper 2 vad_with_whisper
3 vad_with_sense_voice 3 vad_with_sense_voice
4 vad_with_moonshine 4 vad_with_moonshine
  5 +vad_with_zipformer_ctc
  6 +vad_with_dolphin
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [[ ! -f ./silero_vad.onnx ]]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  28 +fi
  29 +
  30 +if [ ! -f ./lei-jun-test.wav ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  32 +fi
  33 +
  34 +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  35 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  36 +
  37 + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  38 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  39 +fi
  40 +
  41 +fpc \
  42 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  43 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  44 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  45 + ./vad_with_zipformer_ctc.pas
  46 +
  47 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  48 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  49 +
  50 +./vad_with_zipformer_ctc
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +
  3 +{
  4 +This file shows how to use a non-streaming Zipformer CTC model
  5 +with silero VAD to decode files.
  6 +
  7 +You can download the model files from
  8 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  9 +}
  10 +
  11 +program vad_with_zipformer_ctc;
  12 +
  13 +{$mode objfpc}
  14 +
  15 +uses
  16 + sherpa_onnx,
  17 + SysUtils;
  18 +
  19 +function CreateVad(): TSherpaOnnxVoiceActivityDetector;
  20 +var
  21 + Config: TSherpaOnnxVadModelConfig;
  22 +
  23 + SampleRate: Integer;
  24 + WindowSize: Integer;
  25 +begin
  26 + Initialize(Config);
  27 +
  28 + SampleRate := 16000; {Please don't change it unless you know the details}
  29 + WindowSize := 512; {Please don't change it unless you know the details}
  30 +
  31 + Config.SileroVad.Model := './silero_vad.onnx';
  32 + Config.SileroVad.MinSpeechDuration := 0.5;
  33 + Config.SileroVad.MinSilenceDuration := 0.5;
  34 + Config.SileroVad.Threshold := 0.5;
  35 + Config.SileroVad.WindowSize := WindowSize;
  36 + Config.NumThreads := 1;
  37 + Config.Debug := True;
  38 + Config.Provider := 'cpu';
  39 + Config.SampleRate := SampleRate;
  40 +
  41 + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
  42 +end;
  43 +
  44 +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
  45 +var
  46 + Config: TSherpaOnnxOfflineRecognizerConfig;
  47 +begin
  48 + Initialize(Config);
  49 +
  50 + Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';
  51 + Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
  52 + Config.ModelConfig.Provider := 'cpu';
  53 + Config.ModelConfig.NumThreads := 1;
  54 + Config.ModelConfig.Debug := False;
  55 +
  56 + Result := TSherpaOnnxOfflineRecognizer.Create(Config);
  57 +end;
  58 +
  59 +var
  60 + Wave: TSherpaOnnxWave;
  61 +
  62 + Recognizer: TSherpaOnnxOfflineRecognizer;
  63 + Vad: TSherpaOnnxVoiceActivityDetector;
  64 +
  65 + Offset: Integer;
  66 + WindowSize: Integer;
  67 + SpeechSegment: TSherpaOnnxSpeechSegment;
  68 +
  69 + Start: Single;
  70 + Duration: Single;
  71 +
  72 + Stream: TSherpaOnnxOfflineStream;
  73 + RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  74 +begin
  75 + Vad := CreateVad();
  76 + Recognizer := CreateOfflineRecognizer();
  77 +
  78 + Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  79 + if Wave.SampleRate <> Vad.Config.SampleRate then
  80 + begin
  81 + WriteLn(Format('Expected sample rate: %d. Given: %d',
  82 + [Vad.Config.SampleRate, Wave.SampleRate]));
  83 +
  84 + Exit;
  85 + end;
  86 +
  87 + WindowSize := Vad.Config.SileroVad.WindowSize;
  88 + Offset := 0;
  89 + while Offset + WindowSize <= Length(Wave.Samples) do
  90 + begin
  91 + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
  92 + Offset += WindowSize;
  93 +
  94 + while not Vad.IsEmpty do
  95 + begin
  96 + SpeechSegment := Vad.Front();
  97 + Vad.Pop();
  98 + Stream := Recognizer.CreateStream();
  99 +
  100 + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
  101 + Recognizer.Decode(Stream);
  102 + RecognitionResult := Recognizer.GetResult(Stream);
  103 +
  104 + Start := SpeechSegment.Start / Wave.SampleRate;
  105 + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
  106 + WriteLn(Format('%.3f -- %.3f %s',
  107 + [Start, Start + Duration, RecognitionResult.Text]));
  108 +
  109 + FreeAndNil(Stream);
  110 + end;
  111 + end;
  112 +
  113 + Vad.Flush;
  114 +
  115 + while not Vad.IsEmpty do
  116 + begin
  117 + SpeechSegment := Vad.Front();
  118 + Vad.Pop();
  119 + Stream := Recognizer.CreateStream();
  120 +
  121 + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
  122 + Recognizer.Decode(Stream);
  123 + RecognitionResult := Recognizer.GetResult(Stream);
  124 +
  125 + Start := SpeechSegment.Start / Wave.SampleRate;
  126 + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
  127 + WriteLn(Format('%.3f -- %.3f %s',
  128 + [Start, Start + Duration, RecognitionResult.Text]));
  129 +
  130 + FreeAndNil(Stream);
  131 + end;
  132 +
  133 + FreeAndNil(Recognizer);
  134 + FreeAndNil(Vad);
  135 +end.
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +This file shows how to use a non-streaming zipformer CTC model from icefall
  5 +to decode files.
  6 +
  7 +Please download model files from
  8 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  9 +
  10 +"""
  11 +
  12 +from pathlib import Path
  13 +
  14 +import sherpa_onnx
  15 +import soundfile as sf
  16 +
  17 +
  18 +def create_recognizer():
  19 + model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"
  20 + tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"
  21 + test_wav = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"
  22 +
  23 + if not Path(model).is_file() or not Path(test_wav).is_file():
  24 + raise ValueError(
  25 + """Please download model files from
  26 + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  27 + """
  28 + )
  29 + return (
  30 + sherpa_onnx.OfflineRecognizer.from_zipformer_ctc(
  31 + model=model,
  32 + tokens=tokens,
  33 + debug=True,
  34 + ),
  35 + test_wav,
  36 + )
  37 +
  38 +
  39 +def main():
  40 + recognizer, wave_filename = create_recognizer()
  41 +
  42 + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
  43 + audio = audio[:, 0] # only use the first channel
  44 +
  45 + # audio is a 1-D float32 numpy array normalized to the range [-1, 1]
  46 + # sample_rate does not need to be 16000 Hz
  47 +
  48 + stream = recognizer.create_stream()
  49 + stream.accept_waveform(sample_rate, audio)
  50 + recognizer.decode_stream(stream)
  51 + print(wave_filename)
  52 + print(stream.result)
  53 +
  54 +
  55 +if __name__ == "__main__":
  56 + main()
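The example above decodes a single file. Because `from_zipformer_ctc` returns an ordinary `OfflineRecognizer`, one recognizer can be reused across files, with a fresh stream per utterance; a minimal sketch using only the calls demonstrated above:

```python
import sherpa_onnx
import soundfile as sf

# Reuse one recognizer for several files; create a new stream per file.
recognizer = sherpa_onnx.OfflineRecognizer.from_zipformer_ctc(
    model="./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx",
    tokens="./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt",
)

for wav in (
    "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav",
    "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/1.wav",
):
    audio, sample_rate = sf.read(wav, dtype="float32", always_2d=True)
    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, audio[:, 0])  # first channel only
    recognizer.decode_stream(stream)
    print(wav)
    print(stream.result)
```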
@@ -344,7 +344,7 @@ def get_models(): @@ -344,7 +344,7 @@ def get_models():
344 """, 344 """,
345 ), 345 ),
346 Model( 346 Model(
347 - model_name="sherpa-onnx-streaming-zipformer-ctc-fp16-zh-2025-06-30", 347 + model_name="sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30",
348 idx=19, 348 idx=19,
349 lang="zh", 349 lang="zh",
350 short_name="large_zipformer_fp16", 350 short_name="large_zipformer_fp16",
@@ -363,6 +363,26 @@ def get_models(): @@ -363,6 +363,26 @@ def get_models():
363 popd 363 popd
364 """, 364 """,
365 ), 365 ),
  366 + Model(
  367 + model_name="sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30",
  368 + idx=20,
  369 + lang="zh",
  370 + short_name="large_zipformer_int8",
  371 + rule_fsts="itn_zh_number.fst",
  372 + cmd="""
  373 + if [ ! -f itn_zh_number.fst ]; then
  374 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
  375 + fi
  376 + pushd $model_name
  377 + rm -fv bpe.model
  378 +
  379 + rm -rf test_wavs
  380 +
  381 + ls -lh
  382 +
  383 + popd
  384 + """,
  385 + ),
366 ] 386 ]
367 387
368 return models 388 return models
@@ -551,6 +551,23 @@ def get_models(): @@ -551,6 +551,23 @@ def get_models():
551 popd 551 popd
552 """, 552 """,
553 ), 553 ),
  554 + Model(
  555 + model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03",
  556 + idx=31,
  557 + lang="zh",
  558 + lang2="Chinese",
  559 + short_name="zipformer_2025_07_03",
  560 + cmd="""
  561 + pushd $model_name
  562 +
  563 + rm -rfv test_wavs
  564 + rm -rfv bbpe.model
  565 +
  566 + ls -lh
  567 +
  568 + popd
  569 + """,
  570 + ),
554 ] 571 ]
555 return models 572 return models
556 573
@@ -27,6 +27,7 @@ namespace SherpaOnnx @@ -27,6 +27,7 @@ namespace SherpaOnnx
27 Moonshine = new OfflineMoonshineModelConfig(); 27 Moonshine = new OfflineMoonshineModelConfig();
28 FireRedAsr = new OfflineFireRedAsrModelConfig(); 28 FireRedAsr = new OfflineFireRedAsrModelConfig();
29 Dolphin = new OfflineDolphinModelConfig(); 29 Dolphin = new OfflineDolphinModelConfig();
  30 + ZipformerCtc = new OfflineZipformerCtcModelConfig();
30 } 31 }
31 public OfflineTransducerModelConfig Transducer; 32 public OfflineTransducerModelConfig Transducer;
32 public OfflineParaformerModelConfig Paraformer; 33 public OfflineParaformerModelConfig Paraformer;
@@ -60,5 +61,6 @@ namespace SherpaOnnx @@ -60,5 +61,6 @@ namespace SherpaOnnx
60 public OfflineMoonshineModelConfig Moonshine; 61 public OfflineMoonshineModelConfig Moonshine;
61 public OfflineFireRedAsrModelConfig FireRedAsr; 62 public OfflineFireRedAsrModelConfig FireRedAsr;
62 public OfflineDolphinModelConfig Dolphin; 63 public OfflineDolphinModelConfig Dolphin;
  64 + public OfflineZipformerCtcModelConfig ZipformerCtc;
63 } 65 }
64 } 66 }
  1 +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +using System.Runtime.InteropServices;
  4 +
  5 +namespace SherpaOnnx
  6 +{
  7 +
  8 + [StructLayout(LayoutKind.Sequential)]
  9 + public struct OfflineZipformerCtcModelConfig
  10 + {
  11 + public OfflineZipformerCtcModelConfig()
  12 + {
  13 + Model = "";
  14 + }
  15 + [MarshalAs(UnmanagedType.LPStr)]
  16 + public string Model;
  17 + }
  18 +}
  1 +../../../../go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh
@@ -398,6 +398,10 @@ type OfflineNemoEncDecCtcModelConfig struct { @@ -398,6 +398,10 @@ type OfflineNemoEncDecCtcModelConfig struct {
398 Model string // Path to the model, e.g., model.onnx or model.int8.onnx 398 Model string // Path to the model, e.g., model.onnx or model.int8.onnx
399 } 399 }
400 400
  401 +type OfflineZipformerCtcModelConfig struct {
  402 + Model string // Path to the model, e.g., model.onnx or model.int8.onnx
  403 +}
  404 +
401 type OfflineDolphinModelConfig struct { 405 type OfflineDolphinModelConfig struct {
402 Model string // Path to the model, e.g., model.onnx or model.int8.onnx 406 Model string // Path to the model, e.g., model.onnx or model.int8.onnx
403 } 407 }
@@ -439,16 +443,17 @@ type OfflineLMConfig struct { @@ -439,16 +443,17 @@ type OfflineLMConfig struct {
439 } 443 }
440 444
441 type OfflineModelConfig struct { 445 type OfflineModelConfig struct {
442 - Transducer OfflineTransducerModelConfig  
443 - Paraformer OfflineParaformerModelConfig  
444 - NemoCTC OfflineNemoEncDecCtcModelConfig  
445 - Whisper OfflineWhisperModelConfig  
446 - Tdnn OfflineTdnnModelConfig  
447 - SenseVoice OfflineSenseVoiceModelConfig  
448 - Moonshine OfflineMoonshineModelConfig  
449 - FireRedAsr OfflineFireRedAsrModelConfig  
450 - Dolphin OfflineDolphinModelConfig  
451 - Tokens string // Path to tokens.txt 446 + Transducer OfflineTransducerModelConfig
  447 + Paraformer OfflineParaformerModelConfig
  448 + NemoCTC OfflineNemoEncDecCtcModelConfig
  449 + Whisper OfflineWhisperModelConfig
  450 + Tdnn OfflineTdnnModelConfig
  451 + SenseVoice OfflineSenseVoiceModelConfig
  452 + Moonshine OfflineMoonshineModelConfig
  453 + FireRedAsr OfflineFireRedAsrModelConfig
  454 + Dolphin OfflineDolphinModelConfig
  455 + ZipformerCtc OfflineZipformerCtcModelConfig
  456 + Tokens string // Path to tokens.txt
452 457
453 // Number of threads to use for neural network computation 458 // Number of threads to use for neural network computation
454 NumThreads int 459 NumThreads int
@@ -540,6 +545,7 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher @@ -540,6 +545,7 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher
540 c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder) 545 c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder)
541 546
542 c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model) 547 c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model)
  548 + c.model_config.zipformer_ctc.model = C.CString(config.ModelConfig.ZipformerCtc.Model)
543 549
544 c.model_config.tokens = C.CString(config.ModelConfig.Tokens) 550 c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
545 551
@@ -653,11 +659,22 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig) @@ -653,11 +659,22 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig)
653 C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder)) 659 C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder))
654 c.model_config.fire_red_asr.encoder = nil 660 c.model_config.fire_red_asr.encoder = nil
655 } 661 }
  662 +
656 if c.model_config.fire_red_asr.decoder != nil { 663 if c.model_config.fire_red_asr.decoder != nil {
657 C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder)) 664 C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder))
658 c.model_config.fire_red_asr.decoder = nil 665 c.model_config.fire_red_asr.decoder = nil
659 } 666 }
660 667
  668 + if c.model_config.dolphin.model != nil {
  669 + C.free(unsafe.Pointer(c.model_config.dolphin.model))
  670 + c.model_config.dolphin.model = nil
  671 + }
  672 +
  673 + if c.model_config.zipformer_ctc.model != nil {
  674 + C.free(unsafe.Pointer(c.model_config.zipformer_ctc.model))
  675 + c.model_config.zipformer_ctc.model = nil
  676 + }
  677 +
661 if c.model_config.tokens != nil { 678 if c.model_config.tokens != nil {
662 C.free(unsafe.Pointer(c.model_config.tokens)) 679 C.free(unsafe.Pointer(c.model_config.tokens))
663 c.model_config.tokens = nil 680 c.model_config.tokens = nil
@@ -212,6 +212,21 @@ def get_models(): @@ -212,6 +212,21 @@ def get_models():
212 git diff 212 git diff
213 """, 213 """,
214 ), 214 ),
  215 + Model(
  216 + model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03",
  217 + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc",
  218 + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc",
  219 + short_name="vad-asr-zh-zipformer-ctc",
  220 + cmd="""
  221 + pushd $model_name
  222 + mv model.int8.onnx ../zipformer-ctc.onnx
  223 + mv tokens.txt ../
  224 + popd
  225 + rm -rf $model_name
  226 + sed -i.bak 's/Zipformer/Zipformer CTC supporting Chinese 中文/g' ../index.html
  227 + git diff
  228 + """,
  229 + ),
215 ] 230 ]
216 return models 231 return models
217 232
@@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig( @@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig(
484 recognizer_config.model_config.dolphin.model = 484 recognizer_config.model_config.dolphin.model =
485 SHERPA_ONNX_OR(config->model_config.dolphin.model, ""); 485 SHERPA_ONNX_OR(config->model_config.dolphin.model, "");
486 486
  487 + recognizer_config.model_config.zipformer_ctc.model =
  488 + SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, "");
  489 +
487 recognizer_config.lm_config.model = 490 recognizer_config.lm_config.model =
488 SHERPA_ONNX_OR(config->lm_config.model, ""); 491 SHERPA_ONNX_OR(config->lm_config.model, "");
489 recognizer_config.lm_config.scale = 492 recognizer_config.lm_config.scale =
@@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig { @@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig {
451 const char *model; 451 const char *model;
452 } SherpaOnnxOfflineDolphinModelConfig; 452 } SherpaOnnxOfflineDolphinModelConfig;
453 453
  454 +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineZipformerCtcModelConfig {
  455 + const char *model;
  456 +} SherpaOnnxOfflineZipformerCtcModelConfig;
  457 +
454 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { 458 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
455 SherpaOnnxOfflineTransducerModelConfig transducer; 459 SherpaOnnxOfflineTransducerModelConfig transducer;
456 SherpaOnnxOfflineParaformerModelConfig paraformer; 460 SherpaOnnxOfflineParaformerModelConfig paraformer;
@@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { @@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
474 SherpaOnnxOfflineMoonshineModelConfig moonshine; 478 SherpaOnnxOfflineMoonshineModelConfig moonshine;
475 SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr; 479 SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr;
476 SherpaOnnxOfflineDolphinModelConfig dolphin; 480 SherpaOnnxOfflineDolphinModelConfig dolphin;
  481 + SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc;
477 } SherpaOnnxOfflineModelConfig; 482 } SherpaOnnxOfflineModelConfig;
478 483
479 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { 484 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
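For callers driving the C API from a language without packaged bindings, the new struct is straightforward to mirror. A hedged ctypes sketch (field layout taken from the header above; loading `libsherpa-onnx-c-api` and filling in the full `SherpaOnnxOfflineModelConfig` are omitted):

```python
import ctypes

class SherpaOnnxOfflineZipformerCtcModelConfig(ctypes.Structure):
    # Mirrors the C struct above: a single `const char *model` field.
    _fields_ = [("model", ctypes.c_char_p)]

cfg = SherpaOnnxOfflineZipformerCtcModelConfig(
    model=b"./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx",
)
print(cfg.model.decode())
```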
@@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create( @@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create(
252 252
253 c.model_config.dolphin.model = config.model_config.dolphin.model.c_str(); 253 c.model_config.dolphin.model = config.model_config.dolphin.model.c_str();
254 254
  255 + c.model_config.zipformer_ctc.model =
  256 + config.model_config.zipformer_ctc.model.c_str();
  257 +
255 c.lm_config.model = config.lm_config.model.c_str(); 258 c.lm_config.model = config.lm_config.model.c_str();
256 c.lm_config.scale = config.lm_config.scale; 259 c.lm_config.scale = config.lm_config.scale;
257 260
@@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig { @@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig {
241 std::string model; 241 std::string model;
242 }; 242 };
243 243
  244 +struct SHERPA_ONNX_API OfflineZipformerCtcModelConfig {
  245 + std::string model;
  246 +};
  247 +
244 struct SHERPA_ONNX_API OfflineMoonshineModelConfig { 248 struct SHERPA_ONNX_API OfflineMoonshineModelConfig {
245 std::string preprocessor; 249 std::string preprocessor;
246 std::string encoder; 250 std::string encoder;
@@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig { @@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig {
267 OfflineMoonshineModelConfig moonshine; 271 OfflineMoonshineModelConfig moonshine;
268 OfflineFireRedAsrModelConfig fire_red_asr; 272 OfflineFireRedAsrModelConfig fire_red_asr;
269 OfflineDolphinModelConfig dolphin; 273 OfflineDolphinModelConfig dolphin;
  274 + OfflineZipformerCtcModelConfig zipformer_ctc;
270 }; 275 };
271 276
272 struct SHERPA_ONNX_API OfflineLMConfig { 277 struct SHERPA_ONNX_API OfflineLMConfig {
@@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
     const OfflineModelConfig &config) {
   if (!config.dolphin.model.empty()) {
     return std::make_unique<OfflineDolphinModel>(config);
+  } else if (!config.nemo_ctc.model.empty()) {
+    return std::make_unique<OfflineNemoEncDecCtcModel>(config);
+  } else if (!config.tdnn.model.empty()) {
+    return std::make_unique<OfflineTdnnCtcModel>(config);
+  } else if (!config.zipformer_ctc.model.empty()) {
+    return std::make_unique<OfflineZipformerCtcModel>(config);
+  } else if (!config.wenet_ctc.model.empty()) {
+    return std::make_unique<OfflineWenetCtcModel>(config);
+  } else if (!config.telespeech_ctc.empty()) {
+    return std::make_unique<OfflineTeleSpeechCtcModel>(config);
   }

   // TODO(fangjun): Refactor it. We don't need to use model_type here
@@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
     Manager *mgr, const OfflineModelConfig &config) {
   if (!config.dolphin.model.empty()) {
     return std::make_unique<OfflineDolphinModel>(mgr, config);
+  } else if (!config.nemo_ctc.model.empty()) {
+    return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config);
+  } else if (!config.tdnn.model.empty()) {
+    return std::make_unique<OfflineTdnnCtcModel>(mgr, config);
+  } else if (!config.zipformer_ctc.model.empty()) {
+    return std::make_unique<OfflineZipformerCtcModel>(mgr, config);
+  } else if (!config.wenet_ctc.model.empty()) {
+    return std::make_unique<OfflineWenetCtcModel>(mgr, config);
+  } else if (!config.telespeech_ctc.empty()) {
+    return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config);
   }

   // TODO(fangjun): Refactor it. We don't need to use model_type here
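Both `Create` overloads dispatch on the first CTC sub-config whose model path is non-empty, so exactly one should be set at a time. A rough Python illustration of that selection rule (illustrative only, not library code):

```python
# Mirrors the if/else-if chain in OfflineCtcModel::Create above: the first
# non-empty model field decides which backend gets constructed.
CTC_DISPATCH_ORDER = [
    "dolphin", "nemo_ctc", "tdnn", "zipformer_ctc",
    "wenet_ctc", "telespeech_ctc",
]

def pick_ctc_backend(model_paths: dict) -> str:
    for name in CTC_DISPATCH_ORDER:
        if model_paths.get(name):
            return name
    raise ValueError("no offline CTC model configured")

assert pick_ctc_backend({"zipformer_ctc": "model.int8.onnx"}) == "zipformer_ctc"
```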
@@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java
 java_files += OfflineFireRedAsrModelConfig.java
 java_files += OfflineMoonshineModelConfig.java
 java_files += OfflineNemoEncDecCtcModelConfig.java
+java_files += OfflineZipformerCtcModelConfig.java
 java_files += OfflineSenseVoiceModelConfig.java
 java_files += OfflineDolphinModelConfig.java
 java_files += OfflineModelConfig.java
@@ -11,6 +11,7 @@ public class OfflineModelConfig {
     private final OfflineNemoEncDecCtcModelConfig nemo;
     private final OfflineSenseVoiceModelConfig senseVoice;
     private final OfflineDolphinModelConfig dolphin;
+    private final OfflineZipformerCtcModelConfig zipformerCtc;
     private final String teleSpeech;
     private final String tokens;
     private final int numThreads;
@@ -28,6 +29,7 @@ public class OfflineModelConfig {
         this.fireRedAsr = builder.fireRedAsr;
         this.moonshine = builder.moonshine;
         this.nemo = builder.nemo;
+        this.zipformerCtc = builder.zipformerCtc;
         this.senseVoice = builder.senseVoice;
         this.dolphin = builder.dolphin;
         this.teleSpeech = builder.teleSpeech;
@@ -52,7 +54,7 @@ public class OfflineModelConfig {
         return transducer;
     }

-    public OfflineWhisperModelConfig getZipformer2Ctc() {
+    public OfflineWhisperModelConfig getWhisper() {
         return whisper;
     }

@@ -68,6 +70,14 @@ public class OfflineModelConfig {
         return dolphin;
     }

+    public OfflineNemoEncDecCtcModelConfig getNemo() {
+        return nemo;
+    }
+
+    public OfflineZipformerCtcModelConfig getZipformerCtc() {
+        return zipformerCtc;
+    }
+
     public String getTokens() {
         return tokens;
     }
@@ -109,6 +119,7 @@ public class OfflineModelConfig {
         private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
         private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build();
         private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build();
+        private OfflineZipformerCtcModelConfig zipformerCtc = OfflineZipformerCtcModelConfig.builder().build();
         private String teleSpeech = "";
         private String tokens = "";
         private int numThreads = 1;
@@ -142,6 +153,11 @@ public class OfflineModelConfig {
             return this;
         }

+        public Builder setZipformerCtc(OfflineZipformerCtcModelConfig zipformerCtc) {
+            this.zipformerCtc = zipformerCtc;
+            return this;
+        }
+
         public Builder setTeleSpeech(String teleSpeech) {
             this.teleSpeech = teleSpeech;
             return this;
@@ -0,0 +1,32 @@
+// Copyright 2025 Xiaomi Corporation
+
+package com.k2fsa.sherpa.onnx;
+
+public class OfflineZipformerCtcModelConfig {
+    private final String model;
+
+    private OfflineZipformerCtcModelConfig(Builder builder) {
+        this.model = builder.model;
+    }
+
+    public static Builder builder() {
+        return new Builder();
+    }
+
+    public String getModel() {
+        return model;
+    }
+
+    public static class Builder {
+        private String model = "";
+
+        public OfflineZipformerCtcModelConfig build() {
+            return new OfflineZipformerCtcModelConfig(this);
+        }
+
+        public Builder setModel(String model) {
+            this.model = model;
+            return this;
+        }
+    }
+}
@@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
   ans.model_config.nemo_ctc.model = p;
   env->ReleaseStringUTFChars(s, p);

+  // zipformer ctc
+  fid =
+      env->GetFieldID(model_config_cls, "zipformerCtc",
+                      "Lcom/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig;");
+  jobject zipformer_ctc_config = env->GetObjectField(model_config, fid);
+  jclass zipformer_ctc_config_cls = env->GetObjectClass(zipformer_ctc_config);
+
+  fid =
+      env->GetFieldID(zipformer_ctc_config_cls, "model", "Ljava/lang/String;");
+
+  s = (jstring)env->GetObjectField(zipformer_ctc_config, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.model_config.zipformer_ctc.model = p;
+  env->ReleaseStringUTFChars(s, p);
+
   // dolphin
   fid = env->GetFieldID(model_config_cls, "dolphin",
                         "Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;");
@@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig(
     var model: String = "",
 )

+data class OfflineZipformerCtcModelConfig(
+    var model: String = "",
+)
+
 data class OfflineWhisperModelConfig(
     var encoder: String = "",
     var decoder: String = "",
@@ -64,6 +68,7 @@ data class OfflineModelConfig(
     var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
     var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(),
     var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(),
+    var zipformerCtc: OfflineZipformerCtcModelConfig = OfflineZipformerCtcModelConfig(),
     var teleSpeech: String = "",
     var numThreads: Int = 1,
     var debug: Boolean = false,
@@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
                 modelType = "nemo_transducer",
             )
         }
+
+        31 -> {
+            val modelDir = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03"
+            return OfflineModelConfig(
+                zipformerCtc = OfflineZipformerCtcModelConfig(
+                    model = "$modelDir/model.int8.onnx",
+                ),
+                tokens = "$modelDir/tokens.txt",
+            )
+        }
     }
     return null
 }
@@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
                 model = "$modelDir/model.onnx",
             ),
             tokens = "$modelDir/tokens.txt",
+            modelType = "zipformer2",
         )
     }

@@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
                 model = "$modelDir/model.fp16.onnx",
             ),
             tokens = "$modelDir/tokens.txt",
+            modelType = "zipformer2",
         )
     }

@@ -284,6 +284,11 @@ type
     function ToString: AnsiString;
   end;

+  TSherpaOnnxOfflineZipformerCtcModelConfig = record
+    Model: AnsiString;
+    function ToString: AnsiString;
+  end;
+
   TSherpaOnnxOfflineWhisperModelConfig = record
     Encoder: AnsiString;
     Decoder: AnsiString;
@@ -346,6 +351,7 @@ type
     Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
     FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig;
     Dolphin: TSherpaOnnxOfflineDolphinModelConfig;
+    ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig;
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
     function ToString: AnsiString;
   end;
@@ -726,6 +732,9 @@ type
   SherpaOnnxOfflineDolphinModelConfig = record
     Model: PAnsiChar;
   end;
+  SherpaOnnxOfflineZipformerCtcModelConfig = record
+    Model: PAnsiChar;
+  end;
   SherpaOnnxOfflineWhisperModelConfig = record
     Encoder: PAnsiChar;
     Decoder: PAnsiChar;
@@ -773,6 +782,7 @@ type
     Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
     FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig;
     Dolphin: SherpaOnnxOfflineDolphinModelConfig;
+    ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig;
   end;

   SherpaOnnxOfflineRecognizerConfig = record
@@ -1536,6 +1546,12 @@ begin
     [Self.Model]);
 end;

+function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString;
+begin
+  Result := Format('TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)',
+    [Self.Model]);
+end;
+
 function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString;
 begin
   Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' +
@@ -1610,14 +1626,15 @@ begin
     'SenseVoice := %s, ' +
     'Moonshine := %s, ' +
     'FireRedAsr := %s, ' +
-    'Dolphin := %s' +
+    'Dolphin := %s, ' +
+    'ZipformerCtc := %s' +
     ')',
     [Self.Transducer.ToString, Self.Paraformer.ToString,
     Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
     Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
     Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
     Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString,
-    Self.FireRedAsr.ToString, Self.Dolphin.ToString
+    Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString
     ]);
 end;

@@ -1688,6 +1705,7 @@ begin
   C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder);

   C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model);
+  C.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model);

   C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
   C.LMConfig.Scale := Config.LMConfig.Scale;
@@ -528,6 +528,87 @@ class OfflineRecognizer(object):
         return self

     @classmethod
+    def from_zipformer_ctc(
+        cls,
+        model: str,
+        tokens: str,
+        num_threads: int = 1,
+        sample_rate: int = 16000,
+        feature_dim: int = 80,
+        decoding_method: str = "greedy_search",
+        debug: bool = False,
+        provider: str = "cpu",
+        rule_fsts: str = "",
+        rule_fars: str = "",
+        hr_dict_dir: str = "",
+        hr_rule_fsts: str = "",
+        hr_lexicon: str = "",
+    ):
+        """
+        Please refer to
+        `<https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/index.html>`_
+        to download pre-trained models for different languages, e.g., Chinese,
+        English, etc.
+
+        Args:
+          model:
+            Path to ``model.onnx``.
+          tokens:
+            Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
+            columns::
+
+                symbol integer_id
+
+          num_threads:
+            Number of threads for neural network computation.
+          sample_rate:
+            Sample rate of the training data used to train the model.
+          feature_dim:
+            Dimension of the feature used to train the model.
+          decoding_method:
+            Valid values are: greedy_search.
+          debug:
+            True to show debug messages.
+          provider:
+            onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
+          rule_fsts:
+            If not empty, it specifies fsts for inverse text normalization.
+            If there are multiple fsts, they are separated by a comma.
+          rule_fars:
+            If not empty, it specifies fst archives for inverse text
+            normalization. If there are multiple archives, they are separated
+            by a comma.
+        """
+        self = cls.__new__(cls)
+        model_config = OfflineModelConfig(
+            zipformer_ctc=OfflineZipformerCtcModelConfig(model=model),
+            tokens=tokens,
+            num_threads=num_threads,
+            debug=debug,
+            provider=provider,
+        )
+
+        feat_config = FeatureExtractorConfig(
+            sampling_rate=sample_rate,
+            feature_dim=feature_dim,
+        )
+
+        recognizer_config = OfflineRecognizerConfig(
+            feat_config=feat_config,
+            model_config=model_config,
+            decoding_method=decoding_method,
+            rule_fsts=rule_fsts,
+            rule_fars=rule_fars,
+            hr=HomophoneReplacerConfig(
+                dict_dir=hr_dict_dir,
+                lexicon=hr_lexicon,
+                rule_fsts=hr_rule_fsts,
+            ),
+        )
+        self.recognizer = _Recognizer(recognizer_config)
+        self.config = recognizer_config
+        return self
+
+    @classmethod
     def from_nemo_ctc(
         cls,
         model: str,
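A minimal end-to-end sketch using the new classmethod (the stream calls follow the existing `OfflineRecognizer` API; `soundfile` is just one way to load a mono 16 kHz wave, and the file paths come from the model directory used throughout this PR):

```python
import sherpa_onnx
import soundfile as sf  # any reader producing float32 mono samples works

d = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03"
recognizer = sherpa_onnx.OfflineRecognizer.from_zipformer_ctc(
    model=f"{d}/model.int8.onnx",
    tokens=f"{d}/tokens.txt",
)

samples, sample_rate = sf.read(f"{d}/test_wavs/0.wav", dtype="float32")
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples)
recognizer.decode_stream(stream)
print(stream.result.text)
```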
@@ -16,3 +16,6 @@ tts-kokoro-en
 tts-kokoro-zh-en
 speech-enhancement-gtcrn
 decode-file-sense-voice-with-hr
+test-version
+zipformer-ctc-asr
+dolphin-ctc-asr
@@ -346,6 +346,14 @@ func sherpaOnnxOfflineParaformerModelConfig(
   )
 }

+func sherpaOnnxOfflineZipformerCtcModelConfig(
+  model: String = ""
+) -> SherpaOnnxOfflineZipformerCtcModelConfig {
+  return SherpaOnnxOfflineZipformerCtcModelConfig(
+    model: toCPointer(model)
+  )
+}
+
 func sherpaOnnxOfflineNemoEncDecCtcModelConfig(
   model: String = ""
 ) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig {
@@ -449,7 +457,9 @@ func sherpaOnnxOfflineModelConfig(
   senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(),
   moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(),
   fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(),
-  dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig()
+  dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(),
+  zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig =
+    sherpaOnnxOfflineZipformerCtcModelConfig()
 ) -> SherpaOnnxOfflineModelConfig {
   return SherpaOnnxOfflineModelConfig(
     transducer: transducer,
@@ -468,7 +478,8 @@ func sherpaOnnxOfflineModelConfig(
     sense_voice: senseVoice,
     moonshine: moonshine,
     fire_red_asr: fireRedAsr,
-    dolphin: dolphin
+    dolphin: dolphin,
+    zipformer_ctc: zipformerCtc
   )
 }

@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -d ../build-swift-macos ]; then
+  echo "Please run ../build-swift-macos.sh first!"
+  exit 1
+fi
+
+if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then
+  echo "Please download the pre-trained model for testing."
+  echo "You can refer to"
+  echo ""
+  echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese"
+  echo ""
+  echo "for help"
+
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+
+  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
+  ls -lh sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
+fi
+
+if [ ! -e ./zipformer-ctc-asr ]; then
+  # Note: We use -lc++ to link against libc++ instead of libstdc++
+  swiftc \
+    -lc++ \
+    -I ../build-swift-macos/install/include \
+    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
+    ./zipformer-ctc-asr.swift ./SherpaOnnx.swift \
+    -L ../build-swift-macos/install/lib/ \
+    -l sherpa-onnx \
+    -l onnxruntime \
+    -o zipformer-ctc-asr
+
+  strip zipformer-ctc-asr
+else
+  echo "./zipformer-ctc-asr exists - skip building"
+fi
+
+export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
+./zipformer-ctc-asr
@@ -0,0 +1,66 @@
+import AVFoundation
+
+extension AudioBuffer {
+  func array() -> [Float] {
+    return Array(UnsafeBufferPointer(self))
+  }
+}
+
+extension AVAudioPCMBuffer {
+  func array() -> [Float] {
+    return self.audioBufferList.pointee.mBuffers.array()
+  }
+}
+
+func run() {
+  let model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"
+  let tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"
+
+  let zipformerCtc = sherpaOnnxOfflineZipformerCtcModelConfig(
+    model: model
+  )
+
+  let modelConfig = sherpaOnnxOfflineModelConfig(
+    tokens: tokens,
+    debug: 0,
+    zipformerCtc: zipformerCtc
+  )
+
+  let featConfig = sherpaOnnxFeatureConfig(
+    sampleRate: 16000,
+    featureDim: 80
+  )
+  var config = sherpaOnnxOfflineRecognizerConfig(
+    featConfig: featConfig,
+    modelConfig: modelConfig
+  )
+
+  let recognizer = SherpaOnnxOfflineRecognizer(config: &config)
+
+  let filePath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"
+  let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
+  let audioFile = try! AVAudioFile(forReading: fileURL as URL)
+
+  let audioFormat = audioFile.processingFormat
+  assert(audioFormat.channelCount == 1)
+  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
+
+  let audioFrameCount = UInt32(audioFile.length)
+  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
+
+  try! audioFile.read(into: audioFileBuffer!)
+  let array: [Float]! = audioFileBuffer?.array()
+  let result = recognizer.decode(samples: array, sampleRate: Int(audioFormat.sampleRate))
+  print("\nresult is:\n\(result.text)")
+  if result.timestamps.count != 0 {
+    print("\ntimestamps is:\n\(result.timestamps)")
+  }
+
+}
+
+@main
+struct App {
+  static func main() {
+    run()
+  }
+}
@@ -43,6 +43,10 @@ function freeConfig(config, Module) {
     freeConfig(config.dolphin, Module)
   }

+  if ('zipformerCtc' in config) {
+    freeConfig(config.zipformerCtc, Module)
+  }
+
   if ('moonshine' in config) {
     freeConfig(config.moonshine, Module)
   }
@@ -627,6 +631,23 @@ function initSherpaOnnxOfflineDolphinModelConfig(config, Module) {
   }
 }

+function initSherpaOnnxOfflineZipformerCtcModelConfig(config, Module) {
+  const n = Module.lengthBytesUTF8(config.model || '') + 1;
+
+  const buffer = Module._malloc(n);
+
+  const len = 1 * 4;  // 1 pointer
+  const ptr = Module._malloc(len);
+
+  Module.stringToUTF8(config.model || '', buffer, n);
+
+  Module.setValue(ptr, buffer, 'i8*');
+
+  return {
+    buffer: buffer, ptr: ptr, len: len,
+  }
+}
+
 function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
   const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
   const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
@@ -840,6 +861,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
     };
   }

+  if (!('zipformerCtc' in config)) {
+    config.zipformerCtc = {
+      model: '',
+    };
+  }
+
   if (!('whisper' in config)) {
     config.whisper = {
       encoder: '',
@@ -906,9 +933,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   const dolphin =
       initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module);

+  const zipformerCtc =
+      initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module);
+
   const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
       tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
-      dolphin.len;
+      dolphin.len + zipformerCtc.len;

   const ptr = Module._malloc(len);

@@ -1010,11 +1040,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset);
   offset += dolphin.len;

+  Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset);
+  offset += zipformerCtc.len;
+
   return {
     buffer: buffer, ptr: ptr, len: len, transducer: transducer,
     paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
     senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
-    dolphin: dolphin
+    dolphin: dolphin, zipformerCtc: zipformerCtc
   }
 }

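The JS side packs the whole config into one heap allocation whose layout must match the C struct field-for-field; `zipformerCtc` is copied after `dolphin`, exactly as the struct declares them, and the `static_assert`s in the next file guard the total size. A small Python illustration of that offset bookkeeping (abridged field list; wasm32 pointers are 4 bytes):

```python
POINTER = 4  # bytes per char* in the wasm32 ABI

# Abridged field list in struct declaration order (pointer counts come from
# the static_asserts below); zipformer_ctc is the final field, so the JS
# code above copies it last and the running offset lines up with C.
fields = [
    ("transducer", 3),   # encoder, decoder, joiner
    ("paraformer", 1),
    # ... remaining sub-configs and the 8 scalar/string slots elided ...
    ("dolphin", 1),
    ("zipformer_ctc", 1),
]

offset = 0
for name, n_pointers in fields:
    print(f"{name}: offset {offset}, {n_pointers * POINTER} bytes")
    offset += n_pointers * POINTER
```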
@@ -13,6 +13,7 @@ extern "C" {
 static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");

+static_assert(sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
@@ -31,7 +32,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
                   sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
                   sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
                   sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
-                  sizeof(SherpaOnnxOfflineDolphinModelConfig),
+                  sizeof(SherpaOnnxOfflineDolphinModelConfig) +
+                  sizeof(SherpaOnnxOfflineZipformerCtcModelConfig),

               "");
 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
@@ -77,6 +79,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   auto moonshine = &model_config->moonshine;
   auto fire_red_asr = &model_config->fire_red_asr;
   auto dolphin = &model_config->dolphin;
+  auto zipformer_ctc = &model_config->zipformer_ctc;

   fprintf(stdout, "----------offline transducer model config----------\n");
   fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -117,6 +120,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   fprintf(stdout, "----------offline Dolphin model config----------\n");
   fprintf(stdout, "model: %s\n", dolphin->model);

+  fprintf(stdout, "----------offline zipformer ctc model config----------\n");
+  fprintf(stdout, "model: %s\n", zipformer_ctc->model);
+
   fprintf(stdout, "tokens: %s\n", model_config->tokens);
   fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
   fprintf(stdout, "provider: %s\n", model_config->provider);
@@ -117,6 +117,10 @@ function initOfflineRecognizer() {
     };
   } else if (fileExists('dolphin.onnx')) {
     config.modelConfig.dolphin = {model: './dolphin.onnx'};
+  } else if (fileExists('zipformer-ctc.onnx')) {
+    // You need to rename model.int8.onnx from the zipformer CTC model to
+    // zipformer-ctc.onnx
+    config.modelConfig.zipformerCtc = {model: './zipformer-ctc.onnx'};
   } else {
     console.log('Please specify a model.');
     alert('Please specify a model.');
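Note the file-name convention here: the browser demo probes for a file literally named `zipformer-ctc.onnx`, so the released `model.int8.onnx` has to be renamed (or copied) before bundling. A one-line sketch with placeholder paths:

```python
import shutil

# The WASM demo looks for 'zipformer-ctc.onnx' by name; copy the released
# int8 model to that name before serving the page.
shutil.copy(
    "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx",
    "zipformer-ctc.onnx",
)
```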