Fangjun Kuang
Committed by GitHub

Fix C api for Go and MFC to support streaming paraformer (#268)

@@ -178,9 +178,14 @@ jobs: @@ -178,9 +178,14 @@ jobs:
178 178
179 echo "Test transducer" 179 echo "Test transducer"
180 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 180 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
181 - ./run.sh 181 + ./run-transducer.sh
182 rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 182 rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26
183 183
  184 + echo "Test paraformer"
  185 + git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en
  186 + ./run-paraformer.sh
  187 + rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
  188 +
184 - name: Test streaming decoding files (Win64) 189 - name: Test streaming decoding files (Win64)
185 if: matrix.os == 'windows-latest' && matrix.arch == 'x64' 190 if: matrix.os == 'windows-latest' && matrix.arch == 'x64'
186 shell: bash 191 shell: bash
@@ -202,9 +207,14 @@ jobs: @@ -202,9 +207,14 @@ jobs:
202 207
203 echo "Test transducer" 208 echo "Test transducer"
204 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 209 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
205 - ./run.sh 210 + ./run-transducer.sh
206 rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 211 rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26
207 212
  213 + echo "Test paraformer"
  214 + git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en
  215 + ./run-paraformer.sh
  216 + rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
  217 +
208 - name: Test streaming decoding files (Win32) 218 - name: Test streaming decoding files (Win32)
209 if: matrix.os == 'windows-latest' && matrix.arch == 'x86' 219 if: matrix.os == 'windows-latest' && matrix.arch == 'x86'
210 shell: bash 220 shell: bash
@@ -235,5 +245,10 @@ jobs: @@ -235,5 +245,10 @@ jobs:
235 245
236 echo "Test transducer" 246 echo "Test transducer"
237 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 247 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
238 - ./run.sh 248 + ./run-transducer.sh
239 rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 249 rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26
  250 +
  251 + echo "Test paraformer"
  252 + git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en
  253 + ./run-paraformer.sh
  254 + rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.7.4") 4 +set(SHERPA_ONNX_VERSION "1.7.5")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -33,9 +33,11 @@ func main() { @@ -33,9 +33,11 @@ func main() {
33 config := sherpa.OnlineRecognizerConfig{} 33 config := sherpa.OnlineRecognizerConfig{}
34 config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} 34 config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
35 35
36 - flag.StringVar(&config.ModelConfig.Encoder, "encoder", "", "Path to the encoder model")  
37 - flag.StringVar(&config.ModelConfig.Decoder, "decoder", "", "Path to the decoder model")  
38 - flag.StringVar(&config.ModelConfig.Joiner, "joiner", "", "Path to the joiner model") 36 + flag.StringVar(&config.ModelConfig.Transducer.Encoder, "encoder", "", "Path to the transducer encoder model")
  37 + flag.StringVar(&config.ModelConfig.Transducer.Decoder, "decoder", "", "Path to the transducer decoder model")
  38 + flag.StringVar(&config.ModelConfig.Transducer.Joiner, "joiner", "", "Path to the transducer joiner model")
  39 + flag.StringVar(&config.ModelConfig.Paraformer.Encoder, "paraformer-encoder", "", "Path to the paraformer encoder model")
  40 + flag.StringVar(&config.ModelConfig.Paraformer.Decoder, "paraformer-decoder", "", "Path to the paraformer decoder model")
39 flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file") 41 flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file")
40 flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing") 42 flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing")
41 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") 43 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message")
@@ -17,9 +17,11 @@ func main() { @@ -17,9 +17,11 @@ func main() {
17 config := sherpa.OnlineRecognizerConfig{} 17 config := sherpa.OnlineRecognizerConfig{}
18 config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} 18 config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}
19 19
20 - flag.StringVar(&config.ModelConfig.Encoder, "encoder", "", "Path to the encoder model")  
21 - flag.StringVar(&config.ModelConfig.Decoder, "decoder", "", "Path to the decoder model")  
22 - flag.StringVar(&config.ModelConfig.Joiner, "joiner", "", "Path to the joiner model") 20 + flag.StringVar(&config.ModelConfig.Transducer.Encoder, "encoder", "", "Path to the transducer encoder model")
  21 + flag.StringVar(&config.ModelConfig.Transducer.Decoder, "decoder", "", "Path to the transducer decoder model")
  22 + flag.StringVar(&config.ModelConfig.Transducer.Joiner, "joiner", "", "Path to the transducer joiner model")
  23 + flag.StringVar(&config.ModelConfig.Paraformer.Encoder, "paraformer-encoder", "", "Path to the paraformer encoder model")
  24 + flag.StringVar(&config.ModelConfig.Paraformer.Decoder, "paraformer-decoder", "", "Path to the paraformer decoder model")
23 flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file") 25 flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file")
24 flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing") 26 flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing")
25 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") 27 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message")
  1 +#!/usr/bin/env bash
  2 +
  3 +# Please refer to
  4 +# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english
  5 +# to download the model files
  6 +
  7 +if [ ! -d ./sherpa-onnx-streaming-paraformer-bilingual-zh-en ]; then
  8 + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en
  9 + cd sherpa-onnx-streaming-paraformer-bilingual-zh-en
  10 + git lfs pull --include "*.onnx"
  11 + cd ..
  12 +fi
  13 +
  14 +./streaming-decode-files \
  15 + --paraformer-encoder ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx \
  16 + --paraformer-decoder ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx \
  17 + --tokens ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \
  18 + --decoding-method greedy_search \
  19 + --model-type paraformer \
  20 + --debug 0 \
  21 + ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav
@@ -306,12 +306,10 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { @@ -306,12 +306,10 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() {
306 "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html " 306 "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html "
307 "\r\n"; 307 "\r\n";
308 msg += "to download a non-streaming model, i.e., an offline model.\r\n"; 308 msg += "to download a non-streaming model, i.e., an offline model.\r\n";
  309 + msg += "You need to rename them after downloading\r\n\r\n";
  310 + msg += "It supports transducer, paraformer, and whisper models.\r\n\r\n";
309 msg += 311 msg +=
310 - "You need to rename them to encoder.onnx, decoder.onnx, and "  
311 - "joiner.onnx correspoondingly.\r\n\r\n";  
312 - msg += "It supports both transducer models and paraformer models.\r\n\r\n";  
313 - msg +=  
314 - "We give two examples below to show you how to download models\r\n\r\n"; 312 + "We give three examples below to show you how to download models\r\n\r\n";
315 msg += "(1) Transducer\r\n\r\n"; 313 msg += "(1) Transducer\r\n\r\n";
316 msg += 314 msg +=
317 "We use " 315 "We use "
@@ -346,13 +344,82 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { @@ -346,13 +344,82 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() {
346 "https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28/" 344 "https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28/"
347 "resolve/main/tokens.txt\r\n\r\n"; 345 "resolve/main/tokens.txt\r\n\r\n";
348 msg += "\r\n Now rename them\r\n"; 346 msg += "\r\n Now rename them\r\n";
349 - msg += "mv model.onnx paraformer.onnx\r\n"; 347 + msg += "mv model.onnx paraformer.onnx\r\n\r\n";
  348 + msg += "(3) Whisper\r\n\r\n";
  349 + msg +=
  350 + "wget "
  351 + "https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/"
  352 + "main/tiny.en-encoder.onnx\r\n";
  353 + msg +=
  354 + "wget "
  355 + "https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/"
  356 + "main/tiny.en-decoder.onnx\r\n";
  357 + msg +=
  358 + "wget "
  359 + "https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/"
  360 + "main/tiny.en-tokens.txt\r\n";
  361 + msg += "\r\n Now rename them\r\n";
  362 + msg += "mv tiny.en-encoder.onnx whisper-encoder.onnx\r\n";
  363 + msg += "mv tiny.en-decoder.onnx whisper-decoder.onnx\r\n";
350 msg += "\r\n"; 364 msg += "\r\n";
351 msg += "That's it!\r\n"; 365 msg += "That's it!\r\n";
352 366
353 AppendLineToMultilineEditCtrl(msg); 367 AppendLineToMultilineEditCtrl(msg);
354 } 368 }
355 369
  370 +void CNonStreamingSpeechRecognitionDlg::InitWhisper() {
  371 + std::string whisper_encoder = "./whisper-encoder.onnx";
  372 + std::string whisper_decoder = "./whisper-decoder.onnx";
  373 +
  374 + std::string tokens = "./tokens.txt";
  375 +
  376 + bool is_ok = true;
  377 +
  378 + if (Exists("./whisper-encoder.int8.onnx")) {
  379 + whisper_encoder = "./whisper-encoder.int8.onnx";
  380 + } else if (!Exists(whisper_encoder)) {
  381 + std::string msg = whisper_encoder + " does not exist!";
  382 + AppendLineToMultilineEditCtrl(msg);
  383 + is_ok = false;
  384 + }
  385 +
  386 + if (Exists("./whisper-decoder.int8.onnx")) {
  387 + whisper_decoder = "./whisper-decoder.int8.onnx";
  388 + } else if (!Exists(whisper_decoder)) {
  389 + std::string msg = whisper_decoder + " does not exist!";
  390 + AppendLineToMultilineEditCtrl(msg);
  391 + is_ok = false;
  392 + }
  393 +
  394 + if (!Exists(tokens)) {
  395 + std::string msg = tokens + " does not exist!";
  396 + AppendLineToMultilineEditCtrl(msg);
  397 + is_ok = false;
  398 + }
  399 +
  400 + if (!is_ok) {
  401 + ShowInitRecognizerHelpMessage();
  402 + return;
  403 + }
  404 +
  405 + memset(&config_, 0, sizeof(config_));
  406 +
  407 + config_.feat_config.sample_rate = 16000;
  408 + config_.feat_config.feature_dim = 80;
  409 +
  410 + config_.model_config.whisper.encoder = whisper_encoder.c_str();
  411 + config_.model_config.whisper.decoder = whisper_decoder.c_str();
  412 + config_.model_config.tokens = tokens.c_str();
  413 + config_.model_config.num_threads = 1;
  414 + config_.model_config.debug = 1;
  415 + config_.model_config.model_type = "whisper";
  416 +
  417 + config_.decoding_method = "greedy_search";
  418 + config_.max_active_paths = 4;
  419 +
  420 + recognizer_ = CreateOfflineRecognizer(&config_);
  421 +}
  422 +
356 void CNonStreamingSpeechRecognitionDlg::InitParaformer() { 423 void CNonStreamingSpeechRecognitionDlg::InitParaformer() {
357 std::string paraformer = "./paraformer.onnx"; 424 std::string paraformer = "./paraformer.onnx";
358 std::string tokens = "./tokens.txt"; 425 std::string tokens = "./tokens.txt";
@@ -401,6 +468,11 @@ void CNonStreamingSpeechRecognitionDlg::InitRecognizer() { @@ -401,6 +468,11 @@ void CNonStreamingSpeechRecognitionDlg::InitRecognizer() {
401 return; 468 return;
402 } 469 }
403 470
  471 + if (Exists("./whisper-encoder.onnx") || Exists("./whisper-encoder.int8.onnx")) {
  472 + InitWhisper();
  473 + return;
  474 + }
  475 +
404 // assume it is transducer 476 // assume it is transducer
405 477
406 std::string encoder = "./encoder.onnx"; 478 std::string encoder = "./encoder.onnx";
@@ -69,5 +69,6 @@ class CNonStreamingSpeechRecognitionDlg : public CDialogEx { @@ -69,5 +69,6 @@ class CNonStreamingSpeechRecognitionDlg : public CDialogEx {
69 void InitRecognizer(); 69 void InitRecognizer();
70 70
71 void InitParaformer(); 71 void InitParaformer();
  72 + void InitWhisper();
72 void ShowInitRecognizerHelpMessage(); 73 void ShowInitRecognizerHelpMessage();
73 }; 74 };
@@ -234,50 +234,18 @@ bool CStreamingSpeechRecognitionDlg::Exists(const std::string &filename) { @@ -234,50 +234,18 @@ bool CStreamingSpeechRecognitionDlg::Exists(const std::string &filename) {
234 return is.good(); 234 return is.good();
235 } 235 }
236 236
237 -void CStreamingSpeechRecognitionDlg::InitRecognizer() {  
238 - std::string encoder = "./encoder.onnx";  
239 - std::string decoder = "./decoder.onnx";  
240 - std::string joiner = "./joiner.onnx";  
241 - std::string tokens = "./tokens.txt";  
242 -  
243 - bool is_ok = true;  
244 - if (!Exists(encoder)) {  
245 - std::string msg = encoder + " does not exist!";  
246 - AppendLineToMultilineEditCtrl(msg);  
247 - is_ok = false;  
248 - }  
249 -  
250 - if (!Exists(decoder)) {  
251 - std::string msg = decoder + " does not exist!";  
252 - AppendLineToMultilineEditCtrl(msg);  
253 - is_ok = false;  
254 - }  
255 -  
256 - if (!Exists(joiner)) {  
257 - std::string msg = joiner + " does not exist!";  
258 - AppendLineToMultilineEditCtrl(msg);  
259 - is_ok = false;  
260 - }  
261 -  
262 - if (!Exists(tokens)) {  
263 - std::string msg = tokens + " does not exist!";  
264 - AppendLineToMultilineEditCtrl(msg);  
265 - is_ok = false;  
266 - }  
267 -  
268 - if (!is_ok) { 237 +void CStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() {
269 my_btn_.EnableWindow(FALSE); 238 my_btn_.EnableWindow(FALSE);
270 std::string msg = 239 std::string msg =
271 "\r\nPlease go to\r\n" 240 "\r\nPlease go to\r\n"
272 "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html " 241 "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html "
273 "\r\n"; 242 "\r\n";
274 msg += "to download a streaming model, i.e., an online model.\r\n"; 243 msg += "to download a streaming model, i.e., an online model.\r\n";
  244 + msg += "You need to rename them after downloading\r\n\r\n";
  245 + msg += "It supports both transducer and paraformer models.\r\n\r\n";
275 msg += 246 msg +=
276 - "You need to rename them to encoder.onnx, decoder.onnx, and "  
277 - "joiner.onnx correspoondingly.\r\n\r\n";  
278 - msg +=  
279 - "We use the following model as an example to show you how to do "  
280 - "that.\r\n"; 247 + "We give two examples below to show you how to download models\r\n\r\n";
  248 + msg += "(1) Transducer\r\n\r\n";
281 msg += 249 msg +=
282 "https://huggingface.co/pkufool/" 250 "https://huggingface.co/pkufool/"
283 "icefall-asr-zipformer-streaming-wenetspeech-20230615"; 251 "icefall-asr-zipformer-streaming-wenetspeech-20230615";
@@ -308,13 +276,132 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() { @@ -308,13 +276,132 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() {
308 msg += "mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx\r\n"; 276 msg += "mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx\r\n";
309 msg += "mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx\r\n"; 277 msg += "mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx\r\n";
310 msg += "\r\n"; 278 msg += "\r\n";
  279 + msg += "(2) Paraformer\r\n\r\n";
  280 + msg +=
  281 + "wget "
  282 + "https://huggingface.co/csukuangfj/"
  283 + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/"
  284 + "encoder.int8.onnx\r\n";
  285 + msg +=
  286 + "wget "
  287 + "https://huggingface.co/csukuangfj/"
  288 + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/"
  289 + "decoder.int8.onnx\r\n";
  290 + msg +=
  291 + "wget "
  292 + "https://huggingface.co/csukuangfj/"
  293 + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/"
  294 + "tokens.txt\r\n";
  295 + msg += "\r\nNow rename them.\r\n";
  296 + msg += "mv encoder.int8.onnx paraformer-encoder.onnx\r\n";
  297 + msg += "mv decoder.int8.onnx paraformer-decoder.onnx\r\n\r\n";
311 msg += "That's it!\r\n"; 298 msg += "That's it!\r\n";
312 299
313 AppendLineToMultilineEditCtrl(msg); 300 AppendLineToMultilineEditCtrl(msg);
  301 +}
  302 +
  303 +void CStreamingSpeechRecognitionDlg::InitParaformer() {
  304 + std::string paraformer_encoder = "./paraformer-encoder.onnx";
  305 + std::string paraformer_decoder = "./paraformer-decoder.onnx";
  306 +
  307 + std::string tokens = "./tokens.txt";
  308 +
  309 + bool is_ok = true;
  310 +
  311 + if (Exists("./paraformer-encoder.int8.onnx")) {
  312 + paraformer_encoder = "./paraformer-encoder.int8.onnx";
  313 + } else if (!Exists(paraformer_encoder)) {
  314 + std::string msg = paraformer_encoder + " does not exist!";
  315 + AppendLineToMultilineEditCtrl(msg);
  316 + is_ok = false;
  317 + }
  318 +
  319 + if (Exists("./paraformer-decoder.int8.onnx")) {
  320 + paraformer_decoder = "./paraformer-decoder.int8.onnx";
  321 + } else if (!Exists(paraformer_decoder)) {
  322 + std::string msg = paraformer_decoder + " does not exist!";
  323 + AppendLineToMultilineEditCtrl(msg);
  324 + is_ok = false;
  325 + }
  326 +
  327 + if (!Exists(tokens)) {
  328 + std::string msg = tokens + " does not exist!";
  329 + AppendLineToMultilineEditCtrl(msg);
  330 + is_ok = false;
  331 + }
  332 +
  333 + if (!is_ok) {
  334 + ShowInitRecognizerHelpMessage();
  335 + return;
  336 + }
  337 +
  338 + SherpaOnnxOnlineRecognizerConfig config;
  339 + memset(&config, 0, sizeof(config));
  340 + config.model_config.debug = 0;
  341 + config.model_config.num_threads = 1;
  342 + config.model_config.provider = "cpu";
  343 +
  344 + config.decoding_method = "greedy_search";
  345 + config.max_active_paths = 4;
  346 +
  347 + config.feat_config.sample_rate = 16000;
  348 + config.feat_config.feature_dim = 80;
  349 +
  350 + config.enable_endpoint = 1;
  351 + config.rule1_min_trailing_silence = 1.2f;
  352 + config.rule2_min_trailing_silence = 0.8f;
  353 + config.rule3_min_utterance_length = 300.0f;
  354 +
  355 + config.model_config.tokens = tokens.c_str();
  356 + config.model_config.paraformer.encoder = paraformer_encoder.c_str();
  357 + config.model_config.paraformer.decoder = paraformer_decoder.c_str();
  358 +
  359 + recognizer_ = CreateOnlineRecognizer(&config);
  360 +}
  361 +
  362 +void CStreamingSpeechRecognitionDlg::InitRecognizer() {
  363 + if (Exists("./paraformer-encoder.onnx") || Exists("./paraformer-encoder.int8.onnx")) {
  364 + InitParaformer();
  365 + return;
  366 + }
  367 +
  368 + std::string encoder = "./encoder.onnx";
  369 + std::string decoder = "./decoder.onnx";
  370 + std::string joiner = "./joiner.onnx";
  371 + std::string tokens = "./tokens.txt";
  372 +
  373 + bool is_ok = true;
  374 + if (!Exists(encoder)) {
  375 + std::string msg = encoder + " does not exist!";
  376 + AppendLineToMultilineEditCtrl(msg);
  377 + is_ok = false;
  378 + }
  379 +
  380 + if (!Exists(decoder)) {
  381 + std::string msg = decoder + " does not exist!";
  382 + AppendLineToMultilineEditCtrl(msg);
  383 + is_ok = false;
  384 + }
  385 +
  386 + if (!Exists(joiner)) {
  387 + std::string msg = joiner + " does not exist!";
  388 + AppendLineToMultilineEditCtrl(msg);
  389 + is_ok = false;
  390 + }
  391 +
  392 + if (!Exists(tokens)) {
  393 + std::string msg = tokens + " does not exist!";
  394 + AppendLineToMultilineEditCtrl(msg);
  395 + is_ok = false;
  396 + }
  397 +
  398 + if (!is_ok) {
  399 + ShowInitRecognizerHelpMessage();
314 return; 400 return;
315 } 401 }
316 402
317 SherpaOnnxOnlineRecognizerConfig config; 403 SherpaOnnxOnlineRecognizerConfig config;
  404 + memset(&config, 0, sizeof(config));
318 config.model_config.debug = 0; 405 config.model_config.debug = 0;
319 config.model_config.num_threads = 1; 406 config.model_config.num_threads = 1;
320 config.model_config.provider = "cpu"; 407 config.model_config.provider = "cpu";
@@ -331,9 +418,9 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() { @@ -331,9 +418,9 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() {
331 config.rule3_min_utterance_length = 300.0f; 418 config.rule3_min_utterance_length = 300.0f;
332 419
333 config.model_config.tokens = tokens.c_str(); 420 config.model_config.tokens = tokens.c_str();
334 - config.model_config.encoder = encoder.c_str();  
335 - config.model_config.decoder = decoder.c_str();  
336 - config.model_config.joiner = joiner.c_str(); 421 + config.model_config.transducer.encoder = encoder.c_str();
  422 + config.model_config.transducer.decoder = decoder.c_str();
  423 + config.model_config.transducer.joiner = joiner.c_str();
337 424
338 recognizer_ = CreateOnlineRecognizer(&config); 425 recognizer_ = CreateOnlineRecognizer(&config);
339 } 426 }
@@ -67,6 +67,8 @@ class CStreamingSpeechRecognitionDlg : public CDialogEx { @@ -67,6 +67,8 @@ class CStreamingSpeechRecognitionDlg : public CDialogEx {
67 67
68 bool Exists(const std::string &filename); 68 bool Exists(const std::string &filename);
69 void InitRecognizer(); 69 void InitRecognizer();
  70 + void InitParaformer();
  71 + void ShowInitRecognizerHelpMessage();
70 }; 72 };
71 73
72 class RecognizerThread : public CWinThread { 74 class RecognizerThread : public CWinThread {
@@ -45,9 +45,30 @@ import "unsafe" @@ -45,9 +45,30 @@ import "unsafe"
45 // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html 45 // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
46 // to download pre-trained models 46 // to download pre-trained models
47 type OnlineTransducerModelConfig struct { 47 type OnlineTransducerModelConfig struct {
48 - Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx  
49 - Decoder string // Path to the decoder model.  
50 - Joiner string // Path to the joiner model. 48 + Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
  49 + Decoder string // Path to the decoder model.
  50 + Joiner string // Path to the joiner model.
  51 +}
  52 +
  53 +// Configuration for online/streaming paraformer models
  54 +//
  55 +// Please refer to
  56 +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html
  57 +// to download pre-trained models
  58 +type OnlineParaformerModelConfig struct {
  59 + Encoder string // Path to the encoder model, e.g., encoder.onnx or encoder.int8.onnx
  60 + Decoder string // Path to the decoder model.
  61 +}
  62 +
  63 +// Configuration for online/streaming models
  64 +//
  65 +// Please refer to
  66 +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
  67 +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html
  68 +// to download pre-trained models
  69 +type OnlineModelConfig struct {
  70 + Transducer OnlineTransducerModelConfig
  71 + Paraformer OnlineParaformerModelConfig
51 Tokens string // Path to tokens.txt 72 Tokens string // Path to tokens.txt
52 NumThreads int // Number of threads to use for neural network computation 73 NumThreads int // Number of threads to use for neural network computation
53 Provider string // Optional. Valid values are: cpu, cuda, coreml 74 Provider string // Optional. Valid values are: cpu, cuda, coreml
@@ -68,7 +89,7 @@ type FeatureConfig struct { @@ -68,7 +89,7 @@ type FeatureConfig struct {
68 // Configuration for the online/streaming recognizer. 89 // Configuration for the online/streaming recognizer.
69 type OnlineRecognizerConfig struct { 90 type OnlineRecognizerConfig struct {
70 FeatConfig FeatureConfig 91 FeatConfig FeatureConfig
71 - ModelConfig OnlineTransducerModelConfig 92 + ModelConfig OnlineModelConfig
72 93
73 // Valid decoding methods: greedy_search, modified_beam_search 94 // Valid decoding methods: greedy_search, modified_beam_search
74 DecodingMethod string 95 DecodingMethod string
@@ -116,14 +137,20 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer { @@ -116,14 +137,20 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer {
116 c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate) 137 c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate)
117 c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim) 138 c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim)
118 139
119 - c.model_config.encoder = C.CString(config.ModelConfig.Encoder)  
120 - defer C.free(unsafe.Pointer(c.model_config.encoder)) 140 + c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder)
  141 + defer C.free(unsafe.Pointer(c.model_config.transducer.encoder))
  142 +
  143 + c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder)
  144 + defer C.free(unsafe.Pointer(c.model_config.transducer.decoder))
  145 +
  146 + c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner)
  147 + defer C.free(unsafe.Pointer(c.model_config.transducer.joiner))
121 148
122 - c.model_config.decoder = C.CString(config.ModelConfig.Decoder)  
123 - defer C.free(unsafe.Pointer(c.model_config.decoder)) 149 + c.model_config.paraformer.encoder = C.CString(config.ModelConfig.Paraformer.Encoder)
  150 + defer C.free(unsafe.Pointer(c.model_config.paraformer.encoder))
124 151
125 - c.model_config.joiner = C.CString(config.ModelConfig.Joiner)  
126 - defer C.free(unsafe.Pointer(c.model_config.joiner)) 152 + c.model_config.paraformer.decoder = C.CString(config.ModelConfig.Paraformer.Decoder)
  153 + defer C.free(unsafe.Pointer(c.model_config.paraformer.decoder))
127 154
128 c.model_config.tokens = C.CString(config.ModelConfig.Tokens) 155 c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
129 defer C.free(unsafe.Pointer(c.model_config.tokens)) 156 defer C.free(unsafe.Pointer(c.model_config.tokens))
@@ -265,6 +265,12 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( @@ -265,6 +265,12 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
265 recognizer_config.model_config.nemo_ctc.model = 265 recognizer_config.model_config.nemo_ctc.model =
266 SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, ""); 266 SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, "");
267 267
  268 + recognizer_config.model_config.whisper.encoder =
  269 + SHERPA_ONNX_OR(config->model_config.whisper.encoder, "");
  270 +
  271 + recognizer_config.model_config.whisper.decoder =
  272 + SHERPA_ONNX_OR(config->model_config.whisper.decoder, "");
  273 +
268 recognizer_config.model_config.tokens = 274 recognizer_config.model_config.tokens =
269 SHERPA_ONNX_OR(config->model_config.tokens, ""); 275 SHERPA_ONNX_OR(config->model_config.tokens, "");
270 recognizer_config.model_config.num_threads = 276 recognizer_config.model_config.num_threads =
@@ -300,6 +300,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineNemoEncDecCtcModelConfig { @@ -300,6 +300,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineNemoEncDecCtcModelConfig {
300 const char *model; 300 const char *model;
301 } SherpaOnnxOfflineNemoEncDecCtcModelConfig; 301 } SherpaOnnxOfflineNemoEncDecCtcModelConfig;
302 302
  303 +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineWhisperModelConfig {
  304 + const char *encoder;
  305 + const char *decoder;
  306 +} SherpaOnnxOfflineWhisperModelConfig;
  307 +
303 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineLMConfig { 308 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineLMConfig {
304 const char *model; 309 const char *model;
305 float scale; 310 float scale;
@@ -309,6 +314,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { @@ -309,6 +314,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
309 SherpaOnnxOfflineTransducerModelConfig transducer; 314 SherpaOnnxOfflineTransducerModelConfig transducer;
310 SherpaOnnxOfflineParaformerModelConfig paraformer; 315 SherpaOnnxOfflineParaformerModelConfig paraformer;
311 SherpaOnnxOfflineNemoEncDecCtcModelConfig nemo_ctc; 316 SherpaOnnxOfflineNemoEncDecCtcModelConfig nemo_ctc;
  317 + SherpaOnnxOfflineWhisperModelConfig whisper;
312 318
313 const char *tokens; 319 const char *tokens;
314 int32_t num_threads; 320 int32_t num_threads;