Committed by
GitHub
Fix C api for Go and MFC to support streaming paraformer (#268)
正在显示 13 个修改的文件，包含 307 行增加和 66 行删除
| @@ -178,9 +178,14 @@ jobs: | @@ -178,9 +178,14 @@ jobs: | ||
| 178 | 178 | ||
| 179 | echo "Test transducer" | 179 | echo "Test transducer" |
| 180 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 | 180 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 |
| 181 | - ./run.sh | 181 | + ./run-transducer.sh |
| 182 | rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 | 182 | rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 |
| 183 | 183 | ||
| 184 | + echo "Test paraformer" | ||
| 185 | + git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en | ||
| 186 | + ./run-paraformer.sh | ||
| 187 | + rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en | ||
| 188 | + | ||
| 184 | - name: Test streaming decoding files (Win64) | 189 | - name: Test streaming decoding files (Win64) |
| 185 | if: matrix.os == 'windows-latest' && matrix.arch == 'x64' | 190 | if: matrix.os == 'windows-latest' && matrix.arch == 'x64' |
| 186 | shell: bash | 191 | shell: bash |
| @@ -202,9 +207,14 @@ jobs: | @@ -202,9 +207,14 @@ jobs: | ||
| 202 | 207 | ||
| 203 | echo "Test transducer" | 208 | echo "Test transducer" |
| 204 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 | 209 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 |
| 205 | - ./run.sh | 210 | + ./run-transducer.sh |
| 206 | rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 | 211 | rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 |
| 207 | 212 | ||
| 213 | + echo "Test paraformer" | ||
| 214 | + git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en | ||
| 215 | + ./run-paraformer.sh | ||
| 216 | + rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en | ||
| 217 | + | ||
| 208 | - name: Test streaming decoding files (Win32) | 218 | - name: Test streaming decoding files (Win32) |
| 209 | if: matrix.os == 'windows-latest' && matrix.arch == 'x86' | 219 | if: matrix.os == 'windows-latest' && matrix.arch == 'x86' |
| 210 | shell: bash | 220 | shell: bash |
| @@ -235,5 +245,10 @@ jobs: | @@ -235,5 +245,10 @@ jobs: | ||
| 235 | 245 | ||
| 236 | echo "Test transducer" | 246 | echo "Test transducer" |
| 237 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 | 247 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26 |
| 238 | - ./run.sh | 248 | + ./run-transducer.sh |
| 239 | rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 | 249 | rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 |
| 250 | + | ||
| 251 | + echo "Test paraformer" | ||
| 252 | + git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en | ||
| 253 | + ./run-paraformer.sh | ||
| 254 | + rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en |
| @@ -33,9 +33,11 @@ func main() { | @@ -33,9 +33,11 @@ func main() { | ||
| 33 | config := sherpa.OnlineRecognizerConfig{} | 33 | config := sherpa.OnlineRecognizerConfig{} |
| 34 | config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} | 34 | config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} |
| 35 | 35 | ||
| 36 | - flag.StringVar(&config.ModelConfig.Encoder, "encoder", "", "Path to the encoder model") | ||
| 37 | - flag.StringVar(&config.ModelConfig.Decoder, "decoder", "", "Path to the decoder model") | ||
| 38 | - flag.StringVar(&config.ModelConfig.Joiner, "joiner", "", "Path to the joiner model") | 36 | + flag.StringVar(&config.ModelConfig.Transducer.Encoder, "encoder", "", "Path to the transducer encoder model") |
| 37 | + flag.StringVar(&config.ModelConfig.Transducer.Decoder, "decoder", "", "Path to the transducer decoder model") | ||
| 38 | + flag.StringVar(&config.ModelConfig.Transducer.Joiner, "joiner", "", "Path to the transducer joiner model") | ||
| 39 | + flag.StringVar(&config.ModelConfig.Paraformer.Encoder, "paraformer-encoder", "", "Path to the paraformer encoder model") | ||
| 40 | + flag.StringVar(&config.ModelConfig.Paraformer.Decoder, "paraformer-decoder", "", "Path to the paraformer decoder model") | ||
| 39 | flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file") | 41 | flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file") |
| 40 | flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing") | 42 | flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing") |
| 41 | flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") | 43 | flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") |
| @@ -17,9 +17,11 @@ func main() { | @@ -17,9 +17,11 @@ func main() { | ||
| 17 | config := sherpa.OnlineRecognizerConfig{} | 17 | config := sherpa.OnlineRecognizerConfig{} |
| 18 | config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} | 18 | config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80} |
| 19 | 19 | ||
| 20 | - flag.StringVar(&config.ModelConfig.Encoder, "encoder", "", "Path to the encoder model") | ||
| 21 | - flag.StringVar(&config.ModelConfig.Decoder, "decoder", "", "Path to the decoder model") | ||
| 22 | - flag.StringVar(&config.ModelConfig.Joiner, "joiner", "", "Path to the joiner model") | 20 | + flag.StringVar(&config.ModelConfig.Transducer.Encoder, "encoder", "", "Path to the transducer encoder model") |
| 21 | + flag.StringVar(&config.ModelConfig.Transducer.Decoder, "decoder", "", "Path to the transducer decoder model") | ||
| 22 | + flag.StringVar(&config.ModelConfig.Transducer.Joiner, "joiner", "", "Path to the transducer joiner model") | ||
| 23 | + flag.StringVar(&config.ModelConfig.Paraformer.Encoder, "paraformer-encoder", "", "Path to the paraformer encoder model") | ||
| 24 | + flag.StringVar(&config.ModelConfig.Paraformer.Decoder, "paraformer-decoder", "", "Path to the paraformer decoder model") | ||
| 23 | flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file") | 25 | flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file") |
| 24 | flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing") | 26 | flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing") |
| 25 | flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") | 27 | flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") |
#!/usr/bin/env bash
#
# Decode a test wave file with a streaming paraformer model by running
# ./streaming-decode-files from the current directory.
#
# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english
# to download the model files

# Stop at the first failing command. Without this, a failed clone or a failed
# "git lfs pull" is ignored and decoding is attempted with missing model files.
set -e

model_dir=./sherpa-onnx-streaming-paraformer-bilingual-zh-en

# Download the model only when it is not already present.
if [ ! -d "$model_dir" ]; then
  # Clone without the LFS payloads first, then fetch only the ONNX models.
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en

  # Run in a subshell so that the working directory of this script never
  # changes, even if the pull fails.
  (cd "$model_dir" && git lfs pull --include "*.onnx")
fi

./streaming-decode-files \
  --paraformer-encoder "$model_dir/encoder.int8.onnx" \
  --paraformer-decoder "$model_dir/decoder.int8.onnx" \
  --tokens "$model_dir/tokens.txt" \
  --decoding-method greedy_search \
  --model-type paraformer \
  --debug 0 \
  "$model_dir/test_wavs/0.wav"
| @@ -306,12 +306,10 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { | @@ -306,12 +306,10 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { | ||
| 306 | "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html " | 306 | "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html " |
| 307 | "\r\n"; | 307 | "\r\n"; |
| 308 | msg += "to download a non-streaming model, i.e., an offline model.\r\n"; | 308 | msg += "to download a non-streaming model, i.e., an offline model.\r\n"; |
| 309 | + msg += "You need to rename them after downloading\r\n\r\n"; | ||
| 310 | + msg += "It supports transducer, paraformer, and whisper models.\r\n\r\n"; | ||
| 309 | msg += | 311 | msg += |
| 310 | - "You need to rename them to encoder.onnx, decoder.onnx, and " | ||
| 311 | - "joiner.onnx correspoondingly.\r\n\r\n"; | ||
| 312 | - msg += "It supports both transducer models and paraformer models.\r\n\r\n"; | ||
| 313 | - msg += | ||
| 314 | - "We give two examples below to show you how to download models\r\n\r\n"; | 312 | + "We give three examples below to show you how to download models\r\n\r\n"; |
| 315 | msg += "(1) Transducer\r\n\r\n"; | 313 | msg += "(1) Transducer\r\n\r\n"; |
| 316 | msg += | 314 | msg += |
| 317 | "We use " | 315 | "We use " |
| @@ -346,13 +344,82 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { | @@ -346,13 +344,82 @@ void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { | ||
| 346 | "https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28/" | 344 | "https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28/" |
| 347 | "resolve/main/tokens.txt\r\n\r\n"; | 345 | "resolve/main/tokens.txt\r\n\r\n"; |
| 348 | msg += "\r\n Now rename them\r\n"; | 346 | msg += "\r\n Now rename them\r\n"; |
| 349 | - msg += "mv model.onnx paraformer.onnx\r\n"; | 347 | + msg += "mv model.onnx paraformer.onnx\r\n\r\n"; |
| 348 | + msg += "(3) Whisper\r\n\r\n"; | ||
| 349 | + msg += | ||
| 350 | + "wget " | ||
| 351 | + "https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/" | ||
| 352 | + "main/tiny.en-encoder.onnx\r\n"; | ||
| 353 | + msg += | ||
| 354 | + "wget " | ||
| 355 | + "https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/" | ||
| 356 | + "main/tiny.en-decoder.onnx\r\n"; | ||
| 357 | + msg += | ||
| 358 | + "wget " | ||
| 359 | + "https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en/resolve/" | ||
| 360 | + "main/tiny.en-tokens.txt\r\n"; | ||
| 361 | + msg += "\r\n Now rename them\r\n"; | ||
| 362 | + msg += "mv tiny.en-encoder.onnx whisper-encoder.onnx\r\n"; | ||
| 363 | + msg += "mv tiny.en-decoder.onnx whisper-decoder.onnx\r\n"; | ||
| 350 | msg += "\r\n"; | 364 | msg += "\r\n"; |
| 351 | msg += "That's it!\r\n"; | 365 | msg += "That's it!\r\n"; |
| 352 | 366 | ||
| 353 | AppendLineToMultilineEditCtrl(msg); | 367 | AppendLineToMultilineEditCtrl(msg); |
| 354 | } | 368 | } |
| 355 | 369 | ||
| 370 | +void CNonStreamingSpeechRecognitionDlg::InitWhisper() { | ||
| 371 | + std::string whisper_encoder = "./whisper-encoder.onnx"; | ||
| 372 | + std::string whisper_decoder = "./whisper-decoder.onnx"; | ||
| 373 | + | ||
| 374 | + std::string tokens = "./tokens.txt"; | ||
| 375 | + | ||
| 376 | + bool is_ok = true; | ||
| 377 | + | ||
| 378 | + if (Exists("./whisper-encoder.int8.onnx")) { | ||
| 379 | + whisper_encoder = "./whisper-encoder.int8.onnx"; | ||
| 380 | + } else if (!Exists(whisper_encoder)) { | ||
| 381 | + std::string msg = whisper_encoder + " does not exist!"; | ||
| 382 | + AppendLineToMultilineEditCtrl(msg); | ||
| 383 | + is_ok = false; | ||
| 384 | + } | ||
| 385 | + | ||
| 386 | + if (Exists("./whisper-decoder.int8.onnx")) { | ||
| 387 | + whisper_decoder = "./whisper-decoder.int8.onnx"; | ||
| 388 | + } else if (!Exists(whisper_decoder)) { | ||
| 389 | + std::string msg = whisper_decoder + " does not exist!"; | ||
| 390 | + AppendLineToMultilineEditCtrl(msg); | ||
| 391 | + is_ok = false; | ||
| 392 | + } | ||
| 393 | + | ||
| 394 | + if (!Exists(tokens)) { | ||
| 395 | + std::string msg = tokens + " does not exist!"; | ||
| 396 | + AppendLineToMultilineEditCtrl(msg); | ||
| 397 | + is_ok = false; | ||
| 398 | + } | ||
| 399 | + | ||
| 400 | + if (!is_ok) { | ||
| 401 | + ShowInitRecognizerHelpMessage(); | ||
| 402 | + return; | ||
| 403 | + } | ||
| 404 | + | ||
| 405 | + memset(&config_, 0, sizeof(config_)); | ||
| 406 | + | ||
| 407 | + config_.feat_config.sample_rate = 16000; | ||
| 408 | + config_.feat_config.feature_dim = 80; | ||
| 409 | + | ||
| 410 | + config_.model_config.whisper.encoder = whisper_encoder.c_str(); | ||
| 411 | + config_.model_config.whisper.decoder = whisper_decoder.c_str(); | ||
| 412 | + config_.model_config.tokens = tokens.c_str(); | ||
| 413 | + config_.model_config.num_threads = 1; | ||
| 414 | + config_.model_config.debug = 1; | ||
| 415 | + config_.model_config.model_type = "whisper"; | ||
| 416 | + | ||
| 417 | + config_.decoding_method = "greedy_search"; | ||
| 418 | + config_.max_active_paths = 4; | ||
| 419 | + | ||
| 420 | + recognizer_ = CreateOfflineRecognizer(&config_); | ||
| 421 | +} | ||
| 422 | + | ||
| 356 | void CNonStreamingSpeechRecognitionDlg::InitParaformer() { | 423 | void CNonStreamingSpeechRecognitionDlg::InitParaformer() { |
| 357 | std::string paraformer = "./paraformer.onnx"; | 424 | std::string paraformer = "./paraformer.onnx"; |
| 358 | std::string tokens = "./tokens.txt"; | 425 | std::string tokens = "./tokens.txt"; |
| @@ -401,6 +468,11 @@ void CNonStreamingSpeechRecognitionDlg::InitRecognizer() { | @@ -401,6 +468,11 @@ void CNonStreamingSpeechRecognitionDlg::InitRecognizer() { | ||
| 401 | return; | 468 | return; |
| 402 | } | 469 | } |
| 403 | 470 | ||
| 471 | + if (Exists("./whisper-encoder.onnx") || Exists("./whisper-encoder.int8.onnx")) { | ||
| 472 | + InitWhisper(); | ||
| 473 | + return; | ||
| 474 | + } | ||
| 475 | + | ||
| 404 | // assume it is transducer | 476 | // assume it is transducer |
| 405 | 477 | ||
| 406 | std::string encoder = "./encoder.onnx"; | 478 | std::string encoder = "./encoder.onnx"; |
| @@ -69,5 +69,6 @@ class CNonStreamingSpeechRecognitionDlg : public CDialogEx { | @@ -69,5 +69,6 @@ class CNonStreamingSpeechRecognitionDlg : public CDialogEx { | ||
| 69 | void InitRecognizer(); | 69 | void InitRecognizer(); |
| 70 | 70 | ||
| 71 | void InitParaformer(); | 71 | void InitParaformer(); |
| 72 | + void InitWhisper(); | ||
| 72 | void ShowInitRecognizerHelpMessage(); | 73 | void ShowInitRecognizerHelpMessage(); |
| 73 | }; | 74 | }; |
| @@ -234,50 +234,18 @@ bool CStreamingSpeechRecognitionDlg::Exists(const std::string &filename) { | @@ -234,50 +234,18 @@ bool CStreamingSpeechRecognitionDlg::Exists(const std::string &filename) { | ||
| 234 | return is.good(); | 234 | return is.good(); |
| 235 | } | 235 | } |
| 236 | 236 | ||
| 237 | -void CStreamingSpeechRecognitionDlg::InitRecognizer() { | ||
| 238 | - std::string encoder = "./encoder.onnx"; | ||
| 239 | - std::string decoder = "./decoder.onnx"; | ||
| 240 | - std::string joiner = "./joiner.onnx"; | ||
| 241 | - std::string tokens = "./tokens.txt"; | ||
| 242 | - | ||
| 243 | - bool is_ok = true; | ||
| 244 | - if (!Exists(encoder)) { | ||
| 245 | - std::string msg = encoder + " does not exist!"; | ||
| 246 | - AppendLineToMultilineEditCtrl(msg); | ||
| 247 | - is_ok = false; | ||
| 248 | - } | ||
| 249 | - | ||
| 250 | - if (!Exists(decoder)) { | ||
| 251 | - std::string msg = decoder + " does not exist!"; | ||
| 252 | - AppendLineToMultilineEditCtrl(msg); | ||
| 253 | - is_ok = false; | ||
| 254 | - } | ||
| 255 | - | ||
| 256 | - if (!Exists(joiner)) { | ||
| 257 | - std::string msg = joiner + " does not exist!"; | ||
| 258 | - AppendLineToMultilineEditCtrl(msg); | ||
| 259 | - is_ok = false; | ||
| 260 | - } | ||
| 261 | - | ||
| 262 | - if (!Exists(tokens)) { | ||
| 263 | - std::string msg = tokens + " does not exist!"; | ||
| 264 | - AppendLineToMultilineEditCtrl(msg); | ||
| 265 | - is_ok = false; | ||
| 266 | - } | ||
| 267 | - | ||
| 268 | - if (!is_ok) { | 237 | +void CStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { |
| 269 | my_btn_.EnableWindow(FALSE); | 238 | my_btn_.EnableWindow(FALSE); |
| 270 | std::string msg = | 239 | std::string msg = |
| 271 | "\r\nPlease go to\r\n" | 240 | "\r\nPlease go to\r\n" |
| 272 | "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html " | 241 | "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html " |
| 273 | "\r\n"; | 242 | "\r\n"; |
| 274 | msg += "to download a streaming model, i.e., an online model.\r\n"; | 243 | msg += "to download a streaming model, i.e., an online model.\r\n"; |
| 244 | + msg += "You need to rename them after downloading\r\n\r\n"; | ||
| 245 | + msg += "It supports both transducer and paraformer models.\r\n\r\n"; | ||
| 275 | msg += | 246 | msg += |
| 276 | - "You need to rename them to encoder.onnx, decoder.onnx, and " | ||
| 277 | - "joiner.onnx correspoondingly.\r\n\r\n"; | ||
| 278 | - msg += | ||
| 279 | - "We use the following model as an example to show you how to do " | ||
| 280 | - "that.\r\n"; | 247 | + "We give two examples below to show you how to download models\r\n\r\n"; |
| 248 | + msg += "(1) Transducer\r\n\r\n"; | ||
| 281 | msg += | 249 | msg += |
| 282 | "https://huggingface.co/pkufool/" | 250 | "https://huggingface.co/pkufool/" |
| 283 | "icefall-asr-zipformer-streaming-wenetspeech-20230615"; | 251 | "icefall-asr-zipformer-streaming-wenetspeech-20230615"; |
| @@ -308,13 +276,132 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() { | @@ -308,13 +276,132 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() { | ||
| 308 | msg += "mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx\r\n"; | 276 | msg += "mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx\r\n"; |
| 309 | msg += "mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx\r\n"; | 277 | msg += "mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx\r\n"; |
| 310 | msg += "\r\n"; | 278 | msg += "\r\n"; |
| 279 | + msg += "(2) Paraformer\r\n\r\n"; | ||
| 280 | + msg += | ||
| 281 | + "wget " | ||
| 282 | + "https://huggingface.co/csukuangfj/" | ||
| 283 | + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/" | ||
| 284 | + "encoder.int8.onnx\r\n"; | ||
| 285 | + msg += | ||
| 286 | + "wget " | ||
| 287 | + "https://huggingface.co/csukuangfj/" | ||
| 288 | + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/" | ||
| 289 | + "decoder.int8.onnx\r\n"; | ||
| 290 | + msg += | ||
| 291 | + "wget " | ||
| 292 | + "https://huggingface.co/csukuangfj/" | ||
| 293 | + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/resolve/main/" | ||
| 294 | + "tokens.txt\r\n"; | ||
| 295 | + msg += "\r\nNow rename them.\r\n"; | ||
| 296 | + msg += "mv encoder.int8.onnx paraformer-encoder.onnx\r\n"; | ||
| 297 | + msg += "mv decoder.int8.onnx paraformer-decoder.onnx\r\n\r\n"; | ||
| 311 | msg += "That's it!\r\n"; | 298 | msg += "That's it!\r\n"; |
| 312 | 299 | ||
| 313 | AppendLineToMultilineEditCtrl(msg); | 300 | AppendLineToMultilineEditCtrl(msg); |
| 301 | +} | ||
| 302 | + | ||
| 303 | +void CStreamingSpeechRecognitionDlg::InitParaformer() { | ||
| 304 | + std::string paraformer_encoder = "./paraformer-encoder.onnx"; | ||
| 305 | + std::string paraformer_decoder = "./paraformer-decoder.onnx"; | ||
| 306 | + | ||
| 307 | + std::string tokens = "./tokens.txt"; | ||
| 308 | + | ||
| 309 | + bool is_ok = true; | ||
| 310 | + | ||
| 311 | + if (Exists("./paraformer-encoder.int8.onnx")) { | ||
| 312 | + paraformer_encoder = "./paraformer-encoder.int8.onnx"; | ||
| 313 | + } else if (!Exists(paraformer_encoder)) { | ||
| 314 | + std::string msg = paraformer_encoder + " does not exist!"; | ||
| 315 | + AppendLineToMultilineEditCtrl(msg); | ||
| 316 | + is_ok = false; | ||
| 317 | + } | ||
| 318 | + | ||
| 319 | + if (Exists("./paraformer-decoder.int8.onnx")) { | ||
| 320 | + paraformer_decoder = "./paraformer-decoder.int8.onnx"; | ||
| 321 | + } else if (!Exists(paraformer_decoder)) { | ||
| 322 | + std::string msg = paraformer_decoder + " does not exist!"; | ||
| 323 | + AppendLineToMultilineEditCtrl(msg); | ||
| 324 | + is_ok = false; | ||
| 325 | + } | ||
| 326 | + | ||
| 327 | + if (!Exists(tokens)) { | ||
| 328 | + std::string msg = tokens + " does not exist!"; | ||
| 329 | + AppendLineToMultilineEditCtrl(msg); | ||
| 330 | + is_ok = false; | ||
| 331 | + } | ||
| 332 | + | ||
| 333 | + if (!is_ok) { | ||
| 334 | + ShowInitRecognizerHelpMessage(); | ||
| 335 | + return; | ||
| 336 | + } | ||
| 337 | + | ||
| 338 | + SherpaOnnxOnlineRecognizerConfig config; | ||
| 339 | + memset(&config, 0, sizeof(config)); | ||
| 340 | + config.model_config.debug = 0; | ||
| 341 | + config.model_config.num_threads = 1; | ||
| 342 | + config.model_config.provider = "cpu"; | ||
| 343 | + | ||
| 344 | + config.decoding_method = "greedy_search"; | ||
| 345 | + config.max_active_paths = 4; | ||
| 346 | + | ||
| 347 | + config.feat_config.sample_rate = 16000; | ||
| 348 | + config.feat_config.feature_dim = 80; | ||
| 349 | + | ||
| 350 | + config.enable_endpoint = 1; | ||
| 351 | + config.rule1_min_trailing_silence = 1.2f; | ||
| 352 | + config.rule2_min_trailing_silence = 0.8f; | ||
| 353 | + config.rule3_min_utterance_length = 300.0f; | ||
| 354 | + | ||
| 355 | + config.model_config.tokens = tokens.c_str(); | ||
| 356 | + config.model_config.paraformer.encoder = paraformer_encoder.c_str(); | ||
| 357 | + config.model_config.paraformer.decoder = paraformer_decoder.c_str(); | ||
| 358 | + | ||
| 359 | + recognizer_ = CreateOnlineRecognizer(&config); | ||
| 360 | +} | ||
| 361 | + | ||
| 362 | +void CStreamingSpeechRecognitionDlg::InitRecognizer() { | ||
| 363 | + if (Exists("./paraformer-encoder.onnx") || Exists("./paraformer-encoder.int8.onnx")) { | ||
| 364 | + InitParaformer(); | ||
| 365 | + return; | ||
| 366 | + } | ||
| 367 | + | ||
| 368 | + std::string encoder = "./encoder.onnx"; | ||
| 369 | + std::string decoder = "./decoder.onnx"; | ||
| 370 | + std::string joiner = "./joiner.onnx"; | ||
| 371 | + std::string tokens = "./tokens.txt"; | ||
| 372 | + | ||
| 373 | + bool is_ok = true; | ||
| 374 | + if (!Exists(encoder)) { | ||
| 375 | + std::string msg = encoder + " does not exist!"; | ||
| 376 | + AppendLineToMultilineEditCtrl(msg); | ||
| 377 | + is_ok = false; | ||
| 378 | + } | ||
| 379 | + | ||
| 380 | + if (!Exists(decoder)) { | ||
| 381 | + std::string msg = decoder + " does not exist!"; | ||
| 382 | + AppendLineToMultilineEditCtrl(msg); | ||
| 383 | + is_ok = false; | ||
| 384 | + } | ||
| 385 | + | ||
| 386 | + if (!Exists(joiner)) { | ||
| 387 | + std::string msg = joiner + " does not exist!"; | ||
| 388 | + AppendLineToMultilineEditCtrl(msg); | ||
| 389 | + is_ok = false; | ||
| 390 | + } | ||
| 391 | + | ||
| 392 | + if (!Exists(tokens)) { | ||
| 393 | + std::string msg = tokens + " does not exist!"; | ||
| 394 | + AppendLineToMultilineEditCtrl(msg); | ||
| 395 | + is_ok = false; | ||
| 396 | + } | ||
| 397 | + | ||
| 398 | + if (!is_ok) { | ||
| 399 | + ShowInitRecognizerHelpMessage(); | ||
| 314 | return; | 400 | return; |
| 315 | } | 401 | } |
| 316 | 402 | ||
| 317 | SherpaOnnxOnlineRecognizerConfig config; | 403 | SherpaOnnxOnlineRecognizerConfig config; |
| 404 | + memset(&config, 0, sizeof(config)); | ||
| 318 | config.model_config.debug = 0; | 405 | config.model_config.debug = 0; |
| 319 | config.model_config.num_threads = 1; | 406 | config.model_config.num_threads = 1; |
| 320 | config.model_config.provider = "cpu"; | 407 | config.model_config.provider = "cpu"; |
| @@ -331,9 +418,9 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() { | @@ -331,9 +418,9 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() { | ||
| 331 | config.rule3_min_utterance_length = 300.0f; | 418 | config.rule3_min_utterance_length = 300.0f; |
| 332 | 419 | ||
| 333 | config.model_config.tokens = tokens.c_str(); | 420 | config.model_config.tokens = tokens.c_str(); |
| 334 | - config.model_config.encoder = encoder.c_str(); | ||
| 335 | - config.model_config.decoder = decoder.c_str(); | ||
| 336 | - config.model_config.joiner = joiner.c_str(); | 421 | + config.model_config.transducer.encoder = encoder.c_str(); |
| 422 | + config.model_config.transducer.decoder = decoder.c_str(); | ||
| 423 | + config.model_config.transducer.joiner = joiner.c_str(); | ||
| 337 | 424 | ||
| 338 | recognizer_ = CreateOnlineRecognizer(&config); | 425 | recognizer_ = CreateOnlineRecognizer(&config); |
| 339 | } | 426 | } |
| @@ -67,6 +67,8 @@ class CStreamingSpeechRecognitionDlg : public CDialogEx { | @@ -67,6 +67,8 @@ class CStreamingSpeechRecognitionDlg : public CDialogEx { | ||
| 67 | 67 | ||
| 68 | bool Exists(const std::string &filename); | 68 | bool Exists(const std::string &filename); |
| 69 | void InitRecognizer(); | 69 | void InitRecognizer(); |
| 70 | + void InitParaformer(); | ||
| 71 | + void ShowInitRecognizerHelpMessage(); | ||
| 70 | }; | 72 | }; |
| 71 | 73 | ||
| 72 | class RecognizerThread : public CWinThread { | 74 | class RecognizerThread : public CWinThread { |
| @@ -45,9 +45,30 @@ import "unsafe" | @@ -45,9 +45,30 @@ import "unsafe" | ||
| 45 | // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html | 45 | // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html |
| 46 | // to download pre-trained models | 46 | // to download pre-trained models |
type OnlineTransducerModelConfig struct {
	// Encoder is the path to the encoder model, e.g., encoder.onnx or
	// encoder.int8.onnx
	Encoder string

	// Decoder is the path to the decoder model.
	Decoder string

	// Joiner is the path to the joiner model.
	Joiner string
}
| 52 | + | ||
// OnlineParaformerModelConfig is the configuration for online/streaming
// paraformer models.
//
// Pre-trained models can be downloaded by following
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html
type OnlineParaformerModelConfig struct {
	// Encoder is the path to the encoder model, e.g., encoder.onnx or
	// encoder.int8.onnx
	Encoder string

	// Decoder is the path to the decoder model.
	Decoder string
}
| 62 | + | ||
| 63 | +// Configuration for online/streaming models | ||
| 64 | +// | ||
| 65 | +// Please refer to | ||
| 66 | +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html | ||
| 67 | +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/index.html | ||
| 68 | +// to download pre-trained models | ||
| 69 | +type OnlineModelConfig struct { | ||
| 70 | + Transducer OnlineTransducerModelConfig | ||
| 71 | + Paraformer OnlineParaformerModelConfig | ||
| 51 | Tokens string // Path to tokens.txt | 72 | Tokens string // Path to tokens.txt |
| 52 | NumThreads int // Number of threads to use for neural network computation | 73 | NumThreads int // Number of threads to use for neural network computation |
| 53 | Provider string // Optional. Valid values are: cpu, cuda, coreml | 74 | Provider string // Optional. Valid values are: cpu, cuda, coreml |
| @@ -68,7 +89,7 @@ type FeatureConfig struct { | @@ -68,7 +89,7 @@ type FeatureConfig struct { | ||
| 68 | // Configuration for the online/streaming recognizer. | 89 | // Configuration for the online/streaming recognizer. |
| 69 | type OnlineRecognizerConfig struct { | 90 | type OnlineRecognizerConfig struct { |
| 70 | FeatConfig FeatureConfig | 91 | FeatConfig FeatureConfig |
| 71 | - ModelConfig OnlineTransducerModelConfig | 92 | + ModelConfig OnlineModelConfig |
| 72 | 93 | ||
| 73 | // Valid decoding methods: greedy_search, modified_beam_search | 94 | // Valid decoding methods: greedy_search, modified_beam_search |
| 74 | DecodingMethod string | 95 | DecodingMethod string |
| @@ -116,14 +137,20 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer { | @@ -116,14 +137,20 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer { | ||
| 116 | c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate) | 137 | c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate) |
| 117 | c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim) | 138 | c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim) |
| 118 | 139 | ||
| 119 | - c.model_config.encoder = C.CString(config.ModelConfig.Encoder) | ||
| 120 | - defer C.free(unsafe.Pointer(c.model_config.encoder)) | 140 | + c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder) |
| 141 | + defer C.free(unsafe.Pointer(c.model_config.transducer.encoder)) | ||
| 142 | + | ||
| 143 | + c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder) | ||
| 144 | + defer C.free(unsafe.Pointer(c.model_config.transducer.decoder)) | ||
| 145 | + | ||
| 146 | + c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner) | ||
| 147 | + defer C.free(unsafe.Pointer(c.model_config.transducer.joiner)) | ||
| 121 | 148 | ||
| 122 | - c.model_config.decoder = C.CString(config.ModelConfig.Decoder) | ||
| 123 | - defer C.free(unsafe.Pointer(c.model_config.decoder)) | 149 | + c.model_config.paraformer.encoder = C.CString(config.ModelConfig.Paraformer.Encoder) |
| 150 | + defer C.free(unsafe.Pointer(c.model_config.paraformer.encoder)) | ||
| 124 | 151 | ||
| 125 | - c.model_config.joiner = C.CString(config.ModelConfig.Joiner) | ||
| 126 | - defer C.free(unsafe.Pointer(c.model_config.joiner)) | 152 | + c.model_config.paraformer.decoder = C.CString(config.ModelConfig.Paraformer.Decoder) |
| 153 | + defer C.free(unsafe.Pointer(c.model_config.paraformer.decoder)) | ||
| 127 | 154 | ||
| 128 | c.model_config.tokens = C.CString(config.ModelConfig.Tokens) | 155 | c.model_config.tokens = C.CString(config.ModelConfig.Tokens) |
| 129 | defer C.free(unsafe.Pointer(c.model_config.tokens)) | 156 | defer C.free(unsafe.Pointer(c.model_config.tokens)) |
| @@ -265,6 +265,12 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( | @@ -265,6 +265,12 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( | ||
| 265 | recognizer_config.model_config.nemo_ctc.model = | 265 | recognizer_config.model_config.nemo_ctc.model = |
| 266 | SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, ""); | 266 | SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, ""); |
| 267 | 267 | ||
| 268 | + recognizer_config.model_config.whisper.encoder = | ||
| 269 | + SHERPA_ONNX_OR(config->model_config.whisper.encoder, ""); | ||
| 270 | + | ||
| 271 | + recognizer_config.model_config.whisper.decoder = | ||
| 272 | + SHERPA_ONNX_OR(config->model_config.whisper.decoder, ""); | ||
| 273 | + | ||
| 268 | recognizer_config.model_config.tokens = | 274 | recognizer_config.model_config.tokens = |
| 269 | SHERPA_ONNX_OR(config->model_config.tokens, ""); | 275 | SHERPA_ONNX_OR(config->model_config.tokens, ""); |
| 270 | recognizer_config.model_config.num_threads = | 276 | recognizer_config.model_config.num_threads = |
| @@ -300,6 +300,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineNemoEncDecCtcModelConfig { | @@ -300,6 +300,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineNemoEncDecCtcModelConfig { | ||
| 300 | const char *model; | 300 | const char *model; |
| 301 | } SherpaOnnxOfflineNemoEncDecCtcModelConfig; | 301 | } SherpaOnnxOfflineNemoEncDecCtcModelConfig; |
| 302 | 302 | ||
| 303 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineWhisperModelConfig { | ||
| 304 | + const char *encoder; | ||
| 305 | + const char *decoder; | ||
| 306 | +} SherpaOnnxOfflineWhisperModelConfig; | ||
| 307 | + | ||
| 303 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineLMConfig { | 308 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineLMConfig { |
| 304 | const char *model; | 309 | const char *model; |
| 305 | float scale; | 310 | float scale; |
| @@ -309,6 +314,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { | @@ -309,6 +314,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { | ||
| 309 | SherpaOnnxOfflineTransducerModelConfig transducer; | 314 | SherpaOnnxOfflineTransducerModelConfig transducer; |
| 310 | SherpaOnnxOfflineParaformerModelConfig paraformer; | 315 | SherpaOnnxOfflineParaformerModelConfig paraformer; |
| 311 | SherpaOnnxOfflineNemoEncDecCtcModelConfig nemo_ctc; | 316 | SherpaOnnxOfflineNemoEncDecCtcModelConfig nemo_ctc; |
| 317 | + SherpaOnnxOfflineWhisperModelConfig whisper; | ||
| 312 | 318 | ||
| 313 | const char *tokens; | 319 | const char *tokens; |
| 314 | int32_t num_threads; | 320 | int32_t num_threads; |
-
请 注册 或 登录 后发表评论