正在显示
9 个修改的文件
包含
302 行增加
和
34 行删除
| @@ -376,7 +376,7 @@ jobs: | @@ -376,7 +376,7 @@ jobs: | ||
| 376 | name: matcha-tts-${{ matrix.os }} | 376 | name: matcha-tts-${{ matrix.os }} |
| 377 | path: ./generated-matcha-*.wav | 377 | path: ./generated-matcha-*.wav |
| 378 | 378 | ||
| 379 | - - name: Test vad + Whisper tiny.en | 379 | + - name: Test silero-vad + Whisper tiny.en |
| 380 | shell: bash | 380 | shell: bash |
| 381 | run: | | 381 | run: | |
| 382 | gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \ | 382 | gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \ |
| @@ -403,7 +403,34 @@ jobs: | @@ -403,7 +403,34 @@ jobs: | ||
| 403 | rm -rf *.onnx | 403 | rm -rf *.onnx |
| 404 | rm *.wav | 404 | rm *.wav |
| 405 | 405 | ||
| 406 | - - name: Test vad + Moonshine | 406 | + - name: Test ten-vad + Whisper tiny.en |
| 407 | + shell: bash | ||
| 408 | + run: | | ||
| 409 | + gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \ | ||
| 410 | + -I ./build/install/include \ | ||
| 411 | + -L ./build/install/lib/ \ | ||
| 412 | + -l sherpa-onnx-c-api \ | ||
| 413 | + -l onnxruntime | ||
| 414 | + | ||
| 415 | + # Now download models | ||
| 416 | + # | ||
| 417 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 418 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav | ||
| 419 | + | ||
| 420 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 | ||
| 421 | + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 | ||
| 422 | + rm sherpa-onnx-whisper-tiny.en.tar.bz2 | ||
| 423 | + | ||
| 424 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 425 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 426 | + | ||
| 427 | + ./vad-whisper-c-api | ||
| 428 | + | ||
| 429 | + rm -rf sherpa-onnx-* | ||
| 430 | + rm -rf *.onnx | ||
| 431 | + rm *.wav | ||
| 432 | + | ||
| 433 | + - name: Test silero-vad + Moonshine | ||
| 407 | shell: bash | 434 | shell: bash |
| 408 | run: | | 435 | run: | |
| 409 | gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \ | 436 | gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \ |
| @@ -430,6 +457,33 @@ jobs: | @@ -430,6 +457,33 @@ jobs: | ||
| 430 | rm -rf *.onnx | 457 | rm -rf *.onnx |
| 431 | rm *.wav | 458 | rm *.wav |
| 432 | 459 | ||
| 460 | + - name: Test ten-vad + Moonshine | ||
| 461 | + shell: bash | ||
| 462 | + run: | | ||
| 463 | + gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \ | ||
| 464 | + -I ./build/install/include \ | ||
| 465 | + -L ./build/install/lib/ \ | ||
| 466 | + -l sherpa-onnx-c-api \ | ||
| 467 | + -l onnxruntime | ||
| 468 | + | ||
| 469 | + # Now download models | ||
| 470 | + # | ||
| 471 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 472 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav | ||
| 473 | + | ||
| 474 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 | ||
| 475 | + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 | ||
| 476 | + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 | ||
| 477 | + | ||
| 478 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 479 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 480 | + | ||
| 481 | + ./vad-moonshine-c-api | ||
| 482 | + | ||
| 483 | + rm -rf sherpa-onnx-* | ||
| 484 | + rm -rf *.onnx | ||
| 485 | + rm *.wav | ||
| 486 | + | ||
| 433 | - name: Test Moonshine | 487 | - name: Test Moonshine |
| 434 | shell: bash | 488 | shell: bash |
| 435 | run: | | 489 | run: | |
| @@ -466,7 +520,7 @@ jobs: | @@ -466,7 +520,7 @@ jobs: | ||
| 466 | ./run.sh | 520 | ./run.sh |
| 467 | rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 | 521 | rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 |
| 468 | 522 | ||
| 469 | - - name: Test vad + sense-voice | 523 | + - name: Test silero-vad + sense-voice |
| 470 | shell: bash | 524 | shell: bash |
| 471 | run: | | 525 | run: | |
| 472 | gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \ | 526 | gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \ |
| @@ -505,6 +559,45 @@ jobs: | @@ -505,6 +559,45 @@ jobs: | ||
| 505 | rm -rf *.onnx | 559 | rm -rf *.onnx |
| 506 | rm *.wav | 560 | rm *.wav |
| 507 | 561 | ||
| 562 | + - name: Test ten-vad + sense-voice | ||
| 563 | + shell: bash | ||
| 564 | + run: | | ||
| 565 | + gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \ | ||
| 566 | + -I ./build/install/include \ | ||
| 567 | + -L ./build/install/lib/ \ | ||
| 568 | + -l sherpa-onnx-c-api \ | ||
| 569 | + -l onnxruntime | ||
| 570 | + | ||
| 571 | + ls -lh vad-sense-voice-c-api | ||
| 572 | + | ||
| 573 | + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then | ||
| 574 | + ldd ./vad-sense-voice-c-api | ||
| 575 | + echo "----" | ||
| 576 | + readelf -d ./vad-sense-voice-c-api | ||
| 577 | + fi | ||
| 578 | + | ||
| 579 | + # Now download models | ||
| 580 | + # | ||
| 581 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 582 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav | ||
| 583 | + | ||
| 584 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 585 | + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 586 | + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 587 | + | ||
| 588 | + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 | ||
| 589 | + echo "---" | ||
| 590 | + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs | ||
| 591 | + | ||
| 592 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 593 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 594 | + | ||
| 595 | + ./vad-sense-voice-c-api | ||
| 596 | + | ||
| 597 | + rm -rf sherpa-onnx-sense-voice-* | ||
| 598 | + rm -rf *.onnx | ||
| 599 | + rm *.wav | ||
| 600 | + | ||
| 508 | - name: Test sense-voice | 601 | - name: Test sense-voice |
| 509 | shell: bash | 602 | shell: bash |
| 510 | run: | | 603 | run: | |
| @@ -6,7 +6,12 @@ | @@ -6,7 +6,12 @@ | ||
| 6 | // This file demonstrates how to use VAD + Moonshine with sherpa-onnx's C API. | 6 | // This file demonstrates how to use VAD + Moonshine with sherpa-onnx's C API. |
| 7 | // clang-format off | 7 | // clang-format off |
| 8 | // | 8 | // |
| 9 | -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | 9 | +// To use silero-vad: |
| 10 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 11 | +// | ||
| 12 | +// To use ten-vad: | ||
| 13 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 14 | +// | ||
| 10 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav | 15 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav |
| 11 | // | 16 | // |
| 12 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 | 17 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 |
| @@ -23,7 +28,27 @@ | @@ -23,7 +28,27 @@ | ||
| 23 | 28 | ||
| 24 | int32_t main() { | 29 | int32_t main() { |
| 25 | const char *wav_filename = "./Obama.wav"; | 30 | const char *wav_filename = "./Obama.wav"; |
| 26 | - const char *vad_filename = "./silero_vad.onnx"; | 31 | + if (!SherpaOnnxFileExists(wav_filename)) { |
| 32 | + fprintf(stderr, "Please download %s\n", wav_filename); | ||
| 33 | + return -1; | ||
| 34 | + } | ||
| 35 | + | ||
| 36 | + const char *vad_filename; | ||
| 37 | + int32_t use_silero_vad = 0; | ||
| 38 | + int32_t use_ten_vad = 0; | ||
| 39 | + | ||
| 40 | + if (SherpaOnnxFileExists("./silero_vad.onnx")) { | ||
| 41 | + printf("Use silero-vad\n"); | ||
| 42 | + vad_filename = "./silero_vad.onnx"; | ||
| 43 | + use_silero_vad = 1; | ||
| 44 | + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) { | ||
| 45 | + printf("Use ten-vad\n"); | ||
| 46 | + vad_filename = "./ten-vad.onnx"; | ||
| 47 | + use_ten_vad = 1; | ||
| 48 | + } else { | ||
| 49 | + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n"); | ||
| 50 | + return -1; | ||
| 51 | + } | ||
| 27 | 52 | ||
| 28 | const char *preprocessor = | 53 | const char *preprocessor = |
| 29 | "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx"; | 54 | "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx"; |
| @@ -76,12 +101,22 @@ int32_t main() { | @@ -76,12 +101,22 @@ int32_t main() { | ||
| 76 | 101 | ||
| 77 | SherpaOnnxVadModelConfig vadConfig; | 102 | SherpaOnnxVadModelConfig vadConfig; |
| 78 | memset(&vadConfig, 0, sizeof(vadConfig)); | 103 | memset(&vadConfig, 0, sizeof(vadConfig)); |
| 79 | - vadConfig.silero_vad.model = vad_filename; | ||
| 80 | - vadConfig.silero_vad.threshold = 0.5; | ||
| 81 | - vadConfig.silero_vad.min_silence_duration = 0.5; | ||
| 82 | - vadConfig.silero_vad.min_speech_duration = 0.5; | ||
| 83 | - vadConfig.silero_vad.max_speech_duration = 10; | ||
| 84 | - vadConfig.silero_vad.window_size = 512; | 104 | + if (use_silero_vad) { |
| 105 | + vadConfig.silero_vad.model = vad_filename; | ||
| 106 | + vadConfig.silero_vad.threshold = 0.25; | ||
| 107 | + vadConfig.silero_vad.min_silence_duration = 0.5; | ||
| 108 | + vadConfig.silero_vad.min_speech_duration = 0.5; | ||
| 109 | + vadConfig.silero_vad.max_speech_duration = 10; | ||
| 110 | + vadConfig.silero_vad.window_size = 512; | ||
| 111 | + } else if (use_ten_vad) { | ||
| 112 | + vadConfig.ten_vad.model = vad_filename; | ||
| 113 | + vadConfig.ten_vad.threshold = 0.25; | ||
| 114 | + vadConfig.ten_vad.min_silence_duration = 0.5; | ||
| 115 | + vadConfig.ten_vad.min_speech_duration = 0.5; | ||
| 116 | + vadConfig.ten_vad.max_speech_duration = 10; | ||
| 117 | + vadConfig.ten_vad.window_size = 256; | ||
| 118 | + } | ||
| 119 | + | ||
| 85 | vadConfig.sample_rate = 16000; | 120 | vadConfig.sample_rate = 16000; |
| 86 | vadConfig.num_threads = 1; | 121 | vadConfig.num_threads = 1; |
| 87 | vadConfig.debug = 1; | 122 | vadConfig.debug = 1; |
| @@ -96,7 +131,9 @@ int32_t main() { | @@ -96,7 +131,9 @@ int32_t main() { | ||
| 96 | return -1; | 131 | return -1; |
| 97 | } | 132 | } |
| 98 | 133 | ||
| 99 | - int32_t window_size = vadConfig.silero_vad.window_size; | 134 | + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size |
| 135 | + : vadConfig.ten_vad.window_size; | ||
| 136 | + | ||
| 100 | int32_t i = 0; | 137 | int32_t i = 0; |
| 101 | int is_eof = 0; | 138 | int is_eof = 0; |
| 102 | 139 |
| @@ -6,7 +6,12 @@ | @@ -6,7 +6,12 @@ | ||
| 6 | // This file demonstrates how to use VAD + SenseVoice with sherpa-onnx's C API. | 6 | // This file demonstrates how to use VAD + SenseVoice with sherpa-onnx's C API. |
| 7 | // clang-format off | 7 | // clang-format off |
| 8 | // | 8 | // |
| 9 | -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | 9 | +// To use silero-vad: |
| 10 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 11 | +// | ||
| 12 | +// To use ten-vad: | ||
| 13 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 14 | +// | ||
| 10 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav | 15 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav |
| 11 | // | 16 | // |
| 12 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | 17 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 |
| @@ -23,7 +28,28 @@ | @@ -23,7 +28,28 @@ | ||
| 23 | 28 | ||
| 24 | int32_t main() { | 29 | int32_t main() { |
| 25 | const char *wav_filename = "./lei-jun-test.wav"; | 30 | const char *wav_filename = "./lei-jun-test.wav"; |
| 26 | - const char *vad_filename = "./silero_vad.onnx"; | 31 | + if (!SherpaOnnxFileExists(wav_filename)) { |
| 32 | + fprintf(stderr, "Please download %s\n", wav_filename); | ||
| 33 | + return -1; | ||
| 34 | + } | ||
| 35 | + | ||
| 36 | + const char *vad_filename; | ||
| 37 | + int32_t use_silero_vad = 0; | ||
| 38 | + int32_t use_ten_vad = 0; | ||
| 39 | + | ||
| 40 | + if (SherpaOnnxFileExists("./silero_vad.onnx")) { | ||
| 41 | + printf("Use silero-vad\n"); | ||
| 42 | + vad_filename = "./silero_vad.onnx"; | ||
| 43 | + use_silero_vad = 1; | ||
| 44 | + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) { | ||
| 45 | + printf("Use ten-vad\n"); | ||
| 46 | + vad_filename = "./ten-vad.onnx"; | ||
| 47 | + use_ten_vad = 1; | ||
| 48 | + } else { | ||
| 49 | + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n"); | ||
| 50 | + return -1; | ||
| 51 | + } | ||
| 52 | + | ||
| 27 | const char *model_filename = | 53 | const char *model_filename = |
| 28 | "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; | 54 | "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; |
| 29 | const char *tokens_filename = | 55 | const char *tokens_filename = |
| @@ -77,12 +103,23 @@ int32_t main() { | @@ -77,12 +103,23 @@ int32_t main() { | ||
| 77 | 103 | ||
| 78 | SherpaOnnxVadModelConfig vadConfig; | 104 | SherpaOnnxVadModelConfig vadConfig; |
| 79 | memset(&vadConfig, 0, sizeof(vadConfig)); | 105 | memset(&vadConfig, 0, sizeof(vadConfig)); |
| 80 | - vadConfig.silero_vad.model = vad_filename; | ||
| 81 | - vadConfig.silero_vad.threshold = 0.5; | ||
| 82 | - vadConfig.silero_vad.min_silence_duration = 0.5; | ||
| 83 | - vadConfig.silero_vad.min_speech_duration = 0.5; | ||
| 84 | - vadConfig.silero_vad.max_speech_duration = 5; | ||
| 85 | - vadConfig.silero_vad.window_size = 512; | 106 | + |
| 107 | + if (use_silero_vad) { | ||
| 108 | + vadConfig.silero_vad.model = vad_filename; | ||
| 109 | + vadConfig.silero_vad.threshold = 0.25; | ||
| 110 | + vadConfig.silero_vad.min_silence_duration = 0.5; | ||
| 111 | + vadConfig.silero_vad.min_speech_duration = 0.5; | ||
| 112 | + vadConfig.silero_vad.max_speech_duration = 10; | ||
| 113 | + vadConfig.silero_vad.window_size = 512; | ||
| 114 | + } else if (use_ten_vad) { | ||
| 115 | + vadConfig.ten_vad.model = vad_filename; | ||
| 116 | + vadConfig.ten_vad.threshold = 0.25; | ||
| 117 | + vadConfig.ten_vad.min_silence_duration = 0.5; | ||
| 118 | + vadConfig.ten_vad.min_speech_duration = 0.5; | ||
| 119 | + vadConfig.ten_vad.max_speech_duration = 10; | ||
| 120 | + vadConfig.ten_vad.window_size = 256; | ||
| 121 | + } | ||
| 122 | + | ||
| 86 | vadConfig.sample_rate = 16000; | 123 | vadConfig.sample_rate = 16000; |
| 87 | vadConfig.num_threads = 1; | 124 | vadConfig.num_threads = 1; |
| 88 | vadConfig.debug = 1; | 125 | vadConfig.debug = 1; |
| @@ -97,7 +134,8 @@ int32_t main() { | @@ -97,7 +134,8 @@ int32_t main() { | ||
| 97 | return -1; | 134 | return -1; |
| 98 | } | 135 | } |
| 99 | 136 | ||
| 100 | - int32_t window_size = vadConfig.silero_vad.window_size; | 137 | + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size |
| 138 | + : vadConfig.ten_vad.window_size; | ||
| 101 | int32_t i = 0; | 139 | int32_t i = 0; |
| 102 | int is_eof = 0; | 140 | int is_eof = 0; |
| 103 | 141 |
| @@ -8,7 +8,12 @@ | @@ -8,7 +8,12 @@ | ||
| 8 | // | 8 | // |
| 9 | // clang-format off | 9 | // clang-format off |
| 10 | // | 10 | // |
| 11 | -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | 11 | +// To use silero-vad: |
| 12 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 13 | +// | ||
| 14 | +// To use ten-vad: | ||
| 15 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 16 | +// | ||
| 12 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav | 17 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav |
| 13 | // | 18 | // |
| 14 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 | 19 | // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 |
| @@ -25,7 +30,28 @@ | @@ -25,7 +30,28 @@ | ||
| 25 | 30 | ||
| 26 | int32_t main() { | 31 | int32_t main() { |
| 27 | const char *wav_filename = "./Obama.wav"; | 32 | const char *wav_filename = "./Obama.wav"; |
| 28 | - const char *vad_filename = "./silero_vad.onnx"; | 33 | + |
| 34 | + if (!SherpaOnnxFileExists(wav_filename)) { | ||
| 35 | + fprintf(stderr, "Please download %s\n", wav_filename); | ||
| 36 | + return -1; | ||
| 37 | + } | ||
| 38 | + | ||
| 39 | + const char *vad_filename; | ||
| 40 | + int32_t use_silero_vad = 0; | ||
| 41 | + int32_t use_ten_vad = 0; | ||
| 42 | + | ||
| 43 | + if (SherpaOnnxFileExists("./silero_vad.onnx")) { | ||
| 44 | + printf("Use silero-vad\n"); | ||
| 45 | + vad_filename = "./silero_vad.onnx"; | ||
| 46 | + use_silero_vad = 1; | ||
| 47 | + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) { | ||
| 48 | + printf("Use ten-vad\n"); | ||
| 49 | + vad_filename = "./ten-vad.onnx"; | ||
| 50 | + use_ten_vad = 1; | ||
| 51 | + } else { | ||
| 52 | + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n"); | ||
| 53 | + return -1; | ||
| 54 | + } | ||
| 29 | 55 | ||
| 30 | const char *encoder = "sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"; | 56 | const char *encoder = "sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"; |
| 31 | const char *decoder = "sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"; | 57 | const char *decoder = "sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"; |
| @@ -74,12 +100,23 @@ int32_t main() { | @@ -74,12 +100,23 @@ int32_t main() { | ||
| 74 | 100 | ||
| 75 | SherpaOnnxVadModelConfig vadConfig; | 101 | SherpaOnnxVadModelConfig vadConfig; |
| 76 | memset(&vadConfig, 0, sizeof(vadConfig)); | 102 | memset(&vadConfig, 0, sizeof(vadConfig)); |
| 77 | - vadConfig.silero_vad.model = vad_filename; | ||
| 78 | - vadConfig.silero_vad.threshold = 0.5; | ||
| 79 | - vadConfig.silero_vad.min_silence_duration = 0.5; | ||
| 80 | - vadConfig.silero_vad.min_speech_duration = 0.5; | ||
| 81 | - vadConfig.silero_vad.max_speech_duration = 10; | ||
| 82 | - vadConfig.silero_vad.window_size = 512; | 103 | + |
| 104 | + if (use_silero_vad) { | ||
| 105 | + vadConfig.silero_vad.model = vad_filename; | ||
| 106 | + vadConfig.silero_vad.threshold = 0.25; | ||
| 107 | + vadConfig.silero_vad.min_silence_duration = 0.5; | ||
| 108 | + vadConfig.silero_vad.min_speech_duration = 0.5; | ||
| 109 | + vadConfig.silero_vad.max_speech_duration = 10; | ||
| 110 | + vadConfig.silero_vad.window_size = 512; | ||
| 111 | + } else if (use_ten_vad) { | ||
| 112 | + vadConfig.ten_vad.model = vad_filename; | ||
| 113 | + vadConfig.ten_vad.threshold = 0.25; | ||
| 114 | + vadConfig.ten_vad.min_silence_duration = 0.5; | ||
| 115 | + vadConfig.ten_vad.min_speech_duration = 0.5; | ||
| 116 | + vadConfig.ten_vad.max_speech_duration = 10; | ||
| 117 | + vadConfig.ten_vad.window_size = 256; | ||
| 118 | + } | ||
| 119 | + | ||
| 83 | vadConfig.sample_rate = 16000; | 120 | vadConfig.sample_rate = 16000; |
| 84 | vadConfig.num_threads = 1; | 121 | vadConfig.num_threads = 1; |
| 85 | vadConfig.debug = 1; | 122 | vadConfig.debug = 1; |
| @@ -94,7 +131,8 @@ int32_t main() { | @@ -94,7 +131,8 @@ int32_t main() { | ||
| 94 | return -1; | 131 | return -1; |
| 95 | } | 132 | } |
| 96 | 133 | ||
| 97 | - int32_t window_size = vadConfig.silero_vad.window_size; | 134 | + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size |
| 135 | + : vadConfig.ten_vad.window_size; | ||
| 98 | int32_t i = 0; | 136 | int32_t i = 0; |
| 99 | int is_eof = 0; | 137 | int is_eof = 0; |
| 100 | 138 |
| @@ -1033,6 +1033,21 @@ sherpa_onnx::VadModelConfig GetVadModelConfig( | @@ -1033,6 +1033,21 @@ sherpa_onnx::VadModelConfig GetVadModelConfig( | ||
| 1033 | vad_config.silero_vad.max_speech_duration = | 1033 | vad_config.silero_vad.max_speech_duration = |
| 1034 | SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20); | 1034 | SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20); |
| 1035 | 1035 | ||
| 1036 | + vad_config.ten_vad.model = SHERPA_ONNX_OR(config->ten_vad.model, ""); | ||
| 1037 | + vad_config.ten_vad.threshold = SHERPA_ONNX_OR(config->ten_vad.threshold, 0.5); | ||
| 1038 | + | ||
| 1039 | + vad_config.ten_vad.min_silence_duration = | ||
| 1040 | + SHERPA_ONNX_OR(config->ten_vad.min_silence_duration, 0.5); | ||
| 1041 | + | ||
| 1042 | + vad_config.ten_vad.min_speech_duration = | ||
| 1043 | + SHERPA_ONNX_OR(config->ten_vad.min_speech_duration, 0.25); | ||
| 1044 | + | ||
| 1045 | + vad_config.ten_vad.window_size = | ||
| 1046 | + SHERPA_ONNX_OR(config->ten_vad.window_size, 256); | ||
| 1047 | + | ||
| 1048 | + vad_config.ten_vad.max_speech_duration = | ||
| 1049 | + SHERPA_ONNX_OR(config->ten_vad.max_speech_duration, 20); | ||
| 1050 | + | ||
| 1036 | vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); | 1051 | vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); |
| 1037 | vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); | 1052 | vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); |
| 1038 | vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); | 1053 | vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); |
| @@ -71,6 +71,9 @@ SHERPA_ONNX_API const char *SherpaOnnxGetGitSha1(); | @@ -71,6 +71,9 @@ SHERPA_ONNX_API const char *SherpaOnnxGetGitSha1(); | ||
| 71 | // Example return value: "Fri Jun 20 11:22:52 2025" | 71 | // Example return value: "Fri Jun 20 11:22:52 2025" |
| 72 | SHERPA_ONNX_API const char *SherpaOnnxGetGitDate(); | 72 | SHERPA_ONNX_API const char *SherpaOnnxGetGitDate(); |
| 73 | 73 | ||
| 74 | +// return 1 if the given file exists; return 0 otherwise | ||
| 75 | +SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); | ||
| 76 | + | ||
| 74 | /// Please refer to | 77 | /// Please refer to |
| 75 | /// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html | 78 | /// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html |
| 76 | /// to download pre-trained models. That is, you can find encoder-xxx.onnx | 79 | /// to download pre-trained models. That is, you can find encoder-xxx.onnx |
| @@ -845,6 +848,30 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { | @@ -845,6 +848,30 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { | ||
| 845 | float max_speech_duration; | 848 | float max_speech_duration; |
| 846 | } SherpaOnnxSileroVadModelConfig; | 849 | } SherpaOnnxSileroVadModelConfig; |
| 847 | 850 | ||
| 851 | +SHERPA_ONNX_API typedef struct SherpaOnnxTenVadModelConfig { | ||
| 852 | + // Path to the ten-vad model | ||
| 853 | + const char *model; | ||
| 854 | + | ||
| 855 | + // threshold to classify a segment as speech | ||
| 856 | + // | ||
| 857 | + // If the predicted probability of a segment is larger than this | ||
| 858 | + // value, then it is classified as speech. | ||
| 859 | + float threshold; | ||
| 860 | + | ||
| 861 | + // in seconds | ||
| 862 | + float min_silence_duration; | ||
| 863 | + | ||
| 864 | + // in seconds | ||
| 865 | + float min_speech_duration; | ||
| 866 | + | ||
| 867 | + int32_t window_size; | ||
| 868 | + | ||
| 869 | + // If a speech segment is longer than this value, then we increase | ||
| 870 | + // the threshold to 0.9. After finishing detecting the segment, | ||
| 871 | + // the threshold value is reset to its original value. | ||
| 872 | + float max_speech_duration; | ||
| 873 | +} SherpaOnnxTenVadModelConfig; | ||
| 874 | + | ||
| 848 | SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { | 875 | SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { |
| 849 | SherpaOnnxSileroVadModelConfig silero_vad; | 876 | SherpaOnnxSileroVadModelConfig silero_vad; |
| 850 | 877 | ||
| @@ -852,6 +879,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { | @@ -852,6 +879,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { | ||
| 852 | int32_t num_threads; | 879 | int32_t num_threads; |
| 853 | const char *provider; | 880 | const char *provider; |
| 854 | int32_t debug; | 881 | int32_t debug; |
| 882 | + SherpaOnnxTenVadModelConfig ten_vad; | ||
| 855 | } SherpaOnnxVadModelConfig; | 883 | } SherpaOnnxVadModelConfig; |
| 856 | 884 | ||
| 857 | SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer | 885 | SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer |
| @@ -1567,9 +1595,6 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( | @@ -1567,9 +1595,6 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( | ||
| 1567 | SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( | 1595 | SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( |
| 1568 | const SherpaOnnxLinearResampler *p); | 1596 | const SherpaOnnxLinearResampler *p); |
| 1569 | 1597 | ||
| 1570 | -// Return 1 if the file exists; return 0 if the file does not exist. | ||
| 1571 | -SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); | ||
| 1572 | - | ||
| 1573 | // ========================================================================= | 1598 | // ========================================================================= |
| 1574 | // For offline speaker diarization (i.e., non-streaming speaker diarization) | 1599 | // For offline speaker diarization (i.e., non-streaming speaker diarization) |
| 1575 | // ========================================================================= | 1600 | // ========================================================================= |
| @@ -655,6 +655,13 @@ VoiceActivityDetector VoiceActivityDetector::Create( | @@ -655,6 +655,13 @@ VoiceActivityDetector VoiceActivityDetector::Create( | ||
| 655 | c.silero_vad.window_size = config.silero_vad.window_size; | 655 | c.silero_vad.window_size = config.silero_vad.window_size; |
| 656 | c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration; | 656 | c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration; |
| 657 | 657 | ||
| 658 | + c.ten_vad.model = config.ten_vad.model.c_str(); | ||
| 659 | + c.ten_vad.threshold = config.ten_vad.threshold; | ||
| 660 | + c.ten_vad.min_silence_duration = config.ten_vad.min_silence_duration; | ||
| 661 | + c.ten_vad.min_speech_duration = config.ten_vad.min_speech_duration; | ||
| 662 | + c.ten_vad.window_size = config.ten_vad.window_size; | ||
| 663 | + c.ten_vad.max_speech_duration = config.ten_vad.max_speech_duration; | ||
| 664 | + | ||
| 658 | c.sample_rate = config.sample_rate; | 665 | c.sample_rate = config.sample_rate; |
| 659 | c.num_threads = config.num_threads; | 666 | c.num_threads = config.num_threads; |
| 660 | c.provider = config.provider.c_str(); | 667 | c.provider = config.provider.c_str(); |
| @@ -758,4 +765,8 @@ std::string GetGitSha1() { return SherpaOnnxGetGitSha1(); } | @@ -758,4 +765,8 @@ std::string GetGitSha1() { return SherpaOnnxGetGitSha1(); } | ||
| 758 | 765 | ||
| 759 | std::string GetGitDate() { return SherpaOnnxGetGitDate(); } | 766 | std::string GetGitDate() { return SherpaOnnxGetGitDate(); } |
| 760 | 767 | ||
| 768 | +bool FileExists(const std::string &filename) { | ||
| 769 | + return SherpaOnnxFileExists(filename.c_str()); | ||
| 770 | +} | ||
| 771 | + | ||
| 761 | } // namespace sherpa_onnx::cxx | 772 | } // namespace sherpa_onnx::cxx |
| @@ -552,8 +552,18 @@ struct SileroVadModelConfig { | @@ -552,8 +552,18 @@ struct SileroVadModelConfig { | ||
| 552 | float max_speech_duration = 20; | 552 | float max_speech_duration = 20; |
| 553 | }; | 553 | }; |
| 554 | 554 | ||
| 555 | +struct TenVadModelConfig { | ||
| 556 | + std::string model; | ||
| 557 | + float threshold = 0.5; | ||
| 558 | + float min_silence_duration = 0.5; | ||
| 559 | + float min_speech_duration = 0.25; | ||
| 560 | + int32_t window_size = 256; | ||
| 561 | + float max_speech_duration = 20; | ||
| 562 | +}; | ||
| 563 | + | ||
| 555 | struct VadModelConfig { | 564 | struct VadModelConfig { |
| 556 | SileroVadModelConfig silero_vad; | 565 | SileroVadModelConfig silero_vad; |
| 566 | + TenVadModelConfig ten_vad; | ||
| 557 | 567 | ||
| 558 | int32_t sample_rate = 16000; | 568 | int32_t sample_rate = 16000; |
| 559 | int32_t num_threads = 1; | 569 | int32_t num_threads = 1; |
| @@ -642,6 +652,7 @@ class SHERPA_ONNX_API LinearResampler | @@ -642,6 +652,7 @@ class SHERPA_ONNX_API LinearResampler | ||
| 642 | std::string GetVersionStr(); | 652 | std::string GetVersionStr(); |
| 643 | std::string GetGitSha1(); | 653 | std::string GetGitSha1(); |
| 644 | std::string GetGitDate(); | 654 | std::string GetGitDate(); |
| 655 | +bool FileExists(const std::string &filename); | ||
| 645 | 656 | ||
| 646 | } // namespace sherpa_onnx::cxx | 657 | } // namespace sherpa_onnx::cxx |
| 647 | 658 |
| @@ -321,7 +321,7 @@ class TenVadModel::Impl { | @@ -321,7 +321,7 @@ class TenVadModel::Impl { | ||
| 321 | static void LogMel(const float *in, int32_t n, float *out) { | 321 | static void LogMel(const float *in, int32_t n, float *out) { |
| 322 | for (int32_t i = 0; i != n; ++i) { | 322 | for (int32_t i = 0; i != n; ++i) { |
| 323 | // 20.79441541679836 is log(32768*32768) | 323 | // 20.79441541679836 is log(32768*32768) |
| 324 | - out[i] = logf(in[i] + 1e-10) - 20.79441541679836f; | 324 | + out[i] = logf(in[i] + 1e-10f) - 20.79441541679836f; |
| 325 | } | 325 | } |
| 326 | } | 326 | } |
| 327 | 327 |
-
请 注册 或 登录 后发表评论