Fangjun Kuang
Committed by GitHub

Add C API for ten-vad (#2379)

@@ -376,7 +376,7 @@ jobs: @@ -376,7 +376,7 @@ jobs:
376 name: matcha-tts-${{ matrix.os }} 376 name: matcha-tts-${{ matrix.os }}
377 path: ./generated-matcha-*.wav 377 path: ./generated-matcha-*.wav
378 378
379 - - name: Test vad + Whisper tiny.en 379 + - name: Test silero-vad + Whisper tiny.en
380 shell: bash 380 shell: bash
381 run: | 381 run: |
382 gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \ 382 gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \
@@ -403,7 +403,34 @@ jobs: @@ -403,7 +403,34 @@ jobs:
403 rm -rf *.onnx 403 rm -rf *.onnx
404 rm *.wav 404 rm *.wav
405 405
406 - - name: Test vad + Moonshine 406 + - name: Test ten-vad + Whisper tiny.en
  407 + shell: bash
  408 + run: |
  409 + gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \
  410 + -I ./build/install/include \
  411 + -L ./build/install/lib/ \
  412 + -l sherpa-onnx-c-api \
  413 + -l onnxruntime
  414 +
  415 + # Now download models
  416 + #
  417 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  418 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
  419 +
  420 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  421 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  422 + rm sherpa-onnx-whisper-tiny.en.tar.bz2
  423 +
  424 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  425 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  426 +
  427 + ./vad-whisper-c-api
  428 +
  429 + rm -rf sherpa-onnx-*
  430 + rm -rf *.onnx
  431 + rm *.wav
  432 +
  433 + - name: Test silero-vad + Moonshine
407 shell: bash 434 shell: bash
408 run: | 435 run: |
409 gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \ 436 gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \
@@ -430,6 +457,33 @@ jobs: @@ -430,6 +457,33 @@ jobs:
430 rm -rf *.onnx 457 rm -rf *.onnx
431 rm *.wav 458 rm *.wav
432 459
  460 + - name: Test ten-vad + Moonshine
  461 + shell: bash
  462 + run: |
  463 + gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \
  464 + -I ./build/install/include \
  465 + -L ./build/install/lib/ \
  466 + -l sherpa-onnx-c-api \
  467 + -l onnxruntime
  468 +
  469 + # Now download models
  470 + #
  471 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  472 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
  473 +
  474 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  475 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  476 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  477 +
  478 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  479 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  480 +
  481 + ./vad-moonshine-c-api
  482 +
  483 + rm -rf sherpa-onnx-*
  484 + rm -rf *.onnx
  485 + rm *.wav
  486 +
433 - name: Test Moonshine 487 - name: Test Moonshine
434 shell: bash 488 shell: bash
435 run: | 489 run: |
@@ -466,7 +520,7 @@ jobs: @@ -466,7 +520,7 @@ jobs:
466 ./run.sh 520 ./run.sh
467 rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 521 rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
468 522
469 - - name: Test vad + sense-voice 523 + - name: Test silero-vad + sense-voice
470 shell: bash 524 shell: bash
471 run: | 525 run: |
472 gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \ 526 gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \
@@ -505,6 +559,45 @@ jobs: @@ -505,6 +559,45 @@ jobs:
505 rm -rf *.onnx 559 rm -rf *.onnx
506 rm *.wav 560 rm *.wav
507 561
  562 + - name: Test ten-vad + sense-voice
  563 + shell: bash
  564 + run: |
  565 + gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \
  566 + -I ./build/install/include \
  567 + -L ./build/install/lib/ \
  568 + -l sherpa-onnx-c-api \
  569 + -l onnxruntime
  570 +
  571 + ls -lh vad-sense-voice-c-api
  572 +
  573 + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
  574 + ldd ./vad-sense-voice-c-api
  575 + echo "----"
  576 + readelf -d ./vad-sense-voice-c-api
  577 + fi
  578 +
  579 + # Now download models
  580 + #
  581 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  582 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  583 +
  584 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  585 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  586 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  587 +
  588 + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
  589 + echo "---"
  590 + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs
  591 +
  592 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  593 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  594 +
  595 + ./vad-sense-voice-c-api
  596 +
  597 + rm -rf sherpa-onnx-sense-voice-*
  598 + rm -rf *.onnx
  599 + rm *.wav
  600 +
508 - name: Test sense-voice 601 - name: Test sense-voice
509 shell: bash 602 shell: bash
510 run: | 603 run: |
@@ -6,7 +6,12 @@ @@ -6,7 +6,12 @@
6 // This file demonstrates how to use VAD + Moonshine with sherpa-onnx's C API. 6 // This file demonstrates how to use VAD + Moonshine with sherpa-onnx's C API.
7 // clang-format off 7 // clang-format off
8 // 8 //
9 -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx 9 +// To use silero-vad:
  10 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  11 +//
  12 +// To use ten-vad:
  13 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  14 +//
10 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav 15 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
11 // 16 //
12 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 17 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
@@ -23,7 +28,27 @@ @@ -23,7 +28,27 @@
23 28
24 int32_t main() { 29 int32_t main() {
25 const char *wav_filename = "./Obama.wav"; 30 const char *wav_filename = "./Obama.wav";
26 - const char *vad_filename = "./silero_vad.onnx"; 31 + if (!SherpaOnnxFileExists(wav_filename)) {
  32 + fprintf(stderr, "Please download %s\n", wav_filename);
  33 + return -1;
  34 + }
  35 +
  36 + const char *vad_filename;
  37 + int32_t use_silero_vad = 0;
  38 + int32_t use_ten_vad = 0;
  39 +
  40 + if (SherpaOnnxFileExists("./silero_vad.onnx")) {
  41 + printf("Use silero-vad\n");
  42 + vad_filename = "./silero_vad.onnx";
  43 + use_silero_vad = 1;
  44 + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) {
  45 + printf("Use ten-vad\n");
  46 + vad_filename = "./ten-vad.onnx";
  47 + use_ten_vad = 1;
  48 + } else {
  49 + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");
  50 + return -1;
  51 + }
27 52
28 const char *preprocessor = 53 const char *preprocessor =
29 "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx"; 54 "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx";
@@ -76,12 +101,22 @@ int32_t main() { @@ -76,12 +101,22 @@ int32_t main() {
76 101
77 SherpaOnnxVadModelConfig vadConfig; 102 SherpaOnnxVadModelConfig vadConfig;
78 memset(&vadConfig, 0, sizeof(vadConfig)); 103 memset(&vadConfig, 0, sizeof(vadConfig));
79 - vadConfig.silero_vad.model = vad_filename;  
80 - vadConfig.silero_vad.threshold = 0.5;  
81 - vadConfig.silero_vad.min_silence_duration = 0.5;  
82 - vadConfig.silero_vad.min_speech_duration = 0.5;  
83 - vadConfig.silero_vad.max_speech_duration = 10;  
84 - vadConfig.silero_vad.window_size = 512; 104 + if (use_silero_vad) {
  105 + vadConfig.silero_vad.model = vad_filename;
  106 + vadConfig.silero_vad.threshold = 0.25;
  107 + vadConfig.silero_vad.min_silence_duration = 0.5;
  108 + vadConfig.silero_vad.min_speech_duration = 0.5;
  109 + vadConfig.silero_vad.max_speech_duration = 10;
  110 + vadConfig.silero_vad.window_size = 512;
  111 + } else if (use_ten_vad) {
  112 + vadConfig.ten_vad.model = vad_filename;
  113 + vadConfig.ten_vad.threshold = 0.25;
  114 + vadConfig.ten_vad.min_silence_duration = 0.5;
  115 + vadConfig.ten_vad.min_speech_duration = 0.5;
  116 + vadConfig.ten_vad.max_speech_duration = 10;
  117 + vadConfig.ten_vad.window_size = 256;
  118 + }
  119 +
85 vadConfig.sample_rate = 16000; 120 vadConfig.sample_rate = 16000;
86 vadConfig.num_threads = 1; 121 vadConfig.num_threads = 1;
87 vadConfig.debug = 1; 122 vadConfig.debug = 1;
@@ -96,7 +131,9 @@ int32_t main() { @@ -96,7 +131,9 @@ int32_t main() {
96 return -1; 131 return -1;
97 } 132 }
98 133
99 - int32_t window_size = vadConfig.silero_vad.window_size; 134 + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
  135 + : vadConfig.ten_vad.window_size;
  136 +
100 int32_t i = 0; 137 int32_t i = 0;
101 int is_eof = 0; 138 int is_eof = 0;
102 139
@@ -6,7 +6,12 @@ @@ -6,7 +6,12 @@
6 // This file demonstrates how to use VAD + SenseVoice with sherpa-onnx's C API. 6 // This file demonstrates how to use VAD + SenseVoice with sherpa-onnx's C API.
7 // clang-format off 7 // clang-format off
8 // 8 //
9 -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx 9 +// To use silero-vad:
  10 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  11 +//
  12 +// To use ten-vad:
  13 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  14 +//
10 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav 15 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
11 // 16 //
12 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 17 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
@@ -23,7 +28,28 @@ @@ -23,7 +28,28 @@
23 28
24 int32_t main() { 29 int32_t main() {
25 const char *wav_filename = "./lei-jun-test.wav"; 30 const char *wav_filename = "./lei-jun-test.wav";
26 - const char *vad_filename = "./silero_vad.onnx"; 31 + if (!SherpaOnnxFileExists(wav_filename)) {
  32 + fprintf(stderr, "Please download %s\n", wav_filename);
  33 + return -1;
  34 + }
  35 +
  36 + const char *vad_filename;
  37 + int32_t use_silero_vad = 0;
  38 + int32_t use_ten_vad = 0;
  39 +
  40 + if (SherpaOnnxFileExists("./silero_vad.onnx")) {
  41 + printf("Use silero-vad\n");
  42 + vad_filename = "./silero_vad.onnx";
  43 + use_silero_vad = 1;
  44 + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) {
  45 + printf("Use ten-vad\n");
  46 + vad_filename = "./ten-vad.onnx";
  47 + use_ten_vad = 1;
  48 + } else {
  49 + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");
  50 + return -1;
  51 + }
  52 +
27 const char *model_filename = 53 const char *model_filename =
28 "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; 54 "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
29 const char *tokens_filename = 55 const char *tokens_filename =
@@ -77,12 +103,23 @@ int32_t main() { @@ -77,12 +103,23 @@ int32_t main() {
77 103
78 SherpaOnnxVadModelConfig vadConfig; 104 SherpaOnnxVadModelConfig vadConfig;
79 memset(&vadConfig, 0, sizeof(vadConfig)); 105 memset(&vadConfig, 0, sizeof(vadConfig));
80 - vadConfig.silero_vad.model = vad_filename;  
81 - vadConfig.silero_vad.threshold = 0.5;  
82 - vadConfig.silero_vad.min_silence_duration = 0.5;  
83 - vadConfig.silero_vad.min_speech_duration = 0.5;  
84 - vadConfig.silero_vad.max_speech_duration = 5;  
85 - vadConfig.silero_vad.window_size = 512; 106 +
  107 + if (use_silero_vad) {
  108 + vadConfig.silero_vad.model = vad_filename;
  109 + vadConfig.silero_vad.threshold = 0.25;
  110 + vadConfig.silero_vad.min_silence_duration = 0.5;
  111 + vadConfig.silero_vad.min_speech_duration = 0.5;
  112 + vadConfig.silero_vad.max_speech_duration = 10;
  113 + vadConfig.silero_vad.window_size = 512;
  114 + } else if (use_ten_vad) {
  115 + vadConfig.ten_vad.model = vad_filename;
  116 + vadConfig.ten_vad.threshold = 0.25;
  117 + vadConfig.ten_vad.min_silence_duration = 0.5;
  118 + vadConfig.ten_vad.min_speech_duration = 0.5;
  119 + vadConfig.ten_vad.max_speech_duration = 10;
  120 + vadConfig.ten_vad.window_size = 256;
  121 + }
  122 +
86 vadConfig.sample_rate = 16000; 123 vadConfig.sample_rate = 16000;
87 vadConfig.num_threads = 1; 124 vadConfig.num_threads = 1;
88 vadConfig.debug = 1; 125 vadConfig.debug = 1;
@@ -97,7 +134,8 @@ int32_t main() { @@ -97,7 +134,8 @@ int32_t main() {
97 return -1; 134 return -1;
98 } 135 }
99 136
100 - int32_t window_size = vadConfig.silero_vad.window_size; 137 + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
  138 + : vadConfig.ten_vad.window_size;
101 int32_t i = 0; 139 int32_t i = 0;
102 int is_eof = 0; 140 int is_eof = 0;
103 141
@@ -8,7 +8,12 @@ @@ -8,7 +8,12 @@
8 // 8 //
9 // clang-format off 9 // clang-format off
10 // 10 //
11 -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx 11 +// To use silero-vad:
  12 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  13 +//
  14 +// To use ten-vad:
  15 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  16 +//
12 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav 17 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
13 // 18 //
14 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 19 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
@@ -25,7 +30,28 @@ @@ -25,7 +30,28 @@
25 30
26 int32_t main() { 31 int32_t main() {
27 const char *wav_filename = "./Obama.wav"; 32 const char *wav_filename = "./Obama.wav";
28 - const char *vad_filename = "./silero_vad.onnx"; 33 +
  34 + if (!SherpaOnnxFileExists(wav_filename)) {
  35 + fprintf(stderr, "Please download %s\n", wav_filename);
  36 + return -1;
  37 + }
  38 +
  39 + const char *vad_filename;
  40 + int32_t use_silero_vad = 0;
  41 + int32_t use_ten_vad = 0;
  42 +
  43 + if (SherpaOnnxFileExists("./silero_vad.onnx")) {
  44 + printf("Use silero-vad\n");
  45 + vad_filename = "./silero_vad.onnx";
  46 + use_silero_vad = 1;
  47 + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) {
  48 + printf("Use ten-vad\n");
  49 + vad_filename = "./ten-vad.onnx";
  50 + use_ten_vad = 1;
  51 + } else {
  52 + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");
  53 + return -1;
  54 + }
29 55
30 const char *encoder = "sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"; 56 const char *encoder = "sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx";
31 const char *decoder = "sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"; 57 const char *decoder = "sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx";
@@ -74,12 +100,23 @@ int32_t main() { @@ -74,12 +100,23 @@ int32_t main() {
74 100
75 SherpaOnnxVadModelConfig vadConfig; 101 SherpaOnnxVadModelConfig vadConfig;
76 memset(&vadConfig, 0, sizeof(vadConfig)); 102 memset(&vadConfig, 0, sizeof(vadConfig));
77 - vadConfig.silero_vad.model = vad_filename;  
78 - vadConfig.silero_vad.threshold = 0.5;  
79 - vadConfig.silero_vad.min_silence_duration = 0.5;  
80 - vadConfig.silero_vad.min_speech_duration = 0.5;  
81 - vadConfig.silero_vad.max_speech_duration = 10;  
82 - vadConfig.silero_vad.window_size = 512; 103 +
  104 + if (use_silero_vad) {
  105 + vadConfig.silero_vad.model = vad_filename;
  106 + vadConfig.silero_vad.threshold = 0.25;
  107 + vadConfig.silero_vad.min_silence_duration = 0.5;
  108 + vadConfig.silero_vad.min_speech_duration = 0.5;
  109 + vadConfig.silero_vad.max_speech_duration = 10;
  110 + vadConfig.silero_vad.window_size = 512;
  111 + } else if (use_ten_vad) {
  112 + vadConfig.ten_vad.model = vad_filename;
  113 + vadConfig.ten_vad.threshold = 0.25;
  114 + vadConfig.ten_vad.min_silence_duration = 0.5;
  115 + vadConfig.ten_vad.min_speech_duration = 0.5;
  116 + vadConfig.ten_vad.max_speech_duration = 10;
  117 + vadConfig.ten_vad.window_size = 256;
  118 + }
  119 +
83 vadConfig.sample_rate = 16000; 120 vadConfig.sample_rate = 16000;
84 vadConfig.num_threads = 1; 121 vadConfig.num_threads = 1;
85 vadConfig.debug = 1; 122 vadConfig.debug = 1;
@@ -94,7 +131,8 @@ int32_t main() { @@ -94,7 +131,8 @@ int32_t main() {
94 return -1; 131 return -1;
95 } 132 }
96 133
97 - int32_t window_size = vadConfig.silero_vad.window_size; 134 + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
  135 + : vadConfig.ten_vad.window_size;
98 int32_t i = 0; 136 int32_t i = 0;
99 int is_eof = 0; 137 int is_eof = 0;
100 138
@@ -1033,6 +1033,21 @@ sherpa_onnx::VadModelConfig GetVadModelConfig( @@ -1033,6 +1033,21 @@ sherpa_onnx::VadModelConfig GetVadModelConfig(
1033 vad_config.silero_vad.max_speech_duration = 1033 vad_config.silero_vad.max_speech_duration =
1034 SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20); 1034 SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);
1035 1035
  1036 + vad_config.ten_vad.model = SHERPA_ONNX_OR(config->ten_vad.model, "");
  1037 + vad_config.ten_vad.threshold = SHERPA_ONNX_OR(config->ten_vad.threshold, 0.5);
  1038 +
  1039 + vad_config.ten_vad.min_silence_duration =
  1040 + SHERPA_ONNX_OR(config->ten_vad.min_silence_duration, 0.5);
  1041 +
  1042 + vad_config.ten_vad.min_speech_duration =
  1043 + SHERPA_ONNX_OR(config->ten_vad.min_speech_duration, 0.25);
  1044 +
  1045 + vad_config.ten_vad.window_size =
  1046 + SHERPA_ONNX_OR(config->ten_vad.window_size, 256);
  1047 +
  1048 + vad_config.ten_vad.max_speech_duration =
  1049 + SHERPA_ONNX_OR(config->ten_vad.max_speech_duration, 20);
  1050 +
1036 vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); 1051 vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
1037 vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); 1052 vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
1038 vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); 1053 vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
@@ -71,6 +71,9 @@ SHERPA_ONNX_API const char *SherpaOnnxGetGitSha1(); @@ -71,6 +71,9 @@ SHERPA_ONNX_API const char *SherpaOnnxGetGitSha1();
71 // Example return value: "Fri Jun 20 11:22:52 2025" 71 // Example return value: "Fri Jun 20 11:22:52 2025"
72 SHERPA_ONNX_API const char *SherpaOnnxGetGitDate(); 72 SHERPA_ONNX_API const char *SherpaOnnxGetGitDate();
73 73
  74 +// return 1 if the given file exists; return 0 otherwise
  75 +SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
  76 +
74 /// Please refer to 77 /// Please refer to
75 /// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html 78 /// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
76 /// to download pre-trained models. That is, you can find encoder-xxx.onnx 79 /// to download pre-trained models. That is, you can find encoder-xxx.onnx
@@ -845,6 +848,30 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { @@ -845,6 +848,30 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
845 float max_speech_duration; 848 float max_speech_duration;
846 } SherpaOnnxSileroVadModelConfig; 849 } SherpaOnnxSileroVadModelConfig;
847 850
  851 +SHERPA_ONNX_API typedef struct SherpaOnnxTenVadModelConfig {
  852 + // Path to the ten-vad model
  853 + const char *model;
  854 +
  855 + // threshold to classify a segment as speech
  856 + //
  857 + // If the predicted probability of a segment is larger than this
  858 + // value, then it is classified as speech.
  859 + float threshold;
  860 +
  861 + // in seconds
  862 + float min_silence_duration;
  863 +
  864 + // in seconds
  865 + float min_speech_duration;
  866 +
  867 + int32_t window_size;
  868 +
  869 + // If a speech segment is longer than this value, then we increase
  870 + // the threshold to 0.9. After finishing detecting the segment,
  871 + // the threshold value is reset to its original value.
  872 + float max_speech_duration;
  873 +} SherpaOnnxTenVadModelConfig;
  874 +
848 SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { 875 SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
849 SherpaOnnxSileroVadModelConfig silero_vad; 876 SherpaOnnxSileroVadModelConfig silero_vad;
850 877
@@ -852,6 +879,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { @@ -852,6 +879,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
852 int32_t num_threads; 879 int32_t num_threads;
853 const char *provider; 880 const char *provider;
854 int32_t debug; 881 int32_t debug;
  882 + SherpaOnnxTenVadModelConfig ten_vad;
855 } SherpaOnnxVadModelConfig; 883 } SherpaOnnxVadModelConfig;
856 884
857 SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer 885 SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
@@ -1567,9 +1595,6 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( @@ -1567,9 +1595,6 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
1567 SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( 1595 SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
1568 const SherpaOnnxLinearResampler *p); 1596 const SherpaOnnxLinearResampler *p);
1569 1597
1570 -// Return 1 if the file exists; return 0 if the file does not exist.  
1571 -SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);  
1572 -  
1573 // ========================================================================= 1598 // =========================================================================
1574 // For offline speaker diarization (i.e., non-streaming speaker diarization) 1599 // For offline speaker diarization (i.e., non-streaming speaker diarization)
1575 // ========================================================================= 1600 // =========================================================================
@@ -655,6 +655,13 @@ VoiceActivityDetector VoiceActivityDetector::Create( @@ -655,6 +655,13 @@ VoiceActivityDetector VoiceActivityDetector::Create(
655 c.silero_vad.window_size = config.silero_vad.window_size; 655 c.silero_vad.window_size = config.silero_vad.window_size;
656 c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration; 656 c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration;
657 657
  658 + c.ten_vad.model = config.ten_vad.model.c_str();
  659 + c.ten_vad.threshold = config.ten_vad.threshold;
  660 + c.ten_vad.min_silence_duration = config.ten_vad.min_silence_duration;
  661 + c.ten_vad.min_speech_duration = config.ten_vad.min_speech_duration;
  662 + c.ten_vad.window_size = config.ten_vad.window_size;
  663 + c.ten_vad.max_speech_duration = config.ten_vad.max_speech_duration;
  664 +
658 c.sample_rate = config.sample_rate; 665 c.sample_rate = config.sample_rate;
659 c.num_threads = config.num_threads; 666 c.num_threads = config.num_threads;
660 c.provider = config.provider.c_str(); 667 c.provider = config.provider.c_str();
@@ -758,4 +765,8 @@ std::string GetGitSha1() { return SherpaOnnxGetGitSha1(); } @@ -758,4 +765,8 @@ std::string GetGitSha1() { return SherpaOnnxGetGitSha1(); }
758 765
759 std::string GetGitDate() { return SherpaOnnxGetGitDate(); } 766 std::string GetGitDate() { return SherpaOnnxGetGitDate(); }
760 767
  768 +bool FileExists(const std::string &filename) {
  769 + return SherpaOnnxFileExists(filename.c_str());
  770 +}
  771 +
761 } // namespace sherpa_onnx::cxx 772 } // namespace sherpa_onnx::cxx
@@ -552,8 +552,18 @@ struct SileroVadModelConfig { @@ -552,8 +552,18 @@ struct SileroVadModelConfig {
552 float max_speech_duration = 20; 552 float max_speech_duration = 20;
553 }; 553 };
554 554
  555 +struct TenVadModelConfig {
  556 + std::string model;
  557 + float threshold = 0.5;
  558 + float min_silence_duration = 0.5;
  559 + float min_speech_duration = 0.25;
  560 + int32_t window_size = 256;
  561 + float max_speech_duration = 20;
  562 +};
  563 +
555 struct VadModelConfig { 564 struct VadModelConfig {
556 SileroVadModelConfig silero_vad; 565 SileroVadModelConfig silero_vad;
  566 + TenVadModelConfig ten_vad;
557 567
558 int32_t sample_rate = 16000; 568 int32_t sample_rate = 16000;
559 int32_t num_threads = 1; 569 int32_t num_threads = 1;
@@ -642,6 +652,7 @@ class SHERPA_ONNX_API LinearResampler @@ -642,6 +652,7 @@ class SHERPA_ONNX_API LinearResampler
642 std::string GetVersionStr(); 652 std::string GetVersionStr();
643 std::string GetGitSha1(); 653 std::string GetGitSha1();
644 std::string GetGitDate(); 654 std::string GetGitDate();
  655 +bool FileExists(const std::string &filename);
645 656
646 } // namespace sherpa_onnx::cxx 657 } // namespace sherpa_onnx::cxx
647 658
@@ -321,7 +321,7 @@ class TenVadModel::Impl { @@ -321,7 +321,7 @@ class TenVadModel::Impl {
321 static void LogMel(const float *in, int32_t n, float *out) { 321 static void LogMel(const float *in, int32_t n, float *out) {
322 for (int32_t i = 0; i != n; ++i) { 322 for (int32_t i = 0; i != n; ++i) {
323 // 20.79441541679836 is log(32768*32768) 323 // 20.79441541679836 is log(32768*32768)
324 - out[i] = logf(in[i] + 1e-10) - 20.79441541679836f; 324 + out[i] = logf(in[i] + 1e-10f) - 20.79441541679836f;
325 } 325 }
326 } 326 }
327 327