Fangjun Kuang
Committed by GitHub

Add CXX API for VAD (#2077)

@@ -81,6 +81,45 @@ jobs: @@ -81,6 +81,45 @@ jobs:
81 otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib 81 otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
82 fi 82 fi
83 83
  84 + - name: Test VAD
  85 + shell: bash
  86 + run: |
  87 + name=vad-cxx-api
  88 + g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \
  89 + -I ./build/install/include \
  90 + -L ./build/install/lib/ \
  91 + -l sherpa-onnx-cxx-api \
  92 + -l sherpa-onnx-c-api \
  93 + -l onnxruntime
  94 +
  95 + ls -lh $name
  96 +
  97 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  98 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  99 +
  100 + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
  101 + ldd ./$name
  102 + echo "----"
  103 + readelf -d ./$name
  104 + fi
  105 +
  106 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  107 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  108 +
  109 + ./$name
  110 +
  111 + mkdir vad-test
  112 + cp -v lei-jun-test*.wav vad-test
  113 +
  114 + ls -lh vad-test
  115 +
  116 + rm $name
  117 +
  118 + - uses: actions/upload-artifact@v4
  119 + with:
  120 + name: vad-test-wavs-cxx-${{ matrix.os }}
  121 + path: ./vad-test/*.wav
  122 +
84 - name: Test Speech Enhancement (GTCRN) 123 - name: Test Speech Enhancement (GTCRN)
85 shell: bash 124 shell: bash
86 run: | 125 run: |
@@ -53,6 +53,7 @@ def get_binaries(): @@ -53,6 +53,7 @@ def get_binaries():
53 "sherpa-onnx-microphone-offline-speaker-identification", 53 "sherpa-onnx-microphone-offline-speaker-identification",
54 "sherpa-onnx-offline", 54 "sherpa-onnx-offline",
55 "sherpa-onnx-offline-audio-tagging", 55 "sherpa-onnx-offline-audio-tagging",
  56 + "sherpa-onnx-offline-denoiser",
56 "sherpa-onnx-offline-language-identification", 57 "sherpa-onnx-offline-language-identification",
57 "sherpa-onnx-offline-punctuation", 58 "sherpa-onnx-offline-punctuation",
58 "sherpa-onnx-offline-speaker-diarization", 59 "sherpa-onnx-offline-speaker-diarization",
@@ -62,6 +63,7 @@ def get_binaries(): @@ -62,6 +63,7 @@ def get_binaries():
62 "sherpa-onnx-online-punctuation", 63 "sherpa-onnx-online-punctuation",
63 "sherpa-onnx-online-websocket-client", 64 "sherpa-onnx-online-websocket-client",
64 "sherpa-onnx-online-websocket-server", 65 "sherpa-onnx-online-websocket-server",
  66 + "sherpa-onnx-vad",
65 "sherpa-onnx-vad-microphone", 67 "sherpa-onnx-vad-microphone",
66 "sherpa-onnx-vad-microphone-offline-asr", 68 "sherpa-onnx-vad-microphone-offline-asr",
67 "sherpa-onnx-vad-with-offline-asr", 69 "sherpa-onnx-vad-with-offline-asr",
@@ -24,6 +24,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) @@ -24,6 +24,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api)
24 add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) 24 add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc)
25 target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) 25 target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api)
26 26
  27 +add_executable(vad-cxx-api ./vad-cxx-api.cc)
  28 +target_link_libraries(vad-cxx-api sherpa-onnx-cxx-api)
  29 +
27 if(SHERPA_ONNX_ENABLE_TTS) 30 if(SHERPA_ONNX_ENABLE_TTS)
28 add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc) 31 add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc)
29 target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api) 32 target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api)
  1 +// cxx-api-examples/vad-cxx-api.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +//
  6 +// This file demonstrates how to use VAD to remove silences from a file
  7 +// clang-format off
  8 +//
  9 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  10 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  11 +//
  12 +// clang-format on
  13 +#include <iostream>
  14 +#include <string>
  15 +
  16 +#include "sherpa-onnx/c-api/cxx-api.h"
  17 +
  18 +int32_t main() {
  19 + using namespace sherpa_onnx::cxx; // NOLINT
  20 +
  21 + std::string wave_filename = "./lei-jun-test.wav";
  22 + std::string vad_filename = "./silero_vad.onnx";
  23 +
  24 + VadModelConfig config;
  25 + config.silero_vad.model = vad_filename;
  26 + config.silero_vad.threshold = 0.1;
  27 + config.silero_vad.min_silence_duration = 0.5;
  28 + config.silero_vad.min_speech_duration = 0.25;
  29 + config.silero_vad.max_speech_duration = 20;
  30 + config.sample_rate = 16000;
  31 + config.debug = true;
  32 +
  33 + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
  34 + if (!vad.Get()) {
  35 + std::cerr << "Failed to create VAD. Please check your config\n";
  36 + return -1;
  37 + }
  38 +
  39 + Wave wave = ReadWave(wave_filename);
  40 + if (wave.samples.empty()) {
  41 + std::cerr << "Failed to read: '" << wave_filename << "'\n";
  42 + return -1;
  43 + }
  44 + bool is_eof = false;
  45 + int32_t i = 0;
  46 + int32_t window_size = config.silero_vad.window_size;
  47 +
  48 + int32_t sample_rate = config.sample_rate;
  49 +
  50 + std::vector<float> samples_without_silence;
  51 +
  52 + while (!is_eof) {
  53 + if (i + window_size < wave.samples.size()) {
  54 + vad.AcceptWaveform(wave.samples.data() + i, window_size);
  55 + i += window_size;
  56 + } else {
  57 + is_eof = true;
  58 + vad.Flush();
  59 + }
  60 +
  61 + while (!vad.IsEmpty()) {
  62 + auto segment = vad.Front();
  63 + float start_time = segment.start / static_cast<float>(sample_rate);
  64 + float end_time =
  65 + start_time + segment.samples.size() / static_cast<float>(sample_rate);
  66 + printf("%.3f -- %.3f\n", start_time, end_time);
  67 +
  68 + samples_without_silence.insert(samples_without_silence.end(),
  69 + segment.samples.begin(),
  70 + segment.samples.end());
  71 +
  72 + vad.Pop();
  73 + }
  74 + }
  75 +
  76 + bool ok = WriteWave("./lei-jun-test-no-silence.wav",
  77 + {samples_without_silence, sample_rate});
  78 + if (ok) {
  79 + std::cout << "Saved to ./lei-jun-test-no-silence.wav\n";
  80 + } else {
  81 + std::cerr << "Failed to write ./lei-jun-test-no-silence.wav\n";
  82 + }
  83 +
  84 + return 0;
  85 +}
@@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { @@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
785 // in seconds 785 // in seconds
786 float min_speech_duration; 786 float min_speech_duration;
787 787
788 - int window_size; 788 + int32_t window_size;
789 789
790 // If a speech segment is longer than this value, then we increase 790 // If a speech segment is longer than this value, then we increase
791 // the threshold to 0.9. After finishing detecting the segment, 791 // the threshold to 0.9. After finishing detecting the segment,
@@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const { @@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const {
558 return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_); 558 return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_);
559 } 559 }
560 560
  561 +CircularBuffer CircularBuffer::Create(int32_t capacity) {  // Factory: builds the C buffer and wraps it.
  562 + auto p = SherpaOnnxCreateCircularBuffer(capacity);  // C handle created with the requested capacity
  563 + return CircularBuffer(p);  // the private constructor passes the handle to the MoveOnly base
  564 +}
  565 +
  566 +CircularBuffer::CircularBuffer(const SherpaOnnxCircularBuffer *p)  // Wraps an existing C handle.
  567 + : MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer>(p) {}
  568 +
  569 +void CircularBuffer::Destroy(const SherpaOnnxCircularBuffer *p) const {  // Releases the C handle p.
  570 + SherpaOnnxDestroyCircularBuffer(p);  // forwards p to the C API destroy function
  571 +}
  572 +
  573 +void CircularBuffer::Push(const float *samples, int32_t n) const {  // Pushes n samples read from `samples`.
  574 + SherpaOnnxCircularBufferPush(p_, samples, n);  // forwards to the C API using the wrapped handle p_
  575 +}
  576 +
  577 +std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
  578 + const float *samples = SherpaOnnxCircularBufferGet(p_, start_index, n);
  579 + std::vector<float> ans(n);
  580 + std::copy(samples, samples + n, ans.begin());
  581 +
  582 + SherpaOnnxCircularBufferFree(samples);
  583 + return ans;
  584 +}
  585 +
  586 +void CircularBuffer::Pop(int32_t n) const {  // Forwards a pop request for n to the C buffer.
  587 + SherpaOnnxCircularBufferPop(p_, n);
  588 +}
  589 +
  590 +int32_t CircularBuffer::Size() const {  // Returns the size reported by the C API.
  591 + return SherpaOnnxCircularBufferSize(p_);
  592 +}
  593 +
  594 +int32_t CircularBuffer::Head() const {  // Returns the head value reported by the C API.
  595 + return SherpaOnnxCircularBufferHead(p_);
  596 +}
  597 +
  598 +void CircularBuffer::Reset() const { SherpaOnnxCircularBufferReset(p_); }  // Forwards a reset to the C buffer.
  599 +
  600 +VoiceActivityDetector VoiceActivityDetector::Create(  // Factory: converts the C++ config to the C struct and creates the detector.
  601 + const VadModelConfig &config, float buffer_size_in_seconds) {
  602 + struct SherpaOnnxVadModelConfig c;
  603 + memset(&c, 0, sizeof(c));  // zero every field first; only the fields assigned below are set explicitly
  604 +
  605 + c.silero_vad.model = config.silero_vad.model.c_str();  // pointer borrowed from config; valid for the duration of this call
  606 + c.silero_vad.threshold = config.silero_vad.threshold;
  607 + c.silero_vad.min_silence_duration = config.silero_vad.min_silence_duration;
  608 + c.silero_vad.min_speech_duration = config.silero_vad.min_speech_duration;
  609 + c.silero_vad.window_size = config.silero_vad.window_size;
  610 + c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration;
  611 +
  612 + c.sample_rate = config.sample_rate;
  613 + c.num_threads = config.num_threads;
  614 + c.provider = config.provider.c_str();  // pointer borrowed from config, as for the model path
  615 + c.debug = config.debug;
  616 +
  617 + auto p = SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds);
  618 + return VoiceActivityDetector(p);  // callers check Get() on the result, as the example program does
  619 +}
  620 +
  621 +VoiceActivityDetector::VoiceActivityDetector(  // Wraps an existing C detector handle.
  622 + const SherpaOnnxVoiceActivityDetector *p)
  623 + : MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector>(p) {}
  624 +
  625 +void VoiceActivityDetector::Destroy(  // Releases the C detector handle p.
  626 + const SherpaOnnxVoiceActivityDetector *p) const {
  627 + SherpaOnnxDestroyVoiceActivityDetector(p);
  628 +}
  629 +
  630 +void VoiceActivityDetector::AcceptWaveform(const float *samples,
  631 + int32_t n) const {
  632 + SherpaOnnxVoiceActivityDetectorAcceptWaveform(p_, samples, n);  // feeds n samples to the C detector
  633 +}
  634 +
  635 +bool VoiceActivityDetector::IsEmpty() const {  // Result of the C API "Empty" query.
  636 + return SherpaOnnxVoiceActivityDetectorEmpty(p_);
  637 +}
  638 +
  639 +bool VoiceActivityDetector ::IsDetected() const {  // Result of the C API "Detected" query.
  640 + return SherpaOnnxVoiceActivityDetectorDetected(p_);
  641 +}
  642 +
  643 +void VoiceActivityDetector::Pop() const {  // Forwards to the C API Pop.
  644 + SherpaOnnxVoiceActivityDetectorPop(p_);
  645 +}
  646 +
  647 +void VoiceActivityDetector::Clear() const {  // Forwards to the C API Clear.
  648 + SherpaOnnxVoiceActivityDetectorClear(p_);
  649 +}
  650 +
  651 +SpeechSegment VoiceActivityDetector::Front() const {
  652 + auto f = SherpaOnnxVoiceActivityDetectorFront(p_);
  653 +
  654 + SpeechSegment segment;
  655 + segment.start = f->start;
  656 + segment.samples = std::vector<float>{f->samples, f->samples + f->n};
  657 +
  658 + SherpaOnnxDestroySpeechSegment(f);
  659 +
  660 + return segment;
  661 +}
  662 +
  663 +void VoiceActivityDetector::Reset() const {  // Forwards to the C API Reset.
  664 + SherpaOnnxVoiceActivityDetectorReset(p_);
  665 +}
  666 +
  667 +void VoiceActivityDetector::Flush() const {  // Forwards to the C API Flush; the example calls it once the input is exhausted.
  668 + SherpaOnnxVoiceActivityDetectorFlush(p_);
  669 +}
  670 +
561 } // namespace sherpa_onnx::cxx 671 } // namespace sherpa_onnx::cxx
@@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser @@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser
500 explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p); 500 explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p);
501 }; 501 };
502 502
  503 +// ==============================
  504 +// VAD
  505 +// ==============================
  506 +
  507 +struct SileroVadModelConfig {  // C++ counterpart of the silero_vad part of the C VAD config.
  508 + std::string model;  // model file; the example sets it to ./silero_vad.onnx
  509 + float threshold = 0.5;  // NOTE(review): presumably the speech-probability threshold -- confirm
  510 + float min_silence_duration = 0.5;  // presumably in seconds, like min_speech_duration -- confirm
  511 + float min_speech_duration = 0.25;  // in seconds, per the comment in the C struct
  512 + int32_t window_size = 512;  // number of samples passed per AcceptWaveform() call in the example
  513 + float max_speech_duration = 20;  // the C struct comment says the threshold is raised to 0.9 for longer segments
  514 +};
  515 +
  516 +struct VadModelConfig {  // Top-level VAD configuration consumed by VoiceActivityDetector::Create().
  517 + SileroVadModelConfig silero_vad;  // model-specific settings
  518 +
  519 + int32_t sample_rate = 16000;  // the example divides sample counts by this value to obtain seconds
  520 + int32_t num_threads = 1;  // copied unchanged into the C config
  521 + std::string provider = "cpu";  // copied into the C config as a C string
  522 + bool debug = false;  // copied unchanged into the C config
  523 +};
  524 +
  525 +struct SpeechSegment {
  526 + int32_t start;
  527 + std::vector<float> samples;
  528 +};
  529 +
  530 +class SHERPA_ONNX_API CircularBuffer  // C++ wrapper around the C SherpaOnnxCircularBuffer handle.
  531 + : public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
  532 + public:
  533 + static CircularBuffer Create(int32_t capacity);  // factory; the constructor is private
  534 +
  535 + void Destroy(const SherpaOnnxCircularBuffer *p) const;  // releases the C handle p
  536 +
  537 + void Push(const float *p, int32_t n) const;  // pushes n samples read from p
  538 +
  539 + std::vector<float> Get(int32_t start_index, int32_t n) const;  // returns a copy of n samples requested at start_index
  540 +
  541 + void Pop(int32_t n) const;  // forwards a pop request for n to the C API
  542 +
  543 + int32_t Size() const;  // size reported by the C API
  544 +
  545 + int32_t Head() const;  // head value reported by the C API
  546 +
  547 + void Reset() const;  // forwards a reset to the C API
  548 +
  549 + private:
  550 + explicit CircularBuffer(const SherpaOnnxCircularBuffer *p);  // wraps an existing C handle; used by Create()
  551 +};
  552 +
  553 +class SHERPA_ONNX_API VoiceActivityDetector  // C++ wrapper around the C SherpaOnnxVoiceActivityDetector handle.
  554 + : public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
  555 + public:
  556 + static VoiceActivityDetector Create(const VadModelConfig &config,  // factory; check Get() on the result as the example does
  557 + float buffer_size_in_seconds);
  558 +
  559 + void Destroy(const SherpaOnnxVoiceActivityDetector *p) const;  // releases the C handle p
  560 +
  561 + void AcceptWaveform(const float *samples, int32_t n) const;  // feeds n samples to the detector
  562 +
  563 + bool IsEmpty() const;  // the example loops on !IsEmpty() to drain segments with Front()/Pop()
  564 +
  565 + bool IsDetected() const;  // result of the C API "Detected" query
  566 +
  567 + void Pop() const;  // forwards to the C API Pop; the example calls it after each Front()
  568 +
  569 + void Clear() const;  // forwards to the C API Clear
  570 +
  571 + SpeechSegment Front() const;  // returns a copy of the front segment (start index and samples)
  572 +
  573 + void Reset() const;  // forwards to the C API Reset
  574 +
  575 + void Flush() const;  // forwards to the C API Flush; the example calls it at end of input
  576 +
  577 + private:
  578 + explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);  // wraps an existing C handle; used by Create()
  579 +};
  580 +
503 } // namespace sherpa_onnx::cxx 581 } // namespace sherpa_onnx::cxx
504 582
505 #endif // SHERPA_ONNX_C_API_CXX_API_H_ 583 #endif // SHERPA_ONNX_C_API_CXX_API_H_
@@ -317,11 +317,12 @@ if(SHERPA_ONNX_ENABLE_BINARY) @@ -317,11 +317,12 @@ if(SHERPA_ONNX_ENABLE_BINARY)
317 add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc) 317 add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc)
318 add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc) 318 add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc)
319 add_executable(sherpa-onnx-offline-audio-tagging sherpa-onnx-offline-audio-tagging.cc) 319 add_executable(sherpa-onnx-offline-audio-tagging sherpa-onnx-offline-audio-tagging.cc)
  320 + add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc)
320 add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc) 321 add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc)
321 add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc) 322 add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc)
322 add_executable(sherpa-onnx-offline-punctuation sherpa-onnx-offline-punctuation.cc) 323 add_executable(sherpa-onnx-offline-punctuation sherpa-onnx-offline-punctuation.cc)
323 add_executable(sherpa-onnx-online-punctuation sherpa-onnx-online-punctuation.cc) 324 add_executable(sherpa-onnx-online-punctuation sherpa-onnx-online-punctuation.cc)
324 - add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc) 325 + add_executable(sherpa-onnx-vad sherpa-onnx-vad.cc)
325 326
326 if(SHERPA_ONNX_ENABLE_TTS) 327 if(SHERPA_ONNX_ENABLE_TTS)
327 add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc) 328 add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
@@ -336,11 +337,12 @@ if(SHERPA_ONNX_ENABLE_BINARY) @@ -336,11 +337,12 @@ if(SHERPA_ONNX_ENABLE_BINARY)
336 sherpa-onnx-keyword-spotter 337 sherpa-onnx-keyword-spotter
337 sherpa-onnx-offline 338 sherpa-onnx-offline
338 sherpa-onnx-offline-audio-tagging 339 sherpa-onnx-offline-audio-tagging
  340 + sherpa-onnx-offline-denoiser
339 sherpa-onnx-offline-language-identification 341 sherpa-onnx-offline-language-identification
340 sherpa-onnx-offline-parallel 342 sherpa-onnx-offline-parallel
341 sherpa-onnx-offline-punctuation 343 sherpa-onnx-offline-punctuation
342 - sherpa-onnx-offline-denoiser  
343 sherpa-onnx-online-punctuation 344 sherpa-onnx-online-punctuation
  345 + sherpa-onnx-vad
344 ) 346 )
345 if(SHERPA_ONNX_ENABLE_TTS) 347 if(SHERPA_ONNX_ENABLE_TTS)
346 list(APPEND main_exes 348 list(APPEND main_exes
@@ -7,9 +7,9 @@ @@ -7,9 +7,9 @@
7 #include <stdlib.h> 7 #include <stdlib.h>
8 8
9 #include <algorithm> 9 #include <algorithm>
  10 +#include <iomanip>
10 11
11 #include "sherpa-onnx/csrc/alsa.h" 12 #include "sherpa-onnx/csrc/alsa.h"
12 -#include "sherpa-onnx/csrc/circular-buffer.h"  
13 #include "sherpa-onnx/csrc/voice-activity-detector.h" 13 #include "sherpa-onnx/csrc/voice-activity-detector.h"
14 #include "sherpa-onnx/csrc/wave-writer.h" 14 #include "sherpa-onnx/csrc/wave-writer.h"
15 15
@@ -84,8 +84,6 @@ as the device_name. @@ -84,8 +84,6 @@ as the device_name.
84 exit(-1); 84 exit(-1);
85 } 85 }
86 86
87 - int32_t chunk = 0.1 * alsa.GetActualSampleRate();  
88 -  
89 auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config); 87 auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config);
90 88
91 fprintf(stderr, "Started. Please speak\n"); 89 fprintf(stderr, "Started. Please speak\n");
@@ -95,36 +93,34 @@ as the device_name. @@ -95,36 +93,34 @@ as the device_name.
95 93
96 int32_t k = 0; 94 int32_t k = 0;
97 while (!stop) { 95 while (!stop) {
98 - {  
99 - const std::vector<float> &samples = alsa.Read(chunk);  
100 -  
101 - vad->AcceptWaveform(samples.data(), samples.size());  
102 -  
103 - if (vad->IsSpeechDetected() && !printed) {  
104 - printed = true;  
105 - fprintf(stderr, "\nDetected speech!\n");  
106 - }  
107 - if (!vad->IsSpeechDetected()) {  
108 - printed = false;  
109 - }  
110 -  
111 - while (!vad->Empty()) {  
112 - const auto &segment = vad->Front();  
113 - float duration =  
114 - segment.samples.size() / static_cast<float>(sample_rate);  
115 -  
116 - fprintf(stderr, "Duration: %.3f seconds\n", duration);  
117 -  
118 - char filename[128];  
119 - snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration);  
120 - k += 1;  
121 - sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(),  
122 - segment.samples.size());  
123 - fprintf(stderr, "Saved to %s\n", filename);  
124 - fprintf(stderr, "----------\n");  
125 -  
126 - vad->Pop();  
127 - } 96 + const std::vector<float> &samples = alsa.Read(window_size);
  97 +
  98 + vad->AcceptWaveform(samples.data(), samples.size());
  99 +
  100 + if (vad->IsSpeechDetected() && !printed) {
  101 + printed = true;
  102 + fprintf(stderr, "\nDetected speech!\n");
  103 + }
  104 + if (!vad->IsSpeechDetected()) {
  105 + printed = false;
  106 + }
  107 +
  108 + while (!vad->Empty()) {
  109 + const auto &segment = vad->Front();
  110 + float duration = segment.samples.size() / static_cast<float>(sample_rate);
  111 +
  112 + fprintf(stderr, "Duration: %.3f seconds\n", duration);
  113 +
  114 + std::ostringstream os;
  115 + os << "seg-" << k << "-" << std::fixed << std::setprecision(3) << duration
  116 + << "s.wav";
  117 + k += 1;
  118 + sherpa_onnx::WriteWave(os.str(), 16000, segment.samples.data(),
  119 + segment.samples.size());
  120 + fprintf(stderr, "Saved to %s\n", os.str().c_str());
  121 + fprintf(stderr, "----------\n");
  122 +
  123 + vad->Pop();
128 } 124 }
129 } 125 }
130 126
  1 +// sherpa-onnx/csrc/sherpa-onnx-vad.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include <stdio.h>
  6 +#include <stdlib.h>
  7 +
  8 +#include <algorithm>
  9 +#include <iomanip>
  10 +
  11 +#include "sherpa-onnx/csrc/voice-activity-detector.h"
  12 +#include "sherpa-onnx/csrc/wave-reader.h"
  13 +#include "sherpa-onnx/csrc/wave-writer.h"
  14 +
  15 +int32_t main(int32_t argc, char *argv[]) {
  16 + const char *kUsageMessage = R"usage(
  17 +This program shows how to use VAD in sherpa-onnx
  18 +to remove silences from a file.
  19 +
  20 + ./bin/sherpa-onnx-vad \
  21 + --silero-vad-model=/path/to/silero_vad.onnx \
  22 + /path/to/input.wav
  23 + /path/to/output.wav
  24 +
  25 +Please download silero_vad.onnx from
  26 +https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
  27 +
  28 +For instance, use
  29 +wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
  30 +
  31 +input.wav should be 16kHz.
  32 +)usage";
  33 +
  34 + sherpa_onnx::ParseOptions po(kUsageMessage);
  35 + sherpa_onnx::VadModelConfig config;
  36 +
  37 + config.Register(&po);
  38 + po.Read(argc, argv);
  39 + if (po.NumArgs() != 2) {
  40 + fprintf(
  41 + stderr,
  42 + "Please provide only 2 argument2: the input wav and the output wav\n");
  43 + po.PrintUsage();
  44 + exit(EXIT_FAILURE);
  45 + }
  46 +
  47 + fprintf(stderr, "%s\n", config.ToString().c_str());
  48 +
  49 + if (!config.Validate()) {
  50 + fprintf(stderr, "Errors in config!\n");
  51 + return -1;
  52 + }
  53 +
  54 + std::string wav_filename = po.GetArg(1);
  55 + int32_t sampling_rate = -1;
  56 +
  57 + bool is_ok = false;
  58 + std::vector<float> samples =
  59 + sherpa_onnx::ReadWave(wav_filename, &sampling_rate, &is_ok);
  60 +
  61 + if (!is_ok) {
  62 + fprintf(stderr, "Failed to read '%s'\n", wav_filename.c_str());
  63 + return -1;
  64 + }
  65 +
  66 + if (sampling_rate != 16000) {
  67 + fprintf(stderr, "Support only 16000Hz. Given: %d\n", sampling_rate);
  68 + return -1;
  69 + }
  70 +
  71 + auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config);
  72 +
  73 + int32_t window_size = config.silero_vad.window_size;
  74 +
  75 + int32_t i = 0;
  76 + bool is_eof = false;
  77 +
  78 + std::vector<float> samples_without_silence;
  79 +
  80 + while (!is_eof) {
  81 + if (i + window_size < samples.size()) {
  82 + vad->AcceptWaveform(samples.data() + i, window_size);
  83 + i += window_size;
  84 + } else {
  85 + vad->Flush();
  86 + is_eof = true;
  87 + }
  88 +
  89 + while (!vad->Empty()) {
  90 + const auto &segment = vad->Front();
  91 + float start_time = segment.start / static_cast<float>(sampling_rate);
  92 + float end_time = start_time + segment.samples.size() /
  93 + static_cast<float>(sampling_rate);
  94 +
  95 + fprintf(stderr, "%.3f -- %.3f\n", start_time, end_time);
  96 + samples_without_silence.insert(samples_without_silence.end(),
  97 + segment.samples.begin(),
  98 + segment.samples.end());
  99 + vad->Pop();
  100 + }
  101 + }
  102 +
  103 + sherpa_onnx::WriteWave(po.GetArg(2), sampling_rate,
  104 + samples_without_silence.data(),
  105 + samples_without_silence.size());
  106 +
  107 + fprintf(stderr, "Saved to %s\n", po.GetArg(2).c_str());
  108 +
  109 + return 0;
  110 +}