Fangjun Kuang
Committed by GitHub

Fix for silero vad v5. (#1065)

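For 16kHz audio, the silero vad v5 network input is 64 + 512 = 576 samples per window (64 samples overlapping the previous window plus a 512-sample shift), instead of 512 samples.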
@@ -74,9 +74,8 @@ class SileroVadModel::Impl { @@ -74,9 +74,8 @@ class SileroVadModel::Impl {
74 } 74 }
75 75
76 bool IsSpeech(const float *samples, int32_t n) { 76 bool IsSpeech(const float *samples, int32_t n) {
77 - if (n != config_.silero_vad.window_size) {  
78 - SHERPA_ONNX_LOGE("n: %d != window_size: %d", n,  
79 - config_.silero_vad.window_size); 77 + if (n != WindowSize()) {
  78 + SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize());
80 exit(-1); 79 exit(-1);
81 } 80 }
82 81
@@ -146,9 +145,11 @@ class SileroVadModel::Impl { @@ -146,9 +145,11 @@ class SileroVadModel::Impl {
146 return false; 145 return false;
147 } 146 }
148 147
149 - int32_t WindowSize() const { return config_.silero_vad.window_size; } 148 + int32_t WindowShift() const { return config_.silero_vad.window_size; }
150 149
151 - int32_t WindowShift() const { return WindowSize() - window_shift_; } 150 + int32_t WindowSize() const {
  151 + return config_.silero_vad.window_size + window_overlap_;
  152 + }
152 153
153 int32_t MinSilenceDurationSamples() const { return min_silence_samples_; } 154 int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }
154 155
@@ -177,9 +178,9 @@ class SileroVadModel::Impl { @@ -177,9 +178,9 @@ class SileroVadModel::Impl {
177 178
178 // 64 for 16kHz 179 // 64 for 16kHz
179 // 32 for 8kHz 180 // 32 for 8kHz
180 - window_shift_ = 64; 181 + window_overlap_ = 64;
181 182
182 - if (WindowSize() != 512) { 183 + if (config_.silero_vad.window_size != 512) {
183 SHERPA_ONNX_LOGE( 184 SHERPA_ONNX_LOGE(
184 "For silero_vad v5, we require window_size to be 512 for 16kHz"); 185 "For silero_vad v5, we require window_size to be 512 for 16kHz");
185 exit(-1); 186 exit(-1);
@@ -423,7 +424,7 @@ class SileroVadModel::Impl { @@ -423,7 +424,7 @@ class SileroVadModel::Impl {
423 int32_t temp_start_ = 0; 424 int32_t temp_start_ = 0;
424 int32_t temp_end_ = 0; 425 int32_t temp_end_ = 0;
425 426
426 - int32_t window_shift_ = 0; 427 + int32_t window_overlap_ = 0;
427 428
428 bool is_v5_ = false; 429 bool is_v5_ = false;
429 }; 430 };
@@ -37,11 +37,12 @@ class SileroVadModel : public VadModel { @@ -37,11 +37,12 @@ class SileroVadModel : public VadModel {
37 */ 37 */
38 bool IsSpeech(const float *samples, int32_t n) override; 38 bool IsSpeech(const float *samples, int32_t n) override;
39 39
  40 + // For silero vad V4, it is WindowShift().
  41 + // For silero vad V5, it is WindowShift()+64 for 16kHz and
  42 + // WindowShift()+32 for 8kHz
40 int32_t WindowSize() const override; 43 int32_t WindowSize() const override;
41 44
42 - // For silero vad V4, it is WindowSize().  
43 - // For silero vad V5, it is WindowSize()-64 for 16kHz and  
44 - // WindowSize()-32 for 8kHz 45 + // 512
45 int32_t WindowShift() const override; 46 int32_t WindowShift() const override;
46 47
47 int32_t MinSilenceDurationSamples() const override; 48 int32_t MinSilenceDurationSamples() const override;
@@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl { @@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl {
44 // an extra buffer here 44 // an extra buffer here
45 last_.insert(last_.end(), samples, samples + n); 45 last_.insert(last_.end(), samples, samples + n);
46 46
  47 + if (last_.size() < window_size) {
  48 + return;
  49 + }
  50 +
47 // Note: For v4, window_shift == window_size 51 // Note: For v4, window_shift == window_size
48 int32_t k = 52 int32_t k =
49 (static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1; 53 (static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1;
50 const float *p = last_.data(); 54 const float *p = last_.data();
51 bool is_speech = false; 55 bool is_speech = false;
52 56
53 - for (int32_t i = 0; i != k; ++i, p += window_shift) { 57 + for (int32_t i = 0; i < k; ++i, p += window_shift) {
54 buffer_.Push(p, window_shift); 58 buffer_.Push(p, window_shift);
55 // NOTE(fangjun): Please don't use a very large n. 59 // NOTE(fangjun): Please don't use a very large n.
56 bool this_window_is_speech = model_->IsSpeech(p, window_size); 60 bool this_window_is_speech = model_->IsSpeech(p, window_size);