Committed by GitHub
Fix for silero vad v5. (#1065)
For 16kHz, the network input of silero vad v5 is 64 + 512 = 576 samples (512 new samples plus a 64-sample overlap with the previous window) instead of 512 samples.
正在显示 3 个修改的文件，包含 18 行增加和 12 行删除
| @@ -74,9 +74,8 @@ class SileroVadModel::Impl { | @@ -74,9 +74,8 @@ class SileroVadModel::Impl { | ||
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | bool IsSpeech(const float *samples, int32_t n) { | 76 | bool IsSpeech(const float *samples, int32_t n) { |
| 77 | - if (n != config_.silero_vad.window_size) { | ||
| 78 | - SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, | ||
| 79 | - config_.silero_vad.window_size); | 77 | + if (n != WindowSize()) { |
| 78 | + SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize()); | ||
| 80 | exit(-1); | 79 | exit(-1); |
| 81 | } | 80 | } |
| 82 | 81 | ||
| @@ -146,9 +145,11 @@ class SileroVadModel::Impl { | @@ -146,9 +145,11 @@ class SileroVadModel::Impl { | ||
| 146 | return false; | 145 | return false; |
| 147 | } | 146 | } |
| 148 | 147 | ||
| 149 | - int32_t WindowSize() const { return config_.silero_vad.window_size; } | 148 | + int32_t WindowShift() const { return config_.silero_vad.window_size; } |
| 150 | 149 | ||
| 151 | - int32_t WindowShift() const { return WindowSize() - window_shift_; } | 150 | + int32_t WindowSize() const { |
| 151 | + return config_.silero_vad.window_size + window_overlap_; | ||
| 152 | + } | ||
| 152 | 153 | ||
| 153 | int32_t MinSilenceDurationSamples() const { return min_silence_samples_; } | 154 | int32_t MinSilenceDurationSamples() const { return min_silence_samples_; } |
| 154 | 155 | ||
| @@ -177,9 +178,9 @@ class SileroVadModel::Impl { | @@ -177,9 +178,9 @@ class SileroVadModel::Impl { | ||
| 177 | 178 | ||
| 178 | // 64 for 16kHz | 179 | // 64 for 16kHz |
| 179 | // 32 for 8kHz | 180 | // 32 for 8kHz |
| 180 | - window_shift_ = 64; | 181 | + window_overlap_ = 64; |
| 181 | 182 | ||
| 182 | - if (WindowSize() != 512) { | 183 | + if (config_.silero_vad.window_size != 512) { |
| 183 | SHERPA_ONNX_LOGE( | 184 | SHERPA_ONNX_LOGE( |
| 184 | "For silero_vad v5, we require window_size to be 512 for 16kHz"); | 185 | "For silero_vad v5, we require window_size to be 512 for 16kHz"); |
| 185 | exit(-1); | 186 | exit(-1); |
| @@ -423,7 +424,7 @@ class SileroVadModel::Impl { | @@ -423,7 +424,7 @@ class SileroVadModel::Impl { | ||
| 423 | int32_t temp_start_ = 0; | 424 | int32_t temp_start_ = 0; |
| 424 | int32_t temp_end_ = 0; | 425 | int32_t temp_end_ = 0; |
| 425 | 426 | ||
| 426 | - int32_t window_shift_ = 0; | 427 | + int32_t window_overlap_ = 0; |
| 427 | 428 | ||
| 428 | bool is_v5_ = false; | 429 | bool is_v5_ = false; |
| 429 | }; | 430 | }; |
| @@ -37,11 +37,12 @@ class SileroVadModel : public VadModel { | @@ -37,11 +37,12 @@ class SileroVadModel : public VadModel { | ||
| 37 | */ | 37 | */ |
| 38 | bool IsSpeech(const float *samples, int32_t n) override; | 38 | bool IsSpeech(const float *samples, int32_t n) override; |
| 39 | 39 | ||
| 40 | + // For silero vad V4, it is WindowShift(). | ||
| 41 | + // For silero vad V5, it is WindowShift()+64 for 16kHz and | ||
| 42 | + // WindowShift()+32 for 8kHz | ||
| 40 | int32_t WindowSize() const override; | 43 | int32_t WindowSize() const override; |
| 41 | 44 | ||
| 42 | - // For silero vad V4, it is WindowSize(). | ||
| 43 | - // For silero vad V5, it is WindowSize()-64 for 16kHz and | ||
| 44 | - // WindowSize()-32 for 8kHz | 45 | + // 512 |
| 45 | int32_t WindowShift() const override; | 46 | int32_t WindowShift() const override; |
| 46 | 47 | ||
| 47 | int32_t MinSilenceDurationSamples() const override; | 48 | int32_t MinSilenceDurationSamples() const override; |
| @@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl { | @@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl { | ||
| 44 | // an extra buffer here | 44 | // an extra buffer here |
| 45 | last_.insert(last_.end(), samples, samples + n); | 45 | last_.insert(last_.end(), samples, samples + n); |
| 46 | 46 | ||
| 47 | + if (last_.size() < window_size) { | ||
| 48 | + return; | ||
| 49 | + } | ||
| 50 | + | ||
| 47 | // Note: For v4, window_shift == window_size | 51 | // Note: For v4, window_shift == window_size |
| 48 | int32_t k = | 52 | int32_t k = |
| 49 | (static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1; | 53 | (static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1; |
| 50 | const float *p = last_.data(); | 54 | const float *p = last_.data(); |
| 51 | bool is_speech = false; | 55 | bool is_speech = false; |
| 52 | 56 | ||
| 53 | - for (int32_t i = 0; i != k; ++i, p += window_shift) { | 57 | + for (int32_t i = 0; i < k; ++i, p += window_shift) { |
| 54 | buffer_.Push(p, window_shift); | 58 | buffer_.Push(p, window_shift); |
| 55 | // NOTE(fangjun): Please don't use a very large n. | 59 | // NOTE(fangjun): Please don't use a very large n. |
| 56 | bool this_window_is_speech = model_->IsSpeech(p, window_size); | 60 | bool this_window_is_speech = model_->IsSpeech(p, window_size); |
-
请 注册 或 登录 后发表评论