Committed by GitHub
Limit the maximum segment length for VAD. (#990)
Showing 4 changed files with 33 additions and 0 deletions
| @@ -190,6 +190,14 @@ class SileroVadModel::Impl { | @@ -190,6 +190,14 @@ class SileroVadModel::Impl { | ||
| 190 | 190 | ||
| 191 | int32_t MinSpeechDurationSamples() const { return min_speech_samples_; } | 191 | int32_t MinSpeechDurationSamples() const { return min_speech_samples_; } |
| 192 | 192 | ||
| 193 | + void SetMinSilenceDuration(float s) { | ||
| 194 | + min_silence_samples_ = sample_rate_ * s; | ||
| 195 | + } | ||
| 196 | + | ||
| 197 | + void SetThreshold(float threshold) { | ||
| 198 | + config_.silero_vad.threshold = threshold; | ||
| 199 | + } | ||
| 200 | + | ||
| 193 | private: | 201 | private: |
| 194 | void Init(void *model_data, size_t model_data_length) { | 202 | void Init(void *model_data, size_t model_data_length) { |
| 195 | sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | 203 | sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, |
| @@ -306,4 +314,12 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const { | @@ -306,4 +314,12 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const { | ||
| 306 | return impl_->MinSpeechDurationSamples(); | 314 | return impl_->MinSpeechDurationSamples(); |
| 307 | } | 315 | } |
| 308 | 316 | ||
| 317 | +void SileroVadModel::SetMinSilenceDuration(float s) { | ||
| 318 | + impl_->SetMinSilenceDuration(s); | ||
| 319 | +} | ||
| 320 | + | ||
| 321 | +void SileroVadModel::SetThreshold(float threshold) { | ||
| 322 | + impl_->SetThreshold(threshold); | ||
| 323 | +} | ||
| 324 | + | ||
| 309 | } // namespace sherpa_onnx | 325 | } // namespace sherpa_onnx |
| @@ -42,6 +42,9 @@ class SileroVadModel : public VadModel { | @@ -42,6 +42,9 @@ class SileroVadModel : public VadModel { | ||
| 42 | int32_t MinSilenceDurationSamples() const override; | 42 | int32_t MinSilenceDurationSamples() const override; |
| 43 | int32_t MinSpeechDurationSamples() const override; | 43 | int32_t MinSpeechDurationSamples() const override; |
| 44 | 44 | ||
| 45 | + void SetMinSilenceDuration(float s) override; | ||
| 46 | + void SetThreshold(float threshold) override; | ||
| 47 | + | ||
| 45 | private: | 48 | private: |
| 46 | class Impl; | 49 | class Impl; |
| 47 | std::unique_ptr<Impl> impl_; | 50 | std::unique_ptr<Impl> impl_; |
| @@ -42,6 +42,8 @@ class VadModel { | @@ -42,6 +42,8 @@ class VadModel { | ||
| 42 | 42 | ||
| 43 | virtual int32_t MinSilenceDurationSamples() const = 0; | 43 | virtual int32_t MinSilenceDurationSamples() const = 0; |
| 44 | virtual int32_t MinSpeechDurationSamples() const = 0; | 44 | virtual int32_t MinSpeechDurationSamples() const = 0; |
| 45 | + virtual void SetMinSilenceDuration(float s) = 0; | ||
| 46 | + virtual void SetThreshold(float threshold) = 0; | ||
| 45 | }; | 47 | }; |
| 46 | 48 | ||
| 47 | } // namespace sherpa_onnx | 49 | } // namespace sherpa_onnx |
| @@ -29,6 +29,14 @@ class VoiceActivityDetector::Impl { | @@ -29,6 +29,14 @@ class VoiceActivityDetector::Impl { | ||
| 29 | #endif | 29 | #endif |
| 30 | 30 | ||
| 31 | void AcceptWaveform(const float *samples, int32_t n) { | 31 | void AcceptWaveform(const float *samples, int32_t n) { |
| 32 | + if (buffer_.Size() > max_utterance_length_) { | ||
| 33 | + model_->SetMinSilenceDuration(new_min_silence_duration_s_); | ||
| 34 | + model_->SetThreshold(new_threshold_); | ||
| 35 | + } else { | ||
| 36 | + model_->SetMinSilenceDuration(config_.silero_vad.min_silence_duration); | ||
| 37 | + model_->SetThreshold(config_.silero_vad.threshold); | ||
| 38 | + } | ||
| 39 | + | ||
| 32 | int32_t window_size = model_->WindowSize(); | 40 | int32_t window_size = model_->WindowSize(); |
| 33 | 41 | ||
| 34 | // note n is usually window_size and there is no need to use | 42 | // note n is usually window_size and there is no need to use |
| @@ -114,6 +122,10 @@ class VoiceActivityDetector::Impl { | @@ -114,6 +122,10 @@ class VoiceActivityDetector::Impl { | ||
| 114 | CircularBuffer buffer_; | 122 | CircularBuffer buffer_; |
| 115 | std::vector<float> last_; | 123 | std::vector<float> last_; |
| 116 | 124 | ||
| 125 | + int max_utterance_length_ = 16000 * 20; // in samples | ||
| 126 | + float new_min_silence_duration_s_ = 0.1; | ||
| 127 | + float new_threshold_ = 1.10; | ||
| 128 | + | ||
| 117 | int32_t start_ = -1; | 129 | int32_t start_ = -1; |
| 118 | }; | 130 | }; |
| 119 | 131 |
-
Please register or log in to post a comment.