Limit the maximum segment length for VAD. (#990)

Fangjun Kuang · GitHub
Commit 208da78343838b754ec26dc117ddf6f0b8d1b3d7 208da783 1 parent aac86847
sherpa-onnx/csrc/silero-vad-model.cc
sherpa-onnx/csrc/silero-vad-model.h
sherpa-onnx/csrc/vad-model.h
sherpa-onnx/csrc/voice-activity-detector.cc
--- a/sherpa-onnx/csrc/silero-vad-model.cc
查看文件 @208da78
+++ b/sherpa-onnx/csrc/silero-vad-model.cc
查看文件 @208da78
@@ -190,6 +190,14 @@ class SileroVadModel::Impl {
 
   int32_t MinSpeechDurationSamples() const { return min_speech_samples_; }
 
+   void SetMinSilenceDuration(float s) {  // s: new minimum silence duration; presumably seconds, since it is scaled by sample_rate_
+     min_silence_samples_ = sample_rate_ * s;  // store the duration as a sample count
+   }
+ 
+   void SetThreshold(float threshold) {  // replaces the threshold held in config_
+     config_.silero_vad.threshold = threshold;  // NOTE(review): no range check here; callers may pass any value
+   }
+ 
  private:
   void Init(void *model_data, size_t model_data_length) {
     sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
@@ -306,4 +314,12 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const {
   return impl_->MinSpeechDurationSamples();
 }
 
+ void SileroVadModel::SetMinSilenceDuration(float s) {  // public entry point; the work is done by Impl
+   impl_->SetMinSilenceDuration(s);  // forward unchanged
+ }
+ 
+ void SileroVadModel::SetThreshold(float threshold) {  // public entry point; the work is done by Impl
+   impl_->SetThreshold(threshold);  // forward unchanged
+ }
+ 
 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/silero-vad-model.h
查看文件 @208da78
+++ b/sherpa-onnx/csrc/silero-vad-model.h
查看文件 @208da78
@@ -42,6 +42,9 @@ class SileroVadModel : public VadModel {
   int32_t MinSilenceDurationSamples() const override;
   int32_t MinSpeechDurationSamples() const override;
 
+   void SetMinSilenceDuration(float s) override;  // s: minimum silence duration (presumably in seconds -- confirm)
+   void SetThreshold(float threshold) override;  // replaces the threshold stored in the model's config
+ 
  private:
   class Impl;
   std::unique_ptr<Impl> impl_;
--- a/sherpa-onnx/csrc/vad-model.h
查看文件 @208da78
+++ b/sherpa-onnx/csrc/vad-model.h
查看文件 @208da78
@@ -42,6 +42,8 @@ class VadModel {
 
   virtual int32_t MinSilenceDurationSamples() const = 0;
   virtual int32_t MinSpeechDurationSamples() const = 0;
+   virtual void SetMinSilenceDuration(float s) = 0;  // s: minimum silence duration (presumably in seconds -- confirm)
+   virtual void SetThreshold(float threshold) = 0;  // sets the threshold the model uses from now on
 };
 
 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/voice-activity-detector.cc
查看文件 @208da78
+++ b/sherpa-onnx/csrc/voice-activity-detector.cc
查看文件 @208da78
@@ -29,6 +29,14 @@ class VoiceActivityDetector::Impl {
 #endif
 
   void AcceptWaveform(const float *samples, int32_t n) {
+     if (buffer_.Size() > max_utterance_length_) {
+       model_->SetMinSilenceDuration(new_min_silence_duration_s_);
+       model_->SetThreshold(new_threshold_);
+     } else {
+       model_->SetMinSilenceDuration(config_.silero_vad.min_silence_duration);
+       model_->SetThreshold(config_.silero_vad.threshold);
+     }
+ 
     int32_t window_size = model_->WindowSize();
 
     // note n is usually window_size and there is no need to use
@@ -114,6 +122,10 @@ class VoiceActivityDetector::Impl {
   CircularBuffer buffer_;
   std::vector<float> last_;
 
+   int32_t max_utterance_length_ = 16000 * 20;  // in samples; compared against buffer_.Size() in AcceptWaveform (20 s if the rate is 16 kHz -- hard-coded, confirm)
+   float new_min_silence_duration_s_ = 0.1f;  // min silence duration passed to the model while buffer_ exceeds max_utterance_length_
+   float new_threshold_ = 1.10f;  // threshold passed to the model in that same case; NOTE(review): above 1.0 -- presumably deliberate to force the segment to end, confirm
+ 
   int32_t start_ = -1;
 };