Fangjun Kuang
Committed by GitHub

Limit the maximum segment length for VAD. (#990)

@@ -190,6 +190,14 @@ class SileroVadModel::Impl { @@ -190,6 +190,14 @@ class SileroVadModel::Impl {
190 190
191 int32_t MinSpeechDurationSamples() const { return min_speech_samples_; } 191 int32_t MinSpeechDurationSamples() const { return min_speech_samples_; }
192 192
  // Recomputes min_silence_samples_ as sample_rate_ * s, replacing whatever
  // value it held before.
  // NOTE(review): `s` is presumably a duration in seconds, given that it is
  // scaled by the sample rate — confirm against the callers.
  193 + void SetMinSilenceDuration(float s) {
  // NOTE(review): the product is converted implicitly on assignment; assuming
  // min_silence_samples_ is an integer sample count (its declaration is not
  // in this hunk), any fractional part is dropped — confirm.
  194 + min_silence_samples_ = sample_rate_ * s;
  195 + }
  196 +
  // Overwrites config_.silero_vad.threshold with `threshold`.
  // The value is stored as given; this method performs no range check.
  197 + void SetThreshold(float threshold) {
  198 + config_.silero_vad.threshold = threshold;
  199 + }
  200 +
193 private: 201 private:
194 void Init(void *model_data, size_t model_data_length) { 202 void Init(void *model_data, size_t model_data_length) {
195 sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, 203 sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
@@ -306,4 +314,12 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const { @@ -306,4 +314,12 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const {
306 return impl_->MinSpeechDurationSamples(); 314 return impl_->MinSpeechDurationSamples();
307 } 315 }
308 316
  // Public entry point: passes `s` unchanged to Impl::SetMinSilenceDuration()
  // through impl_.
  317 +void SileroVadModel::SetMinSilenceDuration(float s) {
  318 + impl_->SetMinSilenceDuration(s);
  319 +}
  320 +
  // Public entry point: passes `threshold` unchanged to Impl::SetThreshold()
  // through impl_.
  321 +void SileroVadModel::SetThreshold(float threshold) {
  322 + impl_->SetThreshold(threshold);
  323 +}
  324 +
309 } // namespace sherpa_onnx 325 } // namespace sherpa_onnx
@@ -42,6 +42,9 @@ class SileroVadModel : public VadModel { @@ -42,6 +42,9 @@ class SileroVadModel : public VadModel {
42 int32_t MinSilenceDurationSamples() const override; 42 int32_t MinSilenceDurationSamples() const override;
43 int32_t MinSpeechDurationSamples() const override; 43 int32_t MinSpeechDurationSamples() const override;
44 44
  // Setters declared with `override`, i.e. implementations of the matching
  // virtual functions of the VadModel base class.
  45 + void SetMinSilenceDuration(float s) override;
  46 + void SetThreshold(float threshold) override;
  47 +
45 private: 48 private:
46 class Impl; 49 class Impl;
47 std::unique_ptr<Impl> impl_; 50 std::unique_ptr<Impl> impl_;
@@ -42,6 +42,8 @@ class VadModel { @@ -42,6 +42,8 @@ class VadModel {
42 42
43 virtual int32_t MinSilenceDurationSamples() const = 0; 43 virtual int32_t MinSilenceDurationSamples() const = 0;
44 virtual int32_t MinSpeechDurationSamples() const = 0; 44 virtual int32_t MinSpeechDurationSamples() const = 0;
  // Pure-virtual setters: every concrete VadModel has to define both.
  // NOTE(review): `s` is presumably a duration in seconds and `threshold` a
  // speech-probability cut-off — confirm against the implementations.
  45 + virtual void SetMinSilenceDuration(float s) = 0;
  46 + virtual void SetThreshold(float threshold) = 0;
45 }; 47 };
46 48
47 } // namespace sherpa_onnx 49 } // namespace sherpa_onnx
@@ -29,6 +29,14 @@ class VoiceActivityDetector::Impl { @@ -29,6 +29,14 @@ class VoiceActivityDetector::Impl {
29 #endif 29 #endif
30 30
31 void AcceptWaveform(const float *samples, int32_t n) { 31 void AcceptWaveform(const float *samples, int32_t n) {
  32 + if (buffer_.Size() > max_utterance_length_) {
  33 + model_->SetMinSilenceDuration(new_min_silence_duration_s_);
  34 + model_->SetThreshold(new_threshold_);
  35 + } else {
  36 + model_->SetMinSilenceDuration(config_.silero_vad.min_silence_duration);
  37 + model_->SetThreshold(config_.silero_vad.threshold);
  38 + }
  39 +
32 int32_t window_size = model_->WindowSize(); 40 int32_t window_size = model_->WindowSize();
33 41
34 // note n is usually window_size and there is no need to use 42 // note n is usually window_size and there is no need to use
@@ -114,6 +122,10 @@ class VoiceActivityDetector::Impl { @@ -114,6 +122,10 @@ class VoiceActivityDetector::Impl {
114 CircularBuffer buffer_; 122 CircularBuffer buffer_;
115 std::vector<float> last_; 123 std::vector<float> last_;
116 124
  // Values used by AcceptWaveform(): when buffer_.Size() exceeds
  // max_utterance_length_, the model is given new_min_silence_duration_s_ and
  // new_threshold_; otherwise it is given the values from config_.silero_vad.
  // NOTE(review): 16000 * 20 is presumably 20 s at 16 kHz; the constant is
  // hard-coded and is not derived from the model's sample rate — confirm.
  125 + int max_utterance_length_ = 16000 * 20; // in samples
  126 + float new_min_silence_duration_s_ = 0.1;
  // NOTE(review): if the threshold is compared against a probability in
  // [0, 1], 1.10 can never be reached; presumably intentional, to force an
  // over-long segment to end — confirm.
  127 + float new_threshold_ = 1.10;
  128 +
117 int32_t start_ = -1; 129 int32_t start_ = -1;
118 }; 130 };
119 131