Support specifying max speech duration for VAD. (#1348)

Fangjun Kuang · GitHub
Commit 1423ddb1f030addd7070ba309bc63885517d33ba 1423ddb1 1 parent 5d761712
python-api-examples/generate-subtitles.py
sherpa-onnx/csrc/silero-vad-model-config.cc
sherpa-onnx/csrc/silero-vad-model-config.h
sherpa-onnx/csrc/voice-activity-detector.cc
sherpa-onnx/python/csrc/silero-vad-model-config.cc
--- a/python-api-examples/generate-subtitles.py
查看文件 @1423ddb
+++ b/python-api-examples/generate-subtitles.py
查看文件 @1423ddb
@@ -406,7 +406,14 @@ def main():
 
     config = sherpa_onnx.VadModelConfig()
     config.silero_vad.model = args.silero_vad_model
-     config.silero_vad.min_silence_duration = 0.25
+     config.silero_vad.threshold = 0.5
+     config.silero_vad.min_silence_duration = 0.25  # seconds
+     config.silero_vad.min_speech_duration = 0.25  # seconds
+ 
+     # If the current speech segment grows longer than this value, the VAD
+     # internally raises the threshold to 0.9. Once that segment has been
+     # detected, the threshold is reset to its original value.
+     config.silero_vad.max_speech_duration = 5  # seconds
     config.sample_rate = args.sample_rate
 
     window_size = config.silero_vad.window_size
--- a/sherpa-onnx/csrc/silero-vad-model-config.cc
查看文件 @1423ddb
+++ b/sherpa-onnx/csrc/silero-vad-model-config.cc
查看文件 @1423ddb
@@ -29,6 +29,12 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
                "--silero-vad-min-speech-duration seconds before separating it");
 
   po->Register(
+       "silero-vad-max-speech-duration", &max_speech_duration,
+       "In seconds. If a speech segment is longer than this value, then we "
+       "increase the threshold to 0.9. After finishing detecting the segment, "
+       "the threshold value is reset to its original value.");
+ 
+   po->Register(
       "silero-vad-window-size", &window_size,
       "In samples. Audio chunks of --silero-vad-window-size samples are fed "
       "to the silero VAD model. WARNING! Silero VAD models were trained using "
@@ -63,6 +69,33 @@ bool SileroVadModelConfig::Validate() const {
     return false;
   }
 
+   if (min_silence_duration <= 0) {
+     SHERPA_ONNX_LOGE(
+         "Please use a larger value for --silero-vad-min-silence-duration. "
+         "Given: "
+         "%f",
+         min_silence_duration);
+     return false;
+   }
+ 
+   if (min_speech_duration <= 0) {
+     SHERPA_ONNX_LOGE(
+         "Please use a larger value for --silero-vad-min-speech-duration. "
+         "Given: "
+         "%f",
+         min_speech_duration);
+     return false;
+   }
+ 
+   if (max_speech_duration <= 0) {
+     SHERPA_ONNX_LOGE(
+         "Please use a larger value for --silero-vad-max-speech-duration. "
+         "Given: "
+         "%f",
+         max_speech_duration);
+     return false;
+   }
+ 
   return true;
 }
 
@@ -74,6 +107,7 @@ std::string SileroVadModelConfig::ToString() const {
   os << "threshold=" << threshold << ", ";
   os << "min_silence_duration=" << min_silence_duration << ", ";
   os << "min_speech_duration=" << min_speech_duration << ", ";
+   os << "max_speech_duration=" << max_speech_duration << ", ";
   os << "window_size=" << window_size << ")";
 
   return os.str();
--- a/sherpa-onnx/csrc/silero-vad-model-config.h
查看文件 @1423ddb
+++ b/sherpa-onnx/csrc/silero-vad-model-config.h
查看文件 @1423ddb
@@ -27,6 +27,11 @@ struct SileroVadModelConfig {
   // 256, 512, 768 samples for 8000 Hz
   int32_t window_size = 512;  // in samples
 
+   // If a speech segment is longer than this value, then we increase
+   // the threshold to 0.9. After finishing detecting the segment,
+   // the threshold value is reset to its original value.
+   float max_speech_duration = 20;  // in seconds
+ 
   SileroVadModelConfig() = default;
 
   void Register(ParseOptions *po);
--- a/sherpa-onnx/csrc/voice-activity-detector.cc
查看文件 @1423ddb
+++ b/sherpa-onnx/csrc/voice-activity-detector.cc
查看文件 @1423ddb
@@ -18,14 +18,18 @@ class VoiceActivityDetector::Impl {
   explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60)
       : model_(VadModel::Create(config)),
         config_(config),
-         buffer_(buffer_size_in_seconds * config.sample_rate) {}
+         buffer_(buffer_size_in_seconds * config.sample_rate) {
+     Init();
+   }
 
 #if __ANDROID_API__ >= 9
   Impl(AAssetManager *mgr, const VadModelConfig &config,
        float buffer_size_in_seconds = 60)
       : model_(VadModel::Create(mgr, config)),
         config_(config),
-         buffer_(buffer_size_in_seconds * config.sample_rate) {}
+         buffer_(buffer_size_in_seconds * config.sample_rate) {
+     Init();
+   }
 #endif
 
   void AcceptWaveform(const float *samples, int32_t n) {
@@ -146,6 +150,15 @@ class VoiceActivityDetector::Impl {
   const VadModelConfig &GetConfig() const { return config_; }
 
  private:
+   void Init() {
+     // TODO(fangjun): Currently, only one VAD model is supported, so
+     // max_speech_duration is read from the silero_vad config. If a new
+     // VAD model is added, we need to reconsider where it is stored.
+     max_utterance_length_ =
+         config_.sample_rate * config_.silero_vad.max_speech_duration;
+   }
+ 
+  private:
   std::queue<SpeechSegment> segments_;
 
   std::unique_ptr<VadModel> model_;
@@ -153,9 +166,9 @@ class VoiceActivityDetector::Impl {
   CircularBuffer buffer_;
   std::vector<float> last_;
 
-   int max_utterance_length_ = 16000 * 20;  // in samples
+   int max_utterance_length_ = -1;  // in samples
   float new_min_silence_duration_s_ = 0.1;
-   float new_threshold_ = 1.10;
+   float new_threshold_ = 0.90;
 
   int32_t start_ = -1;
 };
--- a/sherpa-onnx/python/csrc/silero-vad-model-config.cc
查看文件 @1423ddb
+++ b/sherpa-onnx/python/csrc/silero-vad-model-config.cc
查看文件 @1423ddb
@@ -17,7 +17,8 @@ void PybindSileroVadModelConfig(py::module *m) {
       .def(py::init<>())
       .def(py::init([](const std::string &model, float threshold,
                        float min_silence_duration, float min_speech_duration,
-                        int32_t window_size) -> std::unique_ptr<PyClass> {
+                        int32_t window_size,
+                        float max_speech_duration) -> std::unique_ptr<PyClass> {
              auto ans = std::make_unique<PyClass>();
 
              ans->model = model;
@@ -25,17 +26,20 @@ void PybindSileroVadModelConfig(py::module *m) {
              ans->min_silence_duration = min_silence_duration;
              ans->min_speech_duration = min_speech_duration;
              ans->window_size = window_size;
+              ans->max_speech_duration = max_speech_duration;
 
              return ans;
            }),
            py::arg("model"), py::arg("threshold") = 0.5,
            py::arg("min_silence_duration") = 0.5,
-            py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512)
+            py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512,
+            py::arg("max_speech_duration") = 20)
       .def_readwrite("model", &PyClass::model)
       .def_readwrite("threshold", &PyClass::threshold)
       .def_readwrite("min_silence_duration", &PyClass::min_silence_duration)
       .def_readwrite("min_speech_duration", &PyClass::min_speech_duration)
       .def_readwrite("window_size", &PyClass::window_size)
+       .def_readwrite("max_speech_duration", &PyClass::max_speech_duration)
       .def("__str__", &PyClass::ToString)
       .def("validate", &PyClass::Validate);
 }