Fangjun Kuang
Committed by GitHub

Support returning the current speech segment for VAD. (#2397)

A new method and property were introduced in the VoiceActivityDetector C++ and Python APIs to 
provide access to the current speech segment as soon as speech is detected, rather than only 
after the segment completes.
@@ -609,6 +609,16 @@ def main():
609 vad.accept_waveform(buffer[:window_size]) 609 vad.accept_waveform(buffer[:window_size])
610 buffer = buffer[window_size:] 610 buffer = buffer[window_size:]
611 611
  612 + if False:
  613 + # If you want to process the speech segment as soon as
  614 + # speech is detected, you can use the following code:
  615 + current_segment = vad.current_segment
  616 + if len(current_segment.samples) > 0:
  617 + print(
  618 + f"speech starts at {current_segment.start/16000} seconds: ",
  619 + f"duration {len(current_segment.samples)/16000} seconds",
  620 + )
  621 +
612 streams = [] 622 streams = []
613 segments = [] 623 segments = []
614 while not vad.empty(): 624 while not vad.empty():
@@ -91,9 +91,16 @@ class VoiceActivityDetector::Impl {
91 start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() - 91 start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() -
92 model_->MinSpeechDurationSamples(), 92 model_->MinSpeechDurationSamples(),
93 buffer_.Head()); 93 buffer_.Head());
  94 + cur_segment_.start = start_;
94 } 95 }
  96 + int32_t num_samples = buffer_.Tail() - start_ - 1;
  97 + cur_segment_.samples = buffer_.Get(start_, num_samples);
95 } else { 98 } else {
96 // non-speech 99 // non-speech
  100 +
  101 + cur_segment_.start = -1;
  102 + cur_segment_.samples.clear();
  103 +
97 if (start_ != -1 && buffer_.Size()) { 104 if (start_ != -1 && buffer_.Size()) {
98 // end of speech, save the speech segment 105 // end of speech, save the speech segment
99 int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples(); 106 int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
@@ -138,6 +145,9 @@ class VoiceActivityDetector::Impl {
138 last_.clear(); 145 last_.clear();
139 146
140 start_ = -1; 147 start_ = -1;
  148 +
  149 + cur_segment_.start = -1;
  150 + cur_segment_.samples.clear();
141 } 151 }
142 152
143 void Flush() { 153 void Flush() {
@@ -161,10 +171,15 @@ class VoiceActivityDetector::Impl {
161 171
162 buffer_.Pop(end - buffer_.Head()); 172 buffer_.Pop(end - buffer_.Head());
163 start_ = -1; 173 start_ = -1;
  174 +
  175 + cur_segment_.start = -1;
  176 + cur_segment_.samples.clear();
164 } 177 }
165 178
166 bool IsSpeechDetected() const { return start_ != -1; } 179 bool IsSpeechDetected() const { return start_ != -1; }
167 180
  181 + SpeechSegment CurrentSpeechSegment() const { return cur_segment_; }
  182 +
168 const VadModelConfig &GetConfig() const { return config_; } 183 const VadModelConfig &GetConfig() const { return config_; }
169 184
170 private: 185 private:
@@ -184,6 +199,9 @@ class VoiceActivityDetector::Impl {
184 private: 199 private:
185 std::queue<SpeechSegment> segments_; 200 std::queue<SpeechSegment> segments_;
186 201
  202 + // it is empty if no speech is detected
  203 + SpeechSegment cur_segment_;
  204 +
187 std::unique_ptr<VadModel> model_; 205 std::unique_ptr<VadModel> model_;
188 VadModelConfig config_; 206 VadModelConfig config_;
189 CircularBuffer buffer_; 207 CircularBuffer buffer_;
@@ -230,6 +248,10 @@ bool VoiceActivityDetector::IsSpeechDetected() const {
230 return impl_->IsSpeechDetected(); 248 return impl_->IsSpeechDetected();
231 } 249 }
232 250
  251 +SpeechSegment VoiceActivityDetector::CurrentSpeechSegment() const {
  252 + return impl_->CurrentSpeechSegment();
  253 +}
  254 +
233 const VadModelConfig &VoiceActivityDetector::GetConfig() const { 255 const VadModelConfig &VoiceActivityDetector::GetConfig() const {
234 return impl_->GetConfig(); 256 return impl_->GetConfig();
235 } 257 }
@@ -31,10 +31,18 @@ class VoiceActivityDetector {
31 bool Empty() const; 31 bool Empty() const;
32 void Pop(); 32 void Pop();
33 void Clear(); 33 void Clear();
  34 +
  35 + // It is an error to call Front() if Empty() returns true.
  36 + //
  37 + // The returned reference is valid until the next call to any
  38 + // methods of VoiceActivityDetector.
34 const SpeechSegment &Front() const; 39 const SpeechSegment &Front() const;
35 40
36 bool IsSpeechDetected() const; 41 bool IsSpeechDetected() const;
37 42
  43 + // It is empty if IsSpeechDetected() returns false
  44 + SpeechSegment CurrentSpeechSegment() const;
  45 +
38 void Reset() const; 46 void Reset() const;
39 47
40 // At the end of the utterance, you can invoke this method so that 48 // At the end of the utterance, you can invoke this method so that
@@ -22,7 +22,17 @@ void PybindSpeechSegment(py::module *m) {
22 void PybindVoiceActivityDetector(py::module *m) { 22 void PybindVoiceActivityDetector(py::module *m) {
23 PybindSpeechSegment(m); 23 PybindSpeechSegment(m);
24 using PyClass = VoiceActivityDetector; 24 using PyClass = VoiceActivityDetector;
25 - py::class_<PyClass>(*m, "VoiceActivityDetector") 25 + py::class_<PyClass>(*m, "VoiceActivityDetector",
  26 + R"(
  27 +1. It is an error to call the front property when the method empty() returns True
  28 +2. The property front returns a reference, which is valid until the next call of any
  29 + methods of this class
  30 +3. When speech is detected, the method is_speech_detected() returns True; you can
  31 + use the property current_segment to get the speech samples accumulated since
  32 + speech was first detected
  33 +4. When is_speech_detected() changes from True to False, the completed segment
  34 + is queued and the method empty() returns False.
  35 + )")
26 .def(py::init<const VadModelConfig &, float>(), py::arg("config"), 36 .def(py::init<const VadModelConfig &, float>(), py::arg("config"),
27 py::arg("buffer_size_in_seconds") = 60, 37 py::arg("buffer_size_in_seconds") = 60,
28 py::call_guard<py::gil_scoped_release>()) 38 py::call_guard<py::gil_scoped_release>())
@@ -39,7 +49,8 @@ void PybindVoiceActivityDetector(py::module *m) {
39 py::call_guard<py::gil_scoped_release>()) 49 py::call_guard<py::gil_scoped_release>())
40 .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>()) 50 .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
41 .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>()) 51 .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
42 - .def_property_readonly("front", &PyClass::Front); 52 + .def_property_readonly("front", &PyClass::Front)
  53 + .def_property_readonly("current_segment", &PyClass::CurrentSpeechSegment);
43 } 54 }
44 55
45 } // namespace sherpa_onnx 56 } // namespace sherpa_onnx