Fangjun Kuang
Committed by GitHub

Support returning the current speech segment for VAD. (#2397)

A new method and property were introduced in the VoiceActivityDetector C++ and Python APIs to 
provide access to the current speech segment as soon as speech is detected, rather than only 
after the segment completes.
@@ -609,6 +609,16 @@ def main():
609 vad.accept_waveform(buffer[:window_size]) 609 vad.accept_waveform(buffer[:window_size])
610 buffer = buffer[window_size:] 610 buffer = buffer[window_size:]
611 611
  612 + if False:
  613 + # If you want to process the speech segment as soon as
  614 + # speech is detected, you can use the following code:
  615 + current_segment = vad.current_segment
  616 + if len(current_segment.samples) > 0:
  617 + print(
  618 + f"speech starts at {current_segment.start/16000} seconds: ",
  619 + f"duration {len(current_segment.samples)/16000} seconds",
  620 + )
  621 +
612 streams = [] 622 streams = []
613 segments = [] 623 segments = []
614 while not vad.empty(): 624 while not vad.empty():
@@ -91,9 +91,16 @@ class VoiceActivityDetector::Impl {
91 start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() - 91 start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() -
92 model_->MinSpeechDurationSamples(), 92 model_->MinSpeechDurationSamples(),
93 buffer_.Head()); 93 buffer_.Head());
  94 + cur_segment_.start = start_;
94 } 95 }
  96 + int32_t num_samples = buffer_.Tail() - start_ - 1;
  97 + cur_segment_.samples = buffer_.Get(start_, num_samples);
95 } else { 98 } else {
96 // non-speech 99 // non-speech
  100 +
  101 + cur_segment_.start = -1;
  102 + cur_segment_.samples.clear();
  103 +
97 if (start_ != -1 && buffer_.Size()) { 104 if (start_ != -1 && buffer_.Size()) {
98 // end of speech, save the speech segment 105 // end of speech, save the speech segment
99 int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples(); 106 int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
@@ -138,6 +145,9 @@ class VoiceActivityDetector::Impl {
138 last_.clear(); 145 last_.clear();
139 146
140 start_ = -1; 147 start_ = -1;
  148 +
  149 + cur_segment_.start = -1;
  150 + cur_segment_.samples.clear();
141 } 151 }
142 152
143 void Flush() { 153 void Flush() {
@@ -161,10 +171,15 @@ class VoiceActivityDetector::Impl {
161 171
162 buffer_.Pop(end - buffer_.Head()); 172 buffer_.Pop(end - buffer_.Head());
163 start_ = -1; 173 start_ = -1;
  174 +
  175 + cur_segment_.start = -1;
  176 + cur_segment_.samples.clear();
164 } 177 }
165 178
166 bool IsSpeechDetected() const { return start_ != -1; } 179 bool IsSpeechDetected() const { return start_ != -1; }
167 180
  181 + SpeechSegment CurrentSpeechSegment() const { return cur_segment_; }
  182 +
168 const VadModelConfig &GetConfig() const { return config_; } 183 const VadModelConfig &GetConfig() const { return config_; }
169 184
170 private: 185 private:
@@ -184,6 +199,9 @@ class VoiceActivityDetector::Impl {
184 private: 199 private:
185 std::queue<SpeechSegment> segments_; 200 std::queue<SpeechSegment> segments_;
186 201
  202 + // it is empty if no speech is detected
  203 + SpeechSegment cur_segment_;
  204 +
187 std::unique_ptr<VadModel> model_; 205 std::unique_ptr<VadModel> model_;
188 VadModelConfig config_; 206 VadModelConfig config_;
189 CircularBuffer buffer_; 207 CircularBuffer buffer_;
@@ -230,6 +248,10 @@ bool VoiceActivityDetector::IsSpeechDetected() const {
230 return impl_->IsSpeechDetected(); 248 return impl_->IsSpeechDetected();
231 } 249 }
232 250
  251 +SpeechSegment VoiceActivityDetector::CurrentSpeechSegment() const {
  252 + return impl_->CurrentSpeechSegment();
  253 +}
  254 +
233 const VadModelConfig &VoiceActivityDetector::GetConfig() const { 255 const VadModelConfig &VoiceActivityDetector::GetConfig() const {
234 return impl_->GetConfig(); 256 return impl_->GetConfig();
235 } 257 }
@@ -31,10 +31,18 @@ class VoiceActivityDetector {
31 bool Empty() const; 31 bool Empty() const;
32 void Pop(); 32 void Pop();
33 void Clear(); 33 void Clear();
  34 +
  35 + // It is an error to call Front() if Empty() returns true.
  36 + //
  37 + // The returned reference is valid until the next call to any
  38 + // methods of VoiceActivityDetector.
34 const SpeechSegment &Front() const; 39 const SpeechSegment &Front() const;
35 40
36 bool IsSpeechDetected() const; 41 bool IsSpeechDetected() const;
37 42
  43 + // It is empty if IsSpeechDetected() returns false
  44 + SpeechSegment CurrentSpeechSegment() const;
  45 +
38 void Reset() const; 46 void Reset() const;
39 47
40 // At the end of the utterance, you can invoke this method so that 48 // At the end of the utterance, you can invoke this method so that
@@ -22,7 +22,17 @@ void PybindSpeechSegment(py::module *m) {
22 void PybindVoiceActivityDetector(py::module *m) { 22 void PybindVoiceActivityDetector(py::module *m) {
23 PybindSpeechSegment(m); 23 PybindSpeechSegment(m);
24 using PyClass = VoiceActivityDetector; 24 using PyClass = VoiceActivityDetector;
25 - py::class_<PyClass>(*m, "VoiceActivityDetector") 25 + py::class_<PyClass>(*m, "VoiceActivityDetector",
  26 + R"(
  27 +1. It is an error to call the front property when the method empty() returns True
  28 +2. The property front returns a reference, which is valid until the next call of any
  29 + methods of this class
  30 +3. When speech is detected, the method is_speech_detected() returns True; you can
  31 + use the property current_segment to get the speech samples accumulated since
  32 + speech was first detected
  33 +4. When is_speech_detected() changes from True to False, the completed segment
  34 + is queued and the method empty() returns False.
  35 + )")
26 .def(py::init<const VadModelConfig &, float>(), py::arg("config"), 36 .def(py::init<const VadModelConfig &, float>(), py::arg("config"),
27 py::arg("buffer_size_in_seconds") = 60, 37 py::arg("buffer_size_in_seconds") = 60,
28 py::call_guard<py::gil_scoped_release>()) 38 py::call_guard<py::gil_scoped_release>())
@@ -39,7 +49,8 @@ void PybindVoiceActivityDetector(py::module *m) {
39 py::call_guard<py::gil_scoped_release>()) 49 py::call_guard<py::gil_scoped_release>())
40 .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>()) 50 .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
41 .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>()) 51 .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
42 - .def_property_readonly("front", &PyClass::Front); 52 + .def_property_readonly("front", &PyClass::Front)
  53 + .def_property_readonly("current_segment", &PyClass::CurrentSpeechSegment);
43 } 54 }
44 55
45 } // namespace sherpa_onnx 56 } // namespace sherpa_onnx