Committed by
GitHub
Support returning the current speech segment for VAD. (#2397)
This change adds a `CurrentSpeechSegment()` method to the C++ `VoiceActivityDetector` API and a matching `current_segment` property to its Python binding, giving access to the in-progress speech segment as soon as speech is detected rather than only after the segment completes.
正在显示
4 个修改的文件
包含
53 行增加
和
2 行删除
| @@ -609,6 +609,16 @@ def main(): | @@ -609,6 +609,16 @@ def main(): | ||
| 609 | vad.accept_waveform(buffer[:window_size]) | 609 | vad.accept_waveform(buffer[:window_size]) |
| 610 | buffer = buffer[window_size:] | 610 | buffer = buffer[window_size:] |
| 611 | 611 | ||
| 612 | + if False: | ||
| 613 | + # If you want to process the speech segment as soon as | ||
| 614 | + # speech is detected, you can use | ||
| 615 | + current_segment = vad.current_segment | ||
| 616 | + if len(current_segment.samples) > 0: | ||
| 617 | + print( | ||
| 618 | + f"speech starts at {current_segment.start/16000} seconds: ", | ||
| 619 | + f"duration {len(current_segment.samples)/16000} seconds", | ||
| 620 | + ) | ||
| 621 | + | ||
| 612 | streams = [] | 622 | streams = [] |
| 613 | segments = [] | 623 | segments = [] |
| 614 | while not vad.empty(): | 624 | while not vad.empty(): |
| @@ -91,9 +91,16 @@ class VoiceActivityDetector::Impl { | @@ -91,9 +91,16 @@ class VoiceActivityDetector::Impl { | ||
| 91 | start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() - | 91 | start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() - |
| 92 | model_->MinSpeechDurationSamples(), | 92 | model_->MinSpeechDurationSamples(), |
| 93 | buffer_.Head()); | 93 | buffer_.Head()); |
| 94 | + cur_segment_.start = start_; | ||
| 94 | } | 95 | } |
| 96 | + int32_t num_samples = buffer_.Tail() - start_ - 1; | ||
| 97 | + cur_segment_.samples = buffer_.Get(start_, num_samples); | ||
| 95 | } else { | 98 | } else { |
| 96 | // non-speech | 99 | // non-speech |
| 100 | + | ||
| 101 | + cur_segment_.start = -1; | ||
| 102 | + cur_segment_.samples.clear(); | ||
| 103 | + | ||
| 97 | if (start_ != -1 && buffer_.Size()) { | 104 | if (start_ != -1 && buffer_.Size()) { |
| 98 | // end of speech, save the speech segment | 105 | // end of speech, save the speech segment |
| 99 | int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples(); | 106 | int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples(); |
| @@ -138,6 +145,9 @@ class VoiceActivityDetector::Impl { | @@ -138,6 +145,9 @@ class VoiceActivityDetector::Impl { | ||
| 138 | last_.clear(); | 145 | last_.clear(); |
| 139 | 146 | ||
| 140 | start_ = -1; | 147 | start_ = -1; |
| 148 | + | ||
| 149 | + cur_segment_.start = -1; | ||
| 150 | + cur_segment_.samples.clear(); | ||
| 141 | } | 151 | } |
| 142 | 152 | ||
| 143 | void Flush() { | 153 | void Flush() { |
| @@ -161,10 +171,15 @@ class VoiceActivityDetector::Impl { | @@ -161,10 +171,15 @@ class VoiceActivityDetector::Impl { | ||
| 161 | 171 | ||
| 162 | buffer_.Pop(end - buffer_.Head()); | 172 | buffer_.Pop(end - buffer_.Head()); |
| 163 | start_ = -1; | 173 | start_ = -1; |
| 174 | + | ||
| 175 | + cur_segment_.start = -1; | ||
| 176 | + cur_segment_.samples.clear(); | ||
| 164 | } | 177 | } |
| 165 | 178 | ||
| 166 | bool IsSpeechDetected() const { return start_ != -1; } | 179 | bool IsSpeechDetected() const { return start_ != -1; } |
| 167 | 180 | ||
| 181 | + SpeechSegment CurrentSpeechSegment() const { return cur_segment_; } | ||
| 182 | + | ||
| 168 | const VadModelConfig &GetConfig() const { return config_; } | 183 | const VadModelConfig &GetConfig() const { return config_; } |
| 169 | 184 | ||
| 170 | private: | 185 | private: |
| @@ -184,6 +199,9 @@ class VoiceActivityDetector::Impl { | @@ -184,6 +199,9 @@ class VoiceActivityDetector::Impl { | ||
| 184 | private: | 199 | private: |
| 185 | std::queue<SpeechSegment> segments_; | 200 | std::queue<SpeechSegment> segments_; |
| 186 | 201 | ||
| 202 | + // it is empty if no speech is detected | ||
| 203 | + SpeechSegment cur_segment_; | ||
| 204 | + | ||
| 187 | std::unique_ptr<VadModel> model_; | 205 | std::unique_ptr<VadModel> model_; |
| 188 | VadModelConfig config_; | 206 | VadModelConfig config_; |
| 189 | CircularBuffer buffer_; | 207 | CircularBuffer buffer_; |
| @@ -230,6 +248,10 @@ bool VoiceActivityDetector::IsSpeechDetected() const { | @@ -230,6 +248,10 @@ bool VoiceActivityDetector::IsSpeechDetected() const { | ||
| 230 | return impl_->IsSpeechDetected(); | 248 | return impl_->IsSpeechDetected(); |
| 231 | } | 249 | } |
| 232 | 250 | ||
| 251 | +SpeechSegment VoiceActivityDetector::CurrentSpeechSegment() const { | ||
| 252 | + return impl_->CurrentSpeechSegment(); | ||
| 253 | +} | ||
| 254 | + | ||
| 233 | const VadModelConfig &VoiceActivityDetector::GetConfig() const { | 255 | const VadModelConfig &VoiceActivityDetector::GetConfig() const { |
| 234 | return impl_->GetConfig(); | 256 | return impl_->GetConfig(); |
| 235 | } | 257 | } |
| @@ -31,10 +31,18 @@ class VoiceActivityDetector { | @@ -31,10 +31,18 @@ class VoiceActivityDetector { | ||
| 31 | bool Empty() const; | 31 | bool Empty() const; |
| 32 | void Pop(); | 32 | void Pop(); |
| 33 | void Clear(); | 33 | void Clear(); |
| 34 | + | ||
| 35 | + // It is an error to call Front() if Empty() returns true. | ||
| 36 | + // | ||
| 37 | + // The returned reference is valid until the next call to any | ||
| 38 | + // methods of VoiceActivityDetector. | ||
| 34 | const SpeechSegment &Front() const; | 39 | const SpeechSegment &Front() const; |
| 35 | 40 | ||
| 36 | bool IsSpeechDetected() const; | 41 | bool IsSpeechDetected() const; |
| 37 | 42 | ||
| 43 | + // It is empty if IsSpeechDetected() returns false | ||
| 44 | + SpeechSegment CurrentSpeechSegment() const; | ||
| 45 | + | ||
| 38 | void Reset() const; | 46 | void Reset() const; |
| 39 | 47 | ||
| 40 | // At the end of the utterance, you can invoke this method so that | 48 | // At the end of the utterance, you can invoke this method so that |
| @@ -22,7 +22,17 @@ void PybindSpeechSegment(py::module *m) { | @@ -22,7 +22,17 @@ void PybindSpeechSegment(py::module *m) { | ||
| 22 | void PybindVoiceActivityDetector(py::module *m) { | 22 | void PybindVoiceActivityDetector(py::module *m) { |
| 23 | PybindSpeechSegment(m); | 23 | PybindSpeechSegment(m); |
| 24 | using PyClass = VoiceActivityDetector; | 24 | using PyClass = VoiceActivityDetector; |
| 25 | - py::class_<PyClass>(*m, "VoiceActivityDetector") | 25 | + py::class_<PyClass>(*m, "VoiceActivityDetector", |
| 26 | + R"( | ||
| 27 | +1. It is an error to call the front property when the method empty() returns True | ||
| 28 | +2. The property front returns a reference, which is valid until the next call of any | ||
| 29 | + methods of this class | ||
| 30 | +3. When speech is detected, the method is_speech_detected() return True, you can | ||
| 31 | + use the property current_segment to get the speech samples since | ||
| 32 | + is_speech_detected() returns true | ||
| 33 | +4. When is_speech_detected() is changed from True to False, the method | ||
| 34 | + empty() returns False. | ||
| 35 | + )") | ||
| 26 | .def(py::init<const VadModelConfig &, float>(), py::arg("config"), | 36 | .def(py::init<const VadModelConfig &, float>(), py::arg("config"), |
| 27 | py::arg("buffer_size_in_seconds") = 60, | 37 | py::arg("buffer_size_in_seconds") = 60, |
| 28 | py::call_guard<py::gil_scoped_release>()) | 38 | py::call_guard<py::gil_scoped_release>()) |
| @@ -39,7 +49,8 @@ void PybindVoiceActivityDetector(py::module *m) { | @@ -39,7 +49,8 @@ void PybindVoiceActivityDetector(py::module *m) { | ||
| 39 | py::call_guard<py::gil_scoped_release>()) | 49 | py::call_guard<py::gil_scoped_release>()) |
| 40 | .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>()) | 50 | .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>()) |
| 41 | .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>()) | 51 | .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>()) |
| 42 | - .def_property_readonly("front", &PyClass::Front); | 52 | + .def_property_readonly("front", &PyClass::Front) |
| 53 | + .def_property_readonly("current_segment", &PyClass::CurrentSpeechSegment); | ||
| 43 | } | 54 | } |
| 44 | 55 | ||
| 45 | } // namespace sherpa_onnx | 56 | } // namespace sherpa_onnx |
-
请 注册 或 登录 后发表评论