Fangjun Kuang
Committed by GitHub

Support returning the current speech segment for VAD. (#2397)

A new method and property were introduced in the VoiceActivityDetector C++ and Python APIs to 
provide access to the current speech segment as soon as speech is detected, rather than only 
after the segment completes.
... ... @@ -609,6 +609,16 @@ def main():
vad.accept_waveform(buffer[:window_size])
buffer = buffer[window_size:]
if False:
# If you want to process the speech segment as soon as
# speech is detected, you can use
current_segment = vad.current_segment
if len(current_segment.samples) > 0:
print(
f"speech starts at {current_segment.start/16000} seconds: ",
f"duration {len(current_segment.samples)/16000} seconds",
)
streams = []
segments = []
while not vad.empty():
... ...
... ... @@ -91,9 +91,16 @@ class VoiceActivityDetector::Impl {
start_ = std::max(buffer_.Tail() - 2 * model_->WindowSize() -
model_->MinSpeechDurationSamples(),
buffer_.Head());
cur_segment_.start = start_;
}
int32_t num_samples = buffer_.Tail() - start_ - 1;
cur_segment_.samples = buffer_.Get(start_, num_samples);
} else {
// non-speech
cur_segment_.start = -1;
cur_segment_.samples.clear();
if (start_ != -1 && buffer_.Size()) {
// end of speech, save the speech segment
int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
... ... @@ -138,6 +145,9 @@ class VoiceActivityDetector::Impl {
last_.clear();
start_ = -1;
cur_segment_.start = -1;
cur_segment_.samples.clear();
}
void Flush() {
... ... @@ -161,10 +171,15 @@ class VoiceActivityDetector::Impl {
buffer_.Pop(end - buffer_.Head());
start_ = -1;
cur_segment_.start = -1;
cur_segment_.samples.clear();
}
// Returns true if a speech start position is currently recorded, i.e.,
// start_ is not the sentinel value -1 that marks "no speech in progress".
bool IsSpeechDetected() const { return start_ != -1; }
// Returns a copy of cur_segment_, the speech segment that is still in
// progress. The copy has start == -1 and no samples when no speech is
// detected.
SpeechSegment CurrentSpeechSegment() const { return cur_segment_; }
// Returns a const reference to the stored VAD model configuration (config_).
const VadModelConfig &GetConfig() const { return config_; }
private:
... ... @@ -184,6 +199,9 @@ class VoiceActivityDetector::Impl {
private:
std::queue<SpeechSegment> segments_;
// The speech segment currently in progress. It is empty (start == -1 and
// no samples) if no speech is detected.
SpeechSegment cur_segment_;
std::unique_ptr<VadModel> model_;
VadModelConfig config_;
CircularBuffer buffer_;
... ... @@ -230,6 +248,10 @@ bool VoiceActivityDetector::IsSpeechDetected() const {
return impl_->IsSpeechDetected();
}
// Returns, by value, the current speech segment reported by the
// implementation object. All work is delegated to
// Impl::CurrentSpeechSegment().
SpeechSegment VoiceActivityDetector::CurrentSpeechSegment() const {
// Forward to the pimpl; the result is returned as a copy.
return impl_->CurrentSpeechSegment();
}
// Returns a const reference to the VAD model configuration obtained from
// the implementation object via Impl::GetConfig().
const VadModelConfig &VoiceActivityDetector::GetConfig() const {
// Forward to the pimpl.
return impl_->GetConfig();
}
... ...
... ... @@ -31,10 +31,18 @@ class VoiceActivityDetector {
bool Empty() const;
void Pop();
void Clear();
// It is an error to call Front() if Empty() returns true.
//
// The returned reference is valid until the next call to any
// methods of VoiceActivityDetector.
const SpeechSegment &Front() const;
bool IsSpeechDetected() const;
// Return a copy of the speech segment that is currently in progress, so
// that it can be processed as soon as speech is detected instead of only
// after the segment completes.
//
// It is empty if IsSpeechDetected() returns false
SpeechSegment CurrentSpeechSegment() const;
void Reset() const;
// At the end of the utterance, you can invoke this method so that
... ...
... ... @@ -22,7 +22,17 @@ void PybindSpeechSegment(py::module *m) {
void PybindVoiceActivityDetector(py::module *m) {
PybindSpeechSegment(m);
using PyClass = VoiceActivityDetector;
py::class_<PyClass>(*m, "VoiceActivityDetector")
py::class_<PyClass>(*m, "VoiceActivityDetector",
R"(
1. It is an error to call the front property when the method empty() returns True
2. The property front returns a reference, which is valid until the next call of any
methods of this class
3. When speech is detected, the method is_speech_detected() return True, you can
use the property current_segment to get the speech samples since
is_speech_detected() returns true
4. When is_speech_detected() is changed from True to False, the method
empty() returns False.
)")
.def(py::init<const VadModelConfig &, float>(), py::arg("config"),
py::arg("buffer_size_in_seconds") = 60,
py::call_guard<py::gil_scoped_release>())
... ... @@ -39,7 +49,8 @@ void PybindVoiceActivityDetector(py::module *m) {
py::call_guard<py::gil_scoped_release>())
.def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
.def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
.def_property_readonly("front", &PyClass::Front);
.def_property_readonly("front", &PyClass::Front)
.def_property_readonly("current_segment", &PyClass::CurrentSpeechSegment);
}
} // namespace sherpa_onnx
... ...