voice-activity-detector.cc
2.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
// sherpa-onnx/csrc/voice-activity-detector.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/voice-activity-detector.h"

#include <algorithm>
#include <memory>
#include <queue>
#include <utility>
#include <vector>

#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/vad-model.h"
namespace sherpa_onnx {
// Private implementation of VoiceActivityDetector.
//
// Every chunk of audio is appended to `buffer_` and handed to the VAD model.
// When the model starts reporting speech, the (look-back adjusted) start
// position is remembered in `start_`; when it stops reporting speech, the
// samples in [start_, end) are copied out of the buffer into a SpeechSegment
// and queued in `segments_`.
//
// NOTE(review): the code treats buffer_.Head()/buffer_.Tail() as absolute
// sample positions (they are stored in SpeechSegment::start and passed to
// buffer_.Get()) -- confirm against CircularBuffer.
class VoiceActivityDetector::Impl {
 public:
  // @param config  Configuration used to create the VAD model. Its
  //                `sample_rate` is also used to size the sample buffer.
  // @param buffer_size_in_seconds  Size of the internal sample buffer,
  //                expressed in seconds of audio.
  explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60)
      : model_(VadModel::Create(config)),
        config_(config),
        buffer_(buffer_size_in_seconds * config.sample_rate) {}

  // Feeds `n` samples starting at `samples` to the detector.
  //
  // The samples are buffered, the model is queried once for this chunk, and
  // a finished speech segment (if any) is appended to the segment queue.
  void AcceptWaveform(const float *samples, int32_t n) {
    buffer_.Push(samples, n);

    const bool is_speech = model_->IsSpeech(samples, n);

    if (is_speech) {
      if (start_ == -1) {
        // Beginning of speech. Look back so that the onset of the segment is
        // included, but never before the oldest sample still held by the
        // buffer. Without the clamp the result could be negative (it could
        // even equal the -1 "no speech" sentinel) near the start of a stream.
        const int32_t onset = buffer_.Tail() - 2 * model_->WindowSize() -
                              model_->MinSpeechDurationSamples();
        const int32_t head = buffer_.Head();
        start_ = std::max(onset, head);
      }
      return;
    }

    // Non-speech from here on.
    if (start_ != -1) {
      // End of speech: save the segment and drop everything up to its end.
      const int32_t end =
          buffer_.Tail() - model_->MinSilenceDurationSamples();
      const int32_t num_samples = end - start_;

      // Only emit a segment that actually contains samples.
      if (num_samples > 0) {
        std::vector<float> speech = buffer_.Get(start_, num_samples);

        SpeechSegment segment;
        segment.start = start_;
        segment.samples = std::move(speech);
        segments_.push(std::move(segment));

        buffer_.Pop(end - buffer_.Head());
      }

      start_ = -1;
      return;
    }

    // Silence with no open segment: keep only the look-back window that a
    // future speech onset may need and discard older samples, so that a long
    // silence does not fill up the buffer.
    const int32_t keep_from = buffer_.Tail() - 2 * model_->WindowSize() -
                              model_->MinSpeechDurationSamples();
    const int32_t num_stale = keep_from - buffer_.Head();
    if (num_stale > 0) {
      buffer_.Pop(num_stale);
    }
  }

  // Returns true if no finished speech segment is waiting in the queue.
  bool Empty() const { return segments_.empty(); }

  // Removes the oldest queued segment. The queue must not be empty.
  void Pop() { segments_.pop(); }

  // Returns the oldest queued segment. The queue must not be empty.
  const SpeechSegment &Front() const { return segments_.front(); }

  // Discards all queued segments and resets the model, the sample buffer and
  // the speech-start marker.
  void Reset() {
    std::queue<SpeechSegment>().swap(segments_);

    model_->Reset();
    buffer_.Reset();

    start_ = -1;
  }

  // Returns true while a speech segment has been started but not yet ended.
  bool IsSpeechDetected() const { return start_ != -1; }

 private:
  std::queue<SpeechSegment> segments_;  // finished segments, oldest first

  std::unique_ptr<VadModel> model_;
  VadModelConfig config_;
  CircularBuffer buffer_;  // samples not yet emitted or discarded

  // Position of the first sample of the current speech segment,
  // or -1 if no segment is in progress.
  int32_t start_ = -1;
};
VoiceActivityDetector::VoiceActivityDetector(
const VadModelConfig &config, float buffer_size_in_seconds /*= 60*/)
: impl_(std::make_unique<Impl>(config, buffer_size_in_seconds)) {}
// Defaulted here, in the translation unit where Impl is a complete type.
// NOTE(review): presumably required because impl_ is a std::unique_ptr<Impl>
// declared in the header -- confirm against voice-activity-detector.h.
VoiceActivityDetector::~VoiceActivityDetector() = default;
void VoiceActivityDetector::AcceptWaveform(const float *samples, int32_t n) {
impl_->AcceptWaveform(samples, n);
}
// Returns true if the implementation has no finished speech segment queued.
bool VoiceActivityDetector::Empty() const {
  const bool no_segments = impl_->Empty();
  return no_segments;
}
// Removes the oldest queued speech segment.
// Must not be called when Empty() returns true.
void VoiceActivityDetector::Pop() {
  impl_->Pop();
}
// Returns a reference to the oldest queued speech segment.
// Must not be called when Empty() returns true.
const SpeechSegment &VoiceActivityDetector::Front() const {
  const SpeechSegment &oldest = impl_->Front();
  return oldest;
}
// Resets the implementation: queued segments, model state, buffered samples
// and the in-progress speech marker are all cleared.
void VoiceActivityDetector::Reset() {
  impl_->Reset();
}
// Returns true while the implementation is inside a speech segment that has
// started but not yet ended.
bool VoiceActivityDetector::IsSpeechDetected() const {
  const bool in_speech = impl_->IsSpeechDetected();
  return in_speech;
}
} // namespace sherpa_onnx