Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2024-09-14 10:57:46 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2024-09-14 10:57:46 +0800
Commit
1423ddb1f030addd7070ba309bc63885517d33ba
1423ddb1
1 parent
5d761712
Support specifying max speech duration for VAD. (#1348)
隐藏空白字符变更
内嵌
并排对比
正在显示
5 个修改的文件
包含
70 行增加
和
7 行删除
python-api-examples/generate-subtitles.py
sherpa-onnx/csrc/silero-vad-model-config.cc
sherpa-onnx/csrc/silero-vad-model-config.h
sherpa-onnx/csrc/voice-activity-detector.cc
sherpa-onnx/python/csrc/silero-vad-model-config.cc
python-api-examples/generate-subtitles.py
查看文件 @
1423ddb
...
...
@@ -406,7 +406,14 @@ def main():
config
=
sherpa_onnx
.
VadModelConfig
()
config
.
silero_vad
.
model
=
args
.
silero_vad_model
config
.
silero_vad
.
min_silence_duration
=
0.25
config
.
silero_vad
.
threshold
=
0.5
config
.
silero_vad
.
min_silence_duration
=
0.25
# seconds
config
.
silero_vad
.
min_speech_duration
=
0.25
# seconds
# If the current speech segment is longer than this value, the VAD
# internally raises the threshold to 0.9. Once the segment has been
# detected, the threshold is reset to its original value.
config
.
silero_vad
.
max_speech_duration
=
5
# seconds
config
.
sample_rate
=
args
.
sample_rate
window_size
=
config
.
silero_vad
.
window_size
...
...
sherpa-onnx/csrc/silero-vad-model-config.cc
查看文件 @
1423ddb
...
...
@@ -29,6 +29,12 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
"--silero-vad-min-speech-duration seconds before separating it"
);
po
->
Register
(
"silero-vad-max-speech-duration"
,
&
max_speech_duration
,
"In seconds. If a speech segment is longer than this value, then we "
"increase the threshold to 0.9. After finishing detecting the segment, "
"the threshold value is reset to its original value."
);
po
->
Register
(
"silero-vad-window-size"
,
&
window_size
,
"In samples. Audio chunks of --silero-vad-window-size samples are fed "
"to the silero VAD model. WARNING! Silero VAD models were trained using "
...
...
@@ -63,6 +69,33 @@ bool SileroVadModelConfig::Validate() const {
return
false
;
}
if
(
min_silence_duration
<=
0
)
{
SHERPA_ONNX_LOGE
(
"Please use a larger value for --silero-vad-min-silence-duration. "
"Given: "
"%f"
,
min_silence_duration
);
return
false
;
}
if
(
min_speech_duration
<=
0
)
{
SHERPA_ONNX_LOGE
(
"Please use a larger value for --silero-vad-min-speech-duration. "
"Given: "
"%f"
,
min_speech_duration
);
return
false
;
}
if
(
max_speech_duration
<=
0
)
{
SHERPA_ONNX_LOGE
(
"Please use a larger value for --silero-vad-max-speech-duration. "
"Given: "
"%f"
,
max_speech_duration
);
return
false
;
}
return
true
;
}
...
...
@@ -74,6 +107,7 @@ std::string SileroVadModelConfig::ToString() const {
os
<<
"threshold="
<<
threshold
<<
", "
;
os
<<
"min_silence_duration="
<<
min_silence_duration
<<
", "
;
os
<<
"min_speech_duration="
<<
min_speech_duration
<<
", "
;
os
<<
"max_speech_duration="
<<
max_speech_duration
<<
", "
;
os
<<
"window_size="
<<
window_size
<<
")"
;
return
os
.
str
();
...
...
sherpa-onnx/csrc/silero-vad-model-config.h
查看文件 @
1423ddb
...
...
@@ -27,6 +27,11 @@ struct SileroVadModelConfig {
// 256, 512, 768 samples for 8000 Hz
int32_t
window_size
=
512
;
// in samples
// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After finishing detecting the segment,
// the threshold value is reset to its original value.
float
max_speech_duration
=
20
;
// in seconds
SileroVadModelConfig
()
=
default
;
void
Register
(
ParseOptions
*
po
);
...
...
sherpa-onnx/csrc/voice-activity-detector.cc
查看文件 @
1423ddb
...
...
@@ -18,14 +18,18 @@ class VoiceActivityDetector::Impl {
explicit
Impl
(
const
VadModelConfig
&
config
,
float
buffer_size_in_seconds
=
60
)
:
model_
(
VadModel
::
Create
(
config
)),
config_
(
config
),
buffer_
(
buffer_size_in_seconds
*
config
.
sample_rate
)
{}
buffer_
(
buffer_size_in_seconds
*
config
.
sample_rate
)
{
Init
();
}
#if __ANDROID_API__ >= 9
Impl
(
AAssetManager
*
mgr
,
const
VadModelConfig
&
config
,
float
buffer_size_in_seconds
=
60
)
:
model_
(
VadModel
::
Create
(
mgr
,
config
)),
config_
(
config
),
buffer_
(
buffer_size_in_seconds
*
config
.
sample_rate
)
{}
buffer_
(
buffer_size_in_seconds
*
config
.
sample_rate
)
{
Init
();
}
#endif
void
AcceptWaveform
(
const
float
*
samples
,
int32_t
n
)
{
...
...
@@ -146,6 +150,15 @@ class VoiceActivityDetector::Impl {
const
VadModelConfig
&
GetConfig
()
const
{
return
config_
;
}
private
:
void
Init
()
{
// TODO(fangjun): Currently, we support only one vad model.
// If a new vad model is added, we need to change the place
// where max_speech_duration is placed.
max_utterance_length_
=
config_
.
sample_rate
*
config_
.
silero_vad
.
max_speech_duration
;
}
private
:
std
::
queue
<
SpeechSegment
>
segments_
;
std
::
unique_ptr
<
VadModel
>
model_
;
...
...
@@ -153,9 +166,9 @@ class VoiceActivityDetector::Impl {
CircularBuffer
buffer_
;
std
::
vector
<
float
>
last_
;
int
max_utterance_length_
=
16000
*
20
;
// in samples
int
max_utterance_length_
=
-
1
;
// in samples
float
new_min_silence_duration_s_
=
0.1
;
float
new_threshold_
=
1.1
0
;
float
new_threshold_
=
0.9
0
;
int32_t
start_
=
-
1
;
};
...
...
sherpa-onnx/python/csrc/silero-vad-model-config.cc
查看文件 @
1423ddb
...
...
@@ -17,7 +17,8 @@ void PybindSileroVadModelConfig(py::module *m) {
.
def
(
py
::
init
<>
())
.
def
(
py
::
init
([](
const
std
::
string
&
model
,
float
threshold
,
float
min_silence_duration
,
float
min_speech_duration
,
int32_t
window_size
)
->
std
::
unique_ptr
<
PyClass
>
{
int32_t
window_size
,
float
max_speech_duration
)
->
std
::
unique_ptr
<
PyClass
>
{
auto
ans
=
std
::
make_unique
<
PyClass
>
();
ans
->
model
=
model
;
...
...
@@ -25,17 +26,20 @@ void PybindSileroVadModelConfig(py::module *m) {
ans
->
min_silence_duration
=
min_silence_duration
;
ans
->
min_speech_duration
=
min_speech_duration
;
ans
->
window_size
=
window_size
;
ans
->
max_speech_duration
=
max_speech_duration
;
return
ans
;
}),
py
::
arg
(
"model"
),
py
::
arg
(
"threshold"
)
=
0.5
,
py
::
arg
(
"min_silence_duration"
)
=
0.5
,
py
::
arg
(
"min_speech_duration"
)
=
0.25
,
py
::
arg
(
"window_size"
)
=
512
)
py
::
arg
(
"min_speech_duration"
)
=
0.25
,
py
::
arg
(
"window_size"
)
=
512
,
py
::
arg
(
"max_speech_duration"
)
=
20
)
.
def_readwrite
(
"model"
,
&
PyClass
::
model
)
.
def_readwrite
(
"threshold"
,
&
PyClass
::
threshold
)
.
def_readwrite
(
"min_silence_duration"
,
&
PyClass
::
min_silence_duration
)
.
def_readwrite
(
"min_speech_duration"
,
&
PyClass
::
min_speech_duration
)
.
def_readwrite
(
"window_size"
,
&
PyClass
::
window_size
)
.
def_readwrite
(
"max_speech_duration"
,
&
PyClass
::
max_speech_duration
)
.
def
(
"__str__"
,
&
PyClass
::
ToString
)
.
def
(
"validate"
,
&
PyClass
::
Validate
);
}
...
...
请
注册
或
登录
后发表评论