Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2024-06-29 11:45:04 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2024-06-29 11:45:04 +0800
Commit
61c7eb3063242fe0590faeb09047e9712f87b7b1
61c7eb30
1 parent
8c4f576f
Support silero_vad version 5 (#1064)
显示空白字符变更
内嵌
并排对比
正在显示
6 个修改的文件
包含
203 行增加
和
50 行删除
CMakeLists.txt
nodejs-addon-examples/package.json
sherpa-onnx/csrc/silero-vad-model.cc
sherpa-onnx/csrc/silero-vad-model.h
sherpa-onnx/csrc/vad-model.h
sherpa-onnx/csrc/voice-activity-detector.cc
CMakeLists.txt
查看文件 @
61c7eb3
...
...
@@ -8,7 +8,7 @@ project(sherpa-onnx)
# ./nodejs-addon-examples
# ./dart-api-examples/
# ./sherpa-onnx/flutter/CHANGELOG.md
set
(
SHERPA_ONNX_VERSION
"1.10.
5
"
)
set
(
SHERPA_ONNX_VERSION
"1.10.
6
"
)
# Disable warning about
#
...
...
nodejs-addon-examples/package.json
查看文件 @
61c7eb3
{
"dependencies"
:
{
"sherpa-onnx-node"
:
"^1.10.
3
"
"sherpa-onnx-node"
:
"^1.10.
6
"
}
}
...
...
sherpa-onnx/csrc/silero-vad-model.cc
查看文件 @
61c7eb3
...
...
@@ -61,25 +61,11 @@ class SileroVadModel::Impl {
#endif
void
Reset
()
{
// 2 - number of LSTM layer
// 1 - batch size
// 64 - hidden dim
std
::
array
<
int64_t
,
3
>
shape
{
2
,
1
,
64
};
Ort
::
Value
h
=
Ort
::
Value
::
CreateTensor
<
float
>
(
allocator_
,
shape
.
data
(),
shape
.
size
());
Ort
::
Value
c
=
Ort
::
Value
::
CreateTensor
<
float
>
(
allocator_
,
shape
.
data
(),
shape
.
size
());
Fill
<
float
>
(
&
h
,
0
);
Fill
<
float
>
(
&
c
,
0
);
states_
.
clear
();
states_
.
reserve
(
2
);
states_
.
push_back
(
std
::
move
(
h
));
states_
.
push_back
(
std
::
move
(
c
));
if
(
is_v5_
)
{
ResetV5
();
}
else
{
ResetV4
();
}
triggered_
=
false
;
current_sample_
=
0
;
...
...
@@ -94,31 +80,7 @@ class SileroVadModel::Impl {
exit
(
-
1
);
}
auto
memory_info
=
Ort
::
MemoryInfo
::
CreateCpu
(
OrtDeviceAllocator
,
OrtMemTypeDefault
);
std
::
array
<
int64_t
,
2
>
x_shape
=
{
1
,
n
};
Ort
::
Value
x
=
Ort
::
Value
::
CreateTensor
(
memory_info
,
const_cast
<
float
*>
(
samples
),
n
,
x_shape
.
data
(),
x_shape
.
size
());
int64_t
sr_shape
=
1
;
Ort
::
Value
sr
=
Ort
::
Value
::
CreateTensor
(
memory_info
,
&
sample_rate_
,
1
,
&
sr_shape
,
1
);
std
::
array
<
Ort
::
Value
,
4
>
inputs
=
{
std
::
move
(
x
),
std
::
move
(
sr
),
std
::
move
(
states_
[
0
]),
std
::
move
(
states_
[
1
])};
auto
out
=
sess_
->
Run
({},
input_names_ptr_
.
data
(),
inputs
.
data
(),
inputs
.
size
(),
output_names_ptr_
.
data
(),
output_names_ptr_
.
size
());
states_
[
0
]
=
std
::
move
(
out
[
1
]);
states_
[
1
]
=
std
::
move
(
out
[
2
]);
float
prob
=
out
[
0
].
GetTensorData
<
float
>
()[
0
];
float
prob
=
Run
(
samples
,
n
);
float
threshold
=
config_
.
silero_vad
.
threshold
;
...
...
@@ -186,6 +148,8 @@ class SileroVadModel::Impl {
// Returns the window size stored in the silero_vad section of the config.
int32_t WindowSize() const {
  const auto &vad_config = config_.silero_vad;
  return vad_config.window_size;
}
// Returns WindowSize() reduced by window_shift_.
int32_t WindowShift() const {
  const int32_t size = WindowSize();
  return size - window_shift_;
}
// Accessor for min_silence_samples_.
int32_t MinSilenceDurationSamples() const {
  const auto &num_samples = min_silence_samples_;
  return num_samples;
}
// Accessor for min_speech_samples_.
int32_t MinSpeechDurationSamples() const {
  const auto &num_samples = min_speech_samples_;
  return num_samples;
}
...
...
@@ -205,12 +169,76 @@ class SileroVadModel::Impl {
GetInputNames
(
sess_
.
get
(),
&
input_names_
,
&
input_names_ptr_
);
GetOutputNames
(
sess_
.
get
(),
&
output_names_
,
&
output_names_ptr_
);
if
(
input_names_
.
size
()
==
4
&&
output_names_
.
size
()
==
3
)
{
is_v5_
=
false
;
}
else
if
(
input_names_
.
size
()
==
3
&&
output_names_
.
size
()
==
2
)
{
is_v5_
=
true
;
// 64 for 16kHz
// 32 for 8kHz
window_shift_
=
64
;
if
(
WindowSize
()
!=
512
)
{
SHERPA_ONNX_LOGE
(
"For silero_vad v5, we require window_size to be 512 for 16kHz"
);
exit
(
-
1
);
}
}
else
{
SHERPA_ONNX_LOGE
(
"Unsupported silero vad model"
);
exit
(
-
1
);
}
Check
();
Reset
();
}
void
Check
()
{
void
ResetV5
()
{
// 2 - number of LSTM layer
// 1 - batch size
// 128 - hidden dim
std
::
array
<
int64_t
,
3
>
shape
{
2
,
1
,
128
};
Ort
::
Value
s
=
Ort
::
Value
::
CreateTensor
<
float
>
(
allocator_
,
shape
.
data
(),
shape
.
size
());
Fill
<
float
>
(
&
s
,
0
);
states_
.
clear
();
states_
.
push_back
(
std
::
move
(
s
));
}
void
ResetV4
()
{
// 2 - number of LSTM layer
// 1 - batch size
// 64 - hidden dim
std
::
array
<
int64_t
,
3
>
shape
{
2
,
1
,
64
};
Ort
::
Value
h
=
Ort
::
Value
::
CreateTensor
<
float
>
(
allocator_
,
shape
.
data
(),
shape
.
size
());
Ort
::
Value
c
=
Ort
::
Value
::
CreateTensor
<
float
>
(
allocator_
,
shape
.
data
(),
shape
.
size
());
Fill
<
float
>
(
&
h
,
0
);
Fill
<
float
>
(
&
c
,
0
);
states_
.
clear
();
states_
.
reserve
(
2
);
states_
.
push_back
(
std
::
move
(
h
));
states_
.
push_back
(
std
::
move
(
c
));
}
// Dispatches to the model validation routine matching is_v5_.
void Check() const {
  if (!is_v5_) {
    CheckV4();
    return;
  }

  CheckV5();
}
void
CheckV4
()
const
{
if
(
input_names_
.
size
()
!=
4
)
{
SHERPA_ONNX_LOGE
(
"Expect 4 inputs. Given: %d"
,
static_cast
<
int32_t
>
(
input_names_
.
size
()));
...
...
@@ -262,6 +290,114 @@ class SileroVadModel::Impl {
}
}
// Validates that the loaded model exposes the interface expected for v5:
// three inputs named "input", "state", "sr" (in that order) and two outputs
// named "output", "stateN". On the first mismatch an error is logged and the
// process terminates via exit(-1).
void CheckV5() const {
  // ---- inputs ----
  const auto num_inputs = input_names_.size();
  if (num_inputs != 3) {
    SHERPA_ONNX_LOGE("Expect 3 inputs. Given: %d",
                     static_cast<int32_t>(num_inputs));
    exit(-1);
  }

  const auto &audio_name = input_names_[0];
  if (audio_name != "input") {
    SHERPA_ONNX_LOGE("Input[0]: %s. Expected: input", audio_name.c_str());
    exit(-1);
  }

  const auto &state_name = input_names_[1];
  if (state_name != "state") {
    SHERPA_ONNX_LOGE("Input[1]: %s. Expected: state", state_name.c_str());
    exit(-1);
  }

  const auto &rate_name = input_names_[2];
  if (rate_name != "sr") {
    SHERPA_ONNX_LOGE("Input[2]: %s. Expected: sr", rate_name.c_str());
    exit(-1);
  }

  // ---- outputs ----
  const auto num_outputs = output_names_.size();
  if (num_outputs != 2) {
    SHERPA_ONNX_LOGE("Expect 2 outputs. Given: %d",
                     static_cast<int32_t>(num_outputs));
    exit(-1);
  }

  const auto &prob_name = output_names_[0];
  if (prob_name != "output") {
    SHERPA_ONNX_LOGE("Output[0]: %s. Expected: output", prob_name.c_str());
    exit(-1);
  }

  const auto &next_state_name = output_names_[1];
  if (next_state_name != "stateN") {
    SHERPA_ONNX_LOGE("Output[1]: %s. Expected: stateN",
                     next_state_name.c_str());
    exit(-1);
  }
}
// Runs the model on `n` samples starting at `samples`, using the v5 or v4
// code path depending on is_v5_, and returns that path's result.
float Run(const float *samples, int32_t n) {
  return is_v5_ ? RunV5(samples, n) : RunV4(samples, n);
}
float
RunV5
(
const
float
*
samples
,
int32_t
n
)
{
auto
memory_info
=
Ort
::
MemoryInfo
::
CreateCpu
(
OrtDeviceAllocator
,
OrtMemTypeDefault
);
std
::
array
<
int64_t
,
2
>
x_shape
=
{
1
,
n
};
Ort
::
Value
x
=
Ort
::
Value
::
CreateTensor
(
memory_info
,
const_cast
<
float
*>
(
samples
),
n
,
x_shape
.
data
(),
x_shape
.
size
());
int64_t
sr_shape
=
1
;
Ort
::
Value
sr
=
Ort
::
Value
::
CreateTensor
(
memory_info
,
&
sample_rate_
,
1
,
&
sr_shape
,
1
);
std
::
array
<
Ort
::
Value
,
3
>
inputs
=
{
std
::
move
(
x
),
std
::
move
(
states_
[
0
]),
std
::
move
(
sr
)};
auto
out
=
sess_
->
Run
({},
input_names_ptr_
.
data
(),
inputs
.
data
(),
inputs
.
size
(),
output_names_ptr_
.
data
(),
output_names_ptr_
.
size
());
states_
[
0
]
=
std
::
move
(
out
[
1
]);
float
prob
=
out
[
0
].
GetTensorData
<
float
>
()[
0
];
return
prob
;
}
float
RunV4
(
const
float
*
samples
,
int32_t
n
)
{
auto
memory_info
=
Ort
::
MemoryInfo
::
CreateCpu
(
OrtDeviceAllocator
,
OrtMemTypeDefault
);
std
::
array
<
int64_t
,
2
>
x_shape
=
{
1
,
n
};
Ort
::
Value
x
=
Ort
::
Value
::
CreateTensor
(
memory_info
,
const_cast
<
float
*>
(
samples
),
n
,
x_shape
.
data
(),
x_shape
.
size
());
int64_t
sr_shape
=
1
;
Ort
::
Value
sr
=
Ort
::
Value
::
CreateTensor
(
memory_info
,
&
sample_rate_
,
1
,
&
sr_shape
,
1
);
std
::
array
<
Ort
::
Value
,
4
>
inputs
=
{
std
::
move
(
x
),
std
::
move
(
sr
),
std
::
move
(
states_
[
0
]),
std
::
move
(
states_
[
1
])};
auto
out
=
sess_
->
Run
({},
input_names_ptr_
.
data
(),
inputs
.
data
(),
inputs
.
size
(),
output_names_ptr_
.
data
(),
output_names_ptr_
.
size
());
states_
[
0
]
=
std
::
move
(
out
[
1
]);
states_
[
1
]
=
std
::
move
(
out
[
2
]);
float
prob
=
out
[
0
].
GetTensorData
<
float
>
()[
0
];
return
prob
;
}
private
:
VadModelConfig
config_
;
...
...
@@ -286,6 +422,10 @@ class SileroVadModel::Impl {
int32_t
current_sample_
=
0
;
int32_t
temp_start_
=
0
;
int32_t
temp_end_
=
0
;
int32_t
window_shift_
=
0
;
bool
is_v5_
=
false
;
};
SileroVadModel
::
SileroVadModel
(
const
VadModelConfig
&
config
)
...
...
@@ -306,6 +446,8 @@ bool SileroVadModel::IsSpeech(const float *samples, int32_t n) {
// Forwards to the implementation object's WindowSize().
int32_t SileroVadModel::WindowSize() const {
  const int32_t size = impl_->WindowSize();
  return size;
}
// Forwards to the implementation object's WindowShift().
int32_t SileroVadModel::WindowShift() const {
  const int32_t shift = impl_->WindowShift();
  return shift;
}
// Forwards to the implementation object's MinSilenceDurationSamples().
int32_t SileroVadModel::MinSilenceDurationSamples() const {
  const int32_t num_samples = impl_->MinSilenceDurationSamples();
  return num_samples;
}
...
...
sherpa-onnx/csrc/silero-vad-model.h
查看文件 @
61c7eb3
...
...
@@ -39,6 +39,11 @@ class SileroVadModel : public VadModel {
int32_t
WindowSize
()
const
override
;
// Number of samples the detector advances between consecutive windows.
// For silero vad V4, it is WindowSize().
// For silero vad V5, it is WindowSize()-64 for 16kHz and
// WindowSize()-32 for 8kHz
int32_t
WindowShift
()
const
override
;
int32_t
MinSilenceDurationSamples
()
const
override
;
int32_t
MinSpeechDurationSamples
()
const
override
;
...
...
sherpa-onnx/csrc/vad-model.h
查看文件 @
61c7eb3
...
...
@@ -40,6 +40,8 @@ class VadModel {
virtual
int32_t
WindowSize
()
const
=
0
;
virtual
int32_t
WindowShift
()
const
=
0
;
virtual
int32_t
MinSilenceDurationSamples
()
const
=
0
;
virtual
int32_t
MinSpeechDurationSamples
()
const
=
0
;
virtual
void
SetMinSilenceDuration
(
float
s
)
=
0
;
...
...
sherpa-onnx/csrc/voice-activity-detector.cc
查看文件 @
61c7eb3
...
...
@@ -38,16 +38,20 @@ class VoiceActivityDetector::Impl {
}
int32_t
window_size
=
model_
->
WindowSize
();
int32_t
window_shift
=
model_
->
WindowShift
();
// note n is usually window_size and there is no need to use
// an extra buffer here
last_
.
insert
(
last_
.
end
(),
samples
,
samples
+
n
);
int32_t
k
=
static_cast
<
int32_t
>
(
last_
.
size
())
/
window_size
;
// Note: For v4, window_shift == window_size
int32_t
k
=
(
static_cast
<
int32_t
>
(
last_
.
size
())
-
window_size
)
/
window_shift
+
1
;
const
float
*
p
=
last_
.
data
();
bool
is_speech
=
false
;
for
(
int32_t
i
=
0
;
i
!=
k
;
++
i
,
p
+=
window_size
)
{
buffer_
.
Push
(
p
,
window_size
);
for
(
int32_t
i
=
0
;
i
!=
k
;
++
i
,
p
+=
window_shift
)
{
buffer_
.
Push
(
p
,
window_shift
);
// NOTE(fangjun): Please don't use a very large n.
bool
this_window_is_speech
=
model_
->
IsSpeech
(
p
,
window_size
);
is_speech
=
is_speech
||
this_window_is_speech
;
...
...
请
注册
或
登录
后发表评论