Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-09-22 11:53:47 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-09-22 11:53:47 +0800
Commit
969fff56229209e704f573dd209b4085a6f31b8a
969fff56
1 parent
cf199ad4
Add VAD + Non-streaming ASR Python example. (#332)
隐藏空白字符变更
内嵌
并排对比
正在显示
2 个修改的文件
包含
340 行增加
和
0 行删除
python-api-examples/README.md
python-api-examples/vad-with-non-streaming-asr.py
python-api-examples/README.md
查看文件 @
969fff5
...
...
@@ -7,3 +7,6 @@
- [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses
  [silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech
  segments and concatenate all speech segments into a single one.
- [vad-with-non-streaming-asr.py](./vad-with-non-streaming-asr.py) It shows
  how to use VAD with a non-streaming ASR model for speech recognition from
  a microphone.
...
...
python-api-examples/vad-with-non-streaming-asr.py
0 → 100755
查看文件 @
969fff5
#!/usr/bin/env python3
#
# Copyright (c) 2023 Xiaomi Corporation
"""
This file demonstrates how to use sherpa-onnx Python APIs
with VAD and non-streaming ASR models for speech recognition
from a microphone.
Note that you need a non-streaming model for this script.
(1) For paraformer
./python-api-examples/vad-with-non-streaming-asr.py
\
--silero-vad-model=/path/to/silero_vad.onnx
\
--tokens=/path/to/tokens.txt
\
--paraformer=/path/to/paraformer.onnx
\
--num-threads=2
\
--decoding-method=greedy_search
\
--debug=false
\
--sample-rate=16000
\
--feature-dim=80
(2) For transducer models from icefall
./python-api-examples/vad-with-non-streaming-asr.py
\
--silero-vad-model=/path/to/silero_vad.onnx
\
--tokens=/path/to/tokens.txt
\
--encoder=/path/to/encoder.onnx
\
--decoder=/path/to/decoder.onnx
\
--joiner=/path/to/joiner.onnx
\
--num-threads=2
\
--decoding-method=greedy_search
\
--debug=false
\
--sample-rate=16000
\
--feature-dim=80
(3) For Whisper models
./python-api-examples/vad-with-non-streaming-asr.py
\
--silero-vad-model=/path/to/silero_vad.onnx
\
--whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx
\
--whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx
\
--tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt
\
--whisper-task=transcribe
\
--num-threads=2
Please refer to
https://k2-fsa.github.io/sherpa/onnx/index.html
to install sherpa-onnx and to download non-streaming pre-trained models
used in this file.
Please visit
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
to download silero_vad.onnx
For instance,
wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
"""
import
argparse
import
sys
from
pathlib
import
Path
import
numpy
as
np
try
:
import
sounddevice
as
sd
except
ImportError
:
print
(
"Please install sounddevice first. You can use"
)
print
()
print
(
" pip install sounddevice"
)
print
()
print
(
"to install it"
)
sys
.
exit
(
-
1
)
import
sherpa_onnx
def _str2bool(value):
    """Convert a command-line string into a real ``bool``.

    ``argparse`` passes the raw option string to ``type``. Using ``bool``
    directly is a trap: ``bool("false")`` is ``True`` because every
    non-empty string is truthy. This converter maps the usual spellings
    explicitly instead.

    Args:
      value: The raw option value (or an actual ``bool``).

    Returns:
      The parsed boolean.

    Raises:
      argparse.ArgumentTypeError: If ``value`` is not a recognized spelling.
    """
    if isinstance(value, bool):
        return value

    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string stays False, matching what bool("") used to give.
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False

    raise argparse.ArgumentTypeError(f"Expected a boolean value, got: {value!r}")


def get_args():
    """Parse the command-line options of this script.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``. Model path
      options default to the empty string; ``--tokens`` has no default and
      is ``None`` when omitted.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    parser.add_argument(
        "--tokens",
        type=str,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--encoder",
        default="",
        type=str,
        help="Path to the transducer encoder model",
    )

    parser.add_argument(
        "--decoder",
        default="",
        type=str,
        help="Path to the transducer decoder model",
    )

    parser.add_argument(
        "--joiner",
        default="",
        type=str,
        help="Path to the transducer joiner model",
    )

    parser.add_argument(
        "--paraformer",
        default="",
        type=str,
        help="Path to the model.onnx from Paraformer",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )

    parser.add_argument(
        "--whisper-encoder",
        default="",
        type=str,
        help="Path to whisper encoder model",
    )

    parser.add_argument(
        "--whisper-decoder",
        default="",
        type=str,
        help="Path to whisper decoder model",
    )

    parser.add_argument(
        "--whisper-language",
        default="",
        type=str,
        help="""It specifies the spoken language in the input file.
        Example values: en, fr, de, zh, jp.
        Available languages for multilingual models can be found at
        https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
        If not specified, we infer the language from the input audio file.
        """,
    )

    parser.add_argument(
        "--whisper-task",
        default="transcribe",
        choices=["transcribe", "translate"],
        type=str,
        help="""For multilingual models, if you specify translate, the output
        will be in English.
        """,
    )

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Valid values are greedy_search and modified_beam_search.
        modified_beam_search is valid only for transducer models.
        """,
    )

    parser.add_argument(
        "--debug",
        # Not type=bool: that would turn "--debug=false" into True.
        type=_str2bool,
        default=False,
        help="True to show debug messages when loading modes.",
    )

    parser.add_argument(
        "--sample-rate",
        type=int,
        default=16000,
        help="""Sample rate of the feature extractor. Must match the one
        expected by the model.""",
    )

    parser.add_argument(
        "--feature-dim",
        type=int,
        default=80,
        help="Feature dimension. Must match the one expected by the model",
    )

    return parser.parse_args()
def assert_file_exists(filename: str) -> None:
    """Fail with an ``AssertionError`` unless ``filename`` is an existing file.

    The check is an explicit ``raise`` instead of an ``assert`` statement so
    that it still runs when Python is started with ``-O``, which removes
    ``assert`` statements.

    Args:
      filename: Path of the file that must exist.

    Raises:
      AssertionError: If ``Path(filename).is_file()`` is false. The message
        names the file and points to the pre-trained model download page.
    """
    if Path(filename).is_file():
        return

    raise AssertionError(
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
    """Build the offline recognizer selected by the command-line options.

    The model family is picked by the first non-empty option, checked in this
    order: ``args.encoder`` (transducer), ``args.paraformer``,
    ``args.whisper_encoder``.

    Args:
      args: The namespace returned by ``get_args()``.

    Returns:
      The recognizer created by the matching
      ``sherpa_onnx.OfflineRecognizer.from_*`` factory.

    Raises:
      ValueError: If none of the three selecting options is set.
      AssertionError: If options of another model family are given together
        with the selected one, or if a required model file is missing.
    """
    if args.encoder:
        # A transducer excludes the paraformer and whisper options.
        for option in ("paraformer", "whisper_encoder", "whisper_decoder"):
            other = getattr(args, option)
            assert len(other) == 0, other

        for option in ("encoder", "decoder", "joiner"):
            assert_file_exists(getattr(args, option))

        return sherpa_onnx.OfflineRecognizer.from_transducer(
            encoder=args.encoder,
            decoder=args.decoder,
            joiner=args.joiner,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            debug=args.debug,
        )

    if args.paraformer:
        # A paraformer excludes the whisper options.
        for option in ("whisper_encoder", "whisper_decoder"):
            other = getattr(args, option)
            assert len(other) == 0, other

        assert_file_exists(args.paraformer)

        return sherpa_onnx.OfflineRecognizer.from_paraformer(
            paraformer=args.paraformer,
            tokens=args.tokens,
            num_threads=args.num_threads,
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
            debug=args.debug,
        )

    if args.whisper_encoder:
        for option in ("whisper_encoder", "whisper_decoder"):
            assert_file_exists(getattr(args, option))

        return sherpa_onnx.OfflineRecognizer.from_whisper(
            encoder=args.whisper_encoder,
            decoder=args.whisper_decoder,
            tokens=args.tokens,
            num_threads=args.num_threads,
            decoding_method=args.decoding_method,
            debug=args.debug,
            language=args.whisper_language,
            task=args.whisper_task,
        )

    raise ValueError("Please specify at least one model")
def main():
    """Recognize speech from the default microphone, one VAD entry at a time.

    Steps, in order: list the audio devices, parse and validate the
    command-line options, create the recognizer and the voice activity
    detector, then loop forever reading 100 ms chunks from the microphone.
    Audio is fed to the VAD in ``window_size`` slices; every entry the VAD
    queues is decoded and its non-empty text is printed with a running index.

    The loop has no exit condition of its own; the script's entry point
    stops it on Ctrl + C.
    """
    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number
    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    args = get_args()
    for required_file in (args.tokens, args.silero_vad_model):
        assert_file_exists(required_file)

    assert args.num_threads > 0, args.num_threads
    assert (
        args.sample_rate == 16000
    ), f"Only sample rate 16000 is supported.Given: {args.sample_rate}"

    print("Creating recognizer. Please wait...")
    recognizer = create_recognizer(args)

    vad_config = sherpa_onnx.VadModelConfig()
    vad_config.silero_vad.model = args.silero_vad_model
    vad_config.silero_vad.min_silence_duration = 0.25
    vad_config.sample_rate = args.sample_rate

    # Number of samples handed to the VAD per call.
    window_size = vad_config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100)

    # 0.1 second = 100 ms
    samples_per_read = int(0.1 * args.sample_rate)

    print("Started! Please speak")

    pending = []  # samples read from the microphone but not yet given to the VAD
    texts = []  # every non-empty result so far; its length is the next index

    microphone = sd.InputStream(
        channels=1, dtype="float32", samplerate=args.sample_rate
    )
    with microphone as mic:
        while True:
            chunk, _ = mic.read(samples_per_read)  # a blocking read
            pending = np.concatenate([pending, chunk.reshape(-1)])

            # Feed the VAD fixed-size windows; the remainder waits for the
            # next read.
            while len(pending) > window_size:
                vad.accept_waveform(pending[:window_size])
                pending = pending[window_size:]

            # Decode everything the VAD has queued so far.
            while not vad.empty():
                asr_stream = recognizer.create_stream()
                asr_stream.accept_waveform(args.sample_rate, vad.front.samples)
                vad.pop()

                recognizer.decode_stream(asr_stream)

                text = asr_stream.result.text.strip().lower()
                if not text:
                    continue

                idx = len(texts)
                texts.append(text)
                print(f"{idx}: {text}")
if __name__ == "__main__":
    # main() loops forever; Ctrl + C is the expected way to stop it, so exit
    # with a short message instead of a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
...
...
请
注册
或
登录
后发表评论