Commit 8982984ea28fc452d83419b4b9e89e003357a02b (8982984e)

add a two-pass python example (#303)

Authored by Fangjun Kuang, 2023-09-10 17:56:13 +0800
Committed by GitHub, 2023-09-10 17:56:13 +0800
1 parent 86b18184

Showing 1 changed file with 423 additions and 0 deletions

python-api-examples/two-pass-speech-recognition-from-microphone.py (new file, mode 0 → 100755)
#!/usr/bin/env python3

# Two-pass real-time speech recognition from a microphone with sherpa-onnx
# Python API.
#
# The first pass uses a streaming model, which has two purposes:
#
# (1) Display a temporary result to users
#
# (2) Endpointing
#
# The second pass uses a non-streaming model. It has a higher recognition
# accuracy than the first-pass model, and its result is used as the final
# result.
#
# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
# to download pre-trained models.
"""
Usage examples:

(1) Chinese: Streaming zipformer (1st pass) + Non-streaming paraformer (2nd pass)

python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
  --first-encoder ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/encoder-epoch-99-avg-1.onnx \
  --first-decoder ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/decoder-epoch-99-avg-1.onnx \
  --first-joiner ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/joiner-epoch-99-avg-1.onnx \
  --first-tokens ./sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23/tokens.txt \
  \
  --second-paraformer ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \
  --second-tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt

(2) English: Streaming zipformer (1st pass) + Non-streaming whisper (2nd pass)

python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
  --first-encoder ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx \
  --first-decoder ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx \
  --first-joiner ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx \
  --first-tokens ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt \
  \
  --second-whisper-encoder ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx \
  --second-whisper-decoder ./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx \
  --second-tokens ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt
"""
import argparse
import sys
from pathlib import Path
from typing import List

import numpy as np

try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice first. You can use")
    print()
    print("  pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)

import sherpa_onnx
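
# sherpa_onnx provides both recognizer types used below: OnlineRecognizer
# drives the streaming first pass, OfflineRecognizer the non-streaming
# second pass.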

def assert_file_exists(filename: str, message: str):
    if not filename:
        raise ValueError(f"Please specify {message}")

    if not Path(filename).is_file():
        raise ValueError(f"{message} {filename} does not exist")

def add_first_pass_streaming_model_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--first-tokens",
        type=str,
        required=True,
        help="Path to tokens.txt for the first pass",
    )

    parser.add_argument(
        "--first-encoder",
        type=str,
        required=True,
        help="Path to the encoder model for the first pass",
    )

    parser.add_argument(
        "--first-decoder",
        type=str,
        required=True,
        help="Path to the decoder model for the first pass",
    )

    parser.add_argument(
        "--first-joiner",
        type=str,
        help="Path to the joiner model for the first pass",
    )

    parser.add_argument(
        "--first-decoding-method",
        type=str,
        default="greedy_search",
        help="""Decoding method for the first pass. Valid values are
        greedy_search and modified_beam_search""",
    )

    parser.add_argument(
        "--first-max-active-paths",
        type=int,
        default=4,
        help="""Used only when --first-decoding-method is modified_beam_search.
        It specifies the number of active paths to keep during decoding.
        """,
    )

def add_second_pass_transducer_model_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--second-encoder",
        default="",
        type=str,
        help="Path to the transducer encoder model for the second pass",
    )

    parser.add_argument(
        "--second-decoder",
        default="",
        type=str,
        help="Path to the transducer decoder model for the second pass",
    )

    parser.add_argument(
        "--second-joiner",
        default="",
        type=str,
        help="Path to the transducer joiner model for the second pass",
    )

def add_second_pass_paraformer_model_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--second-paraformer",
        default="",
        type=str,
        help="Path to the model.onnx for Paraformer for the second pass",
    )

def add_second_pass_nemo_ctc_model_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--second-nemo-ctc",
        default="",
        type=str,
        help="Path to the model.onnx for NeMo CTC for the second pass",
    )

def add_second_pass_whisper_model_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--second-whisper-encoder",
        default="",
        type=str,
        help="Path to whisper encoder model for the second pass",
    )

    parser.add_argument(
        "--second-whisper-decoder",
        default="",
        type=str,
        help="Path to whisper decoder model for the second pass",
    )

    parser.add_argument(
        "--second-whisper-language",
        default="",
        type=str,
        help="""It specifies the spoken language in the input audio file.
        Example values: en, fr, de, zh, ja.
        Available languages for multilingual models can be found at
        https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
        If not specified, we infer the language from the input audio file.
        """,
    )

    parser.add_argument(
        "--second-whisper-task",
        default="transcribe",
        choices=["transcribe", "translate"],
        type=str,
        help="""For multilingual models, if you specify translate, the output
        will be in English.
        """,
    )

def add_second_pass_non_streaming_model_args(parser: argparse.ArgumentParser):
    add_second_pass_transducer_model_args(parser)
    add_second_pass_nemo_ctc_model_args(parser)
    add_second_pass_paraformer_model_args(parser)
    add_second_pass_whisper_model_args(parser)

    parser.add_argument(
        "--second-tokens",
        type=str,
        help="Path to tokens.txt for the second pass",
    )
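
# Note: all second-pass model flags default to empty strings. Which model
# family actually runs is decided by whichever flag is non-empty; see
# check_second_pass_args() and create_second_pass_recognizer() below.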

def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="Valid values: cpu, cuda, coreml",
    )

    add_first_pass_streaming_model_args(parser)
    add_second_pass_non_streaming_model_args(parser)

    return parser.parse_args()

def check_first_pass_args(args):
    assert_file_exists(args.first_tokens, "--first-tokens")
    assert_file_exists(args.first_encoder, "--first-encoder")
    assert_file_exists(args.first_decoder, "--first-decoder")
    assert_file_exists(args.first_joiner, "--first-joiner")

def check_second_pass_args(args):
    assert_file_exists(args.second_tokens, "--second-tokens")

    if args.second_encoder:
        assert_file_exists(args.second_encoder, "--second-encoder")
        assert_file_exists(args.second_decoder, "--second-decoder")
        assert_file_exists(args.second_joiner, "--second-joiner")
    elif args.second_paraformer:
        assert_file_exists(args.second_paraformer, "--second-paraformer")
    elif args.second_nemo_ctc:
        assert_file_exists(args.second_nemo_ctc, "--second-nemo-ctc")
    elif args.second_whisper_encoder:
        assert_file_exists(args.second_whisper_encoder, "--second-whisper-encoder")
        assert_file_exists(args.second_whisper_decoder, "--second-whisper-decoder")
    else:
        raise ValueError("Please specify the model for the second pass")

def create_first_pass_recognizer(args):
    # Please replace the model files if needed.
    # See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    # for download links.
    recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=args.first_tokens,
        encoder=args.first_encoder,
        decoder=args.first_decoder,
        joiner=args.first_joiner,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method=args.first_decoding_method,
        max_active_paths=args.first_max_active_paths,
        provider=args.provider,
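        # Endpointing settings. A hedged summary of sherpa-onnx's endpoint
        # rules, based on its documentation: rule1 fires after 2.4 s of
        # trailing silence even if nothing was decoded, rule2 fires after
        # 1.2 s of trailing silence once some text has been decoded, and
        # rule3 fires when the utterance exceeds 20 s regardless of silence.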
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        rule3_min_utterance_length=20,
    )
    return recognizer

def create_second_pass_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
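    # The first model family whose flag is non-empty is used; the file paths
    # have already been validated by check_second_pass_args().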
    if args.second_encoder:
        recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
            encoder=args.second_encoder,
            decoder=args.second_decoder,
            joiner=args.second_joiner,
            tokens=args.second_tokens,
            sample_rate=16000,
            feature_dim=80,
            decoding_method="greedy_search",
            max_active_paths=4,
        )
    elif args.second_paraformer:
        recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
            paraformer=args.second_paraformer,
            tokens=args.second_tokens,
            num_threads=1,
            sample_rate=16000,
            feature_dim=80,
            decoding_method="greedy_search",
        )
    elif args.second_nemo_ctc:
        recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
            model=args.second_nemo_ctc,
            tokens=args.second_tokens,
            num_threads=1,
            sample_rate=16000,
            feature_dim=80,
            decoding_method="greedy_search",
        )
    elif args.second_whisper_encoder:
        recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
            encoder=args.second_whisper_encoder,
            decoder=args.second_whisper_decoder,
            tokens=args.second_tokens,
            num_threads=1,
            decoding_method="greedy_search",
            language=args.second_whisper_language,
            task=args.second_whisper_task,
        )
    else:
        raise ValueError("Please specify at least one model for the second pass")

    return recognizer
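
# Decode one complete utterance with the non-streaming recognizer: the
# buffered 100 ms chunks are concatenated into a single array, passed to an
# offline stream in one call, and decoded as a whole.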
def run_second_pass(
    recognizer: sherpa_onnx.OfflineRecognizer,
    sample_buffers: List[np.ndarray],
    sample_rate: int,
):
    stream = recognizer.create_stream()

    samples = np.concatenate(sample_buffers)
    stream.accept_waveform(sample_rate, samples)

    recognizer.decode_stream(stream)

    return stream.result.text

def main():
    args = get_args()
    check_first_pass_args(args)
    check_second_pass_args(args)

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number
    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    print("Creating recognizers. Please wait...")
    first_recognizer = create_first_pass_recognizer(args)
    second_recognizer = create_second_pass_recognizer(args)

    print("Started! Please speak")

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    stream = first_recognizer.create_stream()

    last_result = ""
    segment_id = 0

    sample_buffers = []
    with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
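        # Main capture loop: each iteration reads one 100 ms chunk, feeds it
        # to the streaming first pass, and keeps a copy in sample_buffers for
        # the second pass.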
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
            samples = samples.reshape(-1)
            stream.accept_waveform(sample_rate, samples)
            sample_buffers.append(samples)
            while first_recognizer.is_ready(stream):
                first_recognizer.decode_stream(stream)

            is_endpoint = first_recognizer.is_endpoint(stream)

            result = first_recognizer.get_result(stream)
            result = result.lower().strip()
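
            # "\r" returns the cursor to the start of the line; the previous
            # partial result is first blanked out with spaces so that a new,
            # shorter result does not leave stale characters on screen.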
            if last_result != result:
                print(
                    "\r{}:{}".format(segment_id, " " * len(last_result)),
                    end="",
                    flush=True,
                )
                last_result = result
                print("\r{}:{}".format(segment_id, result), end="", flush=True)
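
            # On an endpoint, re-decode the whole buffered utterance with the
            # more accurate non-streaming model; its text overwrites the
            # first-pass partial result and becomes the final output for this
            # segment.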
            if is_endpoint:
                if result:
                    result = run_second_pass(
                        recognizer=second_recognizer,
                        sample_buffers=sample_buffers,
                        sample_rate=sample_rate,
                    )
                    result = result.lower().strip()

                    sample_buffers = []
                    print(
                        "\r{}:{}".format(segment_id, " " * len(last_result)),
                        end="",
                        flush=True,
                    )
                    print("\r{}:{}".format(segment_id, result), flush=True)
                    segment_id += 1
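                # If the endpoint fired but nothing was decoded (silence or
                # noise), skip the second pass and just drop the buffered
                # audio so it does not leak into the next utterance.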
                else:
                    sample_buffers = []

                first_recognizer.reset(stream)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")