Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2025-05-10 00:50:40 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2025-05-10 00:50:40 +0800
Commit
53518efd2fe70f49b86f180a4e5b49fdc374da82
53518efd
1 parent
4a833a75
Add real-time speech recognition example for SenseVoice. (#2197)
显示空白字符变更
内嵌
并排对比
正在显示
2 个修改的文件
包含
242 行增加
和
2 行删除
python-api-examples/simulate-streaming-sense-voice-microphone.py
sherpa-onnx/python/sherpa_onnx/display.py
python-api-examples/simulate-streaming-sense-voice-microphone.py
0 → 100755
查看文件 @
53518ef
#!/usr/bin/env python3
#
# Copyright (c) 2025 Xiaomi Corporation
"""
This file demonstrates how to use sherpa-onnx Python APIs
with VAD and non-streaming SenseVoice for real-time speech recognition
from a microphone.
Usage:

wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx

./python-api-examples/simulate-streaming-sense-voice-microphone.py \
  --silero-vad-model=./silero_vad.onnx \
  --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \
  --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt
"""
import
argparse
import
queue
import
sys
import
threading
import
time
from
pathlib
import
Path
import
numpy
as
np
# sounddevice is a third-party package; guard the import so that a missing
# installation produces installation instructions before the script exits.
try:
    import sounddevice as sd
except ImportError:
    print("Please install sounddevice first. You can use")
    print()
    print(" pip install sounddevice")
    print()
    print("to install it")
    # Exit with a non-zero status: nothing below can run without sounddevice.
    sys.exit(-1)
import
sherpa_onnx
# Stop flag. It is set to True by the Ctrl + C handler at the bottom of this
# file and polled by both the recording loop and the main recognition loop.
killed = False

# The thread running start_recording(); it is assigned in main().
recording_thread = None

sample_rate = 16000  # Please don't change it

# Queue of audio chunks recorded from the microphone. start_recording()
# puts chunks into it and main() takes them out.
samples_queue = queue.Queue()
def get_args():
    """Parse the command-line options of this example.

    Returns:
      The ``argparse.Namespace`` obtained by parsing ``sys.argv``. Only
      ``--silero-vad-model`` is mandatory; ``--tokens`` has no default, so
      it is ``None`` when omitted, and the remaining string options default
      to the empty string.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # (flag, keyword arguments) pairs, listed in the order in which they are
    # registered and therefore shown by --help.
    option_specs = (
        (
            "--silero-vad-model",
            {
                "type": str,
                "required": True,
                "help": "Path to silero_vad.onnx",
            },
        ),
        (
            "--tokens",
            {
                "type": str,
                "help": "Path to tokens.txt",
            },
        ),
        (
            "--sense-voice",
            {
                "default": "",
                "type": str,
                "help": "Path to the model.onnx from SenseVoice",
            },
        ),
        (
            "--num-threads",
            {
                "type": int,
                "default": 1,
                "help": "Number of threads for neural network computation",
            },
        ),
        (
            "--hr-dict-dir",
            {
                "type": str,
                "default": "",
                "help": "If not empty, it is the jieba dict directory for homophone replacer",
            },
        ),
        (
            "--hr-lexicon",
            {
                "type": str,
                "default": "",
                "help": "If not empty, it is the lexicon.txt for homophone replacer",
            },
        ),
        (
            "--hr-rule-fsts",
            {
                "type": str,
                "default": "",
                "help": "If not empty, it is the replace.fst for homophone replacer",
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
def assert_file_exists(filename: str):
    """Check that ``filename`` refers to an existing regular file.

    Args:
      filename:
        Path of the file to check.

    Raises:
      AssertionError:
        If ``filename`` is not an existing file. The error is raised
        explicitly instead of through an ``assert`` statement so that the
        check is still performed when Python runs with ``-O``, which strips
        ``assert`` statements.
    """
    if not Path(filename).is_file():
        raise AssertionError(
            f"{filename} does not exist!\n"
            "Please refer to "
            "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
        )
def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
    """Create the non-streaming SenseVoice recognizer.

    Args:
      args:
        Parsed command-line options. The attributes ``sense_voice``,
        ``tokens``, ``num_threads``, ``hr_dict_dir``, ``hr_rule_fsts`` and
        ``hr_lexicon`` are read.

    Returns:
      The object returned by
      ``sherpa_onnx.OfflineRecognizer.from_sense_voice``.
    """
    model_path = args.sense_voice
    assert_file_exists(model_path)

    # Inverse text normalization and debug output are both switched off.
    options = {
        "model": model_path,
        "tokens": args.tokens,
        "num_threads": args.num_threads,
        "use_itn": False,
        "debug": False,
        "hr_dict_dir": args.hr_dict_dir,
        "hr_rule_fsts": args.hr_rule_fsts,
        "hr_lexicon": args.hr_lexicon,
    }
    return sherpa_onnx.OfflineRecognizer.from_sense_voice(**options)
def start_recording():
    """Record audio in a loop and push each chunk onto ``samples_queue``.

    The loop runs until the module-level ``killed`` flag becomes true. It is
    used as the target of the recording thread created in main().
    """
    # You can use any value you like for the chunk size
    chunk_size = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    with sd.InputStream(
        channels=1, dtype="float32", samplerate=sample_rate
    ) as mic:
        while True:
            if killed:
                break

            frames, _ = mic.read(chunk_size)  # a blocking read

            # Flatten to a 1-D array and queue a copy of it; presumably the
            # copy keeps the queued chunk independent of the array returned
            # by read().
            samples_queue.put(np.copy(frames.reshape(-1)))
def main():
    """Recognize speech from the microphone until ``killed`` is set.

    The function

      1. lists the audio devices and prints the default input device,
      2. parses the options, checks that the model files exist and creates
         the SenseVoice recognizer and the silero VAD,
      3. starts start_recording() in a background thread, and
      4. loops over the recorded chunks: every chunk is appended to a buffer
         and the buffer is fed to the VAD in pieces of ``window_size``
         samples. While speech is in progress the whole buffer is decoded
         again whenever more than 0.2 seconds have passed since the last
         decode, which gives a partial result. When the VAD outputs a
         finished segment, that segment is decoded and the sentence is
         finalized on the display.
    """
    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number
    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    args = get_args()
    assert_file_exists(args.tokens)
    assert_file_exists(args.silero_vad_model)

    assert args.num_threads > 0, args.num_threads

    print("Creating recognizer. Please wait...")
    recognizer = create_recognizer(args)

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.silero_vad.min_silence_duration = 0.25
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)

    print("Started! Please speak")

    # Samples of the utterance currently being recorded (plus a short
    # history of the audio that precedes it).
    buffer = []
    # Index into ``buffer`` of the first sample not yet given to the VAD.
    offset = 0

    global recording_thread
    recording_thread = threading.Thread(target=start_recording)
    recording_thread.start()

    display = sherpa_onnx.Display()

    started = False
    started_time = None

    while not killed:
        samples = samples_queue.get()  # a blocking read

        buffer = np.concatenate([buffer, samples])

        # Feed the VAD from the accumulated buffer, not from the current
        # chunk alone. A chunk is in general not a multiple of window_size,
        # so feeding per chunk would drop the tail of every chunk and hand
        # the VAD audio with gaps. The leftover samples stay in the buffer
        # and are consumed once the next chunk arrives.
        while offset + window_size <= len(buffer):
            vad.accept_waveform(buffer[offset : offset + window_size])
            if not started and vad.is_speech_detected():
                started = True
                started_time = time.time()
            offset += window_size

        if not started:
            # No speech yet: keep only the most recent 10 windows and shift
            # ``offset`` by the number of samples removed from the front.
            dropped = len(buffer) - 10 * window_size
            if dropped > 0:
                offset -= dropped
                buffer = buffer[-10 * window_size :]

        if started and time.time() - started_time > 0.2:
            # Partial result: decode everything recorded so far.
            stream = recognizer.create_stream()
            stream.accept_waveform(sample_rate, buffer)
            recognizer.decode_stream(stream)
            text = stream.result.text.strip()
            if text:
                display.update_text(text)
                display.display()

            started_time = time.time()

        while not vad.empty():
            # In general, this while loop is executed only once
            stream = recognizer.create_stream()
            stream.accept_waveform(sample_rate, vad.front.samples)

            vad.pop()
            recognizer.decode_stream(stream)

            text = stream.result.text.strip()

            display.update_text(text)

            # The segment is finished; start over with an empty buffer.
            buffer = []
            offset = 0
            started = False
            started_time = None

            display.finalize_current_sentence()
            display.display()
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
    finally:
        # Always stop the recording thread, not only on Ctrl + C. It is a
        # non-daemon thread that loops until ``killed`` is true, so leaving
        # it running after any other exception from main() would keep the
        # process alive forever.
        killed = True
        if recording_thread is not None:
            recording_thread.join()
...
...
sherpa-onnx/python/sherpa_onnx/display.py
查看文件 @
53518ef
# Copyright (c) 2025 Xiaomi Corporation
import
os
from
time
import
gm
time
,
strftime
from
time
import
local
time
,
strftime
def get_current_time():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    return strftime("%Y-%m-%d %H:%M:%S", localtime())
def
clear_console
():
...
...
请
注册
或
登录
后发表评论