Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2025-05-11 16:30:38 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2025-05-11 16:30:38 +0800
Commit
b269e5cccc03c3e61dd9f363bb7b64b5b0ce6c3a
b269e5cc
1 parent
028b8f27
Add C++ example for real-time ASR with nvidia/parakeet-tdt-0.6b-v2. (#2201)
隐藏空白字符变更
内嵌
并排对比
正在显示
3 个修改的文件
包含
297 行增加
和
1 行删除
cxx-api-examples/CMakeLists.txt
cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
cxx-api-examples/sherpa-display.h
cxx-api-examples/CMakeLists.txt
查看文件 @
b269e5c
...
...
@@ -36,6 +36,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
sherpa-onnx-cxx-api
portaudio_static
)
# Microphone example that simulates streaming ASR with parakeet-tdt.
# It compiles the shared microphone helper from sherpa-onnx/csrc directly
# into the executable and links against the C++ API and static PortAudio.
add_executable(parakeet-tdt-simulate-streaming-microphone-cxx-api
  ./parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
  ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
)
target_link_libraries(parakeet-tdt-simulate-streaming-microphone-cxx-api
  sherpa-onnx-cxx-api
  portaudio_static
)
endif
()
# Example executable built from sense-voice-with-hr-cxx-api.cc.
add_executable(sense-voice-with-hr-cxx-api
  ./sense-voice-with-hr-cxx-api.cc
)
...
...
cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
0 → 100644
查看文件 @
b269e5c
// cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
// Copyright (c) 2025 Xiaomi Corporation
//
// This file demonstrates how to use parakeet-tdt with sherpa-onnx's C++ API
// for streaming speech recognition from a microphone.
//
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
// tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
// rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
//
// clang-format on
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <atomic>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <iostream>
#include <mutex> // NOLINT
#include <queue>
#include <vector>

#include "portaudio.h" // NOLINT
#include "sherpa-display.h" // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/microphone.h"
// Audio chunks handed over from the recording callback to the main loop.
// Every access in this file happens while holding `mutex`.
std::queue<std::vector<float>> samples_queue;

// Signalled whenever a chunk is pushed or shutdown is requested.
std::condition_variable condition_variable;

// Protects `samples_queue`.
std::mutex mutex;

// Shutdown request flag. It is written by the SIGINT handler and read by the
// recording callback and the main loop, i.e. from different threads, so it
// must be atomic to avoid a data race. Plain reads/writes (`stop = true`,
// `!stop`, `stop ? a : b`) keep working through std::atomic's conversions.
std::atomic<bool> stop{false};
// Signal handler installed for SIGINT.
//
// Raises the global `stop` flag, wakes one thread blocked on
// `condition_variable`, and prints a notice to stderr.
//
// NOTE(review): notifying a condition variable and writing through stdio are
// not on the POSIX async-signal-safe list - confirm this is acceptable for an
// example program.
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();

  fputs("\nCaught Ctrl + C. Exiting...\n", stderr);
}
// PortAudio stream callback.
//
// Copies the `frames_per_buffer` float samples found at `input_buffer` into a
// new vector at the back of `samples_queue` (under `mutex`) and wakes one
// waiter. All other callback arguments are ignored.
//
// Returns paComplete once `stop` has been set, paContinue otherwise.
static int32_t RecordCallback(const void *input_buffer,
                              void * /*output_buffer*/,
                              unsigned long frames_per_buffer,  // NOLINT
                              const PaStreamCallbackTimeInfo * /*time_info*/,
                              PaStreamCallbackFlags /*status_flags*/,
                              void * /*user_data*/) {
  const float *first = reinterpret_cast<const float *>(input_buffer);
  const float *last = first + frames_per_buffer;

  std::lock_guard<std::mutex> guard(mutex);
  samples_queue.emplace(first, last);
  condition_variable.notify_one();

  if (stop) {
    return paComplete;
  }
  return paContinue;
}
// Builds the voice activity detector used to cut the microphone stream into
// speech segments.
//
// Loads ./silero_vad.onnx with fixed thresholds for 16 kHz input. If the
// detector cannot be created, prints an error and terminates the process
// with exit(-1).
static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  VadModelConfig vad_config;
  vad_config.sample_rate = 16000;
  vad_config.debug = false;

  auto &silero = vad_config.silero_vad;
  silero.model = "./silero_vad.onnx";
  silero.threshold = 0.5;
  silero.min_silence_duration = 0.25;
  silero.min_speech_duration = 0.25;
  silero.max_speech_duration = 5;

  // NOTE(review): the second argument is presumably the VAD's internal buffer
  // size in seconds - confirm against the cxx-api header.
  VoiceActivityDetector detector = VoiceActivityDetector::Create(vad_config, 60);
  if (!detector.Get()) {
    std::cerr << "Failed to create VAD. Please check your config\n";
    exit(-1);
  }

  return detector;
}
// Builds the offline (non-streaming) recognizer for the int8
// parakeet-tdt-0.6b-v2 transducer located in
// ./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/.
//
// Prints progress to stdout. If the recognizer cannot be created, prints an
// error and terminates the process with exit(-1).
static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  OfflineRecognizerConfig asr_config;

  auto &model = asr_config.model_config;
  model.model_type = "nemo_transducer";
  model.num_threads = 2;
  model.debug = false;
  model.tokens = "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/tokens.txt";

  auto &transducer = model.transducer;
  transducer.encoder =
      "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/encoder.int8.onnx";
  transducer.decoder =
      "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/decoder.int8.onnx";
  transducer.joiner =
      "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/joiner.int8.onnx";

  std::cout << "Loading model\n";
  OfflineRecognizer asr = OfflineRecognizer::Create(asr_config);
  if (!asr.Get()) {
    std::cerr << "Please check your config\n";
    exit(-1);
  }
  std::cout << "Loading model done\n";

  return asr;
}
int32_t
main
()
{
signal
(
SIGINT
,
Handler
);
using
namespace
sherpa_onnx
::
cxx
;
// NOLINT
auto
vad
=
CreateVad
();
auto
recognizer
=
CreateOfflineRecognizer
();
sherpa_onnx
::
Microphone
mic
;
PaDeviceIndex
num_devices
=
Pa_GetDeviceCount
();
std
::
cout
<<
"Num devices: "
<<
num_devices
<<
"
\n
"
;
if
(
num_devices
==
0
)
{
std
::
cerr
<<
" If you are using Linux, please try "
"./build/bin/sense-voice-simulate-streaming-alsa-cxx-api
\n
"
;
return
-
1
;
}
int32_t
device_index
=
Pa_GetDefaultInputDevice
();
const
char
*
pDeviceIndex
=
std
::
getenv
(
"SHERPA_ONNX_MIC_DEVICE"
);
if
(
pDeviceIndex
)
{
fprintf
(
stderr
,
"Use specified device: %s
\n
"
,
pDeviceIndex
);
device_index
=
atoi
(
pDeviceIndex
);
}
for
(
int32_t
i
=
0
;
i
!=
num_devices
;
++
i
)
{
const
PaDeviceInfo
*
info
=
Pa_GetDeviceInfo
(
i
);
fprintf
(
stderr
,
" %s %d %s
\n
"
,
(
i
==
device_index
)
?
"*"
:
" "
,
i
,
info
->
name
);
}
PaStreamParameters
param
;
param
.
device
=
device_index
;
fprintf
(
stderr
,
"Use device: %d
\n
"
,
param
.
device
);
const
PaDeviceInfo
*
info
=
Pa_GetDeviceInfo
(
param
.
device
);
fprintf
(
stderr
,
" Name: %s
\n
"
,
info
->
name
);
fprintf
(
stderr
,
" Max input channels: %d
\n
"
,
info
->
maxInputChannels
);
param
.
channelCount
=
1
;
param
.
sampleFormat
=
paFloat32
;
param
.
suggestedLatency
=
info
->
defaultLowInputLatency
;
param
.
hostApiSpecificStreamInfo
=
nullptr
;
float
mic_sample_rate
=
16000
;
const
char
*
sample_rate_str
=
std
::
getenv
(
"SHERPA_ONNX_MIC_SAMPLE_RATE"
);
if
(
sample_rate_str
)
{
fprintf
(
stderr
,
"Use sample rate %f for mic
\n
"
,
mic_sample_rate
);
mic_sample_rate
=
atof
(
sample_rate_str
);
}
float
sample_rate
=
16000
;
LinearResampler
resampler
;
if
(
mic_sample_rate
!=
sample_rate
)
{
float
min_freq
=
std
::
min
(
mic_sample_rate
,
sample_rate
);
float
lowpass_cutoff
=
0.99
*
0.5
*
min_freq
;
int32_t
lowpass_filter_width
=
6
;
resampler
=
LinearResampler
::
Create
(
mic_sample_rate
,
sample_rate
,
lowpass_cutoff
,
lowpass_filter_width
);
}
PaStream
*
stream
;
PaError
err
=
Pa_OpenStream
(
&
stream
,
&
param
,
nullptr
,
/* &outputParameters, */
mic_sample_rate
,
0
,
// frames per buffer
paClipOff
,
// we won't output out of range samples
// so don't bother clipping them
RecordCallback
,
// RecordCallback is run in a separate
// thread created by portaudio
nullptr
);
if
(
err
!=
paNoError
)
{
fprintf
(
stderr
,
"portaudio error: %s
\n
"
,
Pa_GetErrorText
(
err
));
exit
(
EXIT_FAILURE
);
}
err
=
Pa_StartStream
(
stream
);
if
(
err
!=
paNoError
)
{
fprintf
(
stderr
,
"portaudio error: %s
\n
"
,
Pa_GetErrorText
(
err
));
exit
(
EXIT_FAILURE
);
}
int32_t
window_size
=
512
;
// samples, please don't change
int32_t
offset
=
0
;
std
::
vector
<
float
>
buffer
;
bool
speech_started
=
false
;
auto
started_time
=
std
::
chrono
::
steady_clock
::
now
();
SherpaDisplay
display
;
std
::
cout
<<
"Started! Please speak
\n
"
;
while
(
!
stop
)
{
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex
);
while
(
samples_queue
.
empty
()
&&
!
stop
)
{
condition_variable
.
wait
(
lock
);
}
const
auto
&
s
=
samples_queue
.
front
();
if
(
!
resampler
.
Get
())
{
buffer
.
insert
(
buffer
.
end
(),
s
.
begin
(),
s
.
end
());
}
else
{
auto
resampled
=
resampler
.
Resample
(
s
.
data
(),
s
.
size
(),
false
);
buffer
.
insert
(
buffer
.
end
(),
resampled
.
begin
(),
resampled
.
end
());
}
samples_queue
.
pop
();
}
for
(;
offset
+
window_size
<
buffer
.
size
();
offset
+=
window_size
)
{
vad
.
AcceptWaveform
(
buffer
.
data
()
+
offset
,
window_size
);
if
(
!
speech_started
&&
vad
.
IsDetected
())
{
speech_started
=
true
;
started_time
=
std
::
chrono
::
steady_clock
::
now
();
}
}
if
(
!
speech_started
)
{
if
(
buffer
.
size
()
>
10
*
window_size
)
{
offset
-=
buffer
.
size
()
-
10
*
window_size
;
buffer
=
{
buffer
.
end
()
-
10
*
window_size
,
buffer
.
end
()};
}
}
auto
current_time
=
std
::
chrono
::
steady_clock
::
now
();
const
float
elapsed_seconds
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
milliseconds
>
(
current_time
-
started_time
)
.
count
()
/
1000.
;
if
(
speech_started
&&
elapsed_seconds
>
0.2
)
{
OfflineStream
stream
=
recognizer
.
CreateStream
();
stream
.
AcceptWaveform
(
sample_rate
,
buffer
.
data
(),
buffer
.
size
());
recognizer
.
Decode
(
&
stream
);
OfflineRecognizerResult
result
=
recognizer
.
GetResult
(
&
stream
);
display
.
UpdateText
(
result
.
text
);
display
.
Display
();
started_time
=
std
::
chrono
::
steady_clock
::
now
();
}
while
(
!
vad
.
IsEmpty
())
{
auto
segment
=
vad
.
Front
();
vad
.
Pop
();
OfflineStream
stream
=
recognizer
.
CreateStream
();
stream
.
AcceptWaveform
(
sample_rate
,
segment
.
samples
.
data
(),
segment
.
samples
.
size
());
recognizer
.
Decode
(
&
stream
);
OfflineRecognizerResult
result
=
recognizer
.
GetResult
(
&
stream
);
display
.
UpdateText
(
result
.
text
);
display
.
FinalizeCurrentSentence
();
display
.
Display
();
buffer
.
clear
();
offset
=
0
;
speech_started
=
false
;
}
}
err
=
Pa_CloseStream
(
stream
);
if
(
err
!=
paNoError
)
{
fprintf
(
stderr
,
"portaudio error: %s
\n
"
,
Pa_GetErrorText
(
err
));
exit
(
EXIT_FAILURE
);
}
return
0
;
}
...
...
cxx-api-examples/sherpa-display.h
查看文件 @
b269e5c
...
...
@@ -14,7 +14,8 @@ class SherpaDisplay {
// Stores `text` as the current text, overwriting the previous value.
void UpdateText(const std::string &text) { current_text_ = text; }
void
FinalizeCurrentSentence
()
{
if
(
!
current_text_
.
empty
()
&&
current_text_
[
0
]
!=
' '
)
{
if
(
!
current_text_
.
empty
()
&&
(
current_text_
[
0
]
!=
' '
||
current_text_
.
size
()
>
1
))
{
sentences_
.
push_back
({
GetCurrentDateTime
(),
std
::
move
(
current_text_
)});
}
}
...
...
请
注册
或
登录
后发表评论