Fangjun Kuang
Committed by GitHub

Add C++ demo for VAD+non-streaming ASR (#1964)
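
For orientation, the heart of the new demo is a short loop: audio is fed to the silero VAD in fixed-size windows, and every speech segment the VAD emits is decoded with a non-streaming recognizer. Below is a condensed sketch of that flow (option parsing, resampling, the minimum-duration filter, and timing are omitted); the complete program is the new file sherpa-onnx/csrc/sherpa-onnx-vad-with-offline-asr.cc at the end of this diff.

#include <stdio.h>

#include <vector>

#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"

// Condensed sketch of the demo's main loop; see the full file below.
static void Run(const sherpa_onnx::OfflineRecognizerConfig &asr_config,
                const sherpa_onnx::VadModelConfig &vad_config,
                const std::vector<float> &samples /* 16 kHz, mono */) {
  sherpa_onnx::OfflineRecognizer recognizer(asr_config);
  sherpa_onnx::VoiceActivityDetector vad(vad_config);

  int32_t window_size = vad_config.silero_vad.window_size;
  for (int32_t i = 0; i + window_size < static_cast<int32_t>(samples.size());
       i += window_size) {
    // Feed one fixed-size window to the VAD.
    vad.AcceptWaveform(samples.data() + i, window_size);

    // Decode every speech segment the VAD has completed so far.
    while (!vad.Empty()) {
      const auto &segment = vad.Front();
      auto s = recognizer.CreateStream();
      s->AcceptWaveform(16000, segment.samples.data(), segment.samples.size());
      recognizer.DecodeStream(s.get());
      fprintf(stderr, "%s\n", s->GetResult().text.c_str());
      vad.Pop();
    }
  }
}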

@@ -64,6 +64,7 @@ def get_binaries():
         "sherpa-onnx-online-websocket-server",
         "sherpa-onnx-vad-microphone",
         "sherpa-onnx-vad-microphone-offline-asr",
+        "sherpa-onnx-vad-with-offline-asr",
     ]
 
     if enable_alsa():
@@ -452,6 +452,10 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
     microphone.cc
   )
 
+  add_executable(sherpa-onnx-vad-with-offline-asr
+    sherpa-onnx-vad-with-offline-asr.cc
+  )
+
   add_executable(sherpa-onnx-vad-microphone-offline-asr
     sherpa-onnx-vad-microphone-offline-asr.cc
     microphone.cc
@@ -475,6 +479,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
     sherpa-onnx-microphone-offline-audio-tagging
     sherpa-onnx-vad-microphone
     sherpa-onnx-vad-microphone-offline-asr
+    sherpa-onnx-vad-with-offline-asr
   )
   if(SHERPA_ONNX_ENABLE_TTS)
     list(APPEND exes
@@ -85,9 +85,8 @@ OnlineEbranchformerTransducerModel::OnlineEbranchformerTransducerModel(
   }
 }
 
-
 void OnlineEbranchformerTransducerModel::InitEncoder(void *model_data,
-                                                     size_t model_data_length) {
+                                                     size_t model_data_length) {
   encoder_sess_ = std::make_unique<Ort::Session>(
       env_, model_data, model_data_length, encoder_sess_opts_);
 
@@ -153,9 +152,8 @@ void OnlineEbranchformerTransducerModel::InitEncoder(void *model_data,
   }
 }
 
-
 void OnlineEbranchformerTransducerModel::InitDecoder(void *model_data,
-                                                     size_t model_data_length) {
+                                                     size_t model_data_length) {
   decoder_sess_ = std::make_unique<Ort::Session>(
       env_, model_data, model_data_length, decoder_sess_opts_);
 
@@ -180,7 +178,7 @@ void OnlineEbranchformerTransducerModel::InitDecoder(void *model_data,
 }
 
 void OnlineEbranchformerTransducerModel::InitJoiner(void *model_data,
-                                                    size_t model_data_length) {
+                                                    size_t model_data_length) {
   joiner_sess_ = std::make_unique<Ort::Session>(
       env_, model_data, model_data_length, joiner_sess_opts_);
 
@@ -200,7 +198,6 @@ void OnlineEbranchformerTransducerModel::InitJoiner(void *model_data,
   }
 }
 
-
 std::vector<Ort::Value> OnlineEbranchformerTransducerModel::StackStates(
     const std::vector<std::vector<Ort::Value>> &states) const {
   int32_t batch_size = static_cast<int32_t>(states.size());
@@ -215,28 +212,28 @@ std::vector<Ort::Value> OnlineEbranchformerTransducerModel::StackStates(
   ans.reserve(num_states);
 
   for (int32_t i = 0; i != num_hidden_layers_; ++i) {
-    {  // cached_key
+    {  // cached_key
       for (int32_t n = 0; n != batch_size; ++n) {
         buf[n] = &states[n][4 * i];
       }
       auto v = Cat(allocator, buf, /* axis */ 0);
       ans.push_back(std::move(v));
     }
-    {  // cached_value
+    {  // cached_value
       for (int32_t n = 0; n != batch_size; ++n) {
         buf[n] = &states[n][4 * i + 1];
       }
       auto v = Cat(allocator, buf, 0);
       ans.push_back(std::move(v));
     }
-    {  // cached_conv
+    {  // cached_conv
       for (int32_t n = 0; n != batch_size; ++n) {
         buf[n] = &states[n][4 * i + 2];
       }
       auto v = Cat(allocator, buf, 0);
       ans.push_back(std::move(v));
     }
-    {  // cached_conv_fusion
+    {  // cached_conv_fusion
       for (int32_t n = 0; n != batch_size; ++n) {
         buf[n] = &states[n][4 * i + 3];
       }
@@ -245,7 +242,7 @@ std::vector<Ort::Value> OnlineEbranchformerTransducerModel::StackStates(
     }
   }
 
-  {  // processed_lens
+  {  // processed_lens
     for (int32_t n = 0; n != batch_size; ++n) {
       buf[n] = &states[n][num_states - 1];
     }
@@ -256,11 +253,9 @@ std::vector<Ort::Value> OnlineEbranchformerTransducerModel::StackStates(
   return ans;
 }
 
-
 std::vector<std::vector<Ort::Value>>
 OnlineEbranchformerTransducerModel::UnStackStates(
     const std::vector<Ort::Value> &states) const {
-
   assert(static_cast<int32_t>(states.size()) == num_hidden_layers_ * 4 + 1);
 
   int32_t batch_size = states[0].GetTensorTypeAndShapeInfo().GetShape()[0];
@@ -272,7 +267,7 @@ OnlineEbranchformerTransducerModel::UnStackStates(
   ans.resize(batch_size);
 
   for (int32_t i = 0; i != num_hidden_layers_; ++i) {
-    {  // cached_key
+    {  // cached_key
       auto v = Unbind(allocator, &states[i * 4], /* axis */ 0);
       assert(static_cast<int32_t>(v.size()) == batch_size);
 
@@ -280,7 +275,7 @@ OnlineEbranchformerTransducerModel::UnStackStates(
         ans[n].push_back(std::move(v[n]));
       }
     }
-    {  // cached_value
+    {  // cached_value
       auto v = Unbind(allocator, &states[i * 4 + 1], 0);
       assert(static_cast<int32_t>(v.size()) == batch_size);
 
@@ -288,7 +283,7 @@ OnlineEbranchformerTransducerModel::UnStackStates(
         ans[n].push_back(std::move(v[n]));
       }
     }
-    {  // cached_conv
+    {  // cached_conv
       auto v = Unbind(allocator, &states[i * 4 + 2], 0);
       assert(static_cast<int32_t>(v.size()) == batch_size);
 
@@ -296,7 +291,7 @@ OnlineEbranchformerTransducerModel::UnStackStates(
         ans[n].push_back(std::move(v[n]));
       }
     }
-    {  // cached_conv_fusion
+    {  // cached_conv_fusion
       auto v = Unbind(allocator, &states[i * 4 + 3], 0);
       assert(static_cast<int32_t>(v.size()) == batch_size);
 
@@ -306,7 +301,7 @@ OnlineEbranchformerTransducerModel::UnStackStates(
     }
   }
 
-  {  // processed_lens
+  {  // processed_lens
     auto v = Unbind<int64_t>(allocator, &states.back(), 0);
     assert(static_cast<int32_t>(v.size()) == batch_size);
 
@@ -318,7 +313,6 @@ OnlineEbranchformerTransducerModel::UnStackStates(
   return ans;
 }
 
-
 std::vector<Ort::Value>
 OnlineEbranchformerTransducerModel::GetEncoderInitStates() {
   std::vector<Ort::Value> ans;
@@ -332,40 +326,37 @@ OnlineEbranchformerTransducerModel::GetEncoderInitStates() {
   int32_t channels_conv_fusion = 2 * hidden_size_;
 
   for (int32_t i = 0; i != num_hidden_layers_; ++i) {
-    {  // cached_key_{i}
+    {  // cached_key_{i}
       std::array<int64_t, 4> s{1, num_heads_, left_context_len_, head_dim_};
-      auto v =
-          Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
+      auto v = Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
       Fill(&v, 0);
       ans.push_back(std::move(v));
     }
 
-    {  // cahced_value_{i}
+    {  // cahced_value_{i}
       std::array<int64_t, 4> s{1, num_heads_, left_context_len_, head_dim_};
-      auto v =
-          Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
+      auto v = Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
       Fill(&v, 0);
       ans.push_back(std::move(v));
     }
 
-    {  // cached_conv_{i}
+    {  // cached_conv_{i}
       std::array<int64_t, 3> s{1, channels_conv, left_context_conv};
-      auto v =
-          Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
+      auto v = Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
       Fill(&v, 0);
       ans.push_back(std::move(v));
     }
 
-    {  // cached_conv_fusion_{i}
-      std::array<int64_t, 3> s{1, channels_conv_fusion, left_context_conv_fusion};
-      auto v =
-          Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
+    {  // cached_conv_fusion_{i}
+      std::array<int64_t, 3> s{1, channels_conv_fusion,
+                               left_context_conv_fusion};
+      auto v = Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
       Fill(&v, 0);
       ans.push_back(std::move(v));
     }
   }  // num_hidden_layers_
 
-  {  // processed_lens
+  {  // processed_lens
     std::array<int64_t, 1> s{1};
     auto v = Ort::Value::CreateTensor<int64_t>(allocator_, s.data(), s.size());
     Fill<int64_t>(&v, 0);
@@ -375,11 +366,10 @@ OnlineEbranchformerTransducerModel::GetEncoderInitStates() {
   return ans;
 }
 
-
 std::pair<Ort::Value, std::vector<Ort::Value>>
-OnlineEbranchformerTransducerModel::RunEncoder(Ort::Value features,
-                                               std::vector<Ort::Value> states,
-                                               Ort::Value /* processed_frames */) {
+OnlineEbranchformerTransducerModel::RunEncoder(
+    Ort::Value features, std::vector<Ort::Value> states,
+    Ort::Value /* processed_frames */) {
   std::vector<Ort::Value> encoder_inputs;
   encoder_inputs.reserve(1 + states.size());
 
@@ -402,7 +392,6 @@ OnlineEbranchformerTransducerModel::RunEncoder(Ort::Value features,
   return {std::move(encoder_out[0]), std::move(next_states)};
 }
 
-
 Ort::Value OnlineEbranchformerTransducerModel::RunDecoder(
     Ort::Value decoder_input) {
   auto decoder_out = decoder_sess_->Run(
@@ -411,9 +400,8 @@ Ort::Value OnlineEbranchformerTransducerModel::RunDecoder(
   return std::move(decoder_out[0]);
 }
 
-
-Ort::Value OnlineEbranchformerTransducerModel::RunJoiner(Ort::Value encoder_out,
-                                                         Ort::Value decoder_out) {
+Ort::Value OnlineEbranchformerTransducerModel::RunJoiner(
+    Ort::Value encoder_out, Ort::Value decoder_out) {
   std::array<Ort::Value, 2> joiner_input = {std::move(encoder_out),
                                             std::move(decoder_out)};
   auto logit =
@@ -424,7 +412,6 @@ Ort::Value OnlineEbranchformerTransducerModel::RunJoiner(Ort::Value encoder_out,
   return std::move(logit[0]);
 }
 
-
 #if __ANDROID_API__ >= 9
 template OnlineEbranchformerTransducerModel::OnlineEbranchformerTransducerModel(
     AAssetManager *mgr, const OnlineModelConfig &config);
@@ -22,7 +22,7 @@ class OnlineEbranchformerTransducerModel : public OnlineTransducerModel {
 
   template <typename Manager>
   OnlineEbranchformerTransducerModel(Manager *mgr,
-                                     const OnlineModelConfig &config);
+                                     const OnlineModelConfig &config);
 
   std::vector<Ort::Value> StackStates(
       const std::vector<std::vector<Ort::Value>> &states) const override;
@@ -131,10 +131,10 @@ for a list of pre-trained models to download.
   std::vector<sherpa_onnx::OfflineStream *> ss_pointers;
   float duration = 0;
   for (int32_t i = 1; i <= po.NumArgs(); ++i) {
-    const std::string wav_filename = po.GetArg(i);
+    std::string wav_filename = po.GetArg(i);
     int32_t sampling_rate = -1;
     bool is_ok = false;
-    const std::vector<float> samples =
+    std::vector<float> samples =
         sherpa_onnx::ReadWave(wav_filename, &sampling_rate, &is_ok);
     if (!is_ok) {
       fprintf(stderr, "Failed to read '%s'\n", wav_filename.c_str());
// sherpa-onnx/csrc/sherpa-onnx-vad-with-offline-asr.cc
//
// Copyright (c) 2025 Xiaomi Corporation

#include <stdio.h>

#include <chrono>  // NOLINT
#include <string>
#include <vector>

#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/resample.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-reader.h"

int main(int32_t argc, char *argv[]) {
  const char *kUsageMessage = R"usage(
Speech recognition using VAD + non-streaming models with sherpa-onnx.

Usage:

Note you can download silero_vad.onnx using

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

(0) FireRedAsr

See https://k2-fsa.github.io/sherpa/onnx/FireRedAsr/pretrained.html

  ./bin/sherpa-onnx-vad-with-offline-asr \
    --tokens=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/tokens.txt \
    --fire-red-asr-encoder=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/encoder.int8.onnx \
    --fire-red-asr-decoder=./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/decoder.int8.onnx \
    --num-threads=1 \
    --silero-vad-model=/path/to/silero_vad.onnx \
    /path/to/foo.wav

(1) Transducer from icefall

See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/index.html

  ./bin/sherpa-onnx-vad-with-offline-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --tokens=/path/to/tokens.txt \
    --encoder=/path/to/encoder.onnx \
    --decoder=/path/to/decoder.onnx \
    --joiner=/path/to/joiner.onnx \
    --num-threads=1 \
    --decoding-method=greedy_search \
    /path/to/foo.wav


(2) Paraformer from FunASR

See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html

  ./bin/sherpa-onnx-vad-with-offline-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --tokens=/path/to/tokens.txt \
    --paraformer=/path/to/model.onnx \
    --num-threads=1 \
    --decoding-method=greedy_search \
    /path/to/foo.wav

(3) Moonshine models

See https://k2-fsa.github.io/sherpa/onnx/moonshine/index.html

  ./bin/sherpa-onnx-vad-with-offline-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --moonshine-preprocessor=/Users/fangjun/open-source/sherpa-onnx/scripts/moonshine/preprocess.onnx \
    --moonshine-encoder=/Users/fangjun/open-source/sherpa-onnx/scripts/moonshine/encode.int8.onnx \
    --moonshine-uncached-decoder=/Users/fangjun/open-source/sherpa-onnx/scripts/moonshine/uncached_decode.int8.onnx \
    --moonshine-cached-decoder=/Users/fangjun/open-source/sherpa-onnx/scripts/moonshine/cached_decode.int8.onnx \
    --tokens=/Users/fangjun/open-source/sherpa-onnx/scripts/moonshine/tokens.txt \
    --num-threads=1 \
    /path/to/foo.wav

(4) Whisper models

See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html

  ./bin/sherpa-onnx-vad-with-offline-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
    --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
    --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
    --num-threads=1 \
    /path/to/foo.wav

(5) NeMo CTC models

See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html

  ./bin/sherpa-onnx-vad-with-offline-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --tokens=./sherpa-onnx-nemo-ctc-en-conformer-medium/tokens.txt \
    --nemo-ctc-model=./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx \
    --num-threads=2 \
    --decoding-method=greedy_search \
    --debug=false \
    ./sherpa-onnx-nemo-ctc-en-conformer-medium/test_wavs/0.wav

(6) TDNN CTC model for the yesno recipe from icefall

See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/yesno/index.html

  ./bin/sherpa-onnx-vad-with-offline-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --sample-rate=8000 \
    --feat-dim=23 \
    --tokens=./sherpa-onnx-tdnn-yesno/tokens.txt \
    --tdnn-model=./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx \
    ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_0_1_0_0_0_1.wav

The input wav should be a single-channel, 16-bit PCM encoded wave file; its
sampling rate can be arbitrary and does not need to be 16kHz.

Please refer to
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
for a list of pre-trained models to download.
)usage";

  sherpa_onnx::ParseOptions po(kUsageMessage);
  sherpa_onnx::OfflineRecognizerConfig asr_config;
  asr_config.Register(&po);

  sherpa_onnx::VadModelConfig vad_config;
  vad_config.Register(&po);

  po.Read(argc, argv);
  if (po.NumArgs() != 1) {
    fprintf(stderr, "Error: Please provide only 1 wave file. Given: %d\n\n",
            po.NumArgs());
    po.PrintUsage();
    exit(EXIT_FAILURE);
  }

  fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  fprintf(stderr, "%s\n", asr_config.ToString().c_str());

  if (!vad_config.Validate()) {
    fprintf(stderr, "Errors in vad_config!\n");
    return -1;
  }

  if (!asr_config.Validate()) {
    fprintf(stderr, "Errors in ASR config!\n");
    return -1;
  }

  fprintf(stderr, "Creating recognizer ...\n");
  sherpa_onnx::OfflineRecognizer recognizer(asr_config);
  fprintf(stderr, "Recognizer created!\n");

  auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config);

  fprintf(stderr, "Started\n");
  const auto begin = std::chrono::steady_clock::now();

  std::string wave_filename = po.GetArg(1);
  fprintf(stderr, "Reading: %s\n", wave_filename.c_str());
  int32_t sampling_rate = -1;
  bool is_ok = false;
  auto samples = sherpa_onnx::ReadWave(wave_filename, &sampling_rate, &is_ok);
  if (!is_ok) {
    fprintf(stderr, "Failed to read '%s'\n", wave_filename.c_str());
    return -1;
  }

  if (sampling_rate != 16000) {
    fprintf(stderr, "Resampling from %d Hz to 16000 Hz\n", sampling_rate);
    float min_freq = std::min<int32_t>(sampling_rate, 16000);
    float lowpass_cutoff = 0.99 * 0.5 * min_freq;

    int32_t lowpass_filter_width = 6;
    auto resampler = std::make_unique<sherpa_onnx::LinearResample>(
        sampling_rate, 16000, lowpass_cutoff, lowpass_filter_width);
    std::vector<float> out_samples;
    resampler->Resample(samples.data(), samples.size(), true, &out_samples);
    samples = std::move(out_samples);
    fprintf(stderr, "Resampling done\n");
  }

  fprintf(stderr, "Started!\n");
  int32_t window_size = vad_config.silero_vad.window_size;
  int32_t i = 0;
  while (i + window_size < samples.size()) {
    vad->AcceptWaveform(samples.data() + i, window_size);
    i += window_size;
    if (i >= samples.size()) {
      vad->Flush();
    }

    while (!vad->Empty()) {
      const auto &segment = vad->Front();
      float duration = segment.samples.size() / 16000.;
      float start_time = segment.start / 16000.;
      float end_time = start_time + duration;
      if (duration < 0.1) {
        vad->Pop();
        continue;
      }

      auto s = recognizer.CreateStream();
      s->AcceptWaveform(16000, segment.samples.data(), segment.samples.size());
      recognizer.DecodeStream(s.get());
      const auto &result = s->GetResult();
      if (!result.text.empty()) {
        fprintf(stderr, "%.3f -- %.3f: %s\n", start_time, end_time,
                result.text.c_str());
      }
      vad->Pop();
    }
  }

  const auto end = std::chrono::steady_clock::now();

  float elapsed_seconds =
      std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
          .count() /
      1000.;

  fprintf(stderr, "num threads: %d\n", asr_config.model_config.num_threads);
  fprintf(stderr, "decoding method: %s\n", asr_config.decoding_method.c_str());
  if (asr_config.decoding_method == "modified_beam_search") {
    fprintf(stderr, "max active paths: %d\n", asr_config.max_active_paths);
  }

  float duration = samples.size() / 16000.;
  fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
  float rtf = elapsed_seconds / duration;
  fprintf(stderr, "Real time factor (RTF): %.3f / %.3f = %.3f\n",
          elapsed_seconds, duration, rtf);

  return 0;
}