infinite42 (committed via GitHub)

support testing long audio with a streaming model & VAD (#2405)

@@ -68,6 +68,7 @@ def get_binaries():
         "sherpa-onnx-vad-microphone",
         "sherpa-onnx-vad-microphone-offline-asr",
         "sherpa-onnx-vad-with-offline-asr",
+        "sherpa-onnx-vad-with-online-asr",
         "sherpa-onnx-version",
     ]

@@ -505,6 +505,10 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
   sherpa-onnx-vad-with-offline-asr.cc
 )

+add_executable(sherpa-onnx-vad-with-online-asr
+  sherpa-onnx-vad-with-online-asr.cc
+)
+
 add_executable(sherpa-onnx-vad-microphone-offline-asr
   sherpa-onnx-vad-microphone-offline-asr.cc
   microphone.cc
@@ -529,6 +533,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY)
   sherpa-onnx-vad-microphone
   sherpa-onnx-vad-microphone-offline-asr
   sherpa-onnx-vad-with-offline-asr
+  sherpa-onnx-vad-with-online-asr
 )
 if(SHERPA_ONNX_ENABLE_TTS)
   list(APPEND exes
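With these CMake changes the new binary is built alongside the existing VAD
examples; for instance (a sketch, assuming a build directory named build):

  cmake -B build -DSHERPA_ONNX_ENABLE_PORTAUDIO=ON -DSHERPA_ONNX_ENABLE_BINARY=ON
  cmake --build build --target sherpa-onnx-vad-with-online-asr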
sherpa-onnx/csrc/sherpa-onnx-vad-with-online-asr.cc (new file)

// sherpa-onnx/csrc/sherpa-onnx-vad-with-online-asr.cc
//
// Copyright (c) 2025 Xiaomi Corporation
// Copyright (c) 2025 Pingfeng Luo
//
// This file demonstrates how to use a VAD together with a streaming (online)
// speech recognition model.
//

#include <stdio.h>

#include <algorithm>
#include <chrono>  // NOLINT
#include <memory>
#include <string>
#include <vector>

#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/online-stream.h"
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/resample.h"
#include "sherpa-onnx/csrc/symbol-table.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-reader.h"

int32_t main(int32_t argc, char *argv[]) {
  const char *kUsageMessage = R"usage(
Speech recognition using VAD + streaming models with sherpa-onnx-vad-with-online-asr.
This is useful for testing long audio files.

Usage:

Note you can download silero_vad.onnx using

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

(1) Streaming transducer

  ./bin/sherpa-onnx-vad-with-online-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --tokens=/path/to/tokens.txt \
    --encoder=/path/to/encoder.onnx \
    --decoder=/path/to/decoder.onnx \
    --joiner=/path/to/joiner.onnx \
    --provider=cpu \
    --num-threads=2 \
    --decoding-method=greedy_search \
    /path/to/long_duration.wav

(2) Streaming zipformer2 CTC

  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
  tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2

  ./bin/sherpa-onnx-vad-with-online-asr \
    --debug=1 \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
    --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \
    ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav

(3) Streaming paraformer

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
  tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2

  ./bin/sherpa-onnx-vad-with-online-asr \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \
    --paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx \
    --paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx \
    /path/to/long_duration.wav

The input wav should be a single-channel, 16-bit PCM encoded wave file; its
sampling rate can be arbitrary and does not need to be 16 kHz.

Please refer to
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
for a list of pre-trained models to download.
)usage";

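  // Register both the VAD options and the online-recognizer options on the
  // same parser so that everything is configurable from the command line.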
  sherpa_onnx::ParseOptions po(kUsageMessage);
  sherpa_onnx::OnlineRecognizerConfig asr_config;
  asr_config.Register(&po);

  sherpa_onnx::VadModelConfig vad_config;
  vad_config.Register(&po);

  po.Read(argc, argv);
  if (po.NumArgs() != 1) {
    fprintf(stderr, "Error: Please provide exactly 1 wave file. Given: %d\n\n",
            po.NumArgs());
    po.PrintUsage();
    exit(EXIT_FAILURE);
  }

  fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  fprintf(stderr, "%s\n", asr_config.ToString().c_str());

  if (!vad_config.Validate()) {
    fprintf(stderr, "Errors in VAD config!\n");
    return -1;
  }

  if (!asr_config.Validate()) {
    fprintf(stderr, "Errors in ASR config!\n");
    return -1;
  }

  fprintf(stderr, "Creating recognizer ...\n");
  sherpa_onnx::OnlineRecognizer recognizer(asr_config);
  fprintf(stderr, "Recognizer created!\n");

  auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config);

  fprintf(stderr, "Started\n");
  const auto begin = std::chrono::steady_clock::now();

  std::string wave_filename = po.GetArg(1);
  fprintf(stderr, "Reading: %s\n", wave_filename.c_str());
  int32_t sampling_rate = -1;
  bool is_ok = false;
  auto samples = sherpa_onnx::ReadWave(wave_filename, &sampling_rate, &is_ok);
  if (!is_ok) {
    fprintf(stderr, "Failed to read '%s'\n", wave_filename.c_str());
    return -1;
  }

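  // The VAD model and the recognizer in this example run on 16 kHz audio, so
  // resample the input if its sampling rate differs.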
  if (sampling_rate != 16000) {
    fprintf(stderr, "Resampling from %d Hz to 16000 Hz\n", sampling_rate);
    float min_freq = std::min<int32_t>(sampling_rate, 16000);
    // Place the low-pass cutoff just below the Nyquist frequency of the
    // lower of the two sampling rates to avoid aliasing.
    float lowpass_cutoff = 0.99 * 0.5 * min_freq;

    int32_t lowpass_filter_width = 6;
    auto resampler = std::make_unique<sherpa_onnx::LinearResample>(
        sampling_rate, 16000, lowpass_cutoff, lowpass_filter_width);
    std::vector<float> out_samples;
    resampler->Resample(samples.data(), samples.size(), true, &out_samples);
    samples = std::move(out_samples);
    fprintf(stderr, "Resampling done\n");
  }

  fprintf(stderr, "Started!\n");
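  // The VAD consumes audio in fixed-size windows; the expected window size
  // depends on whether silero-vad or ten-vad is configured.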
  int32_t window_size = vad_config.ten_vad.model.empty()
                            ? vad_config.silero_vad.window_size
                            : vad_config.ten_vad.window_size;
  int32_t offset = 0;
  int32_t segment_id = 0;
  bool speech_started = false;
  while (offset < static_cast<int32_t>(samples.size())) {
    if (offset + window_size <= static_cast<int32_t>(samples.size())) {
      vad->AcceptWaveform(samples.data() + offset, window_size);
    } else {
      // Not enough samples left for a full window; flush so the VAD emits
      // any pending segment.
      vad->Flush();
    }
    offset += window_size;
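    // Track segment boundaries: give each detected utterance an id that is
    // reported together with its timestamps below.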
    if (vad->IsSpeechDetected() && !speech_started) {
      // new voice activity
      speech_started = true;
      segment_id++;
    } else if (!vad->IsSpeechDetected() && speech_started) {
      // end voice activity
      speech_started = false;
    }

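    // Decode every speech segment the VAD has finalized so far. Each segment
    // gets a fresh online stream, so recognizer state does not carry over
    // between segments.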
    while (!vad->Empty()) {
      const auto &segment = vad->Front();
      float duration = segment.samples.size() / 16000.;
      float start_time = segment.start / 16000.;
      float end_time = start_time + duration;

      auto s = recognizer.CreateStream();
      s->AcceptWaveform(16000, segment.samples.data(), segment.samples.size());
      s->InputFinished();
      while (recognizer.IsReady(s.get())) {
        recognizer.DecodeStream(s.get());
      }

      auto text = recognizer.GetResult(s.get()).text;
      if (!text.empty()) {
        fprintf(stderr, "vad segment(%d:%.3f-%.3f) results: %s\n", segment_id,
                start_time, end_time, text.c_str());
      }
      vad->Pop();
    }
  }

  const auto end = std::chrono::steady_clock::now();

  float elapsed_seconds =
      std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
          .count() /
      1000.;

  fprintf(stderr, "num threads: %d\n", asr_config.model_config.num_threads);
  fprintf(stderr, "decoding method: %s\n", asr_config.decoding_method.c_str());
  if (asr_config.decoding_method == "modified_beam_search") {
    fprintf(stderr, "max active paths: %d\n", asr_config.max_active_paths);
  }

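  // RTF = processing time / audio duration; RTF < 1 means the pipeline runs
  // faster than real time.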
  float duration = samples.size() / 16000.;
  fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
  float rtf = elapsed_seconds / duration;
  fprintf(stderr, "Real time factor (RTF): %.3f / %.3f = %.3f\n",
          elapsed_seconds, duration, rtf);

  return 0;
}