Fangjun Kuang
Committed by GitHub

Add a C++ example to show streaming VAD + non-streaming ASR. (#420)

@@ -137,6 +137,7 @@ class BuildExtension(build_ext): @@ -137,6 +137,7 @@ class BuildExtension(build_ext):
137 binaries += ["sherpa-onnx-offline-websocket-server"] 137 binaries += ["sherpa-onnx-offline-websocket-server"]
138 binaries += ["sherpa-onnx-online-websocket-client"] 138 binaries += ["sherpa-onnx-online-websocket-client"]
139 binaries += ["sherpa-onnx-vad-microphone"] 139 binaries += ["sherpa-onnx-vad-microphone"]
  140 + binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
140 binaries += ["sherpa-onnx-offline-tts"] 141 binaries += ["sherpa-onnx-offline-tts"]
141 142
142 if is_windows(): 143 if is_windows():
@@ -57,6 +57,7 @@ def get_binaries_to_install(): @@ -57,6 +57,7 @@ def get_binaries_to_install():
57 binaries += ["sherpa-onnx-offline-websocket-server"] 57 binaries += ["sherpa-onnx-offline-websocket-server"]
58 binaries += ["sherpa-onnx-online-websocket-client"] 58 binaries += ["sherpa-onnx-online-websocket-client"]
59 binaries += ["sherpa-onnx-vad-microphone"] 59 binaries += ["sherpa-onnx-vad-microphone"]
  60 + binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
60 binaries += ["sherpa-onnx-offline-tts"] 61 binaries += ["sherpa-onnx-offline-tts"]
61 if is_windows(): 62 if is_windows():
62 binaries += ["kaldi-native-fbank-core.dll"] 63 binaries += ["kaldi-native-fbank-core.dll"]
@@ -225,6 +225,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) @@ -225,6 +225,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
225 microphone.cc 225 microphone.cc
226 ) 226 )
227 227
  228 + add_executable(sherpa-onnx-vad-microphone-offline-asr
  229 + sherpa-onnx-vad-microphone-offline-asr.cc
  230 + microphone.cc
  231 + )
  232 +
228 if(BUILD_SHARED_LIBS) 233 if(BUILD_SHARED_LIBS)
229 set(PA_LIB portaudio) 234 set(PA_LIB portaudio)
230 else() 235 else()
@@ -235,6 +240,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) @@ -235,6 +240,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
235 sherpa-onnx-microphone 240 sherpa-onnx-microphone
236 sherpa-onnx-microphone-offline 241 sherpa-onnx-microphone-offline
237 sherpa-onnx-vad-microphone 242 sherpa-onnx-vad-microphone
  243 + sherpa-onnx-vad-microphone-offline-asr
238 ) 244 )
239 foreach(exe IN LISTS exes) 245 foreach(exe IN LISTS exes)
240 target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core) 246 target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core)
  1 +// sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc
  2 +//
  3 +// Copyright (c) 2022-2023 Xiaomi Corporation
  4 +
  #include <signal.h>
  #include <stdio.h>
  #include <stdlib.h>

  #include <algorithm>
  #include <atomic>
  #include <memory>
  #include <mutex>  // NOLINT
  #include <vector>

  #include "portaudio.h"  // NOLINT
  #include "sherpa-onnx/csrc/circular-buffer.h"
  #include "sherpa-onnx/csrc/microphone.h"
  #include "sherpa-onnx/csrc/offline-recognizer.h"
  #include "sherpa-onnx/csrc/voice-activity-detector.h"
  17 +
  18 +bool stop = false;
  19 +std::mutex mutex;
  20 +sherpa_onnx::CircularBuffer buffer(16000 * 60);
  21 +
  22 +static int32_t RecordCallback(const void *input_buffer,
  23 + void * /*output_buffer*/,
  24 + unsigned long frames_per_buffer, // NOLINT
  25 + const PaStreamCallbackTimeInfo * /*time_info*/,
  26 + PaStreamCallbackFlags /*status_flags*/,
  27 + void *user_data) {
  28 + std::lock_guard<std::mutex> lock(mutex);
  29 + buffer.Push(reinterpret_cast<const float *>(input_buffer), frames_per_buffer);
  30 +
  31 + return stop ? paComplete : paContinue;
  32 +}
  33 +
  34 +static void Handler(int32_t sig) {
  35 + stop = true;
  36 + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
  37 +}
  38 +
  39 +int32_t main(int32_t argc, char *argv[]) {
  40 + signal(SIGINT, Handler);
  41 +
  42 + const char *kUsageMessage = R"usage(
  43 +This program shows how to use a streaming VAD with non-streaming ASR in
  44 +sherpa-onnx.
  45 +
  46 +Please download silero_vad.onnx from
  47 +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  48 +
  49 +For instance, use
  50 +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
  51 +
  52 +Please refer to ./sherpa-onnx-microphone-offline.cc
  53 +to download models for offline ASR.
  54 +
  55 +(1) Transducer from icefall
  56 +
  57 + ./bin/sherpa-onnx-vad-microphone-offline-asr \
  58 + --silero-vad-model=/path/to/silero_vad.onnx \
  59 + --tokens=/path/to/tokens.txt \
  60 + --encoder=/path/to/encoder.onnx \
  61 + --decoder=/path/to/decoder.onnx \
  62 + --joiner=/path/to/joiner.onnx
  63 +
  64 +(2) Paraformer from FunASR
  65 +
  66 + ./bin/sherpa-onnx-vad-microphone-offline-asr \
  67 + --silero-vad-model=/path/to/silero_vad.onnx \
  68 + --tokens=/path/to/tokens.txt \
  69 + --paraformer=/path/to/model.onnx \
  70 + --num-threads=1
  71 +
  72 +(3) Whisper models
  73 +
  74 + ./bin/sherpa-onnx-vad-microphone-offline-asr \
  75 + --silero-vad-model=/path/to/silero_vad.onnx \
  76 + --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
  77 + --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
  78 + --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
  79 + --num-threads=1
  80 +)usage";
  81 +
  82 + sherpa_onnx::ParseOptions po(kUsageMessage);
  83 + sherpa_onnx::VadModelConfig vad_config;
  84 +
  85 + sherpa_onnx::OfflineRecognizerConfig asr_config;
  86 +
  87 + vad_config.Register(&po);
  88 + asr_config.Register(&po);
  89 +
  90 + po.Read(argc, argv);
  91 + if (po.NumArgs() != 0) {
  92 + po.PrintUsage();
  93 + exit(EXIT_FAILURE);
  94 + }
  95 +
  96 + fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  97 + fprintf(stderr, "%s\n", asr_config.ToString().c_str());
  98 +
  99 + if (!vad_config.Validate()) {
  100 + fprintf(stderr, "Errors in vad_config!\n");
  101 + return -1;
  102 + }
  103 +
  104 + if (!asr_config.Validate()) {
  105 + fprintf(stderr, "Errors in asr_config!\n");
  106 + return -1;
  107 + }
  108 +
  109 + fprintf(stderr, "Creating recognizer ...\n");
  110 + sherpa_onnx::OfflineRecognizer recognizer(asr_config);
  111 + fprintf(stderr, "Recognizer created!\n");
  112 +
  113 + sherpa_onnx::Microphone mic;
  114 +
  115 + PaDeviceIndex num_devices = Pa_GetDeviceCount();
  116 + fprintf(stderr, "Num devices: %d\n", num_devices);
  117 +
  118 + PaStreamParameters param;
  119 +
  120 + param.device = Pa_GetDefaultInputDevice();
  121 + if (param.device == paNoDevice) {
  122 + fprintf(stderr, "No default input device found\n");
  123 + exit(EXIT_FAILURE);
  124 + }
  125 + fprintf(stderr, "Use default device: %d\n", param.device);
  126 +
  127 + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
  128 + fprintf(stderr, " Name: %s\n", info->name);
  129 + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels);
  130 +
  131 + param.channelCount = 1;
  132 + param.sampleFormat = paFloat32;
  133 +
  134 + param.suggestedLatency = info->defaultLowInputLatency;
  135 + param.hostApiSpecificStreamInfo = nullptr;
  136 + float sample_rate = 16000;
  137 +
  138 + PaStream *stream;
  139 + PaError err =
  140 + Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
  141 + sample_rate,
  142 + 0, // frames per buffer
  143 + paClipOff, // we won't output out of range samples
  144 + // so don't bother clipping them
  145 + RecordCallback, nullptr);
  146 + if (err != paNoError) {
  147 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  148 + exit(EXIT_FAILURE);
  149 + }
  150 +
  151 + err = Pa_StartStream(stream);
  152 + if (err != paNoError) {
  153 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  154 + exit(EXIT_FAILURE);
  155 + }
  156 +
  157 + auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config);
  158 +
  159 + fprintf(stderr, "Started. Please speak\n");
  160 +
  161 + int32_t window_size = vad_config.silero_vad.window_size;
  162 + int32_t index = 0;
  163 +
  164 + while (!stop) {
  165 + {
  166 + std::lock_guard<std::mutex> lock(mutex);
  167 +
  168 + while (buffer.Size() >= window_size) {
  169 + std::vector<float> samples = buffer.Get(buffer.Head(), window_size);
  170 + buffer.Pop(window_size);
  171 + vad->AcceptWaveform(samples.data(), samples.size());
  172 + }
  173 + }
  174 +
  175 + while (!vad->Empty()) {
  176 + auto &segment = vad->Front();
  177 + auto s = recognizer.CreateStream();
  178 + s->AcceptWaveform(sample_rate, segment.samples.data(),
  179 + segment.samples.size());
  180 + recognizer.DecodeStream(s.get());
  181 + const auto &result = s->GetResult();
  182 + if (!result.text.empty()) {
  183 + fprintf(stderr, "%2d: %s\n", index, result.text.c_str());
  184 + ++index;
  185 + }
  186 + vad->Pop();
  187 + }
  188 +
  189 + Pa_Sleep(100); // sleep for 100ms
  190 + }
  191 +
  192 + err = Pa_CloseStream(stream);
  193 + if (err != paNoError) {
  194 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  195 + exit(EXIT_FAILURE);
  196 + }
  197 +
  198 + return 0;
  199 +}
@@ -102,7 +102,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx @@ -102,7 +102,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
102 0, // frames per buffer 102 0, // frames per buffer
103 paClipOff, // we won't output out of range samples 103 paClipOff, // we won't output out of range samples
104 // so don't bother clipping them 104 // so don't bother clipping them
105 - RecordCallback, &config.silero_vad.window_size); 105 + RecordCallback, nullptr);
106 if (err != paNoError) { 106 if (err != paNoError) {
107 fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); 107 fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
108 exit(EXIT_FAILURE); 108 exit(EXIT_FAILURE);