Fangjun Kuang
Committed by GitHub

Add C++ example for real-time ASR with nvidia/parakeet-tdt-0.6b-v2. (#2201)

@@ -36,6 +36,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
     sherpa-onnx-cxx-api
     portaudio_static
   )
+
+  add_executable(parakeet-tdt-simulate-streaming-microphone-cxx-api
+    ./parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
+  )
+  target_link_libraries(parakeet-tdt-simulate-streaming-microphone-cxx-api
+    sherpa-onnx-cxx-api
+    portaudio_static
+  )
 endif()

 add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc)
  1 +// cxx-api-examples/parakeet-tdt-simulate-streaming-microphone-cxx-api.cc
  2 +// Copyright (c) 2025 Xiaomi Corporation
  3 +
  4 +//
  5 +// This file demonstrates how to use parakeet-tdt with sherpa-onnx's C++ API
  6 +// for streaming speech recognition from a microphone.
  7 +//
  8 +// clang-format off
  9 +//
  10 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  11 +//
  12 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
  13 +// tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
  14 +// rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
  15 +//
  16 +// clang-format on
  17 +
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <atomic>
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <iostream>
#include <mutex>  // NOLINT
#include <queue>
#include <vector>

#include "portaudio.h"       // NOLINT
#include "sherpa-display.h"  // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/microphone.h"
  33 +
// State shared between the portaudio capture thread, the SIGINT handler,
// and the main decoding loop.
std::queue<std::vector<float>> samples_queue;  // guarded by `mutex`
std::condition_variable condition_variable;    // wakes the main loop
std::mutex mutex;                              // protects `samples_queue`
// Written by the signal handler and read from other threads (including
// outside the mutex in the main loop and the audio callback), so it must be
// atomic to avoid a data race on a plain bool.
std::atomic<bool> stop{false};
  38 +
// SIGINT (Ctrl+C) handler: request shutdown and wake the main loop, which
// may be blocked in condition_variable.wait().
//
// NOTE(review): condition_variable::notify_one() and fprintf() are not
// async-signal-safe; this works in practice for an example program, but a
// strictly correct implementation would use a self-pipe/eventfd — confirm
// this trade-off is intentional.
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
  44 +
  45 +static int32_t RecordCallback(const void *input_buffer,
  46 + void * /*output_buffer*/,
  47 + unsigned long frames_per_buffer, // NOLINT
  48 + const PaStreamCallbackTimeInfo * /*time_info*/,
  49 + PaStreamCallbackFlags /*status_flags*/,
  50 + void * /*user_data*/) {
  51 + std::lock_guard<std::mutex> lock(mutex);
  52 + samples_queue.emplace(
  53 + reinterpret_cast<const float *>(input_buffer),
  54 + reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
  55 + condition_variable.notify_one();
  56 +
  57 + return stop ? paComplete : paContinue;
  58 +}
  59 +
  60 +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  61 + using namespace sherpa_onnx::cxx; // NOLINT
  62 + VadModelConfig config;
  63 + config.silero_vad.model = "./silero_vad.onnx";
  64 + config.silero_vad.threshold = 0.5;
  65 + config.silero_vad.min_silence_duration = 0.25;
  66 + config.silero_vad.min_speech_duration = 0.25;
  67 + config.silero_vad.max_speech_duration = 5;
  68 + config.sample_rate = 16000;
  69 + config.debug = false;
  70 +
  71 + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 60);
  72 + if (!vad.Get()) {
  73 + std::cerr << "Failed to create VAD. Please check your config\n";
  74 + exit(-1);
  75 + }
  76 +
  77 + return vad;
  78 +}
  79 +
  80 +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  81 + using namespace sherpa_onnx::cxx; // NOLINT
  82 + OfflineRecognizerConfig config;
  83 +
  84 + config.model_config.transducer.encoder =
  85 + "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/encoder.int8.onnx";
  86 + config.model_config.transducer.decoder =
  87 + "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/decoder.int8.onnx";
  88 + config.model_config.transducer.joiner =
  89 + "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/joiner.int8.onnx";
  90 + config.model_config.tokens =
  91 + "./sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8/tokens.txt";
  92 +
  93 + config.model_config.model_type = "nemo_transducer";
  94 +
  95 + config.model_config.num_threads = 2;
  96 + config.model_config.debug = false;
  97 +
  98 + std::cout << "Loading model\n";
  99 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  100 + if (!recognizer.Get()) {
  101 + std::cerr << "Please check your config\n";
  102 + exit(-1);
  103 + }
  104 + std::cout << "Loading model done\n";
  105 + return recognizer;
  106 +}
  107 +
  108 +int32_t main() {
  109 + signal(SIGINT, Handler);
  110 +
  111 + using namespace sherpa_onnx::cxx; // NOLINT
  112 +
  113 + auto vad = CreateVad();
  114 + auto recognizer = CreateOfflineRecognizer();
  115 +
  116 + sherpa_onnx::Microphone mic;
  117 +
  118 + PaDeviceIndex num_devices = Pa_GetDeviceCount();
  119 + std::cout << "Num devices: " << num_devices << "\n";
  120 + if (num_devices == 0) {
  121 + std::cerr << " If you are using Linux, please try "
  122 + "./build/bin/sense-voice-simulate-streaming-alsa-cxx-api\n";
  123 + return -1;
  124 + }
  125 +
  126 + int32_t device_index = Pa_GetDefaultInputDevice();
  127 +
  128 + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
  129 + if (pDeviceIndex) {
  130 + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
  131 + device_index = atoi(pDeviceIndex);
  132 + }
  133 +
  134 + for (int32_t i = 0; i != num_devices; ++i) {
  135 + const PaDeviceInfo *info = Pa_GetDeviceInfo(i);
  136 + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i,
  137 + info->name);
  138 + }
  139 +
  140 + PaStreamParameters param;
  141 + param.device = device_index;
  142 +
  143 + fprintf(stderr, "Use device: %d\n", param.device);
  144 +
  145 + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
  146 + fprintf(stderr, " Name: %s\n", info->name);
  147 + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels);
  148 +
  149 + param.channelCount = 1;
  150 + param.sampleFormat = paFloat32;
  151 +
  152 + param.suggestedLatency = info->defaultLowInputLatency;
  153 + param.hostApiSpecificStreamInfo = nullptr;
  154 + float mic_sample_rate = 16000;
  155 + const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
  156 + if (sample_rate_str) {
  157 + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
  158 + mic_sample_rate = atof(sample_rate_str);
  159 + }
  160 + float sample_rate = 16000;
  161 + LinearResampler resampler;
  162 + if (mic_sample_rate != sample_rate) {
  163 + float min_freq = std::min(mic_sample_rate, sample_rate);
  164 + float lowpass_cutoff = 0.99 * 0.5 * min_freq;
  165 +
  166 + int32_t lowpass_filter_width = 6;
  167 + resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
  168 + lowpass_cutoff, lowpass_filter_width);
  169 + }
  170 +
  171 + PaStream *stream;
  172 + PaError err =
  173 + Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
  174 + mic_sample_rate,
  175 + 0, // frames per buffer
  176 + paClipOff, // we won't output out of range samples
  177 + // so don't bother clipping them
  178 + RecordCallback, // RecordCallback is run in a separate
  179 + // thread created by portaudio
  180 + nullptr);
  181 + if (err != paNoError) {
  182 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  183 + exit(EXIT_FAILURE);
  184 + }
  185 +
  186 + err = Pa_StartStream(stream);
  187 + if (err != paNoError) {
  188 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  189 + exit(EXIT_FAILURE);
  190 + }
  191 +
  192 + int32_t window_size = 512; // samples, please don't change
  193 +
  194 + int32_t offset = 0;
  195 + std::vector<float> buffer;
  196 + bool speech_started = false;
  197 +
  198 + auto started_time = std::chrono::steady_clock::now();
  199 +
  200 + SherpaDisplay display;
  201 +
  202 + std::cout << "Started! Please speak\n";
  203 +
  204 + while (!stop) {
  205 + {
  206 + std::unique_lock<std::mutex> lock(mutex);
  207 + while (samples_queue.empty() && !stop) {
  208 + condition_variable.wait(lock);
  209 + }
  210 +
  211 + const auto &s = samples_queue.front();
  212 + if (!resampler.Get()) {
  213 + buffer.insert(buffer.end(), s.begin(), s.end());
  214 + } else {
  215 + auto resampled = resampler.Resample(s.data(), s.size(), false);
  216 + buffer.insert(buffer.end(), resampled.begin(), resampled.end());
  217 + }
  218 +
  219 + samples_queue.pop();
  220 + }
  221 +
  222 + for (; offset + window_size < buffer.size(); offset += window_size) {
  223 + vad.AcceptWaveform(buffer.data() + offset, window_size);
  224 + if (!speech_started && vad.IsDetected()) {
  225 + speech_started = true;
  226 + started_time = std::chrono::steady_clock::now();
  227 + }
  228 + }
  229 + if (!speech_started) {
  230 + if (buffer.size() > 10 * window_size) {
  231 + offset -= buffer.size() - 10 * window_size;
  232 + buffer = {buffer.end() - 10 * window_size, buffer.end()};
  233 + }
  234 + }
  235 +
  236 + auto current_time = std::chrono::steady_clock::now();
  237 + const float elapsed_seconds =
  238 + std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
  239 + started_time)
  240 + .count() /
  241 + 1000.;
  242 +
  243 + if (speech_started && elapsed_seconds > 0.2) {
  244 + OfflineStream stream = recognizer.CreateStream();
  245 + stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
  246 +
  247 + recognizer.Decode(&stream);
  248 +
  249 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  250 + display.UpdateText(result.text);
  251 + display.Display();
  252 +
  253 + started_time = std::chrono::steady_clock::now();
  254 + }
  255 +
  256 + while (!vad.IsEmpty()) {
  257 + auto segment = vad.Front();
  258 +
  259 + vad.Pop();
  260 +
  261 + OfflineStream stream = recognizer.CreateStream();
  262 + stream.AcceptWaveform(sample_rate, segment.samples.data(),
  263 + segment.samples.size());
  264 +
  265 + recognizer.Decode(&stream);
  266 +
  267 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  268 +
  269 + display.UpdateText(result.text);
  270 + display.FinalizeCurrentSentence();
  271 + display.Display();
  272 +
  273 + buffer.clear();
  274 + offset = 0;
  275 + speech_started = false;
  276 + }
  277 + }
  278 +
  279 + err = Pa_CloseStream(stream);
  280 + if (err != paNoError) {
  281 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  282 + exit(EXIT_FAILURE);
  283 + }
  284 +
  285 + return 0;
  286 +}
@@ -14,7 +14,8 @@ class SherpaDisplay {
   void UpdateText(const std::string &text) { current_text_ = text; }

   void FinalizeCurrentSentence() {
-    if (!current_text_.empty() && current_text_[0] != ' ') {
+    if (!current_text_.empty() &&
+        (current_text_[0] != ' ' || current_text_.size() > 1)) {
       sentences_.push_back({GetCurrentDateTime(), std::move(current_text_)});
     }
   }