Fangjun Kuang
Committed by GitHub

Add CXX examples for NeMo TDT ASR. (#2363)

# New Features
- Added new example programs demonstrating simulated streaming speech recognition from a microphone using Parakeet-TDT CTC and Zipformer Transducer models with voice activity detection.
- These examples support microphone input via PortAudio and display recognized text incrementally.

# Bug Fixes
- Improved error handling and logic when opening microphone devices in several example programs for more reliable device initialization.

# Chores
- Updated build configuration to include new executable examples when PortAudio support is enabled.
@@ -49,6 +49,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) @@ -49,6 +49,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
49 portaudio_static 49 portaudio_static
50 ) 50 )
51 51
  52 + add_executable(parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api
  53 + ./parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
  54 + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
  55 + )
  56 + target_link_libraries(parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api
  57 + sherpa-onnx-cxx-api
  58 + portaudio_static
  59 + )
  60 +
52 add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api 61 add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api
53 ./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc 62 ./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
54 ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc 63 ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
@@ -57,6 +66,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) @@ -57,6 +66,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
57 sherpa-onnx-cxx-api 66 sherpa-onnx-cxx-api
58 portaudio_static 67 portaudio_static
59 ) 68 )
  69 +
  70 + add_executable(zipformer-transducer-simulate-streaming-microphone-cxx-api
  71 + ./zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
  72 + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
  73 + )
  74 + target_link_libraries(zipformer-transducer-simulate-streaming-microphone-cxx-api
  75 + sherpa-onnx-cxx-api
  76 + portaudio_static
  77 + )
60 endif() 78 endif()
61 79
62 if(SHERPA_ONNX_HAS_ALSA) 80 if(SHERPA_ONNX_HAS_ALSA)
  1 +// cxx-api-examples/parakeet-tdt-ctc-simulate-streaming-microphone-cxx-api.cc
  2 +// Copyright (c) 2025 Xiaomi Corporation
  3 +
  4 +//
  5 +// This file demonstrates how to use parakeet-tdt with sherpa-onnx's C++ API
  6 +// for streaming speech recognition from a microphone.
  7 +//
  8 +// clang-format off
  9 +//
  10 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  11 +//
  12 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
  13 +// tar xvf sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
  14 +// rm sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8.tar.bz2
  15 +//
  16 +// clang-format on
  17 +
  18 +#include <signal.h>
  19 +#include <stdio.h>
  20 +#include <stdlib.h>
  21 +
  22 +#include <chrono> // NOLINT
  23 +#include <condition_variable> // NOLINT
  24 +#include <iostream>
  25 +#include <mutex> // NOLINT
  26 +#include <queue>
  27 +#include <vector>
  28 +
  29 +#include "portaudio.h" // NOLINT
  30 +#include "sherpa-display.h" // NOLINT
  31 +#include "sherpa-onnx/c-api/cxx-api.h"
  32 +#include "sherpa-onnx/csrc/microphone.h"
  33 +
  34 +std::queue<std::vector<float>> samples_queue;
  35 +std::condition_variable condition_variable;
  36 +std::mutex mutex;
  37 +bool stop = false;
  38 +
  39 +static void Handler(int32_t /*sig*/) {
  40 + stop = true;
  41 + condition_variable.notify_one();
  42 + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
  43 +}
  44 +
  45 +static int32_t RecordCallback(const void *input_buffer,
  46 + void * /*output_buffer*/,
  47 + unsigned long frames_per_buffer, // NOLINT
  48 + const PaStreamCallbackTimeInfo * /*time_info*/,
  49 + PaStreamCallbackFlags /*status_flags*/,
  50 + void * /*user_data*/) {
  51 + std::lock_guard<std::mutex> lock(mutex);
  52 + samples_queue.emplace(
  53 + reinterpret_cast<const float *>(input_buffer),
  54 + reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
  55 + condition_variable.notify_one();
  56 +
  57 + return stop ? paComplete : paContinue;
  58 +}
  59 +
  60 +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  61 + using namespace sherpa_onnx::cxx; // NOLINT
  62 + VadModelConfig config;
  63 + config.silero_vad.model = "./silero_vad.onnx";
  64 + config.silero_vad.threshold = 0.25;
  65 + config.silero_vad.min_silence_duration = 0.25;
  66 + config.silero_vad.min_speech_duration = 0.25;
  67 + config.silero_vad.max_speech_duration = 5;
  68 + config.sample_rate = 16000;
  69 + config.debug = false;
  70 +
  71 + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 60);
  72 + if (!vad.Get()) {
  73 + std::cerr << "Failed to create VAD. Please check your config\n";
  74 + exit(-1);
  75 + }
  76 +
  77 + return vad;
  78 +}
  79 +
  80 +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  81 + using namespace sherpa_onnx::cxx; // NOLINT
  82 + OfflineRecognizerConfig config;
  83 +
  84 + config.model_config.nemo_ctc.model =
  85 + "./sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/model.int8.onnx";
  86 + config.model_config.tokens =
  87 + "./sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/tokens.txt";
  88 +
  89 + config.model_config.num_threads = 2;
  90 + config.model_config.debug = false;
  91 +
  92 + std::cout << "Loading model\n";
  93 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  94 + if (!recognizer.Get()) {
  95 + std::cerr << "Please check your config\n";
  96 + exit(-1);
  97 + }
  98 + std::cout << "Loading model done\n";
  99 + return recognizer;
  100 +}
  101 +
  102 +int32_t main() {
  103 + signal(SIGINT, Handler);
  104 +
  105 + using namespace sherpa_onnx::cxx; // NOLINT
  106 +
  107 + auto vad = CreateVad();
  108 + auto recognizer = CreateOfflineRecognizer();
  109 +
  110 + sherpa_onnx::Microphone mic;
  111 +
  112 + PaDeviceIndex num_devices = Pa_GetDeviceCount();
  113 + if (num_devices == 0) {
  114 + std::cerr << " If you are using Linux, please try to modify "
  115 + "./build/bin/sense-voice-simulate-streaming-alsa-cxx-api\n";
  116 + return -1;
  117 + }
  118 +
  119 + int32_t device_index = Pa_GetDefaultInputDevice();
  120 + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
  121 + if (pDeviceIndex) {
  122 + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
  123 + device_index = atoi(pDeviceIndex);
  124 + }
  125 + mic.PrintDevices(device_index);
  126 +
  127 + float mic_sample_rate = 16000;
  128 + const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
  129 + if (sample_rate_str) {
  130 + mic_sample_rate = atof(sample_rate_str);
  131 + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
  132 + }
  133 +
  134 + float sample_rate = 16000;
  135 + LinearResampler resampler;
  136 + if (mic_sample_rate != sample_rate) {
  137 + float min_freq = std::min(mic_sample_rate, sample_rate);
  138 + float lowpass_cutoff = 0.99 * 0.5 * min_freq;
  139 +
  140 + int32_t lowpass_filter_width = 6;
  141 + resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
  142 + lowpass_cutoff, lowpass_filter_width);
  143 + }
  144 + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
  145 + nullptr)) {
  146 + std::cerr << "Failed to open microphone device\n";
  147 + return -1;
  148 + }
  149 +
  150 + int32_t window_size = 512; // samples, please don't change
  151 +
  152 + int32_t offset = 0;
  153 + std::vector<float> buffer;
  154 + bool speech_started = false;
  155 +
  156 + auto started_time = std::chrono::steady_clock::now();
  157 +
  158 + SherpaDisplay display;
  159 +
  160 + std::cout << "Started! Please speak\n";
  161 +
  162 + while (!stop) {
  163 + {
  164 + std::unique_lock<std::mutex> lock(mutex);
  165 + while (samples_queue.empty() && !stop) {
  166 + condition_variable.wait(lock);
  167 + }
  168 +
  169 + const auto &s = samples_queue.front();
  170 + if (!resampler.Get()) {
  171 + buffer.insert(buffer.end(), s.begin(), s.end());
  172 + } else {
  173 + auto resampled = resampler.Resample(s.data(), s.size(), false);
  174 + buffer.insert(buffer.end(), resampled.begin(), resampled.end());
  175 + }
  176 +
  177 + samples_queue.pop();
  178 + }
  179 +
  180 + for (; offset + window_size < buffer.size(); offset += window_size) {
  181 + vad.AcceptWaveform(buffer.data() + offset, window_size);
  182 + if (!speech_started && vad.IsDetected()) {
  183 + speech_started = true;
  184 + started_time = std::chrono::steady_clock::now();
  185 + }
  186 + }
  187 + if (!speech_started) {
  188 + if (buffer.size() > 10 * window_size) {
  189 + offset -= buffer.size() - 10 * window_size;
  190 + buffer = {buffer.end() - 10 * window_size, buffer.end()};
  191 + }
  192 + }
  193 +
  194 + auto current_time = std::chrono::steady_clock::now();
  195 + const float elapsed_seconds =
  196 + std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
  197 + started_time)
  198 + .count() /
  199 + 1000.;
  200 +
  201 + if (speech_started && elapsed_seconds > 0.2) {
  202 + OfflineStream stream = recognizer.CreateStream();
  203 + stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
  204 +
  205 + recognizer.Decode(&stream);
  206 +
  207 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  208 + display.UpdateText(result.text);
  209 + display.Display();
  210 +
  211 + started_time = std::chrono::steady_clock::now();
  212 + }
  213 +
  214 + while (!vad.IsEmpty()) {
  215 + auto segment = vad.Front();
  216 +
  217 + vad.Pop();
  218 +
  219 + OfflineStream stream = recognizer.CreateStream();
  220 + stream.AcceptWaveform(sample_rate, segment.samples.data(),
  221 + segment.samples.size());
  222 +
  223 + recognizer.Decode(&stream);
  224 +
  225 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  226 +
  227 + display.UpdateText(result.text);
  228 + display.FinalizeCurrentSentence();
  229 + display.Display();
  230 +
  231 + buffer.clear();
  232 + offset = 0;
  233 + speech_started = false;
  234 + }
  235 + }
  236 +
  237 + return 0;
  238 +}
@@ -136,11 +136,7 @@ int32_t main() { @@ -136,11 +136,7 @@ int32_t main() {
136 fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); 136 fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
137 mic_sample_rate = atof(sample_rate_str); 137 mic_sample_rate = atof(sample_rate_str);
138 } 138 }
139 - if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,  
140 - nullptr) == false) {  
141 - std::cerr << "Failed to open microphone device\n";  
142 - return -1;  
143 - } 139 +
144 float sample_rate = 16000; 140 float sample_rate = 16000;
145 LinearResampler resampler; 141 LinearResampler resampler;
146 if (mic_sample_rate != sample_rate) { 142 if (mic_sample_rate != sample_rate) {
@@ -152,6 +148,12 @@ int32_t main() { @@ -152,6 +148,12 @@ int32_t main() {
152 lowpass_cutoff, lowpass_filter_width); 148 lowpass_cutoff, lowpass_filter_width);
153 } 149 }
154 150
  151 + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
  152 + nullptr)) {
  153 + std::cerr << "Failed to open microphone device\n";
  154 + return -1;
  155 + }
  156 +
155 int32_t window_size = 512; // samples, please don't change 157 int32_t window_size = 512; // samples, please don't change
156 158
157 int32_t offset = 0; 159 int32_t offset = 0;
@@ -142,8 +142,8 @@ int32_t main() { @@ -142,8 +142,8 @@ int32_t main() {
142 resampler = LinearResampler::Create(mic_sample_rate, sample_rate, 142 resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
143 lowpass_cutoff, lowpass_filter_width); 143 lowpass_cutoff, lowpass_filter_width);
144 } 144 }
145 - if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,  
146 - nullptr) == false) { 145 + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
  146 + nullptr)) {
147 std::cerr << "Failed to open microphone device\n"; 147 std::cerr << "Failed to open microphone device\n";
148 return -1; 148 return -1;
149 } 149 }
@@ -140,8 +140,8 @@ int32_t main() { @@ -140,8 +140,8 @@ int32_t main() {
140 resampler = LinearResampler::Create(mic_sample_rate, sample_rate, 140 resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
141 lowpass_cutoff, lowpass_filter_width); 141 lowpass_cutoff, lowpass_filter_width);
142 } 142 }
143 - if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,  
144 - nullptr) == false) { 143 + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
  144 + nullptr)) {
145 std::cerr << "Failed to open microphone device\n"; 145 std::cerr << "Failed to open microphone device\n";
146 return -1; 146 return -1;
147 } 147 }
  1 +// cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
  2 +// Copyright (c) 2025 Xiaomi Corporation
  3 +//
  4 +// This file demonstrates how to use Zipformer transducer with sherpa-onnx's C++ API
  5 +// for streaming speech recognition from a microphone.
  6 +//
  7 +// clang-format off
  8 +//
  9 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  10 +//
  11 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
  12 +// tar xvf sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
  13 +// rm sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01.tar.bz2
  14 +//
  15 +// clang-format on
  16 +
  17 +#include <signal.h>
  18 +#include <stdio.h>
  19 +#include <stdlib.h>
  20 +
  21 +#include <chrono> // NOLINT
  22 +#include <condition_variable> // NOLINT
  23 +#include <iostream>
  24 +#include <mutex> // NOLINT
  25 +#include <queue>
  26 +#include <vector>
  27 +
  28 +#include "portaudio.h" // NOLINT
  29 +#include "sherpa-display.h" // NOLINT
  30 +#include "sherpa-onnx/c-api/cxx-api.h"
  31 +#include "sherpa-onnx/csrc/microphone.h"
  32 +
  33 +std::queue<std::vector<float>> samples_queue;
  34 +std::condition_variable condition_variable;
  35 +std::mutex mutex;
  36 +bool stop = false;
  37 +
  38 +static void Handler(int32_t /*sig*/) {
  39 + stop = true;
  40 + condition_variable.notify_one();
  41 + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
  42 +}
  43 +
  44 +static int32_t RecordCallback(const void *input_buffer,
  45 + void * /*output_buffer*/,
  46 + unsigned long frames_per_buffer, // NOLINT
  47 + const PaStreamCallbackTimeInfo * /*time_info*/,
  48 + PaStreamCallbackFlags /*status_flags*/,
  49 + void * /*user_data*/) {
  50 + std::lock_guard<std::mutex> lock(mutex);
  51 + samples_queue.emplace(
  52 + reinterpret_cast<const float *>(input_buffer),
  53 + reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
  54 + condition_variable.notify_one();
  55 +
  56 + return stop ? paComplete : paContinue;
  57 +}
  58 +
  59 +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  60 + using namespace sherpa_onnx::cxx; // NOLINT
  61 + VadModelConfig config;
  62 + config.silero_vad.model = "./silero_vad.onnx";
  63 + config.silero_vad.threshold = 0.5;
  64 + config.silero_vad.min_silence_duration = 0.1;
  65 + config.silero_vad.min_speech_duration = 0.25;
  66 + config.silero_vad.max_speech_duration = 8;
  67 + config.sample_rate = 16000;
  68 + config.debug = false;
  69 +
  70 + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
  71 + if (!vad.Get()) {
  72 + std::cerr << "Failed to create VAD. Please check your config\n";
  73 + exit(-1);
  74 + }
  75 +
  76 + return vad;
  77 +}
  78 +
  79 +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  80 + using namespace sherpa_onnx::cxx; // NOLINT
  81 + OfflineRecognizerConfig config;
  82 +
  83 + config.model_config.transducer.encoder =
  84 + "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/"
  85 + "encoder-epoch-99-avg-1.int8.onnx";
  86 +
  87 + config.model_config.transducer.decoder =
  88 + "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/"
  89 + "decoder-epoch-99-avg-1.onnx";
  90 +
  91 + config.model_config.transducer.joiner =
  92 + "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/"
  93 + "joiner-epoch-99-avg-1.int8.onnx";
  94 + config.model_config.tokens =
  95 + "./sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01/tokens.txt";
  96 +
  97 + config.model_config.num_threads = 2;
  98 + config.model_config.debug = false;
  99 +
  100 + std::cout << "Loading model\n";
  101 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  102 + if (!recognizer.Get()) {
  103 + std::cerr << "Please check your config\n";
  104 + exit(-1);
  105 + }
  106 + std::cout << "Loading model done\n";
  107 + return recognizer;
  108 +}
  109 +
  110 +int32_t main() {
  111 + signal(SIGINT, Handler);
  112 +
  113 + using namespace sherpa_onnx::cxx; // NOLINT
  114 +
  115 + auto vad = CreateVad();
  116 + auto recognizer = CreateOfflineRecognizer();
  117 +
  118 + sherpa_onnx::Microphone mic;
  119 +
  120 + PaDeviceIndex num_devices = Pa_GetDeviceCount();
  121 + if (num_devices == 0) {
  122 + std::cerr << " If you are using Linux, please try "
  123 + "./build/bin/zipformer-ctc-simulate-streaming-alsa-cxx-api\n";
  124 + return -1;
  125 + }
  126 +
  127 + int32_t device_index = Pa_GetDefaultInputDevice();
  128 + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
  129 + if (pDeviceIndex) {
  130 + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
  131 + device_index = atoi(pDeviceIndex);
  132 + }
  133 + mic.PrintDevices(device_index);
  134 +
  135 + float mic_sample_rate = 16000;
  136 + const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
  137 + if (sample_rate_str) {
  138 + mic_sample_rate = atof(sample_rate_str);
  139 + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
  140 + }
  141 + float sample_rate = 16000;
  142 + LinearResampler resampler;
  143 + if (mic_sample_rate != sample_rate) {
  144 + float min_freq = std::min(mic_sample_rate, sample_rate);
  145 + float lowpass_cutoff = 0.99 * 0.5 * min_freq;
  146 +
  147 + int32_t lowpass_filter_width = 6;
  148 + resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
  149 + lowpass_cutoff, lowpass_filter_width);
  150 + }
  151 + if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
  152 + nullptr)) {
  153 + std::cerr << "Failed to open microphone device\n";
  154 + return -1;
  155 + }
  156 +
  157 + int32_t window_size = 512; // samples, please don't change
  158 +
  159 + int32_t offset = 0;
  160 + std::vector<float> buffer;
  161 + bool speech_started = false;
  162 +
  163 + auto started_time = std::chrono::steady_clock::now();
  164 +
  165 + SherpaDisplay display;
  166 +
  167 + std::cout << "Started! Please speak\n";
  168 +
  169 + while (!stop) {
  170 + {
  171 + std::unique_lock<std::mutex> lock(mutex);
  172 + while (samples_queue.empty() && !stop) {
  173 + condition_variable.wait(lock);
  174 + }
  175 +
  176 + const auto &s = samples_queue.front();
  177 + if (!resampler.Get()) {
  178 + buffer.insert(buffer.end(), s.begin(), s.end());
  179 + } else {
  180 + auto resampled = resampler.Resample(s.data(), s.size(), false);
  181 + buffer.insert(buffer.end(), resampled.begin(), resampled.end());
  182 + }
  183 +
  184 + samples_queue.pop();
  185 + }
  186 +
  187 + for (; offset + window_size < buffer.size(); offset += window_size) {
  188 + vad.AcceptWaveform(buffer.data() + offset, window_size);
  189 + if (!speech_started && vad.IsDetected()) {
  190 + speech_started = true;
  191 + started_time = std::chrono::steady_clock::now();
  192 + }
  193 + }
  194 + if (!speech_started) {
  195 + if (buffer.size() > 10 * window_size) {
  196 + offset -= buffer.size() - 10 * window_size;
  197 + buffer = {buffer.end() - 10 * window_size, buffer.end()};
  198 + }
  199 + }
  200 +
  201 + auto current_time = std::chrono::steady_clock::now();
  202 + const float elapsed_seconds =
  203 + std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
  204 + started_time)
  205 + .count() /
  206 + 1000.;
  207 +
  208 + if (speech_started && elapsed_seconds > 0.2) {
  209 + OfflineStream stream = recognizer.CreateStream();
  210 + stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
  211 +
  212 + recognizer.Decode(&stream);
  213 +
  214 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  215 + display.UpdateText(result.text);
  216 + display.Display();
  217 +
  218 + started_time = std::chrono::steady_clock::now();
  219 + }
  220 +
  221 + while (!vad.IsEmpty()) {
  222 + auto segment = vad.Front();
  223 +
  224 + vad.Pop();
  225 +
  226 + OfflineStream stream = recognizer.CreateStream();
  227 + stream.AcceptWaveform(sample_rate, segment.samples.data(),
  228 + segment.samples.size());
  229 +
  230 + recognizer.Decode(&stream);
  231 +
  232 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  233 +
  234 + display.UpdateText(result.text);
  235 + display.FinalizeCurrentSentence();
  236 + display.Display();
  237 +
  238 + buffer.clear();
  239 + offset = 0;
  240 + speech_started = false;
  241 + }
  242 + }
  243 +
  244 + return 0;
  245 +}