Fangjun Kuang
Committed by GitHub

Add C++ example for streaming ASR with SenseVoice. (#2199)

@@ -27,6 +27,17 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) @@ -27,6 +27,17 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api)
27 add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) 27 add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc)
28 target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) 28 target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api)
29 29
# Build the simulated-streaming microphone example only when PortAudio
# support is enabled: it captures audio via portaudio and reuses the
# project's Microphone RAII helper from sherpa-onnx/csrc.
if(SHERPA_ONNX_ENABLE_PORTAUDIO)
  add_executable(sense-voice-simulate-streaming-microphone-cxx-api
    ./sense-voice-simulate-streaming-microphone-cxx-api.cc
    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
  )
  target_link_libraries(sense-voice-simulate-streaming-microphone-cxx-api
    sherpa-onnx-cxx-api
    portaudio_static
  )
endif()
  40 +
30 add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc) 41 add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc)
31 target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api) 42 target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api)
32 43
@@ -33,8 +33,8 @@ int32_t main() { @@ -33,8 +33,8 @@ int32_t main() {
33 config.model_config.num_threads = 1; 33 config.model_config.num_threads = 1;
34 34
35 std::cout << "Loading model\n"; 35 std::cout << "Loading model\n";
36 - OfflineRecognizer recongizer = OfflineRecognizer::Create(config);  
37 - if (!recongizer.Get()) { 36 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  37 + if (!recognizer.Get()) {
38 std::cerr << "Please check your config\n"; 38 std::cerr << "Please check your config\n";
39 return -1; 39 return -1;
40 } 40 }
@@ -49,13 +49,13 @@ int32_t main() { @@ -49,13 +49,13 @@ int32_t main() {
49 std::cout << "Start recognition\n"; 49 std::cout << "Start recognition\n";
50 const auto begin = std::chrono::steady_clock::now(); 50 const auto begin = std::chrono::steady_clock::now();
51 51
52 - OfflineStream stream = recongizer.CreateStream(); 52 + OfflineStream stream = recognizer.CreateStream();
53 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 53 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
54 wave.samples.size()); 54 wave.samples.size());
55 55
56 - recongizer.Decode(&stream); 56 + recognizer.Decode(&stream);
57 57
58 - OfflineRecognizerResult result = recongizer.GetResult(&stream); 58 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
59 59
60 const auto end = std::chrono::steady_clock::now(); 60 const auto end = std::chrono::steady_clock::now();
61 const float elapsed_seconds = 61 const float elapsed_seconds =
@@ -32,8 +32,8 @@ int32_t main() { @@ -32,8 +32,8 @@ int32_t main() {
32 config.model_config.num_threads = 1; 32 config.model_config.num_threads = 1;
33 33
34 std::cout << "Loading model\n"; 34 std::cout << "Loading model\n";
35 - OfflineRecognizer recongizer = OfflineRecognizer::Create(config);  
36 - if (!recongizer.Get()) { 35 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  36 + if (!recognizer.Get()) {
37 std::cerr << "Please check your config\n"; 37 std::cerr << "Please check your config\n";
38 return -1; 38 return -1;
39 } 39 }
@@ -50,13 +50,13 @@ int32_t main() { @@ -50,13 +50,13 @@ int32_t main() {
50 std::cout << "Start recognition\n"; 50 std::cout << "Start recognition\n";
51 const auto begin = std::chrono::steady_clock::now(); 51 const auto begin = std::chrono::steady_clock::now();
52 52
53 - OfflineStream stream = recongizer.CreateStream(); 53 + OfflineStream stream = recognizer.CreateStream();
54 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 54 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
55 wave.samples.size()); 55 wave.samples.size());
56 56
57 - recongizer.Decode(&stream); 57 + recognizer.Decode(&stream);
58 58
59 - OfflineRecognizerResult result = recongizer.GetResult(&stream); 59 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
60 60
61 const auto end = std::chrono::steady_clock::now(); 61 const auto end = std::chrono::steady_clock::now();
62 const float elapsed_seconds = 62 const float elapsed_seconds =
@@ -36,8 +36,8 @@ int32_t main() { @@ -36,8 +36,8 @@ int32_t main() {
36 config.model_config.num_threads = 1; 36 config.model_config.num_threads = 1;
37 37
38 std::cout << "Loading model\n"; 38 std::cout << "Loading model\n";
39 - OfflineRecognizer recongizer = OfflineRecognizer::Create(config);  
40 - if (!recongizer.Get()) { 39 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  40 + if (!recognizer.Get()) {
41 std::cerr << "Please check your config\n"; 41 std::cerr << "Please check your config\n";
42 return -1; 42 return -1;
43 } 43 }
@@ -54,13 +54,13 @@ int32_t main() { @@ -54,13 +54,13 @@ int32_t main() {
54 std::cout << "Start recognition\n"; 54 std::cout << "Start recognition\n";
55 const auto begin = std::chrono::steady_clock::now(); 55 const auto begin = std::chrono::steady_clock::now();
56 56
57 - OfflineStream stream = recongizer.CreateStream(); 57 + OfflineStream stream = recognizer.CreateStream();
58 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 58 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
59 wave.samples.size()); 59 wave.samples.size());
60 60
61 - recongizer.Decode(&stream); 61 + recognizer.Decode(&stream);
62 62
63 - OfflineRecognizerResult result = recongizer.GetResult(&stream); 63 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
64 64
65 const auto end = std::chrono::steady_clock::now(); 65 const auto end = std::chrono::steady_clock::now();
66 const float elapsed_seconds = 66 const float elapsed_seconds =
@@ -32,8 +32,8 @@ int32_t main() { @@ -32,8 +32,8 @@ int32_t main() {
32 config.model_config.num_threads = 1; 32 config.model_config.num_threads = 1;
33 33
34 std::cout << "Loading model\n"; 34 std::cout << "Loading model\n";
35 - OfflineRecognizer recongizer = OfflineRecognizer::Create(config);  
36 - if (!recongizer.Get()) { 35 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  36 + if (!recognizer.Get()) {
37 std::cerr << "Please check your config\n"; 37 std::cerr << "Please check your config\n";
38 return -1; 38 return -1;
39 } 39 }
@@ -51,13 +51,13 @@ int32_t main() { @@ -51,13 +51,13 @@ int32_t main() {
51 std::cout << "Start recognition\n"; 51 std::cout << "Start recognition\n";
52 const auto begin = std::chrono::steady_clock::now(); 52 const auto begin = std::chrono::steady_clock::now();
53 53
54 - OfflineStream stream = recongizer.CreateStream(); 54 + OfflineStream stream = recognizer.CreateStream();
55 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 55 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
56 wave.samples.size()); 56 wave.samples.size());
57 57
58 - recongizer.Decode(&stream); 58 + recognizer.Decode(&stream);
59 59
60 - OfflineRecognizerResult result = recongizer.GetResult(&stream); 60 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
61 61
62 const auto end = std::chrono::steady_clock::now(); 62 const auto end = std::chrono::steady_clock::now();
63 const float elapsed_seconds = 63 const float elapsed_seconds =
  1 +// cxx-api-examples/sense-voice-simulate-streaming-microphone-cxx-api.cc
  2 +// Copyright (c) 2025 Xiaomi Corporation
  3 +
  4 +//
  5 +// This file demonstrates how to use sense voice with sherpa-onnx's C++ API
  6 +// for streaming speech recognition from a microphone.
  7 +//
  8 +// clang-format off
  9 +//
  10 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  11 +//
  12 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  13 +// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  14 +// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  15 +//
  16 +// clang-format on
  17 +
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>           // NOLINT
#include <atomic>              // NOLINT
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <iostream>
#include <mutex>  // NOLINT
#include <queue>
#include <vector>

#include "portaudio.h"       // NOLINT
#include "sherpa-display.h"  // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/microphone.h"
  33 +
// Shared between the PortAudio capture thread (producer) and the main
// thread (consumer); protected by `mutex` + `condition_variable`.
std::queue<std::vector<float>> samples_queue;
std::condition_variable condition_variable;
std::mutex mutex;

// Set from the SIGINT handler and read concurrently by the capture callback
// and the main loop; std::atomic avoids a data race on this cross-thread
// flag (a plain bool written from a signal handler and read from another
// thread is undefined behavior).
std::atomic<bool> stop{false};

// SIGINT handler: requests shutdown and wakes the main loop.
static void Handler(int32_t /*sig*/) {
  stop = true;
  // NOTE(review): condition_variable::notify_one() and fprintf() are not
  // async-signal-safe; acceptable for an example, but a production program
  // should only set the flag here and do the wake-up/logging elsewhere.
  condition_variable.notify_one();
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
  44 +
  45 +static int32_t RecordCallback(const void *input_buffer,
  46 + void * /*output_buffer*/,
  47 + unsigned long frames_per_buffer, // NOLINT
  48 + const PaStreamCallbackTimeInfo * /*time_info*/,
  49 + PaStreamCallbackFlags /*status_flags*/,
  50 + void * /*user_data*/) {
  51 + std::lock_guard<std::mutex> lock(mutex);
  52 + samples_queue.emplace(
  53 + reinterpret_cast<const float *>(input_buffer),
  54 + reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
  55 + condition_variable.notify_one();
  56 +
  57 + return stop ? paComplete : paContinue;
  58 +}
  59 +
  60 +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  61 + using namespace sherpa_onnx::cxx; // NOLINT
  62 + VadModelConfig config;
  63 + config.silero_vad.model = "./silero_vad.onnx";
  64 + config.silero_vad.threshold = 0.5;
  65 + config.silero_vad.min_silence_duration = 0.1;
  66 + config.silero_vad.min_speech_duration = 0.25;
  67 + config.silero_vad.max_speech_duration = 8;
  68 + config.sample_rate = 16000;
  69 + config.debug = false;
  70 +
  71 + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
  72 + if (!vad.Get()) {
  73 + std::cerr << "Failed to create VAD. Please check your config\n";
  74 + exit(-1);
  75 + }
  76 +
  77 + return vad;
  78 +}
  79 +
  80 +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  81 + using namespace sherpa_onnx::cxx; // NOLINT
  82 + OfflineRecognizerConfig config;
  83 +
  84 + config.model_config.sense_voice.model =
  85 + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
  86 + config.model_config.sense_voice.use_itn = false;
  87 + config.model_config.sense_voice.language = "auto";
  88 + config.model_config.tokens =
  89 + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
  90 +
  91 + config.model_config.num_threads = 2;
  92 + config.model_config.debug = false;
  93 +
  94 + std::cout << "Loading model\n";
  95 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  96 + if (!recognizer.Get()) {
  97 + std::cerr << "Please check your config\n";
  98 + exit(-1);
  99 + }
  100 + std::cout << "Loading model done\n";
  101 + return recognizer;
  102 +}
  103 +
  104 +int32_t main() {
  105 + signal(SIGINT, Handler);
  106 +
  107 + using namespace sherpa_onnx::cxx; // NOLINT
  108 +
  109 + auto vad = CreateVad();
  110 + auto recognizer = CreateOfflineRecognizer();
  111 +
  112 + sherpa_onnx::Microphone mic;
  113 +
  114 + PaDeviceIndex num_devices = Pa_GetDeviceCount();
  115 + std::cout << "Num devices: " << num_devices << "\n";
  116 + if (num_devices == 0) {
  117 + std::cerr << " If you are using Linux, please try "
  118 + "./build/bin/sense-voice-simulate-streaming-alsa-cxx-api\n";
  119 + return -1;
  120 + }
  121 +
  122 + int32_t device_index = Pa_GetDefaultInputDevice();
  123 +
  124 + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
  125 + if (pDeviceIndex) {
  126 + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
  127 + device_index = atoi(pDeviceIndex);
  128 + }
  129 +
  130 + for (int32_t i = 0; i != num_devices; ++i) {
  131 + const PaDeviceInfo *info = Pa_GetDeviceInfo(i);
  132 + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i,
  133 + info->name);
  134 + }
  135 +
  136 + PaStreamParameters param;
  137 + param.device = device_index;
  138 +
  139 + fprintf(stderr, "Use device: %d\n", param.device);
  140 +
  141 + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
  142 + fprintf(stderr, " Name: %s\n", info->name);
  143 + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels);
  144 +
  145 + param.channelCount = 1;
  146 + param.sampleFormat = paFloat32;
  147 +
  148 + param.suggestedLatency = info->defaultLowInputLatency;
  149 + param.hostApiSpecificStreamInfo = nullptr;
  150 + float mic_sample_rate = 16000;
  151 + const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
  152 + if (sample_rate_str) {
  153 + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
  154 + mic_sample_rate = atof(sample_rate_str);
  155 + }
  156 + float sample_rate = 16000;
  157 + LinearResampler resampler;
  158 + if (mic_sample_rate != sample_rate) {
  159 + float min_freq = std::min(mic_sample_rate, sample_rate);
  160 + float lowpass_cutoff = 0.99 * 0.5 * min_freq;
  161 +
  162 + int32_t lowpass_filter_width = 6;
  163 + resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
  164 + lowpass_cutoff, lowpass_filter_width);
  165 + }
  166 +
  167 + PaStream *stream;
  168 + PaError err =
  169 + Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
  170 + mic_sample_rate,
  171 + 0, // frames per buffer
  172 + paClipOff, // we won't output out of range samples
  173 + // so don't bother clipping them
  174 + RecordCallback, // RecordCallback is run in a separate
  175 + // thread created by portaudio
  176 + nullptr);
  177 + if (err != paNoError) {
  178 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  179 + exit(EXIT_FAILURE);
  180 + }
  181 +
  182 + err = Pa_StartStream(stream);
  183 + if (err != paNoError) {
  184 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  185 + exit(EXIT_FAILURE);
  186 + }
  187 +
  188 + int32_t window_size = 512; // samples, please don't change
  189 +
  190 + int32_t offset = 0;
  191 + std::vector<float> buffer;
  192 + bool speech_started = false;
  193 +
  194 + auto started_time = std::chrono::steady_clock::now();
  195 +
  196 + SherpaDisplay display;
  197 +
  198 + std::cout << "Started! Please speak\n";
  199 +
  200 + while (!stop) {
  201 + {
  202 + std::unique_lock<std::mutex> lock(mutex);
  203 + while (samples_queue.empty() && !stop) {
  204 + condition_variable.wait(lock);
  205 + }
  206 +
  207 + const auto &s = samples_queue.front();
  208 + if (!resampler.Get()) {
  209 + buffer.insert(buffer.end(), s.begin(), s.end());
  210 + } else {
  211 + auto resampled = resampler.Resample(s.data(), s.size(), false);
  212 + buffer.insert(buffer.end(), resampled.begin(), resampled.end());
  213 + }
  214 +
  215 + samples_queue.pop();
  216 + }
  217 +
  218 + for (; offset + window_size < buffer.size(); offset += window_size) {
  219 + vad.AcceptWaveform(buffer.data() + offset, window_size);
  220 + if (!speech_started && vad.IsDetected()) {
  221 + speech_started = true;
  222 + started_time = std::chrono::steady_clock::now();
  223 + }
  224 + }
  225 + if (!speech_started) {
  226 + if (buffer.size() > 10 * window_size) {
  227 + offset -= buffer.size() - 10 * window_size;
  228 + buffer = {buffer.end() - 10 * window_size, buffer.end()};
  229 + }
  230 + }
  231 +
  232 + auto current_time = std::chrono::steady_clock::now();
  233 + const float elapsed_seconds =
  234 + std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
  235 + started_time)
  236 + .count() /
  237 + 1000.;
  238 +
  239 + if (speech_started && elapsed_seconds > 0.2) {
  240 + OfflineStream stream = recognizer.CreateStream();
  241 + stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
  242 +
  243 + recognizer.Decode(&stream);
  244 +
  245 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  246 + display.UpdateText(result.text);
  247 + display.Display();
  248 +
  249 + started_time = std::chrono::steady_clock::now();
  250 + }
  251 +
  252 + while (!vad.IsEmpty()) {
  253 + auto segment = vad.Front();
  254 +
  255 + vad.Pop();
  256 +
  257 + OfflineStream stream = recognizer.CreateStream();
  258 + stream.AcceptWaveform(sample_rate, segment.samples.data(),
  259 + segment.samples.size());
  260 +
  261 + recognizer.Decode(&stream);
  262 +
  263 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
  264 +
  265 + display.UpdateText(result.text);
  266 + display.FinalizeCurrentSentence();
  267 + display.Display();
  268 +
  269 + buffer.clear();
  270 + offset = 0;
  271 + speech_started = false;
  272 + }
  273 + }
  274 +
  275 + err = Pa_CloseStream(stream);
  276 + if (err != paNoError) {
  277 + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
  278 + exit(EXIT_FAILURE);
  279 + }
  280 +
  281 + return 0;
  282 +}
@@ -47,8 +47,8 @@ int32_t main() { @@ -47,8 +47,8 @@ int32_t main() {
47 config.model_config.num_threads = 1; 47 config.model_config.num_threads = 1;
48 48
49 std::cout << "Loading model\n"; 49 std::cout << "Loading model\n";
50 - OfflineRecognizer recongizer = OfflineRecognizer::Create(config);  
51 - if (!recongizer.Get()) { 50 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  51 + if (!recognizer.Get()) {
52 std::cerr << "Please check your config\n"; 52 std::cerr << "Please check your config\n";
53 return -1; 53 return -1;
54 } 54 }
@@ -65,13 +65,13 @@ int32_t main() { @@ -65,13 +65,13 @@ int32_t main() {
65 std::cout << "Start recognition\n"; 65 std::cout << "Start recognition\n";
66 const auto begin = std::chrono::steady_clock::now(); 66 const auto begin = std::chrono::steady_clock::now();
67 67
68 - OfflineStream stream = recongizer.CreateStream(); 68 + OfflineStream stream = recognizer.CreateStream();
69 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 69 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
70 wave.samples.size()); 70 wave.samples.size());
71 71
72 - recongizer.Decode(&stream); 72 + recognizer.Decode(&stream);
73 73
74 - OfflineRecognizerResult result = recongizer.GetResult(&stream); 74 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
75 75
76 const auto end = std::chrono::steady_clock::now(); 76 const auto end = std::chrono::steady_clock::now();
77 const float elapsed_seconds = 77 const float elapsed_seconds =
// cxx-api-examples/sherpa-display.h
//
// A tiny terminal "display" used by the streaming ASR examples: it keeps a
// list of finalized sentences plus the sentence currently being recognized,
// and redraws the whole screen on every update.
#pragma once

#include <stdlib.h>

#include <cstdint>
#include <cstdio>
#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

namespace sherpa_onnx::cxx {

class SherpaDisplay {
 public:
  // Replaces the in-progress (not yet finalized) recognition text.
  void UpdateText(const std::string &text) { current_text_ = text; }

  // Moves the in-progress text into the finalized sentence list, stamped
  // with the current local date/time. Text starting with a space is
  // treated as blank/noise and is kept as in-progress text.
  void FinalizeCurrentSentence() {
    if (!current_text_.empty() && current_text_[0] != ' ') {
      sentences_.push_back({GetCurrentDateTime(), std::move(current_text_)});
      // A moved-from std::string is valid but unspecified; clear it
      // explicitly so Display() never shows stale/garbage text.
      current_text_.clear();
    }
  }

  // Redraws the terminal: clears the screen, then prints all finalized
  // sentences followed by the sentence currently being recognized.
  void Display() const {
    if (!sentences_.empty() || !current_text_.empty()) {
      ClearScreen();
    }

    printf("=== Speech Recognition with Next-gen Kaldi ===\n");
    printf("------------------------------\n");
    if (!sentences_.empty()) {
      int32_t i = 1;
      for (const auto &p : sentences_) {
        printf("[%s] %d. %s\n", p.first.c_str(), i, p.second.c_str());
        i += 1;
      }

      printf("------------------------------\n");
    }

    if (!current_text_.empty()) {
      printf("Recognizing: %s\n", current_text_.c_str());
    }
  }

 private:
  // Clears the terminal by shelling out to the platform's clear command.
  static void ClearScreen() {
#ifdef _MSC_VER
    system("cls");
#else
    system("clear");
#endif
  }

  // Returns the current local time formatted as "YYYY-mm-dd HH:MM:SS".
  static std::string GetCurrentDateTime() {
    std::ostringstream os;
    auto t = std::time(nullptr);
    auto tm = std::localtime(&t);
    os << std::put_time(tm, "%Y-%m-%d %H:%M:%S");
    return os.str();
  }

 private:
  // (timestamp, text) pairs of finalized sentences, oldest first.
  std::vector<std::pair<std::string, std::string>> sentences_;
  // The sentence currently being recognized (not yet finalized).
  std::string current_text_;
};

}  // namespace sherpa_onnx::cxx
@@ -44,8 +44,8 @@ int32_t main() { @@ -44,8 +44,8 @@ int32_t main() {
44 config.model_config.num_threads = 1; 44 config.model_config.num_threads = 1;
45 45
46 std::cout << "Loading model\n"; 46 std::cout << "Loading model\n";
47 - OnlineRecognizer recongizer = OnlineRecognizer::Create(config);  
48 - if (!recongizer.Get()) { 47 + OnlineRecognizer recognizer = OnlineRecognizer::Create(config);
  48 + if (!recognizer.Get()) {
49 std::cerr << "Please check your config\n"; 49 std::cerr << "Please check your config\n";
50 return -1; 50 return -1;
51 } 51 }
@@ -63,16 +63,16 @@ int32_t main() { @@ -63,16 +63,16 @@ int32_t main() {
63 std::cout << "Start recognition\n"; 63 std::cout << "Start recognition\n";
64 const auto begin = std::chrono::steady_clock::now(); 64 const auto begin = std::chrono::steady_clock::now();
65 65
66 - OnlineStream stream = recongizer.CreateStream(); 66 + OnlineStream stream = recognizer.CreateStream();
67 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 67 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
68 wave.samples.size()); 68 wave.samples.size());
69 stream.InputFinished(); 69 stream.InputFinished();
70 70
71 - while (recongizer.IsReady(&stream)) {  
72 - recongizer.Decode(&stream); 71 + while (recognizer.IsReady(&stream)) {
  72 + recognizer.Decode(&stream);
73 } 73 }
74 74
75 - OnlineRecognizerResult result = recongizer.GetResult(&stream); 75 + OnlineRecognizerResult result = recognizer.GetResult(&stream);
76 76
77 const auto end = std::chrono::steady_clock::now(); 77 const auto end = std::chrono::steady_clock::now();
78 const float elapsed_seconds = 78 const float elapsed_seconds =
@@ -73,8 +73,8 @@ int32_t main(int argc, char *argv[]) { @@ -73,8 +73,8 @@ int32_t main(int argc, char *argv[]) {
73 config.model_config.provider = use_gpu ? "cuda" : "cpu"; 73 config.model_config.provider = use_gpu ? "cuda" : "cpu";
74 74
75 std::cout << "Loading model\n"; 75 std::cout << "Loading model\n";
76 - OnlineRecognizer recongizer = OnlineRecognizer::Create(config);  
77 - if (!recongizer.Get()) { 76 + OnlineRecognizer recognizer = OnlineRecognizer::Create(config);
  77 + if (!recognizer.Get()) {
78 std::cerr << "Please check your config\n"; 78 std::cerr << "Please check your config\n";
79 return -1; 79 return -1;
80 } 80 }
@@ -95,16 +95,16 @@ int32_t main(int argc, char *argv[]) { @@ -95,16 +95,16 @@ int32_t main(int argc, char *argv[]) {
95 for (int32_t i = 0; i < num_runs; ++i) { 95 for (int32_t i = 0; i < num_runs; ++i) {
96 const auto begin = std::chrono::steady_clock::now(); 96 const auto begin = std::chrono::steady_clock::now();
97 97
98 - OnlineStream stream = recongizer.CreateStream(); 98 + OnlineStream stream = recognizer.CreateStream();
99 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 99 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
100 wave.samples.size()); 100 wave.samples.size());
101 stream.InputFinished(); 101 stream.InputFinished();
102 102
103 - while (recongizer.IsReady(&stream)) {  
104 - recongizer.Decode(&stream); 103 + while (recognizer.IsReady(&stream)) {
  104 + recognizer.Decode(&stream);
105 } 105 }
106 106
107 - result = recongizer.GetResult(&stream); 107 + result = recognizer.GetResult(&stream);
108 108
109 auto end = std::chrono::steady_clock::now(); 109 auto end = std::chrono::steady_clock::now();
110 float elapsed_seconds = 110 float elapsed_seconds =
@@ -59,8 +59,8 @@ int32_t main() { @@ -59,8 +59,8 @@ int32_t main() {
59 config.hr.rule_fsts = "./replace.fst"; 59 config.hr.rule_fsts = "./replace.fst";
60 60
61 std::cout << "Loading model\n"; 61 std::cout << "Loading model\n";
62 - OnlineRecognizer recongizer = OnlineRecognizer::Create(config);  
63 - if (!recongizer.Get()) { 62 + OnlineRecognizer recognizer = OnlineRecognizer::Create(config);
  63 + if (!recognizer.Get()) {
64 std::cerr << "Please check your config\n"; 64 std::cerr << "Please check your config\n";
65 return -1; 65 return -1;
66 } 66 }
@@ -76,16 +76,16 @@ int32_t main() { @@ -76,16 +76,16 @@ int32_t main() {
76 std::cout << "Start recognition\n"; 76 std::cout << "Start recognition\n";
77 const auto begin = std::chrono::steady_clock::now(); 77 const auto begin = std::chrono::steady_clock::now();
78 78
79 - OnlineStream stream = recongizer.CreateStream(); 79 + OnlineStream stream = recognizer.CreateStream();
80 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 80 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
81 wave.samples.size()); 81 wave.samples.size());
82 stream.InputFinished(); 82 stream.InputFinished();
83 83
84 - while (recongizer.IsReady(&stream)) {  
85 - recongizer.Decode(&stream); 84 + while (recognizer.IsReady(&stream)) {
  85 + recognizer.Decode(&stream);
86 } 86 }
87 87
88 - OnlineRecognizerResult result = recongizer.GetResult(&stream); 88 + OnlineRecognizerResult result = recognizer.GetResult(&stream);
89 89
90 const auto end = std::chrono::steady_clock::now(); 90 const auto end = std::chrono::steady_clock::now();
91 const float elapsed_seconds = 91 const float elapsed_seconds =
@@ -32,8 +32,8 @@ int32_t main() { @@ -32,8 +32,8 @@ int32_t main() {
32 config.model_config.num_threads = 1; 32 config.model_config.num_threads = 1;
33 33
34 std::cout << "Loading model\n"; 34 std::cout << "Loading model\n";
35 - OfflineRecognizer recongizer = OfflineRecognizer::Create(config);  
36 - if (!recongizer.Get()) { 35 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  36 + if (!recognizer.Get()) {
37 std::cerr << "Please check your config\n"; 37 std::cerr << "Please check your config\n";
38 return -1; 38 return -1;
39 } 39 }
@@ -49,13 +49,13 @@ int32_t main() { @@ -49,13 +49,13 @@ int32_t main() {
49 std::cout << "Start recognition\n"; 49 std::cout << "Start recognition\n";
50 const auto begin = std::chrono::steady_clock::now(); 50 const auto begin = std::chrono::steady_clock::now();
51 51
52 - OfflineStream stream = recongizer.CreateStream(); 52 + OfflineStream stream = recognizer.CreateStream();
53 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), 53 stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
54 wave.samples.size()); 54 wave.samples.size());
55 55
56 - recongizer.Decode(&stream); 56 + recognizer.Decode(&stream);
57 57
58 - OfflineRecognizerResult result = recongizer.GetResult(&stream); 58 + OfflineRecognizerResult result = recognizer.GetResult(&stream);
59 59
60 const auto end = std::chrono::steady_clock::now(); 60 const auto end = std::chrono::steady_clock::now();
61 const float elapsed_seconds = 61 const float elapsed_seconds =
@@ -74,7 +74,7 @@ def get_args(): @@ -74,7 +74,7 @@ def get_args():
74 parser.add_argument( 74 parser.add_argument(
75 "--num-threads", 75 "--num-threads",
76 type=int, 76 type=int,
77 - default=1, 77 + default=2,
78 help="Number of threads for neural network computation", 78 help="Number of threads for neural network computation",
79 ) 79 )
80 80
@@ -164,7 +164,13 @@ def main(): @@ -164,7 +164,13 @@ def main():
164 164
165 config = sherpa_onnx.VadModelConfig() 165 config = sherpa_onnx.VadModelConfig()
166 config.silero_vad.model = args.silero_vad_model 166 config.silero_vad.model = args.silero_vad_model
167 - config.silero_vad.min_silence_duration = 0.25 167 + config.silero_vad.threshold = 0.5
  168 + config.silero_vad.min_silence_duration = 0.1 # seconds
  169 + config.silero_vad.min_speech_duration = 0.25 # seconds
  170 + # If the current segment is larger than this value, then it increases
  171 + # the threshold to 0.9 internally. After detecting this segment,
  172 + # it resets the threshold to its original value.
  173 + config.silero_vad.max_speech_duration = 8 # seconds
168 config.sample_rate = sample_rate 174 config.sample_rate = sample_rate
169 175
170 window_size = config.silero_vad.window_size 176 window_size = config.silero_vad.window_size
@@ -184,20 +190,22 @@ def main(): @@ -184,20 +190,22 @@ def main():
184 started = False 190 started = False
185 started_time = None 191 started_time = None
186 192
  193 + offset = 0
187 while not killed: 194 while not killed:
188 samples = samples_queue.get() # a blocking read 195 samples = samples_queue.get() # a blocking read
189 196
190 buffer = np.concatenate([buffer, samples]) 197 buffer = np.concatenate([buffer, samples])
191 - offset = 0  
192 - while offset + window_size < samples.shape[0]:  
193 - vad.accept_waveform(samples[offset : offset + window_size]) 198 + while offset + window_size < len(buffer):
  199 + vad.accept_waveform(buffer[offset : offset + window_size])
194 if not started and vad.is_speech_detected(): 200 if not started and vad.is_speech_detected():
195 started = True 201 started = True
196 started_time = time.time() 202 started_time = time.time()
197 offset += window_size 203 offset += window_size
198 204
199 if not started: 205 if not started:
200 - buffer = buffer[-10 * window_size :] 206 + if len(buffer) > 10 * window_size:
  207 + offset -= len(buffer) - 10 * window_size
  208 + buffer = buffer[-10 * window_size :]
201 209
202 if started and time.time() - started_time > 0.2: 210 if started and time.time() - started_time > 0.2:
203 stream = recognizer.create_stream() 211 stream = recognizer.create_stream()
@@ -223,6 +231,7 @@ def main(): @@ -223,6 +231,7 @@ def main():
223 display.update_text(text) 231 display.update_text(text)
224 232
225 buffer = [] 233 buffer = []
  234 + offset = 0
226 started = False 235 started = False
227 started_time = None 236 started_time = None
228 237
@@ -678,4 +678,42 @@ void VoiceActivityDetector::Flush() const { @@ -678,4 +678,42 @@ void VoiceActivityDetector::Flush() const {
678 SherpaOnnxVoiceActivityDetectorFlush(p_); 678 SherpaOnnxVoiceActivityDetectorFlush(p_);
679 } 679 }
680 680
  681 +LinearResampler LinearResampler::Create(int32_t samp_rate_in_hz,
  682 + int32_t samp_rate_out_hz,
  683 + float filter_cutoff_hz,
  684 + int32_t num_zeros) {
  685 + auto p = SherpaOnnxCreateLinearResampler(samp_rate_in_hz, samp_rate_out_hz,
  686 + filter_cutoff_hz, num_zeros);
  687 + return LinearResampler(p);
  688 +}
  689 +
  690 +LinearResampler::LinearResampler(const SherpaOnnxLinearResampler *p)
  691 + : MoveOnly<LinearResampler, SherpaOnnxLinearResampler>(p) {}
  692 +
  693 +void LinearResampler::Destroy(const SherpaOnnxLinearResampler *p) const {
  694 + SherpaOnnxDestroyLinearResampler(p);
  695 +}
  696 +
  697 +void LinearResampler::Reset() const { SherpaOnnxLinearResamplerReset(p_); }
  698 +
  699 +std::vector<float> LinearResampler::Resample(const float *input,
  700 + int32_t input_dim,
  701 + bool flush) const {
  702 + auto out = SherpaOnnxLinearResamplerResample(p_, input, input_dim, flush);
  703 +
  704 + std::vector<float> ans{out->samples, out->samples + out->n};
  705 +
  706 + SherpaOnnxLinearResamplerResampleFree(out);
  707 +
  708 + return ans;
  709 +}
  710 +
  711 +int32_t LinearResampler::GetInputSamplingRate() const {
  712 + return SherpaOnnxLinearResamplerResampleGetInputSampleRate(p_);
  713 +}
  714 +
  715 +int32_t LinearResampler::GetOutputSamplingRate() const {
  716 + return SherpaOnnxLinearResamplerResampleGetOutputSampleRate(p_);
  717 +}
  718 +
681 } // namespace sherpa_onnx::cxx 719 } // namespace sherpa_onnx::cxx
@@ -111,6 +111,7 @@ SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave);
111 template <typename Derived, typename T> 111 template <typename Derived, typename T>
112 class SHERPA_ONNX_API MoveOnly { 112 class SHERPA_ONNX_API MoveOnly {
113 public: 113 public:
  114 + MoveOnly() = default;
114 explicit MoveOnly(const T *p) : p_(p) {} 115 explicit MoveOnly(const T *p) : p_(p) {}
115 116
116 ~MoveOnly() { Destroy(); } 117 ~MoveOnly() { Destroy(); }
@@ -591,6 +592,28 @@ class SHERPA_ONNX_API VoiceActivityDetector
591 explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p); 592 explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
592 }; 593 };
593 594
  595 +class SHERPA_ONNX_API LinearResampler
  596 + : public MoveOnly<LinearResampler, SherpaOnnxLinearResampler> {
  597 + public:
  598 + LinearResampler() = default;
  599 + static LinearResampler Create(int32_t samp_rate_in_hz,
  600 + int32_t samp_rate_out_hz,
  601 + float filter_cutoff_hz, int32_t num_zeros);
  602 +
  603 + void Destroy(const SherpaOnnxLinearResampler *p) const;
  604 +
  605 + void Reset() const;
  606 +
  607 + std::vector<float> Resample(const float *input, int32_t input_dim,
  608 + bool flush) const;
  609 +
  610 + int32_t GetInputSamplingRate() const;
  611 + int32_t GetOutputSamplingRate() const;
  612 +
  613 + private:
  614 + explicit LinearResampler(const SherpaOnnxLinearResampler *p);
  615 +};
  616 +
594 } // namespace sherpa_onnx::cxx 617 } // namespace sherpa_onnx::cxx
595 618
596 #endif // SHERPA_ONNX_C_API_CXX_API_H_ 619 #endif // SHERPA_ONNX_C_API_CXX_API_H_
@@ -166,20 +166,32 @@ class HomophoneReplacer::Impl {
166 } 166 }
167 167
168 // convert words to pronunciations 168 // convert words to pronunciations
169 - std::vector<std::string> pronunciations; 169 + std::vector<std::string> current_words;
  170 + std::vector<std::string> current_pronunciations;
170 171
171 for (const auto &w : words) { 172 for (const auto &w : words) {
  173 + if (w.size() < 3 ||
  174 + reinterpret_cast<const uint8_t *>(w.data())[0] < 128) {
  175 + if (!current_words.empty()) {
  176 + ans += ApplyImpl(current_words, current_pronunciations);
  177 + current_words.clear();
  178 + current_pronunciations.clear();
  179 + }
  180 + ans += w;
  181 + continue;
  182 + }
  183 +
172 auto p = ConvertWordToPronunciation(w); 184 auto p = ConvertWordToPronunciation(w);
173 if (config_.debug) { 185 if (config_.debug) {
174 SHERPA_ONNX_LOGE("%s %s", w.c_str(), p.c_str()); 186 SHERPA_ONNX_LOGE("%s %s", w.c_str(), p.c_str());
175 } 187 }
176 - pronunciations.push_back(std::move(p)); 188 +
  189 + current_words.push_back(w);
  190 + current_pronunciations.push_back(std::move(p));
177 } 191 }
178 192
179 - for (const auto &r : replacer_list_) {  
180 - ans = r->Normalize(words, pronunciations);  
181 - // TODO(fangjun): We support only 1 rule fst at present.  
182 - break; 193 + if (!current_words.empty()) {
  194 + ans += ApplyImpl(current_words, current_pronunciations);
183 } 195 }
184 196
185 if (config_.debug) { 197 if (config_.debug) {
@@ -190,6 +202,16 @@ class HomophoneReplacer::Impl {
190 } 202 }
191 203
192 private: 204 private:
  205 + std::string ApplyImpl(const std::vector<std::string> &words,
  206 + const std::vector<std::string> &pronunciations) const {
  207 + std::string ans;
  208 + for (const auto &r : replacer_list_) {
  209 + ans = r->Normalize(words, pronunciations);
  210 + // TODO(fangjun): We support only 1 rule fst at present.
  211 + break;
  212 + }
  213 + return ans;
  214 + }
193 std::string ConvertWordToPronunciation(const std::string &word) const { 215 std::string ConvertWordToPronunciation(const std::string &word) const {
194 if (word2pron_.count(word)) { 216 if (word2pron_.count(word)) {
195 return word2pron_.at(word); 217 return word2pron_.at(word);
@@ -239,6 +261,9 @@ class HomophoneReplacer::Impl {
239 } 261 }
240 262
241 while (iss >> p) { 263 while (iss >> p) {
  264 + if (p.back() > '4') {
  265 + p.push_back('1');
  266 + }
242 pron.append(std::move(p)); 267 pron.append(std::move(p));
243 } 268 }
244 269