Fangjun Kuang
Committed by GitHub

Add alsa-based streaming ASR example for sense voice. (#2207)

@@ -82,6 +82,8 @@ jobs: @@ -82,6 +82,8 @@ jobs:
82 .. 82 ..
83 make -j4 install 83 make -j4 install
84 84
  85 + cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
  86 +
85 rm -rf install/lib/pkgconfig 87 rm -rf install/lib/pkgconfig
86 rm -fv install/lib/cargs.h 88 rm -fv install/lib/cargs.h
87 rm -fv install/lib/libcargs.so 89 rm -fv install/lib/libcargs.so
@@ -126,6 +128,8 @@ jobs: @@ -126,6 +128,8 @@ jobs:
126 128
127 make -j4 install 129 make -j4 install
128 130
  131 + cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
  132 +
129 rm -rf install/lib/pkgconfig 133 rm -rf install/lib/pkgconfig
130 rm -fv install/lib/cargs.h 134 rm -fv install/lib/cargs.h
131 rm -fv install/lib/libcargs.so 135 rm -fv install/lib/libcargs.so
@@ -242,7 +246,7 @@ jobs: @@ -242,7 +246,7 @@ jobs:
242 file: sherpa-onnx-*linux-aarch64*.tar.bz2 246 file: sherpa-onnx-*linux-aarch64*.tar.bz2
243 # repo_name: k2-fsa/sherpa-onnx 247 # repo_name: k2-fsa/sherpa-onnx
244 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} 248 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
245 - # tag: v1.11.1 249 + # tag: v1.11.5
246 250
247 - name: Test offline Moonshine 251 - name: Test offline Moonshine
248 if: matrix.build_type != 'Debug' 252 if: matrix.build_type != 'Debug'
@@ -83,6 +83,8 @@ jobs: @@ -83,6 +83,8 @@ jobs:
83 83
84 make install 84 make install
85 85
  86 + cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
  87 +
86 ls -lh install/lib 88 ls -lh install/lib
87 89
88 rm -rf install/lib/pkgconfig 90 rm -rf install/lib/pkgconfig
@@ -164,7 +166,7 @@ jobs: @@ -164,7 +166,7 @@ jobs:
164 file: sherpa-onnx-*linux-aarch64*.tar.bz2 166 file: sherpa-onnx-*linux-aarch64*.tar.bz2
165 # repo_name: k2-fsa/sherpa-onnx 167 # repo_name: k2-fsa/sherpa-onnx
166 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} 168 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
167 - # tag: v1.10.42 169 + # tag: v1.11.5
168 170
169 - name: Test offline Moonshine 171 - name: Test offline Moonshine
170 if: matrix.build_type != 'Debug' 172 if: matrix.build_type != 'Debug'
@@ -168,7 +168,7 @@ jobs: @@ -168,7 +168,7 @@ jobs:
168 file: sherpa-onnx-*-android.tar.bz2 168 file: sherpa-onnx-*-android.tar.bz2
169 # repo_name: k2-fsa/sherpa-onnx 169 # repo_name: k2-fsa/sherpa-onnx
170 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} 170 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
171 - # tag: v1.11.3 171 + # tag: v1.11.5
172 172
173 build-android-aar: 173 build-android-aar:
174 needs: [build-android-libs] 174 needs: [build-android-libs]
@@ -297,7 +297,7 @@ jobs: @@ -297,7 +297,7 @@ jobs:
297 file: ./*.aar 297 file: ./*.aar
298 # repo_name: k2-fsa/sherpa-onnx 298 # repo_name: k2-fsa/sherpa-onnx
299 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} 299 # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
300 - # tag: v1.11.3 300 + # tag: v1.11.5
301 301
302 - name: Release android aar 302 - name: Release android aar
303 if: github.repository_owner == 'k2-fsa' && github.event_name == 'push' && contains(github.ref, 'refs/tags/') 303 if: github.repository_owner == 'k2-fsa' && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
@@ -47,6 +47,23 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) @@ -47,6 +47,23 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
47 ) 47 )
48 endif() 48 endif()
49 49
# Build the ALSA-based simulated-streaming SenseVoice example only when
# ALSA is available (SHERPA_ONNX_HAS_ALSA comes from the top-level build).
if(SHERPA_ONNX_HAS_ALSA)
  add_executable(sense-voice-simulate-streaming-alsa-cxx-api
    ./sense-voice-simulate-streaming-alsa-cxx-api.cc
    # Compiles the internal Alsa helper class directly from csrc.
    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc
  )
  # NOTE(review): portaudio_static looks like a copy-paste leftover from the
  # portaudio-based examples above — this target records via ALSA and the .cc
  # calls no Pa_* symbols; confirm and consider dropping the link.
  target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api
    sherpa-onnx-cxx-api
    portaudio_static
  )

  # Cross-compile setups can point at a specific libasound via the
  # SHERPA_ONNX_ALSA_LIB_DIR environment variable.
  if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
    target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
  else()
    target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound)
  endif()
endif()
50 add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc) 67 add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc)
51 target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api) 68 target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api)
52 69
  1 +// cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc
  2 +// Copyright (c) 2025 Xiaomi Corporation
  3 +
  4 +//
  5 +// This file demonstrates how to use sense voice with sherpa-onnx's C++ API
  6 +// for streaming speech recognition from a microphone.
  7 +//
  8 +// clang-format off
  9 +//
  10 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  11 +//
  12 +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  13 +// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  14 +// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  15 +//
  16 +// clang-format on
  17 +
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <atomic>              // NOLINT
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <iostream>
#include <mutex>  // NOLINT
#include <queue>
#include <thread>
#include <vector>

#include "portaudio.h"       // NOLINT
#include "sherpa-display.h"  // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/alsa.h"
  34 +
// Audio chunks produced by the recording thread and consumed by main().
std::queue<std::vector<float>> samples_queue;
// Wakes main() when a chunk is queued or shutdown is requested.
std::condition_variable condition_variable;
std::mutex mutex;  // guards samples_queue
// Shutdown flag, set by the SIGINT handler and read by both the recording
// thread and the main loop. atomic<bool> instead of plain bool: a plain
// bool written from a signal handler and another thread while being read
// concurrently is a data race (undefined behavior).
std::atomic<bool> stop{false};
  39 +
// SIGINT (Ctrl+C) handler: requests shutdown and wakes the main loop,
// which may be blocked waiting on the samples queue. The flag must be set
// before the notify so the woken thread observes it.
//
// NOTE(review): condition_variable::notify_one() and fprintf() are not
// async-signal-safe, so calling them from a signal handler is technically
// undefined behavior per POSIX. It works in practice here; a self-pipe or
// an atomic flag polled with a timed wait would be strictly correct.
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
  45 +
  46 +static void RecordCallback(sherpa_onnx::Alsa *alsa) {
  47 + int32_t chunk = 0.1 * alsa->GetActualSampleRate();
  48 + while (!stop) {
  49 + std::vector<float> samples = alsa->Read(chunk);
  50 +
  51 + std::lock_guard<std::mutex> lock(mutex);
  52 + samples_queue.emplace(std::move(samples));
  53 + condition_variable.notify_one();
  54 + }
  55 +}
  56 +
  57 +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  58 + using namespace sherpa_onnx::cxx; // NOLINT
  59 + VadModelConfig config;
  60 + config.silero_vad.model = "./silero_vad.onnx";
  61 + config.silero_vad.threshold = 0.5;
  62 + config.silero_vad.min_silence_duration = 0.1;
  63 + config.silero_vad.min_speech_duration = 0.25;
  64 + config.silero_vad.max_speech_duration = 8;
  65 + config.sample_rate = 16000;
  66 + config.debug = false;
  67 +
  68 + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
  69 + if (!vad.Get()) {
  70 + std::cerr << "Failed to create VAD. Please check your config\n";
  71 + exit(-1);
  72 + }
  73 +
  74 + return vad;
  75 +}
  76 +
  77 +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  78 + using namespace sherpa_onnx::cxx; // NOLINT
  79 + OfflineRecognizerConfig config;
  80 +
  81 + config.model_config.sense_voice.model =
  82 + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
  83 + config.model_config.sense_voice.use_itn = false;
  84 + config.model_config.sense_voice.language = "auto";
  85 + config.model_config.tokens =
  86 + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
  87 +
  88 + config.model_config.num_threads = 2;
  89 + config.model_config.debug = false;
  90 +
  91 + std::cout << "Loading model\n";
  92 + OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  93 + if (!recognizer.Get()) {
  94 + std::cerr << "Please check your config\n";
  95 + exit(-1);
  96 + }
  97 + std::cout << "Loading model done\n";
  98 + return recognizer;
  99 +}
  100 +
// Entry point: captures audio from the ALSA device named on the command
// line, runs silero-vad to find speech regions, and decodes them with a
// SenseVoice offline recognizer to simulate streaming recognition.
// Returns 0 on Ctrl+C shutdown, -1 on a usage error; calls exit(-1) if
// the device's sample rate is not the expected 16 kHz.
int32_t main(int32_t argc, const char *argv[]) {
  const char *kUsageMessage = R"usage(
Usage:

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2

./sense-voice-simulate-streaming-alsa-cxx-api device_name

The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and device 0 on that card, please use:

  plughw:3,0

as the device_name.
)usage";

  // Exactly one argument is required: the ALSA capture device name.
  if (argc != 2) {
    fprintf(stderr, "%s\n", kUsageMessage);
    return -1;
  }

  // Ctrl+C sets the global `stop` flag and wakes the main loop below.
  signal(SIGINT, Handler);

  using namespace sherpa_onnx::cxx;  // NOLINT

  auto vad = CreateVad();
  auto recognizer = CreateOfflineRecognizer();

  // Both the VAD config and the recognizer example models use 16 kHz audio.
  int32_t expected_sample_rate = 16000;

  std::string device_name = argv[1];
  sherpa_onnx::Alsa alsa(device_name.c_str());
  fprintf(stderr, "Use recording device: %s\n", device_name.c_str());

  // Refuse to run if the device cannot deliver 16 kHz audio.
  if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
            expected_sample_rate);
    exit(-1);
  }

  // Samples fed to the VAD per call (512 samples = 32 ms at 16 kHz).
  int32_t window_size = 512;  // samples, please don't change

  int32_t offset = 0;          // next unprocessed sample index in `buffer`
  std::vector<float> buffer;   // audio accumulated for the current utterance
  bool speech_started = false;

  auto started_time = std::chrono::steady_clock::now();

  SherpaDisplay display;

  // Producer thread: RecordCallback pushes ~100 ms chunks into samples_queue.
  std::thread record_thread(RecordCallback, &alsa);

  std::cout << "Started! Please speak\n";

  while (!stop) {
    {
      // Block until the recording thread queues a chunk, or the SIGINT
      // handler requests shutdown.
      std::unique_lock<std::mutex> lock(mutex);
      while (samples_queue.empty() && !stop) {
        condition_variable.wait(lock);
      }

      const auto &s = samples_queue.front();
      buffer.insert(buffer.end(), s.begin(), s.end());

      samples_queue.pop();
    }

    // Feed the VAD one window at a time; record when speech first starts so
    // partial results can be emitted on a timer below.
    for (; offset + window_size < buffer.size(); offset += window_size) {
      vad.AcceptWaveform(buffer.data() + offset, window_size);
      if (!speech_started && vad.IsDetected()) {
        speech_started = true;
        started_time = std::chrono::steady_clock::now();
      }
    }
    // While no speech has been detected, cap the buffer at the last 10
    // windows so it does not grow without bound during silence; `offset` is
    // shifted back by the same amount that was dropped from the front.
    if (!speech_started) {
      if (buffer.size() > 10 * window_size) {
        offset -= buffer.size() - 10 * window_size;
        buffer = {buffer.end() - 10 * window_size, buffer.end()};
      }
    }

    auto current_time = std::chrono::steady_clock::now();
    const float elapsed_seconds =
        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
                                                              started_time)
            .count() /
        1000.;

    // Every 0.2 s during active speech, decode everything accumulated so far
    // and show an intermediate (non-final) result.
    if (speech_started && elapsed_seconds > 0.2) {
      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(expected_sample_rate, buffer.data(), buffer.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);
      display.UpdateText(result.text);
      display.Display();

      started_time = std::chrono::steady_clock::now();
    }

    // When the VAD reports completed speech segments, decode each one,
    // finalize the displayed sentence, and reset the utterance state.
    while (!vad.IsEmpty()) {
      auto segment = vad.Front();

      vad.Pop();

      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(expected_sample_rate, segment.samples.data(),
                            segment.samples.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);

      display.UpdateText(result.text);
      display.FinalizeCurrentSentence();
      display.Display();

      buffer.clear();
      offset = 0;
      speech_started = false;
    }
  }

  // NOTE(review): RecordCallback may be blocked inside alsa->Read() when
  // `stop` is set, so the join can lag shutdown by up to one read (~100 ms).
  record_thread.join();

  return 0;
}
@@ -45,10 +45,11 @@ class SherpaDisplay { @@ -45,10 +45,11 @@ class SherpaDisplay {
45 private: 45 private:
// Clears the terminal via the platform shell command: `cls` on Windows
// (MSVC builds), `clear` elsewhere.
static void ClearScreen() {
#ifdef _MSC_VER
  const int ret = system("cls");
#else
  const int ret = system("clear");
#endif
  // system() may be declared warn-unused-result; discard explicitly.
  (void)ret;
}
53 54
54 static std::string GetCurrentDateTime() { 55 static std::string GetCurrentDateTime() {