Committed by
GitHub
Add alsa-based streaming ASR example for sense voice. (#2207)
Showing 6 changed files with 273 additions and 6 deletions
| @@ -82,6 +82,8 @@ jobs: | @@ -82,6 +82,8 @@ jobs: | ||
| 82 | .. | 82 | .. |
| 83 | make -j4 install | 83 | make -j4 install |
| 84 | 84 | ||
| 85 | + cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin | ||
| 86 | + | ||
| 85 | rm -rf install/lib/pkgconfig | 87 | rm -rf install/lib/pkgconfig |
| 86 | rm -fv install/lib/cargs.h | 88 | rm -fv install/lib/cargs.h |
| 87 | rm -fv install/lib/libcargs.so | 89 | rm -fv install/lib/libcargs.so |
| @@ -126,6 +128,8 @@ jobs: | @@ -126,6 +128,8 @@ jobs: | ||
| 126 | 128 | ||
| 127 | make -j4 install | 129 | make -j4 install |
| 128 | 130 | ||
| 131 | + cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin | ||
| 132 | + | ||
| 129 | rm -rf install/lib/pkgconfig | 133 | rm -rf install/lib/pkgconfig |
| 130 | rm -fv install/lib/cargs.h | 134 | rm -fv install/lib/cargs.h |
| 131 | rm -fv install/lib/libcargs.so | 135 | rm -fv install/lib/libcargs.so |
| @@ -242,7 +246,7 @@ jobs: | @@ -242,7 +246,7 @@ jobs: | ||
| 242 | file: sherpa-onnx-*linux-aarch64*.tar.bz2 | 246 | file: sherpa-onnx-*linux-aarch64*.tar.bz2 |
| 243 | # repo_name: k2-fsa/sherpa-onnx | 247 | # repo_name: k2-fsa/sherpa-onnx |
| 244 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} | 248 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} |
| 245 | - # tag: v1.11.1 | 249 | + # tag: v1.11.5 |
| 246 | 250 | ||
| 247 | - name: Test offline Moonshine | 251 | - name: Test offline Moonshine |
| 248 | if: matrix.build_type != 'Debug' | 252 | if: matrix.build_type != 'Debug' |
| @@ -83,6 +83,8 @@ jobs: | @@ -83,6 +83,8 @@ jobs: | ||
| 83 | 83 | ||
| 84 | make install | 84 | make install |
| 85 | 85 | ||
| 86 | + cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin | ||
| 87 | + | ||
| 86 | ls -lh install/lib | 88 | ls -lh install/lib |
| 87 | 89 | ||
| 88 | rm -rf install/lib/pkgconfig | 90 | rm -rf install/lib/pkgconfig |
| @@ -164,7 +166,7 @@ jobs: | @@ -164,7 +166,7 @@ jobs: | ||
| 164 | file: sherpa-onnx-*linux-aarch64*.tar.bz2 | 166 | file: sherpa-onnx-*linux-aarch64*.tar.bz2 |
| 165 | # repo_name: k2-fsa/sherpa-onnx | 167 | # repo_name: k2-fsa/sherpa-onnx |
| 166 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} | 168 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} |
| 167 | - # tag: v1.10.42 | 169 | + # tag: v1.11.5 |
| 168 | 170 | ||
| 169 | - name: Test offline Moonshine | 171 | - name: Test offline Moonshine |
| 170 | if: matrix.build_type != 'Debug' | 172 | if: matrix.build_type != 'Debug' |
| @@ -168,7 +168,7 @@ jobs: | @@ -168,7 +168,7 @@ jobs: | ||
| 168 | file: sherpa-onnx-*-android.tar.bz2 | 168 | file: sherpa-onnx-*-android.tar.bz2 |
| 169 | # repo_name: k2-fsa/sherpa-onnx | 169 | # repo_name: k2-fsa/sherpa-onnx |
| 170 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} | 170 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} |
| 171 | - # tag: v1.11.3 | 171 | + # tag: v1.11.5 |
| 172 | 172 | ||
| 173 | build-android-aar: | 173 | build-android-aar: |
| 174 | needs: [build-android-libs] | 174 | needs: [build-android-libs] |
| @@ -297,7 +297,7 @@ jobs: | @@ -297,7 +297,7 @@ jobs: | ||
| 297 | file: ./*.aar | 297 | file: ./*.aar |
| 298 | # repo_name: k2-fsa/sherpa-onnx | 298 | # repo_name: k2-fsa/sherpa-onnx |
| 299 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} | 299 | # repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} |
| 300 | - # tag: v1.11.3 | 300 | + # tag: v1.11.5 |
| 301 | 301 | ||
| 302 | - name: Release android aar | 302 | - name: Release android aar |
| 303 | if: github.repository_owner == 'k2-fsa' && github.event_name == 'push' && contains(github.ref, 'refs/tags/') | 303 | if: github.repository_owner == 'k2-fsa' && github.event_name == 'push' && contains(github.ref, 'refs/tags/') |
| @@ -47,6 +47,23 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | @@ -47,6 +47,23 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | ||
| 47 | ) | 47 | ) |
| 48 | endif() | 48 | endif() |
| 49 | 49 | ||
# Build the ALSA-based simulated-streaming SenseVoice example only when
# ALSA was detected on the build machine.
if(SHERPA_ONNX_HAS_ALSA)
  add_executable(sense-voice-simulate-streaming-alsa-cxx-api
    ./sense-voice-simulate-streaming-alsa-cxx-api.cc
    # Reuse the ALSA capture wrapper from the main C++ sources.
    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc
  )
  target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api
    sherpa-onnx-cxx-api
    # NOTE(review): this example records via ALSA, not PortAudio; the
    # portaudio_static dependency looks unnecessary — confirm and drop.
    portaudio_static
  )

  # SHERPA_ONNX_ALSA_LIB_DIR lets cross-compiling setups point at a copy of
  # libasound taken from the target system instead of the host's.
  if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
    target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
  else()
    target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound)
  endif()
endif()
| 66 | + | ||
| 50 | add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc) | 67 | add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc) |
| 51 | target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api) | 68 | target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api) |
| 52 | 69 |
| 1 | +// cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc | ||
| 2 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 3 | + | ||
| 4 | +// | ||
| 5 | +// This file demonstrates how to use sense voice with sherpa-onnx's C++ API | ||
| 6 | +// for streaming speech recognition from a microphone. | ||
| 7 | +// | ||
| 8 | +// clang-format off | ||
| 9 | +// | ||
| 10 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 11 | +// | ||
| 12 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 13 | +// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 14 | +// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 15 | +// | ||
| 16 | +// clang-format on | ||
| 17 | + | ||
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <atomic>              // NOLINT
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <cstdint>
#include <iostream>
#include <mutex>  // NOLINT
#include <queue>
#include <thread>
#include <vector>

#include "portaudio.h"       // NOLINT
#include "sherpa-display.h"  // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/alsa.h"
| 34 | + | ||
// Audio captured by the recording thread, consumed by the main thread.
std::queue<std::vector<float>> samples_queue;

// Wakes the main thread when new samples arrive (or on shutdown).
std::condition_variable condition_variable;

// Protects samples_queue.
std::mutex mutex;

// Set by the SIGINT handler and read by both threads WITHOUT holding the
// mutex, so a plain bool is a data race; a lock-free std::atomic<bool> is
// both thread-safe and (when lock-free) async-signal-safe to store to.
std::atomic<bool> stop{false};
| 39 | + | ||
// SIGINT (Ctrl+C) handler: asks both threads to leave their loops.
//
// NOTE(review): per POSIX, only a small list of functions is
// async-signal-safe; std::condition_variable::notify_one() and fprintf()
// are not on it, so calling them here is technically undefined behavior
// (it usually works in practice). Consider a self-pipe/eventfd wakeup
// instead — confirm with maintainers before changing.
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
| 45 | + | ||
| 46 | +static void RecordCallback(sherpa_onnx::Alsa *alsa) { | ||
| 47 | + int32_t chunk = 0.1 * alsa->GetActualSampleRate(); | ||
| 48 | + while (!stop) { | ||
| 49 | + std::vector<float> samples = alsa->Read(chunk); | ||
| 50 | + | ||
| 51 | + std::lock_guard<std::mutex> lock(mutex); | ||
| 52 | + samples_queue.emplace(std::move(samples)); | ||
| 53 | + condition_variable.notify_one(); | ||
| 54 | + } | ||
| 55 | +} | ||
| 56 | + | ||
| 57 | +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() { | ||
| 58 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 59 | + VadModelConfig config; | ||
| 60 | + config.silero_vad.model = "./silero_vad.onnx"; | ||
| 61 | + config.silero_vad.threshold = 0.5; | ||
| 62 | + config.silero_vad.min_silence_duration = 0.1; | ||
| 63 | + config.silero_vad.min_speech_duration = 0.25; | ||
| 64 | + config.silero_vad.max_speech_duration = 8; | ||
| 65 | + config.sample_rate = 16000; | ||
| 66 | + config.debug = false; | ||
| 67 | + | ||
| 68 | + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20); | ||
| 69 | + if (!vad.Get()) { | ||
| 70 | + std::cerr << "Failed to create VAD. Please check your config\n"; | ||
| 71 | + exit(-1); | ||
| 72 | + } | ||
| 73 | + | ||
| 74 | + return vad; | ||
| 75 | +} | ||
| 76 | + | ||
| 77 | +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() { | ||
| 78 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 79 | + OfflineRecognizerConfig config; | ||
| 80 | + | ||
| 81 | + config.model_config.sense_voice.model = | ||
| 82 | + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; | ||
| 83 | + config.model_config.sense_voice.use_itn = false; | ||
| 84 | + config.model_config.sense_voice.language = "auto"; | ||
| 85 | + config.model_config.tokens = | ||
| 86 | + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; | ||
| 87 | + | ||
| 88 | + config.model_config.num_threads = 2; | ||
| 89 | + config.model_config.debug = false; | ||
| 90 | + | ||
| 91 | + std::cout << "Loading model\n"; | ||
| 92 | + OfflineRecognizer recognizer = OfflineRecognizer::Create(config); | ||
| 93 | + if (!recognizer.Get()) { | ||
| 94 | + std::cerr << "Please check your config\n"; | ||
| 95 | + exit(-1); | ||
| 96 | + } | ||
| 97 | + std::cout << "Loading model done\n"; | ||
| 98 | + return recognizer; | ||
| 99 | +} | ||
| 100 | + | ||
| 101 | +int32_t main(int32_t argc, const char *argv[]) { | ||
| 102 | + const char *kUsageMessage = R"usage( | ||
| 103 | +Usage: | ||
| 104 | + | ||
| 105 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 106 | + | ||
| 107 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 108 | +tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 109 | +rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 | ||
| 110 | + | ||
| 111 | +./sense-voice-simulate-streaming-alsa-cxx-api device_name | ||
| 112 | + | ||
| 113 | +The device name specifies which microphone to use in case there are several | ||
| 114 | +on your system. You can use | ||
| 115 | + | ||
| 116 | + arecord -l | ||
| 117 | + | ||
| 118 | +to find all available microphones on your computer. For instance, if it outputs | ||
| 119 | + | ||
| 120 | +**** List of CAPTURE Hardware Devices **** | ||
| 121 | +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 122 | + Subdevices: 1/1 | ||
| 123 | + Subdevice #0: subdevice #0 | ||
| 124 | + | ||
| 125 | +and if you want to select card 3 and device 0 on that card, please use: | ||
| 126 | + | ||
| 127 | + plughw:3,0 | ||
| 128 | + | ||
| 129 | +as the device_name. | ||
| 130 | +)usage"; | ||
| 131 | + | ||
| 132 | + if (argc != 2) { | ||
| 133 | + fprintf(stderr, "%s\n", kUsageMessage); | ||
| 134 | + return -1; | ||
| 135 | + } | ||
| 136 | + | ||
| 137 | + signal(SIGINT, Handler); | ||
| 138 | + | ||
| 139 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 140 | + | ||
| 141 | + auto vad = CreateVad(); | ||
| 142 | + auto recognizer = CreateOfflineRecognizer(); | ||
| 143 | + | ||
| 144 | + int32_t expected_sample_rate = 16000; | ||
| 145 | + | ||
| 146 | + std::string device_name = argv[1]; | ||
| 147 | + sherpa_onnx::Alsa alsa(device_name.c_str()); | ||
| 148 | + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); | ||
| 149 | + | ||
| 150 | + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { | ||
| 151 | + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), | ||
| 152 | + expected_sample_rate); | ||
| 153 | + exit(-1); | ||
| 154 | + } | ||
| 155 | + | ||
| 156 | + int32_t window_size = 512; // samples, please don't change | ||
| 157 | + | ||
| 158 | + int32_t offset = 0; | ||
| 159 | + std::vector<float> buffer; | ||
| 160 | + bool speech_started = false; | ||
| 161 | + | ||
| 162 | + auto started_time = std::chrono::steady_clock::now(); | ||
| 163 | + | ||
| 164 | + SherpaDisplay display; | ||
| 165 | + | ||
| 166 | + std::thread record_thread(RecordCallback, &alsa); | ||
| 167 | + | ||
| 168 | + std::cout << "Started! Please speak\n"; | ||
| 169 | + | ||
| 170 | + while (!stop) { | ||
| 171 | + { | ||
| 172 | + std::unique_lock<std::mutex> lock(mutex); | ||
| 173 | + while (samples_queue.empty() && !stop) { | ||
| 174 | + condition_variable.wait(lock); | ||
| 175 | + } | ||
| 176 | + | ||
| 177 | + const auto &s = samples_queue.front(); | ||
| 178 | + buffer.insert(buffer.end(), s.begin(), s.end()); | ||
| 179 | + | ||
| 180 | + samples_queue.pop(); | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + for (; offset + window_size < buffer.size(); offset += window_size) { | ||
| 184 | + vad.AcceptWaveform(buffer.data() + offset, window_size); | ||
| 185 | + if (!speech_started && vad.IsDetected()) { | ||
| 186 | + speech_started = true; | ||
| 187 | + started_time = std::chrono::steady_clock::now(); | ||
| 188 | + } | ||
| 189 | + } | ||
| 190 | + if (!speech_started) { | ||
| 191 | + if (buffer.size() > 10 * window_size) { | ||
| 192 | + offset -= buffer.size() - 10 * window_size; | ||
| 193 | + buffer = {buffer.end() - 10 * window_size, buffer.end()}; | ||
| 194 | + } | ||
| 195 | + } | ||
| 196 | + | ||
| 197 | + auto current_time = std::chrono::steady_clock::now(); | ||
| 198 | + const float elapsed_seconds = | ||
| 199 | + std::chrono::duration_cast<std::chrono::milliseconds>(current_time - | ||
| 200 | + started_time) | ||
| 201 | + .count() / | ||
| 202 | + 1000.; | ||
| 203 | + | ||
| 204 | + if (speech_started && elapsed_seconds > 0.2) { | ||
| 205 | + OfflineStream stream = recognizer.CreateStream(); | ||
| 206 | + stream.AcceptWaveform(expected_sample_rate, buffer.data(), buffer.size()); | ||
| 207 | + | ||
| 208 | + recognizer.Decode(&stream); | ||
| 209 | + | ||
| 210 | + OfflineRecognizerResult result = recognizer.GetResult(&stream); | ||
| 211 | + display.UpdateText(result.text); | ||
| 212 | + display.Display(); | ||
| 213 | + | ||
| 214 | + started_time = std::chrono::steady_clock::now(); | ||
| 215 | + } | ||
| 216 | + | ||
| 217 | + while (!vad.IsEmpty()) { | ||
| 218 | + auto segment = vad.Front(); | ||
| 219 | + | ||
| 220 | + vad.Pop(); | ||
| 221 | + | ||
| 222 | + OfflineStream stream = recognizer.CreateStream(); | ||
| 223 | + stream.AcceptWaveform(expected_sample_rate, segment.samples.data(), | ||
| 224 | + segment.samples.size()); | ||
| 225 | + | ||
| 226 | + recognizer.Decode(&stream); | ||
| 227 | + | ||
| 228 | + OfflineRecognizerResult result = recognizer.GetResult(&stream); | ||
| 229 | + | ||
| 230 | + display.UpdateText(result.text); | ||
| 231 | + display.FinalizeCurrentSentence(); | ||
| 232 | + display.Display(); | ||
| 233 | + | ||
| 234 | + buffer.clear(); | ||
| 235 | + offset = 0; | ||
| 236 | + speech_started = false; | ||
| 237 | + } | ||
| 238 | + } | ||
| 239 | + | ||
| 240 | + record_thread.join(); | ||
| 241 | + | ||
| 242 | + return 0; | ||
| 243 | +} |
| @@ -45,10 +45,11 @@ class SherpaDisplay { | @@ -45,10 +45,11 @@ class SherpaDisplay { | ||
| 45 | private: | 45 | private: |
  // Clears the terminal so the display can be redrawn from the top.
  static void ClearScreen() {
#ifdef _MSC_VER
    auto ret = system("cls");
#else
    auto ret = system("clear");
#endif
    // system() is declared warn_unused_result in glibc; capture and discard
    // the return value to silence -Wunused-result without changing behavior.
    (void)ret;
  }
| 53 | 54 | ||
| 54 | static std::string GetCurrentDateTime() { | 55 | static std::string GetCurrentDateTime() { |
-
Please register or sign in to post a comment