Committed by
GitHub
Support using alsa to access the microphone with non-streaming ASR models (#517)
正在显示
4 个修改的文件
包含
206 行增加
和
0 行删除
| @@ -166,6 +166,7 @@ class BuildExtension(build_ext): | @@ -166,6 +166,7 @@ class BuildExtension(build_ext): | ||
| 166 | 166 | ||
| 167 | if enable_alsa(): | 167 | if enable_alsa(): |
| 168 | binaries += ["sherpa-onnx-alsa"] | 168 | binaries += ["sherpa-onnx-alsa"] |
| 169 | + binaries += ["sherpa-onnx-alsa-offline"] | ||
| 169 | binaries += ["sherpa-onnx-offline-tts-play-alsa"] | 170 | binaries += ["sherpa-onnx-offline-tts-play-alsa"] |
| 170 | binaries += ["sherpa-onnx-alsa-offline-speaker-identification"] | 171 | binaries += ["sherpa-onnx-alsa-offline-speaker-identification"] |
| 171 | 172 |
| @@ -59,6 +59,7 @@ def get_binaries_to_install(): | @@ -59,6 +59,7 @@ def get_binaries_to_install(): | ||
| 59 | 59 | ||
| 60 | if enable_alsa(): | 60 | if enable_alsa(): |
| 61 | binaries += ["sherpa-onnx-alsa"] | 61 | binaries += ["sherpa-onnx-alsa"] |
| 62 | + binaries += ["sherpa-onnx-alsa-offline"] | ||
| 62 | binaries += ["sherpa-onnx-offline-tts-play-alsa"] | 63 | binaries += ["sherpa-onnx-offline-tts-play-alsa"] |
| 63 | binaries += ["sherpa-onnx-alsa-offline-speaker-identification"] | 64 | binaries += ["sherpa-onnx-alsa-offline-speaker-identification"] |
| 64 | 65 |
| @@ -231,10 +231,12 @@ endif() | @@ -231,10 +231,12 @@ endif() | ||
| 231 | if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | 231 | if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) |
| 232 | add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc) | 232 | add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc) |
| 233 | add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc) | 233 | add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc) |
| 234 | + add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc) | ||
| 234 | add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) | 235 | add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) |
| 235 | 236 | ||
| 236 | set(exes | 237 | set(exes |
| 237 | sherpa-onnx-alsa | 238 | sherpa-onnx-alsa |
| 239 | + sherpa-onnx-alsa-offline | ||
| 238 | sherpa-onnx-offline-tts-play-alsa | 240 | sherpa-onnx-offline-tts-play-alsa |
| 239 | sherpa-onnx-alsa-offline-speaker-identification | 241 | sherpa-onnx-alsa-offline-speaker-identification |
| 240 | ) | 242 | ) |
sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-alsa-offline.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2024 Xiaomi Corporation | ||
| 4 | + | ||
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <algorithm>
#include <atomic>
#include <cctype>  // std::tolower
#include <chrono>  // NOLINT
#include <mutex>   // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "sherpa-onnx/csrc/alsa.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
| 18 | + | ||
// The push-to-talk state machine driven by the Enter key:
//   kIdle -> kRecording -> kDecoding -> kIdle
enum class State {
  kIdle,       // waiting for the user to press Enter to start recording
  kRecording,  // audio appended to `samples` belongs to the utterance
  kDecoding,   // recording stopped; the main loop should decode `samples`
};

// Written by the key-press thread and the main loop and read by both, so it
// has to be atomic to avoid a data race.
std::atomic<State> state{State::kIdle};

// true to stop the program and exit.
// Set from the SIGINT handler and polled by every thread; an atomic flag
// keeps those accesses well-defined.
std::atomic<bool> stop{false};

// Audio captured by the recording thread. Guarded by samples_mutex.
std::vector<float> samples;
std::mutex samples_mutex;
| 32 | + | ||
| 33 | +static void DetectKeyPress() { | ||
| 34 | + SHERPA_ONNX_LOGE("Press Enter to start"); | ||
| 35 | + int32_t key; | ||
| 36 | + while (!stop && (key = getchar())) { | ||
| 37 | + if (key != 0x0a) { | ||
| 38 | + continue; | ||
| 39 | + } | ||
| 40 | + | ||
| 41 | + switch (state) { | ||
| 42 | + case State::kIdle: | ||
| 43 | + SHERPA_ONNX_LOGE("Start recording. Press Enter to stop recording"); | ||
| 44 | + state = State::kRecording; | ||
| 45 | + { | ||
| 46 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 47 | + samples.clear(); | ||
| 48 | + } | ||
| 49 | + break; | ||
| 50 | + case State::kRecording: | ||
| 51 | + SHERPA_ONNX_LOGE("Stop recording. Decoding ..."); | ||
| 52 | + state = State::kDecoding; | ||
| 53 | + break; | ||
| 54 | + case State::kDecoding: | ||
| 55 | + break; | ||
| 56 | + } | ||
| 57 | + } | ||
| 58 | +} | ||
| 59 | + | ||
| 60 | +static void Record(const char *device_name, int32_t expected_sample_rate) { | ||
| 61 | + sherpa_onnx::Alsa alsa(device_name); | ||
| 62 | + | ||
| 63 | + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { | ||
| 64 | + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), | ||
| 65 | + expected_sample_rate); | ||
| 66 | + exit(-1); | ||
| 67 | + } | ||
| 68 | + | ||
| 69 | + int32_t chunk = 0.1 * alsa.GetActualSampleRate(); | ||
| 70 | + while (!stop) { | ||
| 71 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 72 | + const std::vector<float> &s = alsa.Read(chunk); | ||
| 73 | + samples.insert(samples.end(), s.begin(), s.end()); | ||
| 74 | + } | ||
| 75 | +} | ||
| 76 | + | ||
| 77 | +static void Handler(int32_t sig) { | ||
| 78 | + stop = true; | ||
| 79 | + fprintf(stderr, "\nCaught Ctrl + C. Press Enter to exit\n"); | ||
| 80 | +} | ||
| 81 | + | ||
| 82 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 83 | + signal(SIGINT, Handler); | ||
| 84 | + | ||
| 85 | + const char *kUsageMessage = R"usage( | ||
| 86 | +This program uses non-streaming models with microphone for speech recognition. | ||
| 87 | +Usage: | ||
| 88 | + | ||
| 89 | +(1) Transducer from icefall | ||
| 90 | + | ||
| 91 | + ./bin/sherpa-onnx-alsa-offline \ | ||
| 92 | + --tokens=/path/to/tokens.txt \ | ||
| 93 | + --encoder=/path/to/encoder.onnx \ | ||
| 94 | + --decoder=/path/to/decoder.onnx \ | ||
| 95 | + --joiner=/path/to/joiner.onnx \ | ||
| 96 | + --num-threads=2 \ | ||
| 97 | + --decoding-method=greedy_search \ | ||
| 98 | + device_name | ||
| 99 | + | ||
| 100 | +(2) Paraformer from FunASR | ||
| 101 | + | ||
| 102 | + ./bin/sherpa-onnx-alsa-offline \ | ||
| 103 | + --tokens=/path/to/tokens.txt \ | ||
| 104 | + --paraformer=/path/to/model.onnx \ | ||
| 105 | + --num-threads=1 \ | ||
| 106 | + device_name | ||
| 107 | + | ||
| 108 | +(3) Whisper models | ||
| 109 | + | ||
| 110 | + ./bin/sherpa-onnx-alsa-offline \ | ||
| 111 | + --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \ | ||
| 112 | + --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \ | ||
| 113 | + --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \ | ||
| 114 | + --num-threads=1 \ | ||
| 115 | + device_name | ||
| 116 | + | ||
| 117 | +Please refer to | ||
| 118 | +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html | ||
| 119 | +for a list of pre-trained models to download. | ||
| 120 | + | ||
| 121 | +The device name specifies which microphone to use in case there are several | ||
| 122 | +on you system. You can use | ||
| 123 | + | ||
| 124 | + arecord -l | ||
| 125 | + | ||
| 126 | +to find all available microphones on your computer. For instance, if it outputs | ||
| 127 | + | ||
| 128 | +**** List of CAPTURE Hardware Devices **** | ||
| 129 | +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 130 | + Subdevices: 1/1 | ||
| 131 | + Subdevice #0: subdevice #0 | ||
| 132 | + | ||
| 133 | +and if you want to select card 3 and the device 0 on that card, please use: | ||
| 134 | + | ||
| 135 | + plughw:3,0 | ||
| 136 | + | ||
| 137 | +as the device_name. | ||
| 138 | +)usage"; | ||
| 139 | + | ||
| 140 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 141 | + sherpa_onnx::OfflineRecognizerConfig config; | ||
| 142 | + config.Register(&po); | ||
| 143 | + | ||
| 144 | + po.Read(argc, argv); | ||
| 145 | + if (po.NumArgs() != 1) { | ||
| 146 | + fprintf(stderr, "Please provide only 1 argument: the device name\n"); | ||
| 147 | + po.PrintUsage(); | ||
| 148 | + exit(EXIT_FAILURE); | ||
| 149 | + } | ||
| 150 | + | ||
| 151 | + fprintf(stderr, "%s\n", config.ToString().c_str()); | ||
| 152 | + | ||
| 153 | + if (!config.Validate()) { | ||
| 154 | + fprintf(stderr, "Errors in config!\n"); | ||
| 155 | + return -1; | ||
| 156 | + } | ||
| 157 | + | ||
| 158 | + SHERPA_ONNX_LOGE("Creating recognizer ..."); | ||
| 159 | + sherpa_onnx::OfflineRecognizer recognizer(config); | ||
| 160 | + SHERPA_ONNX_LOGE("Recognizer created!"); | ||
| 161 | + | ||
| 162 | + std::string device_name = po.GetArg(1); | ||
| 163 | + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); | ||
| 164 | + | ||
| 165 | + int32_t sample_rate = config.feat_config.sampling_rate; | ||
| 166 | + | ||
| 167 | + std::thread t(DetectKeyPress); | ||
| 168 | + std::thread t2(Record, device_name.c_str(), sample_rate); | ||
| 169 | + | ||
| 170 | + while (!stop) { | ||
| 171 | + switch (state) { | ||
| 172 | + case State::kIdle: | ||
| 173 | + break; | ||
| 174 | + case State::kRecording: | ||
| 175 | + break; | ||
| 176 | + case State::kDecoding: { | ||
| 177 | + std::vector<float> buf; | ||
| 178 | + { | ||
| 179 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 180 | + buf = std::move(samples); | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + auto s = recognizer.CreateStream(); | ||
| 184 | + s->AcceptWaveform(sample_rate, buf.data(), buf.size()); | ||
| 185 | + recognizer.DecodeStream(s.get()); | ||
| 186 | + SHERPA_ONNX_LOGE("Decoding Done! Result is:"); | ||
| 187 | + SHERPA_ONNX_LOGE("%s", s->GetResult().text.c_str()); | ||
| 188 | + | ||
| 189 | + state = State::kIdle; | ||
| 190 | + SHERPA_ONNX_LOGE("Press Enter to start"); | ||
| 191 | + break; | ||
| 192 | + } | ||
| 193 | + } | ||
| 194 | + | ||
| 195 | + using namespace std::chrono_literals; | ||
| 196 | + std::this_thread::sleep_for(20ms); // sleep for 20ms | ||
| 197 | + } | ||
| 198 | + t.join(); | ||
| 199 | + t2.join(); | ||
| 200 | + | ||
| 201 | + return 0; | ||
| 202 | +} |
-
请 注册 或 登录 后发表评论