Committed by
GitHub
Add a C++ example to show streaming VAD + non-streaming ASR. (#420)
正在显示
5 个修改的文件
包含
208 行增加
和
1 行删除
| @@ -137,6 +137,7 @@ class BuildExtension(build_ext): | @@ -137,6 +137,7 @@ class BuildExtension(build_ext): | ||
| 137 | binaries += ["sherpa-onnx-offline-websocket-server"] | 137 | binaries += ["sherpa-onnx-offline-websocket-server"] |
| 138 | binaries += ["sherpa-onnx-online-websocket-client"] | 138 | binaries += ["sherpa-onnx-online-websocket-client"] |
| 139 | binaries += ["sherpa-onnx-vad-microphone"] | 139 | binaries += ["sherpa-onnx-vad-microphone"] |
| 140 | + binaries += ["sherpa-onnx-vad-microphone-offline-asr"] | ||
| 140 | binaries += ["sherpa-onnx-offline-tts"] | 141 | binaries += ["sherpa-onnx-offline-tts"] |
| 141 | 142 | ||
| 142 | if is_windows(): | 143 | if is_windows(): |
| @@ -57,6 +57,7 @@ def get_binaries_to_install(): | @@ -57,6 +57,7 @@ def get_binaries_to_install(): | ||
| 57 | binaries += ["sherpa-onnx-offline-websocket-server"] | 57 | binaries += ["sherpa-onnx-offline-websocket-server"] |
| 58 | binaries += ["sherpa-onnx-online-websocket-client"] | 58 | binaries += ["sherpa-onnx-online-websocket-client"] |
| 59 | binaries += ["sherpa-onnx-vad-microphone"] | 59 | binaries += ["sherpa-onnx-vad-microphone"] |
| 60 | + binaries += ["sherpa-onnx-vad-microphone-offline-asr"] | ||
| 60 | binaries += ["sherpa-onnx-offline-tts"] | 61 | binaries += ["sherpa-onnx-offline-tts"] |
| 61 | if is_windows(): | 62 | if is_windows(): |
| 62 | binaries += ["kaldi-native-fbank-core.dll"] | 63 | binaries += ["kaldi-native-fbank-core.dll"] |
| @@ -225,6 +225,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | @@ -225,6 +225,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | ||
| 225 | microphone.cc | 225 | microphone.cc |
| 226 | ) | 226 | ) |
| 227 | 227 | ||
| 228 | + add_executable(sherpa-onnx-vad-microphone-offline-asr | ||
| 229 | + sherpa-onnx-vad-microphone-offline-asr.cc | ||
| 230 | + microphone.cc | ||
| 231 | + ) | ||
| 232 | + | ||
| 228 | if(BUILD_SHARED_LIBS) | 233 | if(BUILD_SHARED_LIBS) |
| 229 | set(PA_LIB portaudio) | 234 | set(PA_LIB portaudio) |
| 230 | else() | 235 | else() |
| @@ -235,6 +240,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | @@ -235,6 +240,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | ||
| 235 | sherpa-onnx-microphone | 240 | sherpa-onnx-microphone |
| 236 | sherpa-onnx-microphone-offline | 241 | sherpa-onnx-microphone-offline |
| 237 | sherpa-onnx-vad-microphone | 242 | sherpa-onnx-vad-microphone |
| 243 | + sherpa-onnx-vad-microphone-offline-asr | ||
| 238 | ) | 244 | ) |
| 239 | foreach(exe IN LISTS exes) | 245 | foreach(exe IN LISTS exes) |
| 240 | target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core) | 246 | target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core) |
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <signal.h> | ||
| 6 | +#include <stdio.h> | ||
| 7 | +#include <stdlib.h> | ||
| 8 | + | ||
| 9 | +#include <algorithm> | ||
| 10 | +#include <mutex> // NOLINT | ||
| 11 | + | ||
| 12 | +#include "portaudio.h" // NOLINT | ||
| 13 | +#include "sherpa-onnx/csrc/circular-buffer.h" | ||
| 14 | +#include "sherpa-onnx/csrc/microphone.h" | ||
| 15 | +#include "sherpa-onnx/csrc/offline-recognizer.h" | ||
| 16 | +#include "sherpa-onnx/csrc/voice-activity-detector.h" | ||
| 17 | + | ||
| 18 | +bool stop = false; | ||
| 19 | +std::mutex mutex; | ||
| 20 | +sherpa_onnx::CircularBuffer buffer(16000 * 60); | ||
| 21 | + | ||
| 22 | +static int32_t RecordCallback(const void *input_buffer, | ||
| 23 | + void * /*output_buffer*/, | ||
| 24 | + unsigned long frames_per_buffer, // NOLINT | ||
| 25 | + const PaStreamCallbackTimeInfo * /*time_info*/, | ||
| 26 | + PaStreamCallbackFlags /*status_flags*/, | ||
| 27 | + void *user_data) { | ||
| 28 | + std::lock_guard<std::mutex> lock(mutex); | ||
| 29 | + buffer.Push(reinterpret_cast<const float *>(input_buffer), frames_per_buffer); | ||
| 30 | + | ||
| 31 | + return stop ? paComplete : paContinue; | ||
| 32 | +} | ||
| 33 | + | ||
| 34 | +static void Handler(int32_t sig) { | ||
| 35 | + stop = true; | ||
| 36 | + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); | ||
| 37 | +} | ||
| 38 | + | ||
| 39 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 40 | + signal(SIGINT, Handler); | ||
| 41 | + | ||
| 42 | + const char *kUsageMessage = R"usage( | ||
| 43 | +This program shows how to use a streaming VAD with non-streaming ASR in | ||
| 44 | +sherpa-onnx. | ||
| 45 | + | ||
| 46 | +Please download silero_vad.onnx from | ||
| 47 | +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 48 | + | ||
| 49 | +For instance, use | ||
| 50 | +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 51 | + | ||
| 52 | +Please refer to ./sherpa-onnx-microphone-offline.cc | ||
| 53 | +to download models for offline ASR. | ||
| 54 | + | ||
| 55 | +(1) Transducer from icefall | ||
| 56 | + | ||
| 57 | + ./bin/sherpa-onnx-vad-microphone-offline-asr \ | ||
| 58 | + --silero-vad-model=/path/to/silero_vad.onnx \ | ||
| 59 | + --tokens=/path/to/tokens.txt \ | ||
| 60 | + --encoder=/path/to/encoder.onnx \ | ||
| 61 | + --decoder=/path/to/decoder.onnx \ | ||
| 62 | + --joiner=/path/to/joiner.onnx | ||
| 63 | + | ||
| 64 | +(2) Paraformer from FunASR | ||
| 65 | + | ||
| 66 | + ./bin/sherpa-onnx-vad-microphone-offline-asr \ | ||
| 67 | + --silero-vad-model=/path/to/silero_vad.onnx \ | ||
| 68 | + --tokens=/path/to/tokens.txt \ | ||
| 69 | + --paraformer=/path/to/model.onnx \ | ||
| 70 | + --num-threads=1 | ||
| 71 | + | ||
| 72 | +(3) Whisper models | ||
| 73 | + | ||
| 74 | + ./bin/sherpa-onnx-vad-microphone-offline-asr \ | ||
| 75 | + --silero-vad-model=/path/to/silero_vad.onnx \ | ||
| 76 | + --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \ | ||
| 77 | + --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \ | ||
| 78 | + --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \ | ||
| 79 | + --num-threads=1 | ||
| 80 | +)usage"; | ||
| 81 | + | ||
| 82 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 83 | + sherpa_onnx::VadModelConfig vad_config; | ||
| 84 | + | ||
| 85 | + sherpa_onnx::OfflineRecognizerConfig asr_config; | ||
| 86 | + | ||
| 87 | + vad_config.Register(&po); | ||
| 88 | + asr_config.Register(&po); | ||
| 89 | + | ||
| 90 | + po.Read(argc, argv); | ||
| 91 | + if (po.NumArgs() != 0) { | ||
| 92 | + po.PrintUsage(); | ||
| 93 | + exit(EXIT_FAILURE); | ||
| 94 | + } | ||
| 95 | + | ||
| 96 | + fprintf(stderr, "%s\n", vad_config.ToString().c_str()); | ||
| 97 | + fprintf(stderr, "%s\n", asr_config.ToString().c_str()); | ||
| 98 | + | ||
| 99 | + if (!vad_config.Validate()) { | ||
| 100 | + fprintf(stderr, "Errors in vad_config!\n"); | ||
| 101 | + return -1; | ||
| 102 | + } | ||
| 103 | + | ||
| 104 | + if (!asr_config.Validate()) { | ||
| 105 | + fprintf(stderr, "Errors in asr_config!\n"); | ||
| 106 | + return -1; | ||
| 107 | + } | ||
| 108 | + | ||
| 109 | + fprintf(stderr, "Creating recognizer ...\n"); | ||
| 110 | + sherpa_onnx::OfflineRecognizer recognizer(asr_config); | ||
| 111 | + fprintf(stderr, "Recognizer created!\n"); | ||
| 112 | + | ||
| 113 | + sherpa_onnx::Microphone mic; | ||
| 114 | + | ||
| 115 | + PaDeviceIndex num_devices = Pa_GetDeviceCount(); | ||
| 116 | + fprintf(stderr, "Num devices: %d\n", num_devices); | ||
| 117 | + | ||
| 118 | + PaStreamParameters param; | ||
| 119 | + | ||
| 120 | + param.device = Pa_GetDefaultInputDevice(); | ||
| 121 | + if (param.device == paNoDevice) { | ||
| 122 | + fprintf(stderr, "No default input device found\n"); | ||
| 123 | + exit(EXIT_FAILURE); | ||
| 124 | + } | ||
| 125 | + fprintf(stderr, "Use default device: %d\n", param.device); | ||
| 126 | + | ||
| 127 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | ||
| 128 | + fprintf(stderr, " Name: %s\n", info->name); | ||
| 129 | + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels); | ||
| 130 | + | ||
| 131 | + param.channelCount = 1; | ||
| 132 | + param.sampleFormat = paFloat32; | ||
| 133 | + | ||
| 134 | + param.suggestedLatency = info->defaultLowInputLatency; | ||
| 135 | + param.hostApiSpecificStreamInfo = nullptr; | ||
| 136 | + float sample_rate = 16000; | ||
| 137 | + | ||
| 138 | + PaStream *stream; | ||
| 139 | + PaError err = | ||
| 140 | + Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */ | ||
| 141 | + sample_rate, | ||
| 142 | + 0, // frames per buffer | ||
| 143 | + paClipOff, // we won't output out of range samples | ||
| 144 | + // so don't bother clipping them | ||
| 145 | + RecordCallback, nullptr); | ||
| 146 | + if (err != paNoError) { | ||
| 147 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 148 | + exit(EXIT_FAILURE); | ||
| 149 | + } | ||
| 150 | + | ||
| 151 | + err = Pa_StartStream(stream); | ||
| 152 | + if (err != paNoError) { | ||
| 153 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 154 | + exit(EXIT_FAILURE); | ||
| 155 | + } | ||
| 156 | + | ||
| 157 | + auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config); | ||
| 158 | + | ||
| 159 | + fprintf(stderr, "Started. Please speak\n"); | ||
| 160 | + | ||
| 161 | + int32_t window_size = vad_config.silero_vad.window_size; | ||
| 162 | + int32_t index = 0; | ||
| 163 | + | ||
| 164 | + while (!stop) { | ||
| 165 | + { | ||
| 166 | + std::lock_guard<std::mutex> lock(mutex); | ||
| 167 | + | ||
| 168 | + while (buffer.Size() >= window_size) { | ||
| 169 | + std::vector<float> samples = buffer.Get(buffer.Head(), window_size); | ||
| 170 | + buffer.Pop(window_size); | ||
| 171 | + vad->AcceptWaveform(samples.data(), samples.size()); | ||
| 172 | + } | ||
| 173 | + } | ||
| 174 | + | ||
| 175 | + while (!vad->Empty()) { | ||
| 176 | + auto &segment = vad->Front(); | ||
| 177 | + auto s = recognizer.CreateStream(); | ||
| 178 | + s->AcceptWaveform(sample_rate, segment.samples.data(), | ||
| 179 | + segment.samples.size()); | ||
| 180 | + recognizer.DecodeStream(s.get()); | ||
| 181 | + const auto &result = s->GetResult(); | ||
| 182 | + if (!result.text.empty()) { | ||
| 183 | + fprintf(stderr, "%2d: %s\n", index, result.text.c_str()); | ||
| 184 | + ++index; | ||
| 185 | + } | ||
| 186 | + vad->Pop(); | ||
| 187 | + } | ||
| 188 | + | ||
| 189 | + Pa_Sleep(100); // sleep for 100ms | ||
| 190 | + } | ||
| 191 | + | ||
| 192 | + err = Pa_CloseStream(stream); | ||
| 193 | + if (err != paNoError) { | ||
| 194 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 195 | + exit(EXIT_FAILURE); | ||
| 196 | + } | ||
| 197 | + | ||
| 198 | + return 0; | ||
| 199 | +} |
| @@ -102,7 +102,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | @@ -102,7 +102,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 102 | 0, // frames per buffer | 102 | 0, // frames per buffer |
| 103 | paClipOff, // we won't output out of range samples | 103 | paClipOff, // we won't output out of range samples |
| 104 | // so don't bother clipping them | 104 | // so don't bother clipping them |
| 105 | - RecordCallback, &config.silero_vad.window_size); | 105 | + RecordCallback, nullptr); |
| 106 | if (err != paNoError) { | 106 | if (err != paNoError) { |
| 107 | fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | 107 | fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); |
| 108 | exit(EXIT_FAILURE); | 108 | exit(EXIT_FAILURE); |
-
请 注册 或 登录 后发表评论