Committed by
GitHub
Add C++ microphone examples for audio tagging (#749)
正在显示
24 个修改的文件
包含
706 行增加
和
60 行删除
| @@ -89,7 +89,7 @@ jobs: | @@ -89,7 +89,7 @@ jobs: | ||
| 89 | export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH | 89 | export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH |
| 90 | export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH | 90 | export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH |
| 91 | export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH | 91 | export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH |
| 92 | - export PATH=/c/hostedtoolcache/windows/Python/3.11.8/x64/bin:$PATH | 92 | + export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH |
| 93 | export PATH=/c/hostedtoolcache/windows/Python/3.12.2/x64/bin:$PATH | 93 | export PATH=/c/hostedtoolcache/windows/Python/3.12.2/x64/bin:$PATH |
| 94 | 94 | ||
| 95 | which sherpa-onnx | 95 | which sherpa-onnx |
| @@ -67,7 +67,7 @@ jobs: | @@ -67,7 +67,7 @@ jobs: | ||
| 67 | export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH | 67 | export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH |
| 68 | export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH | 68 | export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH |
| 69 | export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH | 69 | export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH |
| 70 | - export PATH=/c/hostedtoolcache/windows/Python/3.11.8/x64/bin:$PATH | 70 | + export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH |
| 71 | export PATH=/c/hostedtoolcache/windows/Python/3.12.2/x64/bin:$PATH | 71 | export PATH=/c/hostedtoolcache/windows/Python/3.12.2/x64/bin:$PATH |
| 72 | 72 | ||
| 73 | sherpa-onnx --help | 73 | sherpa-onnx --help |
| @@ -2,23 +2,48 @@ | @@ -2,23 +2,48 @@ | ||
| 2 | 2 | ||
| 3 | This repository supports running the following functions **locally** | 3 | This repository supports running the following functions **locally** |
| 4 | 4 | ||
| 5 | - - Speech-to-text (i.e., ASR) | 5 | + - Speech-to-text (i.e., ASR); both streaming and non-streaming are supported |
| 6 | - Text-to-speech (i.e., TTS) | 6 | - Text-to-speech (i.e., TTS) |
| 7 | - Speaker identification | 7 | - Speaker identification |
| 8 | + - Speaker verification | ||
| 9 | + - Spoken language identification | ||
| 10 | + - Audio tagging | ||
| 11 | + - VAD (e.g., [silero-vad](https://github.com/snakers4/silero-vad)) | ||
| 8 | 12 | ||
| 9 | on the following platforms and operating systems: | 13 | on the following platforms and operating systems: |
| 10 | 14 | ||
| 11 | - - Linux, macOS, Windows | ||
| 12 | - - Android | 15 | + - x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64) |
| 16 | + - Linux, macOS, Windows, openKylin | ||
| 17 | + - Android, WearOS | ||
| 13 | - iOS | 18 | - iOS |
| 14 | - - Raspberry Pi | 19 | + - NodeJS |
| 20 | + - WebAssembly | ||
| 21 | + - [Raspberry Pi](https://www.raspberrypi.com/) | ||
| 22 | + - [RV1126](https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf) | ||
| 23 | + - [LicheePi4A](https://sipeed.com/licheepi4a) | ||
| 24 | + - [VisionFive 2](https://www.starfivetech.com/en/site/boards) | ||
| 25 | + - [旭日X3派](https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html) | ||
| 15 | - etc | 26 | - etc |
| 16 | 27 | ||
| 28 | +with the following APIs | ||
| 29 | + | ||
| 30 | + - C++ | ||
| 31 | + - C | ||
| 32 | + - Python | ||
| 33 | + - Go | ||
| 34 | + - ``C#`` | ||
| 35 | + - Javascript | ||
| 36 | + - Java | ||
| 37 | + - Kotlin | ||
| 38 | + - Swift | ||
| 39 | + | ||
| 17 | # Useful links | 40 | # Useful links |
| 18 | 41 | ||
| 19 | - Documentation: https://k2-fsa.github.io/sherpa/onnx/ | 42 | - Documentation: https://k2-fsa.github.io/sherpa/onnx/ |
| 20 | - APK for the text-to-speech engine: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html | 43 | - APK for the text-to-speech engine: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html |
| 21 | - APK for speaker identification: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html | 44 | - APK for speaker identification: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html |
| 45 | +- APK for speech recognition: https://github.com/k2-fsa/sherpa-onnx/releases/ | ||
| 46 | +- Bilibili 演示视频: https://search.bilibili.com/all?keyword=%E6%96%B0%E4%B8%80%E4%BB%A3Kaldi | ||
| 22 | 47 | ||
| 23 | # How to reach us | 48 | # How to reach us |
| 24 | 49 |
| @@ -7,14 +7,22 @@ for usage. | @@ -7,14 +7,22 @@ for usage. | ||
| 7 | - [SherpaOnnx](./SherpaOnnx) It uses a streaming ASR model. | 7 | - [SherpaOnnx](./SherpaOnnx) It uses a streaming ASR model. |
| 8 | 8 | ||
| 9 | - [SherpaOnnx2Pass](./SherpaOnnx2Pass) It uses a streaming ASR model | 9 | - [SherpaOnnx2Pass](./SherpaOnnx2Pass) It uses a streaming ASR model |
| 10 | - for the first pass and use a non-streaming ASR model for the second pass. | 10 | + for the first pass and use a non-streaming ASR model for the second pass |
| 11 | 11 | ||
| 12 | -- [SherpaOnnxVad](./SherpaOnnxVad) It demonstrates how to use a VAD | 12 | +- [SherpaOnnxKws](./SherpaOnnxKws) It demonstrates how to use keyword spotting |
| 13 | 13 | ||
| 14 | -- [SherpaOnnxVadAsr](./SherpaOnnxVadAsr) It uses a VAD with a non-streaming | ||
| 15 | - ASR model. | 14 | +- [SherpaOnnxSpeakerIdentification](./SherpaOnnxSpeakerIdentification) It demonstrates |
| 15 | + how to use speaker identification | ||
| 16 | 16 | ||
| 17 | - [SherpaOnnxTts](./SherpaOnnxTts) It is for standalone text-to-speech. | 17 | - [SherpaOnnxTts](./SherpaOnnxTts) It is for standalone text-to-speech. |
| 18 | 18 | ||
| 19 | - [SherpaOnnxTtsEngine](./SherpaOnnxTtsEngine) It is for text-to-speech engine; | 19 | - [SherpaOnnxTtsEngine](./SherpaOnnxTtsEngine) It is for text-to-speech engine; |
| 20 | you can use it to replace the system TTS engine. | 20 | you can use it to replace the system TTS engine. |
| 21 | + | ||
| 22 | +- [SherpaOnnxVad](./SherpaOnnxVad) It demonstrates how to use a VAD | ||
| 23 | + | ||
| 24 | +- [SherpaOnnxVadAsr](./SherpaOnnxVadAsr) It uses a VAD with a non-streaming | ||
| 25 | + ASR model. | ||
| 26 | + | ||
| 27 | +- [SherpaOnnxWebSocket](./SherpaOnnxWebSocket) It shows how to write a websocket | ||
| 28 | + client for the Python streaming websocket server. |
| @@ -99,7 +99,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -99,7 +99,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 99 | Subdevices: 1/1 | 99 | Subdevices: 1/1 |
| 100 | Subdevice #0: subdevice #0 | 100 | Subdevice #0: subdevice #0 |
| 101 | 101 | ||
| 102 | -and if you want to select card 3 and the device 0 on that card, please use: | 102 | +and if you want to select card 3 and device 0 on that card, please use: |
| 103 | 103 | ||
| 104 | plughw:3,0 | 104 | plughw:3,0 |
| 105 | 105 |
| @@ -50,6 +50,7 @@ def get_binaries(): | @@ -50,6 +50,7 @@ def get_binaries(): | ||
| 50 | "sherpa-onnx-keyword-spotter", | 50 | "sherpa-onnx-keyword-spotter", |
| 51 | "sherpa-onnx-microphone", | 51 | "sherpa-onnx-microphone", |
| 52 | "sherpa-onnx-microphone-offline", | 52 | "sherpa-onnx-microphone-offline", |
| 53 | + "sherpa-onnx-microphone-offline-audio-tagging", | ||
| 53 | "sherpa-onnx-microphone-offline-speaker-identification", | 54 | "sherpa-onnx-microphone-offline-speaker-identification", |
| 54 | "sherpa-onnx-offline", | 55 | "sherpa-onnx-offline", |
| 55 | "sherpa-onnx-offline-language-identification", | 56 | "sherpa-onnx-offline-language-identification", |
| @@ -69,6 +70,7 @@ def get_binaries(): | @@ -69,6 +70,7 @@ def get_binaries(): | ||
| 69 | "sherpa-onnx-alsa-offline-speaker-identification", | 70 | "sherpa-onnx-alsa-offline-speaker-identification", |
| 70 | "sherpa-onnx-offline-tts-play-alsa", | 71 | "sherpa-onnx-offline-tts-play-alsa", |
| 71 | "sherpa-onnx-vad-alsa", | 72 | "sherpa-onnx-vad-alsa", |
| 73 | + "sherpa-onnx-alsa-offline-audio-tagging", | ||
| 72 | ] | 74 | ] |
| 73 | 75 | ||
| 74 | if is_windows(): | 76 | if is_windows(): |
| @@ -123,7 +123,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -123,7 +123,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 123 | Subdevices: 1/1 | 123 | Subdevices: 1/1 |
| 124 | Subdevice #0: subdevice #0 | 124 | Subdevice #0: subdevice #0 |
| 125 | 125 | ||
| 126 | -and if you want to select card 3 and the device 0 on that card, please use: | 126 | +and if you want to select card 3 and device 0 on that card, please use: |
| 127 | 127 | ||
| 128 | plughw:3,0 | 128 | plughw:3,0 |
| 129 | 129 |
| @@ -39,7 +39,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -39,7 +39,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 39 | Subdevices: 1/1 | 39 | Subdevices: 1/1 |
| 40 | Subdevice #0: subdevice #0 | 40 | Subdevice #0: subdevice #0 |
| 41 | 41 | ||
| 42 | -and if you want to select card 3 and the device 0 on that card, please use: | 42 | +and if you want to select card 3 and device 0 on that card, please use: |
| 43 | 43 | ||
| 44 | plughw:3,0 | 44 | plughw:3,0 |
| 45 | 45 |
| @@ -68,7 +68,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -68,7 +68,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 68 | Subdevices: 1/1 | 68 | Subdevices: 1/1 |
| 69 | Subdevice #0: subdevice #0 | 69 | Subdevice #0: subdevice #0 |
| 70 | 70 | ||
| 71 | -and if you want to select card 3 and the device 0 on that card, please use: | 71 | +and if you want to select card 3 and device 0 on that card, please use: |
| 72 | 72 | ||
| 73 | plughw:3,0 | 73 | plughw:3,0 |
| 74 | 74 |
| @@ -264,6 +264,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | @@ -264,6 +264,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | ||
| 264 | add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc) | 264 | add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc) |
| 265 | add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) | 265 | add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) |
| 266 | add_executable(sherpa-onnx-vad-alsa sherpa-onnx-vad-alsa.cc alsa.cc) | 266 | add_executable(sherpa-onnx-vad-alsa sherpa-onnx-vad-alsa.cc alsa.cc) |
| 267 | + add_executable(sherpa-onnx-alsa-offline-audio-tagging sherpa-onnx-alsa-offline-audio-tagging.cc alsa.cc) | ||
| 267 | 268 | ||
| 268 | 269 | ||
| 269 | if(SHERPA_ONNX_ENABLE_TTS) | 270 | if(SHERPA_ONNX_ENABLE_TTS) |
| @@ -276,6 +277,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | @@ -276,6 +277,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | ||
| 276 | sherpa-onnx-alsa-offline-speaker-identification | 277 | sherpa-onnx-alsa-offline-speaker-identification |
| 277 | sherpa-onnx-keyword-spotter-alsa | 278 | sherpa-onnx-keyword-spotter-alsa |
| 278 | sherpa-onnx-vad-alsa | 279 | sherpa-onnx-vad-alsa |
| 280 | + sherpa-onnx-alsa-offline-audio-tagging | ||
| 279 | ) | 281 | ) |
| 280 | 282 | ||
| 281 | if(SHERPA_ONNX_ENABLE_TTS) | 283 | if(SHERPA_ONNX_ENABLE_TTS) |
| @@ -354,6 +356,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY) | @@ -354,6 +356,11 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY) | ||
| 354 | microphone.cc | 356 | microphone.cc |
| 355 | ) | 357 | ) |
| 356 | 358 | ||
| 359 | + add_executable(sherpa-onnx-microphone-offline-audio-tagging | ||
| 360 | + sherpa-onnx-microphone-offline-audio-tagging.cc | ||
| 361 | + microphone.cc | ||
| 362 | + ) | ||
| 363 | + | ||
| 357 | if(BUILD_SHARED_LIBS) | 364 | if(BUILD_SHARED_LIBS) |
| 358 | set(PA_LIB portaudio) | 365 | set(PA_LIB portaudio) |
| 359 | else() | 366 | else() |
| @@ -365,6 +372,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY) | @@ -365,6 +372,7 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO AND SHERPA_ONNX_ENABLE_BINARY) | ||
| 365 | sherpa-onnx-keyword-spotter-microphone | 372 | sherpa-onnx-keyword-spotter-microphone |
| 366 | sherpa-onnx-microphone-offline | 373 | sherpa-onnx-microphone-offline |
| 367 | sherpa-onnx-microphone-offline-speaker-identification | 374 | sherpa-onnx-microphone-offline-speaker-identification |
| 375 | + sherpa-onnx-microphone-offline-audio-tagging | ||
| 368 | sherpa-onnx-vad-microphone | 376 | sherpa-onnx-vad-microphone |
| 369 | sherpa-onnx-vad-microphone-offline-asr | 377 | sherpa-onnx-vad-microphone-offline-asr |
| 370 | ) | 378 | ) |
| @@ -35,7 +35,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -35,7 +35,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 35 | Subdevices: 1/1 | 35 | Subdevices: 1/1 |
| 36 | Subdevice #0: subdevice #0 | 36 | Subdevice #0: subdevice #0 |
| 37 | 37 | ||
| 38 | -and if you want to select card 3 and the device 0 on that card, please use: | 38 | +and if you want to select card 3 and device 0 on that card, please use: |
| 39 | 39 | ||
| 40 | plughw:3,0 | 40 | plughw:3,0 |
| 41 | 41 |
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-alsa-offline-audio-tagging.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <signal.h> | ||
| 6 | +#include <stdio.h> | ||
| 7 | +#include <stdlib.h> | ||
| 8 | + | ||
| 9 | +#include <algorithm> | ||
| 10 | +#include <mutex> // NOLINT | ||
| 11 | +#include <thread> // NOLINT | ||
| 12 | + | ||
| 13 | +#include "sherpa-onnx/csrc/alsa.h" | ||
| 14 | +#include "sherpa-onnx/csrc/audio-tagging.h" | ||
| 15 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 16 | + | ||
| 17 | +enum class State { | ||
| 18 | + kIdle, | ||
| 19 | + kRecording, | ||
| 20 | + kDecoding, | ||
| 21 | +}; | ||
| 22 | + | ||
| 23 | +State state = State::kIdle; | ||
| 24 | + | ||
| 25 | +// true to stop the program and exit | ||
| 26 | +bool stop = false; | ||
| 27 | + | ||
| 28 | +std::vector<float> samples; | ||
| 29 | +std::mutex samples_mutex; | ||
| 30 | + | ||
| 31 | +static void DetectKeyPress() { | ||
| 32 | + SHERPA_ONNX_LOGE("Press Enter to start"); | ||
| 33 | + int32_t key; | ||
| 34 | + while (!stop && (key = getchar())) { | ||
| 35 | + if (key != 0x0a) { | ||
| 36 | + continue; | ||
| 37 | + } | ||
| 38 | + | ||
| 39 | + switch (state) { | ||
| 40 | + case State::kIdle: | ||
| 41 | + SHERPA_ONNX_LOGE("Start recording. Press Enter to stop recording"); | ||
| 42 | + state = State::kRecording; | ||
| 43 | + { | ||
| 44 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 45 | + samples.clear(); | ||
| 46 | + } | ||
| 47 | + break; | ||
| 48 | + case State::kRecording: | ||
| 49 | + SHERPA_ONNX_LOGE("Stop recording. Decoding ..."); | ||
| 50 | + state = State::kDecoding; | ||
| 51 | + break; | ||
| 52 | + case State::kDecoding: | ||
| 53 | + break; | ||
| 54 | + } | ||
| 55 | + } | ||
| 56 | +} | ||
| 57 | + | ||
| 58 | +static void Record(const char *device_name, int32_t expected_sample_rate) { | ||
| 59 | + sherpa_onnx::Alsa alsa(device_name); | ||
| 60 | + | ||
| 61 | + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { | ||
| 62 | + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), | ||
| 63 | + expected_sample_rate); | ||
| 64 | + exit(-1); | ||
| 65 | + } | ||
| 66 | + | ||
| 67 | + int32_t chunk = 0.1 * alsa.GetActualSampleRate(); | ||
| 68 | + while (!stop) { | ||
| 69 | + const std::vector<float> &s = alsa.Read(chunk); | ||
| 70 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 71 | + samples.insert(samples.end(), s.begin(), s.end()); | ||
| 72 | + } | ||
| 73 | +} | ||
| 74 | + | ||
| 75 | +static void Handler(int32_t sig) { | ||
| 76 | + stop = true; | ||
| 77 | + fprintf(stderr, "\nCaught Ctrl + C. Press Enter to exit\n"); | ||
| 78 | +} | ||
| 79 | + | ||
| 80 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 81 | + signal(SIGINT, Handler); | ||
| 82 | + | ||
| 83 | + const char *kUsageMessage = R"usage( | ||
| 84 | +Audio tagging from microphone (Linux only). | ||
| 85 | +Usage: | ||
| 86 | + | ||
| 87 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 | ||
| 88 | +tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 | ||
| 89 | +rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 | ||
| 90 | + | ||
| 91 | +./bin/sherpa-onnx-alsa-offline-audio-tagging \ | ||
| 92 | + --zipformer-model=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.onnx \ | ||
| 93 | + --labels=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv \ | ||
| 94 | + device_name | ||
| 95 | + | ||
| 96 | +Please refer to | ||
| 97 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models | ||
| 98 | +for a list of pre-trained models to download. | ||
| 99 | + | ||
| 100 | +The device name specifies which microphone to use in case there are several | ||
| 101 | +on your system. You can use | ||
| 102 | + | ||
| 103 | + arecord -l | ||
| 104 | + | ||
| 105 | +to find all available microphones on your computer. For instance, if it outputs | ||
| 106 | + | ||
| 107 | +**** List of CAPTURE Hardware Devices **** | ||
| 108 | +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 109 | + Subdevices: 1/1 | ||
| 110 | + Subdevice #0: subdevice #0 | ||
| 111 | + | ||
| 112 | +and if you want to select card 3 and device 0 on that card, please use: | ||
| 113 | + | ||
| 114 | + plughw:3,0 | ||
| 115 | + | ||
| 116 | +as the device_name. | ||
| 117 | +)usage"; | ||
| 118 | + | ||
| 119 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 120 | + sherpa_onnx::AudioTaggingConfig config; | ||
| 121 | + config.Register(&po); | ||
| 122 | + | ||
| 123 | + po.Read(argc, argv); | ||
| 124 | + if (po.NumArgs() != 1) { | ||
| 125 | + fprintf(stderr, "Please provide only 1 argument: the device name\n"); | ||
| 126 | + po.PrintUsage(); | ||
| 127 | + exit(EXIT_FAILURE); | ||
| 128 | + } | ||
| 129 | + | ||
| 130 | + fprintf(stderr, "%s\n", config.ToString().c_str()); | ||
| 131 | + | ||
| 132 | + if (!config.Validate()) { | ||
| 133 | + fprintf(stderr, "Errors in config!\n"); | ||
| 134 | + return -1; | ||
| 135 | + } | ||
| 136 | + | ||
| 137 | + SHERPA_ONNX_LOGE("Creating audio tagger ..."); | ||
| 138 | + sherpa_onnx::AudioTagging tagger(config); | ||
| 139 | + SHERPA_ONNX_LOGE("Audio tagger created created!"); | ||
| 140 | + | ||
| 141 | + std::string device_name = po.GetArg(1); | ||
| 142 | + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); | ||
| 143 | + | ||
| 144 | + int32_t sample_rate = 16000; // fixed to 16000Hz for all models from icefall | ||
| 145 | + | ||
| 146 | + std::thread t2(Record, device_name.c_str(), sample_rate); | ||
| 147 | + using namespace std::chrono_literals; // NOLINT | ||
| 148 | + std::this_thread::sleep_for(100ms); // sleep for 100ms | ||
| 149 | + std::thread t(DetectKeyPress); | ||
| 150 | + | ||
| 151 | + while (!stop) { | ||
| 152 | + switch (state) { | ||
| 153 | + case State::kIdle: | ||
| 154 | + break; | ||
| 155 | + case State::kRecording: | ||
| 156 | + break; | ||
| 157 | + case State::kDecoding: { | ||
| 158 | + std::vector<float> buf; | ||
| 159 | + { | ||
| 160 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 161 | + buf = std::move(samples); | ||
| 162 | + } | ||
| 163 | + SHERPA_ONNX_LOGE("Computing..."); | ||
| 164 | + auto s = tagger.CreateStream(); | ||
| 165 | + s->AcceptWaveform(sample_rate, buf.data(), buf.size()); | ||
| 166 | + auto results = tagger.Compute(s.get()); | ||
| 167 | + SHERPA_ONNX_LOGE("Result is:"); | ||
| 168 | + | ||
| 169 | + int32_t i = 0; | ||
| 170 | + std::ostringstream os; | ||
| 171 | + for (const auto &event : results) { | ||
| 172 | + os << i << ": " << event.ToString() << "\n"; | ||
| 173 | + i += 1; | ||
| 174 | + } | ||
| 175 | + | ||
| 176 | + SHERPA_ONNX_LOGE("\n%s\n", os.str().c_str()); | ||
| 177 | + | ||
| 178 | + state = State::kIdle; | ||
| 179 | + SHERPA_ONNX_LOGE("Press Enter to start"); | ||
| 180 | + break; | ||
| 181 | + } | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + std::this_thread::sleep_for(20ms); // sleep for 20ms | ||
| 185 | + } | ||
| 186 | + t.join(); | ||
| 187 | + t2.join(); | ||
| 188 | + | ||
| 189 | + return 0; | ||
| 190 | +} |
| @@ -71,8 +71,8 @@ static void Record(const char *device_name, int32_t expected_sample_rate) { | @@ -71,8 +71,8 @@ static void Record(const char *device_name, int32_t expected_sample_rate) { | ||
| 71 | 71 | ||
| 72 | int32_t chunk = 0.1 * alsa.GetActualSampleRate(); | 72 | int32_t chunk = 0.1 * alsa.GetActualSampleRate(); |
| 73 | while (!stop) { | 73 | while (!stop) { |
| 74 | - std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 75 | const std::vector<float> &s = alsa.Read(chunk); | 74 | const std::vector<float> &s = alsa.Read(chunk); |
| 75 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 76 | samples.insert(samples.end(), s.begin(), s.end()); | 76 | samples.insert(samples.end(), s.begin(), s.end()); |
| 77 | } | 77 | } |
| 78 | } | 78 | } |
| @@ -193,7 +193,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -193,7 +193,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 193 | Subdevices: 1/1 | 193 | Subdevices: 1/1 |
| 194 | Subdevice #0: subdevice #0 | 194 | Subdevice #0: subdevice #0 |
| 195 | 195 | ||
| 196 | -and if you want to select card 3 and the device 0 on that card, please use: | 196 | +and if you want to select card 3 and device 0 on that card, please use: |
| 197 | plughw:3,0 | 197 | plughw:3,0 |
| 198 | as the device_name. | 198 | as the device_name. |
| 199 | 199 |
| @@ -68,8 +68,8 @@ static void Record(const char *device_name, int32_t expected_sample_rate) { | @@ -68,8 +68,8 @@ static void Record(const char *device_name, int32_t expected_sample_rate) { | ||
| 68 | 68 | ||
| 69 | int32_t chunk = 0.1 * alsa.GetActualSampleRate(); | 69 | int32_t chunk = 0.1 * alsa.GetActualSampleRate(); |
| 70 | while (!stop) { | 70 | while (!stop) { |
| 71 | - std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 72 | const std::vector<float> &s = alsa.Read(chunk); | 71 | const std::vector<float> &s = alsa.Read(chunk); |
| 72 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 73 | samples.insert(samples.end(), s.begin(), s.end()); | 73 | samples.insert(samples.end(), s.begin(), s.end()); |
| 74 | } | 74 | } |
| 75 | } | 75 | } |
| @@ -119,7 +119,7 @@ https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html | @@ -119,7 +119,7 @@ https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html | ||
| 119 | for a list of pre-trained models to download. | 119 | for a list of pre-trained models to download. |
| 120 | 120 | ||
| 121 | The device name specifies which microphone to use in case there are several | 121 | The device name specifies which microphone to use in case there are several |
| 122 | -on you system. You can use | 122 | +on your system. You can use |
| 123 | 123 | ||
| 124 | arecord -l | 124 | arecord -l |
| 125 | 125 | ||
| @@ -130,7 +130,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -130,7 +130,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 130 | Subdevices: 1/1 | 130 | Subdevices: 1/1 |
| 131 | Subdevice #0: subdevice #0 | 131 | Subdevice #0: subdevice #0 |
| 132 | 132 | ||
| 133 | -and if you want to select card 3 and the device 0 on that card, please use: | 133 | +and if you want to select card 3 and device 0 on that card, please use: |
| 134 | 134 | ||
| 135 | plughw:3,0 | 135 | plughw:3,0 |
| 136 | 136 |
| @@ -52,7 +52,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -52,7 +52,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 52 | Subdevices: 1/1 | 52 | Subdevices: 1/1 |
| 53 | Subdevice #0: subdevice #0 | 53 | Subdevice #0: subdevice #0 |
| 54 | 54 | ||
| 55 | -and if you want to select card 3 and the device 0 on that card, please use: | 55 | +and if you want to select card 3 and device 0 on that card, please use: |
| 56 | 56 | ||
| 57 | plughw:3,0 | 57 | plughw:3,0 |
| 58 | 58 |
| @@ -40,7 +40,7 @@ https://k2-fsa.github.io/sherpa/onnx/kws/pretrained_models/index.html | @@ -40,7 +40,7 @@ https://k2-fsa.github.io/sherpa/onnx/kws/pretrained_models/index.html | ||
| 40 | for a list of pre-trained models to download. | 40 | for a list of pre-trained models to download. |
| 41 | 41 | ||
| 42 | The device name specifies which microphone to use in case there are several | 42 | The device name specifies which microphone to use in case there are several |
| 43 | -on you system. You can use | 43 | +on your system. You can use |
| 44 | 44 | ||
| 45 | arecord -l | 45 | arecord -l |
| 46 | 46 | ||
| @@ -51,7 +51,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -51,7 +51,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 51 | Subdevices: 1/1 | 51 | Subdevices: 1/1 |
| 52 | Subdevice #0: subdevice #0 | 52 | Subdevice #0: subdevice #0 |
| 53 | 53 | ||
| 54 | -and if you want to select card 3 and the device 0 on that card, please use: | 54 | +and if you want to select card 3 and device 0 on that card, please use: |
| 55 | 55 | ||
| 56 | plughw:3,0 | 56 | plughw:3,0 |
| 57 | 57 |
| @@ -10,10 +10,11 @@ | @@ -10,10 +10,11 @@ | ||
| 10 | 10 | ||
| 11 | #include "portaudio.h" // NOLINT | 11 | #include "portaudio.h" // NOLINT |
| 12 | #include "sherpa-onnx/csrc/display.h" | 12 | #include "sherpa-onnx/csrc/display.h" |
| 13 | -#include "sherpa-onnx/csrc/microphone.h" | ||
| 14 | #include "sherpa-onnx/csrc/keyword-spotter.h" | 13 | #include "sherpa-onnx/csrc/keyword-spotter.h" |
| 14 | +#include "sherpa-onnx/csrc/microphone.h" | ||
| 15 | 15 | ||
| 16 | bool stop = false; | 16 | bool stop = false; |
| 17 | +float mic_sample_rate = 16000; | ||
| 17 | 18 | ||
| 18 | static int32_t RecordCallback(const void *input_buffer, | 19 | static int32_t RecordCallback(const void *input_buffer, |
| 19 | void * /*output_buffer*/, | 20 | void * /*output_buffer*/, |
| @@ -23,7 +24,8 @@ static int32_t RecordCallback(const void *input_buffer, | @@ -23,7 +24,8 @@ static int32_t RecordCallback(const void *input_buffer, | ||
| 23 | void *user_data) { | 24 | void *user_data) { |
| 24 | auto stream = reinterpret_cast<sherpa_onnx::OnlineStream *>(user_data); | 25 | auto stream = reinterpret_cast<sherpa_onnx::OnlineStream *>(user_data); |
| 25 | 26 | ||
| 26 | - stream->AcceptWaveform(16000, reinterpret_cast<const float *>(input_buffer), | 27 | + stream->AcceptWaveform(mic_sample_rate, |
| 28 | + reinterpret_cast<const float *>(input_buffer), | ||
| 27 | frames_per_buffer); | 29 | frames_per_buffer); |
| 28 | 30 | ||
| 29 | return stop ? paComplete : paContinue; | 31 | return stop ? paComplete : paContinue; |
| @@ -80,14 +82,31 @@ for a list of pre-trained models to download. | @@ -80,14 +82,31 @@ for a list of pre-trained models to download. | ||
| 80 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); | 82 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); |
| 81 | fprintf(stderr, "Num devices: %d\n", num_devices); | 83 | fprintf(stderr, "Num devices: %d\n", num_devices); |
| 82 | 84 | ||
| 83 | - PaStreamParameters param; | 85 | + int32_t device_index = Pa_GetDefaultInputDevice(); |
| 84 | 86 | ||
| 85 | - param.device = Pa_GetDefaultInputDevice(); | ||
| 86 | - if (param.device == paNoDevice) { | 87 | + if (device_index == paNoDevice) { |
| 87 | fprintf(stderr, "No default input device found\n"); | 88 | fprintf(stderr, "No default input device found\n"); |
| 89 | + fprintf(stderr, "If you are using Linux, please switch to \n"); | ||
| 90 | + fprintf(stderr, " ./bin/sherpa-onnx-keyword-spotter-alsa \n"); | ||
| 88 | exit(EXIT_FAILURE); | 91 | exit(EXIT_FAILURE); |
| 89 | } | 92 | } |
| 90 | - fprintf(stderr, "Use default device: %d\n", param.device); | 93 | + |
| 94 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 95 | + if (pDeviceIndex) { | ||
| 96 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 97 | + device_index = atoi(pDeviceIndex); | ||
| 98 | + } | ||
| 99 | + | ||
| 100 | + for (int32_t i = 0; i != num_devices; ++i) { | ||
| 101 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); | ||
| 102 | + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, | ||
| 103 | + info->name); | ||
| 104 | + } | ||
| 105 | + | ||
| 106 | + PaStreamParameters param; | ||
| 107 | + param.device = device_index; | ||
| 108 | + | ||
| 109 | + fprintf(stderr, "Use device: %d\n", param.device); | ||
| 91 | 110 | ||
| 92 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | 111 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); |
| 93 | fprintf(stderr, " Name: %s\n", info->name); | 112 | fprintf(stderr, " Name: %s\n", info->name); |
| @@ -98,12 +117,19 @@ for a list of pre-trained models to download. | @@ -98,12 +117,19 @@ for a list of pre-trained models to download. | ||
| 98 | 117 | ||
| 99 | param.suggestedLatency = info->defaultLowInputLatency; | 118 | param.suggestedLatency = info->defaultLowInputLatency; |
| 100 | param.hostApiSpecificStreamInfo = nullptr; | 119 | param.hostApiSpecificStreamInfo = nullptr; |
| 120 | + | ||
| 121 | + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 122 | + if (pSampleRateStr) { | ||
| 123 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 124 | + mic_sample_rate = atof(pSampleRateStr); | ||
| 125 | + } | ||
| 126 | + | ||
| 101 | float sample_rate = 16000; | 127 | float sample_rate = 16000; |
| 102 | 128 | ||
| 103 | PaStream *stream; | 129 | PaStream *stream; |
| 104 | PaError err = | 130 | PaError err = |
| 105 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ | 131 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ |
| 106 | - sample_rate, | 132 | + mic_sample_rate, |
| 107 | 0, // frames per buffer | 133 | 0, // frames per buffer |
| 108 | paClipOff, // we won't output out of range samples | 134 | paClipOff, // we won't output out of range samples |
| 109 | // so don't bother clipping them | 135 | // so don't bother clipping them |
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-microphone-offline-audio-tagging.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <signal.h> | ||
| 6 | +#include <stdio.h> | ||
| 7 | +#include <stdlib.h> | ||
| 8 | + | ||
| 9 | +#include <algorithm> | ||
| 10 | +#include <cctype> // std::tolower | ||
| 11 | +#include <mutex> // NOLINT | ||
| 12 | +#include <thread> // NOLINT | ||
| 13 | + | ||
| 14 | +#include "portaudio.h" // NOLINT | ||
| 15 | +#include "sherpa-onnx/csrc/audio-tagging.h" | ||
| 16 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 17 | +#include "sherpa-onnx/csrc/microphone.h" | ||
| 18 | + | ||
| 19 | +enum class State { | ||
| 20 | + kIdle, | ||
| 21 | + kRecording, | ||
| 22 | + kDecoding, | ||
| 23 | +}; | ||
| 24 | + | ||
| 25 | +State state = State::kIdle; | ||
| 26 | + | ||
| 27 | +// true to stop the program and exit | ||
| 28 | +bool stop = false; | ||
| 29 | + | ||
| 30 | +std::vector<float> samples; | ||
| 31 | +std::mutex samples_mutex; | ||
| 32 | + | ||
| 33 | +static void DetectKeyPress() { | ||
| 34 | + SHERPA_ONNX_LOGE("Press Enter to start"); | ||
| 35 | + int32_t key; | ||
| 36 | + while (!stop && (key = getchar())) { | ||
| 37 | + if (key != 0x0a) { | ||
| 38 | + continue; | ||
| 39 | + } | ||
| 40 | + | ||
| 41 | + switch (state) { | ||
| 42 | + case State::kIdle: | ||
| 43 | + SHERPA_ONNX_LOGE("Start recording. Press Enter to stop recording"); | ||
| 44 | + state = State::kRecording; | ||
| 45 | + { | ||
| 46 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 47 | + samples.clear(); | ||
| 48 | + } | ||
| 49 | + break; | ||
| 50 | + case State::kRecording: | ||
| 51 | + SHERPA_ONNX_LOGE("Stop recording. Decoding ..."); | ||
| 52 | + state = State::kDecoding; | ||
| 53 | + break; | ||
| 54 | + case State::kDecoding: | ||
| 55 | + break; | ||
| 56 | + } | ||
| 57 | + } | ||
| 58 | +} | ||
| 59 | + | ||
| 60 | +static int32_t RecordCallback(const void *input_buffer, | ||
| 61 | + void * /*output_buffer*/, | ||
| 62 | + unsigned long frames_per_buffer, // NOLINT | ||
| 63 | + const PaStreamCallbackTimeInfo * /*time_info*/, | ||
| 64 | + PaStreamCallbackFlags /*status_flags*/, | ||
| 65 | + void *user_data) { | ||
| 66 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 67 | + | ||
| 68 | + auto p = reinterpret_cast<const float *>(input_buffer); | ||
| 69 | + samples.insert(samples.end(), p, p + frames_per_buffer); | ||
| 70 | + | ||
| 71 | + return stop ? paComplete : paContinue; | ||
| 72 | +} | ||
| 73 | + | ||
// SIGINT (Ctrl+C) handler: requests shutdown of the record/decode loops via
// the global `stop` flag.
//
// NOTE(review): fprintf is not async-signal-safe per POSIX; a raw write() to
// stderr would be strictly correct here. Acceptable for an example program,
// but worth confirming against the project's other signal handlers.
static void Handler(int32_t sig) {
  stop = true;
  fprintf(stderr, "\nCaught Ctrl + C. Press Enter to exit\n");
}
| 78 | + | ||
| 79 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 80 | + signal(SIGINT, Handler); | ||
| 81 | + | ||
| 82 | + const char *kUsageMessage = R"usage( | ||
| 83 | +Audio tagging from microphone. | ||
| 84 | +Usage: | ||
| 85 | + | ||
| 86 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 | ||
| 87 | +tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 | ||
| 88 | +rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2 | ||
| 89 | + | ||
| 90 | +./bin/sherpa-onnx-microphone-offline-audio-tagging \ | ||
| 91 | + --zipformer-model=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.onnx \ | ||
| 92 | + --labels=./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv | ||
| 93 | + | ||
| 94 | +Please see | ||
| 95 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models | ||
| 96 | +for more models. | ||
| 97 | +)usage"; | ||
| 98 | + | ||
| 99 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 100 | + sherpa_onnx::AudioTaggingConfig config; | ||
| 101 | + config.Register(&po); | ||
| 102 | + | ||
| 103 | + po.Read(argc, argv); | ||
| 104 | + if (po.NumArgs() != 0) { | ||
| 105 | + fprintf(stderr, "\nThis program does not support positional arguments\n\n"); | ||
| 106 | + po.PrintUsage(); | ||
| 107 | + exit(EXIT_FAILURE); | ||
| 108 | + } | ||
| 109 | + | ||
| 110 | + fprintf(stderr, "%s\n", config.ToString().c_str()); | ||
| 111 | + | ||
| 112 | + if (!config.Validate()) { | ||
| 113 | + fprintf(stderr, "Errors in config!\n"); | ||
| 114 | + return -1; | ||
| 115 | + } | ||
| 116 | + | ||
| 117 | + SHERPA_ONNX_LOGE("Creating audio tagger ..."); | ||
| 118 | + sherpa_onnx::AudioTagging tagger(config); | ||
| 119 | + SHERPA_ONNX_LOGE("Audio tagger created created!"); | ||
| 120 | + | ||
| 121 | + sherpa_onnx::Microphone mic; | ||
| 122 | + | ||
| 123 | + PaDeviceIndex num_devices = Pa_GetDeviceCount(); | ||
| 124 | + fprintf(stderr, "Num devices: %d\n", num_devices); | ||
| 125 | + | ||
| 126 | + int32_t device_index = Pa_GetDefaultInputDevice(); | ||
| 127 | + | ||
| 128 | + if (device_index == paNoDevice) { | ||
| 129 | + fprintf(stderr, "No default input device found\n"); | ||
| 130 | + fprintf(stderr, "If you are using Linux, please switch to \n"); | ||
| 131 | + fprintf(stderr, " ./bin/sherpa-onnx-alsa-offline-audio-tagging \n"); | ||
| 132 | + exit(EXIT_FAILURE); | ||
| 133 | + } | ||
| 134 | + | ||
| 135 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 136 | + if (pDeviceIndex) { | ||
| 137 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 138 | + device_index = atoi(pDeviceIndex); | ||
| 139 | + } | ||
| 140 | + | ||
| 141 | + for (int32_t i = 0; i != num_devices; ++i) { | ||
| 142 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); | ||
| 143 | + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, | ||
| 144 | + info->name); | ||
| 145 | + } | ||
| 146 | + | ||
| 147 | + PaStreamParameters param; | ||
| 148 | + param.device = device_index; | ||
| 149 | + | ||
| 150 | + fprintf(stderr, "Use device: %d\n", param.device); | ||
| 151 | + | ||
| 152 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | ||
| 153 | + fprintf(stderr, " Name: %s\n", info->name); | ||
| 154 | + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels); | ||
| 155 | + | ||
| 156 | + param.channelCount = 1; | ||
| 157 | + param.sampleFormat = paFloat32; | ||
| 158 | + | ||
| 159 | + param.suggestedLatency = info->defaultLowInputLatency; | ||
| 160 | + param.hostApiSpecificStreamInfo = nullptr; | ||
| 161 | + float mic_sample_rate = 16000; | ||
| 162 | + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 163 | + if (pSampleRateStr) { | ||
| 164 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 165 | + mic_sample_rate = atof(pSampleRateStr); | ||
| 166 | + } | ||
| 167 | + | ||
| 168 | + float sample_rate = 16000; | ||
| 169 | + | ||
| 170 | + PaStream *stream; | ||
| 171 | + PaError err = | ||
| 172 | + Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ | ||
| 173 | + mic_sample_rate, | ||
| 174 | + 0, // frames per buffer | ||
| 175 | + paClipOff, // we won't output out of range samples | ||
| 176 | + // so don't bother clipping them | ||
| 177 | + RecordCallback, nullptr); | ||
| 178 | + if (err != paNoError) { | ||
| 179 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 180 | + exit(EXIT_FAILURE); | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + err = Pa_StartStream(stream); | ||
| 184 | + fprintf(stderr, "Started\n"); | ||
| 185 | + | ||
| 186 | + if (err != paNoError) { | ||
| 187 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 188 | + exit(EXIT_FAILURE); | ||
| 189 | + } | ||
| 190 | + | ||
| 191 | + std::thread t(DetectKeyPress); | ||
| 192 | + while (!stop) { | ||
| 193 | + switch (state) { | ||
| 194 | + case State::kIdle: | ||
| 195 | + break; | ||
| 196 | + case State::kRecording: | ||
| 197 | + break; | ||
| 198 | + case State::kDecoding: { | ||
| 199 | + std::vector<float> buf; | ||
| 200 | + { | ||
| 201 | + std::lock_guard<std::mutex> lock(samples_mutex); | ||
| 202 | + buf = std::move(samples); | ||
| 203 | + } | ||
| 204 | + | ||
| 205 | + SHERPA_ONNX_LOGE("Computing..."); | ||
| 206 | + auto s = tagger.CreateStream(); | ||
| 207 | + s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size()); | ||
| 208 | + auto results = tagger.Compute(s.get()); | ||
| 209 | + | ||
| 210 | + SHERPA_ONNX_LOGE("Result is:"); | ||
| 211 | + | ||
| 212 | + int32_t i = 0; | ||
| 213 | + std::ostringstream os; | ||
| 214 | + for (const auto &event : results) { | ||
| 215 | + os << i << ": " << event.ToString() << "\n"; | ||
| 216 | + i += 1; | ||
| 217 | + } | ||
| 218 | + | ||
| 219 | + SHERPA_ONNX_LOGE("\n%s\n", os.str().c_str()); | ||
| 220 | + | ||
| 221 | + state = State::kIdle; | ||
| 222 | + SHERPA_ONNX_LOGE("Press Enter to start"); | ||
| 223 | + break; | ||
| 224 | + } | ||
| 225 | + } | ||
| 226 | + | ||
| 227 | + Pa_Sleep(20); // sleep for 20ms | ||
| 228 | + } | ||
| 229 | + t.join(); | ||
| 230 | + | ||
| 231 | + err = Pa_CloseStream(stream); | ||
| 232 | + if (err != paNoError) { | ||
| 233 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 234 | + exit(EXIT_FAILURE); | ||
| 235 | + } | ||
| 236 | + | ||
| 237 | + return 0; | ||
| 238 | +} |
| @@ -223,14 +223,31 @@ Note that `zh` means Chinese, while `en` means English. | @@ -223,14 +223,31 @@ Note that `zh` means Chinese, while `en` means English. | ||
| 223 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); | 223 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); |
| 224 | fprintf(stderr, "Num devices: %d\n", num_devices); | 224 | fprintf(stderr, "Num devices: %d\n", num_devices); |
| 225 | 225 | ||
| 226 | - PaStreamParameters param; | ||
| 227 | - | ||
| 228 | - param.device = Pa_GetDefaultInputDevice(); | ||
| 229 | - if (param.device == paNoDevice) { | 226 | + int32_t device_index = Pa_GetDefaultInputDevice(); |
| 227 | + if (device_index == paNoDevice) { | ||
| 230 | fprintf(stderr, "No default input device found\n"); | 228 | fprintf(stderr, "No default input device found\n"); |
| 229 | + fprintf(stderr, "If you are using Linux, please switch to \n"); | ||
| 230 | + fprintf(stderr, | ||
| 231 | + " ./bin/sherpa-onnx-alsa-offline-speaker-identification \n"); | ||
| 231 | exit(EXIT_FAILURE); | 232 | exit(EXIT_FAILURE); |
| 232 | } | 233 | } |
| 233 | - fprintf(stderr, "Use default device: %d\n", param.device); | 234 | + |
| 235 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 236 | + if (pDeviceIndex) { | ||
| 237 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 238 | + device_index = atoi(pDeviceIndex); | ||
| 239 | + } | ||
| 240 | + | ||
| 241 | + for (int32_t i = 0; i != num_devices; ++i) { | ||
| 242 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); | ||
| 243 | + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, | ||
| 244 | + info->name); | ||
| 245 | + } | ||
| 246 | + | ||
| 247 | + PaStreamParameters param; | ||
| 248 | + param.device = device_index; | ||
| 249 | + | ||
| 250 | + fprintf(stderr, "Use device: %d\n", param.device); | ||
| 234 | 251 | ||
| 235 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | 252 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); |
| 236 | fprintf(stderr, " Name: %s\n", info->name); | 253 | fprintf(stderr, " Name: %s\n", info->name); |
| @@ -241,12 +258,18 @@ Note that `zh` means Chinese, while `en` means English. | @@ -241,12 +258,18 @@ Note that `zh` means Chinese, while `en` means English. | ||
| 241 | 258 | ||
| 242 | param.suggestedLatency = info->defaultLowInputLatency; | 259 | param.suggestedLatency = info->defaultLowInputLatency; |
| 243 | param.hostApiSpecificStreamInfo = nullptr; | 260 | param.hostApiSpecificStreamInfo = nullptr; |
| 261 | + float mic_sample_rate = 16000; | ||
| 262 | + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 263 | + if (pSampleRateStr) { | ||
| 264 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 265 | + mic_sample_rate = atof(pSampleRateStr); | ||
| 266 | + } | ||
| 244 | float sample_rate = 16000; | 267 | float sample_rate = 16000; |
| 245 | 268 | ||
| 246 | PaStream *stream; | 269 | PaStream *stream; |
| 247 | PaError err = | 270 | PaError err = |
| 248 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ | 271 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ |
| 249 | - sample_rate, | 272 | + mic_sample_rate, |
| 250 | 0, // frames per buffer | 273 | 0, // frames per buffer |
| 251 | paClipOff, // we won't output out of range samples | 274 | paClipOff, // we won't output out of range samples |
| 252 | // so don't bother clipping them | 275 | // so don't bother clipping them |
| @@ -279,7 +302,7 @@ Note that `zh` means Chinese, while `en` means English. | @@ -279,7 +302,7 @@ Note that `zh` means Chinese, while `en` means English. | ||
| 279 | } | 302 | } |
| 280 | 303 | ||
| 281 | auto s = extractor.CreateStream(); | 304 | auto s = extractor.CreateStream(); |
| 282 | - s->AcceptWaveform(sample_rate, buf.data(), buf.size()); | 305 | + s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size()); |
| 283 | s->InputFinished(); | 306 | s->InputFinished(); |
| 284 | auto embedding = extractor.Compute(s.get()); | 307 | auto embedding = extractor.Compute(s.get()); |
| 285 | auto name = manager.Search(embedding.data(), threshold); | 308 | auto name = manager.Search(embedding.data(), threshold); |
| @@ -139,14 +139,31 @@ for a list of pre-trained models to download. | @@ -139,14 +139,31 @@ for a list of pre-trained models to download. | ||
| 139 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); | 139 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); |
| 140 | fprintf(stderr, "Num devices: %d\n", num_devices); | 140 | fprintf(stderr, "Num devices: %d\n", num_devices); |
| 141 | 141 | ||
| 142 | - PaStreamParameters param; | 142 | + int32_t device_index = Pa_GetDefaultInputDevice(); |
| 143 | 143 | ||
| 144 | - param.device = Pa_GetDefaultInputDevice(); | ||
| 145 | - if (param.device == paNoDevice) { | 144 | + if (device_index == paNoDevice) { |
| 146 | fprintf(stderr, "No default input device found\n"); | 145 | fprintf(stderr, "No default input device found\n"); |
| 146 | + fprintf(stderr, "If you are using Linux, please switch to \n"); | ||
| 147 | + fprintf(stderr, " ./bin/sherpa-onnx-alsa-offline \n"); | ||
| 147 | exit(EXIT_FAILURE); | 148 | exit(EXIT_FAILURE); |
| 148 | } | 149 | } |
| 149 | - fprintf(stderr, "Use default device: %d\n", param.device); | 150 | + |
| 151 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 152 | + if (pDeviceIndex) { | ||
| 153 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 154 | + device_index = atoi(pDeviceIndex); | ||
| 155 | + } | ||
| 156 | + | ||
| 157 | + for (int32_t i = 0; i != num_devices; ++i) { | ||
| 158 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); | ||
| 159 | + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, | ||
| 160 | + info->name); | ||
| 161 | + } | ||
| 162 | + | ||
| 163 | + PaStreamParameters param; | ||
| 164 | + param.device = device_index; | ||
| 165 | + | ||
| 166 | + fprintf(stderr, "Use device: %d\n", param.device); | ||
| 150 | 167 | ||
| 151 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | 168 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); |
| 152 | fprintf(stderr, " Name: %s\n", info->name); | 169 | fprintf(stderr, " Name: %s\n", info->name); |
| @@ -157,12 +174,18 @@ for a list of pre-trained models to download. | @@ -157,12 +174,18 @@ for a list of pre-trained models to download. | ||
| 157 | 174 | ||
| 158 | param.suggestedLatency = info->defaultLowInputLatency; | 175 | param.suggestedLatency = info->defaultLowInputLatency; |
| 159 | param.hostApiSpecificStreamInfo = nullptr; | 176 | param.hostApiSpecificStreamInfo = nullptr; |
| 177 | + float mic_sample_rate = 16000; | ||
| 178 | + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 179 | + if (pSampleRateStr) { | ||
| 180 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 181 | + mic_sample_rate = atof(pSampleRateStr); | ||
| 182 | + } | ||
| 160 | float sample_rate = 16000; | 183 | float sample_rate = 16000; |
| 161 | 184 | ||
| 162 | PaStream *stream; | 185 | PaStream *stream; |
| 163 | PaError err = | 186 | PaError err = |
| 164 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ | 187 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ |
| 165 | - sample_rate, | 188 | + mic_sample_rate, |
| 166 | 0, // frames per buffer | 189 | 0, // frames per buffer |
| 167 | paClipOff, // we won't output out of range samples | 190 | paClipOff, // we won't output out of range samples |
| 168 | // so don't bother clipping them | 191 | // so don't bother clipping them |
| @@ -195,7 +218,7 @@ for a list of pre-trained models to download. | @@ -195,7 +218,7 @@ for a list of pre-trained models to download. | ||
| 195 | } | 218 | } |
| 196 | 219 | ||
| 197 | auto s = recognizer.CreateStream(); | 220 | auto s = recognizer.CreateStream(); |
| 198 | - s->AcceptWaveform(sample_rate, buf.data(), buf.size()); | 221 | + s->AcceptWaveform(mic_sample_rate, buf.data(), buf.size()); |
| 199 | recognizer.DecodeStream(s.get()); | 222 | recognizer.DecodeStream(s.get()); |
| 200 | SHERPA_ONNX_LOGE("Decoding Done! Result is:"); | 223 | SHERPA_ONNX_LOGE("Decoding Done! Result is:"); |
| 201 | SHERPA_ONNX_LOGE("%s", s->GetResult().text.c_str()); | 224 | SHERPA_ONNX_LOGE("%s", s->GetResult().text.c_str()); |
| @@ -15,6 +15,7 @@ | @@ -15,6 +15,7 @@ | ||
| 15 | #include "sherpa-onnx/csrc/online-recognizer.h" | 15 | #include "sherpa-onnx/csrc/online-recognizer.h" |
| 16 | 16 | ||
| 17 | bool stop = false; | 17 | bool stop = false; |
| 18 | +float mic_sample_rate = 16000; | ||
| 18 | 19 | ||
| 19 | static int32_t RecordCallback(const void *input_buffer, | 20 | static int32_t RecordCallback(const void *input_buffer, |
| 20 | void * /*output_buffer*/, | 21 | void * /*output_buffer*/, |
| @@ -24,7 +25,8 @@ static int32_t RecordCallback(const void *input_buffer, | @@ -24,7 +25,8 @@ static int32_t RecordCallback(const void *input_buffer, | ||
| 24 | void *user_data) { | 25 | void *user_data) { |
| 25 | auto stream = reinterpret_cast<sherpa_onnx::OnlineStream *>(user_data); | 26 | auto stream = reinterpret_cast<sherpa_onnx::OnlineStream *>(user_data); |
| 26 | 27 | ||
| 27 | - stream->AcceptWaveform(16000, reinterpret_cast<const float *>(input_buffer), | 28 | + stream->AcceptWaveform(mic_sample_rate, |
| 29 | + reinterpret_cast<const float *>(input_buffer), | ||
| 28 | frames_per_buffer); | 30 | frames_per_buffer); |
| 29 | 31 | ||
| 30 | return stop ? paComplete : paContinue; | 32 | return stop ? paComplete : paContinue; |
| @@ -81,14 +83,31 @@ for a list of pre-trained models to download. | @@ -81,14 +83,31 @@ for a list of pre-trained models to download. | ||
| 81 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); | 83 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); |
| 82 | fprintf(stderr, "Num devices: %d\n", num_devices); | 84 | fprintf(stderr, "Num devices: %d\n", num_devices); |
| 83 | 85 | ||
| 84 | - PaStreamParameters param; | 86 | + int32_t device_index = Pa_GetDefaultInputDevice(); |
| 85 | 87 | ||
| 86 | - param.device = Pa_GetDefaultInputDevice(); | ||
| 87 | - if (param.device == paNoDevice) { | 88 | + if (device_index == paNoDevice) { |
| 88 | fprintf(stderr, "No default input device found\n"); | 89 | fprintf(stderr, "No default input device found\n"); |
| 90 | + fprintf(stderr, "If you are using Linux, please switch to \n"); | ||
| 91 | + fprintf(stderr, " ./bin/sherpa-onnx-alsa \n"); | ||
| 89 | exit(EXIT_FAILURE); | 92 | exit(EXIT_FAILURE); |
| 90 | } | 93 | } |
| 91 | - fprintf(stderr, "Use default device: %d\n", param.device); | 94 | + |
| 95 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 96 | + if (pDeviceIndex) { | ||
| 97 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 98 | + device_index = atoi(pDeviceIndex); | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + for (int32_t i = 0; i != num_devices; ++i) { | ||
| 102 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); | ||
| 103 | + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, | ||
| 104 | + info->name); | ||
| 105 | + } | ||
| 106 | + | ||
| 107 | + PaStreamParameters param; | ||
| 108 | + param.device = device_index; | ||
| 109 | + | ||
| 110 | + fprintf(stderr, "Use device: %d\n", param.device); | ||
| 92 | 111 | ||
| 93 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | 112 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); |
| 94 | fprintf(stderr, " Name: %s\n", info->name); | 113 | fprintf(stderr, " Name: %s\n", info->name); |
| @@ -99,6 +118,11 @@ for a list of pre-trained models to download. | @@ -99,6 +118,11 @@ for a list of pre-trained models to download. | ||
| 99 | 118 | ||
| 100 | param.suggestedLatency = info->defaultLowInputLatency; | 119 | param.suggestedLatency = info->defaultLowInputLatency; |
| 101 | param.hostApiSpecificStreamInfo = nullptr; | 120 | param.hostApiSpecificStreamInfo = nullptr; |
| 121 | + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 122 | + if (pSampleRateStr) { | ||
| 123 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 124 | + mic_sample_rate = atof(pSampleRateStr); | ||
| 125 | + } | ||
| 102 | float sample_rate = 16000; | 126 | float sample_rate = 16000; |
| 103 | 127 | ||
| 104 | PaStream *stream; | 128 | PaStream *stream; |
| @@ -47,7 +47,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -47,7 +47,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 47 | Subdevices: 1/1 | 47 | Subdevices: 1/1 |
| 48 | Subdevice #0: subdevice #0 | 48 | Subdevice #0: subdevice #0 |
| 49 | 49 | ||
| 50 | -and if you want to select card 3 and the device 0 on that card, please use: | 50 | +and if you want to select card 3 and device 0 on that card, please use: |
| 51 | 51 | ||
| 52 | plughw:3,0 | 52 | plughw:3,0 |
| 53 | 53 |
| @@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
| 13 | #include "sherpa-onnx/csrc/circular-buffer.h" | 13 | #include "sherpa-onnx/csrc/circular-buffer.h" |
| 14 | #include "sherpa-onnx/csrc/microphone.h" | 14 | #include "sherpa-onnx/csrc/microphone.h" |
| 15 | #include "sherpa-onnx/csrc/offline-recognizer.h" | 15 | #include "sherpa-onnx/csrc/offline-recognizer.h" |
| 16 | +#include "sherpa-onnx/csrc/resample.h" | ||
| 16 | #include "sherpa-onnx/csrc/voice-activity-detector.h" | 17 | #include "sherpa-onnx/csrc/voice-activity-detector.h" |
| 17 | 18 | ||
| 18 | bool stop = false; | 19 | bool stop = false; |
| @@ -115,14 +116,29 @@ to download models for offline ASR. | @@ -115,14 +116,29 @@ to download models for offline ASR. | ||
| 115 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); | 116 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); |
| 116 | fprintf(stderr, "Num devices: %d\n", num_devices); | 117 | fprintf(stderr, "Num devices: %d\n", num_devices); |
| 117 | 118 | ||
| 118 | - PaStreamParameters param; | 119 | + int32_t device_index = Pa_GetDefaultInputDevice(); |
| 119 | 120 | ||
| 120 | - param.device = Pa_GetDefaultInputDevice(); | ||
| 121 | - if (param.device == paNoDevice) { | 121 | + if (device_index == paNoDevice) { |
| 122 | fprintf(stderr, "No default input device found\n"); | 122 | fprintf(stderr, "No default input device found\n"); |
| 123 | exit(EXIT_FAILURE); | 123 | exit(EXIT_FAILURE); |
| 124 | } | 124 | } |
| 125 | - fprintf(stderr, "Use default device: %d\n", param.device); | 125 | + |
| 126 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 127 | + if (pDeviceIndex) { | ||
| 128 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 129 | + device_index = atoi(pDeviceIndex); | ||
| 130 | + } | ||
| 131 | + | ||
| 132 | + for (int32_t i = 0; i != num_devices; ++i) { | ||
| 133 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); | ||
| 134 | + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, | ||
| 135 | + info->name); | ||
| 136 | + } | ||
| 137 | + | ||
| 138 | + PaStreamParameters param; | ||
| 139 | + param.device = device_index; | ||
| 140 | + | ||
| 141 | + fprintf(stderr, "Use device: %d\n", param.device); | ||
| 126 | 142 | ||
| 127 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | 143 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); |
| 128 | fprintf(stderr, " Name: %s\n", info->name); | 144 | fprintf(stderr, " Name: %s\n", info->name); |
| @@ -133,12 +149,27 @@ to download models for offline ASR. | @@ -133,12 +149,27 @@ to download models for offline ASR. | ||
| 133 | 149 | ||
| 134 | param.suggestedLatency = info->defaultLowInputLatency; | 150 | param.suggestedLatency = info->defaultLowInputLatency; |
| 135 | param.hostApiSpecificStreamInfo = nullptr; | 151 | param.hostApiSpecificStreamInfo = nullptr; |
| 152 | + float mic_sample_rate = 16000; | ||
| 153 | + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 154 | + if (pSampleRateStr) { | ||
| 155 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 156 | + mic_sample_rate = atof(pSampleRateStr); | ||
| 157 | + } | ||
| 136 | float sample_rate = 16000; | 158 | float sample_rate = 16000; |
| 159 | + std::unique_ptr<sherpa_onnx::LinearResample> resampler; | ||
| 160 | + if (mic_sample_rate != sample_rate) { | ||
| 161 | + float min_freq = std::min(mic_sample_rate, sample_rate); | ||
| 162 | + float lowpass_cutoff = 0.99 * 0.5 * min_freq; | ||
| 163 | + | ||
| 164 | + int32_t lowpass_filter_width = 6; | ||
| 165 | + resampler = std::make_unique<sherpa_onnx::LinearResample>( | ||
| 166 | + mic_sample_rate, sample_rate, lowpass_cutoff, lowpass_filter_width); | ||
| 167 | + } | ||
| 137 | 168 | ||
| 138 | PaStream *stream; | 169 | PaStream *stream; |
| 139 | PaError err = | 170 | PaError err = |
| 140 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ | 171 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ |
| 141 | - sample_rate, | 172 | + mic_sample_rate, |
| 142 | 0, // frames per buffer | 173 | 0, // frames per buffer |
| 143 | paClipOff, // we won't output out of range samples | 174 | paClipOff, // we won't output out of range samples |
| 144 | // so don't bother clipping them | 175 | // so don't bother clipping them |
| @@ -168,6 +199,13 @@ to download models for offline ASR. | @@ -168,6 +199,13 @@ to download models for offline ASR. | ||
| 168 | while (buffer.Size() >= window_size) { | 199 | while (buffer.Size() >= window_size) { |
| 169 | std::vector<float> samples = buffer.Get(buffer.Head(), window_size); | 200 | std::vector<float> samples = buffer.Get(buffer.Head(), window_size); |
| 170 | buffer.Pop(window_size); | 201 | buffer.Pop(window_size); |
| 202 | + | ||
| 203 | + if (resampler) { | ||
| 204 | + std::vector<float> tmp; | ||
| 205 | + resampler->Resample(samples.data(), samples.size(), true, &tmp); | ||
| 206 | + samples = std::move(tmp); | ||
| 207 | + } | ||
| 208 | + | ||
| 171 | vad->AcceptWaveform(samples.data(), samples.size()); | 209 | vad->AcceptWaveform(samples.data(), samples.size()); |
| 172 | } | 210 | } |
| 173 | } | 211 | } |
| @@ -12,6 +12,7 @@ | @@ -12,6 +12,7 @@ | ||
| 12 | #include "portaudio.h" // NOLINT | 12 | #include "portaudio.h" // NOLINT |
| 13 | #include "sherpa-onnx/csrc/circular-buffer.h" | 13 | #include "sherpa-onnx/csrc/circular-buffer.h" |
| 14 | #include "sherpa-onnx/csrc/microphone.h" | 14 | #include "sherpa-onnx/csrc/microphone.h" |
| 15 | +#include "sherpa-onnx/csrc/resample.h" | ||
| 15 | #include "sherpa-onnx/csrc/voice-activity-detector.h" | 16 | #include "sherpa-onnx/csrc/voice-activity-detector.h" |
| 16 | #include "sherpa-onnx/csrc/wave-writer.h" | 17 | #include "sherpa-onnx/csrc/wave-writer.h" |
| 17 | 18 | ||
| @@ -76,14 +77,31 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | @@ -76,14 +77,31 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 76 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); | 77 | PaDeviceIndex num_devices = Pa_GetDeviceCount(); |
| 77 | fprintf(stderr, "Num devices: %d\n", num_devices); | 78 | fprintf(stderr, "Num devices: %d\n", num_devices); |
| 78 | 79 | ||
| 79 | - PaStreamParameters param; | 80 | + int32_t device_index = Pa_GetDefaultInputDevice(); |
| 80 | 81 | ||
| 81 | - param.device = Pa_GetDefaultInputDevice(); | ||
| 82 | - if (param.device == paNoDevice) { | 82 | + if (device_index == paNoDevice) { |
| 83 | fprintf(stderr, "No default input device found\n"); | 83 | fprintf(stderr, "No default input device found\n"); |
| 84 | + fprintf(stderr, "If you are using Linux, please switch to \n"); | ||
| 85 | + fprintf(stderr, " ./bin/sherpa-onnx-vad-alsa \n"); | ||
| 84 | exit(EXIT_FAILURE); | 86 | exit(EXIT_FAILURE); |
| 85 | } | 87 | } |
| 86 | - fprintf(stderr, "Use default device: %d\n", param.device); | 88 | + |
| 89 | + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); | ||
| 90 | + if (pDeviceIndex) { | ||
| 91 | + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); | ||
| 92 | + device_index = atoi(pDeviceIndex); | ||
| 93 | + } | ||
| 94 | + | ||
| 95 | + for (int32_t i = 0; i != num_devices; ++i) { | ||
| 96 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(i); | ||
| 97 | + fprintf(stderr, " %s %d %s\n", (i == device_index) ? "*" : " ", i, | ||
| 98 | + info->name); | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + PaStreamParameters param; | ||
| 102 | + param.device = device_index; | ||
| 103 | + | ||
| 104 | + fprintf(stderr, "Use device: %d\n", param.device); | ||
| 87 | 105 | ||
| 88 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | 106 | const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); |
| 89 | fprintf(stderr, " Name: %s\n", info->name); | 107 | fprintf(stderr, " Name: %s\n", info->name); |
| @@ -94,12 +112,28 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | @@ -94,12 +112,28 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 94 | 112 | ||
| 95 | param.suggestedLatency = info->defaultLowInputLatency; | 113 | param.suggestedLatency = info->defaultLowInputLatency; |
| 96 | param.hostApiSpecificStreamInfo = nullptr; | 114 | param.hostApiSpecificStreamInfo = nullptr; |
| 115 | + float mic_sample_rate = 16000; | ||
| 116 | + const char *pSampleRateStr = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); | ||
| 117 | + if (pSampleRateStr) { | ||
| 118 | + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); | ||
| 119 | + mic_sample_rate = atof(pSampleRateStr); | ||
| 120 | + } | ||
| 97 | float sample_rate = 16000; | 121 | float sample_rate = 16000; |
| 98 | 122 | ||
| 123 | + std::unique_ptr<sherpa_onnx::LinearResample> resampler; | ||
| 124 | + if (mic_sample_rate != sample_rate) { | ||
| 125 | + float min_freq = std::min(mic_sample_rate, sample_rate); | ||
| 126 | + float lowpass_cutoff = 0.99 * 0.5 * min_freq; | ||
| 127 | + | ||
| 128 | + int32_t lowpass_filter_width = 6; | ||
| 129 | + resampler = std::make_unique<sherpa_onnx::LinearResample>( | ||
| 130 | + mic_sample_rate, sample_rate, lowpass_cutoff, lowpass_filter_width); | ||
| 131 | + } | ||
| 132 | + | ||
| 99 | PaStream *stream; | 133 | PaStream *stream; |
| 100 | PaError err = | 134 | PaError err = |
| 101 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ | 135 | Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ |
| 102 | - sample_rate, | 136 | + mic_sample_rate, |
| 103 | 0, // frames per buffer | 137 | 0, // frames per buffer |
| 104 | paClipOff, // we won't output out of range samples | 138 | paClipOff, // we won't output out of range samples |
| 105 | // so don't bother clipping them | 139 | // so don't bother clipping them |
| @@ -131,6 +165,13 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | @@ -131,6 +165,13 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 131 | while (buffer.Size() >= window_size) { | 165 | while (buffer.Size() >= window_size) { |
| 132 | std::vector<float> samples = buffer.Get(buffer.Head(), window_size); | 166 | std::vector<float> samples = buffer.Get(buffer.Head(), window_size); |
| 133 | buffer.Pop(window_size); | 167 | buffer.Pop(window_size); |
| 168 | + | ||
| 169 | + if (resampler) { | ||
| 170 | + std::vector<float> tmp; | ||
| 171 | + resampler->Resample(samples.data(), samples.size(), true, &tmp); | ||
| 172 | + samples = std::move(tmp); | ||
| 173 | + } | ||
| 174 | + | ||
| 134 | vad->AcceptWaveform(samples.data(), samples.size()); | 175 | vad->AcceptWaveform(samples.data(), samples.size()); |
| 135 | 176 | ||
| 136 | if (vad->IsSpeechDetected() && !printed) { | 177 | if (vad->IsSpeechDetected() && !printed) { |
| @@ -149,7 +190,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | @@ -149,7 +190,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 149 | char filename[128]; | 190 | char filename[128]; |
| 150 | snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration); | 191 | snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration); |
| 151 | k += 1; | 192 | k += 1; |
| 152 | - sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(), | 193 | + sherpa_onnx::WriteWave(filename, sample_rate, segment.samples.data(), |
| 153 | segment.samples.size()); | 194 | segment.samples.size()); |
| 154 | fprintf(stderr, "Saved to %s\n", filename); | 195 | fprintf(stderr, "Saved to %s\n", filename); |
| 155 | fprintf(stderr, "----------\n"); | 196 | fprintf(stderr, "----------\n"); |
-
请 注册 或 登录 后发表评论