Committed by
GitHub
Add microphone streaming ASR example for C API (#650)
Showing 8 changed files with 285 additions and 1 deletion.
target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)

add_executable(offline-tts-c-api offline-tts-c-api.c)
target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)

# The microphone example reads audio via ALSA, so it can only be built when
# ALSA was detected (Linux). Warn instead of failing on other platforms.
if(SHERPA_ONNX_HAS_ALSA)
  add_subdirectory(./asr-microphone-example)
else()
  message(WARNING "Skipping ./asr-microphone-example since ALSA is not available")
endif()

add_executable(c-api-alsa c-api-alsa.cc alsa.cc)
target_link_libraries(c-api-alsa sherpa-onnx-c-api cargs)

# SHERPA_ONNX_ALSA_LIB_DIR lets cross-compilation setups point the linker at
# a target-specific libasound; otherwise link against the system-wide ALSA.
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
  target_link_libraries(c-api-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
else()
  target_link_libraries(c-api-alsa asound)
endif()
| 1 | +exclude_files=alsa.cc|alsa.h |
| 1 | +# Introduction | ||
| 2 | + | ||
| 3 | +This folder contains examples for real-time speech recognition from a microphone | ||
| 4 | +using sherpa-onnx C API. | ||
| 5 | + | ||
| 6 | +**Note**: You can call C API from C++ files. | ||
| 7 | + | ||
| 8 | + | ||
## ./c-api-alsa.cc

This file uses ALSA to read audio from a microphone. It runs only on Linux;
it does not support macOS or Windows.
| 1 | +../../sherpa-onnx/csrc/alsa.cc |
c-api-examples/asr-microphone-example/alsa.h
0 → 120000
| 1 | +../../sherpa-onnx/csrc/alsa.h |
| 1 | +// c-api-examples/asr-microphone-example/c-api-alsa.cc | ||
| 2 | +// Copyright (c) 2022-2024 Xiaomi Corporation | ||
| 3 | + | ||
| 4 | +#include <signal.h> | ||
| 5 | +#include <stdio.h> | ||
| 6 | +#include <stdlib.h> | ||
| 7 | +#include <string.h> | ||
| 8 | + | ||
| 9 | +#include <algorithm> | ||
| 10 | +#include <cctype> // std::tolower | ||
| 11 | +#include <cstdint> | ||
| 12 | +#include <string> | ||
| 13 | + | ||
| 14 | +#include "c-api-examples/asr-microphone-example/alsa.h" | ||
| 15 | + | ||
| 16 | +// NOTE: You don't need to use cargs.h in your own project. | ||
| 17 | +// We use it in this file to parse commandline arguments | ||
| 18 | +#include "cargs.h" // NOLINT | ||
| 19 | +#include "sherpa-onnx/c-api/c-api.h" | ||
| 20 | + | ||
// Command-line options understood by this example, parsed with cargs.
// Each entry maps a long option (e.g. --tokens) to a single-character
// identifier handled by the switch in main(). Only --help has a short
// letter; everything else is long-form only.
static struct cag_option options[] = {
    {.identifier = 'h',
     .access_letters = "h",
     .access_name = "help",
     .description = "Show help"},
    {.identifier = 't',
     .access_letters = NULL,
     .access_name = "tokens",
     .value_name = "tokens",
     .description = "Tokens file"},
    {.identifier = 'e',
     .access_letters = NULL,
     .access_name = "encoder",
     .value_name = "encoder",
     .description = "Encoder ONNX file"},
    {.identifier = 'd',
     .access_letters = NULL,
     .access_name = "decoder",
     .value_name = "decoder",
     .description = "Decoder ONNX file"},
    {.identifier = 'j',
     .access_letters = NULL,
     .access_name = "joiner",
     .value_name = "joiner",
     .description = "Joiner ONNX file"},
    {.identifier = 'n',
     .access_letters = NULL,
     .access_name = "num-threads",
     .value_name = "num-threads",
     .description = "Number of threads"},
    {.identifier = 'p',
     .access_letters = NULL,
     .access_name = "provider",
     .value_name = "provider",
     .description = "Provider: cpu (default), cuda, coreml"},
    {.identifier = 'm',
     .access_letters = NULL,
     .access_name = "decoding-method",
     .value_name = "decoding-method",
     .description =
         "Decoding method: greedy_search (default), modified_beam_search"},
    {.identifier = 'f',
     .access_letters = NULL,
     .access_name = "hotwords-file",
     .value_name = "hotwords-file",
     .description = "The file containing hotwords, one words/phrases per line, "
                    "and for each phrase the bpe/cjkchar are separated by a "
                    "space. For example: ▁HE LL O ▁WORLD, 你 好 世 界"},
    {.identifier = 's',
     .access_letters = NULL,
     .access_name = "hotwords-score",
     .value_name = "hotwords-score",
     .description = "The bonus score for each token in hotwords. Used only "
                    "when decoding_method is modified_beam_search"},
};
| 76 | + | ||
// Usage text printed for --help or when too few arguments are supplied.
// Kept as a raw string so the formatting below appears verbatim.
// Fix: the --joiner example previously pointed at decoder.onnx by mistake.
const char *kUsage =
    R"(
Usage:
  ./bin/c-api-alsa \
    --tokens=/path/to/tokens.txt \
    --encoder=/path/to/encoder.onnx \
    --decoder=/path/to/decoder.onnx \
    --joiner=/path/to/joiner.onnx \
    device_name

The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
)";
| 105 | + | ||
// Set by the SIGINT handler below and polled by the decode loop in main().
// volatile sig_atomic_t is the only object type the C/C++ standards guarantee
// may be written from a signal handler; a plain bool is not.
// (Non-static on purpose: main() references it; `!stop` still works on an
// integer type, so this change is drop-in compatible.)
volatile sig_atomic_t stop = 0;

static void Handler(int sig) {
  (void)sig;  // only SIGINT is installed
  stop = 1;
  // NOTE(review): fprintf is not async-signal-safe; acceptable for a demo,
  // but a production handler should only set the flag.
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
| 112 | + | ||
| 113 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 114 | + if (argc < 6) { | ||
| 115 | + fprintf(stderr, "%s\n", kUsage); | ||
| 116 | + exit(0); | ||
| 117 | + } | ||
| 118 | + | ||
| 119 | + signal(SIGINT, Handler); | ||
| 120 | + | ||
| 121 | + SherpaOnnxOnlineRecognizerConfig config; | ||
| 122 | + memset(&config, 0, sizeof(config)); | ||
| 123 | + | ||
| 124 | + config.model_config.debug = 0; | ||
| 125 | + config.model_config.num_threads = 1; | ||
| 126 | + config.model_config.provider = "cpu"; | ||
| 127 | + | ||
| 128 | + config.decoding_method = "greedy_search"; | ||
| 129 | + | ||
| 130 | + config.max_active_paths = 4; | ||
| 131 | + | ||
| 132 | + config.feat_config.sample_rate = 16000; | ||
| 133 | + config.feat_config.feature_dim = 80; | ||
| 134 | + | ||
| 135 | + config.enable_endpoint = 1; | ||
| 136 | + config.rule1_min_trailing_silence = 2.4; | ||
| 137 | + config.rule2_min_trailing_silence = 1.2; | ||
| 138 | + config.rule3_min_utterance_length = 300; | ||
| 139 | + | ||
| 140 | + cag_option_context context; | ||
| 141 | + char identifier; | ||
| 142 | + const char *value; | ||
| 143 | + | ||
| 144 | + cag_option_prepare(&context, options, CAG_ARRAY_SIZE(options), argc, argv); | ||
| 145 | + | ||
| 146 | + while (cag_option_fetch(&context)) { | ||
| 147 | + identifier = cag_option_get(&context); | ||
| 148 | + value = cag_option_get_value(&context); | ||
| 149 | + switch (identifier) { | ||
| 150 | + case 't': | ||
| 151 | + config.model_config.tokens = value; | ||
| 152 | + break; | ||
| 153 | + case 'e': | ||
| 154 | + config.model_config.transducer.encoder = value; | ||
| 155 | + break; | ||
| 156 | + case 'd': | ||
| 157 | + config.model_config.transducer.decoder = value; | ||
| 158 | + break; | ||
| 159 | + case 'j': | ||
| 160 | + config.model_config.transducer.joiner = value; | ||
| 161 | + break; | ||
| 162 | + case 'n': | ||
| 163 | + config.model_config.num_threads = atoi(value); | ||
| 164 | + break; | ||
| 165 | + case 'p': | ||
| 166 | + config.model_config.provider = value; | ||
| 167 | + break; | ||
| 168 | + case 'm': | ||
| 169 | + config.decoding_method = value; | ||
| 170 | + break; | ||
| 171 | + case 'f': | ||
| 172 | + config.hotwords_file = value; | ||
| 173 | + break; | ||
| 174 | + case 's': | ||
| 175 | + config.hotwords_score = atof(value); | ||
| 176 | + break; | ||
| 177 | + case 'h': { | ||
| 178 | + fprintf(stderr, "%s\n", kUsage); | ||
| 179 | + exit(0); | ||
| 180 | + break; | ||
| 181 | + } | ||
| 182 | + default: | ||
| 183 | + // do nothing as config already has valid default values | ||
| 184 | + break; | ||
| 185 | + } | ||
| 186 | + } | ||
| 187 | + | ||
| 188 | + SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config); | ||
| 189 | + SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); | ||
| 190 | + | ||
| 191 | + SherpaOnnxDisplay *display = CreateDisplay(50); | ||
| 192 | + int32_t segment_id = 0; | ||
| 193 | + | ||
| 194 | + const char *device_name = argv[context.index]; | ||
| 195 | + sherpa_onnx::Alsa alsa(device_name); | ||
| 196 | + fprintf(stderr, "Use recording device: %s\n", device_name); | ||
| 197 | + fprintf(stderr, | ||
| 198 | + "Please \033[32m\033[1mspeak\033[0m! Press \033[31m\033[1mCtrl + " | ||
| 199 | + "C\033[0m to exit\n"); | ||
| 200 | + | ||
| 201 | + int32_t expected_sample_rate = 16000; | ||
| 202 | + | ||
| 203 | + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { | ||
| 204 | + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), | ||
| 205 | + expected_sample_rate); | ||
| 206 | + exit(-1); | ||
| 207 | + } | ||
| 208 | + | ||
| 209 | + int32_t chunk = 0.1 * alsa.GetActualSampleRate(); | ||
| 210 | + | ||
| 211 | + std::string last_text; | ||
| 212 | + | ||
| 213 | + int32_t segment_index = 0; | ||
| 214 | + | ||
| 215 | + while (!stop) { | ||
| 216 | + const std::vector<float> &samples = alsa.Read(chunk); | ||
| 217 | + AcceptWaveform(stream, expected_sample_rate, samples.data(), | ||
| 218 | + samples.size()); | ||
| 219 | + while (IsOnlineStreamReady(recognizer, stream)) { | ||
| 220 | + DecodeOnlineStream(recognizer, stream); | ||
| 221 | + } | ||
| 222 | + | ||
| 223 | + const SherpaOnnxOnlineRecognizerResult *r = | ||
| 224 | + GetOnlineStreamResult(recognizer, stream); | ||
| 225 | + | ||
| 226 | + std::string text = r->text; | ||
| 227 | + DestroyOnlineRecognizerResult(r); | ||
| 228 | + | ||
| 229 | + if (!text.empty() && last_text != text) { | ||
| 230 | + last_text = text; | ||
| 231 | + | ||
| 232 | + std::transform(text.begin(), text.end(), text.begin(), | ||
| 233 | + [](auto c) { return std::tolower(c); }); | ||
| 234 | + | ||
| 235 | + SherpaOnnxPrint(display, segment_index, text.c_str()); | ||
| 236 | + fflush(stderr); | ||
| 237 | + } | ||
| 238 | + | ||
| 239 | + if (IsEndpoint(recognizer, stream)) { | ||
| 240 | + if (!text.empty()) { | ||
| 241 | + ++segment_index; | ||
| 242 | + } | ||
| 243 | + Reset(recognizer, stream); | ||
| 244 | + } | ||
| 245 | + } | ||
| 246 | + | ||
| 247 | + // free allocated resources | ||
| 248 | + DestroyDisplay(display); | ||
| 249 | + DestroyOnlineStream(stream); | ||
| 250 | + DestroyOnlineRecognizer(recognizer); | ||
| 251 | + fprintf(stderr, "\n"); | ||
| 252 | + | ||
| 253 | + return 0; | ||
| 254 | +} |
| @@ -157,7 +157,7 @@ int32_t main(int32_t argc, char *argv[]) { | @@ -157,7 +157,7 @@ int32_t main(int32_t argc, char *argv[]) { | ||
| 157 | break; | 157 | break; |
| 158 | } | 158 | } |
| 159 | default: | 159 | default: |
| 160 | - // do nothing as config already have valid default values | 160 | + // do nothing as config already has valid default values |
| 161 | break; | 161 | break; |
| 162 | } | 162 | } |
| 163 | } | 163 | } |
-
Please register or sign in to post a comment.