Fangjun Kuang
Committed by GitHub

Add microphone streaming ASR example for C API (#650)

@@ -6,3 +6,9 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
6 6
7 add_executable(offline-tts-c-api offline-tts-c-api.c) 7 add_executable(offline-tts-c-api offline-tts-c-api.c)
8 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) 8 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)
  9 +
# Build the microphone example only when ALSA was detected; it is Linux-only.
if(SHERPA_ONNX_HAS_ALSA)
  add_subdirectory(./asr-microphone-example)
else()
  message(WARNING "Not including ./asr-microphone-example since alsa is not available")
endif()
  1 +
# c-api-alsa: streaming ASR from a microphone via the sherpa-onnx C API.
# Note: the sources are C++ (.cc) even though they call the C API.
add_executable(c-api-alsa c-api-alsa.cc alsa.cc)
target_link_libraries(c-api-alsa sherpa-onnx-c-api cargs)

# Link against libasound. SHERPA_ONNX_ALSA_LIB_DIR lets cross-compilation
# setups point at a non-default libasound location via the environment.
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
  target_link_libraries(c-api-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
else()
  target_link_libraries(c-api-alsa asound)
endif()
  1 +# Introduction
  2 +
  3 +This folder contains examples for real-time speech recognition from a microphone
  4 +using sherpa-onnx C API.
  5 +
  6 +**Note**: You can call the C API from C++ files.
  7 +
  8 +
  9 +## ./c-api-alsa.cc
  10 +
  11 +This file uses ALSA to read audio from a microphone. It runs only on
  12 +Linux; it does not support macOS or Windows.
  1 +../../sherpa-onnx/csrc/alsa.cc
  1 +../../sherpa-onnx/csrc/alsa.h
  1 +// c-api-examples/asr-microphone-example/c-api-alsa.cc
  2 +// Copyright (c) 2022-2024 Xiaomi Corporation
  3 +
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>  // write(): async-signal-safe output used in the SIGINT handler

#include <algorithm>
#include <cctype>  // std::tolower
#include <cstdint>
#include <string>

#include "c-api-examples/asr-microphone-example/alsa.h"

// NOTE: You don't need to use cargs.h in your own project.
// We use it in this file to parse commandline arguments
#include "cargs.h"  // NOLINT
#include "sherpa-onnx/c-api/c-api.h"
  20 +
  21 +static struct cag_option options[] = {
  22 + {.identifier = 'h',
  23 + .access_letters = "h",
  24 + .access_name = "help",
  25 + .description = "Show help"},
  26 + {.identifier = 't',
  27 + .access_letters = NULL,
  28 + .access_name = "tokens",
  29 + .value_name = "tokens",
  30 + .description = "Tokens file"},
  31 + {.identifier = 'e',
  32 + .access_letters = NULL,
  33 + .access_name = "encoder",
  34 + .value_name = "encoder",
  35 + .description = "Encoder ONNX file"},
  36 + {.identifier = 'd',
  37 + .access_letters = NULL,
  38 + .access_name = "decoder",
  39 + .value_name = "decoder",
  40 + .description = "Decoder ONNX file"},
  41 + {.identifier = 'j',
  42 + .access_letters = NULL,
  43 + .access_name = "joiner",
  44 + .value_name = "joiner",
  45 + .description = "Joiner ONNX file"},
  46 + {.identifier = 'n',
  47 + .access_letters = NULL,
  48 + .access_name = "num-threads",
  49 + .value_name = "num-threads",
  50 + .description = "Number of threads"},
  51 + {.identifier = 'p',
  52 + .access_letters = NULL,
  53 + .access_name = "provider",
  54 + .value_name = "provider",
  55 + .description = "Provider: cpu (default), cuda, coreml"},
  56 + {.identifier = 'm',
  57 + .access_letters = NULL,
  58 + .access_name = "decoding-method",
  59 + .value_name = "decoding-method",
  60 + .description =
  61 + "Decoding method: greedy_search (default), modified_beam_search"},
  62 + {.identifier = 'f',
  63 + .access_letters = NULL,
  64 + .access_name = "hotwords-file",
  65 + .value_name = "hotwords-file",
  66 + .description = "The file containing hotwords, one words/phrases per line, "
  67 + "and for each phrase the bpe/cjkchar are separated by a "
  68 + "space. For example: ▁HE LL O ▁WORLD, 你 好 世 界"},
  69 + {.identifier = 's',
  70 + .access_letters = NULL,
  71 + .access_name = "hotwords-score",
  72 + .value_name = "hotwords-score",
  73 + .description = "The bonus score for each token in hotwords. Used only "
  74 + "when decoding_method is modified_beam_search"},
  75 +};
  76 +
// Usage text printed for --help or when required arguments are missing.
// Fixed: the --joiner example previously pointed at decoder.onnx.
const char *kUsage =
    R"(
Usage:
  ./bin/c-api-alsa \
    --tokens=/path/to/tokens.txt \
    --encoder=/path/to/encoder.onnx \
    --decoder=/path/to/decoder.onnx \
    --joiner=/path/to/joiner.onnx \
    device_name

The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
)";
  105 +
// Set to a nonzero value by the SIGINT handler; polled by the capture loop in
// main(). volatile sig_atomic_t (rather than plain bool) is the only object
// type the C/C++ standards guarantee can be safely written from a signal
// handler and observed by the interrupted program.
volatile sig_atomic_t stop = 0;
  107 +
  108 +static void Handler(int sig) {
  109 + stop = true;
  110 + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
  111 +}
  112 +
  113 +int32_t main(int32_t argc, char *argv[]) {
  114 + if (argc < 6) {
  115 + fprintf(stderr, "%s\n", kUsage);
  116 + exit(0);
  117 + }
  118 +
  119 + signal(SIGINT, Handler);
  120 +
  121 + SherpaOnnxOnlineRecognizerConfig config;
  122 + memset(&config, 0, sizeof(config));
  123 +
  124 + config.model_config.debug = 0;
  125 + config.model_config.num_threads = 1;
  126 + config.model_config.provider = "cpu";
  127 +
  128 + config.decoding_method = "greedy_search";
  129 +
  130 + config.max_active_paths = 4;
  131 +
  132 + config.feat_config.sample_rate = 16000;
  133 + config.feat_config.feature_dim = 80;
  134 +
  135 + config.enable_endpoint = 1;
  136 + config.rule1_min_trailing_silence = 2.4;
  137 + config.rule2_min_trailing_silence = 1.2;
  138 + config.rule3_min_utterance_length = 300;
  139 +
  140 + cag_option_context context;
  141 + char identifier;
  142 + const char *value;
  143 +
  144 + cag_option_prepare(&context, options, CAG_ARRAY_SIZE(options), argc, argv);
  145 +
  146 + while (cag_option_fetch(&context)) {
  147 + identifier = cag_option_get(&context);
  148 + value = cag_option_get_value(&context);
  149 + switch (identifier) {
  150 + case 't':
  151 + config.model_config.tokens = value;
  152 + break;
  153 + case 'e':
  154 + config.model_config.transducer.encoder = value;
  155 + break;
  156 + case 'd':
  157 + config.model_config.transducer.decoder = value;
  158 + break;
  159 + case 'j':
  160 + config.model_config.transducer.joiner = value;
  161 + break;
  162 + case 'n':
  163 + config.model_config.num_threads = atoi(value);
  164 + break;
  165 + case 'p':
  166 + config.model_config.provider = value;
  167 + break;
  168 + case 'm':
  169 + config.decoding_method = value;
  170 + break;
  171 + case 'f':
  172 + config.hotwords_file = value;
  173 + break;
  174 + case 's':
  175 + config.hotwords_score = atof(value);
  176 + break;
  177 + case 'h': {
  178 + fprintf(stderr, "%s\n", kUsage);
  179 + exit(0);
  180 + break;
  181 + }
  182 + default:
  183 + // do nothing as config already has valid default values
  184 + break;
  185 + }
  186 + }
  187 +
  188 + SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config);
  189 + SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
  190 +
  191 + SherpaOnnxDisplay *display = CreateDisplay(50);
  192 + int32_t segment_id = 0;
  193 +
  194 + const char *device_name = argv[context.index];
  195 + sherpa_onnx::Alsa alsa(device_name);
  196 + fprintf(stderr, "Use recording device: %s\n", device_name);
  197 + fprintf(stderr,
  198 + "Please \033[32m\033[1mspeak\033[0m! Press \033[31m\033[1mCtrl + "
  199 + "C\033[0m to exit\n");
  200 +
  201 + int32_t expected_sample_rate = 16000;
  202 +
  203 + if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
  204 + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
  205 + expected_sample_rate);
  206 + exit(-1);
  207 + }
  208 +
  209 + int32_t chunk = 0.1 * alsa.GetActualSampleRate();
  210 +
  211 + std::string last_text;
  212 +
  213 + int32_t segment_index = 0;
  214 +
  215 + while (!stop) {
  216 + const std::vector<float> &samples = alsa.Read(chunk);
  217 + AcceptWaveform(stream, expected_sample_rate, samples.data(),
  218 + samples.size());
  219 + while (IsOnlineStreamReady(recognizer, stream)) {
  220 + DecodeOnlineStream(recognizer, stream);
  221 + }
  222 +
  223 + const SherpaOnnxOnlineRecognizerResult *r =
  224 + GetOnlineStreamResult(recognizer, stream);
  225 +
  226 + std::string text = r->text;
  227 + DestroyOnlineRecognizerResult(r);
  228 +
  229 + if (!text.empty() && last_text != text) {
  230 + last_text = text;
  231 +
  232 + std::transform(text.begin(), text.end(), text.begin(),
  233 + [](auto c) { return std::tolower(c); });
  234 +
  235 + SherpaOnnxPrint(display, segment_index, text.c_str());
  236 + fflush(stderr);
  237 + }
  238 +
  239 + if (IsEndpoint(recognizer, stream)) {
  240 + if (!text.empty()) {
  241 + ++segment_index;
  242 + }
  243 + Reset(recognizer, stream);
  244 + }
  245 + }
  246 +
  247 + // free allocated resources
  248 + DestroyDisplay(display);
  249 + DestroyOnlineStream(stream);
  250 + DestroyOnlineRecognizer(recognizer);
  251 + fprintf(stderr, "\n");
  252 +
  253 + return 0;
  254 +}
@@ -157,7 +157,7 @@ int32_t main(int32_t argc, char *argv[]) {
157 break; 157 break;
158 } 158 }
159 default: 159 default:
160 - // do nothing as config already have valid default values 160 + // do nothing as config already has valid default values
161 break; 161 break;
162 } 162 }
163 } 163 }