Fangjun Kuang
Committed by GitHub

Add Python ASR examples with alsa (#646)

@@ -146,6 +146,7 @@ include(CheckIncludeFileCXX)
146 if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME STREQUAL Android) 146 if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME STREQUAL Android)
147 check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA) 147 check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
148 if(SHERPA_ONNX_HAS_ALSA) 148 if(SHERPA_ONNX_HAS_ALSA)
  149 + message(STATUS "With Alsa")
149 add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1) 150 add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
150 else() 151 else()
151 message(WARNING "\ 152 message(WARNING "\
  1 +#!/usr/bin/env python3
  2 +
  3 +# Real-time speech recognition from a microphone with sherpa-onnx Python API
  4 +# with endpoint detection.
  5 +#
  6 +# Note: This script uses ALSA and works only on Linux systems, especially
  7 +# for embedding Linux systems and for running Linux on Windows using WSL.
  8 +#
  9 +# Please refer to
  10 +# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
  11 +# to download pre-trained models
  12 +
  13 +import argparse
  14 +import sys
  15 +from pathlib import Path
  16 +import sherpa_onnx
  17 +
  18 +
def assert_file_exists(filename: str):
    """Abort with a helpful message if *filename* is not an existing file.

    The message includes the offending path so the user knows which of the
    four model files (tokens/encoder/decoder/joiner) is missing.
    """
    # Bug fix: the message previously printed the literal "(unknown)" instead
    # of the actual path; interpolate the filename so the error is actionable.
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
  25 +
  26 +
def get_args():
    """Define and parse the command-line arguments for this example.

    Returns the parsed ``argparse.Namespace``; all four model files plus
    ``--device-name`` are mandatory, everything else has a default.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # The four transducer model files are all required.
    for flag, description in (
        ("--tokens", "Path to tokens.txt"),
        ("--encoder", "Path to the encoder model"),
        ("--decoder", "Path to the decoder model"),
        ("--joiner", "Path to the joiner model"),
    ):
        parser.add_argument(flag, type=str, required=True, help=description)

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="Valid values are greedy_search and modified_beam_search",
    )

    parser.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="Valid values: cpu, cuda, coreml",
    )

    parser.add_argument(
        "--hotwords-file",
        type=str,
        default="",
        help="""
        The file containing hotwords, one words/phrases per line, and for each
        phrase the bpe/cjkchar are separated by a space. For example:

        ▁HE LL O ▁WORLD
        你 好 世 界
        """,
    )

    parser.add_argument(
        "--hotwords-score",
        type=float,
        default=1.5,
        help="""
        The hotword score of each token for biasing word/phrase. Used only if
        --hotwords-file is given.
        """,
    )

    parser.add_argument(
        "--blank-penalty",
        type=float,
        default=0.0,
        help="""
        The penalty applied on blank symbol during decoding.
        Note: It is a positive value that would be applied to logits like
        this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
        [batch_size, vocab] and blank id is 0).
        """,
    )

    parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help="""
The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
        """,
    )

    return parser.parse_args()
  135 +
  136 +
def create_recognizer(args):
    """Build a streaming transducer recognizer from the parsed arguments.

    Verifies all four model files exist first so the user gets a clear
    error instead of an onnxruntime load failure.
    """
    for path in (args.encoder, args.decoder, args.joiner, args.tokens):
        assert_file_exists(path)
    # Please replace the model files if needed.
    # See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    # for download links.
    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=args.tokens,
        encoder=args.encoder,
        decoder=args.decoder,
        joiner=args.joiner,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        rule3_min_utterance_length=300,  # it essentially disables this rule
        decoding_method=args.decoding_method,
        provider=args.provider,
        hotwords_file=args.hotwords_file,
        hotwords_score=args.hotwords_score,
        blank_penalty=args.blank_penalty,
    )
  164 +
  165 +
def main():
    """Capture microphone audio via ALSA and print streaming ASR results.

    Partial results are redrawn in place with a carriage return; when an
    endpoint is detected the final text is committed on its own line and
    the stream is reset for the next utterance.
    """
    args = get_args()
    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    print("Creating recognizer")
    recognizer = create_recognizer(args)
    print("Started! Please speak")

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    stream = recognizer.create_stream()

    previous_text = ""
    utterance_index = 0
    while True:
        # Blocks until samples_per_read samples are available.
        chunk = alsa.read(samples_per_read)
        stream.accept_waveform(sample_rate, chunk)
        while recognizer.is_ready(stream):
            recognizer.decode_stream(stream)

        endpoint_reached = recognizer.is_endpoint(stream)
        text = recognizer.get_result(stream)

        if text and text != previous_text:
            previous_text = text
            # Overwrite the current line with the latest partial result.
            print(f"\r{utterance_index}:{text}", end="", flush=True)
        if endpoint_reached:
            if text:
                print(f"\r{utterance_index}:{text}", flush=True)
                utterance_index += 1
            recognizer.reset(stream)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
@@ -16,7 +16,7 @@
16 #endif 16 #endif
17 17
18 #if __ANDROID_API__ >= 27 18 #if __ANDROID_API__ >= 27
19 -#include "nnapi_provider_factory.h" 19 +#include "nnapi_provider_factory.h" // NOLINT
20 #endif 20 #endif
21 21
22 namespace sherpa_onnx { 22 namespace sherpa_onnx {
@@ -276,8 +276,8 @@ as the device_name. @@ -276,8 +276,8 @@ as the device_name.
276 } 276 }
277 } 277 }
278 278
279 - using namespace std::chrono_literals;  
280 - std::this_thread::sleep_for(20ms); // sleep for 20ms 279 + using namespace std::chrono_literals; // NOLINT
  280 + std::this_thread::sleep_for(20ms); // sleep for 20ms
281 } 281 }
282 282
283 t.join(); 283 t.join();
@@ -192,8 +192,8 @@ as the device_name. @@ -192,8 +192,8 @@ as the device_name.
192 } 192 }
193 } 193 }
194 194
195 - using namespace std::chrono_literals;  
196 - std::this_thread::sleep_for(20ms); // sleep for 20ms 195 + using namespace std::chrono_literals; // NOLINT
  196 + std::this_thread::sleep_for(20ms); // sleep for 20ms
197 } 197 }
198 t.join(); 198 t.join();
199 t2.join(); 199 t2.join();
@@ -53,10 +53,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] @@ -53,10 +53,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
53 53
54 and if you want to select card 3 and the device 0 on that card, please use: 54 and if you want to select card 3 and the device 0 on that card, please use:
55 55
56 - hw:3,0  
57 -  
58 -or  
59 -  
60 plughw:3,0 56 plughw:3,0
61 57
62 as the device_name. 58 as the device_name.
1 include_directories(${CMAKE_SOURCE_DIR}) 1 include_directories(${CMAKE_SOURCE_DIR})
2 2
3 -pybind11_add_module(_sherpa_onnx 3 +set(srcs
4 circular-buffer.cc 4 circular-buffer.cc
5 display.cc 5 display.cc
6 endpoint.cc 6 endpoint.cc
@@ -37,6 +37,13 @@ pybind11_add_module(_sherpa_onnx @@ -37,6 +37,13 @@ pybind11_add_module(_sherpa_onnx
37 vad-model.cc 37 vad-model.cc
38 voice-activity-detector.cc 38 voice-activity-detector.cc
39 ) 39 )
  40 +if(SHERPA_ONNX_HAS_ALSA)
  41 + list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc)
  42 +else()
  43 + list(APPEND srcs faked-alsa.cc)
  44 +endif()
  45 +
  46 +pybind11_add_module(_sherpa_onnx ${srcs})
40 47
41 if(APPLE) 48 if(APPLE)
42 execute_process( 49 execute_process(
@@ -54,6 +61,14 @@ endif() @@ -54,6 +61,14 @@ endif()
54 61
55 target_link_libraries(_sherpa_onnx PRIVATE sherpa-onnx-core) 62 target_link_libraries(_sherpa_onnx PRIVATE sherpa-onnx-core)
56 63
  64 +if(SHERPA_ONNX_HAS_ALSA)
  65 + if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
  66 + target_link_libraries(_sherpa_onnx PRIVATE -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
  67 + else()
  68 + target_link_libraries(_sherpa_onnx PRIVATE asound)
  69 + endif()
  70 +endif()
  71 +
57 install(TARGETS _sherpa_onnx 72 install(TARGETS _sherpa_onnx
58 DESTINATION ../ 73 DESTINATION ../
59 ) 74 )
// sherpa-onnx/python/csrc/alsa.cc
//
// Copyright (c)  2024  Xiaomi Corporation

#include "sherpa-onnx/python/csrc/alsa.h"

#include <vector>

#include "sherpa-onnx/csrc/alsa.h"

namespace sherpa_onnx {

// Exposes the C++ ALSA microphone reader to Python as `_sherpa_onnx.Alsa`.
void PybindAlsa(py::module *m) {
  using PyClass = Alsa;
  py::class_<PyClass>(*m, "Alsa")
      // The GIL is released while constructing and while reading so other
      // Python threads keep running during blocking audio capture.
      .def(py::init<const char *>(), py::arg("device_name"),
           py::call_guard<py::gil_scoped_release>())
      .def(
          "read",
          // Blocking read of num_samples audio samples as float32 values.
          [](PyClass &self, int32_t num_samples) -> std::vector<float> {
            return self.Read(num_samples);
          },
          py::arg("num_samples"), py::call_guard<py::gil_scoped_release>())
      // Sample rate the caller asked for vs. the one the device delivers.
      .def_property_readonly("expected_sample_rate",
                             &PyClass::GetExpectedSampleRate)
      .def_property_readonly("actual_sample_rate",
                             &PyClass::GetActualSampleRate);
}

}  // namespace sherpa_onnx
// sherpa-onnx/python/csrc/alsa.h
//
// Copyright (c)  2024  Xiaomi Corporation

#ifndef SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
#define SHERPA_ONNX_PYTHON_CSRC_ALSA_H_

#include "sherpa-onnx/python/csrc/sherpa-onnx.h"

namespace sherpa_onnx {

// Registers the `Alsa` class (microphone capture) on pybind11 module *m*.
// The real implementation lives in alsa.cc; a stub that aborts with a hint
// is provided by faked-alsa.cc when ALSA support is not compiled in.
void PybindAlsa(py::module *m);

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
  1 +// sherpa-onnx/python/csrc/faked-alsa.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/macros.h"
  6 +#include "sherpa-onnx/python/csrc/alsa.h"
  7 +
  8 +namespace sherpa_onnx {
  9 +
  10 +class FakedAlsa {
  11 + public:
  12 + explicit FakedAlsa(const char *) {
  13 + SHERPA_ONNX_LOGE("This function is for Linux only.");
  14 +#if (SHERPA_ONNX_ENABLE_ALSA == 0) && (defined(__unix__) || defined(__unix))
  15 + SHERPA_ONNX_LOGE(R"doc(
  16 +sherpa-onnx is compiled without alsa support. To enable that, please run
  17 + (1) sudo apt-get install alsa-utils libasound2-dev
  18 + (2) rebuild sherpa-onnx
  19 +)doc");
  20 +#endif
  21 + exit(-1);
  22 + }
  23 +
  24 + std::vector<float> Read(int32_t) const { return {}; }
  25 + int32_t GetExpectedSampleRate() const { return -1; }
  26 + int32_t GetActualSampleRate() const { return -1; }
  27 +};
  28 +
  29 +void PybindAlsa(py::module *m) {
  30 + using PyClass = FakedAlsa;
  31 + py::class_<PyClass>(*m, "Alsa")
  32 + .def(py::init<const char *>(), py::arg("device_name"))
  33 + .def(
  34 + "read",
  35 + [](PyClass &self, int32_t num_samples) -> std::vector<float> {
  36 + return self.Read(num_samples);
  37 + },
  38 + py::arg("num_samples"), py::call_guard<py::gil_scoped_release>())
  39 + .def_property_readonly("expected_sample_rate",
  40 + &PyClass::GetExpectedSampleRate)
  41 + .def_property_readonly("actual_sample_rate",
  42 + &PyClass::GetActualSampleRate);
  43 +}
  44 +
  45 +} // namespace sherpa_onnx
  46 +
  47 +#endif // SHERPA_ONNX_PYTHON_CSRC_FAKED_ALSA_H_
@@ -4,6 +4,7 @@ @@ -4,6 +4,7 @@
4 4
5 #include "sherpa-onnx/python/csrc/sherpa-onnx.h" 5 #include "sherpa-onnx/python/csrc/sherpa-onnx.h"
6 6
  7 +#include "sherpa-onnx/python/csrc/alsa.h"
7 #include "sherpa-onnx/python/csrc/circular-buffer.h" 8 #include "sherpa-onnx/python/csrc/circular-buffer.h"
8 #include "sherpa-onnx/python/csrc/display.h" 9 #include "sherpa-onnx/python/csrc/display.h"
9 #include "sherpa-onnx/python/csrc/endpoint.h" 10 #include "sherpa-onnx/python/csrc/endpoint.h"
@@ -54,6 +55,8 @@ PYBIND11_MODULE(_sherpa_onnx, m) { @@ -54,6 +55,8 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
54 PybindOfflineTts(&m); 55 PybindOfflineTts(&m);
55 PybindSpeakerEmbeddingExtractor(&m); 56 PybindSpeakerEmbeddingExtractor(&m);
56 PybindSpeakerEmbeddingManager(&m); 57 PybindSpeakerEmbeddingManager(&m);
  58 +
  59 + PybindAlsa(&m);
57 } 60 }
58 61
59 } // namespace sherpa_onnx 62 } // namespace sherpa_onnx
1 from _sherpa_onnx import ( 1 from _sherpa_onnx import (
  2 + Alsa,
2 CircularBuffer, 3 CircularBuffer,
3 Display, 4 Display,
4 OfflineStream, 5 OfflineStream,