正在显示 12 个修改的文件，包含 326 行增加和 10 行删除
| @@ -146,6 +146,7 @@ include(CheckIncludeFileCXX) | @@ -146,6 +146,7 @@ include(CheckIncludeFileCXX) | ||
| 146 | if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME STREQUAL Android) | 146 | if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME STREQUAL Android) |
| 147 | check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA) | 147 | check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA) |
| 148 | if(SHERPA_ONNX_HAS_ALSA) | 148 | if(SHERPA_ONNX_HAS_ALSA) |
| 149 | + message(STATUS "With Alsa") | ||
| 149 | add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1) | 150 | add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1) |
| 150 | else() | 151 | else() |
| 151 | message(WARNING "\ | 152 | message(WARNING "\ |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +# Real-time speech recognition from a microphone with sherpa-onnx Python API | ||
| 4 | +# with endpoint detection. | ||
| 5 | +# | ||
| 6 | +# Note: This script uses ALSA and works only on Linux systems, especially | ||
| 7 | +# for embedding Linux systems and for running Linux on Windows using WSL. | ||
| 8 | +# | ||
| 9 | +# Please refer to | ||
| 10 | +# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html | ||
| 11 | +# to download pre-trained models | ||
| 12 | + | ||
| 13 | +import argparse | ||
| 14 | +import sys | ||
| 15 | +from pathlib import Path | ||
| 16 | +import sherpa_onnx | ||
| 17 | + | ||
| 18 | + | ||
def assert_file_exists(filename: str) -> None:
    """Abort with a helpful message unless *filename* is an existing file.

    Args:
        filename: Path to check.

    Raises:
        AssertionError: if ``filename`` does not point to a regular file.
            The message names the missing file and where to download models.
    """
    # Bug fix: the message previously was an f-string with no placeholder,
    # so the user was never told WHICH file was missing.
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
| 25 | + | ||
| 26 | + | ||
def get_args():
    """Parse the command-line arguments for this recognition script."""
    p = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Required model/token files.
    p.add_argument("--tokens", type=str, required=True, help="Path to tokens.txt")
    p.add_argument(
        "--encoder", type=str, required=True, help="Path to the encoder model"
    )
    p.add_argument(
        "--decoder", type=str, required=True, help="Path to the decoder model"
    )
    p.add_argument(
        "--joiner", type=str, required=True, help="Path to the joiner model"
    )

    # Decoding configuration.
    p.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="Valid values are greedy_search and modified_beam_search",
    )
    p.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="Valid values: cpu, cuda, coreml",
    )

    # Optional hotword biasing.
    p.add_argument(
        "--hotwords-file",
        type=str,
        default="",
        help="""
        The file containing hotwords, one words/phrases per line, and for each
        phrase the bpe/cjkchar are separated by a space. For example:

        ▁HE LL O ▁WORLD
        你 好 世 界
        """,
    )
    p.add_argument(
        "--hotwords-score",
        type=float,
        default=1.5,
        help="""
        The hotword score of each token for biasing word/phrase. Used only if
        --hotwords-file is given.
        """,
    )
    p.add_argument(
        "--blank-penalty",
        type=float,
        default=0.0,
        help="""
        The penalty applied on blank symbol during decoding.
        Note: It is a positive value that would be applied to logits like
        this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
        [batch_size, vocab] and blank id is 0).
        """,
    )

    # Which microphone to capture from.
    p.add_argument(
        "--device-name",
        type=str,
        required=True,
        help="""
The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
    """,
    )

    return p.parse_args()
| 135 | + | ||
| 136 | + | ||
def create_recognizer(args):
    """Build an online transducer recognizer from the parsed CLI arguments.

    All model/token files are validated before construction. See
    https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    for download links if you need different models.
    """
    for required_file in (args.encoder, args.decoder, args.joiner, args.tokens):
        assert_file_exists(required_file)

    return sherpa_onnx.OnlineRecognizer.from_transducer(
        tokens=args.tokens,
        encoder=args.encoder,
        decoder=args.decoder,
        joiner=args.joiner,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        rule3_min_utterance_length=300,  # it essentially disables this rule
        decoding_method=args.decoding_method,
        provider=args.provider,
        hotwords_file=args.hotwords_file,
        hotwords_score=args.hotwords_score,
        blank_penalty=args.blank_penalty,
    )
| 164 | + | ||
| 165 | + | ||
def main():
    """Capture microphone audio via ALSA and print transcripts as they evolve.

    Runs forever: partial results overwrite the current console line; when an
    endpoint is detected the final result is committed and a new segment starts.
    """
    args = get_args()
    mic = args.device_name
    print("device_name:", mic)
    capture = sherpa_onnx.Alsa(mic)

    print("Creating recognizer")
    recognizer = create_recognizer(args)
    print("Started! Please speak")

    sample_rate = 16000
    chunk_size = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    stream = recognizer.create_stream()

    shown = ""  # transcript most recently printed for the current segment
    segment = 0  # index of the current utterance
    while True:
        audio = capture.read(chunk_size)  # a blocking read
        stream.accept_waveform(sample_rate, audio)
        while recognizer.is_ready(stream):
            recognizer.decode_stream(stream)

        endpoint_reached = recognizer.is_endpoint(stream)
        text = recognizer.get_result(stream)

        # Redraw the line only when the partial result actually changed.
        if text and text != shown:
            shown = text
            print("\r{}:{}".format(segment, text), end="", flush=True)
        if endpoint_reached:
            if text:
                print("\r{}:{}".format(segment, text), flush=True)
                segment += 1
            recognizer.reset(stream)
| 201 | + | ||
| 202 | + | ||
if __name__ == "__main__":
    # Run until the user interrupts; Ctrl+C exits cleanly without a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
| @@ -276,8 +276,8 @@ as the device_name. | @@ -276,8 +276,8 @@ as the device_name. | ||
| 276 | } | 276 | } |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | - using namespace std::chrono_literals; | ||
| 280 | - std::this_thread::sleep_for(20ms); // sleep for 20ms | 279 | + using namespace std::chrono_literals; // NOLINT |
| 280 | + std::this_thread::sleep_for(20ms); // sleep for 20ms | ||
| 281 | } | 281 | } |
| 282 | 282 | ||
| 283 | t.join(); | 283 | t.join(); |
| @@ -192,8 +192,8 @@ as the device_name. | @@ -192,8 +192,8 @@ as the device_name. | ||
| 192 | } | 192 | } |
| 193 | } | 193 | } |
| 194 | 194 | ||
| 195 | - using namespace std::chrono_literals; | ||
| 196 | - std::this_thread::sleep_for(20ms); // sleep for 20ms | 195 | + using namespace std::chrono_literals; // NOLINT |
| 196 | + std::this_thread::sleep_for(20ms); // sleep for 20ms | ||
| 197 | } | 197 | } |
| 198 | t.join(); | 198 | t.join(); |
| 199 | t2.join(); | 199 | t2.join(); |
| @@ -53,10 +53,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | @@ -53,10 +53,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 53 | 53 | ||
| 54 | and if you want to select card 3 and the device 0 on that card, please use: | 54 | and if you want to select card 3 and the device 0 on that card, please use: |
| 55 | 55 | ||
| 56 | - hw:3,0 | ||
| 57 | - | ||
| 58 | -or | ||
| 59 | - | ||
| 60 | plughw:3,0 | 56 | plughw:3,0 |
| 61 | 57 | ||
| 62 | as the device_name. | 58 | as the device_name. |
| 1 | include_directories(${CMAKE_SOURCE_DIR}) | 1 | include_directories(${CMAKE_SOURCE_DIR}) |
| 2 | 2 | ||
| 3 | -pybind11_add_module(_sherpa_onnx | 3 | +set(srcs |
| 4 | circular-buffer.cc | 4 | circular-buffer.cc |
| 5 | display.cc | 5 | display.cc |
| 6 | endpoint.cc | 6 | endpoint.cc |
| @@ -37,6 +37,13 @@ pybind11_add_module(_sherpa_onnx | @@ -37,6 +37,13 @@ pybind11_add_module(_sherpa_onnx | ||
| 37 | vad-model.cc | 37 | vad-model.cc |
| 38 | voice-activity-detector.cc | 38 | voice-activity-detector.cc |
| 39 | ) | 39 | ) |
| 40 | +if(SHERPA_ONNX_HAS_ALSA) | ||
| 41 | + list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc) | ||
| 42 | +else() | ||
| 43 | + list(APPEND srcs faked-alsa.cc) | ||
| 44 | +endif() | ||
| 45 | + | ||
| 46 | +pybind11_add_module(_sherpa_onnx ${srcs}) | ||
| 40 | 47 | ||
| 41 | if(APPLE) | 48 | if(APPLE) |
| 42 | execute_process( | 49 | execute_process( |
| @@ -54,6 +61,14 @@ endif() | @@ -54,6 +61,14 @@ endif() | ||
| 54 | 61 | ||
| 55 | target_link_libraries(_sherpa_onnx PRIVATE sherpa-onnx-core) | 62 | target_link_libraries(_sherpa_onnx PRIVATE sherpa-onnx-core) |
| 56 | 63 | ||
| 64 | +if(SHERPA_ONNX_HAS_ALSA) | ||
| 65 | + if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) | ||
| 66 | + target_link_libraries(_sherpa_onnx PRIVATE -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) | ||
| 67 | + else() | ||
| 68 | + target_link_libraries(_sherpa_onnx PRIVATE asound) | ||
| 69 | + endif() | ||
| 70 | +endif() | ||
| 71 | + | ||
| 57 | install(TARGETS _sherpa_onnx | 72 | install(TARGETS _sherpa_onnx |
| 58 | DESTINATION ../ | 73 | DESTINATION ../ |
| 59 | ) | 74 | ) |
sherpa-onnx/python/csrc/alsa.cc
0 → 100644
// sherpa-onnx/python/csrc/alsa.cc
//
// Copyright (c) 2024 Xiaomi Corporation

#include "sherpa-onnx/python/csrc/alsa.h"

#include <vector>

#include "sherpa-onnx/csrc/alsa.h"

namespace sherpa_onnx {

// Exposes the C++ Alsa capture class to Python as `sherpa_onnx.Alsa`.
// The GIL is released both in the constructor and in read() so that other
// Python threads can keep running — presumably because device open/read can
// block on audio hardware (NOTE(review): confirm against csrc/alsa.cc).
void PybindAlsa(py::module *m) {
  using PyClass = Alsa;
  py::class_<PyClass>(*m, "Alsa")
      .def(py::init<const char *>(), py::arg("device_name"),
           py::call_guard<py::gil_scoped_release>())
      .def(
          "read",
          // Returns `num_samples` mono float samples from the capture device.
          [](PyClass &self, int32_t num_samples) -> std::vector<float> {
            return self.Read(num_samples);
          },
          py::arg("num_samples"), py::call_guard<py::gil_scoped_release>())
      .def_property_readonly("expected_sample_rate",
                             &PyClass::GetExpectedSampleRate)
      .def_property_readonly("actual_sample_rate",
                             &PyClass::GetActualSampleRate);
}

}  // namespace sherpa_onnx
sherpa-onnx/python/csrc/alsa.h
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/alsa.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_ALSA_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_ALSA_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindAlsa(py::module *m); | ||
| 13 | + | ||
| 14 | +} // namespace sherpa_onnx | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_ALSA_H_ |
sherpa-onnx/python/csrc/faked-alsa.cc
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/faked-alsa.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 6 | +#include "sherpa-onnx/python/csrc/alsa.h" | ||
| 7 | + | ||
| 8 | +namespace sherpa_onnx { | ||
| 9 | + | ||
| 10 | +class FakedAlsa { | ||
| 11 | + public: | ||
| 12 | + explicit FakedAlsa(const char *) { | ||
| 13 | + SHERPA_ONNX_LOGE("This function is for Linux only."); | ||
| 14 | +#if (SHERPA_ONNX_ENABLE_ALSA == 0) && (defined(__unix__) || defined(__unix)) | ||
| 15 | + SHERPA_ONNX_LOGE(R"doc( | ||
| 16 | +sherpa-onnx is compiled without alsa support. To enable that, please run | ||
| 17 | + (1) sudo apt-get install alsa-utils libasound2-dev | ||
| 18 | + (2) rebuild sherpa-onnx | ||
| 19 | +)doc"); | ||
| 20 | +#endif | ||
| 21 | + exit(-1); | ||
| 22 | + } | ||
| 23 | + | ||
| 24 | + std::vector<float> Read(int32_t) const { return {}; } | ||
| 25 | + int32_t GetExpectedSampleRate() const { return -1; } | ||
| 26 | + int32_t GetActualSampleRate() const { return -1; } | ||
| 27 | +}; | ||
| 28 | + | ||
| 29 | +void PybindAlsa(py::module *m) { | ||
| 30 | + using PyClass = FakedAlsa; | ||
| 31 | + py::class_<PyClass>(*m, "Alsa") | ||
| 32 | + .def(py::init<const char *>(), py::arg("device_name")) | ||
| 33 | + .def( | ||
| 34 | + "read", | ||
| 35 | + [](PyClass &self, int32_t num_samples) -> std::vector<float> { | ||
| 36 | + return self.Read(num_samples); | ||
| 37 | + }, | ||
| 38 | + py::arg("num_samples"), py::call_guard<py::gil_scoped_release>()) | ||
| 39 | + .def_property_readonly("expected_sample_rate", | ||
| 40 | + &PyClass::GetExpectedSampleRate) | ||
| 41 | + .def_property_readonly("actual_sample_rate", | ||
| 42 | + &PyClass::GetActualSampleRate); | ||
| 43 | +} | ||
| 44 | + | ||
| 45 | +} // namespace sherpa_onnx | ||
| 46 | + | ||
| 47 | +#endif // SHERPA_ONNX_PYTHON_CSRC_FAKED_ALSA_H_ |
| @@ -4,6 +4,7 @@ | @@ -4,6 +4,7 @@ | ||
| 4 | 4 | ||
| 5 | #include "sherpa-onnx/python/csrc/sherpa-onnx.h" | 5 | #include "sherpa-onnx/python/csrc/sherpa-onnx.h" |
| 6 | 6 | ||
| 7 | +#include "sherpa-onnx/python/csrc/alsa.h" | ||
| 7 | #include "sherpa-onnx/python/csrc/circular-buffer.h" | 8 | #include "sherpa-onnx/python/csrc/circular-buffer.h" |
| 8 | #include "sherpa-onnx/python/csrc/display.h" | 9 | #include "sherpa-onnx/python/csrc/display.h" |
| 9 | #include "sherpa-onnx/python/csrc/endpoint.h" | 10 | #include "sherpa-onnx/python/csrc/endpoint.h" |
| @@ -54,6 +55,8 @@ PYBIND11_MODULE(_sherpa_onnx, m) { | @@ -54,6 +55,8 @@ PYBIND11_MODULE(_sherpa_onnx, m) { | ||
| 54 | PybindOfflineTts(&m); | 55 | PybindOfflineTts(&m); |
| 55 | PybindSpeakerEmbeddingExtractor(&m); | 56 | PybindSpeakerEmbeddingExtractor(&m); |
| 56 | PybindSpeakerEmbeddingManager(&m); | 57 | PybindSpeakerEmbeddingManager(&m); |
| 58 | + | ||
| 59 | + PybindAlsa(&m); | ||
| 57 | } | 60 | } |
| 58 | 61 | ||
| 59 | } // namespace sherpa_onnx | 62 | } // namespace sherpa_onnx |
-
请注册或登录后发表评论