Fangjun Kuang
Committed by GitHub

Add Python ASR examples with alsa (#646)

... ... @@ -146,6 +146,7 @@ include(CheckIncludeFileCXX)
if(UNIX AND NOT APPLE AND NOT SHERPA_ONNX_ENABLE_WASM AND NOT CMAKE_SYSTEM_NAME STREQUAL Android)
check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
if(SHERPA_ONNX_HAS_ALSA)
message(STATUS "With Alsa")
add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
else()
message(WARNING "\
... ...
#!/usr/bin/env python3
# Real-time speech recognition from a microphone using the sherpa-onnx
# Python API, with endpoint detection.
#
# Note: This script uses ALSA and therefore works only on Linux; it is
# intended especially for embedded Linux systems and for Linux running
# on Windows via WSL.
#
# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
# to download pre-trained models
import argparse
from pathlib import Path

import sherpa_onnx
def assert_file_exists(filename: str):
assert Path(filename).is_file(), (
f"{filename} does not exist!\n"
"Please refer to "
"https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
)
def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--tokens",
type=str,
required=True,
help="Path to tokens.txt",
)
parser.add_argument(
"--encoder",
type=str,
required=True,
help="Path to the encoder model",
)
parser.add_argument(
"--decoder",
type=str,
required=True,
help="Path to the decoder model",
)
parser.add_argument(
"--joiner",
type=str,
required=True,
help="Path to the joiner model",
)
parser.add_argument(
"--decoding-method",
type=str,
default="greedy_search",
help="Valid values are greedy_search and modified_beam_search",
)
parser.add_argument(
"--provider",
type=str,
default="cpu",
help="Valid values: cpu, cuda, coreml",
)
parser.add_argument(
"--hotwords-file",
type=str,
default="",
help="""
The file containing hotwords, one word/phrase per line. Within each
phrase, the BPE pieces or CJK characters are separated by spaces. For
example:
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--hotwords-score",
type=float,
default=1.5,
help="""
The boosting score applied to each token of a hotword, used to bias
decoding towards the given words/phrases. Used only if
--hotwords-file is given.
""",
)
parser.add_argument(
"--blank-penalty",
type=float,
default=0.0,
help="""
The penalty applied to the blank symbol during decoding.
Note: It is a positive value subtracted from the blank logit, i.e.,
`logits[:, 0] -= blank_penalty` (assuming logits.shape is
[batch_size, vocab_size] and the blank id is 0).
""",
)
parser.add_argument(
"--device-name",
type=str,
required=True,
help="""
The device name specifies which microphone to use in case there are several
on your system. You can use
arecord -l
to find all available microphones on your computer. For instance, if it outputs
**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
Subdevices: 1/1
Subdevice #0: subdevice #0
and if you want to select card 3 and the device 0 on that card, please use:
plughw:3,0
as the device_name.
""",
)
return parser.parse_args()
def create_recognizer(args):
assert_file_exists(args.encoder)
assert_file_exists(args.decoder)
assert_file_exists(args.joiner)
assert_file_exists(args.tokens)
# Please replace the model files if needed.
# See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
# for download links.
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
tokens=args.tokens,
encoder=args.encoder,
decoder=args.decoder,
joiner=args.joiner,
num_threads=1,
sample_rate=16000,
feature_dim=80,
enable_endpoint_detection=True,
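        # Per the sherpa-onnx endpointing docs, an endpoint is declared when
        # any one of the three rules below fires:
        #   rule1: >= 2.4 s of trailing silence, even if nothing was decoded
        #   rule2: >= 1.2 s of trailing silence after something was decoded
        #   rule3: the utterance exceeds 300 s, regardless of silence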
rule1_min_trailing_silence=2.4,
rule2_min_trailing_silence=1.2,
rule3_min_utterance_length=300, # it essentially disables this rule
decoding_method=args.decoding_method,
provider=args.provider,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
blank_penalty=args.blank_penalty,
)
return recognizer
def main():
args = get_args()
device_name = args.device_name
print(f"device_name: {device_name}")
alsa = sherpa_onnx.Alsa(device_name)
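    # The pybind wrapper (see alsa.cc below) also exposes
    # alsa.expected_sample_rate and alsa.actual_sample_rate; presumably the
    # C++ side resamples to the expected 16 kHz when the device cannot
    # capture at that rate, so feeding 16 kHz samples to the stream is safe.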
print("Creating recognizer")
recognizer = create_recognizer(args)
print("Started! Please speak")
sample_rate = 16000
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
while True:
samples = alsa.read(samples_per_read) # a blocking read
stream.accept_waveform(sample_rate, samples)
while recognizer.is_ready(stream):
recognizer.decode_stream(stream)
is_endpoint = recognizer.is_endpoint(stream)
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
recognizer.reset(stream)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\nCaught Ctrl + C. Exiting")
... ...
... ... @@ -16,7 +16,7 @@
#endif
#if __ANDROID_API__ >= 27
#include "nnapi_provider_factory.h"
#include "nnapi_provider_factory.h" // NOLINT
#endif
namespace sherpa_onnx {
... ...
... ... @@ -276,7 +276,7 @@ as the device_name.
}
}
-using namespace std::chrono_literals;
+using namespace std::chrono_literals;  // NOLINT
std::this_thread::sleep_for(20ms); // sleep for 20ms
}
... ...
... ... @@ -192,7 +192,7 @@ as the device_name.
}
}
-using namespace std::chrono_literals;
+using namespace std::chrono_literals;  // NOLINT
std::this_thread::sleep_for(20ms); // sleep for 20ms
}
t.join();
... ...
... ... @@ -53,10 +53,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
and if you want to select card 3 and the device 0 on that card, please use:
-hw:3,0
-or
plughw:3,0
as the device_name.
... ...
include_directories(${CMAKE_SOURCE_DIR})
-pybind11_add_module(_sherpa_onnx
+set(srcs
circular-buffer.cc
display.cc
endpoint.cc
... ... @@ -37,6 +37,13 @@ pybind11_add_module(_sherpa_onnx
vad-model.cc
voice-activity-detector.cc
)
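# When alsa/asoundlib.h was found at configure time, bind the real ALSA
# wrapper from csrc; otherwise compile a stub binding that reports a
# runtime error (see faked-alsa.cc below).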
if(SHERPA_ONNX_HAS_ALSA)
list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc)
else()
list(APPEND srcs faked-alsa.cc)
endif()
pybind11_add_module(_sherpa_onnx ${srcs})
if(APPLE)
execute_process(
... ... @@ -54,6 +61,14 @@ endif()
target_link_libraries(_sherpa_onnx PRIVATE sherpa-onnx-core)
if(SHERPA_ONNX_HAS_ALSA)
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
target_link_libraries(_sherpa_onnx PRIVATE -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
else()
target_link_libraries(_sherpa_onnx PRIVATE asound)
endif()
endif()
install(TARGETS _sherpa_onnx
DESTINATION ../
)
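
The SHERPA_ONNX_ALSA_LIB_DIR branch above appears intended for
cross-compilation, where libasound lives in a target sysroot rather than in
a standard location on the build machine. A hypothetical usage sketch (the
sysroot path is an assumption, not from the commit):

export SHERPA_ONNX_ALSA_LIB_DIR=/path/to/sysroot/usr/lib
# ...then build the Python wheel as usual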
... ...
// sherpa-onnx/python/csrc/alsa.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/alsa.h"
#include <vector>
#include "sherpa-onnx/csrc/alsa.h"
namespace sherpa_onnx {
void PybindAlsa(py::module *m) {
using PyClass = Alsa;
py::class_<PyClass>(*m, "Alsa")
.def(py::init<const char *>(), py::arg("device_name"),
py::call_guard<py::gil_scoped_release>())
.def(
"read",
[](PyClass &self, int32_t num_samples) -> std::vector<float> {
return self.Read(num_samples);
},
py::arg("num_samples"), py::call_guard<py::gil_scoped_release>())
.def_property_readonly("expected_sample_rate",
&PyClass::GetExpectedSampleRate)
.def_property_readonly("actual_sample_rate",
&PyClass::GetActualSampleRate);
}
} // namespace sherpa_onnx
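
A minimal sketch of how the binding above can be driven from Python, assuming
a device name taken from `arecord -l` (plughw:3,0 here is an example, not
from the commit). Note that both the constructor and read() release the GIL
via py::gil_scoped_release, so other Python threads keep running during the
blocking capture:

import sherpa_onnx

alsa = sherpa_onnx.Alsa("plughw:3,0")
print("expected sample rate:", alsa.expected_sample_rate)
print("actual sample rate:", alsa.actual_sample_rate)

# Read() returns a std::vector<float>, which pybind11 converts to a Python
# list of floats; here we read 100 ms worth of samples at the expected rate.
samples = alsa.read(int(0.1 * alsa.expected_sample_rate))
print("got", len(samples), "samples")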
... ...
// sherpa-onnx/python/csrc/alsa.h
//
// Copyright (c) 2024 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
#define SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
void PybindAlsa(py::module *m);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_PYTHON_CSRC_ALSA_H_
... ...
// sherpa-onnx/python/csrc/faked-alsa.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/python/csrc/alsa.h"
namespace sherpa_onnx {
class FakedAlsa {
public:
explicit FakedAlsa(const char *) {
SHERPA_ONNX_LOGE("This function is for Linux only.");
#if (SHERPA_ONNX_ENABLE_ALSA == 0) && (defined(__unix__) || defined(__unix))
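  // This extra hint is printed only on Unix-like systems, where ALSA support
  // could have been enabled at build time but was not.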
SHERPA_ONNX_LOGE(R"doc(
sherpa-onnx was compiled without ALSA support. To enable it, please run
(1) sudo apt-get install alsa-utils libasound2-dev
(2) rebuild sherpa-onnx
)doc");
#endif
exit(-1);
}
std::vector<float> Read(int32_t) const { return {}; }
int32_t GetExpectedSampleRate() const { return -1; }
int32_t GetActualSampleRate() const { return -1; }
};
void PybindAlsa(py::module *m) {
using PyClass = FakedAlsa;
py::class_<PyClass>(*m, "Alsa")
.def(py::init<const char *>(), py::arg("device_name"))
.def(
"read",
[](PyClass &self, int32_t num_samples) -> std::vector<float> {
return self.Read(num_samples);
},
py::arg("num_samples"), py::call_guard<py::gil_scoped_release>())
.def_property_readonly("expected_sample_rate",
&PyClass::GetExpectedSampleRate)
.def_property_readonly("actual_sample_rate",
&PyClass::GetActualSampleRate);
}
} // namespace sherpa_onnx
... ...
... ... @@ -4,6 +4,7 @@
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
#include "sherpa-onnx/python/csrc/alsa.h"
#include "sherpa-onnx/python/csrc/circular-buffer.h"
#include "sherpa-onnx/python/csrc/display.h"
#include "sherpa-onnx/python/csrc/endpoint.h"
... ... @@ -54,6 +55,8 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
PybindOfflineTts(&m);
PybindSpeakerEmbeddingExtractor(&m);
PybindSpeakerEmbeddingManager(&m);
PybindAlsa(&m);
}
} // namespace sherpa_onnx
... ...
from _sherpa_onnx import (
Alsa,
CircularBuffer,
Display,
OfflineStream,
... ...