Add VAD + Non-streaming ASR Python example. (#332)

Fangjun Kuang · GitHub
Commit 969fff56229209e704f573dd209b4085a6f31b8a 969fff56 1 parent cf199ad4
python-api-examples/README.md
python-api-examples/vad-with-non-streaming-asr.py
--- a/python-api-examples/README.md
查看文件 @969fff5
+++ b/python-api-examples/README.md
查看文件 @969fff5
@@ -7,3 +7,6 @@
 - [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses
   [silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech
   segments and concatenate all speech segments into a single one.
+ - [vad-with-non-streaming-asr.py](./vad-with-non-streaming-asr.py) It shows
+   how to use VAD with a non-streaming ASR model for speech recognition from
+   a microphone.
--- a/python-api-examples/vad-with-non-streaming-asr.py 0 → 100755
查看文件 @969fff5
+++ b/python-api-examples/vad-with-non-streaming-asr.py 0 → 100755
查看文件 @969fff5
+ #!/usr/bin/env python3
+ #
+ # Copyright (c)  2023  Xiaomi Corporation
+ 
+ """
+ This file demonstrates how to use sherpa-onnx Python APIs
+ with VAD and non-streaming ASR models for speech recognition
+ from a microphone.
+ 
+ Note that you need a non-streaming model for this script.
+ 
+ (1) For paraformer
+ 
+     ./python-api-examples/vad-with-non-streaming-asr.py  \
+       --silero-vad-model=/path/to/silero_vad.onnx \
+       --tokens=/path/to/tokens.txt \
+       --paraformer=/path/to/paraformer.onnx \
+       --num-threads=2 \
+       --decoding-method=greedy_search \
+       --debug=false \
+       --sample-rate=16000 \
+       --feature-dim=80
+ 
+ (2) For transducer models from icefall
+ 
+     ./python-api-examples/vad-with-non-streaming-asr.py  \
+       --silero-vad-model=/path/to/silero_vad.onnx \
+       --tokens=/path/to/tokens.txt \
+       --encoder=/path/to/encoder.onnx \
+       --decoder=/path/to/decoder.onnx \
+       --joiner=/path/to/joiner.onnx \
+       --num-threads=2 \
+       --decoding-method=greedy_search \
+       --debug=false \
+       --sample-rate=16000 \
+       --feature-dim=80
+ 
+ (3) For Whisper models
+ 
+ ./python-api-examples/vad-with-non-streaming-asr.py  \
+   --silero-vad-model=/path/to/silero_vad.onnx \
+   --whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
+   --whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
+   --tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
+   --whisper-task=transcribe \
+   --num-threads=2
+ 
+ Please refer to
+ https://k2-fsa.github.io/sherpa/onnx/index.html
+ to install sherpa-onnx and to download non-streaming pre-trained models
+ used in this file.
+ 
+ Please visit
+ https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+ to download silero_vad.onnx
+ 
+ For instance,
+ 
+ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+ """
+ import argparse
+ import sys
+ from pathlib import Path
+ 
+ import numpy as np
+ 
+ try:
+     import sounddevice as sd
+ except ImportError:
+     print("Please install sounddevice first. You can use")
+     print()
+     print("  pip install sounddevice")
+     print()
+     print("to install it")
+     sys.exit(-1)
+ 
+ import sherpa_onnx
+ 
+ 
+ def get_args():
+     """Parse the command-line options of this script.
+ 
+     Options are read from sys.argv. --silero-vad-model and --tokens are
+     mandatory; which of the model options (--encoder/--decoder/--joiner,
+     --paraformer, --whisper-encoder/--whisper-decoder) must be given depends
+     on the kind of non-streaming model being used.
+ 
+     Returns:
+       An argparse.Namespace holding the parsed options.
+     """
+ 
+     def str2bool(value):
+         """Convert a command-line string such as "true" or "false" to a bool.
+ 
+         argparse's type=bool calls bool() on the raw string, so every
+         non-empty value, including "false", would become True.
+         """
+         if isinstance(value, bool):
+             return value
+ 
+         lowered = value.strip().lower()
+         if lowered in ("true", "t", "yes", "y", "1"):
+             return True
+         # The empty string is kept falsy, as it was with bool("").
+         if lowered in ("false", "f", "no", "n", "0", ""):
+             return False
+ 
+         raise argparse.ArgumentTypeError(
+             f"Expected a boolean value such as true or false. Given: {value!r}"
+         )
+ 
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+ 
+     parser.add_argument(
+         "--silero-vad-model",
+         type=str,
+         required=True,
+         help="Path to silero_vad.onnx",
+     )
+ 
+     # tokens.txt is needed by every supported model type, so report a missing
+     # value here instead of failing later on a None path.
+     parser.add_argument(
+         "--tokens",
+         type=str,
+         required=True,
+         help="Path to tokens.txt",
+     )
+ 
+     parser.add_argument(
+         "--encoder",
+         default="",
+         type=str,
+         help="Path to the transducer encoder model",
+     )
+ 
+     parser.add_argument(
+         "--decoder",
+         default="",
+         type=str,
+         help="Path to the transducer decoder model",
+     )
+ 
+     parser.add_argument(
+         "--joiner",
+         default="",
+         type=str,
+         help="Path to the transducer joiner model",
+     )
+ 
+     parser.add_argument(
+         "--paraformer",
+         default="",
+         type=str,
+         help="Path to the model.onnx from Paraformer",
+     )
+ 
+     parser.add_argument(
+         "--num-threads",
+         type=int,
+         default=1,
+         help="Number of threads for neural network computation",
+     )
+ 
+     parser.add_argument(
+         "--whisper-encoder",
+         default="",
+         type=str,
+         help="Path to whisper encoder model",
+     )
+ 
+     parser.add_argument(
+         "--whisper-decoder",
+         default="",
+         type=str,
+         help="Path to whisper decoder model",
+     )
+ 
+     parser.add_argument(
+         "--whisper-language",
+         default="",
+         type=str,
+         help="""It specifies the spoken language in the input file.
+         Example values: en, fr, de, zh, jp.
+         Available languages for multilingual models can be found at
+         https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
+         If not specified, we infer the language from the input audio file.
+         """,
+     )
+ 
+     parser.add_argument(
+         "--whisper-task",
+         default="transcribe",
+         choices=["transcribe", "translate"],
+         type=str,
+         help="""For multilingual models, if you specify translate, the output
+         will be in English.
+         """,
+     )
+ 
+     parser.add_argument(
+         "--decoding-method",
+         type=str,
+         default="greedy_search",
+         help="""Valid values are greedy_search and modified_beam_search.
+         modified_beam_search is valid only for transducer models.
+         """,
+     )
+     parser.add_argument(
+         "--debug",
+         type=str2bool,
+         default=False,
+         help="True to show debug messages when loading modes.",
+     )
+ 
+     parser.add_argument(
+         "--sample-rate",
+         type=int,
+         default=16000,
+         help="""Sample rate of the feature extractor. Must match the one
+         expected by the model.""",
+     )
+ 
+     parser.add_argument(
+         "--feature-dim",
+         type=int,
+         default=80,
+         help="Feature dimension. Must match the one expected by the model",
+     )
+ 
+     return parser.parse_args()
+ 
+ 
+ def assert_file_exists(filename: str):
+     """Check that `filename` refers to an existing regular file.
+ 
+     Args:
+       filename:
+         Path of the file to check.
+ 
+     Raises:
+       AssertionError: If the path is not an existing file. The message names
+         the path and points to the pre-trained model download page.
+     """
+     found = Path(filename).is_file()
+     hint = (
+         f"{filename} does not exist!\n"
+         "Please refer to "
+         "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
+     )
+     assert found, hint
+ 
+ 
+ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
+     """Build the non-streaming recognizer selected by the command-line options.
+ 
+     The model type is chosen by the first non-empty option among --encoder
+     (transducer), --paraformer and --whisper-encoder, checked in that order.
+ 
+     Args:
+       args:
+         The options returned by get_args().
+ 
+     Returns:
+       The recognizer created for the selected model type.
+ 
+     Raises:
+       AssertionError: If options of a different model type are also given, or
+         if a required model file does not exist.
+       ValueError: If no model option is given at all.
+     """
+     if args.encoder:
+         # Transducer: options belonging to the other model types must be unset.
+         for other in (args.paraformer, args.whisper_encoder, args.whisper_decoder):
+             assert len(other) == 0, other
+ 
+         for model_file in (args.encoder, args.decoder, args.joiner):
+             assert_file_exists(model_file)
+ 
+         return sherpa_onnx.OfflineRecognizer.from_transducer(
+             encoder=args.encoder,
+             decoder=args.decoder,
+             joiner=args.joiner,
+             tokens=args.tokens,
+             num_threads=args.num_threads,
+             sample_rate=args.sample_rate,
+             feature_dim=args.feature_dim,
+             decoding_method=args.decoding_method,
+             debug=args.debug,
+         )
+ 
+     if args.paraformer:
+         # Paraformer: the whisper options must be unset.
+         for other in (args.whisper_encoder, args.whisper_decoder):
+             assert len(other) == 0, other
+ 
+         assert_file_exists(args.paraformer)
+ 
+         return sherpa_onnx.OfflineRecognizer.from_paraformer(
+             paraformer=args.paraformer,
+             tokens=args.tokens,
+             num_threads=args.num_threads,
+             sample_rate=args.sample_rate,
+             feature_dim=args.feature_dim,
+             decoding_method=args.decoding_method,
+             debug=args.debug,
+         )
+ 
+     if args.whisper_encoder:
+         for model_file in (args.whisper_encoder, args.whisper_decoder):
+             assert_file_exists(model_file)
+ 
+         return sherpa_onnx.OfflineRecognizer.from_whisper(
+             encoder=args.whisper_encoder,
+             decoder=args.whisper_decoder,
+             tokens=args.tokens,
+             num_threads=args.num_threads,
+             decoding_method=args.decoding_method,
+             debug=args.debug,
+             language=args.whisper_language,
+             task=args.whisper_task,
+         )
+ 
+     raise ValueError("Please specify at least one model")
+ 
+ 
+ def main():
+     """Recognize speech from the default microphone, one VAD segment at a time.
+ 
+     Audio is read from the default input device in chunks of 100 ms and fed
+     to the VAD in windows of config.silero_vad.window_size samples. Every
+     segment the VAD produces is decoded with the non-streaming recognizer
+     and its non-empty text is printed with a running index.
+ 
+     The loop runs until the process is interrupted.
+     """
+     # Parse the command line first so that --help and argument errors are
+     # reported without needing an audio device.
+     args = get_args()
+ 
+     devices = sd.query_devices()
+     if len(devices) == 0:
+         print("No microphone devices found")
+         sys.exit(0)
+ 
+     print(devices)
+ 
+     # If you want to select a different input device, please use
+     # sd.default.device[0] = xxx
+     # where xxx is the device number
+ 
+     default_input_device_idx = sd.default.device[0]
+     print(f'Use default device: {devices[default_input_device_idx]["name"]}')
+ 
+     assert_file_exists(args.tokens)
+     assert_file_exists(args.silero_vad_model)
+ 
+     assert args.num_threads > 0, args.num_threads
+ 
+     assert (
+         args.sample_rate == 16000
+     ), f"Only sample rate 16000 is supported. Given: {args.sample_rate}"
+ 
+     print("Creating recognizer. Please wait...")
+     recognizer = create_recognizer(args)
+ 
+     config = sherpa_onnx.VadModelConfig()
+     config.silero_vad.model = args.silero_vad_model
+     config.silero_vad.min_silence_duration = 0.25
+     config.sample_rate = args.sample_rate
+ 
+     window_size = config.silero_vad.window_size
+ 
+     vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
+ 
+     samples_per_read = int(0.1 * args.sample_rate)  # 0.1 second = 100 ms
+ 
+     print("Started! Please speak")
+ 
+     # Start from an empty float32 array: concatenating the float32 samples
+     # onto a plain Python list would promote the buffer to float64.
+     buffer = np.array([], dtype=np.float32)
+     texts = []
+     with sd.InputStream(channels=1, dtype="float32", samplerate=args.sample_rate) as s:
+         while True:
+             samples, _ = s.read(samples_per_read)  # a blocking read
+             samples = samples.reshape(-1)
+ 
+             # Hand the VAD whole windows; the remainder stays in the buffer
+             # until the next read.
+             buffer = np.concatenate([buffer, samples])
+             while len(buffer) > window_size:
+                 vad.accept_waveform(buffer[:window_size])
+                 buffer = buffer[window_size:]
+ 
+             # Decode every segment currently queued in the VAD.
+             while not vad.empty():
+                 stream = recognizer.create_stream()
+                 stream.accept_waveform(args.sample_rate, vad.front.samples)
+ 
+                 vad.pop()
+                 recognizer.decode_stream(stream)
+ 
+                 text = stream.result.text.strip().lower()
+                 if len(text):
+                     idx = len(texts)
+                     texts.append(text)
+                     print(f"{idx}: {text}")
+ 
+ 
+ # Script entry point: run main() and turn Ctrl + C into a short message
+ # instead of a KeyboardInterrupt traceback.
+ if __name__ == "__main__":
+     try:
+         main()
+     except KeyboardInterrupt:
+         print("\nCaught Ctrl + C. Exiting")