Fangjun Kuang
Committed by GitHub

Add real-time speech recognition example for SenseVoice. (#2197)

  1 +#!/usr/bin/env python3
  2 +#
  3 +# Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +"""
  6 +This file demonstrates how to use sherpa-onnx Python APIs
  7 +with VAD and non-streaming SenseVoice for real-time speech recognition
  8 +from a microphone.
  9 +
  10 +Usage:
  11 +
  12 +
  13 +wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
  14 +
  15 +./python-api-examples/simulate-streaming-sense-voice-microphone.py \
  16 + --silero-vad-model=./silero_vad.onnx \
  17 + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \
  18 + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt
  19 +"""
  20 +import argparse
  21 +import queue
  22 +import sys
  23 +import threading
  24 +import time
  25 +from pathlib import Path
  26 +
  27 +import numpy as np
  28 +
  29 +try:
  30 + import sounddevice as sd
  31 +except ImportError:
  32 + print("Please install sounddevice first. You can use")
  33 + print()
  34 + print(" pip install sounddevice")
  35 + print()
  36 + print("to install it")
  37 + sys.exit(-1)
  38 +
  39 +import sherpa_onnx
  40 +
# Shared state between main() and the recording thread.
killed = False  # set to True (on Ctrl+C) to stop the recording thread and main loop
recording_thread = None  # created and started in main(); joined on exit
sample_rate = 16000  # Please don't change it; the models expect 16 kHz input

# Queue of recorded audio chunks handed from the recording thread to main()
samples_queue = queue.Queue()
  47 +
  48 +
def get_args():
    """Parse command-line arguments for this example and return the namespace."""
    p = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    p.add_argument(
        "--silero-vad-model",
        required=True,
        type=str,
        help="Path to silero_vad.onnx",
    )

    p.add_argument(
        "--tokens",
        type=str,
        help="Path to tokens.txt",
    )

    p.add_argument(
        "--sense-voice",
        type=str,
        default="",
        help="Path to the model.onnx from SenseVoice",
    )

    p.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )

    # Optional homophone-replacer (HR) post-processing inputs.
    p.add_argument(
        "--hr-dict-dir",
        type=str,
        default="",
        help="If not empty, it is the jieba dict directory for homophone replacer",
    )

    p.add_argument(
        "--hr-lexicon",
        type=str,
        default="",
        help="If not empty, it is the lexicon.txt for homophone replacer",
    )

    p.add_argument(
        "--hr-rule-fsts",
        type=str,
        default="",
        help="If not empty, it is the replace.fst for homophone replacer",
    )

    return p.parse_args()
  103 +
  104 +
def assert_file_exists(filename: str):
    """Assert that *filename* names an existing regular file.

    Raises:
      AssertionError: if the path does not exist or is not a regular file.
        The message names the offending path and points to the download page.
    """
    # Bug fix: the message previously contained the literal text "(unknown)"
    # instead of interpolating the actual filename, so users could not tell
    # which file was missing.
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
  111 +
  112 +
def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
    """Build a non-streaming SenseVoice recognizer from the parsed arguments."""
    # Fail early with a clear message if the model file is missing.
    assert_file_exists(args.sense_voice)

    return sherpa_onnx.OfflineRecognizer.from_sense_voice(
        model=args.sense_voice,
        tokens=args.tokens,
        num_threads=args.num_threads,
        use_itn=False,
        debug=False,
        hr_dict_dir=args.hr_dict_dir,
        hr_rule_fsts=args.hr_rule_fsts,
        hr_lexicon=args.hr_lexicon,
    )
  127 +
  128 +
def start_recording():
    """Capture microphone audio and push chunks onto ``samples_queue``.

    Runs in a background thread until the module-level ``killed`` flag
    becomes True.
    """
    # You can use any value you like for the chunk size; 100 ms keeps
    # latency low.
    chunk_size = int(0.1 * sample_rate)

    with sd.InputStream(
        channels=1, dtype="float32", samplerate=sample_rate
    ) as stream:
        while not killed:
            audio, _ = stream.read(chunk_size)  # a blocking read
            flat = audio.reshape(-1)
            samples_queue.put(np.copy(flat))
  139 +
  140 +
def main():
    """Entry point: capture microphone audio, gate it with silero VAD, and
    decode speech segments with a non-streaming SenseVoice recognizer,
    displaying partial results while the user is still speaking."""
    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different input device, please use
    # sd.default.device[0] = xxx
    # where xxx is the device number

    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    args = get_args()
    assert_file_exists(args.tokens)
    assert_file_exists(args.silero_vad_model)

    assert args.num_threads > 0, args.num_threads

    print("Creating recognizer. Please wait...")
    recognizer = create_recognizer(args)

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.silero_vad.min_silence_duration = 0.25
    config.sample_rate = sample_rate

    # The VAD consumes audio in fixed-size windows of this many samples.
    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)

    print("Started! Please speak")

    # Accumulates the audio of the sentence currently being spoken; it is
    # what gets decoded for the partial (in-progress) results.
    buffer = []

    global recording_thread
    recording_thread = threading.Thread(target=start_recording)
    recording_thread.start()

    display = sherpa_onnx.Display()

    # started: True while we believe the user is speaking.
    # started_time: timestamp used to rate-limit partial decoding.
    started = False
    started_time = None

    while not killed:
        samples = samples_queue.get()  # a blocking read

        buffer = np.concatenate([buffer, samples])

        # Feed the new chunk to the VAD one window at a time.
        # NOTE(review): the condition uses `<` and only walks the current
        # chunk, so up to window_size trailing samples of each chunk are
        # never fed to the VAD (they still reach `buffer`) — confirm this
        # is intentional.
        offset = 0
        while offset + window_size < samples.shape[0]:
            vad.accept_waveform(samples[offset : offset + window_size])
            if not started and vad.is_speech_detected():
                started = True
                started_time = time.time()
            offset += window_size

        if not started:
            # No speech yet: keep only a short tail of audio so that the
            # beginning of the next utterance is not lost.
            buffer = buffer[-10 * window_size :]

        # While speech is ongoing, re-decode the accumulated buffer at most
        # once every 0.2 s to show a streaming-like partial result.
        if started and time.time() - started_time > 0.2:
            stream = recognizer.create_stream()
            stream.accept_waveform(sample_rate, buffer)
            recognizer.decode_stream(stream)
            text = stream.result.text.strip()
            if text:
                display.update_text(text)
                display.display()

            started_time = time.time()

        # The VAD reports a finished speech segment once trailing silence
        # exceeds min_silence_duration; decode it as the final result.
        while not vad.empty():
            # In general, this while loop is executed only once
            stream = recognizer.create_stream()
            stream.accept_waveform(sample_rate, vad.front.samples)

            vad.pop()
            recognizer.decode_stream(stream)

            text = stream.result.text.strip()

            display.update_text(text)

            # Reset per-sentence state for the next utterance.
            buffer = []
            started = False
            started_time = None

            display.finalize_current_sentence()
            display.display()
  231 +
  232 +
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Tell the recording thread to stop, then wait for it so the
        # sounddevice input stream is closed cleanly before exiting.
        killed = True
        if recording_thread:
            recording_thread.join()
        print("\nCaught Ctrl + C. Exiting")
1 # Copyright (c) 2025 Xiaomi Corporation 1 # Copyright (c) 2025 Xiaomi Corporation
2 import os 2 import os
3 -from time import gmtime, strftime 3 +from time import localtime, strftime
4 4
5 5
def get_current_time():
    """Return the current local time as a 'YYYY-mm-dd HH:MM:SS' string."""
    now = localtime()
    return strftime("%Y-%m-%d %H:%M:%S", now)
8 8
9 9
10 def clear_console(): 10 def clear_console():