Fangjun Kuang
Committed by GitHub

Fix displaying streaming speech recognition results for Python. (#2196)

... ... @@ -14,7 +14,7 @@ project(sherpa-onnx)
# Remember to update
# ./CHANGELOG.md
# ./new-release.sh
set(SHERPA_ONNX_VERSION "1.11.5")
set(SHERPA_ONNX_VERSION "1.11.6")
# Disable warning about
#
... ...
... ... @@ -11,8 +11,8 @@
# to download pre-trained models
import argparse
import sys
from pathlib import Path
import sherpa_onnx
... ... @@ -202,8 +202,8 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
while True:
samples = alsa.read(samples_per_read) # a blocking read
stream.accept_waveform(sample_rate, samples)
... ... @@ -214,13 +214,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -192,8 +192,8 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
while True:
samples, _ = s.read(samples_per_read) # a blocking read
... ... @@ -206,13 +206,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -192,8 +192,7 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
print("Started!")
while True:
... ... @@ -213,13 +212,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -74,8 +74,8 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
while True:
samples, _ = s.read(samples_per_read) # a blocking read
... ... @@ -88,13 +88,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -46,7 +46,6 @@ python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
import argparse
import sys
from pathlib import Path
from typing import List
import numpy as np
... ... @@ -375,8 +374,7 @@ def main():
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
stream = first_recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
sample_buffers = []
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
... ... @@ -395,14 +393,8 @@ def main():
result = first_recognizer.get_result(stream)
result = result.lower().strip()
if last_result != result:
print(
"\r{}:{}".format(segment_id, " " * len(last_result)),
end="",
flush=True,
)
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
... ... @@ -419,14 +411,9 @@ def main():
sample_rate=sample_rate,
)
result = result.lower().strip()
print(
"\r{}:{}".format(segment_id, " " * len(last_result)),
end="",
flush=True,
)
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.update_text(result)
display.finalize_current_sentence()
display.display()
else:
sample_buffers = []
... ...
... ... @@ -6,7 +6,6 @@ from _sherpa_onnx import (
AudioTaggingModelConfig,
CircularBuffer,
DenoisedAudio,
Display,
FastClustering,
FastClusteringConfig,
OfflinePunctuation,
... ... @@ -48,6 +47,7 @@ from _sherpa_onnx import (
write_wave,
)
from .display import Display
from .keyword_spotter import KeywordSpotter
from .offline_recognizer import OfflineRecognizer
from .online_recognizer import OnlineRecognizer
... ...
# Copyright (c) 2025 Xiaomi Corporation
import os
from time import gmtime, strftime
def get_current_time():
    """Return the current UTC time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    time_format = "%Y-%m-%d %H:%M:%S"
    return strftime(time_format, gmtime())
def clear_console():
    """Clear the terminal screen (``cls`` on Windows, ``clear`` elsewhere)."""
    command = "cls" if os.name == "nt" else "clear"
    os.system(command)
class Display:
    """Render streaming speech-recognition results in the terminal.

    Finalized sentences are kept as a timestamped history; the sentence
    currently being recognized is shown on its own "Recognizing:" line.
    Call :meth:`update_text` as partial results arrive, then
    :meth:`finalize_current_sentence` at an endpoint, and :meth:`display`
    to repaint the screen.
    """

    def __init__(self):
        # History of finalized results as (timestamp, text) tuples.
        self.sentences = []
        # Partial result of the sentence currently being recognized.
        # (Renamed from camelCase ``currentText`` to follow PEP 8.)
        self.current_text = ""

    def update_text(self, text):
        """Replace the in-progress (partial) recognition result."""
        self.current_text = text

    def finalize_current_sentence(self):
        """Move the in-progress text into the history and reset it.

        Blank/whitespace-only text is discarded rather than recorded.
        """
        if self.current_text.strip():
            self.sentences.append((get_current_time(), self.current_text))
        self.current_text = ""

    def display(self):
        """Clear the screen and repaint the header, history, and partial result."""
        clear_console()
        print("=== Speech Recognition with Next-gen Kaldi ===")
        print("Time:", get_current_time())
        print("-" * 30)

        # Display finalized history sentences, numbered from 1, with the
        # time each was finalized; the trailing separator only appears
        # when there is history to separate.
        if self.sentences:
            for i, (when, text) in enumerate(self.sentences):
                print(f"[{when}] {i + 1}. {text}")
            print("-" * 30)

        if self.current_text.strip():
            print("Recognizing:", self.current_text)
... ...