Fangjun Kuang
Committed by GitHub

Fix displaying streaming speech recognition results for Python. (#2196)

... ... @@ -14,7 +14,7 @@ project(sherpa-onnx)
# Remember to update
# ./CHANGELOG.md
# ./new-release.sh
set(SHERPA_ONNX_VERSION "1.11.5")
set(SHERPA_ONNX_VERSION "1.11.6")
# Disable warning about
#
... ...
... ... @@ -11,8 +11,8 @@
# to download pre-trained models
import argparse
import sys
from pathlib import Path
import sherpa_onnx
... ... @@ -202,8 +202,8 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
while True:
samples = alsa.read(samples_per_read) # a blocking read
stream.accept_waveform(sample_rate, samples)
... ... @@ -214,13 +214,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -192,8 +192,8 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
while True:
samples, _ = s.read(samples_per_read) # a blocking read
... ... @@ -206,13 +206,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -192,8 +192,7 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
print("Started!")
while True:
... ... @@ -213,13 +212,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -74,8 +74,8 @@ def main():
stream = recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
while True:
samples, _ = s.read(samples_per_read) # a blocking read
... ... @@ -88,13 +88,14 @@ def main():
result = recognizer.get_result(stream)
if result and (last_result != result):
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.finalize_current_sentence()
display.display()
recognizer.reset(stream)
... ...
... ... @@ -46,7 +46,6 @@ python3 ./python-api-examples/two-pass-speech-recognition-from-microphone.py \
import argparse
import sys
from pathlib import Path
from typing import List
import numpy as np
... ... @@ -375,8 +374,7 @@ def main():
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
stream = first_recognizer.create_stream()
last_result = ""
segment_id = 0
display = sherpa_onnx.Display()
sample_buffers = []
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
... ... @@ -395,14 +393,8 @@ def main():
result = first_recognizer.get_result(stream)
result = result.lower().strip()
if last_result != result:
print(
"\r{}:{}".format(segment_id, " " * len(last_result)),
end="",
flush=True,
)
last_result = result
print("\r{}:{}".format(segment_id, result), end="", flush=True)
display.update_text(result)
display.display()
if is_endpoint:
if result:
... ... @@ -419,14 +411,9 @@ def main():
sample_rate=sample_rate,
)
result = result.lower().strip()
print(
"\r{}:{}".format(segment_id, " " * len(last_result)),
end="",
flush=True,
)
print("\r{}:{}".format(segment_id, result), flush=True)
segment_id += 1
display.update_text(result)
display.finalize_current_sentence()
display.display()
else:
sample_buffers = []
... ...
... ... @@ -6,7 +6,6 @@ from _sherpa_onnx import (
AudioTaggingModelConfig,
CircularBuffer,
DenoisedAudio,
Display,
FastClustering,
FastClusteringConfig,
OfflinePunctuation,
... ... @@ -48,6 +47,7 @@ from _sherpa_onnx import (
write_wave,
)
from .display import Display
from .keyword_spotter import KeywordSpotter
from .offline_recognizer import OfflineRecognizer
from .online_recognizer import OnlineRecognizer
... ...
# Copyright (c) 2025 Xiaomi Corporation
import os
from time import gmtime, strftime
def get_current_time():
    """Return the current UTC time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    time_format = "%Y-%m-%d %H:%M:%S"
    return strftime(time_format, gmtime())
def clear_console():
    """Clear the terminal screen (``cls`` on Windows, ``clear`` elsewhere)."""
    command = "cls" if os.name == "nt" else "clear"
    os.system(command)
class Display:
    """Render streaming speech-recognition results in the terminal.

    Finalized sentences are kept as a timestamped history; the sentence
    currently being recognized is shown on its own "Recognizing:" line.
    Call :meth:`update_text` as partial results arrive, then
    :meth:`finalize_current_sentence` at an endpoint, and :meth:`display`
    to repaint the screen.
    """

    def __init__(self):
        # History of finalized results as (timestamp, text) tuples.
        self.sentences = []
        # Partial result of the sentence currently being recognized.
        # (Renamed from camelCase ``currentText`` to follow PEP 8.)
        self.current_text = ""

    def update_text(self, text):
        """Replace the in-progress (partial) recognition result."""
        self.current_text = text

    def finalize_current_sentence(self):
        """Move the in-progress text into the history and reset it.

        Blank/whitespace-only text is discarded rather than recorded.
        """
        if self.current_text.strip():
            self.sentences.append((get_current_time(), self.current_text))
        self.current_text = ""

    def display(self):
        """Clear the screen and repaint the header, history, and partial result."""
        clear_console()
        print("=== Speech Recognition with Next-gen Kaldi ===")
        print("Time:", get_current_time())
        print("-" * 30)

        # Display finalized history sentences, numbered from 1, with the
        # time each was finalized; the trailing separator only appears
        # when there is history to separate.
        if self.sentences:
            for i, (when, text) in enumerate(self.sentences):
                print(f"[{when}] {i + 1}. {text}")
            print("-" * 30)

        if self.current_text.strip():
            print("Recognizing:", self.current_text)
... ...