Fangjun Kuang
Committed by GitHub

Support recognition from URLs. (#194)

@@ -40,24 +40,28 @@ def get_args(): @@ -40,24 +40,28 @@ def get_args():
40 parser.add_argument( 40 parser.add_argument(
41 "--tokens", 41 "--tokens",
42 type=str, 42 type=str,
  43 + required=True,
43 help="Path to tokens.txt", 44 help="Path to tokens.txt",
44 ) 45 )
45 46
46 parser.add_argument( 47 parser.add_argument(
47 "--encoder", 48 "--encoder",
48 type=str, 49 type=str,
  50 + required=True,
49 help="Path to the encoder model", 51 help="Path to the encoder model",
50 ) 52 )
51 53
52 parser.add_argument( 54 parser.add_argument(
53 "--decoder", 55 "--decoder",
54 type=str, 56 type=str,
  57 + required=True,
55 help="Path to the decoder model", 58 help="Path to the decoder model",
56 ) 59 )
57 60
58 parser.add_argument( 61 parser.add_argument(
59 "--joiner", 62 "--joiner",
60 type=str, 63 type=str,
  64 + required=True,
61 help="Path to the joiner model", 65 help="Path to the joiner model",
62 ) 66 )
63 67
@@ -105,7 +109,7 @@ def main(): @@ -105,7 +109,7 @@ def main():
105 # sherpa-onnx will do resampling inside. 109 # sherpa-onnx will do resampling inside.
106 sample_rate = 48000 110 sample_rate = 48000
107 samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms 111 samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
108 - last_result = "" 112 +
109 stream = recognizer.create_stream() 113 stream = recognizer.create_stream()
110 114
111 last_result = "" 115 last_result = ""
@@ -39,18 +39,21 @@ def get_args(): @@ -39,18 +39,21 @@ def get_args():
39 parser.add_argument( 39 parser.add_argument(
40 "--tokens", 40 "--tokens",
41 type=str, 41 type=str,
  42 + required=True,
42 help="Path to tokens.txt", 43 help="Path to tokens.txt",
43 ) 44 )
44 45
45 parser.add_argument( 46 parser.add_argument(
46 "--encoder", 47 "--encoder",
47 type=str, 48 type=str,
  49 + required=True,
48 help="Path to the encoder model", 50 help="Path to the encoder model",
49 ) 51 )
50 52
51 parser.add_argument( 53 parser.add_argument(
52 "--decoder", 54 "--decoder",
53 type=str, 55 type=str,
  56 + required=True,
54 help="Path to the decoder model", 57 help="Path to the decoder model",
55 ) 58 )
56 59
  1 +#!/usr/bin/env python3
  2 +#
  3 +# Real-time speech recognition from a URL with sherpa-onnx Python API
  4 +#
  5 +# Supported URLs are those supported by ffmpeg.
  6 +#
  7 +# For instance:
  8 +# (1) RTMP
  9 +# rtmp://localhost/live/livestream
  10 +#
  11 +# (2) A file
  12 +# https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/wenetspeech/DEV_T0000000000.opus
  13 +# https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/aishell2/ID0012W0030.wav
  14 +# file:///Users/fangjun/open-source/sherpa-onnx/a.wav
  15 +#
  16 +# Note that it supports all file formats supported by ffmpeg
  17 +#
  18 +# Please refer to
  19 +# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
  20 +# to download pre-trained models
  21 +
  22 +import argparse
  23 +import shutil
  24 +import subprocess
  25 +import sys
  26 +from pathlib import Path
  27 +
  28 +import numpy as np
  29 +import sherpa_onnx
  30 +
  31 +
def assert_file_exists(filename: str):
    """Fail with a helpful message unless *filename* is an existing regular file.

    Args:
        filename: Path of a model or token file given on the command line.

    Raises:
        AssertionError: If *filename* does not point to an existing file.
    """
    # An explicit raise is used instead of an ``assert`` statement because
    # ``assert`` is stripped when Python runs with ``-O``, which would silently
    # disable this check. The exception type is kept for compatibility.
    if not Path(filename).is_file():
        raise AssertionError(
            f"{filename} does not exist!\n"
            "Please refer to "
            "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
        )
  38 +
  39 +
def get_args():
    """Parse the command-line options of this script.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``, with the
        attributes ``tokens``, ``encoder``, ``decoder``, ``joiner``,
        ``decoding_method`` and ``url``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # All four model-related files are mandatory: main() checks that each one
    # exists, so a missing option (in particular --joiner) must be rejected
    # here with a clear argparse error instead of failing later on ``None``.
    model_options = (
        ("--tokens", "Path to tokens.txt"),
        ("--encoder", "Path to the encoder model"),
        ("--decoder", "Path to the decoder model"),
        ("--joiner", "Path to the joiner model"),
    )
    for flag, description in model_options:
        parser.add_argument(
            flag,
            type=str,
            required=True,
            help=description,
        )

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="Valid values are greedy_search and modified_beam_search",
    )

    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="""Example values:
        rtmp://localhost/live/livestream
        https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/wenetspeech/DEV_T0000000000.opus
        https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/aishell2/ID0012W0030.wav
        """,
    )

    return parser.parse_args()
  91 +
  92 +
def create_recognizer(args):
    """Build the ``sherpa_onnx.OnlineRecognizer`` used by this script.

    Args:
        args: Parsed command-line options; the attributes ``tokens``,
            ``encoder``, ``decoder``, ``joiner`` and ``decoding_method``
            are read.

    Returns:
        The newly constructed recognizer, with endpoint detection enabled.
    """
    # Replace the model files if needed; download links are listed at
    # https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    model_files = {
        "tokens": args.tokens,
        "encoder": args.encoder,
        "decoder": args.decoder,
        "joiner": args.joiner,
    }

    endpoint_options = {
        "enable_endpoint_detection": True,
        "rule1_min_trailing_silence": 2.4,
        "rule2_min_trailing_silence": 1.2,
        # A value this large essentially disables rule 3.
        "rule3_min_utterance_length": 300,
    }

    return sherpa_onnx.OnlineRecognizer(
        **model_files,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method=args.decoding_method,
        **endpoint_options,
    )
  112 +
  113 +
def main():
    """Decode audio from ``--url`` with ffmpeg and print recognition results.

    ffmpeg converts whatever the URL points at into raw mono 16-bit PCM on its
    standard output. The samples are read in 0.1 second chunks, fed to a
    streaming recognizer, and the text of the current segment is redrawn in
    place until an endpoint is detected, at which point the segment is
    finalized on its own line.
    """
    args = get_args()
    for model_file in (args.encoder, args.decoder, args.joiner, args.tokens):
        assert_file_exists(model_file)

    recognizer = create_recognizer(args)

    sample_rate = 16000  # passed to ffmpeg ("-ar") and to accept_waveform()
    bytes_per_sample = 2  # int16_t has two bytes
    samples_per_read = sample_rate // 10  # 0.1 second

    ffmpeg_cmd = [
        "ffmpeg",
        "-i",
        args.url,
        "-f",
        "s16le",
        "-acodec",
        "pcm_s16le",
        "-ac",
        "1",
        "-ar",
        str(sample_rate),
        "-",
    ]

    stream = recognizer.create_stream()

    last_result = ""
    segment_id = 0

    # The context manager closes the pipe and waits for ffmpeg on exit, so the
    # child process is always reaped.
    with subprocess.Popen(
        ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
    ) as process:
        print("Started!")
        try:
            while True:
                data = process.stdout.read(samples_per_read * bytes_per_sample)
                # A truncated stream may end with half a sample; drop the
                # dangling byte because np.frombuffer() rejects such buffers.
                data = data[: len(data) - len(data) % bytes_per_sample]
                if not data:
                    break

                samples = np.frombuffer(data, dtype=np.int16)
                # Scale int16 samples to floats in [-1, 1).
                samples = samples.astype(np.float32) / 32768
                stream.accept_waveform(sample_rate, samples)

                while recognizer.is_ready(stream):
                    recognizer.decode_stream(stream)

                is_endpoint = recognizer.is_endpoint(stream)

                result = recognizer.get_result(stream)

                if result and (last_result != result):
                    last_result = result
                    # "\r" redraws the partial result of the current segment.
                    print("\r{}:{}".format(segment_id, result), end="", flush=True)
                if is_endpoint:
                    if result:
                        print("\r{}:{}".format(segment_id, result), flush=True)
                        segment_id += 1
                    recognizer.reset(stream)
                    # Start the next segment with a clean slate so that a
                    # partial result equal to the previous text is still shown.
                    last_result = ""
        except BaseException:
            # Includes Ctrl-C: stop ffmpeg so leaving the "with" block does
            # not wait on a stream that may never end.
            process.kill()
            raise

    if last_result:
        # The last partial result was printed without a newline; finish it.
        print()

    if process.returncode != 0:
        print(
            "ffmpeg exited with code {} (its stderr output is suppressed); "
            "please check that the URL is valid and reachable: {}".format(
                process.returncode, args.url
            ),
            file=sys.stderr,
        )
  175 +
  176 +
if __name__ == "__main__":
    # ffmpeg is run as an external program by main(), so refuse to start
    # when it cannot be found on PATH.
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is None:
        sys.exit("Please install ffmpeg first!")
    main()