Showing 3 changed files with 188 additions and 1 deletion.
File 1 of 3:

@@ -40,24 +40,28 @@ def get_args():
     parser.add_argument(
         "--tokens",
         type=str,
+        required=True,
         help="Path to tokens.txt",
     )
 
     parser.add_argument(
         "--encoder",
         type=str,
+        required=True,
         help="Path to the encoder model",
     )
 
     parser.add_argument(
         "--decoder",
         type=str,
+        required=True,
         help="Path to the decoder model",
     )
 
     parser.add_argument(
         "--joiner",
         type=str,
+        required=True,
         help="Path to the joiner model",
     )
 
@@ -105,7 +109,7 @@ def main():
     # sherpa-onnx will do resampling inside.
     sample_rate = 48000
     samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
-    last_result = ""
+
     stream = recognizer.create_stream()
 
     last_result = ""
File 2 of 3:

@@ -39,18 +39,21 @@ def get_args():
     parser.add_argument(
         "--tokens",
         type=str,
+        required=True,
         help="Path to tokens.txt",
     )
 
     parser.add_argument(
         "--encoder",
         type=str,
+        required=True,
         help="Path to the encoder model",
     )
 
     parser.add_argument(
         "--decoder",
         type=str,
+        required=True,
         help="Path to the decoder model",
     )
 
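Both files above make the model-path flags mandatory. For context, a minimal sketch (not part of this PR) of what `required=True` changes in argparse: a missing flag now makes the parser exit with an explicit error instead of passing None down to the recognizer.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--tokens",
    type=str,
    required=True,  # without this, a missing --tokens silently becomes None
    help="Path to tokens.txt",
)

# parser.parse_args([]) now exits with:
#   error: the following arguments are required: --tokens
args = parser.parse_args(["--tokens", "./tokens.txt"])
print(args.tokens)  # -> ./tokens.txt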
File 3 of 3 (new file):

#!/usr/bin/env python3
#
# Real-time speech recognition from a URL with sherpa-onnx Python API
#
# Supported URLs are those supported by ffmpeg.
#
# For instance:
# (1) RTMP
#     rtmp://localhost/live/livestream
#
# (2) A file
#     https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/wenetspeech/DEV_T0000000000.opus
#     https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/aishell2/ID0012W0030.wav
#     file:///Users/fangjun/open-source/sherpa-onnx/a.wav
#
# Note that it supports all file formats supported by ffmpeg
#
# Please refer to
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
# to download pre-trained models

import argparse
import shutil
import subprocess
import sys
from pathlib import Path

import numpy as np
import sherpa_onnx


def assert_file_exists(filename: str):
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )


def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--encoder",
        type=str,
        required=True,
        help="Path to the encoder model",
    )

    parser.add_argument(
        "--decoder",
        type=str,
        required=True,
        help="Path to the decoder model",
    )

    parser.add_argument(
        "--joiner",
        type=str,
        help="Path to the joiner model",
    )

    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="Valid values are greedy_search and modified_beam_search",
    )

    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="""Example values:
        rtmp://localhost/live/livestream
        https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/wenetspeech/DEV_T0000000000.opus
        https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition/resolve/main/test_wavs/aishell2/ID0012W0030.wav
        """,
    )

    return parser.parse_args()


def create_recognizer(args):
    # Please replace the model files if needed.
    # See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    # for download links.
    recognizer = sherpa_onnx.OnlineRecognizer(
        tokens=args.tokens,
        encoder=args.encoder,
        decoder=args.decoder,
        joiner=args.joiner,
        num_threads=1,
        sample_rate=16000,
        feature_dim=80,
        decoding_method=args.decoding_method,
        enable_endpoint_detection=True,
        rule1_min_trailing_silence=2.4,
        rule2_min_trailing_silence=1.2,
        rule3_min_utterance_length=300,  # it essentially disables this rule
    )
    return recognizer


def main():
    args = get_args()
    assert_file_exists(args.encoder)
    assert_file_exists(args.decoder)
    assert_file_exists(args.joiner)
    assert_file_exists(args.tokens)

    recognizer = create_recognizer(args)

    ffmpeg_cmd = [
        "ffmpeg",
        "-i",
        args.url,
        "-f",
        "s16le",
        "-acodec",
        "pcm_s16le",
        "-ac",
        "1",
        "-ar",
        "16000",
        "-",
    ]

    process = subprocess.Popen(
        ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
    )

    frames_per_read = 1600  # 0.1 second

    stream = recognizer.create_stream()

    last_result = ""
    segment_id = 0

    print("Started!")
    while True:
        # *2 because int16_t has two bytes
        data = process.stdout.read(frames_per_read * 2)
        if not data:
            break

        samples = np.frombuffer(data, dtype=np.int16)
        samples = samples.astype(np.float32) / 32768
        stream.accept_waveform(16000, samples)

        while recognizer.is_ready(stream):
            recognizer.decode_stream(stream)

        is_endpoint = recognizer.is_endpoint(stream)

        result = recognizer.get_result(stream)

        if result and (last_result != result):
            last_result = result
            print("\r{}:{}".format(segment_id, result), end="", flush=True)
        if is_endpoint:
            if result:
                print("\r{}:{}".format(segment_id, result), flush=True)
            segment_id += 1
            recognizer.reset(stream)


if __name__ == "__main__":
    if shutil.which("ffmpeg") is None:
        sys.exit("Please install ffmpeg first!")
    main()
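In the new script, the read loop pulls raw s16le PCM from ffmpeg in 0.1-second chunks and rescales it to float32 before calling accept_waveform. A minimal standalone sketch of that byte/sample arithmetic (synthetic silence stands in for the ffmpeg pipe; not part of this PR):

import numpy as np

sample_rate = 16000
frames_per_read = 1600                # 0.1 s of audio at 16 kHz
bytes_per_read = frames_per_read * 2  # each int16 sample occupies 2 bytes -> 3200 bytes

# Stand-in for process.stdout.read(bytes_per_read): 0.1 s of silence as raw s16le bytes.
data = np.zeros(frames_per_read, dtype=np.int16).tobytes()

# Same conversion as in the script: int16 -> float32 scaled to [-1, 1).
samples = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768

assert len(data) == bytes_per_read
assert samples.shape == (frames_per_read,) and samples.dtype == np.float32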