Fangjun Kuang

Add more Python examples for SenseVoice (#1179)

@@ -20,6 +20,38 @@ tar xvf $name
 rm $name
 ls -lh $repo
 python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py
+
+if [[ $(uname) == Linux ]]; then
+  # It needs ffmpeg
+  log "generate subtitles (Chinese)"
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+
+  python3 ./python-api-examples/generate-subtitles.py \
+    --silero-vad-model=./silero_vad.onnx \
+    --sense-voice=$repo/model.onnx \
+    --tokens=$repo/tokens.txt \
+    --num-threads=2 \
+    ./lei-jun-test.wav
+
+  cat lei-jun-test.srt
+
+  rm lei-jun-test.wav
+
+  log "generate subtitles (English)"
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+
+  python3 ./python-api-examples/generate-subtitles.py \
+    --silero-vad-model=./silero_vad.onnx \
+    --sense-voice=$repo/model.onnx \
+    --tokens=$repo/tokens.txt \
+    --num-threads=2 \
+    ./Obama.wav
+
+  cat Obama.srt
+  rm Obama.wav
+  rm silero_vad.onnx
+fi
 rm -rf $repo

 log "test offline TeleSpeech CTC"
@@ -79,6 +79,11 @@ jobs:
           python3 -m pip install --upgrade pip numpy pypinyin sentencepiece>=0.1.96 soundfile
           python3 -m pip install wheel twine setuptools

+      - name: Install ffmpeg
+        shell: bash
+        run: |
+          sudo apt-get install ffmpeg
+
       - name: Install ninja
         shell: bash
         run: |
@@ -12,12 +12,12 @@ Supported file formats are those supported by ffmpeg; for instance,
 Note that you need a non-streaming model for this script.

 Please visit
-https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
+https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
 to download silero_vad.onnx

 For instance,

-wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

 (1) For paraformer

@@ -58,7 +58,17 @@ wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/siler
   --num-threads=2 \
   /path/to/test.mp4

-(4) For WeNet CTC models
+(4) For SenseVoice CTC models
+
+./python-api-examples/generate-subtitles.py \
+  --silero-vad-model=/path/to/silero_vad.onnx \
+  --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \
+  --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \
+  --num-threads=2 \
+  /path/to/test.mp4
+
+
+(5) For WeNet CTC models

 ./python-api-examples/generate-subtitles.py \
   --silero-vad-model=/path/to/silero_vad.onnx \
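
For context on what these commands produce: silero VAD splits the input into speech segments, each segment is decoded by the selected model (here SenseVoice), and each segment's position in samples becomes a pair of SRT timestamps. A self-contained sketch of that timestamp conversion, assuming 16 kHz audio (the helper name and rounding are illustrative, not the script's exact code):

    SAMPLE_RATE = 16000  # silero VAD and SenseVoice operate on 16 kHz audio

    def srt_time(seconds: float) -> str:
        """Format seconds as the SRT timestamp HH:MM:SS,mmm."""
        total_ms = int(round(seconds * 1000))
        h, rem = divmod(total_ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    # A VAD segment is described by its start sample index and its samples;
    # e.g. a segment starting 3.2 s into the file and lasting 1.5 s:
    start_sample = int(3.2 * SAMPLE_RATE)
    num_samples = int(1.5 * SAMPLE_RATE)
    start = start_sample / SAMPLE_RATE
    end = (start_sample + num_samples) / SAMPLE_RATE
    print(f"{srt_time(start)} --> {srt_time(end)}")  # 00:00:03,200 --> 00:00:04,700
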
@@ -131,6 +141,13 @@ def get_args():
     )

     parser.add_argument(
+        "--sense-voice",
+        default="",
+        type=str,
+        help="Path to the model.onnx from SenseVoice",
+    )
+
+    parser.add_argument(
         "--wenet-ctc",
         default="",
         type=str,
@@ -242,6 +259,7 @@ def assert_file_exists(filename: str):
 def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
     if args.encoder:
         assert len(args.paraformer) == 0, args.paraformer
+        assert len(args.sense_voice) == 0, args.sense_voice
         assert len(args.wenet_ctc) == 0, args.wenet_ctc
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
         assert len(args.whisper_decoder) == 0, args.whisper_decoder
@@ -262,6 +280,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
             debug=args.debug,
         )
     elif args.paraformer:
+        assert len(args.sense_voice) == 0, args.sense_voice
         assert len(args.wenet_ctc) == 0, args.wenet_ctc
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
         assert len(args.whisper_decoder) == 0, args.whisper_decoder
@@ -277,6 +296,19 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
             decoding_method=args.decoding_method,
             debug=args.debug,
         )
+    elif args.sense_voice:
+        assert len(args.wenet_ctc) == 0, args.wenet_ctc
+        assert len(args.whisper_encoder) == 0, args.whisper_encoder
+        assert len(args.whisper_decoder) == 0, args.whisper_decoder
+
+        assert_file_exists(args.sense_voice)
+        recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
+            model=args.sense_voice,
+            tokens=args.tokens,
+            num_threads=args.num_threads,
+            use_itn=True,
+            debug=args.debug,
+        )
     elif args.wenet_ctc:
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
         assert len(args.whisper_decoder) == 0, args.whisper_decoder
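
The branch above is the only SenseVoice-specific piece; everything downstream uses the generic offline-recognizer flow. A minimal standalone sketch of the same construction plus a one-file decode (paths point at the model directory used elsewhere in this PR; use_itn=True asks the model to apply inverse text normalization, and the create_stream/accept_waveform/decode_stream steps follow the usual sherpa-onnx offline API):

    import sherpa_onnx
    import soundfile as sf

    d = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17"

    recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
        model=f"{d}/model.onnx",
        tokens=f"{d}/tokens.txt",
        num_threads=2,
        use_itn=True,  # inverse text normalization: digits, punctuation, etc.
        debug=False,
    )

    samples, sample_rate = sf.read(f"{d}/test_wavs/zh.wav", dtype="float32")
    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, samples)
    recognizer.decode_stream(stream)
    print(stream.result.text)
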
@@ -406,6 +438,9 @@ def main():
             vad.accept_waveform(buffer[:window_size])
             buffer = buffer[window_size:]

+        if is_silence:
+            vad.flush()
+
         streams = []
         segments = []
         while not vad.empty():
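
The new vad.flush() call fixes a subtle issue: when the is_silence flag is set (e.g. at the end of the input), speech still buffered inside the VAD would otherwise never be emitted, so the tail of the recording could be missing from the subtitles. A self-contained sketch of the feed/flush/drain pattern, reusing the silero_vad.onnx and lei-jun-test.wav downloaded in the CI step above (attribute names follow the sherpa-onnx VoiceActivityDetector API used in this script; treat the details as an approximation):

    import sherpa_onnx
    import soundfile as sf

    samples, sample_rate = sf.read("./lei-jun-test.wav", dtype="float32")
    assert sample_rate == 16000, "silero VAD expects 16 kHz mono audio"

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = "./silero_vad.onnx"
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)
    window_size = config.silero_vad.window_size  # samples consumed per VAD step

    # Feed the audio one full window at a time.
    for i in range(0, len(samples) - window_size + 1, window_size):
        vad.accept_waveform(samples[i : i + window_size])

    # Without flush(), a trailing speech segment still buffered in the VAD
    # would never show up in the drain loop below.
    vad.flush()

    while not vad.empty():
        seg = vad.front  # seg.start is the start sample index; seg.samples holds the audio
        start = seg.start / sample_rate
        duration = len(seg.samples) / sample_rate
        print(f"speech segment at {start:.2f}s, {duration:.2f}s long")
        vad.pop()
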
@@ -92,6 +92,16 @@ python3 ./python-api-examples/non_streaming_server.py \
   --tdnn-model=./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx \
   --tokens=./sherpa-onnx-tdnn-yesno/tokens.txt

+(6) Use a non-streaming SenseVoice model
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+python3 ./python-api-examples/non_streaming_server.py \
+  --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx \
+  --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt
+
 ----

 To use a certificate so that you can use https, please use
@@ -208,6 +218,15 @@ def add_paraformer_model_args(parser: argparse.ArgumentParser):
     )


+def add_sense_voice_model_args(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--sense-voice",
+        default="",
+        type=str,
+        help="Path to the model.onnx from SenseVoice",
+    )
+
+
 def add_nemo_ctc_model_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--nemo-ctc",
@@ -287,6 +306,7 @@ def add_whisper_model_args(parser: argparse.ArgumentParser):
 def add_model_args(parser: argparse.ArgumentParser):
     add_transducer_model_args(parser)
     add_paraformer_model_args(parser)
+    add_sense_voice_model_args(parser)
     add_nemo_ctc_model_args(parser)
     add_wenet_ctc_model_args(parser)
     add_tdnn_ctc_model_args(parser)
@@ -850,6 +870,7 @@ def assert_file_exists(filename: str):
 def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
     if args.encoder:
         assert len(args.paraformer) == 0, args.paraformer
+        assert len(args.sense_voice) == 0, args.sense_voice
         assert len(args.nemo_ctc) == 0, args.nemo_ctc
         assert len(args.wenet_ctc) == 0, args.wenet_ctc
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
@@ -876,6 +897,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
             provider=args.provider,
         )
     elif args.paraformer:
+        assert len(args.sense_voice) == 0, args.sense_voice
         assert len(args.nemo_ctc) == 0, args.nemo_ctc
         assert len(args.wenet_ctc) == 0, args.wenet_ctc
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
@@ -893,6 +915,20 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
             decoding_method=args.decoding_method,
             provider=args.provider,
         )
+    elif args.sense_voice:
+        assert len(args.nemo_ctc) == 0, args.nemo_ctc
+        assert len(args.wenet_ctc) == 0, args.wenet_ctc
+        assert len(args.whisper_encoder) == 0, args.whisper_encoder
+        assert len(args.whisper_decoder) == 0, args.whisper_decoder
+        assert len(args.tdnn_model) == 0, args.tdnn_model
+
+        assert_file_exists(args.sense_voice)
+        recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
+            model=args.sense_voice,
+            tokens=args.tokens,
+            num_threads=args.num_threads,
+            use_itn=True,
+        )
     elif args.nemo_ctc:
         assert len(args.wenet_ctc) == 0, args.wenet_ctc
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
@@ -22,7 +22,7 @@ import soundfile as sf


 def create_recognizer():
-    model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"
+    model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx"
     tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"
     test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav"
     # test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav"
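
This switches the single-file example from the int8 model to the float32 model.onnx; both files ship in the same tarball (the non_streaming_server example above still uses model.int8.onnx). If you want to compare the two locally, a rough sketch (illustrative only; speed and accuracy trade-offs depend on your hardware):

    import time

    import sherpa_onnx
    import soundfile as sf

    d = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17"
    samples, sample_rate = sf.read(f"{d}/test_wavs/zh.wav", dtype="float32")

    # Decode the same wav with the float32 and the int8 model and time each run.
    for model in (f"{d}/model.onnx", f"{d}/model.int8.onnx"):
        recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
            model=model,
            tokens=f"{d}/tokens.txt",
            num_threads=2,
            use_itn=True,
        )
        stream = recognizer.create_stream()
        stream.accept_waveform(sample_rate, samples)
        t0 = time.time()
        recognizer.decode_stream(stream)
        print(f"{model}: {time.time() - t0:.2f}s  {stream.result.text}")
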
@@ -45,6 +45,14 @@ Note that you need a non-streaming model for this script.
   --whisper-task=transcribe \
   --num-threads=2

+(4) For SenseVoice CTC models
+
+./python-api-examples/vad-with-non-streaming-asr.py \
+  --silero-vad-model=/path/to/silero_vad.onnx \
+  --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \
+  --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \
+  --num-threads=2
+
 Please refer to
 https://k2-fsa.github.io/sherpa/onnx/index.html
 to install sherpa-onnx and to download non-streaming pre-trained models
@@ -124,6 +132,13 @@ def get_args():
     )

     parser.add_argument(
+        "--sense-voice",
+        default="",
+        type=str,
+        help="Path to the model.onnx from SenseVoice",
+    )
+
+    parser.add_argument(
         "--num-threads",
         type=int,
         default=1,
@@ -233,6 +248,7 @@ def assert_file_exists(filename: str):
 def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
     if args.encoder:
         assert len(args.paraformer) == 0, args.paraformer
+        assert len(args.sense_voice) == 0, args.sense_voice
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
         assert len(args.whisper_decoder) == 0, args.whisper_decoder

@@ -253,6 +269,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
             debug=args.debug,
         )
     elif args.paraformer:
+        assert len(args.sense_voice) == 0, args.sense_voice
         assert len(args.whisper_encoder) == 0, args.whisper_encoder
         assert len(args.whisper_decoder) == 0, args.whisper_decoder

@@ -267,6 +284,18 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
             decoding_method=args.decoding_method,
             debug=args.debug,
         )
+    elif args.sense_voice:
+        assert len(args.whisper_encoder) == 0, args.whisper_encoder
+        assert len(args.whisper_decoder) == 0, args.whisper_decoder
+
+        assert_file_exists(args.sense_voice)
+        recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
+            model=args.sense_voice,
+            tokens=args.tokens,
+            num_threads=args.num_threads,
+            use_itn=True,
+            debug=args.debug,
+        )
     elif args.whisper_encoder:
         assert_file_exists(args.whisper_encoder)
         assert_file_exists(args.whisper_decoder)