goddamnVincent
Committed by GitHub

'update20241203' (#1589)

Add the `--modeling-unit` and `--bpe-vocab` command-line arguments to /sherpa-onnx/python-api-examples/streaming_server.py so that both values can be specified by the user.
@@ -229,6 +229,28 @@ def add_hotwords_args(parser: argparse.ArgumentParser): @@ -229,6 +229,28 @@ def add_hotwords_args(parser: argparse.ArgumentParser):
229 --hotwords-file is given. 229 --hotwords-file is given.
230 """, 230 """,
231 ) 231 )
  232 + parser.add_argument(
  233 + "--modeling-unit",
  234 + type=str,
  235 + default='cjkchar',
  236 + help="""
  237 + The modeling unit of the used model. Current supported units are:
  238 + - cjkchar(for Chinese)
  239 + - bpe(for English like languages)
  240 + - cjkchar+bpe(for multilingual models)
  241 + """,
  242 + )
  243 + parser.add_argument(
  244 + "--bpe-vocab",
  245 + type=str,
  246 + default='',
  247 + help="""
  248 + The bpe vocabulary generated by sentencepiece toolkit.
  249 + It is only used when modeling-unit is bpe or cjkchar+bpe.
  250 + if you can’t find bpe.vocab in the model directory, please run:
  251 + python script/export_bpe_vocab.py --bpe-model exp/bpe.model
  252 + """,
  253 + )
232 254
233 255
234 def add_modified_beam_search_args(parser: argparse.ArgumentParser): 256 def add_modified_beam_search_args(parser: argparse.ArgumentParser):
@@ -409,6 +431,8 @@ def create_recognizer(args) -> sherpa_onnx.OnlineRecognizer: @@ -409,6 +431,8 @@ def create_recognizer(args) -> sherpa_onnx.OnlineRecognizer:
409 rule2_min_trailing_silence=args.rule2_min_trailing_silence, 431 rule2_min_trailing_silence=args.rule2_min_trailing_silence,
410 rule3_min_utterance_length=args.rule3_min_utterance_length, 432 rule3_min_utterance_length=args.rule3_min_utterance_length,
411 provider=args.provider, 433 provider=args.provider,
  434 + modeling_unit=args.modeling_unit,
  435 + bpe_vocab=args.bpe_vocab
412 ) 436 )
413 elif args.paraformer_encoder: 437 elif args.paraformer_encoder:
414 recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer( 438 recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(