Wei Kang
Committed by GitHub

Refactor hotwords, support loading hotwords from file (#296)

Showing 34 changed files with 800 additions and 297 deletions
... ... @@ -166,3 +166,8 @@ python3 ./python-api-examples/offline-decode-files.py \
python3 sherpa-onnx/python/tests/test_offline_recognizer.py --verbose
rm -rf $repo
# test text2token
git clone https://github.com/pkufool/sherpa-test-data /tmp/sherpa-test-data
python3 sherpa-onnx/python/tests/test_text2token.py --verbose
... ...
... ... @@ -39,7 +39,7 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
python3 -m pip install --upgrade pip numpy
python3 -m pip install --upgrade pip numpy sentencepiece
- name: Install sherpa-onnx
shell: bash
... ...
... ... @@ -39,7 +39,7 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
python3 -m pip install --upgrade pip numpy
python3 -m pip install --upgrade pip numpy sentencepiece
- name: Install sherpa-onnx
shell: bash
... ...
... ... @@ -326,6 +326,31 @@ def add_modified_beam_search_args(parser: argparse.ArgumentParser):
)
def add_hotwords_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--hotwords-file",
type=str,
default="",
help="""
The file containing hotwords, one word/phrase per line. Within each
phrase, the bpe/cjkchar tokens are separated by spaces. For example:
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--hotwords-score",
type=float,
default=1.5,
help="""
The bonus score for each token of a hotword, used to bias the
word/phrase. Used only if --hotwords-file is given.
""",
)
def check_args(args):
if not Path(args.tokens).is_file():
raise ValueError(f"{args.tokens} does not exist")
... ... @@ -342,6 +367,10 @@ def check_args(args):
assert Path(args.decoder).is_file(), args.decoder
assert Path(args.joiner).is_file(), args.joiner
if args.hotwords_file != "":
assert args.decoding_method == "modified_beam_search", args.decoding_method
assert Path(args.hotwords_file).is_file(), args.hotwords_file
def get_args():
parser = argparse.ArgumentParser(
... ... @@ -351,6 +380,7 @@ def get_args():
add_model_args(parser)
add_feature_config_args(parser)
add_decoding_args(parser)
add_hotwords_args(parser)
parser.add_argument(
"--port",
... ... @@ -792,6 +822,8 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
feature_dim=args.feat_dim,
decoding_method=args.decoding_method,
max_active_paths=args.max_active_paths,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
elif args.paraformer:
assert len(args.nemo_ctc) == 0, args.nemo_ctc
... ...
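For reference, the hotwords file expected by --hotwords-file holds one phrase per line, with tokens separated by spaces. A minimal sketch for producing one by hand (the file name is a placeholder; the token sequences must match the model's tokens.txt):

hotwords = [
    "▁HE LL O ▁WORLD",  # BPE pieces for "HELLO WORLD"
    "你 好 世 界",  # CJK characters, one token each
]
with open("hotwords.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(hotwords) + "\n")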
... ... @@ -82,7 +82,6 @@ from pathlib import Path
from typing import List, Tuple
import numpy as np
import sentencepiece as spm
import sherpa_onnx
... ... @@ -98,43 +97,25 @@ def get_args():
)
parser.add_argument(
"--bpe-model",
"--hotwords-file",
type=str,
default="",
help="""
Path to bpe.model,
Used only when --decoding-method=modified_beam_search
""",
)
The file containing hotwords, one word/phrase per line. Within each
phrase, the bpe/cjkchar tokens are separated by spaces. For example:
parser.add_argument(
"--modeling-unit",
type=str,
default="char",
help="""
The type of modeling unit.
Valid values are bpe, bpe+char, char.
Note: the char here means characters in CJK languages.
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--contexts",
type=str,
default="",
help="""
The context list, it is a string containing some words/phrases separated
with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
""",
)
parser.add_argument(
"--context-score",
"--hotwords-score",
type=float,
default=1.5,
help="""
The context score of each token for biasing word/phrase. Used only if
--contexts is given.
The bonus score for each token of a hotword, used to bias the
word/phrase. Used only if --hotwords-file is given.
""",
)
... ... @@ -273,25 +254,6 @@ def assert_file_exists(filename: str):
"https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
)
def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
sp = None
if "bpe" in args.modeling_unit:
assert_file_exists(args.bpe_model)
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
tokens = {}
with open(args.tokens, "r", encoding="utf-8") as f:
for line in f:
toks = line.strip().split()
assert len(toks) == 2, len(toks)
assert toks[0] not in tokens, f"Duplicate token: {toks} "
tokens[toks[0]] = int(toks[1])
return sherpa_onnx.encode_contexts(
modeling_unit=args.modeling_unit, contexts=contexts, sp=sp, tokens_table=tokens
)
def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
"""
Args:
... ... @@ -322,7 +284,6 @@ def main():
assert_file_exists(args.tokens)
assert args.num_threads > 0, args.num_threads
contexts_list = []
if args.encoder:
assert len(args.paraformer) == 0, args.paraformer
assert len(args.nemo_ctc) == 0, args.nemo_ctc
... ... @@ -330,11 +291,6 @@ def main():
assert len(args.whisper_decoder) == 0, args.whisper_decoder
assert len(args.tdnn_model) == 0, args.tdnn_model
contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
if contexts:
print(f"Contexts list: {contexts}")
contexts_list = encode_contexts(args, contexts)
assert_file_exists(args.encoder)
assert_file_exists(args.decoder)
assert_file_exists(args.joiner)
... ... @@ -348,7 +304,8 @@ def main():
sample_rate=args.sample_rate,
feature_dim=args.feature_dim,
decoding_method=args.decoding_method,
context_score=args.context_score,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
debug=args.debug,
)
elif args.paraformer:
... ... @@ -425,12 +382,7 @@ def main():
samples, sample_rate = read_wave(wave_filename)
duration = len(samples) / sample_rate
total_duration += duration
if contexts_list:
assert len(args.paraformer) == 0, args.paraformer
assert len(args.nemo_ctc) == 0, args.nemo_ctc
s = recognizer.create_stream(contexts_list=contexts_list)
else:
s = recognizer.create_stream()
s = recognizer.create_stream()
s.accept_waveform(sample_rate, samples)
streams.append(s)
... ...
... ... @@ -48,7 +48,6 @@ from pathlib import Path
from typing import List, Tuple
import numpy as np
import sentencepiece as spm
import sherpa_onnx
... ... @@ -124,46 +123,25 @@ def get_args():
)
parser.add_argument(
"--bpe-model",
"--hotwords-file",
type=str,
default="",
help="""
Path to bpe.model, it will be used to tokenize contexts biasing phrases.
Used only when --decoding-method=modified_beam_search
""",
)
parser.add_argument(
"--modeling-unit",
type=str,
default="char",
help="""
The type of modeling unit, it will be used to tokenize contexts biasing phrases.
Valid values are bpe, bpe+char, char.
Note: the char here means characters in CJK languages.
Used only when --decoding-method=modified_beam_search
""",
)
The file containing hotwords, one word/phrase per line. Within each
phrase, the bpe/cjkchar tokens are separated by spaces. For example:
parser.add_argument(
"--contexts",
type=str,
default="",
help="""
The context list, it is a string containing some words/phrases separated
with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
Used only when --decoding-method=modified_beam_search
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--context-score",
"--hotwords-score",
type=float,
default=1.5,
help="""
The context score of each token for biasing word/phrase. Used only if
--contexts is given.
Used only when --decoding-method=modified_beam_search
The bonus score for each token of a hotword, used to bias the
word/phrase. Used only if --hotwords-file is given.
""",
)
... ... @@ -214,27 +192,6 @@ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
return samples_float32, f.getframerate()
def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
sp = None
if "bpe" in args.modeling_unit:
assert_file_exists(args.bpe_model)
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
tokens = {}
with open(args.tokens, "r", encoding="utf-8") as f:
for line in f:
toks = line.strip().split()
assert len(toks) == 2, len(toks)
assert toks[0] not in tokens, f"Duplicate token: {toks} "
tokens[toks[0]] = int(toks[1])
return sherpa_onnx.encode_contexts(
modeling_unit=args.modeling_unit,
contexts=contexts,
sp=sp,
tokens_table=tokens,
)
def main():
args = get_args()
assert_file_exists(args.tokens)
... ... @@ -258,7 +215,8 @@ def main():
feature_dim=80,
decoding_method=args.decoding_method,
max_active_paths=args.max_active_paths,
context_score=args.context_score,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
elif args.paraformer_encoder:
recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
... ... @@ -277,12 +235,6 @@ def main():
print("Started!")
start_time = time.time()
contexts_list = []
contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
if contexts:
print(f"Contexts list: {contexts}")
contexts_list = encode_contexts(args, contexts)
streams = []
total_duration = 0
for wave_filename in args.sound_files:
... ... @@ -291,10 +243,7 @@ def main():
duration = len(samples) / sample_rate
total_duration += duration
if contexts_list:
s = recognizer.create_stream(contexts_list=contexts_list)
else:
s = recognizer.create_stream()
s = recognizer.create_stream()
s.accept_waveform(sample_rate, samples)
... ...
... ... @@ -79,6 +79,30 @@ def get_args():
help="Valid values: cpu, cuda, coreml",
)
parser.add_argument(
"--hotwords-file",
type=str,
default="",
help="""
The file containing hotwords, one word/phrase per line. Within each
phrase, the bpe/cjkchar tokens are separated by spaces. For example:
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--hotwords-score",
type=float,
default=1.5,
help="""
The bonus score for each token of a hotword, used to bias the
word/phrase. Used only if --hotwords-file is given.
""",
)
return parser.parse_args()
... ... @@ -104,6 +128,8 @@ def create_recognizer(args):
rule3_min_utterance_length=300, # it essentially disables this rule
decoding_method=args.decoding_method,
provider=args.provider,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
return recognizer
... ...
... ... @@ -11,7 +11,6 @@ import sys
from pathlib import Path
from typing import List
import sentencepiece as spm
try:
import sounddevice as sd
... ... @@ -90,49 +89,29 @@ def get_args():
)
parser.add_argument(
"--bpe-model",
"--hotwords-file",
type=str,
default="",
help="""
Path to bpe.model, it will be used to tokenize contexts biasing phrases.
Used only when --decoding-method=modified_beam_search
""",
)
The file containing hotwords, one word/phrase per line. Within each
phrase, the bpe/cjkchar tokens are separated by spaces. For example:
parser.add_argument(
"--modeling-unit",
type=str,
default="char",
help="""
The type of modeling unit, it will be used to tokenize contexts biasing phrases.
Valid values are bpe, bpe+char, char.
Note: the char here means characters in CJK languages.
Used only when --decoding-method=modified_beam_search
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--contexts",
type=str,
default="",
help="""
The context list, it is a string containing some words/phrases separated
with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
Used only when --decoding-method=modified_beam_search
""",
)
parser.add_argument(
"--context-score",
"--hotwords-score",
type=float,
default=1.5,
help="""
The context score of each token for biasing word/phrase. Used only if
--contexts is given.
Used only when --decoding-method=modified_beam_search
The bonus score for each token of a hotword, used to bias the
word/phrase. Used only if --hotwords-file is given.
""",
)
return parser.parse_args()
... ... @@ -155,32 +134,12 @@ def create_recognizer(args):
decoding_method=args.decoding_method,
max_active_paths=args.max_active_paths,
provider=args.provider,
context_score=args.context_score,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
return recognizer
def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
sp = None
if "bpe" in args.modeling_unit:
assert_file_exists(args.bpe_model)
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
tokens = {}
with open(args.tokens, "r", encoding="utf-8") as f:
for line in f:
toks = line.strip().split()
assert len(toks) == 2, len(toks)
assert toks[0] not in tokens, f"Duplicate token: {toks} "
tokens[toks[0]] = int(toks[1])
return sherpa_onnx.encode_contexts(
modeling_unit=args.modeling_unit,
contexts=contexts,
sp=sp,
tokens_table=tokens,
)
def main():
args = get_args()
... ... @@ -193,12 +152,6 @@ def main():
default_input_device_idx = sd.default.device[0]
print(f'Use default device: {devices[default_input_device_idx]["name"]}')
contexts_list = []
contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
if contexts:
print(f"Contexts list: {contexts}")
contexts_list = encode_contexts(args, contexts)
recognizer = create_recognizer(args)
print("Started! Please speak")
... ... @@ -207,10 +160,7 @@ def main():
sample_rate = 48000
samples_per_read = int(0.1 * sample_rate) # 0.1 second = 100 ms
last_result = ""
if contexts_list:
stream = recognizer.create_stream(contexts_list=contexts_list)
else:
stream = recognizer.create_stream()
stream = recognizer.create_stream()
with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
while True:
samples, _ = s.read(samples_per_read) # a blocking read
... ...
... ... @@ -87,6 +87,30 @@ def get_args():
""",
)
parser.add_argument(
"--hotwords-file",
type=str,
default="",
help="""
The file containing hotwords, one word/phrase per line. Within each
phrase, the bpe/cjkchar tokens are separated by spaces. For example:
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--hotwords-score",
type=float,
default=1.5,
help="""
The bonus score for each token of a hotword, used to bias the
word/phrase. Used only if --hotwords-file is given.
""",
)
return parser.parse_args()
... ... @@ -107,6 +131,8 @@ def create_recognizer(args):
rule1_min_trailing_silence=2.4,
rule2_min_trailing_silence=1.2,
rule3_min_utterance_length=300, # it essentially disables this rule
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
return recognizer
... ...
... ... @@ -187,6 +187,32 @@ def add_decoding_args(parser: argparse.ArgumentParser):
add_modified_beam_search_args(parser)
def add_hotwords_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--hotwords-file",
type=str,
default="",
help="""
The file containing hotwords, one word/phrase per line. Within each
phrase, the bpe/cjkchar tokens are separated by spaces. For example:
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--hotwords-score",
type=float,
default=1.5,
help="""
The bonus score for each token of a hotword, used to bias the
word/phrase. Used only if --hotwords-file is given.
""",
)
def add_modified_beam_search_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--num-active-paths",
... ... @@ -239,6 +265,7 @@ def get_args():
add_model_args(parser)
add_decoding_args(parser)
add_endpointing_args(parser)
add_hotwords_args(parser)
parser.add_argument(
"--port",
... ... @@ -343,6 +370,8 @@ def create_recognizer(args) -> sherpa_onnx.OnlineRecognizer:
feature_dim=args.feat_dim,
decoding_method=args.decoding_method,
max_active_paths=args.num_active_paths,
hotwords_score=args.hotwords_score,
hotwords_file=args.hotwords_file,
enable_endpoint_detection=args.use_endpoint != 0,
rule1_min_trailing_silence=args.rule1_min_trailing_silence,
rule2_min_trailing_silence=args.rule2_min_trailing_silence,
... ...
#!/usr/bin/env python3
"""
This script encodes the texts (given line by line through ``text``) as tokens
and writes the results to the file given by ``output``.
Usage:
If the tokens_type is bpe:
python3 ./text2token.py \
--text texts.txt \
--tokens tokens.txt \
--tokens-type bpe \
--bpe-model bpe.model \
--output hotwords.txt
If the tokens_type is cjkchar:
python3 ./text2token.py \
--text texts.txt \
--tokens tokens.txt \
--tokens-type cjkchar \
--output hotwords.txt
If the tokens_type is cjkchar+bpe:
python3 ./text2token.py \
--text texts.txt \
--tokens tokens.txt \
--tokens-type cjkchar+bpe \
--bpe-model bpe.model \
--output hotwords.txt
"""
import argparse
from sherpa_onnx import text2token
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--text",
type=str,
required=True,
help="Path to the input texts",
)
parser.add_argument(
"--tokens",
type=str,
required=True,
help="The path to tokens.txt.",
)
parser.add_argument(
"--tokens-type",
type=str,
required=True,
help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
)
parser.add_argument(
"--bpe-model",
type=str,
help="The path to bpe.model. Only required when tokens-type is bpe or cjkchar+bpe.",
)
parser.add_argument(
"--output",
type=str,
required=True,
help="Path where the encoded tokens will be written to.",
)
return parser.parse_args()
def main():
args = get_args()
texts = []
with open(args.text, "r", encoding="utf8") as f:
for line in f:
texts.append(line.strip())
encoded_texts = text2token(
texts,
tokens=args.tokens,
tokens_type=args.tokens_type,
bpe_model=args.bpe_model,
)
with open(args.output, "w", encoding="utf8") as f:
for txt in encoded_texts:
f.write(" ".join(txt) + "\n")
if __name__ == "__main__":
main()
... ...
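For example, if texts.txt contains the two lines HELLO WORLD and I LOVE YOU, the bpe invocation above would write something like the following to hotwords.txt (assuming an English BPE model such as the one used by the new tests):

▁HE LL O ▁WORLD
▁I ▁LOVE ▁YOU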
... ... @@ -39,6 +39,7 @@ install_requires = [
"numpy",
"sentencepiece==0.1.96; python_version < '3.11'",
"sentencepiece; python_version >= '3.11'",
"click>=7.1.1",
]
... ... @@ -93,6 +94,11 @@ setuptools.setup(
"Programming Language :: Python",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
entry_points={
'console_scripts': [
'sherpa-onnx-cli=sherpa_onnx.cli:cli',
],
},
license="Apache licensed, as found in the LICENSE file",
)
... ...
... ... @@ -72,6 +72,7 @@ set(sources
text-utils.cc
transpose.cc
unbind.cc
utils.cc
wave-reader.cc
)
... ...
... ... @@ -4,11 +4,14 @@
#include "sherpa-onnx/csrc/context-graph.h"
#include <chrono> // NOLINT
#include <map>
#include <random>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
... ... @@ -41,4 +44,29 @@ TEST(ContextGraph, TestBasic) {
}
}
TEST(ContextGraph, Benchmark) {
std::random_device rd;
std::mt19937 mt(rd());
std::uniform_int_distribution<int32_t> char_dist(0, 25);
std::uniform_int_distribution<int32_t> len_dist(3, 8);
for (int32_t num = 10; num <= 10000; num *= 10) {
std::vector<std::vector<int32_t>> contexts;
for (int32_t i = 0; i < num; ++i) {
std::vector<int32_t> tmp;
int32_t word_len = len_dist(mt);
for (int32_t j = 0; j < word_len; ++j) {
tmp.push_back(char_dist(mt));
}
contexts.push_back(std::move(tmp));
}
auto start = std::chrono::high_resolution_clock::now();
auto context_graph = ContextGraph(contexts, 1);
auto stop = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
SHERPA_ONNX_LOGE("Construct context graph for %d item takes %ld us.", num,
duration.count());
}
}
} // namespace sherpa_onnx
... ...
... ... @@ -6,6 +6,7 @@
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_IMPL_H_
#include <memory>
#include <string>
#include <vector>
#if __ANDROID_API__ >= 9
... ... @@ -32,7 +33,7 @@ class OfflineRecognizerImpl {
virtual ~OfflineRecognizerImpl() = default;
virtual std::unique_ptr<OfflineStream> CreateStream(
const std::vector<std::vector<int32_t>> &context_list) const {
const std::string &hotwords) const {
SHERPA_ONNX_LOGE("Only transducer models support contextual biasing.");
exit(-1);
}
... ...
... ... @@ -5,7 +5,9 @@
#ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
#include <fstream>
#include <memory>
#include <regex> // NOLINT
#include <string>
#include <utility>
#include <vector>
... ... @@ -16,6 +18,7 @@
#endif
#include "sherpa-onnx/csrc/context-graph.h"
#include "sherpa-onnx/csrc/log.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer-impl.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
... ... @@ -25,6 +28,7 @@
#include "sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.h"
#include "sherpa-onnx/csrc/pad-sequence.h"
#include "sherpa-onnx/csrc/symbol-table.h"
#include "sherpa-onnx/csrc/utils.h"
namespace sherpa_onnx {
... ... @@ -60,6 +64,9 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl {
: config_(config),
symbol_table_(config_.model_config.tokens),
model_(std::make_unique<OfflineTransducerModel>(config_.model_config)) {
if (!config_.hotwords_file.empty()) {
InitHotwords();
}
if (config_.decoding_method == "greedy_search") {
decoder_ =
std::make_unique<OfflineTransducerGreedySearchDecoder>(model_.get());
... ... @@ -105,17 +112,24 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl {
#endif
std::unique_ptr<OfflineStream> CreateStream(
const std::vector<std::vector<int32_t>> &context_list) const override {
// We create context_graph at this level, because we might have default
// context_graph(will be added later if needed) that belongs to the whole
// model rather than each stream.
const std::string &hotwords) const override {
auto hws = std::regex_replace(hotwords, std::regex("/"), "\n");
std::istringstream is(hws);
std::vector<std::vector<int32_t>> current;
if (!EncodeHotwords(is, symbol_table_, &current)) {
SHERPA_ONNX_LOGE("Encode hotwords failed, skipping, hotwords are : %s",
hotwords.c_str());
}
current.insert(current.end(), hotwords_.begin(), hotwords_.end());
auto context_graph =
std::make_shared<ContextGraph>(context_list, config_.context_score);
std::make_shared<ContextGraph>(current, config_.hotwords_score);
return std::make_unique<OfflineStream>(config_.feat_config, context_graph);
}
std::unique_ptr<OfflineStream> CreateStream() const override {
return std::make_unique<OfflineStream>(config_.feat_config);
return std::make_unique<OfflineStream>(config_.feat_config,
hotwords_graph_);
}
void DecodeStreams(OfflineStream **ss, int32_t n) const override {
... ... @@ -171,9 +185,29 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl {
}
}
void InitHotwords() {
// each line in hotwords_file contains space-separated tokens
std::ifstream is(config_.hotwords_file);
if (!is) {
SHERPA_ONNX_LOGE("Open hotwords file failed: %s",
config_.hotwords_file.c_str());
exit(-1);
}
if (!EncodeHotwords(is, symbol_table_, &hotwords_)) {
SHERPA_ONNX_LOGE("Encode hotwords failed.");
exit(-1);
}
hotwords_graph_ =
std::make_shared<ContextGraph>(hotwords_, config_.hotwords_score);
}
private:
OfflineRecognizerConfig config_;
SymbolTable symbol_table_;
std::vector<std::vector<int32_t>> hotwords_;
ContextGraphPtr hotwords_graph_;
std::unique_ptr<OfflineTransducerModel> model_;
std::unique_ptr<OfflineTransducerDecoder> decoder_;
std::unique_ptr<OfflineLM> lm_;
... ...
... ... @@ -26,7 +26,15 @@ void OfflineRecognizerConfig::Register(ParseOptions *po) {
po->Register("max-active-paths", &max_active_paths,
"Used only when decoding_method is modified_beam_search");
po->Register("context-score", &context_score,
po->Register(
"hotwords-file", &hotwords_file,
"The file containing hotwords, one words/phrases per line, and for each"
"phrase the bpe/cjkchar are separated by a space. For example: "
"▁HE LL O ▁WORLD"
"你 好 世 界");
po->Register("hotwords-score", &hotwords_score,
"The bonus score for each token in context word/phrase. "
"Used only when decoding_method is modified_beam_search");
}
... ... @@ -53,7 +61,8 @@ std::string OfflineRecognizerConfig::ToString() const {
os << "lm_config=" << lm_config.ToString() << ", ";
os << "decoding_method=\"" << decoding_method << "\", ";
os << "max_active_paths=" << max_active_paths << ", ";
os << "context_score=" << context_score << ")";
os << "hotwords_file=\"" << hotwords_file << "\", ";
os << "hotwords_score=" << hotwords_score << ")";
return os.str();
}
... ... @@ -70,8 +79,8 @@ OfflineRecognizer::OfflineRecognizer(const OfflineRecognizerConfig &config)
OfflineRecognizer::~OfflineRecognizer() = default;
std::unique_ptr<OfflineStream> OfflineRecognizer::CreateStream(
const std::vector<std::vector<int32_t>> &context_list) const {
return impl_->CreateStream(context_list);
const std::string &hotwords) const {
return impl_->CreateStream(hotwords);
}
std::unique_ptr<OfflineStream> OfflineRecognizer::CreateStream() const {
... ...
... ... @@ -31,7 +31,10 @@ struct OfflineRecognizerConfig {
std::string decoding_method = "greedy_search";
int32_t max_active_paths = 4;
float context_score = 1.5;
std::string hotwords_file;
float hotwords_score = 1.5;
// only greedy_search is implemented
// TODO(fangjun): Implement modified_beam_search
... ... @@ -40,13 +43,16 @@ struct OfflineRecognizerConfig {
const OfflineModelConfig &model_config,
const OfflineLMConfig &lm_config,
const std::string &decoding_method,
int32_t max_active_paths, float context_score)
int32_t max_active_paths,
const std::string &hotwords_file,
float hotwords_score)
: feat_config(feat_config),
model_config(model_config),
lm_config(lm_config),
decoding_method(decoding_method),
max_active_paths(max_active_paths),
context_score(context_score) {}
hotwords_file(hotwords_file),
hotwords_score(hotwords_score) {}
void Register(ParseOptions *po);
bool Validate() const;
... ... @@ -69,9 +75,17 @@ class OfflineRecognizer {
/// Create a stream for decoding.
std::unique_ptr<OfflineStream> CreateStream() const;
/// Create a stream for decoding.
/** Create a stream for decoding.
*
* @param hotwords The hotwords for this stream. It may contain several
*                 hotwords separated by "/". Within each hotword, the
*                 tokens (cjkchars or bpe pieces) are separated by
*                 spaces (" "). For example, the hotwords I LOVE YOU
*                 and HELLO WORLD look like:
*
*                 "▁I ▁LOVE ▁YOU/▁HE LL O ▁WORLD"
*/
std::unique_ptr<OfflineStream> CreateStream(
const std::vector<std::vector<int32_t>> &context_list) const;
const std::string &hotwords) const;
/** Decode a single stream
*
... ...
... ... @@ -6,6 +6,7 @@
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_IMPL_H_
#include <memory>
#include <string>
#include <vector>
#include "sherpa-onnx/csrc/macros.h"
... ... @@ -29,7 +30,7 @@ class OnlineRecognizerImpl {
virtual std::unique_ptr<OnlineStream> CreateStream() const = 0;
virtual std::unique_ptr<OnlineStream> CreateStream(
const std::vector<std::vector<int32_t>> &contexts) const {
const std::string &hotwords) const {
SHERPA_ONNX_LOGE("Only transducer models support contextual biasing.");
exit(-1);
}
... ...
... ... @@ -7,6 +7,8 @@
#include <algorithm>
#include <memory>
#include <regex> // NOLINT
#include <string>
#include <utility>
#include <vector>
... ... @@ -20,6 +22,7 @@
#include "sherpa-onnx/csrc/online-transducer-model.h"
#include "sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.h"
#include "sherpa-onnx/csrc/symbol-table.h"
#include "sherpa-onnx/csrc/utils.h"
namespace sherpa_onnx {
... ... @@ -57,6 +60,9 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
model_(OnlineTransducerModel::Create(config.model_config)),
sym_(config.model_config.tokens),
endpoint_(config_.endpoint_config) {
if (!config_.hotwords_file.empty()) {
InitHotwords();
}
if (sym_.contains("<unk>")) {
unk_id_ = sym_["<unk>"];
}
... ... @@ -106,18 +112,24 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
#endif
std::unique_ptr<OnlineStream> CreateStream() const override {
auto stream = std::make_unique<OnlineStream>(config_.feat_config);
auto stream =
std::make_unique<OnlineStream>(config_.feat_config, hotwords_graph_);
InitOnlineStream(stream.get());
return stream;
}
std::unique_ptr<OnlineStream> CreateStream(
const std::vector<std::vector<int32_t>> &contexts) const override {
// We create context_graph at this level, because we might have default
// context_graph(will be added later if needed) that belongs to the whole
// model rather than each stream.
const std::string &hotwords) const override {
auto hws = std::regex_replace(hotwords, std::regex("/"), "\n");
std::istringstream is(hws);
std::vector<std::vector<int32_t>> current;
if (!EncodeHotwords(is, sym_, &current)) {
SHERPA_ONNX_LOGE("Encode hotwords failed, skipping, hotwords are : %s",
hotwords.c_str());
}
current.insert(current.end(), hotwords_.begin(), hotwords_.end());
auto context_graph =
std::make_shared<ContextGraph>(contexts, config_.context_score);
std::make_shared<ContextGraph>(current, config_.hotwords_score);
auto stream =
std::make_unique<OnlineStream>(config_.feat_config, context_graph);
InitOnlineStream(stream.get());
... ... @@ -253,6 +265,24 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
s->Reset();
}
void InitHotwords() {
// each line in hotwords_file contains space-separated tokens
std::ifstream is(config_.hotwords_file);
if (!is) {
SHERPA_ONNX_LOGE("Open hotwords file failed: %s",
config_.hotwords_file.c_str());
exit(-1);
}
if (!EncodeHotwords(is, sym_, &hotwords_)) {
SHERPA_ONNX_LOGE("Encode hotwords failed.");
exit(-1);
}
hotwords_graph_ =
std::make_shared<ContextGraph>(hotwords_, config_.hotwords_score);
}
private:
void InitOnlineStream(OnlineStream *stream) const {
auto r = decoder_->GetEmptyResult();
... ... @@ -271,6 +301,8 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
private:
OnlineRecognizerConfig config_;
std::vector<std::vector<int32_t>> hotwords_;
ContextGraphPtr hotwords_graph_;
std::unique_ptr<OnlineTransducerModel> model_;
std::unique_ptr<OnlineLM> lm_;
std::unique_ptr<OnlineTransducerDecoder> decoder_;
... ...
... ... @@ -57,9 +57,15 @@ void OnlineRecognizerConfig::Register(ParseOptions *po) {
"True to enable endpoint detection. False to disable it.");
po->Register("max-active-paths", &max_active_paths,
"beam size used in modified beam search.");
po->Register("context-score", &context_score,
po->Register("hotwords-score", &hotwords_score,
"The bonus score for each token in context word/phrase. "
"Used only when decoding_method is modified_beam_search");
po->Register(
"hotwords-file", &hotwords_file,
"The file containing hotwords, one words/phrases per line, and for each"
"phrase the bpe/cjkchar are separated by a space. For example: "
"▁HE LL O ▁WORLD"
"你 好 世 界");
po->Register("decoding-method", &decoding_method,
"decoding method,"
"now support greedy_search and modified_beam_search.");
... ... @@ -87,7 +93,8 @@ std::string OnlineRecognizerConfig::ToString() const {
os << "endpoint_config=" << endpoint_config.ToString() << ", ";
os << "enable_endpoint=" << (enable_endpoint ? "True" : "False") << ", ";
os << "max_active_paths=" << max_active_paths << ", ";
os << "context_score=" << context_score << ", ";
os << "hotwords_score=" << hotwords_score << ", ";
os << "hotwords_file=\"" << hotwords_file << "\", ";
os << "decoding_method=\"" << decoding_method << "\")";
return os.str();
... ... @@ -109,8 +116,8 @@ std::unique_ptr<OnlineStream> OnlineRecognizer::CreateStream() const {
}
std::unique_ptr<OnlineStream> OnlineRecognizer::CreateStream(
const std::vector<std::vector<int32_t>> &context_list) const {
return impl_->CreateStream(context_list);
const std::string &hotwords) const {
return impl_->CreateStream(hotwords);
}
bool OnlineRecognizer::IsReady(OnlineStream *s) const {
... ...
... ... @@ -78,8 +78,10 @@ struct OnlineRecognizerConfig {
// used only for modified_beam_search
int32_t max_active_paths = 4;
/// used only for modified_beam_search
float context_score = 1.5;
float hotwords_score = 1.5;
std::string hotwords_file;
OnlineRecognizerConfig() = default;
... ... @@ -89,14 +91,16 @@ struct OnlineRecognizerConfig {
const EndpointConfig &endpoint_config,
bool enable_endpoint,
const std::string &decoding_method,
int32_t max_active_paths, float context_score)
int32_t max_active_paths,
const std::string &hotwords_file, float hotwords_score)
: feat_config(feat_config),
model_config(model_config),
endpoint_config(endpoint_config),
enable_endpoint(enable_endpoint),
decoding_method(decoding_method),
max_active_paths(max_active_paths),
context_score(context_score) {}
hotwords_score(hotwords_score),
hotwords_file(hotwords_file) {}
void Register(ParseOptions *po);
bool Validate() const;
... ... @@ -119,9 +123,16 @@ class OnlineRecognizer {
/// Create a stream for decoding.
std::unique_ptr<OnlineStream> CreateStream() const;
// Create a stream with context phrases
std::unique_ptr<OnlineStream> CreateStream(
const std::vector<std::vector<int32_t>> &context_list) const;
/** Create a stream for decoding.
*
* @param hotwords The hotwords for this stream. It may contain several
*                 hotwords separated by "/". Within each hotword, the
*                 tokens (cjkchars or bpe pieces) are separated by
*                 spaces (" "). For example, the hotwords I LOVE YOU
*                 and HELLO WORLD look like:
*
*                 "▁I ▁LOVE ▁YOU/▁HE LL O ▁WORLD"
*/
std::unique_ptr<OnlineStream> CreateStream(const std::string &hotwords) const;
/**
* Return true if the given stream has enough frames for decoding.
... ...
// sherpa-onnx/csrc/utils.cc
//
// Copyright 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/utils.h"
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "sherpa-onnx/csrc/log.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
bool EncodeHotwords(std::istream &is, const SymbolTable &symbol_table,
std::vector<std::vector<int32_t>> *hotwords) {
hotwords->clear();
std::string line;
std::string word;
while (std::getline(is, line)) {
// Declared inside the loop so that each hotword starts from an empty vector.
std::vector<int32_t> tmp;
std::istringstream iss(line);
std::vector<std::string> syms;
while (iss >> word) {
if (word.size() >= 3) {
// For BPE-based models, we replace ▁ with a space
// Unicode 9601, hex 0x2581, utf8 0xe29681
const uint8_t *p = reinterpret_cast<const uint8_t *>(word.c_str());
if (p[0] == 0xe2 && p[1] == 0x96 && p[2] == 0x81) {
word = word.replace(0, 3, " ");
}
}
if (symbol_table.contains(word)) {
int32_t number = symbol_table[word];
tmp.push_back(number);
} else {
SHERPA_ONNX_LOGE(
"Cannot find ID for hotword %s at line: %s. (Hint: words on "
"the "
"same line are separated by spaces)",
word.c_str(), line.c_str());
return false;
}
}
hotwords->push_back(std::move(tmp));
}
return true;
}
} // namespace sherpa_onnx
... ...
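The encoding logic above is compact enough to mirror in Python; the following sketch is an illustration of the C++ code, not part of the commit (it returns None where the C++ version logs the offending line and returns false):

from typing import Dict, List, Optional

def encode_hotwords(
    lines: List[str], symbol_table: Dict[str, int]
) -> Optional[List[List[int]]]:
    """Map each line of space-separated tokens to a list of token ids."""
    hotwords = []
    for line in lines:
        ids = []
        for word in line.split():
            # For BPE-based models a leading ▁ (U+2581) marks a word boundary;
            # like the C++ code, replace it with a space before the lookup.
            if word.startswith("\u2581"):
                word = " " + word[1:]
            if word not in symbol_table:
                return None  # unknown token
            ids.append(symbol_table[word])
        hotwords.append(ids)
    return hotwords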
// sherpa-onnx/csrc/utils.h
//
// Copyright 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_UTILS_H_
#define SHERPA_ONNX_CSRC_UTILS_H_
#include <string>
#include <vector>
#include "sherpa-onnx/csrc/symbol-table.h"
namespace sherpa_onnx {
/* Encode the hotwords from an input stream into token ids.
*
* @param is The input stream; it contains several lines, one hotword per
* line. For each hotword, the tokens (cjkchar or bpe) are separated
* by spaces.
* @param symbol_table The symbol table mapping symbols to ids. All the symbols
* in the stream should be in the symbol_table; if not, this
* function returns false.
*
* @param hotwords The vector to which the encoded token ids are written.
*
* @return Return true if all the symbols from ``is`` are in the symbol_table,
* otherwise return false.
*/
bool EncodeHotwords(std::istream &is, const SymbolTable &symbol_table,
std::vector<std::vector<int32_t>> *hotwords);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_UTILS_H_
... ...
... ... @@ -16,17 +16,19 @@ static void PybindOfflineRecognizerConfig(py::module *m) {
py::class_<PyClass>(*m, "OfflineRecognizerConfig")
.def(py::init<const OfflineFeatureExtractorConfig &,
const OfflineModelConfig &, const OfflineLMConfig &,
const std::string &, int32_t, float>(),
const std::string &, int32_t, const std::string &, float>(),
py::arg("feat_config"), py::arg("model_config"),
py::arg("lm_config") = OfflineLMConfig(),
py::arg("decoding_method") = "greedy_search",
py::arg("max_active_paths") = 4, py::arg("context_score") = 1.5)
py::arg("max_active_paths") = 4, py::arg("hotwords_file") = "",
py::arg("hotwords_score") = 1.5)
.def_readwrite("feat_config", &PyClass::feat_config)
.def_readwrite("model_config", &PyClass::model_config)
.def_readwrite("lm_config", &PyClass::lm_config)
.def_readwrite("decoding_method", &PyClass::decoding_method)
.def_readwrite("max_active_paths", &PyClass::max_active_paths)
.def_readwrite("context_score", &PyClass::context_score)
.def_readwrite("hotwords_file", &PyClass::hotwords_file)
.def_readwrite("hotwords_score", &PyClass::hotwords_score)
.def("__str__", &PyClass::ToString);
}
... ... @@ -40,11 +42,10 @@ void PybindOfflineRecognizer(py::module *m) {
[](const PyClass &self) { return self.CreateStream(); })
.def(
"create_stream",
[](PyClass &self,
const std::vector<std::vector<int32_t>> &contexts_list) {
return self.CreateStream(contexts_list);
[](PyClass &self, const std::string &hotwords) {
return self.CreateStream(hotwords);
},
py::arg("contexts_list"))
py::arg("hotwords"))
.def("decode_stream", &PyClass::DecodeStream)
.def("decode_streams",
[](const PyClass &self, std::vector<OfflineStream *> ss) {
... ...
... ... @@ -21,8 +21,8 @@ void PybindOnlineModelConfig(py::module *m) {
using PyClass = OnlineModelConfig;
py::class_<PyClass>(*m, "OnlineModelConfig")
.def(py::init<const OnlineTransducerModelConfig &,
const OnlineParaformerModelConfig &, std::string &, int32_t,
bool, const std::string &, const std::string &>(),
const OnlineParaformerModelConfig &, const std::string &,
int32_t, bool, const std::string &, const std::string &>(),
py::arg("transducer") = OnlineTransducerModelConfig(),
py::arg("paraformer") = OnlineParaformerModelConfig(),
py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false,
... ...
... ... @@ -29,18 +29,20 @@ static void PybindOnlineRecognizerConfig(py::module *m) {
py::class_<PyClass>(*m, "OnlineRecognizerConfig")
.def(py::init<const FeatureExtractorConfig &, const OnlineModelConfig &,
const OnlineLMConfig &, const EndpointConfig &, bool,
const std::string &, int32_t, float>(),
const std::string &, int32_t, const std::string &, float>(),
py::arg("feat_config"), py::arg("model_config"),
py::arg("lm_config") = OnlineLMConfig(), py::arg("endpoint_config"),
py::arg("enable_endpoint"), py::arg("decoding_method"),
py::arg("max_active_paths") = 4, py::arg("context_score") = 0)
py::arg("max_active_paths") = 4, py::arg("hotwords_file") = "",
py::arg("hotwords_score") = 0)
.def_readwrite("feat_config", &PyClass::feat_config)
.def_readwrite("model_config", &PyClass::model_config)
.def_readwrite("endpoint_config", &PyClass::endpoint_config)
.def_readwrite("enable_endpoint", &PyClass::enable_endpoint)
.def_readwrite("decoding_method", &PyClass::decoding_method)
.def_readwrite("max_active_paths", &PyClass::max_active_paths)
.def_readwrite("context_score", &PyClass::context_score)
.def_readwrite("hotwords_file", &PyClass::hotwords_file)
.def_readwrite("hotwords_score", &PyClass::hotwords_score)
.def("__str__", &PyClass::ToString);
}
... ... @@ -55,11 +57,10 @@ void PybindOnlineRecognizer(py::module *m) {
[](const PyClass &self) { return self.CreateStream(); })
.def(
"create_stream",
[](PyClass &self,
const std::vector<std::vector<int32_t>> &contexts_list) {
return self.CreateStream(contexts_list);
[](PyClass &self, const std::string &hotwords) {
return self.CreateStream(hotwords);
},
py::arg("contexts_list"))
py::arg("hotwords"))
.def("is_ready", &PyClass::IsReady)
.def("decode_stream", &PyClass::DecodeStream)
.def("decode_streams",
... ...
... ... @@ -4,4 +4,4 @@ from _sherpa_onnx import Display, OfflineStream, OnlineStream
from .offline_recognizer import OfflineRecognizer
from .online_recognizer import OnlineRecognizer
from .utils import encode_contexts
from .utils import text2token
... ...
# Copyright (c) 2023 Xiaomi Corporation
import logging
from pathlib import Path
import click
from sherpa_onnx import text2token
@click.group()
def cli():
"""
The shell entry point to sherpa-onnx.
"""
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
level=logging.INFO,
)
@cli.command(name="text2token")
@click.argument("input", type=click.Path(exists=True, dir_okay=False))
@click.argument("output", type=click.Path())
@click.option(
"--tokens",
type=str,
required=True,
help="The path to tokens.txt.",
)
@click.option(
"--tokens-type",
type=str,
required=True,
help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
)
@click.option(
"--bpe-model",
type=str,
help="The path to bpe.model. Only required when tokens-type is bpe or cjkchar+bpe.",
)
def encode_text(
input: Path, output: Path, tokens: Path, tokens_type: str, bpe_model: Path
):
"""
Encode the texts given by the INPUT to tokens and write the results to the OUTPUT.
"""
texts = []
with open(input, "r", encoding="utf8") as f:
for line in f:
texts.append(line.strip())
encoded_texts = text2token(
texts, tokens=tokens, tokens_type=tokens_type, bpe_model=bpe_model
)
with open(output, "w", encoding="utf8") as f:
for txt in encoded_texts:
f.write(" ".join(txt) + "\n")
... ...
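The new click command group can be exercised without installing the console script, for instance via click's test runner; a sketch, where the file paths are placeholders and input.txt must exist because click checks it:

from click.testing import CliRunner
from sherpa_onnx.cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "text2token",
        "--tokens", "tokens.txt",  # placeholder path
        "--tokens-type", "bpe",
        "--bpe-model", "bpe.model",  # placeholder path
        "input.txt",  # must exist
        "output.txt",
    ],
)
assert result.exit_code == 0, result.output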
... ... @@ -43,7 +43,8 @@ class OfflineRecognizer(object):
feature_dim: int = 80,
decoding_method: str = "greedy_search",
max_active_paths: int = 4,
context_score: float = 1.5,
hotwords_file: str = "",
hotwords_score: float = 1.5,
debug: bool = False,
provider: str = "cpu",
):
... ... @@ -105,7 +106,8 @@ class OfflineRecognizer(object):
feat_config=feat_config,
model_config=model_config,
decoding_method=decoding_method,
context_score=context_score,
hotwords_file=hotwords_file,
hotwords_score=hotwords_score,
)
self.recognizer = _Recognizer(recognizer_config)
self.config = recognizer_config
... ... @@ -379,11 +381,11 @@ class OfflineRecognizer(object):
self.config = recognizer_config
return self
def create_stream(self, contexts_list: Optional[List[List[int]]] = None):
if contexts_list is None:
def create_stream(self, hotwords: Optional[str] = None):
if hotwords is None:
return self.recognizer.create_stream()
else:
return self.recognizer.create_stream(contexts_list)
return self.recognizer.create_stream(hotwords)
def decode_stream(self, s: OfflineStream):
self.recognizer.decode_stream(s)
... ...
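Putting the pieces together, the new offline Python API can presumably be used as follows (a sketch: model file paths are placeholders, and the keyword names follow the diff above and the existing from_transducer factory):

import sherpa_onnx

recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder="encoder.onnx",  # placeholder paths
    decoder="decoder.onnx",
    joiner="joiner.onnx",
    tokens="tokens.txt",
    decoding_method="modified_beam_search",  # required for hotwords
    hotwords_file="hotwords.txt",  # recognizer-level hotwords (optional)
    hotwords_score=1.5,
)
# Per-stream hotwords: phrases separated by "/", tokens by spaces.
s = recognizer.create_stream(hotwords="▁HE LL O ▁WORLD/▁I ▁LOVE ▁YOU")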
... ... @@ -42,7 +42,8 @@ class OnlineRecognizer(object):
rule3_min_utterance_length: float = 20.0,
decoding_method: str = "greedy_search",
max_active_paths: int = 4,
context_score: float = 1.5,
hotwords_score: float = 1.5,
hotwords_file: str = "",
provider: str = "cpu",
model_type: str = "",
):
... ... @@ -138,7 +139,8 @@ class OnlineRecognizer(object):
enable_endpoint=enable_endpoint_detection,
decoding_method=decoding_method,
max_active_paths=max_active_paths,
context_score=context_score,
hotwords_score=hotwords_score,
hotwords_file=hotwords_file,
)
self.recognizer = _Recognizer(recognizer_config)
... ... @@ -248,11 +250,11 @@ class OnlineRecognizer(object):
self.config = recognizer_config
return self
def create_stream(self, contexts_list: Optional[List[List[int]]] = None):
if contexts_list is None:
def create_stream(self, hotwords: Optional[str] = None):
if hotwords is None:
return self.recognizer.create_stream()
else:
return self.recognizer.create_stream(contexts_list)
return self.recognizer.create_stream(hotwords)
def decode_stream(self, s: OnlineStream):
self.recognizer.decode_stream(s)
... ...
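The streaming recognizer mirrors this; again a sketch with placeholder model paths:

import sherpa_onnx

recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
    tokens="tokens.txt",  # placeholder paths
    encoder="encoder.onnx",
    decoder="decoder.onnx",
    joiner="joiner.onnx",
    decoding_method="modified_beam_search",
    hotwords_file="hotwords.txt",
    hotwords_score=1.5,
)
stream = recognizer.create_stream()  # uses the recognizer-level hotwords graph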
from typing import Dict, List, Optional
# Copyright (c) 2023 Xiaomi Corporation
import re
from pathlib import Path
from typing import List, Optional, Union
def encode_contexts(
modeling_unit: str,
contexts: List[str],
sp: Optional["SentencePieceProcessor"] = None,
tokens_table: Optional[Dict[str, int]] = None,
) -> List[List[int]]:
import sentencepiece as spm
def text2token(
texts: List[str],
tokens: str,
tokens_type: str = "cjkchar",
bpe_model: Optional[str] = None,
output_ids: bool = False,
) -> List[List[Union[str, int]]]:
"""
Encode the given contexts (a list of string) to a list of a list of token ids.
Encode the given texts (a list of string) to a list of a list of tokens.
Args:
modeling_unit:
The valid values are bpe, char, bpe+char.
Note: char here means characters in CJK languages, not English like languages.
contexts:
texts:
The given texts (a list of strings).
sp:
An instance of SentencePieceProcessor.
tokens_table:
The tokens_table containing the tokens and the corresponding ids.
tokens:
The path of the tokens.txt.
tokens_type:
The valid values are cjkchar, bpe, cjkchar+bpe.
bpe_model:
The path of the bpe model. Only required when tokens_type is bpe or
cjkchar+bpe.
output_ids:
True to output token ids otherwise tokens.
Returns:
Return the contexts_list, it is a list of a list of token ids.
Return the encoded texts: a list of lists of token ids if output_ids
is True, otherwise a list of lists of tokens.
"""
contexts_list = []
if "bpe" in modeling_unit:
assert sp is not None
if "char" in modeling_unit:
assert tokens_table is not None
assert len(tokens_table) > 0, len(tokens_table)
assert Path(tokens).is_file(), f"File does not exist: {tokens}"
tokens_table = {}
with open(tokens, "r", encoding="utf-8") as f:
for line in f:
toks = line.strip().split()
assert len(toks) == 2, len(toks)
assert toks[0] not in tokens_table, f"Duplicate token: {toks} "
tokens_table[toks[0]] = int(toks[1])
if "char" == modeling_unit:
for context in contexts:
assert ' ' not in context
ids = [
tokens_table[txt] if txt in tokens_table else tokens_table["<unk>"]
for txt in context
]
contexts_list.append(ids)
elif "bpe" == modeling_unit:
contexts_list = sp.encode(contexts, out_type=int)
else:
assert modeling_unit == "bpe+char", modeling_unit
if "bpe" in tokens_type:
assert Path(bpe_model).is_file(), f"File does not exist: {bpe_model}"
sp = spm.SentencePieceProcessor()
sp.load(bpe_model)
texts_list: List[List[str]] = []
if tokens_type == "cjkchar":
texts_list = [list("".join(text.split())) for text in texts]
elif tokens_type == "bpe":
texts_list = sp.encode(texts, out_type=str)
else:
assert (
tokens_type == "cjkchar+bpe"
), f"Supported tokens_type are cjkchar, bpe, cjkchar+bpe, given {tokens_type}"
# CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
pattern = re.compile(r"([\u4e00-\u9fff])")
for context in contexts:
for text in texts:
# Example:
# txt = "你好 IT'S OKAY 的"
# chars = ["你", "好", " IT'S OKAY ", "的"]
chars = pattern.split(context.upper())
chars = pattern.split(text)
mix_chars = [w for w in chars if len(w.strip()) > 0]
ids = []
text_list = []
for ch_or_w in mix_chars:
# ch_or_w is a single CJK character (e.g., "你"); do nothing.
if pattern.fullmatch(ch_or_w) is not None:
ids.append(
tokens_table[ch_or_w]
if ch_or_w in tokens_table
else tokens_table["<unk>"]
)
text_list.append(ch_or_w)
# ch_or_w contains non-CJK characters (e.g., " IT'S OKAY "),
# so encode it with the bpe model.
else:
for p in sp.encode_as_pieces(ch_or_w):
ids.append(
tokens_table[p]
if p in tokens_table
else tokens_table["<unk>"]
)
contexts_list.append(ids)
return contexts_list
text_list += sp.encode_as_pieces(ch_or_w)
texts_list.append(text_list)
result: List[List[Union[int, str]]] = []
for text in texts_list:
text_list = []
contain_oov = False
for txt in text:
if txt in tokens_table:
text_list.append(tokens_table[txt] if output_ids else txt)
else:
print(f"OOV token : {txt}, skipping text : {text}.")
contain_oov = True
break
if contain_oov:
continue
else:
result.append(text_list)
return result
... ...
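A quick sanity check of text2token, matching the behaviour encoded in the new unit test below (paths are placeholders for files from a real model directory):

from sherpa_onnx import text2token

encoded = text2token(
    ["HELLO WORLD", "I LOVE YOU"],
    tokens="tokens.txt",  # placeholder path
    tokens_type="bpe",
    bpe_model="bpe.model",  # placeholder path
)
# With an English BPE model this yields something like:
# [["▁HE", "LL", "O", "▁WORLD"], ["▁I", "▁LOVE", "▁YOU"]]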
... ... @@ -6,12 +6,14 @@ function(sherpa_onnx_add_py_test source)
COMMAND
"${PYTHON_EXECUTABLE}"
"${CMAKE_CURRENT_SOURCE_DIR}/${source}"
WORKING_DIRECTORY
${CMAKE_CURRENT_SOURCE_DIR}
)
get_filename_component(sherpa_onnx_path ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
set_property(TEST ${name}
PROPERTY ENVIRONMENT "PYTHONPATH=${sherpa_path}:$<TARGET_FILE_DIR:_sherpa_onnx>:$ENV{PYTHONPATH}"
PROPERTY ENVIRONMENT "PYTHONPATH=${sherpa_onnx_path}:$<TARGET_FILE_DIR:_sherpa_onnx>:$ENV{PYTHONPATH}"
)
endfunction()
... ... @@ -21,6 +23,7 @@ set(py_test_files
test_offline_recognizer.py
test_online_recognizer.py
test_online_transducer_model_config.py
test_text2token.py
)
foreach(source IN LISTS py_test_files)
... ...
# sherpa-onnx/python/tests/test_text2token.py
#
# Copyright (c) 2023 Xiaomi Corporation
#
# To run this single test, use
#
# ctest --verbose -R test_text2token_py
import unittest
from pathlib import Path
import sherpa_onnx
d = "/tmp/sherpa-test-data"
# Please refer to
# https://github.com/pkufool/sherpa-test-data
# to download test data for testing
class TestText2Token(unittest.TestCase):
def test_bpe(self):
tokens = f"{d}/text2token/tokens_en.txt"
bpe_model = f"{d}/text2token/bpe_en.model"
if not Path(tokens).is_file() or not Path(bpe_model).is_file():
print(
f"No test data found, skipping test_bpe().\n"
f"You can download the test data by: \n"
f"git clone https://github.com/pkufool/sherpa-test-data.git /tmp/sherpa-test-data"
)
return
texts = ["HELLO WORLD", "I LOVE YOU"]
encoded_texts = sherpa_onnx.text2token(
texts,
tokens=tokens,
tokens_type="bpe",
bpe_model=bpe_model,
)
assert encoded_texts == [
["▁HE", "LL", "O", "▁WORLD"],
["▁I", "▁LOVE", "▁YOU"],
], encoded_texts
encoded_ids = sherpa_onnx.text2token(
texts,
tokens=tokens,
tokens_type="bpe",
bpe_model=bpe_model,
output_ids=True,
)
assert encoded_ids == [[22, 58, 24, 425], [19, 370, 47]], encoded_ids
def test_cjkchar(self):
tokens = f"{d}/text2token/tokens_cn.txt"
if not Path(tokens).is_file():
print(
f"No test data found, skipping test_cjkchar().\n"
f"You can download the test data by: \n"
f"git clone https://github.com/pkufool/sherpa-test-data.git /tmp/sherpa-test-data"
)
return
texts = ["世界人民大团结", "中国 VS 美国"]
encoded_texts = sherpa_onnx.text2token(
texts, tokens=tokens, tokens_type="cjkchar"
)
assert encoded_texts == [
["世", "界", "人", "民", "大", "团", "结"],
["中", "国", "V", "S", "美", "国"],
], encoded_texts
encoded_ids = sherpa_onnx.text2token(
texts,
tokens=tokens,
tokens_type="cjkchar",
output_ids=True,
)
assert encoded_ids == [
[379, 380, 72, 874, 93, 1251, 489],
[262, 147, 3423, 2476, 21, 147],
], encoded_ids
def test_cjkchar_bpe(self):
tokens = f"{d}/text2token/tokens_mix.txt"
bpe_model = f"{d}/text2token/bpe_mix.model"
if not Path(tokens).is_file() or not Path(bpe_model).is_file():
print(
f"No test data found, skipping test_cjkchar_bpe().\n"
f"You can download the test data by: \n"
f"git clone https://github.com/pkufool/sherpa-test-data.git /tmp/sherpa-test-data"
)
return
texts = ["世界人民 GOES TOGETHER", "中国 GOES WITH 美国"]
encoded_texts = sherpa_onnx.text2token(
texts,
tokens=tokens,
tokens_type="cjkchar+bpe",
bpe_model=bpe_model,
)
assert encoded_texts == [
["世", "界", "人", "民", "▁GO", "ES", "▁TOGETHER"],
["中", "国", "▁GO", "ES", "▁WITH", "美", "国"],
], encoded_texts
encoded_ids = sherpa_onnx.text2token(
texts,
tokens=tokens,
tokens_type="cjkchar+bpe",
bpe_model=bpe_model,
output_ids=True,
)
assert encoded_ids == [
[1368, 1392, 557, 680, 275, 178, 475],
[685, 736, 275, 178, 179, 921, 736],
], encoded_ids
if __name__ == "__main__":
unittest.main()
... ...