Wei Kang
Committed by GitHub

Fix hotwords OOV log (#1139)

@@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines, @@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines,
62 break; 62 break;
63 default: 63 default:
64 SHERPA_ONNX_LOGE( 64 SHERPA_ONNX_LOGE(
65 - "Cannot find ID for token %s at line: %s. (Hint: words on "  
66 - "the same line are separated by spaces)",  
67 - word.c_str(), line.c_str()); 65 + "Cannot find ID for token %s at line: %s. (Hint: Check the "
  66 + "tokens.txt to see if %s is in it)",
  67 + word.c_str(), line.c_str(), word.c_str());
68 has_oov = true; 68 has_oov = true;
69 break; 69 break;
70 } 70 }
@@ -4,6 +4,7 @@ import re @@ -4,6 +4,7 @@ import re
4 from pathlib import Path 4 from pathlib import Path
5 from typing import List, Optional, Union 5 from typing import List, Optional, Union
6 6
  7 +
7 def text2token( 8 def text2token(
8 texts: List[str], 9 texts: List[str],
9 tokens: str, 10 tokens: str,
@@ -33,20 +34,20 @@ def text2token( @@ -33,20 +34,20 @@ def text2token(
33 is True, or it is a list of list of tokens. 34 is True, or it is a list of list of tokens.
34 """ 35 """
35 try: 36 try:
36 - import sentencepiece as spm 37 + import sentencepiece as spm
37 except ImportError: 38 except ImportError:
38 - print('Please run')  
39 - print(' pip install sentencepiece')  
40 - print('before you continue') 39 + print("Please run")
  40 + print(" pip install sentencepiece")
  41 + print("before you continue")
41 raise 42 raise
42 43
43 try: 44 try:
44 from pypinyin import pinyin 45 from pypinyin import pinyin
45 from pypinyin.contrib.tone_convert import to_initials, to_finals_tone 46 from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
46 except ImportError: 47 except ImportError:
47 - print('Please run')  
48 - print(' pip install pypinyin')  
49 - print('before you continue') 48 + print("Please run")
  49 + print(" pip install pypinyin")
  50 + print("before you continue")
50 raise 51 raise
51 52
52 assert Path(tokens).is_file(), f"File not exists, {tokens}" 53 assert Path(tokens).is_file(), f"File not exists, {tokens}"
@@ -119,7 +120,10 @@ def text2token( @@ -119,7 +120,10 @@ def text2token(
119 if txt in tokens_table: 120 if txt in tokens_table:
120 text_list.append(tokens_table[txt] if output_ids else txt) 121 text_list.append(tokens_table[txt] if output_ids else txt)
121 else: 122 else:
122 - print(f"OOV token : {txt}, skipping text : {text}.") 123 + print(
  124 + f"Can't find token {txt} in the token table; check your "
  125 + f"tokens.txt to see if {txt} is in it. Skipping text: {text}."
  126 + )
123 contain_oov = True 127 contain_oov = True
124 break 128 break
125 if contain_oov: 129 if contain_oov: