Wei Kang
Committed by GitHub

Fix hotwords OOV log (#1139)

@@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines, @@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines,
62 break; 62 break;
63 default: 63 default:
64 SHERPA_ONNX_LOGE( 64 SHERPA_ONNX_LOGE(
65 - "Cannot find ID for token %s at line: %s. (Hint: words on "  
66 - "the same line are separated by spaces)",  
67 - word.c_str(), line.c_str()); 65 + "Cannot find ID for token %s at line: %s. (Hint: Check the "
  66 + "tokens.txt to see if %s is in it)",
  67 + word.c_str(), line.c_str(), word.c_str());
68 has_oov = true; 68 has_oov = true;
69 break; 69 break;
70 } 70 }
@@ -4,6 +4,7 @@ import re @@ -4,6 +4,7 @@ import re
4 from pathlib import Path 4 from pathlib import Path
5 from typing import List, Optional, Union 5 from typing import List, Optional, Union
6 6
  7 +
7 def text2token( 8 def text2token(
8 texts: List[str], 9 texts: List[str],
9 tokens: str, 10 tokens: str,
@@ -33,20 +34,20 @@ def text2token( @@ -33,20 +34,20 @@ def text2token(
33 is True, or it is a list of list of tokens. 34 is True, or it is a list of list of tokens.
34 """ 35 """
35 try: 36 try:
36 - import sentencepiece as spm 37 + import sentencepiece as spm
37 except ImportError: 38 except ImportError:
38 - print('Please run')  
39 - print(' pip install sentencepiece')  
40 - print('before you continue') 39 + print("Please run")
  40 + print(" pip install sentencepiece")
  41 + print("before you continue")
41 raise 42 raise
42 43
43 try: 44 try:
44 from pypinyin import pinyin 45 from pypinyin import pinyin
45 from pypinyin.contrib.tone_convert import to_initials, to_finals_tone 46 from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
46 except ImportError: 47 except ImportError:
47 - print('Please run')  
48 - print(' pip install pypinyin')  
49 - print('before you continue') 48 + print("Please run")
  49 + print(" pip install pypinyin")
  50 + print("before you continue")
50 raise 51 raise
51 52
52 assert Path(tokens).is_file(), f"File not exists, {tokens}" 53 assert Path(tokens).is_file(), f"File not exists, {tokens}"
@@ -119,7 +120,10 @@ def text2token( @@ -119,7 +120,10 @@ def text2token(
119 if txt in tokens_table: 120 if txt in tokens_table:
120 text_list.append(tokens_table[txt] if output_ids else txt) 121 text_list.append(tokens_table[txt] if output_ids else txt)
121 else: 122 else:
122 - print(f"OOV token : {txt}, skipping text : {text}.") 123 + print(
  124 + f"Can't find token {txt} in the token table; check your "
  125 + f"tokens.txt to see if {txt} is in it. Skipping text: {text}."
  126 + )
123 contain_oov = True 127 contain_oov = True
124 break 128 break
125 if contain_oov: 129 if contain_oov: