正在显示 2 个修改的文件，包含 15 行增加和 11 行删除
| @@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines, | @@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines, | ||
| 62 | break; | 62 | break; |
| 63 | default: | 63 | default: |
| 64 | SHERPA_ONNX_LOGE( | 64 | SHERPA_ONNX_LOGE( |
| 65 | - "Cannot find ID for token %s at line: %s. (Hint: words on " | ||
| 66 | - "the same line are separated by spaces)", | ||
| 67 | - word.c_str(), line.c_str()); | 65 | + "Cannot find ID for token %s at line: %s. (Hint: Check the " |
| 66 | + "tokens.txt see if %s in it)", | ||
| 67 | + word.c_str(), line.c_str(), word.c_str()); | ||
| 68 | has_oov = true; | 68 | has_oov = true; |
| 69 | break; | 69 | break; |
| 70 | } | 70 | } |
| @@ -4,6 +4,7 @@ import re | @@ -4,6 +4,7 @@ import re | ||
| 4 | from pathlib import Path | 4 | from pathlib import Path |
| 5 | from typing import List, Optional, Union | 5 | from typing import List, Optional, Union |
| 6 | 6 | ||
| 7 | + | ||
| 7 | def text2token( | 8 | def text2token( |
| 8 | texts: List[str], | 9 | texts: List[str], |
| 9 | tokens: str, | 10 | tokens: str, |
| @@ -33,20 +34,20 @@ def text2token( | @@ -33,20 +34,20 @@ def text2token( | ||
| 33 | is True, or it is a list of list of tokens. | 34 | is True, or it is a list of list of tokens. |
| 34 | """ | 35 | """ |
| 35 | try: | 36 | try: |
| 36 | - import sentencepiece as spm | 37 | + import sentencepiece as spm |
| 37 | except ImportError: | 38 | except ImportError: |
| 38 | - print('Please run') | ||
| 39 | - print(' pip install sentencepiece') | ||
| 40 | - print('before you continue') | 39 | + print("Please run") |
| 40 | + print(" pip install sentencepiece") | ||
| 41 | + print("before you continue") | ||
| 41 | raise | 42 | raise |
| 42 | 43 | ||
| 43 | try: | 44 | try: |
| 44 | from pypinyin import pinyin | 45 | from pypinyin import pinyin |
| 45 | from pypinyin.contrib.tone_convert import to_initials, to_finals_tone | 46 | from pypinyin.contrib.tone_convert import to_initials, to_finals_tone |
| 46 | except ImportError: | 47 | except ImportError: |
| 47 | - print('Please run') | ||
| 48 | - print(' pip install pypinyin') | ||
| 49 | - print('before you continue') | 48 | + print("Please run") |
| 49 | + print(" pip install pypinyin") | ||
| 50 | + print("before you continue") | ||
| 50 | raise | 51 | raise |
| 51 | 52 | ||
| 52 | assert Path(tokens).is_file(), f"File not exists, {tokens}" | 53 | assert Path(tokens).is_file(), f"File not exists, {tokens}" |
| @@ -119,7 +120,10 @@ def text2token( | @@ -119,7 +120,10 @@ def text2token( | ||
| 119 | if txt in tokens_table: | 120 | if txt in tokens_table: |
| 120 | text_list.append(tokens_table[txt] if output_ids else txt) | 121 | text_list.append(tokens_table[txt] if output_ids else txt) |
| 121 | else: | 122 | else: |
| 122 | - print(f"OOV token : {txt}, skipping text : {text}.") | 123 | + print( |
| 124 | + f"Can't find token {txt} in token table, check your " | ||
| 125 | + f"tokens.txt see if {txt} in it. skipping text : {text}." | ||
| 126 | + ) | ||
| 123 | contain_oov = True | 127 | contain_oov = True |
| 124 | break | 128 | break |
| 125 | if contain_oov: | 129 | if contain_oov: |
-
请 注册 或 登录 后发表评论