正在显示 3 个修改的文件，包含 43 行增加和 6 行删除
| @@ -131,6 +131,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( | @@ -131,6 +131,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( | ||
| 131 | return ConvertTextToTokenIdsEnglish(text); | 131 | return ConvertTextToTokenIdsEnglish(text); |
| 132 | case Language::kGerman: | 132 | case Language::kGerman: |
| 133 | return ConvertTextToTokenIdsGerman(text); | 133 | return ConvertTextToTokenIdsGerman(text); |
| 134 | + case Language::kSpanish: | ||
| 135 | + return ConvertTextToTokenIdsSpanish(text); | ||
| 134 | case Language::kChinese: | 136 | case Language::kChinese: |
| 135 | return ConvertTextToTokenIdsChinese(text); | 137 | return ConvertTextToTokenIdsChinese(text); |
| 136 | default: | 138 | default: |
| @@ -250,6 +252,8 @@ void Lexicon::InitLanguage(const std::string &_lang) { | @@ -250,6 +252,8 @@ void Lexicon::InitLanguage(const std::string &_lang) { | ||
| 250 | language_ = Language::kEnglish; | 252 | language_ = Language::kEnglish; |
| 251 | } else if (lang == "german") { | 253 | } else if (lang == "german") { |
| 252 | language_ = Language::kGerman; | 254 | language_ = Language::kGerman; |
| 255 | + } else if (lang == "spanish") { | ||
| 256 | + language_ = Language::kSpanish; | ||
| 253 | } else if (lang == "chinese") { | 257 | } else if (lang == "chinese") { |
| 254 | language_ = Language::kChinese; | 258 | language_ = Language::kChinese; |
| 255 | } else { | 259 | } else { |
| @@ -41,6 +41,11 @@ class Lexicon { | @@ -41,6 +41,11 @@ class Lexicon { | ||
| 41 | return ConvertTextToTokenIdsEnglish(text); | 41 | return ConvertTextToTokenIdsEnglish(text); |
| 42 | } | 42 | } |
| 43 | 43 | ||
| 44 | + std::vector<int64_t> ConvertTextToTokenIdsSpanish( | ||
| 45 | + const std::string &text) const { | ||
| 46 | + return ConvertTextToTokenIdsEnglish(text); | ||
| 47 | + } | ||
| 48 | + | ||
| 44 | std::vector<int64_t> ConvertTextToTokenIdsEnglish( | 49 | std::vector<int64_t> ConvertTextToTokenIdsEnglish( |
| 45 | const std::string &text) const; | 50 | const std::string &text) const; |
| 46 | 51 | ||
| @@ -56,6 +61,7 @@ class Lexicon { | @@ -56,6 +61,7 @@ class Lexicon { | ||
| 56 | enum class Language { | 61 | enum class Language { |
| 57 | kEnglish, | 62 | kEnglish, |
| 58 | kGerman, | 63 | kGerman, |
| 64 | + kSpanish, | ||
| 59 | kChinese, | 65 | kChinese, |
| 60 | kUnknown, | 66 | kUnknown, |
| 61 | }; | 67 | }; |
| @@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | @@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | ||
| 164 | std::vector<double> *out); | 164 | std::vector<double> *out); |
| 165 | 165 | ||
| 166 | static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); } | 166 | static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); } |
| 167 | -static bool IsGermanUmlauts(const std::string &words) { | 167 | +static bool IsGermanUmlauts(const std::string &word) { |
| 168 | // ä 0xC3 0xA4 | 168 | // ä 0xC3 0xA4 |
| 169 | // ö 0xC3 0xB6 | 169 | // ö 0xC3 0xB6 |
| 170 | // ü 0xC3 0xBC | 170 | // ü 0xC3 0xBC |
| @@ -173,12 +173,12 @@ static bool IsGermanUmlauts(const std::string &words) { | @@ -173,12 +173,12 @@ static bool IsGermanUmlauts(const std::string &words) { | ||
| 173 | // Ü 0xC3 0x9C | 173 | // Ü 0xC3 0x9C |
| 174 | // ß 0xC3 0x9F | 174 | // ß 0xC3 0x9F |
| 175 | 175 | ||
| 176 | - if (words.size() != 2 || static_cast<uint8_t>(words[0]) != 0xc3) { | 176 | + if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) { |
| 177 | return false; | 177 | return false; |
| 178 | } | 178 | } |
| 179 | 179 | ||
| 180 | - auto c = static_cast<uint8_t>(words[1]); | ||
| 181 | - if (c == 0xa4 || c == 0xb6 || c == 0xbC || c == 0x84 || c == 0x96 || | 180 | + auto c = static_cast<uint8_t>(word[1]); |
| 181 | + if (c == 0xa4 || c == 0xb6 || c == 0xbc || c == 0x84 || c == 0x96 || | ||
| 182 | c == 0x9c || c == 0x9f) { | 182 | c == 0x9c || c == 0x9f) { |
| 183 | return true; | 183 | return true; |
| 184 | } | 184 | } |
| @@ -186,6 +186,33 @@ static bool IsGermanUmlauts(const std::string &words) { | @@ -186,6 +186,33 @@ static bool IsGermanUmlauts(const std::string &words) { | ||
| 186 | return false; | 186 | return false; |
| 187 | } | 187 | } |
| 188 | 188 | ||
| 189 | +// see https://www.tandem.net/blog/spanish-accents | ||
| 190 | +static bool IsSpanishDiacritic(const std::string &word) { | ||
| 191 | + // á 0xC3 0xA1 | ||
| 192 | + // é 0xC3 0xA9 | ||
| 193 | + // í 0xC3 0xAD | ||
| 194 | + // ó 0xC3 0xB3 | ||
| 195 | + // ú 0xC3 0xBA | ||
| 196 | + // ü 0xC3 0xBC | ||
| 197 | + // ñ 0xC3 0xB1 | ||
| 198 | + | ||
| 199 | + if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) { | ||
| 200 | + return false; | ||
| 201 | + } | ||
| 202 | + | ||
| 203 | + auto c = static_cast<uint8_t>(word[1]); | ||
| 204 | + if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba || | ||
| 205 | + c == 0xbc || c == 0xb1) { | ||
| 206 | + return true; | ||
| 207 | + } | ||
| 208 | + | ||
| 209 | + return false; | ||
| 210 | +} | ||
| 211 | + | ||
| 212 | +static bool IsSpecial(const std::string &w) { | ||
| 213 | + return IsGermanUmlauts(w) || IsSpanishDiacritic(w); | ||
| 214 | +} | ||
| 215 | + | ||
| 189 | static std::vector<std::string> MergeCharactersIntoWords( | 216 | static std::vector<std::string> MergeCharactersIntoWords( |
| 190 | const std::vector<std::string> &words) { | 217 | const std::vector<std::string> &words) { |
| 191 | std::vector<std::string> ans; | 218 | std::vector<std::string> ans; |
| @@ -196,7 +223,7 @@ static std::vector<std::string> MergeCharactersIntoWords( | @@ -196,7 +223,7 @@ static std::vector<std::string> MergeCharactersIntoWords( | ||
| 196 | 223 | ||
| 197 | while (i < n) { | 224 | while (i < n) { |
| 198 | const auto &w = words[i]; | 225 | const auto &w = words[i]; |
| 199 | - if (w.size() >= 3 || (w.size() == 2 && !IsGermanUmlauts(w)) || | 226 | + if (w.size() >= 3 || (w.size() == 2 && !IsSpecial(w)) || |
| 200 | (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) { | 227 | (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) { |
| 201 | if (prev != -1) { | 228 | if (prev != -1) { |
| 202 | std::string t; | 229 | std::string t; |
| @@ -215,7 +242,7 @@ static std::vector<std::string> MergeCharactersIntoWords( | @@ -215,7 +242,7 @@ static std::vector<std::string> MergeCharactersIntoWords( | ||
| 215 | } | 242 | } |
| 216 | 243 | ||
| 217 | // e.g., öffnen | 244 | // e.g., öffnen |
| 218 | - if (w.size() == 1 || (w.size() == 2 && IsGermanUmlauts(w))) { | 245 | + if (w.size() == 1 || (w.size() == 2 && IsSpecial(w))) { |
| 219 | if (prev == -1) { | 246 | if (prev == -1) { |
| 220 | prev = i; | 247 | prev = i; |
| 221 | } | 248 | } |
-
请注册或登录后发表评论