正在显示
4 个修改的文件
包含
96 行增加
和
4 行删除
| @@ -133,6 +133,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( | @@ -133,6 +133,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( | ||
| 133 | return ConvertTextToTokenIdsGerman(text); | 133 | return ConvertTextToTokenIdsGerman(text); |
| 134 | case Language::kSpanish: | 134 | case Language::kSpanish: |
| 135 | return ConvertTextToTokenIdsSpanish(text); | 135 | return ConvertTextToTokenIdsSpanish(text); |
| 136 | + case Language::kFrench: | ||
| 137 | + return ConvertTextToTokenIdsFrench(text); | ||
| 136 | case Language::kChinese: | 138 | case Language::kChinese: |
| 137 | return ConvertTextToTokenIdsChinese(text); | 139 | return ConvertTextToTokenIdsChinese(text); |
| 138 | default: | 140 | default: |
| @@ -254,6 +256,8 @@ void Lexicon::InitLanguage(const std::string &_lang) { | @@ -254,6 +256,8 @@ void Lexicon::InitLanguage(const std::string &_lang) { | ||
| 254 | language_ = Language::kGerman; | 256 | language_ = Language::kGerman; |
| 255 | } else if (lang == "spanish") { | 257 | } else if (lang == "spanish") { |
| 256 | language_ = Language::kSpanish; | 258 | language_ = Language::kSpanish; |
| 259 | + } else if (lang == "french") { | ||
| 260 | + language_ = Language::kFrench; | ||
| 257 | } else if (lang == "chinese") { | 261 | } else if (lang == "chinese") { |
| 258 | language_ = Language::kChinese; | 262 | language_ = Language::kChinese; |
| 259 | } else { | 263 | } else { |
| @@ -46,6 +46,11 @@ class Lexicon { | @@ -46,6 +46,11 @@ class Lexicon { | ||
| 46 | return ConvertTextToTokenIdsEnglish(text); | 46 | return ConvertTextToTokenIdsEnglish(text); |
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | + std::vector<int64_t> ConvertTextToTokenIdsFrench( | ||
| 50 | + const std::string &text) const { | ||
| 51 | + return ConvertTextToTokenIdsEnglish(text); | ||
| 52 | + } | ||
| 53 | + | ||
| 49 | std::vector<int64_t> ConvertTextToTokenIdsEnglish( | 54 | std::vector<int64_t> ConvertTextToTokenIdsEnglish( |
| 50 | const std::string &text) const; | 55 | const std::string &text) const; |
| 51 | 56 | ||
| @@ -62,6 +67,7 @@ class Lexicon { | @@ -62,6 +67,7 @@ class Lexicon { | ||
| 62 | kEnglish, | 67 | kEnglish, |
| 63 | kGerman, | 68 | kGerman, |
| 64 | kSpanish, | 69 | kSpanish, |
| 70 | + kFrench, | ||
| 65 | kChinese, | 71 | kChinese, |
| 66 | kUnknown, | 72 | kUnknown, |
| 67 | }; | 73 | }; |
| @@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | @@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | ||
| 164 | std::vector<double> *out); | 164 | std::vector<double> *out); |
| 165 | 165 | ||
| 166 | static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); } | 166 | static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); } |
| 167 | -static bool IsGermanUmlauts(const std::string &word) { | 167 | +static bool IsGermanUmlaut(const std::string &word) { |
| 168 | // ä 0xC3 0xA4 | 168 | // ä 0xC3 0xA4 |
| 169 | // ö 0xC3 0xB6 | 169 | // ö 0xC3 0xB6 |
| 170 | // ü 0xC3 0xBC | 170 | // ü 0xC3 0xBC |
| @@ -187,6 +187,7 @@ static bool IsGermanUmlauts(const std::string &word) { | @@ -187,6 +187,7 @@ static bool IsGermanUmlauts(const std::string &word) { | ||
| 187 | } | 187 | } |
| 188 | 188 | ||
| 189 | // see https://www.tandem.net/blog/spanish-accents | 189 | // see https://www.tandem.net/blog/spanish-accents |
| 190 | +// https://www.compart.com/en/unicode/U+00DC | ||
| 190 | static bool IsSpanishDiacritic(const std::string &word) { | 191 | static bool IsSpanishDiacritic(const std::string &word) { |
| 191 | // á 0xC3 0xA1 | 192 | // á 0xC3 0xA1 |
| 192 | // é 0xC3 0xA9 | 193 | // é 0xC3 0xA9 |
| @@ -195,6 +196,16 @@ static bool IsSpanishDiacritic(const std::string &word) { | @@ -195,6 +196,16 @@ static bool IsSpanishDiacritic(const std::string &word) { | ||
| 195 | // ú 0xC3 0xBA | 196 | // ú 0xC3 0xBA |
| 196 | // ü 0xC3 0xBC | 197 | // ü 0xC3 0xBC |
| 197 | // ñ 0xC3 0xB1 | 198 | // ñ 0xC3 0xB1 |
| 199 | + // | ||
| 200 | + // uppercase | ||
| 201 | + // | ||
| 202 | + // Á 0xC3 0x81 | ||
| 203 | + // É 0xC3 0x89 | ||
| 204 | + // Í 0xC3 0x8D | ||
| 205 | + // Ó 0xC3 0x93 | ||
| 206 | + // Ú 0xC3 0x9A | ||
| 207 | + // Ü 0xC3 0x9C | ||
| 208 | + // Ñ 0xC3 0x91 | ||
| 198 | 209 | ||
| 199 | if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) { | 210 | if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) { |
| 200 | return false; | 211 | return false; |
| @@ -202,15 +213,86 @@ static bool IsSpanishDiacritic(const std::string &word) { | @@ -202,15 +213,86 @@ static bool IsSpanishDiacritic(const std::string &word) { | ||
| 202 | 213 | ||
| 203 | auto c = static_cast<uint8_t>(word[1]); | 214 | auto c = static_cast<uint8_t>(word[1]); |
| 204 | if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba || | 215 | if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba || |
| 205 | - c == 0xbc || c == 0xb1) { | 216 | + c == 0xbc || c == 0xb1 || c == 0x81 || c == 0x89 || c == 0x8d || |
| 217 | + c == 0x93 || c == 0x9a || c == 0x9c || c == 0x91) { | ||
| 206 | return true; | 218 | return true; |
| 207 | } | 219 | } |
| 208 | 220 | ||
| 209 | return false; | 221 | return false; |
| 210 | } | 222 | } |
| 211 | 223 | ||
| 224 | +// see https://www.busuu.com/en/french/accent-marks | ||
| 225 | +static bool IsFrenchDiacritic(const std::string &word) { | ||
| 226 | + // acute accent | ||
| 227 | + // é 0xC3 0xA9 | ||
| 228 | + // | ||
| 229 | + // grave accent | ||
| 230 | + // à 0xC3 0xA0 | ||
| 231 | + // è 0xC3 0xA8 | ||
| 232 | + // ù 0xC3 0xB9 | ||
| 233 | + // | ||
| 234 | + // cedilla | ||
| 235 | + // ç 0xC3 0xA7 | ||
| 236 | + // | ||
| 237 | + // circumflex | ||
| 238 | + // â 0xC3 0xA2 | ||
| 239 | + // ê 0xC3 0xAA | ||
| 240 | + // î 0xC3 0xAE | ||
| 241 | + // ô 0xC3 0xB4 | ||
| 242 | + // û 0xC3 0xBB | ||
| 243 | + // | ||
| 244 | + // trema | ||
| 245 | + // ë 0xC3 0xAB | ||
| 246 | + // ï 0xC3 0xAF | ||
| 247 | + // ü 0xC3 0xBC | ||
| 248 | + // | ||
| 249 | + // É 0xC3 0x89 | ||
| 250 | + // | ||
| 251 | + // À 0xC3 0x80 | ||
| 252 | + // È 0xC3 0x88 | ||
| 253 | + // Ù 0xC3 0x99 | ||
| 254 | + // Ç 0xC3 0x87 | ||
| 255 | + // Â 0xC3 0x82 | ||
| 256 | + // Ê 0xC3 0x8A | ||
| 257 | + // Î 0xC3 0x8E | ||
| 258 | + // Ô 0xC3 0x94 | ||
| 259 | + // Û 0xC3 0x9B | ||
| 260 | + // Ë 0xC3 0x8B | ||
| 261 | + // Ï 0xC3 0x8F | ||
| 262 | + // Ü 0xC3 0x9C | ||
| 263 | + | ||
| 264 | + if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) { | ||
| 265 | + return false; | ||
| 266 | + } | ||
| 267 | + | ||
| 268 | + auto c = static_cast<uint8_t>(word[1]); | ||
| 269 | + if (c == 0xa9 || c == 0xa0 || c == 0xa8 || c == 0xb9 || c == 0xa7 || | ||
| 270 | + c == 0xa2 || c == 0xaa || c == 0xae || c == 0xb4 || c == 0xbb || | ||
| 271 | + c == 0xab || c == 0xaf || c == 0xbc || c == 0x89 || c == 0x80 || | ||
| 272 | + c == 0x88 || c == 0x99 || c == 0x87 || c == 0x82 || c == 0x8a || | ||
| 273 | + c == 0x8e || c == 0x94 || c == 0x9b || c == 0x8b || c == 0x8f || | ||
| 274 | + c == 0x9c) { | ||
| 275 | + return true; | ||
| 276 | + } | ||
| 277 | + return false; | ||
| 278 | +} | ||
| 279 | + | ||
| 212 | static bool IsSpecial(const std::string &w) { | 280 | static bool IsSpecial(const std::string &w) { |
| 213 | - return IsGermanUmlauts(w) || IsSpanishDiacritic(w); | 281 | + bool ans = IsGermanUmlaut(w) || IsSpanishDiacritic(w) || IsFrenchDiacritic(w); |
| 282 | + | ||
| 283 | + // for french d’impossible | ||
| 284 | + // ’ 0xE2 0x80 0x99 | ||
| 285 | + bool ans2 = false; | ||
| 286 | + if (w.size() == 3) { | ||
| 287 | + auto c0 = static_cast<uint8_t>(w[0]); | ||
| 288 | + auto c1 = static_cast<uint8_t>(w[1]); | ||
| 289 | + auto c2 = static_cast<uint8_t>(w[2]); | ||
| 290 | + if (c0 == 0xe2 && c1 == 0x80 && c2 == 0x99) { | ||
| 291 | + ans2 = true; | ||
| 292 | + } | ||
| 293 | + } | ||
| 294 | + | ||
| 295 | + return ans || ans2; | ||
| 214 | } | 296 | } |
| 215 | 297 | ||
| 216 | static std::vector<std::string> MergeCharactersIntoWords( | 298 | static std::vector<std::string> MergeCharactersIntoWords( |
-
请 注册 或 登录 后发表评论