Committed by
GitHub
Support German umlauts in splitting UTF8 strings. (#395)
正在显示
1 个修改的文件
包含
24 行增加
和
2 行删除
| @@ -164,6 +164,27 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | @@ -164,6 +164,27 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | ||
| 164 | std::vector<double> *out); | 164 | std::vector<double> *out); |
| 165 | 165 | ||
// Returns true if `c` is a punctuation character other than the apostrophe.
// The apostrophe is explicitly excluded, so it is never reported as
// punctuation.
static bool IsPunct(char c) {
  // std::ispunct has undefined behavior when its argument is neither EOF nor
  // representable as unsigned char. A plain char holding a UTF-8 byte >= 0x80
  // may be negative, so convert to unsigned char before the call.
  return c != '\'' && std::ispunct(static_cast<unsigned char>(c)) != 0;
}
// Reports whether `words` holds exactly one of the German special letters
// listed below, encoded in UTF-8. Every accepted letter is two bytes long and
// starts with the byte 0xC3:
//
//   letter  second byte
//   ä       0xA4
//   ö       0xB6
//   ü       0xBC
//   Ä       0x84
//   Ö       0x96
//   Ü       0x9C
//   ß       0x9F
//
// Any other input, including strings whose length is not 2, yields false.
static bool IsGermanUmlauts(const std::string &words) {
  constexpr uint8_t kLeadByte = 0xc3;

  // The second operand is evaluated only when the string has two bytes, so
  // words[0] is never read from an empty string.
  const bool has_expected_shape =
      words.size() == 2 && static_cast<uint8_t>(words[0]) == kLeadByte;
  if (!has_expected_shape) {
    return false;
  }

  switch (static_cast<uint8_t>(words[1])) {
    case 0xa4:  // ä
    case 0xb6:  // ö
    case 0xbc:  // ü
    case 0x84:  // Ä
    case 0x96:  // Ö
    case 0x9c:  // Ü
    case 0x9f:  // ß
      return true;
    default:
      return false;
  }
}
| 167 | 188 | ||
| 168 | static std::vector<std::string> MergeCharactersIntoWords( | 189 | static std::vector<std::string> MergeCharactersIntoWords( |
| 169 | const std::vector<std::string> &words) { | 190 | const std::vector<std::string> &words) { |
| @@ -175,7 +196,7 @@ static std::vector<std::string> MergeCharactersIntoWords( | @@ -175,7 +196,7 @@ static std::vector<std::string> MergeCharactersIntoWords( | ||
| 175 | 196 | ||
| 176 | while (i < n) { | 197 | while (i < n) { |
| 177 | const auto &w = words[i]; | 198 | const auto &w = words[i]; |
| 178 | - if (w.size() > 1 || | 199 | + if (w.size() >= 3 || (w.size() == 2 && !IsGermanUmlauts(w)) || |
| 179 | (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) { | 200 | (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) { |
| 180 | if (prev != -1) { | 201 | if (prev != -1) { |
| 181 | std::string t; | 202 | std::string t; |
| @@ -193,7 +214,8 @@ static std::vector<std::string> MergeCharactersIntoWords( | @@ -193,7 +214,8 @@ static std::vector<std::string> MergeCharactersIntoWords( | ||
| 193 | continue; | 214 | continue; |
| 194 | } | 215 | } |
| 195 | 216 | ||
| 196 | - if (w.size() == 1) { | 217 | + // e.g., öffnen |
| 218 | + if (w.size() == 1 || (w.size() == 2 && IsGermanUmlauts(w))) { | ||
| 197 | if (prev == -1) { | 219 | if (prev == -1) { |
| 198 | prev = i; | 220 | prev = i; |
| 199 | } | 221 | } |
-
请 注册 或 登录 后发表评论