Fangjun Kuang
Committed by GitHub

Support German umlauts in splitting UTF8 strings. (#395)

@@ -164,6 +164,27 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, @@ -164,6 +164,27 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
164 std::vector<double> *out); 164 std::vector<double> *out);
165 165
// Returns true when `c` is a punctuation character other than the apostrophe.
//
// The apostrophe is deliberately excluded (the comparison below rejects it
// before any classification happens).
//
// `c` is converted to `unsigned char` before being handed to std::ispunct:
// the <cctype> classifiers have undefined behaviour for negative arguments
// other than EOF, and a plain `char` holding a UTF-8 lead or continuation
// byte (>= 0x80) is negative on platforms where `char` is signed.
static bool IsPunct(char c) {
  return c != '\'' && std::ispunct(static_cast<unsigned char>(c)) != 0;
}
  167 +static bool IsGermanUmlauts(const std::string &words) {
  168 + // ä 0xC3 0xA4
  169 + // ö 0xC3 0xB6
  170 + // ü 0xC3 0xBC
  171 + // Ä 0xC3 0x84
  172 + // Ö 0xC3 0x96
  173 + // Ü 0xC3 0x9C
  174 + // ß 0xC3 0x9F
  175 +
  176 + if (words.size() != 2 || static_cast<uint8_t>(words[0]) != 0xc3) {
  177 + return false;
  178 + }
  179 +
  180 + auto c = static_cast<uint8_t>(words[1]);
  181 + if (c == 0xa4 || c == 0xb6 || c == 0xbC || c == 0x84 || c == 0x96 ||
  182 + c == 0x9c || c == 0x9f) {
  183 + return true;
  184 + }
  185 +
  186 + return false;
  187 +}
167 188
168 static std::vector<std::string> MergeCharactersIntoWords( 189 static std::vector<std::string> MergeCharactersIntoWords(
169 const std::vector<std::string> &words) { 190 const std::vector<std::string> &words) {
@@ -175,7 +196,7 @@ static std::vector<std::string> MergeCharactersIntoWords( @@ -175,7 +196,7 @@ static std::vector<std::string> MergeCharactersIntoWords(
175 196
176 while (i < n) { 197 while (i < n) {
177 const auto &w = words[i]; 198 const auto &w = words[i];
178 - if (w.size() > 1 || 199 + if (w.size() >= 3 || (w.size() == 2 && !IsGermanUmlauts(w)) ||
179 (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) { 200 (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) {
180 if (prev != -1) { 201 if (prev != -1) {
181 std::string t; 202 std::string t;
@@ -193,7 +214,8 @@ static std::vector<std::string> MergeCharactersIntoWords( @@ -193,7 +214,8 @@ static std::vector<std::string> MergeCharactersIntoWords(
193 continue; 214 continue;
194 } 215 }
195 216
196 - if (w.size() == 1) { 217 + // e.g., öffnen
  218 + if (w.size() == 1 || (w.size() == 2 && IsGermanUmlauts(w))) {
197 if (prev == -1) { 219 if (prev == -1) {
198 prev = i; 220 prev = i;
199 } 221 }