Fangjun Kuang
Committed by GitHub

Fix UTF-8 splitting for English (#386)

@@ -162,10 +162,63 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, @@ -162,10 +162,63 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
162 bool omit_empty_strings, 162 bool omit_empty_strings,
163 std::vector<double> *out); 163 std::vector<double> *out);
164 164
  165 +static std::vector<std::string> MergeCharactersIntoWords(
  166 + const std::vector<std::string> &words) {
  167 + std::vector<std::string> ans;
  168 +
  169 + int32_t n = static_cast<int32_t>(words.size());
  170 + int32_t i = 0;
  171 + int32_t prev = -1;
  172 +
  173 + while (i < n) {
  174 + const auto &w = words[i];
  175 + if (w.size() > 1 ||
  176 + (w.size() == 1 && (std::ispunct(w[0]) || std::isspace(w[0])))) {
  177 + if (prev != -1) {
  178 + std::string t;
  179 + for (; prev < i; ++prev) {
  180 + t.append(words[prev]);
  181 + }
  182 + prev = -1;
  183 + ans.push_back(std::move(t));
  184 + }
  185 +
  186 + if (!std::isspace(w[0])) {
  187 + ans.push_back(w);
  188 + }
  189 + ++i;
  190 + continue;
  191 + }
  192 +
  193 + if (w.size() == 1) {
  194 + if (prev == -1) {
  195 + prev = i;
  196 + }
  197 + ++i;
  198 + continue;
  199 + }
  200 +
  201 + SHERPA_ONNX_LOGE("Ignore %s", w.c_str());
  202 + ++i;
  203 + }
  204 +
  205 + if (prev != -1) {
  206 + std::string t;
  207 + for (; prev < i; ++prev) {
  208 + t.append(words[prev]);
  209 + }
  210 + ans.push_back(std::move(t));
  211 + }
  212 +
  213 + return ans;
  214 +}
  215 +
165 std::vector<std::string> SplitUtf8(const std::string &text) { 216 std::vector<std::string> SplitUtf8(const std::string &text) {
166 const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str()); 217 const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str());
167 const uint8_t *end = begin + text.size(); 218 const uint8_t *end = begin + text.size();
168 219
  220 + // Note that English words are split into single characters.
  221 + // We need to invoke MergeCharactersIntoWords() to merge them
169 std::vector<std::string> ans; 222 std::vector<std::string> ans;
170 223
171 auto start = begin; 224 auto start = begin;
@@ -195,7 +248,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) { @@ -195,7 +248,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
195 } 248 }
196 } 249 }
197 250
198 - return ans; 251 + return MergeCharactersIntoWords(ans);
199 } 252 }
200 253
201 } // namespace sherpa_onnx 254 } // namespace sherpa_onnx