正在显示
1 个修改的文件
包含
54 行增加
和
1 行删除
| @@ -162,10 +162,63 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | @@ -162,10 +162,63 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | ||
| 162 | bool omit_empty_strings, | 162 | bool omit_empty_strings, |
| 163 | std::vector<double> *out); | 163 | std::vector<double> *out); |
| 164 | 164 | ||
| 165 | +static std::vector<std::string> MergeCharactersIntoWords( | ||
| 166 | + const std::vector<std::string> &words) { | ||
| 167 | + std::vector<std::string> ans; | ||
| 168 | + | ||
| 169 | + int32_t n = static_cast<int32_t>(words.size()); | ||
| 170 | + int32_t i = 0; | ||
| 171 | + int32_t prev = -1; | ||
| 172 | + | ||
| 173 | + while (i < n) { | ||
| 174 | + const auto &w = words[i]; | ||
| 175 | + if (w.size() > 1 || | ||
| 176 | + (w.size() == 1 && (std::ispunct(w[0]) || std::isspace(w[0])))) { | ||
| 177 | + if (prev != -1) { | ||
| 178 | + std::string t; | ||
| 179 | + for (; prev < i; ++prev) { | ||
| 180 | + t.append(words[prev]); | ||
| 181 | + } | ||
| 182 | + prev = -1; | ||
| 183 | + ans.push_back(std::move(t)); | ||
| 184 | + } | ||
| 185 | + | ||
| 186 | + if (!std::isspace(w[0])) { | ||
| 187 | + ans.push_back(w); | ||
| 188 | + } | ||
| 189 | + ++i; | ||
| 190 | + continue; | ||
| 191 | + } | ||
| 192 | + | ||
| 193 | + if (w.size() == 1) { | ||
| 194 | + if (prev == -1) { | ||
| 195 | + prev = i; | ||
| 196 | + } | ||
| 197 | + ++i; | ||
| 198 | + continue; | ||
| 199 | + } | ||
| 200 | + | ||
| 201 | + SHERPA_ONNX_LOGE("Ignore %s", w.c_str()); | ||
| 202 | + ++i; | ||
| 203 | + } | ||
| 204 | + | ||
| 205 | + if (prev != -1) { | ||
| 206 | + std::string t; | ||
| 207 | + for (; prev < i; ++prev) { | ||
| 208 | + t.append(words[prev]); | ||
| 209 | + } | ||
| 210 | + ans.push_back(std::move(t)); | ||
| 211 | + } | ||
| 212 | + | ||
| 213 | + return ans; | ||
| 214 | +} | ||
| 215 | + | ||
| 165 | std::vector<std::string> SplitUtf8(const std::string &text) { | 216 | std::vector<std::string> SplitUtf8(const std::string &text) { |
| 166 | const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str()); | 217 | const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str()); |
| 167 | const uint8_t *end = begin + text.size(); | 218 | const uint8_t *end = begin + text.size(); |
| 168 | 219 | ||
| 220 | + // Note that English words are split into single characters. | ||
| 221 | + // We need to invoke MergeCharactersIntoWords() to merge them | ||
| 169 | std::vector<std::string> ans; | 222 | std::vector<std::string> ans; |
| 170 | 223 | ||
| 171 | auto start = begin; | 224 | auto start = begin; |
| @@ -195,7 +248,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) { | @@ -195,7 +248,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) { | ||
| 195 | } | 248 | } |
| 196 | } | 249 | } |
| 197 | 250 | ||
| 198 | - return ans; | 251 | + return MergeCharactersIntoWords(ans); |
| 199 | } | 252 | } |
| 200 | 253 | ||
| 201 | } // namespace sherpa_onnx | 254 | } // namespace sherpa_onnx |
-
请 注册 或 登录 后发表评论