Showing 2 changed files with 47 additions and 14 deletions.
@@ -188,23 +188,34 @@ class JiebaLexicon::Impl {
 
  private:
   std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
-    if (word2ids_.count(w)) {
-      return word2ids_.at(w);
-    }
-
-    if (token2id_.count(w)) {
-      return {token2id_.at(w)};
-    }
-
     std::vector<int32_t> ans;
 
-    std::vector<std::string> words = SplitUtf8(w);
-    for (const auto &word : words) {
-      if (word2ids_.count(word)) {
-        auto ids = ConvertWordToIds(word);
-        ans.insert(ans.end(), ids.begin(), ids.end());
+    if (word2ids_.count(w)) {
+      ans = word2ids_.at(w);
+    } else if (token2id_.count(w)) {
+      ans = {token2id_.at(w)};
+    } else {
+      std::vector<std::string> words = SplitUtf8(w);
+      for (const auto &word : words) {
+        if (word2ids_.count(word)) {
+          auto ids = ConvertWordToIds(word);
+          ans.insert(ans.end(), ids.begin(), ids.end());
+        }
       }
     }
+    if (debug_) {
+      std::ostringstream os;
+      os << w << ": ";
+      for (auto i : ans) {
+        os << id2token_.at(i) << " ";
+      }
+      os << "\n";
+#if __OHOS__
+      SHERPA_ONNX_LOGE("%{public}s", os.str().c_str());
+#else
+      SHERPA_ONNX_LOGE("%s", os.str().c_str());
+#endif
+    }
 
     return ans;
   }
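Note on the hunk above: the early returns in ConvertWordToIds are folded into a single ans accumulator so that the new debug block can log the final word-to-token mapping regardless of which lookup path (word2ids_, token2id_, or the per-character UTF-8 fallback) produced it. The following standalone sketch is a hypothetical helper, not part of the patch, showing the same formatting the debug block performs given a reverse id-to-token map:

// Hedged sketch, not sherpa-onnx code: build the "<word>: tok1 tok2 ..." debug
// string from a list of token ids and a reverse id->token map.
#include <cstdint>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

std::string FormatWordIds(
    const std::string &w, const std::vector<int32_t> &ids,
    const std::unordered_map<int32_t, std::string> &id2token) {
  std::ostringstream os;
  os << w << ": ";
  for (int32_t i : ids) {
    os << id2token.at(i) << " ";
  }
  return os.str();  // the caller decides how to log the result
}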
@@ -234,6 +245,12 @@ class JiebaLexicon::Impl {
     if (!token2id_.count(";") && token2id_.count(",")) {
       token2id_[";"] = token2id_[","];
     }
+
+    if (debug_) {
+      for (const auto &p : token2id_) {
+        id2token_[p.second] = p.first;
+      }
+    }
   }
 
   void InitLexicon(std::istream &is) {
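The hunk above builds the reverse map id2token_ only when debug_ is enabled, since it is used solely by the debug logging in ConvertWordToIds. A minimal standalone sketch of that inversion (hypothetical names, assuming the same data layout as the patch):

// Hedged sketch, not sherpa-onnx code: invert a token->id table into an
// id->token table. When several tokens share one id (e.g. ";" aliased to ","
// above), the entry kept depends on iteration order, which is acceptable for
// debug-only output.
#include <cstdint>
#include <string>
#include <unordered_map>

std::unordered_map<int32_t, std::string> InvertTokenMap(
    const std::unordered_map<std::string, int32_t> &token2id) {
  std::unordered_map<int32_t, std::string> id2token;
  id2token.reserve(token2id.size());
  for (const auto &p : token2id) {
    id2token[p.second] = p.first;
  }
  return id2token;
}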
@@ -272,6 +289,11 @@ class JiebaLexicon::Impl {
 
       std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
       if (ids.empty()) {
+#if __OHOS__
+        SHERPA_ONNX_LOGE("Empty token ids for %{public}s", line.c_str());
+#else
+        SHERPA_ONNX_LOGE("Empty token ids for %s", line.c_str());
+#endif
         continue;
       }
 
@@ -286,6 +308,8 @@ class JiebaLexicon::Impl {
   // tokens.txt is saved in token2id_
   std::unordered_map<std::string, int32_t> token2id_;
 
+  std::unordered_map<int32_t, std::string> id2token_;
+
   std::unique_ptr<cppjieba::Jieba> jieba_;
   bool debug_ = false;
 };
@@ -85,6 +85,11 @@ std::vector<int32_t> ConvertTokensToIds(
   ids.reserve(tokens.size());
   for (const auto &s : tokens) {
     if (!token2id.count(s)) {
+#if __OHOS__
+      SHERPA_ONNX_LOGE("Unknown token: %{public}s", s.c_str());
+#else
+      SHERPA_ONNX_LOGE("Unknown token: %s", s.c_str());
+#endif
       return {};
     }
     int32_t id = token2id.at(s);
@@ -346,8 +351,12 @@ void Lexicon::InitLanguage(const std::string &_lang) {
   } else if (!lang.empty()) {
     language_ = Language::kNotChinese;
   } else {
+#if __OHOS__
+    SHERPA_ONNX_LOGE("Unknown language: %{public}s", _lang.c_str());
+#else
     SHERPA_ONNX_LOGE("Unknown language: %s", _lang.c_str());
-    exit(-1);
+#endif
+    SHERPA_ONNX_EXIT(-1);
   }
 }
 
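Background on the #if __OHOS__ branches throughout this change: OpenHarmony's HiLog redacts printf-style string arguments as <private> unless the conversion is annotated as %{public}s, so OHOS builds need the annotated format while other platforms keep plain %s; the bare exit(-1) is also replaced by the SHERPA_ONNX_EXIT macro used in the last hunk. A hypothetical way (not part of this patch) to avoid repeating the #if at every log call is to select the conversion once:

// Hedged sketch, hypothetical macro not from sherpa-onnx: pick the string
// conversion once so call sites need no per-call #if.
#if __OHOS__
#define LOG_STR "%{public}s"  // HiLog would otherwise print <private>
#else
#define LOG_STR "%s"
#endif
// Example call site, assuming a printf-style logger macro and relying on
// string-literal concatenation:
//   SHERPA_ONNX_LOGE("Unknown token: " LOG_STR, s.c_str());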