Committed by
GitHub
Remove spaces after punctuations for TTS (#1666)
正在显示
1 个修改的文件
包含
73 行增加
和
3 行删除
| @@ -6,6 +6,7 @@ | @@ -6,6 +6,7 @@ | ||
| 6 | 6 | ||
| 7 | #include <fstream> | 7 | #include <fstream> |
| 8 | #include <regex> // NOLINT | 8 | #include <regex> // NOLINT |
| 9 | +#include <unordered_set> | ||
| 9 | #include <utility> | 10 | #include <utility> |
| 10 | 11 | ||
| 11 | #include "cppjieba/Jieba.hpp" | 12 | #include "cppjieba/Jieba.hpp" |
| @@ -16,6 +17,14 @@ | @@ -16,6 +17,14 @@ | ||
| 16 | 17 | ||
| 17 | namespace sherpa_onnx { | 18 | namespace sherpa_onnx { |
| 18 | 19 | ||
// Returns true if `s` is exactly one punctuation token, either ASCII or
// CJK/full-width; returns false for anything else, including the empty
// string, a space, or a string holding more than one mark.
//
// @param s  A single UTF-8 encoded token to test.
// @return   true when `s` is a member of the punctuation set below.
static bool IsPunct(const std::string &s) {
  // The three full-width marks are spelled as explicit UTF-8 byte escapes
  // rather than literal characters: they have ASCII compatibility
  // equivalents, so any Unicode compatibility normalization of this file
  // would silently turn them into duplicates of the ASCII entries and make
  // full-width punctuation go unrecognized.
  static const std::unordered_set<std::string> puncts = {
      // ASCII
      ",", ".", "!", "?", ":", "\"", "'",
      // Full-width forms
      "\xEF\xBC\x8C",  // U+FF0C FULLWIDTH COMMA
      "\xEF\xBC\x81",  // U+FF01 FULLWIDTH EXCLAMATION MARK
      "\xEF\xBC\x9F",  // U+FF1F FULLWIDTH QUESTION MARK
      // CJK full stop and curly quotes
      "。", "“", "”", "‘", "’",
  };
  return puncts.count(s) != 0;
}
| 27 | + | ||
| 19 | class JiebaLexicon::Impl { | 28 | class JiebaLexicon::Impl { |
| 20 | public: | 29 | public: |
| 21 | Impl(const std::string &lexicon, const std::string &tokens, | 30 | Impl(const std::string &lexicon, const std::string &tokens, |
| @@ -67,8 +76,13 @@ class JiebaLexicon::Impl { | @@ -67,8 +76,13 @@ class JiebaLexicon::Impl { | ||
| 67 | jieba_->Cut(text, words, is_hmm); | 76 | jieba_->Cut(text, words, is_hmm); |
| 68 | 77 | ||
| 69 | if (debug_) { | 78 | if (debug_) { |
| 70 | - SHERPA_ONNX_LOGE("input text: %s", text.c_str()); | ||
| 71 | - SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str()); | 79 | +#if __OHOS__ |
| 80 | + SHERPA_ONNX_LOGE("input text:\n%{public}s", text.c_str()); | ||
| 81 | + SHERPA_ONNX_LOGE("after replacing punctuations:\n%{public}s", s.c_str()); | ||
| 82 | +#else | ||
| 83 | + SHERPA_ONNX_LOGE("input text:\n%s", text.c_str()); | ||
| 84 | + SHERPA_ONNX_LOGE("after replacing punctuations:\n%s", s.c_str()); | ||
| 85 | +#endif | ||
| 72 | 86 | ||
| 73 | std::ostringstream os; | 87 | std::ostringstream os; |
| 74 | std::string sep = ""; | 88 | std::string sep = ""; |
| @@ -77,7 +91,52 @@ class JiebaLexicon::Impl { | @@ -77,7 +91,52 @@ class JiebaLexicon::Impl { | ||
| 77 | sep = "_"; | 91 | sep = "_"; |
| 78 | } | 92 | } |
| 79 | 93 | ||
| 80 | - SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str()); | 94 | +#if __OHOS__ |
| 95 | + SHERPA_ONNX_LOGE("after jieba processing:\n%{public}s", os.str().c_str()); | ||
| 96 | +#else | ||
| 97 | + SHERPA_ONNX_LOGE("after jieba processing:\n%s", os.str().c_str()); | ||
| 98 | +#endif | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + // remove spaces after punctuations | ||
| 102 | + std::vector<std::string> words2 = std::move(words); | ||
| 103 | + words.reserve(words2.size()); | ||
| 104 | + | ||
| 105 | + for (int32_t i = 0; i < words2.size(); ++i) { | ||
| 106 | + if (i == 0) { | ||
| 107 | + words.push_back(std::move(words2[i])); | ||
| 108 | + } else if (words2[i] == " ") { | ||
| 109 | + if (words.back() == " " || IsPunct(words.back())) { | ||
| 110 | + continue; | ||
| 111 | + } else { | ||
| 112 | + words.push_back(std::move(words2[i])); | ||
| 113 | + } | ||
| 114 | + } else if (IsPunct(words2[i])) { | ||
| 115 | + if (words.back() == " " || IsPunct(words.back())) { | ||
| 116 | + continue; | ||
| 117 | + } else { | ||
| 118 | + words.push_back(std::move(words2[i])); | ||
| 119 | + } | ||
| 120 | + } else { | ||
| 121 | + words.push_back(std::move(words2[i])); | ||
| 122 | + } | ||
| 123 | + } | ||
| 124 | + | ||
| 125 | + if (debug_) { | ||
| 126 | + std::ostringstream os; | ||
| 127 | + std::string sep = ""; | ||
| 128 | + for (const auto &w : words) { | ||
| 129 | + os << sep << w; | ||
| 130 | + sep = "_"; | ||
| 131 | + } | ||
| 132 | + | ||
| 133 | +#if __OHOS__ | ||
| 134 | + SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%{public}s", | ||
| 135 | + os.str().c_str()); | ||
| 136 | +#else | ||
| 137 | + SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%s", | ||
| 138 | + os.str().c_str()); | ||
| 139 | +#endif | ||
| 81 | } | 140 | } |
| 82 | 141 | ||
| 83 | std::vector<TokenIDs> ans; | 142 | std::vector<TokenIDs> ans; |
| @@ -86,7 +145,11 @@ class JiebaLexicon::Impl { | @@ -86,7 +145,11 @@ class JiebaLexicon::Impl { | ||
| 86 | for (const auto &w : words) { | 145 | for (const auto &w : words) { |
| 87 | auto ids = ConvertWordToIds(w); | 146 | auto ids = ConvertWordToIds(w); |
| 88 | if (ids.empty()) { | 147 | if (ids.empty()) { |
| 148 | +#if __OHOS__ | ||
| 149 | + SHERPA_ONNX_LOGE("Ignore OOV '%{public}s'", w.c_str()); | ||
| 150 | +#else | ||
| 89 | SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str()); | 151 | SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str()); |
| 152 | +#endif | ||
| 90 | continue; | 153 | continue; |
| 91 | } | 154 | } |
| 92 | 155 | ||
| @@ -173,8 +236,15 @@ class JiebaLexicon::Impl { | @@ -173,8 +236,15 @@ class JiebaLexicon::Impl { | ||
| 173 | ToLowerCase(&word); | 236 | ToLowerCase(&word); |
| 174 | 237 | ||
| 175 | if (word2ids_.count(word)) { | 238 | if (word2ids_.count(word)) { |
| 239 | +#if __OHOS__ | ||
| 240 | + SHERPA_ONNX_LOGE( | ||
| 241 | + "Duplicated word: %{public}s at line %{public}d:%{public}s. Ignore " | ||
| 242 | + "it.", | ||
| 243 | + word.c_str(), line_num, line.c_str()); | ||
| 244 | +#else | ||
| 176 | SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.", | 245 | SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.", |
| 177 | word.c_str(), line_num, line.c_str()); | 246 | word.c_str(), line_num, line.c_str()); |
| 247 | +#endif | ||
| 178 | continue; | 248 | continue; |
| 179 | } | 249 | } |
| 180 | 250 |
-
请 注册 或 登录 后发表评论