Fangjun Kuang
Committed by GitHub

Remove spaces after punctuations for TTS (#1666)

@@ -6,6 +6,7 @@ @@ -6,6 +6,7 @@
6 6
7 #include <fstream> 7 #include <fstream>
8 #include <regex> // NOLINT 8 #include <regex> // NOLINT
  9 +#include <unordered_set>
9 #include <utility> 10 #include <utility>
10 11
11 #include "cppjieba/Jieba.hpp" 12 #include "cppjieba/Jieba.hpp"
@@ -16,6 +17,14 @@ @@ -16,6 +17,14 @@
16 17
17 namespace sherpa_onnx { 18 namespace sherpa_onnx {
18 19
  20 +static bool IsPunct(const std::string &s) {
  21 + static const std::unordered_set<std::string> puncts = {
  22 + ",", ".", "!", "?", ":", "\"", "'", ",",
  23 + "。", "!", "?", "“", "”", "‘", "’",
  24 + };
  25 + return puncts.count(s);
  26 +}
  27 +
19 class JiebaLexicon::Impl { 28 class JiebaLexicon::Impl {
20 public: 29 public:
21 Impl(const std::string &lexicon, const std::string &tokens, 30 Impl(const std::string &lexicon, const std::string &tokens,
@@ -67,8 +76,13 @@ class JiebaLexicon::Impl { @@ -67,8 +76,13 @@ class JiebaLexicon::Impl {
67 jieba_->Cut(text, words, is_hmm); 76 jieba_->Cut(text, words, is_hmm);
68 77
69 if (debug_) { 78 if (debug_) {
70 - SHERPA_ONNX_LOGE("input text: %s", text.c_str());  
71 - SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str()); 79 +#if __OHOS__
  80 + SHERPA_ONNX_LOGE("input text:\n%{public}s", text.c_str());
  81 + SHERPA_ONNX_LOGE("after replacing punctuations:\n%{public}s", s.c_str());
  82 +#else
  83 + SHERPA_ONNX_LOGE("input text:\n%s", text.c_str());
  84 + SHERPA_ONNX_LOGE("after replacing punctuations:\n%s", s.c_str());
  85 +#endif
72 86
73 std::ostringstream os; 87 std::ostringstream os;
74 std::string sep = ""; 88 std::string sep = "";
@@ -77,7 +91,52 @@ class JiebaLexicon::Impl { @@ -77,7 +91,52 @@ class JiebaLexicon::Impl {
77 sep = "_"; 91 sep = "_";
78 } 92 }
79 93
80 - SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str()); 94 +#if __OHOS__
  95 + SHERPA_ONNX_LOGE("after jieba processing:\n%{public}s", os.str().c_str());
  96 +#else
  97 + SHERPA_ONNX_LOGE("after jieba processing:\n%s", os.str().c_str());
  98 +#endif
  99 + }
  100 +
  101 + // drop a space or punctuation mark when the last kept token is already a space or punctuation mark
  102 + std::vector<std::string> words2 = std::move(words);
  103 + words.reserve(words2.size());
  104 +
  105 + for (int32_t i = 0; i < words2.size(); ++i) {
  106 + if (i == 0) {
  107 + words.push_back(std::move(words2[i]));
  108 + } else if (words2[i] == " ") {
  109 + if (words.back() == " " || IsPunct(words.back())) {
  110 + continue;
  111 + } else {
  112 + words.push_back(std::move(words2[i]));
  113 + }
  114 + } else if (IsPunct(words2[i])) {
  115 + if (words.back() == " " || IsPunct(words.back())) {
  116 + continue;
  117 + } else {
  118 + words.push_back(std::move(words2[i]));
  119 + }
  120 + } else {
  121 + words.push_back(std::move(words2[i]));
  122 + }
  123 + }
  124 +
  125 + if (debug_) {
  126 + std::ostringstream os;
  127 + std::string sep = "";
  128 + for (const auto &w : words) {
  129 + os << sep << w;
  130 + sep = "_";
  131 + }
  132 +
  133 +#if __OHOS__
  134 + SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%{public}s",
  135 + os.str().c_str());
  136 +#else
  137 + SHERPA_ONNX_LOGE("after removing spaces after punctuations:\n%s",
  138 + os.str().c_str());
  139 +#endif
81 } 140 }
82 141
83 std::vector<TokenIDs> ans; 142 std::vector<TokenIDs> ans;
@@ -86,7 +145,11 @@ class JiebaLexicon::Impl { @@ -86,7 +145,11 @@ class JiebaLexicon::Impl {
86 for (const auto &w : words) { 145 for (const auto &w : words) {
87 auto ids = ConvertWordToIds(w); 146 auto ids = ConvertWordToIds(w);
88 if (ids.empty()) { 147 if (ids.empty()) {
  148 +#if __OHOS__
  149 + SHERPA_ONNX_LOGE("Ignore OOV '%{public}s'", w.c_str());
  150 +#else
89 SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str()); 151 SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str());
  152 +#endif
90 continue; 153 continue;
91 } 154 }
92 155
@@ -173,8 +236,15 @@ class JiebaLexicon::Impl { @@ -173,8 +236,15 @@ class JiebaLexicon::Impl {
173 ToLowerCase(&word); 236 ToLowerCase(&word);
174 237
175 if (word2ids_.count(word)) { 238 if (word2ids_.count(word)) {
  239 +#if __OHOS__
  240 + SHERPA_ONNX_LOGE(
  241 + "Duplicated word: %{public}s at line %{public}d:%{public}s. Ignore "
  242 + "it.",
  243 + word.c_str(), line_num, line.c_str());
  244 +#else
176 SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.", 245 SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
177 word.c_str(), line_num, line.c_str()); 246 word.c_str(), line_num, line.c_str());
  247 +#endif
178 continue; 248 continue;
179 } 249 }
180 250