Fangjun Kuang
Committed by GitHub

Support specifying pronunciations of phrases in Chinese TTS. (#2507)

This PR implements support for specifying pronunciations of phrases in Chinese TTS by modifying the lexicon processing logic. The change introduces a greedy longest-match algorithm that attempts to match multi-word phrases before falling back to individual word processing.
@@ -36,6 +36,22 @@ static bool IsPunct(const std::string &s) { @@ -36,6 +36,22 @@ static bool IsPunct(const std::string &s) {
36 return puncts.count(s); 36 return puncts.count(s);
37 } 37 }
38 38
  39 +// end is inclusive
  40 +static std::string GetWord(const std::vector<std::string> &words, int32_t start,
  41 + int32_t end) {
  42 + std::string ans;
  43 +
  44 + if (start >= words.size() || end >= words.size()) {
  45 + return ans;
  46 + }
  47 +
  48 + for (int32_t i = start; i <= end; ++i) {
  49 + ans += words[i];
  50 + }
  51 +
  52 + return ans;
  53 +}
  54 +
39 class JiebaLexicon::Impl { 55 class JiebaLexicon::Impl {
40 public: 56 public:
41 Impl(const std::string &lexicon, const std::string &tokens, 57 Impl(const std::string &lexicon, const std::string &tokens,
@@ -160,7 +176,46 @@ class JiebaLexicon::Impl { @@ -160,7 +176,46 @@ class JiebaLexicon::Impl {
160 std::vector<TokenIDs> ans; 176 std::vector<TokenIDs> ans;
161 std::vector<int64_t> this_sentence; 177 std::vector<int64_t> this_sentence;
162 178
163 - for (const auto &w : words) { 179 + int32_t num_words = static_cast<int32_t>(words.size());
  180 + int32_t max_len = 10;
  181 +
  182 + for (int32_t i = 0; i < num_words;) {
  183 + int32_t start = i;
  184 + int32_t end = std::min(i + max_len, num_words - 1);
  185 +
  186 + std::string w;
  187 + while (end > start) {
  188 + auto this_word = GetWord(words, start, end);
  189 + if (debug_) {
  190 +#if __OHOS__
  191 + SHERPA_ONNX_LOGE("%{public}d-%{public}d: %{public}s", start, end,
  192 + this_word.c_str());
  193 +#else
  194 + SHERPA_ONNX_LOGE("%d-%d: %s", start, end, this_word.c_str());
  195 +#endif
  196 + }
  197 + if (word2ids_.count(this_word)) {
  198 + i = end + 1;
  199 + w = std::move(this_word);
  200 + if (debug_) {
  201 +#if __OHOS__
  202 + SHERPA_ONNX_LOGE("matched %{public}d-%{public}d: %{public}s", start,
  203 + end, w.c_str());
  204 +#else
  205 + SHERPA_ONNX_LOGE("matched %d-%d: %s", start, end, w.c_str());
  206 +#endif
  207 + }
  208 + break;
  209 + }
  210 +
  211 + end -= 1;
  212 + }
  213 +
  214 + if (w.empty()) {
  215 + w = words[i];
  216 + i += 1;
  217 + }
  218 +
164 auto ids = ConvertWordToIds(w); 219 auto ids = ConvertWordToIds(w);
165 if (ids.empty()) { 220 if (ids.empty()) {
166 #if __OHOS__ 221 #if __OHOS__