Support specifying pronunciations of phrases in Chinese TTS. (#2507)

This PR implements support for specifying pronunciations of phrases in Chinese TTS by modifying the lexicon processing logic. The change introduces a greedy longest-match algorithm that attempts to match multi-word phrases before falling back to individual word processing.

Support specifying pronunciations of phrases in Chinese TTS. (#2507)
This PR implements support for specifying pronunciations of phrases in Chinese TTS by modifying the lexicon processing logic. The change introduces a greedy longest-match algorithm that attempts to match multi-word phrases before falling back to individual word processing.
Fangjun Kuang · GitHub
Commit 13788a4e4cd5c4a243ebca7015ce9b865eaa23b3 13788a4e 1 parent c455ad2a
sherpa-onnx/csrc/jieba-lexicon.cc
--- a/sherpa-onnx/csrc/jieba-lexicon.cc
查看文件 @13788a4
+++ b/sherpa-onnx/csrc/jieba-lexicon.cc
查看文件 @13788a4
@@ -36,6 +36,22 @@ static bool IsPunct(const std::string &s) {
   return puncts.count(s);
 }
+// Joins words[start..end] (end is inclusive); returns "" on a bad index.
+static std::string GetWord(const std::vector<std::string> &words, int32_t start,
+                           int32_t end) {
+  std::string ans;
+  // Out-of-range guard; negative indices convert to huge unsigned values.
+  if (start >= words.size() || end >= words.size()) {
+    return ans;
+  }
+  // Appends nothing when start > end, leaving the result empty.
+  for (int32_t i = start; i <= end; ++i) {
+    ans += words[i];
+  }
+
+  return ans;
+}
+
 class JiebaLexicon::Impl {
  public:
   Impl(const std::string &lexicon, const std::string &tokens,
@@ -160,7 +176,46 @@ class JiebaLexicon::Impl {
     std::vector<TokenIDs> ans;
     std::vector<int64_t> this_sentence;
-    for (const auto &w : words) {
+    int32_t num_words = static_cast<int32_t>(words.size());
+    int32_t max_len = 10;
+
+    for (int32_t i = 0; i < num_words;) {
+      int32_t start = i;
+      int32_t end = std::min(i + max_len, num_words - 1);
+
+      std::string w;
+      while (end > start) {
+        auto this_word = GetWord(words, start, end);
+        if (debug_) {
+#if __OHOS__
+          SHERPA_ONNX_LOGE("%{public}d-%{public}d: %{public}s", start, end,
+                           this_word.c_str());
+#else
+          SHERPA_ONNX_LOGE("%d-%d: %s", start, end, this_word.c_str());
+#endif
+        }
+        if (word2ids_.count(this_word)) {
+          i = end + 1;
+          w = std::move(this_word);
+          if (debug_) {
+#if __OHOS__
+            SHERPA_ONNX_LOGE("matched %{public}d-%{public}d: %{public}s", start,
+                             end, w.c_str());
+#else
+            SHERPA_ONNX_LOGE("matched %d-%d: %s", start, end, w.c_str());
+#endif
+          }
+          break;
+        }
+
+        end -= 1;
+      }
+
+      if (w.empty()) {
+        w = words[i];
+        i += 1;
+      }
+
       auto ids = ConvertWordToIds(w);
       if (ids.empty()) {
 #if __OHOS__