Fix punctuations in kokoro tts. (#2458)

Fangjun Kuang · GitHub
Commit 07d69576d29caa388f4d3f6f2fc55455a0b0cc6f 07d69576 1 parent 6b16c0b8
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
sherpa-onnx/csrc/piper-phonemize-lexicon.cc
--- a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
查看文件 @07d6957
+++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
查看文件 @07d6957
@@ -69,7 +69,9 @@ class KokoroMultiLangLexicon::Impl {
   std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
                                               const std::string &voice) const {
-    std::string text = ToLowerCase(_text);
+    // Do not convert the text to lowercase here: doing so would change
+    // how piper_phonemize handles punctuation inside the text.
+    std::string text = _text;
     if (debug_) {
       SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
     }
@@ -300,7 +302,8 @@ class KokoroMultiLangLexicon::Impl {
     this_sentence.push_back(0);
-    for (const auto &word : words) {
+    for (const auto &_word : words) {
+      auto word = ToLowerCase(_word);
       if (IsPunctuation(word)) {
         this_sentence.push_back(token2id_.at(word));
--- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc
查看文件 @07d6957
+++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc
查看文件 @07d6957
@@ -32,6 +32,32 @@
 namespace sherpa_onnx {
+// Encode a single Unicode code point as a UTF-8 string. For debugging only.
+//
+// @param cp The code point to encode.
+// @return The 1- to 4-byte UTF-8 encoding of cp. An empty string is returned
+//         (after logging an error) when cp is not a Unicode scalar value,
+//         i.e., when it is a UTF-16 surrogate (U+D800..U+DFFF) or is larger
+//         than U+10FFFF.
+static std::string ToString(char32_t cp) {
+  std::string result;
+
+  // Surrogates are reserved for UTF-16; encoding them would produce
+  // ill-formed UTF-8, so treat them like out-of-range values.
+  bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
+
+  if (is_surrogate || cp > 0x10FFFF) {
+    SHERPA_ONNX_LOGE("Invalid Unicode code point: %d",
+                     static_cast<int32_t>(cp));
+    return result;
+  }
+
+  if (cp <= 0x7F) {
+    // 1 byte: 0xxxxxxx
+    result += static_cast<char>(cp);
+  } else if (cp <= 0x7FF) {
+    // 2 bytes: 110xxxxx 10xxxxxx
+    result += static_cast<char>(0xC0 | ((cp >> 6) & 0x1F));
+    result += static_cast<char>(0x80 | (cp & 0x3F));
+  } else if (cp <= 0xFFFF) {
+    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    result += static_cast<char>(0xE0 | ((cp >> 12) & 0x0F));
+    result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
+    result += static_cast<char>(0x80 | (cp & 0x3F));
+  } else {
+    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    result += static_cast<char>(0xF0 | ((cp >> 18) & 0x07));
+    result += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
+    result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
+    result += static_cast<char>(0x80 | (cp & 0x3F));
+  }
+
+  return result;
+}
+
 void CallPhonemizeEspeak(const std::string &text,
                          piper::eSpeakPhonemeConfig &config,  // NOLINT
                          std::vector<std::vector<piper::Phoneme>> *phonemes) {
@@ -165,6 +191,7 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
   current.push_back(0);
   for (auto p : phonemes) {
+    // SHERPA_ONNX_LOGE("%d %s", static_cast<int32_t>(p), ToString(p).c_str());
     if (token2id.count(p)) {
       if (current.size() > max_len - 1) {
         current.push_back(0);
@@ -175,6 +202,9 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
       }
       current.push_back(token2id.at(p));
+      if (p == '.') {
+        current.push_back(token2id.at(' '));
+      }
     } else {
       SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.",
                        static_cast<uint32_t>(p));