Fangjun Kuang
Committed by GitHub

Fix punctuation handling in Kokoro TTS. (#2458)

@@ -69,7 +69,9 @@ class KokoroMultiLangLexicon::Impl { @@ -69,7 +69,9 @@ class KokoroMultiLangLexicon::Impl {
69 69
70 std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text, 70 std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
71 const std::string &voice) const { 71 const std::string &voice) const {
72 - std::string text = ToLowerCase(_text); 72 + // we cannot convert text to lowercase here since it will affect
  73 + // how piper_phonemize handles punctuations inside the text
  74 + std::string text = _text;
73 if (debug_) { 75 if (debug_) {
74 SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str()); 76 SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
75 } 77 }
@@ -300,7 +302,8 @@ class KokoroMultiLangLexicon::Impl { @@ -300,7 +302,8 @@ class KokoroMultiLangLexicon::Impl {
300 302
301 this_sentence.push_back(0); 303 this_sentence.push_back(0);
302 304
303 - for (const auto &word : words) { 305 + for (const auto &_word : words) {
  306 + auto word = ToLowerCase(_word);
304 if (IsPunctuation(word)) { 307 if (IsPunctuation(word)) {
305 this_sentence.push_back(token2id_.at(word)); 308 this_sentence.push_back(token2id_.at(word));
306 309
@@ -32,6 +32,32 @@ @@ -32,6 +32,32 @@
32 32
33 namespace sherpa_onnx { 33 namespace sherpa_onnx {
34 34
  35 +// Encode a single char32_t to UTF-8 string. For debugging only
  36 +static std::string ToString(char32_t cp) {
  37 + std::string result;
  38 +
  39 + if (cp <= 0x7F) {
  40 + result += static_cast<char>(cp);
  41 + } else if (cp <= 0x7FF) {
  42 + result += static_cast<char>(0xC0 | ((cp >> 6) & 0x1F));
  43 + result += static_cast<char>(0x80 | (cp & 0x3F));
  44 + } else if (cp <= 0xFFFF) {
  45 + result += static_cast<char>(0xE0 | ((cp >> 12) & 0x0F));
  46 + result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
  47 + result += static_cast<char>(0x80 | (cp & 0x3F));
  48 + } else if (cp <= 0x10FFFF) {
  49 + result += static_cast<char>(0xF0 | ((cp >> 18) & 0x07));
  50 + result += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
  51 + result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
  52 + result += static_cast<char>(0x80 | (cp & 0x3F));
  53 + } else {
  54 + SHERPA_ONNX_LOGE("Invalid Unicode code point: %d",
  55 + static_cast<int32_t>(cp));
  56 + }
  57 +
  58 + return result;
  59 +}
  60 +
35 void CallPhonemizeEspeak(const std::string &text, 61 void CallPhonemizeEspeak(const std::string &text,
36 piper::eSpeakPhonemeConfig &config, // NOLINT 62 piper::eSpeakPhonemeConfig &config, // NOLINT
37 std::vector<std::vector<piper::Phoneme>> *phonemes) { 63 std::vector<std::vector<piper::Phoneme>> *phonemes) {
@@ -165,6 +191,7 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro( @@ -165,6 +191,7 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
165 current.push_back(0); 191 current.push_back(0);
166 192
167 for (auto p : phonemes) { 193 for (auto p : phonemes) {
  194 + // SHERPA_ONNX_LOGE("%d %s", static_cast<int32_t>(p), ToString(p).c_str());
168 if (token2id.count(p)) { 195 if (token2id.count(p)) {
169 if (current.size() > max_len - 1) { 196 if (current.size() > max_len - 1) {
170 current.push_back(0); 197 current.push_back(0);
@@ -175,6 +202,9 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro( @@ -175,6 +202,9 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
175 } 202 }
176 203
177 current.push_back(token2id.at(p)); 204 current.push_back(token2id.at(p));
  205 + if (p == '.') {
  206 + current.push_back(token2id.at(' '));
  207 + }
178 } else { 208 } else {
179 SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.", 209 SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.",
180 static_cast<uint32_t>(p)); 210 static_cast<uint32_t>(p));