Fangjun Kuang
Committed by GitHub

Fix punctuation handling in kokoro tts. (#2458)

... ... @@ -69,7 +69,9 @@ class KokoroMultiLangLexicon::Impl {
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
const std::string &voice) const {
std::string text = ToLowerCase(_text);
// We cannot convert the text to lowercase here since that would affect
// how piper_phonemize handles punctuation inside the text.
std::string text = _text;
if (debug_) {
SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
}
... ... @@ -300,7 +302,8 @@ class KokoroMultiLangLexicon::Impl {
this_sentence.push_back(0);
for (const auto &word : words) {
for (const auto &_word : words) {
auto word = ToLowerCase(_word);
if (IsPunctuation(word)) {
this_sentence.push_back(token2id_.at(word));
... ...
... ... @@ -32,6 +32,32 @@
namespace sherpa_onnx {
// Encode a single Unicode code point as a UTF-8 string. For debugging only.
//
// @param cp The code point to encode.
// @return The 1- to 4-byte UTF-8 encoding of cp. If cp is not a Unicode
//         scalar value, i.e. it is a UTF-16 surrogate (U+D800..U+DFFF) or is
//         greater than U+10FFFF, an error is logged and an empty string is
//         returned.
static std::string ToString(char32_t cp) {
  std::string result;

  // Surrogates are reserved for UTF-16 and have no well-formed UTF-8
  // encoding (RFC 3629, section 3), so reject them together with values
  // beyond the end of the Unicode code space.
  const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
  if (is_surrogate || cp > 0x10FFFF) {
    SHERPA_ONNX_LOGE("Invalid Unicode code point: %d",
                     static_cast<int32_t>(cp));
    return result;
  }

  if (cp <= 0x7F) {
    // 1 byte: 0xxxxxxx
    result += static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    // 2 bytes: 110xxxxx 10xxxxxx
    result += static_cast<char>(0xC0 | ((cp >> 6) & 0x1F));
    result += static_cast<char>(0x80 | (cp & 0x3F));
  } else if (cp <= 0xFFFF) {
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result += static_cast<char>(0xE0 | ((cp >> 12) & 0x0F));
    result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    result += static_cast<char>(0x80 | (cp & 0x3F));
  } else {
    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    result += static_cast<char>(0xF0 | ((cp >> 18) & 0x07));
    result += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
    result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
    result += static_cast<char>(0x80 | (cp & 0x3F));
  }

  return result;
}
void CallPhonemizeEspeak(const std::string &text,
piper::eSpeakPhonemeConfig &config, // NOLINT
std::vector<std::vector<piper::Phoneme>> *phonemes) {
... ... @@ -165,6 +191,7 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
current.push_back(0);
for (auto p : phonemes) {
// SHERPA_ONNX_LOGE("%d %s", static_cast<int32_t>(p), ToString(p).c_str());
if (token2id.count(p)) {
if (current.size() > max_len - 1) {
current.push_back(0);
... ... @@ -175,6 +202,9 @@ static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
}
current.push_back(token2id.at(p));
if (p == '.') {
current.push_back(token2id.at(' '));
}
} else {
SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.",
static_cast<uint32_t>(p));
... ...