Fangjun Kuang
Committed by GitHub

Support specifying voice in espeak-ng for kokoro tts models. (#1836)

@@ -104,7 +104,8 @@ class KokoroMultiLangLexicon::Impl { @@ -104,7 +104,8 @@ class KokoroMultiLangLexicon::Impl {
104 // https://en.cppreference.com/w/cpp/regex 104 // https://en.cppreference.com/w/cpp/regex
105 // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex 105 // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
106 std::string expr = 106 std::string expr =
107 - "([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)"; 107 + "([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
  108 + ")";
108 109
109 auto ws = ToWideString(text); 110 auto ws = ToWideString(text);
110 std::wstring wexpr = ToWideString(expr); 111 std::wstring wexpr = ToWideString(expr);
@@ -127,7 +128,7 @@ class KokoroMultiLangLexicon::Impl { @@ -127,7 +128,7 @@ class KokoroMultiLangLexicon::Impl {
127 if (debug_) { 128 if (debug_) {
128 SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); 129 SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
129 } 130 }
130 - ids_vec = ConvertEnglishToTokenIDs(ms); 131 + ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
131 } else { 132 } else {
132 if (debug_) { 133 if (debug_) {
133 SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); 134 SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
@@ -257,7 +258,7 @@ class KokoroMultiLangLexicon::Impl { @@ -257,7 +258,7 @@ class KokoroMultiLangLexicon::Impl {
257 } 258 }
258 259
259 std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs( 260 std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
260 - const std::string &text) const { 261 + const std::string &text, const std::string &voice) const {
261 std::vector<std::string> words = SplitUtf8(text); 262 std::vector<std::string> words = SplitUtf8(text);
262 if (debug_) { 263 if (debug_) {
263 std::ostringstream os; 264 std::ostringstream os;
@@ -315,7 +316,7 @@ class KokoroMultiLangLexicon::Impl { @@ -315,7 +316,7 @@ class KokoroMultiLangLexicon::Impl {
315 316
316 piper::eSpeakPhonemeConfig config; 317 piper::eSpeakPhonemeConfig config;
317 318
318 - config.voice = "en-us"; 319 + config.voice = voice;
319 320
320 std::vector<std::vector<piper::Phoneme>> phonemes; 321 std::vector<std::vector<piper::Phoneme>> phonemes;
321 322
@@ -221,7 +221,7 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { @@ -221,7 +221,7 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
221 } 221 }
222 222
223 std::vector<TokenIDs> token_ids = 223 std::vector<TokenIDs> token_ids =
224 - frontend_->ConvertTextToTokenIds(text, "en-us"); 224 + frontend_->ConvertTextToTokenIds(text, meta_data.voice);
225 225
226 if (token_ids.empty() || 226 if (token_ids.empty() ||
227 (token_ids.size() == 1 && token_ids[0].tokens.empty())) { 227 (token_ids.size() == 1 && token_ids[0].tokens.empty())) {
@@ -18,6 +18,8 @@ struct OfflineTtsKokoroModelMetaData { @@ -18,6 +18,8 @@ struct OfflineTtsKokoroModelMetaData {
18 int32_t version = 1; 18 int32_t version = 1;
19 int32_t has_espeak = 1; 19 int32_t has_espeak = 1;
20 int32_t max_token_len = 0; 20 int32_t max_token_len = 0;
  21 +
  22 + std::string voice;
21 }; 23 };
22 24
23 } // namespace sherpa_onnx 25 } // namespace sherpa_onnx
@@ -138,6 +138,8 @@ class OfflineTtsKokoroModel::Impl { @@ -138,6 +138,8 @@ class OfflineTtsKokoroModel::Impl {
138 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); 138 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
139 SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); 139 SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
140 SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); 140 SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
  141 + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
  142 + "en-us");
141 143
142 if (config_.debug) { 144 if (config_.debug) {
143 std::vector<std::string> speaker_names; 145 std::vector<std::string> speaker_names;