Committed by GitHub
Support specifying voice in espeak-ng for kokoro tts models. (#1836)
正在显示 4 个修改的文件，包含 10 行增加和 5 行删除
| @@ -104,7 +104,8 @@ class KokoroMultiLangLexicon::Impl { | @@ -104,7 +104,8 @@ class KokoroMultiLangLexicon::Impl { | ||
| 104 | // https://en.cppreference.com/w/cpp/regex | 104 | // https://en.cppreference.com/w/cpp/regex |
| 105 | // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex | 105 | // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex |
| 106 | std::string expr = | 106 | std::string expr = |
| 107 | - "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)"; | 107 | + "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+" |
| 108 | + ")"; | ||
| 108 | 109 | ||
| 109 | auto ws = ToWideString(text); | 110 | auto ws = ToWideString(text); |
| 110 | std::wstring wexpr = ToWideString(expr); | 111 | std::wstring wexpr = ToWideString(expr); |
| @@ -127,7 +128,7 @@ class KokoroMultiLangLexicon::Impl { | @@ -127,7 +128,7 @@ class KokoroMultiLangLexicon::Impl { | ||
| 127 | if (debug_) { | 128 | if (debug_) { |
| 128 | SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); | 129 | SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); |
| 129 | } | 130 | } |
| 130 | - ids_vec = ConvertEnglishToTokenIDs(ms); | 131 | + ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); |
| 131 | } else { | 132 | } else { |
| 132 | if (debug_) { | 133 | if (debug_) { |
| 133 | SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); | 134 | SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); |
| @@ -257,7 +258,7 @@ class KokoroMultiLangLexicon::Impl { | @@ -257,7 +258,7 @@ class KokoroMultiLangLexicon::Impl { | ||
| 257 | } | 258 | } |
| 258 | 259 | ||
| 259 | std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs( | 260 | std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs( |
| 260 | - const std::string &text) const { | 261 | + const std::string &text, const std::string &voice) const { |
| 261 | std::vector<std::string> words = SplitUtf8(text); | 262 | std::vector<std::string> words = SplitUtf8(text); |
| 262 | if (debug_) { | 263 | if (debug_) { |
| 263 | std::ostringstream os; | 264 | std::ostringstream os; |
| @@ -315,7 +316,7 @@ class KokoroMultiLangLexicon::Impl { | @@ -315,7 +316,7 @@ class KokoroMultiLangLexicon::Impl { | ||
| 315 | 316 | ||
| 316 | piper::eSpeakPhonemeConfig config; | 317 | piper::eSpeakPhonemeConfig config; |
| 317 | 318 | ||
| 318 | - config.voice = "en-us"; | 319 | + config.voice = voice; |
| 319 | 320 | ||
| 320 | std::vector<std::vector<piper::Phoneme>> phonemes; | 321 | std::vector<std::vector<piper::Phoneme>> phonemes; |
| 321 | 322 |
| @@ -221,7 +221,7 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | @@ -221,7 +221,7 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | ||
| 221 | } | 221 | } |
| 222 | 222 | ||
| 223 | std::vector<TokenIDs> token_ids = | 223 | std::vector<TokenIDs> token_ids = |
| 224 | - frontend_->ConvertTextToTokenIds(text, "en-us"); | 224 | + frontend_->ConvertTextToTokenIds(text, meta_data.voice); |
| 225 | 225 | ||
| 226 | if (token_ids.empty() || | 226 | if (token_ids.empty() || |
| 227 | (token_ids.size() == 1 && token_ids[0].tokens.empty())) { | 227 | (token_ids.size() == 1 && token_ids[0].tokens.empty())) { |
| @@ -18,6 +18,8 @@ struct OfflineTtsKokoroModelMetaData { | @@ -18,6 +18,8 @@ struct OfflineTtsKokoroModelMetaData { | ||
| 18 | int32_t version = 1; | 18 | int32_t version = 1; |
| 19 | int32_t has_espeak = 1; | 19 | int32_t has_espeak = 1; |
| 20 | int32_t max_token_len = 0; | 20 | int32_t max_token_len = 0; |
| 21 | + | ||
| 22 | + std::string voice; | ||
| 21 | }; | 23 | }; |
| 22 | 24 | ||
| 23 | } // namespace sherpa_onnx | 25 | } // namespace sherpa_onnx |
| @@ -138,6 +138,8 @@ class OfflineTtsKokoroModel::Impl { | @@ -138,6 +138,8 @@ class OfflineTtsKokoroModel::Impl { | ||
| 138 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); | 138 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); |
| 139 | SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); | 139 | SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); |
| 140 | SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); | 140 | SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); |
| 141 | + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice", | ||
| 142 | + "en-us"); | ||
| 141 | 143 | ||
| 142 | if (config_.debug) { | 144 | if (config_.debug) { |
| 143 | std::vector<std::string> speaker_names; | 145 | std::vector<std::string> speaker_names; |
-
请 注册 或 登录 后发表评论