Committed by GitHub
Add C++ support for MatchaTTS models not from icefall. (#1834)
正在显示 5 个修改的文件，包含 61 行增加和 7 行删除
| @@ -44,6 +44,28 @@ done | @@ -44,6 +44,28 @@ done | ||
| 44 | rm -rf kokoro-en-v0_19 | 44 | rm -rf kokoro-en-v0_19 |
| 45 | 45 | ||
| 46 | log "------------------------------------------------------------" | 46 | log "------------------------------------------------------------" |
| 47 | +log "matcha-tts-fa_en-male" | ||
| 48 | +log "------------------------------------------------------------" | ||
| 49 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-tts-fa_en-male.tar.bz2 | ||
| 50 | +tar xvf matcha-tts-fa_en-male.tar.bz2 | ||
| 51 | +rm matcha-tts-fa_en-male.tar.bz2 | ||
| 52 | + | ||
| 53 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 54 | + | ||
| 55 | +$EXE \ | ||
| 56 | + --matcha-acoustic-model=./matcha-tts-fa_en-male/model.onnx \ | ||
| 57 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 58 | + --matcha-tokens=./matcha-tts-fa_en-male/tokens.txt \ | ||
| 59 | + --matcha-data-dir=./matcha-tts-fa_en-male/espeak-ng-data \ | ||
| 60 | + --output-filename=./tts/test-matcha-fa-en-male.wav \ | ||
| 61 | + --num-threads=2 \ | ||
| 62 | + "How are you doing today? این یک نمونه ی تست فارسی است. This is a test." | ||
| 63 | + | ||
| 64 | +rm -rf matcha-tts-fa_en-male | ||
| 65 | +rm hifigan_v2.onnx | ||
| 66 | +ls -lh tts/*.wav | ||
| 67 | + | ||
| 68 | +log "------------------------------------------------------------" | ||
| 47 | log "matcha-icefall-en_US-ljspeech" | 69 | log "matcha-icefall-en_US-ljspeech" |
| 48 | log "------------------------------------------------------------" | 70 | log "------------------------------------------------------------" |
| 49 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | 71 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 |
| @@ -64,6 +86,7 @@ $EXE \ | @@ -64,6 +86,7 @@ $EXE \ | ||
| 64 | 86 | ||
| 65 | rm hifigan_v2.onnx | 87 | rm hifigan_v2.onnx |
| 66 | rm -rf matcha-icefall-en_US-ljspeech | 88 | rm -rf matcha-icefall-en_US-ljspeech |
| 89 | +ls -lh tts/*.wav | ||
| 67 | 90 | ||
| 68 | log "------------------------------------------------------------" | 91 | log "------------------------------------------------------------" |
| 69 | log "matcha-icefall-zh-baker" | 92 | log "matcha-icefall-zh-baker" |
| @@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]: | @@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]: | ||
| 397 | m.dict_dir = m.model_dir + "/dict" | 397 | m.dict_dir = m.model_dir + "/dict" |
| 398 | m.vocoder = "hifigan_v2.onnx" | 398 | m.vocoder = "hifigan_v2.onnx" |
| 399 | 399 | ||
| 400 | - english_models = [ | 400 | + english_persian_models = [ |
| 401 | TtsModel( | 401 | TtsModel( |
| 402 | model_dir="matcha-icefall-en_US-ljspeech", | 402 | model_dir="matcha-icefall-en_US-ljspeech", |
| 403 | acoustic_model_name="model-steps-3.onnx", | 403 | acoustic_model_name="model-steps-3.onnx", |
| 404 | lang="en", | 404 | lang="en", |
| 405 | - ) | 405 | + ), |
| 406 | + TtsModel( | ||
| 407 | + model_dir="matcha-tts-fa_en-male", | ||
| 408 | + acoustic_model_name="model.onnx", | ||
| 409 | + lang="fa", | ||
| 410 | + ), | ||
| 411 | + TtsModel( | ||
| 412 | + model_dir="matcha-tts-fa_en-female", | ||
| 413 | + acoustic_model_name="model.onnx", | ||
| 414 | + lang="fa", | ||
| 415 | + ), | ||
| 406 | ] | 416 | ] |
| 407 | - for m in english_models: | 417 | + for m in english_persian_models: |
| 408 | m.data_dir = f"{m.model_dir}/espeak-ng-data" | 418 | m.data_dir = f"{m.model_dir}/espeak-ng-data" |
| 409 | m.vocoder = "hifigan_v2.onnx" | 419 | m.vocoder = "hifigan_v2.onnx" |
| 410 | 420 | ||
| 411 | - return chinese_models + english_models | 421 | + return chinese_models + english_persian_models |
| 412 | 422 | ||
| 413 | 423 | ||
| 414 | def get_kokoro_models() -> List[TtsModel]: | 424 | def get_kokoro_models() -> List[TtsModel]: |
| @@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | @@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | ||
| 214 | } | 214 | } |
| 215 | 215 | ||
| 216 | std::vector<TokenIDs> token_ids = | 216 | std::vector<TokenIDs> token_ids = |
| 217 | - frontend_->ConvertTextToTokenIds(text, "en-US"); | 217 | + frontend_->ConvertTextToTokenIds(text, meta_data.voice); |
| 218 | 218 | ||
| 219 | if (token_ids.empty() || | 219 | if (token_ids.empty() || |
| 220 | (token_ids.size() == 1 && token_ids[0].tokens.empty())) { | 220 | (token_ids.size() == 1 && token_ids[0].tokens.empty())) { |
| @@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData { | @@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData { | ||
| 21 | int32_t has_espeak = 0; | 21 | int32_t has_espeak = 0; |
| 22 | int32_t use_eos_bos = 0; | 22 | int32_t use_eos_bos = 0; |
| 23 | int32_t pad_id = 0; | 23 | int32_t pad_id = 0; |
| 24 | + | ||
| 25 | + std::string voice; | ||
| 24 | }; | 26 | }; |
| 25 | 27 | ||
| 26 | } // namespace sherpa_onnx | 28 | } // namespace sherpa_onnx |
| @@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl { | @@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl { | ||
| 83 | Ort::Value sid_tensor = | 83 | Ort::Value sid_tensor = |
| 84 | Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1); | 84 | Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1); |
| 85 | 85 | ||
| 86 | + std::array<float, 2> scales = {noise_scale, length_scale}; | ||
| 87 | + int64_t scales_shape = 2; | ||
| 88 | + | ||
| 89 | + Ort::Value scales_tensor = Ort::Value::CreateTensor( | ||
| 90 | + memory_info, scales.data(), scales.size(), &scales_shape, 1); | ||
| 91 | + | ||
| 86 | std::vector<Ort::Value> inputs; | 92 | std::vector<Ort::Value> inputs; |
| 87 | inputs.reserve(5); | 93 | inputs.reserve(5); |
| 88 | inputs.push_back(std::move(x)); | 94 | inputs.push_back(std::move(x)); |
| 89 | inputs.push_back(std::move(x_length)); | 95 | inputs.push_back(std::move(x_length)); |
| 90 | - inputs.push_back(std::move(noise_scale_tensor)); | ||
| 91 | - inputs.push_back(std::move(length_scale_tensor)); | 96 | + if (input_names_[2] == "scales") { |
| 97 | + // for models from | ||
| 98 | + // https://github.com/shivammehta25/Matcha-TTS | ||
| 99 | + inputs.push_back(std::move(scales_tensor)); | ||
| 100 | + } else { | ||
| 101 | + // for models from icefall | ||
| 102 | + inputs.push_back(std::move(noise_scale_tensor)); | ||
| 103 | + inputs.push_back(std::move(length_scale_tensor)); | ||
| 104 | + } | ||
| 92 | 105 | ||
| 93 | if (input_names_.size() == 5 && input_names_.back() == "sid") { | 106 | if (input_names_.size() == 5 && input_names_.back() == "sid") { |
| 107 | + // for models from icefall | ||
| 94 | inputs.push_back(std::move(sid_tensor)); | 108 | inputs.push_back(std::move(sid_tensor)); |
| 109 | + | ||
| 110 | + // Note that we do not yet support multi-speaker TTS models from | ||
| 111 | + // https://github.com/shivammehta25/Matcha-TTS | ||
| 95 | } | 112 | } |
| 96 | 113 | ||
| 97 | auto out = | 114 | auto out = |
| @@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl { | @@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl { | ||
| 145 | SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); | 162 | SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); |
| 146 | SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); | 163 | SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); |
| 147 | SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); | 164 | SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); |
| 165 | + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice", | ||
| 166 | + "en-us"); | ||
| 148 | } | 167 | } |
| 149 | 168 | ||
| 150 | private: | 169 | private: |
-
请注册或登录后发表评论