继续操作前请注册或者登录。
Fangjun Kuang
Committed by GitHub

Add C++ support for MatchaTTS models not from icefall. (#1834)

@@ -44,6 +44,28 @@ done @@ -44,6 +44,28 @@ done
44 rm -rf kokoro-en-v0_19 44 rm -rf kokoro-en-v0_19
45 45
46 log "------------------------------------------------------------" 46 log "------------------------------------------------------------"
  47 +log "matcha-tts-fa_en-male"
  48 +log "------------------------------------------------------------"
  49 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-tts-fa_en-male.tar.bz2
  50 +tar xvf matcha-tts-fa_en-male.tar.bz2
  51 +rm matcha-tts-fa_en-male.tar.bz2
  52 +
  53 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  54 +
  55 +$EXE \
  56 + --matcha-acoustic-model=./matcha-tts-fa_en-male/model.onnx \
  57 + --matcha-vocoder=./hifigan_v2.onnx \
  58 + --matcha-tokens=./matcha-tts-fa_en-male/tokens.txt \
  59 + --matcha-data-dir=./matcha-tts-fa_en-male/espeak-ng-data \
  60 + --output-filename=./tts/test-matcha-fa-en-male.wav \
  61 + --num-threads=2 \
  62 + "How are you doing today? این یک نمونه ی تست فارسی است. This is a test."
  63 +
  64 +rm -rf matcha-tts-fa_en-male
  65 +rm hifigan_v2.onnx
  66 +ls -lh tts/*.wav
  67 +
  68 +log "------------------------------------------------------------"
47 log "matcha-icefall-en_US-ljspeech" 69 log "matcha-icefall-en_US-ljspeech"
48 log "------------------------------------------------------------" 70 log "------------------------------------------------------------"
49 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 71 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
@@ -64,6 +86,7 @@ $EXE \ @@ -64,6 +86,7 @@ $EXE \
64 86
65 rm hifigan_v2.onnx 87 rm hifigan_v2.onnx
66 rm -rf matcha-icefall-en_US-ljspeech 88 rm -rf matcha-icefall-en_US-ljspeech
  89 +ls -lh tts/*.wav
67 90
68 log "------------------------------------------------------------" 91 log "------------------------------------------------------------"
69 log "matcha-icefall-zh-baker" 92 log "matcha-icefall-zh-baker"
@@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]: @@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]:
397 m.dict_dir = m.model_dir + "/dict" 397 m.dict_dir = m.model_dir + "/dict"
398 m.vocoder = "hifigan_v2.onnx" 398 m.vocoder = "hifigan_v2.onnx"
399 399
400 - english_models = [ 400 + english_persian_models = [
401 TtsModel( 401 TtsModel(
402 model_dir="matcha-icefall-en_US-ljspeech", 402 model_dir="matcha-icefall-en_US-ljspeech",
403 acoustic_model_name="model-steps-3.onnx", 403 acoustic_model_name="model-steps-3.onnx",
404 lang="en", 404 lang="en",
405 - ) 405 + ),
  406 + TtsModel(
  407 + model_dir="matcha-tts-fa_en-male",
  408 + acoustic_model_name="model.onnx",
  409 + lang="fa",
  410 + ),
  411 + TtsModel(
  412 + model_dir="matcha-tts-fa_en-female",
  413 + acoustic_model_name="model.onnx",
  414 + lang="fa",
  415 + ),
406 ] 416 ]
407 - for m in english_models: 417 + for m in english_persian_models:
408 m.data_dir = f"{m.model_dir}/espeak-ng-data" 418 m.data_dir = f"{m.model_dir}/espeak-ng-data"
409 m.vocoder = "hifigan_v2.onnx" 419 m.vocoder = "hifigan_v2.onnx"
410 420
411 - return chinese_models + english_models 421 + return chinese_models + english_persian_models
412 422
413 423
414 def get_kokoro_models() -> List[TtsModel]: 424 def get_kokoro_models() -> List[TtsModel]:
@@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { @@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
214 } 214 }
215 215
216 std::vector<TokenIDs> token_ids = 216 std::vector<TokenIDs> token_ids =
217 - frontend_->ConvertTextToTokenIds(text, "en-US"); 217 + frontend_->ConvertTextToTokenIds(text, meta_data.voice);
218 218
219 if (token_ids.empty() || 219 if (token_ids.empty() ||
220 (token_ids.size() == 1 && token_ids[0].tokens.empty())) { 220 (token_ids.size() == 1 && token_ids[0].tokens.empty())) {
@@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData { @@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData {
21 int32_t has_espeak = 0; 21 int32_t has_espeak = 0;
22 int32_t use_eos_bos = 0; 22 int32_t use_eos_bos = 0;
23 int32_t pad_id = 0; 23 int32_t pad_id = 0;
  24 +
  25 + std::string voice;
24 }; 26 };
25 27
26 } // namespace sherpa_onnx 28 } // namespace sherpa_onnx
@@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl { @@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl {
83 Ort::Value sid_tensor = 83 Ort::Value sid_tensor =
84 Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1); 84 Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1);
85 85
  86 + std::array<float, 2> scales = {noise_scale, length_scale};
  87 + int64_t scales_shape = 2;
  88 +
  89 + Ort::Value scales_tensor = Ort::Value::CreateTensor(
  90 + memory_info, scales.data(), scales.size(), &scales_shape, 1);
  91 +
86 std::vector<Ort::Value> inputs; 92 std::vector<Ort::Value> inputs;
87 inputs.reserve(5); 93 inputs.reserve(5);
88 inputs.push_back(std::move(x)); 94 inputs.push_back(std::move(x));
89 inputs.push_back(std::move(x_length)); 95 inputs.push_back(std::move(x_length));
90 - inputs.push_back(std::move(noise_scale_tensor));  
91 - inputs.push_back(std::move(length_scale_tensor)); 96 + if (input_names_[2] == "scales") {
  97 + // for models from
  98 + // https://github.com/shivammehta25/Matcha-TTS
  99 + inputs.push_back(std::move(scales_tensor));
  100 + } else {
  101 + // for models from icefall
  102 + inputs.push_back(std::move(noise_scale_tensor));
  103 + inputs.push_back(std::move(length_scale_tensor));
  104 + }
92 105
93 if (input_names_.size() == 5 && input_names_.back() == "sid") { 106 if (input_names_.size() == 5 && input_names_.back() == "sid") {
  107 + // for models from icefall
94 inputs.push_back(std::move(sid_tensor)); 108 inputs.push_back(std::move(sid_tensor));
  109 +
  110 + // Note that we do not yet support multi-speaker tts models from
  111 + // https://github.com/shivammehta25/Matcha-TTS
95 } 112 }
96 113
97 auto out = 114 auto out =
@@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl { @@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl {
145 SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); 162 SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
146 SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); 163 SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
147 SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); 164 SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
  165 + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
  166 + "en-us");
148 } 167 }
149 168
150 private: 169 private: