Fangjun Kuang
Committed by GitHub

Add C++ support for MatchaTTS models not from icefall. (#1834)

... ... @@ -44,6 +44,28 @@ done
rm -rf kokoro-en-v0_19
# Smoke test for the matcha-tts-fa_en-male model: download the model archive
# and the HiFi-GAN vocoder, synthesize one mixed English/Persian sentence
# with $EXE, then remove the downloaded files.
log "------------------------------------------------------------"
log "matcha-tts-fa_en-male"
log "------------------------------------------------------------"
# Fetch and unpack the acoustic model; the archive is deleted once extracted.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-tts-fa_en-male.tar.bz2
tar xvf matcha-tts-fa_en-male.tar.bz2
rm matcha-tts-fa_en-male.tar.bz2
# The vocoder is published separately from the acoustic model.
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
# Tokens and espeak-ng data are read from the extracted model directory;
# the generated audio is written to ./tts/test-matcha-fa-en-male.wav.
$EXE \
--matcha-acoustic-model=./matcha-tts-fa_en-male/model.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--matcha-tokens=./matcha-tts-fa_en-male/tokens.txt \
--matcha-data-dir=./matcha-tts-fa_en-male/espeak-ng-data \
--output-filename=./tts/test-matcha-fa-en-male.wav \
--num-threads=2 \
"How are you doing today? این یک نمونه ی تست فارسی است. This is a test."
# Clean up the downloaded model and vocoder, then list the generated wave files.
rm -rf matcha-tts-fa_en-male
rm hifigan_v2.onnx
ls -lh tts/*.wav
log "------------------------------------------------------------"
log "matcha-icefall-en_US-ljspeech"
log "------------------------------------------------------------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
... ... @@ -64,6 +86,7 @@ $EXE \
rm hifigan_v2.onnx
rm -rf matcha-icefall-en_US-ljspeech
ls -lh tts/*.wav
log "------------------------------------------------------------"
log "matcha-icefall-zh-baker"
... ...
... ... @@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]:
m.dict_dir = m.model_dir + "/dict"
m.vocoder = "hifigan_v2.onnx"
english_models = [
english_persian_models = [
TtsModel(
model_dir="matcha-icefall-en_US-ljspeech",
acoustic_model_name="model-steps-3.onnx",
lang="en",
)
),
TtsModel(
model_dir="matcha-tts-fa_en-male",
acoustic_model_name="model.onnx",
lang="fa",
),
TtsModel(
model_dir="matcha-tts-fa_en-female",
acoustic_model_name="model.onnx",
lang="fa",
),
]
for m in english_models:
for m in english_persian_models:
m.data_dir = f"{m.model_dir}/espeak-ng-data"
m.vocoder = "hifigan_v2.onnx"
return chinese_models + english_models
return chinese_models + english_persian_models
def get_kokoro_models() -> List[TtsModel]:
... ...
... ... @@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
}
std::vector<TokenIDs> token_ids =
frontend_->ConvertTextToTokenIds(text, "en-US");
frontend_->ConvertTextToTokenIds(text, meta_data.voice);
if (token_ids.empty() ||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
... ...
... ... @@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData {
int32_t has_espeak = 0;
int32_t use_eos_bos = 0;
int32_t pad_id = 0;
std::string voice;
};
} // namespace sherpa_onnx
... ...
... ... @@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl {
Ort::Value sid_tensor =
Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1);
std::array<float, 2> scales = {noise_scale, length_scale};
int64_t scales_shape = 2;
Ort::Value scales_tensor = Ort::Value::CreateTensor(
memory_info, scales.data(), scales.size(), &scales_shape, 1);
std::vector<Ort::Value> inputs;
inputs.reserve(5);
inputs.push_back(std::move(x));
inputs.push_back(std::move(x_length));
inputs.push_back(std::move(noise_scale_tensor));
inputs.push_back(std::move(length_scale_tensor));
if (input_names_[2] == "scales") {
// for models from
// https://github.com/shivammehta25/Matcha-TTS
inputs.push_back(std::move(scales_tensor));
} else {
// for models from icefall
inputs.push_back(std::move(noise_scale_tensor));
inputs.push_back(std::move(length_scale_tensor));
}
if (input_names_.size() == 5 && input_names_.back() == "sid") {
// for models from icefall
inputs.push_back(std::move(sid_tensor));
// Note that multi-speaker TTS models from
// https://github.com/shivammehta25/Matcha-TTS are not supported yet.
}
auto out =
... ... @@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl {
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
"en-us");
}
private:
... ...