Fangjun Kuang
Committed by GitHub

Export MatchaTTS fa-en model to sherpa-onnx (#1832)

name: export-matcha-fa-en-to-onnx

on:
  push:
    branches:
      - export-matcha-tts-fa-en

  workflow_dispatch:

concurrency:
  group: export-matcha-fa-en-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # NOTE(review): job id was `export-kokoro-to-onnx`, a copy-paste leftover
  # from the kokoro export workflow; renamed to match this workflow.
  export-matcha-fa-en-to-onnx:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export matcha fa-en ${{ matrix.version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        shell: bash
        run: |
          pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html

      - name: Run
        shell: bash
        run: |
          cd scripts/matcha-tts/fa-en
          ./run.sh

      - name: Collect results ${{ matrix.version }}
        shell: bash
        run: |
          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
          tar xf espeak-ng-data.tar.bz2
          rm espeak-ng-data.tar.bz2

          src=scripts/matcha-tts/fa-en
          dst1=matcha-tts-fa_en-male
          dst2=matcha-tts-fa_en-female

          mkdir $dst1 $dst2

          cp -a espeak-ng-data $dst1/
          cp -a espeak-ng-data $dst2/

          cp -v $src/male/* $dst1
          cp -v $src/female/* $dst2

          cp -v $src/README.md $dst1/
          cp -v $src/README.md $dst2/

          ls -lh $dst1/
          echo "---"
          ls -lh $dst2/
          tar cjfv $dst1.tar.bz2 $dst1
          tar cjfv $dst2.tar.bz2 $dst2

          ls -lh $dst1.tar.bz2
          ls -lh $dst2.tar.bz2

      - name: Publish to huggingface male
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false

            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-male huggingface
            cd huggingface
            rm -rf ./*
            git fetch
            git pull

            git lfs track "cmn_dict"
            git lfs track "ru_dict"

            cp -a ../matcha-tts-fa_en-male/* ./

            git lfs track "*.onnx"
            git add .

            ls -lh

            git status

            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-male main || true

      # NOTE(review): this step was also named "... male" although it
      # publishes the female model; name fixed.
      - name: Publish to huggingface female
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false

            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-female huggingface
            cd huggingface
            rm -rf ./*
            git fetch
            git pull

            git lfs track "cmn_dict"
            git lfs track "ru_dict"

            cp -a ../matcha-tts-fa_en-female/* ./

            git lfs track "*.onnx"
            git add .

            ls -lh

            git status

            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-female main || true

      - name: Release
        if: github.repository_owner == 'csukuangfj'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: tts-models

      - name: Release
        if: github.repository_owner == 'k2-fsa'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          tag: tts-models
  1 +# Introduction
  2 +
  3 +This folder contains scripts for adding metadata to TTS models
  4 +from https://github.com/shivammehta25/Matcha-TTS
  5 +
  6 +Note: If you use icefall to train a MatchaTTS model, you don't need this folder.
  1 +# Introduction
  2 +
  3 +This folder is for
  4 +https://github.com/k2-fsa/sherpa-onnx/issues/1779
  1 +#!/usr/bin/env python3
  2 +
  3 +from typing import Any, Dict
  4 +
  5 +import onnx
  6 +
  7 +
def add_meta_data(filename: str, meta_data: Dict[str, Any]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs. Values are stringified before being stored.
    """
    model = onnx.load(filename)

    # Drop whatever metadata the model already carries; we replace it all.
    del model.metadata_props[:]

    for k, v in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = k
        entry.value = str(v)

    # Overwrite the original file with the annotated model.
    onnx.save(model, filename)
  28 +
  29 +
def main():
    """Attach the shared sherpa-onnx metadata to both fa-en models."""
    meta_data = {
        "model_type": "matcha-tts",
        "language": "Persian+English",
        "voice": "fa",
        "has_espeak": 1,
        "jieba": 0,
        "n_speakers": 1,
        "sample_rate": 22050,
        "version": 1,
        "pad_id": 0,
        "use_icefall": 0,
        "model_author": "Ali Mahmoudi (@mah92)",
        "maintainer": "k2-fsa",
        "use_eos_bos": 0,
        "num_ode_steps": 5,
        "see_also": "https://github.com/k2-fsa/sherpa-onnx/issues/1779",
    }
    # Both voices share the same metadata; only the file differs.
    for model_file in ("./female/model.onnx", "./male/model.onnx"):
        add_meta_data(model_file, meta_data)


if __name__ == "__main__":
    main()
#!/usr/bin/env bash
# Download the MatchaTTS fa-en acoustic models (female and male voices) and
# the HiFiGAN v2 vocoder, add sherpa-onnx metadata to the acoustic models,
# and synthesize one test wave file per voice.

set -ex

mkdir -p female male

# NOTE: --fail makes curl exit non-zero on HTTP errors (e.g. 404) so that,
# combined with `set -e`, we abort instead of silently saving an HTML error
# page as a model file.
if [ ! -f female/model.onnx ]; then
  curl -SL --fail --output female/model.onnx https://huggingface.co/mah92/Khadijah-FA_EN-Matcha-TTS-Model/resolve/main/matcha-fa-en-khadijah-22050-5.onnx
fi

if [ ! -f female/tokens.txt ]; then
  curl -SL --fail --output female/tokens.txt https://huggingface.co/mah92/Khadijah-FA_EN-Matcha-TTS-Model/resolve/main/tokens_sherpa_with_fa.txt
fi

if [ ! -f male/model.onnx ]; then
  curl -SL --fail --output male/model.onnx https://huggingface.co/mah92/Musa-FA_EN-Matcha-TTS-Model/resolve/main/matcha-fa-en-musa-22050-5.onnx
fi

if [ ! -f male/tokens.txt ]; then
  curl -SL --fail --output male/tokens.txt https://huggingface.co/mah92/Musa-FA_EN-Matcha-TTS-Model/resolve/main/tokens_sherpa_with_fa.txt
fi

# Shared vocoder (mel -> waveform) used by both voices.
if [ ! -f hifigan_v2.onnx ]; then
  curl -SL --fail -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# add_meta_data.py rewrites the models in-place; the marker file ensures we
# only do it once.
if [ ! -f .add-meta-data.done ]; then
  python3 ./add_meta_data.py
  touch .add-meta-data.done
fi

python3 ./test.py \
  --am ./female/model.onnx \
  --vocoder ./hifigan_v2.onnx \
  --tokens ./female/tokens.txt \
  --text "This is a test. این یک نمونه ی تست فارسی است." \
  --out-wav "./female-en-fa.wav"

python3 ./test.py \
  --am ./male/model.onnx \
  --vocoder ./hifigan_v2.onnx \
  --tokens ./male/tokens.txt \
  --text "This is a test. این یک نمونه ی تست فارسی است." \
  --out-wav "./male-en-fa.wav"
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +AM
  5 +NodeArg(name='x', type='tensor(int64)', shape=['batch_size', 'time'])
  6 +NodeArg(name='x_lengths', type='tensor(int64)', shape=['batch_size'])
  7 +NodeArg(name='scales', type='tensor(float)', shape=[2])
  8 +-----
  9 +NodeArg(name='mel', type='tensor(float)', shape=['batch_size', 80, 'time'])
  10 +NodeArg(name='mel_lengths', type='tensor(int64)', shape=['batch_size'])
  11 +
  12 +Vocoder
  13 +NodeArg(name='mel', type='tensor(float)', shape=['N', 80, 'L'])
  14 +-----
  15 +NodeArg(name='audio', type='tensor(float)', shape=['N', 'L'])
  16 +"""
  17 +
  18 +import argparse
  19 +
  20 +import numpy as np
  21 +import onnxruntime as ort
  22 +import soundfile as sf
  23 +
  24 +try:
  25 + from piper_phonemize import phonemize_espeak
  26 +except Exception as ex:
  27 + raise RuntimeError(
  28 + f"{ex}\nPlease run\n"
  29 + "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
  30 + )
  31 +
  32 +
def get_args():
    """Parse and return the command-line options for this test driver."""
    parser = argparse.ArgumentParser()

    # All options are required string paths/values.
    for flag, help_text in (
        ("--am", "Path to the acoustic model"),
        ("--vocoder", "Path to the vocoder"),
        ("--tokens", "Path to the tokens.txt"),
        ("--text", "Path to the text for generation"),
        ("--out-wav", "Path to save the generated wav"),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)

    return parser.parse_args()
  54 +
  55 +
def load_tokens(filename: str):
    """Load a tokens.txt file mapping token symbols to integer IDs.

    Each line is "<symbol> <id>". A line with a single field is the entry
    for the space token: the symbol itself is whitespace, so split()
    leaves only the ID.
    """
    token2id = dict()
    with open(filename, encoding="utf-8") as f:
        for raw_line in f:
            parts = raw_line.strip().split()
            if len(parts) == 1:
                # Only the ID survived split(); the token is " ".
                token2id[" "] = int(parts[0])
            else:
                assert len(parts) == 2, (raw_line, parts)
                token2id[parts[0]] = int(parts[1])
    return token2id
  67 +
  68 +
class OnnxHifiGANModel:
    """Wrapper around the HiFiGAN vocoder ONNX model (mel -> waveform)."""

    def __init__(
        self,
        filename: str,
    ):
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        self.session_opts = opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        # Dump the model's input/output signatures for debugging.
        for node in self.model.get_inputs():
            print(node)

        print("-----")

        for node in self.model.get_outputs():
            print(node)

    def __call__(self, x: np.ndarray):
        """Run the vocoder on a (1, 80, num_frames) mel; returns
        (1, num_samples) float audio (see the module docstring)."""
        assert x.ndim == 3, x.shape
        assert x.shape[0] == 1, x.shape

        (audio,) = self.model.run(
            [self.model.get_outputs()[0].name],
            {self.model.get_inputs()[0].name: x},
        )
        # audio: (batch_size, num_samples)

        return audio
  106 +
  107 +
class OnnxModel:
    """Wrapper around the MatchaTTS acoustic model (token IDs -> mel)."""

    def __init__(
        self,
        filename: str,
        tokens: str,
    ):
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 2
        self.session_opts = opts

        self.token2id = load_tokens(tokens)
        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        # The sample rate was written into the model by add_meta_data.py.
        metadata = self.model.get_modelmeta().custom_metadata_map
        print(f"{metadata}")
        self.sample_rate = int(metadata["sample_rate"])

        # Dump the model's input/output signatures for debugging.
        for node in self.model.get_inputs():
            print(node)

        print("-----")

        for node in self.model.get_outputs():
            print(node)

    def __call__(self, x: np.ndarray):
        """Run the acoustic model on (1, num_tokens) int64 token IDs;
        returns mel of shape (1, 80, num_frames) (see module docstring)."""
        assert x.ndim == 2, x.shape
        assert x.shape[0] == 1, x.shape

        x_lengths = np.array([x.shape[1]], dtype=np.int64)

        # scales = [noise_scale, length_scale]
        noise_scale = 1.0
        length_scale = 1.0
        scales = np.array([noise_scale, length_scale], dtype=np.float32)

        inputs = self.model.get_inputs()
        (mel,) = self.model.run(
            [self.model.get_outputs()[0].name],
            {
                inputs[0].name: x,
                inputs[1].name: x_lengths,
                inputs[2].name: scales,
            },
        )
        # mel: (batch_size, feat_dim, num_frames)

        return mel
  159 +
  160 +
def main():
    """Synthesize --text with the given AM/vocoder and write --out-wav."""
    args = get_args()
    print(vars(args))

    am = OnnxModel(args.am, args.tokens)
    vocoder = OnnxHifiGANModel(args.vocoder)

    # espeak returns one phone list per sentence; flatten to a single list.
    phones = phonemize_espeak(args.text, voice="fa")
    phones = sum(phones, [])
    phone_ids = [am.token2id[p] for p in phones]

    # Interleave with the pad token (ID 0): [0, p0, 0, p1, ..., 0, pn, 0].
    padded_ids = [0] * (2 * len(phone_ids) + 1)
    padded_ids[1::2] = phone_ids

    token_matrix = np.array([padded_ids], dtype=np.int64)
    mel = am(token_matrix)
    audio = vocoder(mel)

    sf.write(args.out_wav, audio[0], am.sample_rate, "PCM_16")


if __name__ == "__main__":
    main()