Committed by
GitHub
Export MatchaTTS fa-en model to sherpa-onnx (#1832)
正在显示 7 个修改的文件，包含 457 行增加和 0 行删除。
.github/workflows/export-matcha-fa-en.yaml
0 → 100644
name: export-matcha-fa-en-to-onnx

on:
  push:
    branches:
      - export-matcha-tts-fa-en

  workflow_dispatch:

concurrency:
  group: export-matcha-fa-en-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # NOTE: this job id previously said "export-kokoro-to-onnx", a copy-paste
  # leftover from the kokoro workflow; renamed to match what it exports.
  export-matcha-fa-en-to-onnx:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export matcha fa-en ${{ matrix.version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        shell: bash
        run: |
          pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html

      - name: Run
        shell: bash
        run: |
          cd scripts/matcha-tts/fa-en
          ./run.sh

      - name: Collect results ${{ matrix.version }}
        shell: bash
        run: |
          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
          tar xf espeak-ng-data.tar.bz2
          rm espeak-ng-data.tar.bz2

          src=scripts/matcha-tts/fa-en
          dst1=matcha-tts-fa_en-male
          dst2=matcha-tts-fa_en-female

          mkdir $dst1 $dst2

          cp -a espeak-ng-data $dst1/
          cp -a espeak-ng-data $dst2/

          cp -v $src/male/* $dst1
          cp -v $src/female/* $dst2

          cp -v $src/README.md $dst1/
          cp -v $src/README.md $dst2/

          ls -lh $dst1/
          echo "---"
          ls -lh $dst2/
          tar cjfv $dst1.tar.bz2 $dst1
          tar cjfv $dst2.tar.bz2 $dst2

          ls -lh $dst1.tar.bz2
          ls -lh $dst2.tar.bz2

      - name: Publish to huggingface male
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false

            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-male huggingface
            cd huggingface
            rm -rf ./*
            git fetch
            git pull

            git lfs track "cmn_dict"
            git lfs track "ru_dict"

            cp -a ../matcha-tts-fa_en-male/* ./

            git lfs track "*.onnx"
            git add .

            ls -lh

            git status

            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-male main || true

      # Fixed: this step was also named "... male" although it publishes
      # the female-voice model.
      - name: Publish to huggingface female
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false

            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-female huggingface
            cd huggingface
            rm -rf ./*
            git fetch
            git pull

            git lfs track "cmn_dict"
            git lfs track "ru_dict"

            cp -a ../matcha-tts-fa_en-female/* ./

            git lfs track "*.onnx"
            git add .

            ls -lh

            git status

            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-female main || true

      - name: Release
        if: github.repository_owner == 'csukuangfj'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: tts-models

      - name: Release
        if: github.repository_owner == 'k2-fsa'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          tag: tts-models
scripts/matcha-tts/README.md
0 → 100644
scripts/matcha-tts/fa-en/.gitignore
0 → 100644
| 1 | +.add-meta-data.done |
scripts/matcha-tts/fa-en/README.md
0 → 100644
scripts/matcha-tts/fa-en/add_meta_data.py
0 → 100755
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +from typing import Any, Dict | ||
| 4 | + | ||
| 5 | +import onnx | ||
| 6 | + | ||
| 7 | + | ||
def add_meta_data(filename: str, meta_data: Dict[str, Any]):
    """Overwrite the metadata of an ONNX model file in place.

    Args:
      filename:
        Path to the ONNX model that will be rewritten.
      meta_data:
        Key-value pairs to store; values are stringified with str().
    """
    model = onnx.load(filename)

    # Discard every pre-existing metadata entry before writing ours.
    del model.metadata_props[:]

    for key, value in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = key
        entry.value = str(value)

    onnx.save(model, filename)
| 28 | + | ||
| 29 | + | ||
def main():
    """Attach the shared sherpa-onnx metadata to the female and male models."""
    meta_data = {
        "model_type": "matcha-tts",
        "language": "Persian+English",
        "voice": "fa",
        "has_espeak": 1,
        "jieba": 0,
        "n_speakers": 1,
        "sample_rate": 22050,
        "version": 1,
        "pad_id": 0,
        "use_icefall": 0,
        "model_author": "Ali Mahmoudi (@mah92)",
        "maintainer": "k2-fsa",
        "use_eos_bos": 0,
        "num_ode_steps": 5,
        "see_also": "https://github.com/k2-fsa/sherpa-onnx/issues/1779",
    }
    # Both voices share the same metadata; female first, then male.
    for model_path in ("./female/model.onnx", "./male/model.onnx"):
        add_meta_data(model_path, meta_data)


if __name__ == "__main__":
    main()
scripts/matcha-tts/fa-en/run.sh
0 → 100755
#!/usr/bin/env bash
# Download the MatchaTTS fa-en acoustic models (female/male), their token
# tables, and the HiFi-GAN v2 vocoder; add sherpa-onnx metadata once; then
# run a short synthesis smoke test for each voice.

set -ex

# download <dest> <url>
# Fetch <url> to <dest> unless <dest> already exists (keeps reruns cheap).
download() {
  local dest=$1
  local url=$2
  if [ ! -f "$dest" ]; then
    curl -SL --output "$dest" "$url"
  fi
}

mkdir -p female male

download female/model.onnx https://huggingface.co/mah92/Khadijah-FA_EN-Matcha-TTS-Model/resolve/main/matcha-fa-en-khadijah-22050-5.onnx
download female/tokens.txt https://huggingface.co/mah92/Khadijah-FA_EN-Matcha-TTS-Model/resolve/main/tokens_sherpa_with_fa.txt
download male/model.onnx https://huggingface.co/mah92/Musa-FA_EN-Matcha-TTS-Model/resolve/main/matcha-fa-en-musa-22050-5.onnx
download male/tokens.txt https://huggingface.co/mah92/Musa-FA_EN-Matcha-TTS-Model/resolve/main/tokens_sherpa_with_fa.txt
download hifigan_v2.onnx https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

# Rewrite model metadata only once; the marker file records completion.
if [ ! -f .add-meta-data.done ]; then
  python3 ./add_meta_data.py
  touch .add-meta-data.done
fi

# Smoke-test both voices with a mixed English/Persian sentence.
for speaker in female male; do
  python3 ./test.py \
    --am ./$speaker/model.onnx \
    --vocoder ./hifigan_v2.onnx \
    --tokens ./$speaker/tokens.txt \
    --text "This is a test. این یک نمونه ی تست فارسی است." \
    --out-wav "./$speaker-en-fa.wav"
done
scripts/matcha-tts/fa-en/test.py
0 → 100755
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +AM | ||
| 5 | +NodeArg(name='x', type='tensor(int64)', shape=['batch_size', 'time']) | ||
| 6 | +NodeArg(name='x_lengths', type='tensor(int64)', shape=['batch_size']) | ||
| 7 | +NodeArg(name='scales', type='tensor(float)', shape=[2]) | ||
| 8 | +----- | ||
| 9 | +NodeArg(name='mel', type='tensor(float)', shape=['batch_size', 80, 'time']) | ||
| 10 | +NodeArg(name='mel_lengths', type='tensor(int64)', shape=['batch_size']) | ||
| 11 | + | ||
| 12 | +Vocoder | ||
| 13 | +NodeArg(name='mel', type='tensor(float)', shape=['N', 80, 'L']) | ||
| 14 | +----- | ||
| 15 | +NodeArg(name='audio', type='tensor(float)', shape=['N', 'L']) | ||
| 16 | +""" | ||
| 17 | + | ||
| 18 | +import argparse | ||
| 19 | + | ||
| 20 | +import numpy as np | ||
| 21 | +import onnxruntime as ort | ||
| 22 | +import soundfile as sf | ||
| 23 | + | ||
| 24 | +try: | ||
| 25 | + from piper_phonemize import phonemize_espeak | ||
| 26 | +except Exception as ex: | ||
| 27 | + raise RuntimeError( | ||
| 28 | + f"{ex}\nPlease run\n" | ||
| 29 | + "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html" | ||
| 30 | + ) | ||
| 31 | + | ||
| 32 | + | ||
def get_args():
    """Parse and return the command-line arguments for the smoke test."""
    parser = argparse.ArgumentParser()

    # All five arguments are mandatory string options.
    options = [
        ("--am", "Path to the acoustic model"),
        ("--vocoder", "Path to the vocoder"),
        ("--tokens", "Path to the tokens.txt"),
        ("--text", "Path to the text for generation"),
        ("--out-wav", "Path to save the generated wav"),
    ]
    for flag, help_text in options:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    return parser.parse_args()
| 54 | + | ||
| 55 | + | ||
def load_tokens(filename: str):
    """Load a tokens.txt file mapping each phoneme symbol to its integer id.

    Each non-empty line has the form "<symbol> <id>". A line with a single
    field is the whitespace token: the symbol is a space that str.split()
    discards, so only the id remains.

    Fixed: blank lines used to fall into the two-field branch and crash on
    the assert; they are now skipped.

    Args:
      filename:
        Path to the tokens.txt file.
    Returns:
      A dict mapping token string to integer id.
    """
    ans = dict()
    with open(filename, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines instead of asserting on them.
                continue
            if len(fields) == 1:
                ans[" "] = int(fields[0])
            else:
                assert len(fields) == 2, (line, fields)
                ans[fields[0]] = int(fields[1])
    return ans
| 67 | + | ||
| 68 | + | ||
class OnnxHifiGANModel:
    """Thin wrapper around the HiFi-GAN vocoder ONNX model."""

    def __init__(
        self,
        filename: str,
    ):
        # Single-threaded CPU inference is sufficient for this smoke test.
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        self.session_opts = opts
        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        # Dump the model's I/O signature for debugging.
        for node in self.model.get_inputs():
            print(node)

        print("-----")

        for node in self.model.get_outputs():
            print(node)

    def __call__(self, x: np.ndarray):
        """Convert a batch-1 mel spectrogram to audio samples.

        Args:
          x:
            A float32 array of shape (1, 80, num_frames).
        Returns:
          A float32 array of shape (1, num_samples).
        """
        assert x.ndim == 3, x.shape
        assert x.shape[0] == 1, x.shape

        output_name = self.model.get_outputs()[0].name
        input_name = self.model.get_inputs()[0].name
        (audio,) = self.model.run([output_name], {input_name: x})
        # audio: (batch_size, num_samples)

        return audio
| 106 | + | ||
| 107 | + | ||
class OnnxModel:
    """Wrapper around the MatchaTTS acoustic ONNX model plus its token table."""

    def __init__(
        self,
        filename: str,
        tokens: str,
    ):
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 2

        self.session_opts = opts
        self.token2id = load_tokens(tokens)
        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        # Show the metadata written by add_meta_data.py and pick up the
        # sample rate from it.
        metadata = self.model.get_modelmeta().custom_metadata_map
        print(f"{metadata}")
        self.sample_rate = int(metadata["sample_rate"])

        # Dump the model's I/O signature for debugging.
        for node in self.model.get_inputs():
            print(node)

        print("-----")

        for node in self.model.get_outputs():
            print(node)

    def __call__(self, x: np.ndarray):
        """Run the acoustic model on a batch-1 token-id sequence.

        Args:
          x:
            An int64 array of shape (1, num_tokens).
        Returns:
          A float32 mel spectrogram of shape (1, feat_dim, num_frames).
        """
        assert x.ndim == 2, x.shape
        assert x.shape[0] == 1, x.shape

        x_lengths = np.array([x.shape[1]], dtype=np.int64)

        # scales = [noise_scale, length_scale]; both fixed at 1.0.
        scales = np.array([1.0, 1.0], dtype=np.float32)

        inputs = self.model.get_inputs()
        feeds = {
            inputs[0].name: x,
            inputs[1].name: x_lengths,
            inputs[2].name: scales,
        }
        (mel,) = self.model.run([self.model.get_outputs()[0].name], feeds)
        # mel: (batch_size, feat_dim, num_frames)

        return mel
| 159 | + | ||
| 160 | + | ||
def main():
    """Synthesize --text with the given acoustic model and vocoder, save a wav."""
    args = get_args()
    print(vars(args))
    am = OnnxModel(args.am, args.tokens)
    vocoder = OnnxHifiGANModel(args.vocoder)

    # phonemize_espeak returns one phoneme list per sentence; flatten them.
    sentences = phonemize_espeak(args.text, voice="fa")
    phones = [p for sentence in sentences for p in sentence]
    phone_ids = [am.token2id[p] for p in phones]

    # Interleave the pad token (id 0) around every phoneme:
    # [0, p0, 0, p1, ..., 0].
    padded_phone_ids = [0] * (2 * len(phone_ids) + 1)
    padded_phone_ids[1::2] = phone_ids

    x = np.array([padded_phone_ids], dtype=np.int64)
    audio = vocoder(am(x))

    sf.write(args.out_wav, audio[0], am.sample_rate, "PCM_16")


if __name__ == "__main__":
    main()
-
请注册或登录后发表评论。