正在显示
9 个修改的文件
包含
154 行增加
和
5 行删除
| @@ -3,7 +3,7 @@ name: export-kitten-to-onnx | @@ -3,7 +3,7 @@ name: export-kitten-to-onnx | ||
| 3 | on: | 3 | on: |
| 4 | push: | 4 | push: |
| 5 | branches: | 5 | branches: |
| 6 | - - kitten-tts | 6 | + - kitten-0.2 |
| 7 | 7 | ||
| 8 | workflow_dispatch: | 8 | workflow_dispatch: |
| 9 | 9 | ||
| @@ -20,6 +20,7 @@ jobs: | @@ -20,6 +20,7 @@ jobs: | ||
| 20 | fail-fast: false | 20 | fail-fast: false |
| 21 | matrix: | 21 | matrix: |
| 22 | os: [ubuntu-latest] | 22 | os: [ubuntu-latest] |
| 23 | + version: ["nano_v0_1", "nano_v0_2"] | ||
| 23 | python-version: ["3.10"] | 24 | python-version: ["3.10"] |
| 24 | 25 | ||
| 25 | steps: | 26 | steps: |
| @@ -40,7 +41,7 @@ jobs: | @@ -40,7 +41,7 @@ jobs: | ||
| 40 | HF_TOKEN: ${{ secrets.HF_TOKEN }} | 41 | HF_TOKEN: ${{ secrets.HF_TOKEN }} |
| 41 | shell: bash | 42 | shell: bash |
| 42 | run: | | 43 | run: | |
| 43 | - cd scripts/kitten-tts/nano_v0_1 | 44 | + cd scripts/kitten-tts/${{ matrix.version }} |
| 44 | ./run.sh | 45 | ./run.sh |
| 45 | 46 | ||
| 46 | - name: Collect results | 47 | - name: Collect results |
| @@ -50,9 +51,18 @@ jobs: | @@ -50,9 +51,18 @@ jobs: | ||
| 50 | tar xf espeak-ng-data.tar.bz2 | 51 | tar xf espeak-ng-data.tar.bz2 |
| 51 | rm espeak-ng-data.tar.bz2 | 52 | rm espeak-ng-data.tar.bz2 |
| 52 | 53 | ||
| 53 | - src=scripts/kitten-tts/nano_v0_1 | 54 | + version=${{ matrix.version }} |
| 54 | 55 | ||
| 55 | - d=kitten-nano-en-v0_1-fp16 | 56 | + src=scripts/kitten-tts/$version |
| 57 | + | ||
| 58 | + if [[ $version == "nano_v0_1" ]]; then | ||
| 59 | + d=kitten-nano-en-v0_1-fp16 | ||
| 60 | + elif [[ $version == "nano_v0_2" ]]; then | ||
| 61 | + d=kitten-nano-en-v0_2-fp16 | ||
| 62 | + else | ||
| 63 | + echo "version $version" | ||
| 64 | + exit 1 | ||
| 65 | + fi | ||
| 56 | 66 | ||
| 57 | mkdir $d | 67 | mkdir $d |
| 58 | cp -a LICENSE $d/LICENSE | 68 | cp -a LICENSE $d/LICENSE |
| @@ -100,12 +110,16 @@ jobs: | @@ -100,12 +110,16 @@ jobs: | ||
| 100 | 110 | ||
| 101 | dirs=( | 111 | dirs=( |
| 102 | kitten-nano-en-v0_1-fp16 | 112 | kitten-nano-en-v0_1-fp16 |
| 113 | + kitten-nano-en-v0_2-fp16 | ||
| 103 | ) | 114 | ) |
| 104 | 115 | ||
| 105 | export GIT_LFS_SKIP_SMUDGE=1 | 116 | export GIT_LFS_SKIP_SMUDGE=1 |
| 106 | export GIT_CLONE_PROTECTION_ACTIVE=false | 117 | export GIT_CLONE_PROTECTION_ACTIVE=false |
| 107 | 118 | ||
| 108 | for d in ${dirs[@]}; do | 119 | for d in ${dirs[@]}; do |
| 120 | + if [[ ! -d ../$d ]]; then | ||
| 121 | + continue | ||
| 122 | + fi | ||
| 109 | rm -rf huggingface | 123 | rm -rf huggingface |
| 110 | 124 | ||
| 111 | git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface | 125 | git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface |
| @@ -515,7 +515,12 @@ def get_kitten_models() -> List[TtsModel]: | @@ -515,7 +515,12 @@ def get_kitten_models() -> List[TtsModel]: | ||
| 515 | model_dir="kitten-nano-en-v0_1-fp16", | 515 | model_dir="kitten-nano-en-v0_1-fp16", |
| 516 | model_name="model.fp16.onnx", | 516 | model_name="model.fp16.onnx", |
| 517 | lang="en", | 517 | lang="en", |
| 518 | - ) | 518 | + ), |
| 519 | + TtsModel( | ||
| 520 | + model_dir="kitten-nano-en-v0_2-fp16", | ||
| 521 | + model_name="model.fp16.onnx", | ||
| 522 | + lang="en", | ||
| 523 | + ), | ||
| 519 | ] | 524 | ] |
| 520 | for m in english_models: | 525 | for m in english_models: |
| 521 | m.data_dir = f"{m.model_dir}/espeak-ng-data" | 526 | m.data_dir = f"{m.model_dir}/espeak-ng-data" |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) | ||
| 3 | + | ||
| 4 | + | ||
| 5 | +import argparse | ||
| 6 | + | ||
| 7 | +import numpy as np | ||
| 8 | +import onnx | ||
| 9 | + | ||
| 10 | +from generate_voices_bin import speaker2id | ||
| 11 | + | ||
| 12 | + | ||
def get_args():
    """Parse the command line and return the resulting namespace.

    The only option is the mandatory ``--model``; its value is available
    as the ``model`` attribute of the returned object.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model",
        required=True,
        type=str,
        help="input and output onnx model",
    )

    parsed = cli.parse_args()
    return parsed
| 20 | + | ||
| 21 | + | ||
def main():
    """Replace the metadata of the ONNX model given by ``--model``.

    The model is loaded, every existing metadata property is removed, a
    fresh set of properties is added, and the model is saved back to the
    same path. The properties are built from ``speaker2id`` and from the
    shape of the first array stored in ``./voices.npz``.

    Raises:
        ValueError: if ``./voices.npz`` contains no arrays.
    """
    args = get_args()
    print(args.model)

    model = onnx.load(args.model)

    # An NpzFile keeps the archive open until it is closed, so use it as a
    # context manager; only the shape of the first array is needed.
    with np.load("./voices.npz") as style:
        if not style.files:
            raise ValueError("./voices.npz contains no arrays")
        style_shape = style[style.files[0]].shape

    # Comma-separated "name->id" and "id->name" pairs, in speaker2id order.
    speaker2id_str = ",".join(f"{s}->{i}" for s, i in speaker2id.items())
    id2speaker_str = ",".join(f"{i}->{s}" for s, i in speaker2id.items())

    meta_data = {
        "model_type": "kitten-tts",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 1,
        "voice": "en-us",
        "style_dim": ",".join(map(str, style_shape)),
        "n_speakers": len(speaker2id),
        "speaker2id": speaker2id_str,
        "id2speaker": id2speaker_str,
        "speaker_names": ",".join(map(str, speaker2id.keys())),
        "model_url": "https://huggingface.co/KittenML/kitten-tts-nano-0.2",
        "see_also": "https://github.com/KittenML/KittenTTS",
        "maintainer": "k2-fsa",
        "comment": "This is kitten-tts-nano-0.2 and supports only English",
    }

    # Show the metadata the model came with before it is discarded.
    print(model.metadata_props)

    while model.metadata_props:
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        # Every value is written in its str() form.
        meta.value = str(value)
    print("--------------------")

    print(model.metadata_props)

    # The input file is overwritten in place.
    onnx.save(model, args.model)

    print(f"Please see {args.model}")


if __name__ == "__main__":
    main()
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) | ||
| 3 | + | ||
| 4 | +""" | ||
| 5 | +Change the model so that it can be run in onnxruntime 1.17.1 | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +import onnx | ||
| 9 | + | ||
| 10 | + | ||
def main():
    """Rewrite the opset versions declared by ``kitten_tts_nano_v0_2.onnx``.

    Each opset import is printed as it was loaded. The default domain
    (empty string) is then set to version 19 and ``ai.onnx.ml`` to
    version 4; any other domain is left as it is. The result is written
    to ``model.fp16.onnx``. Only the declared version numbers are edited.
    """
    model = onnx.load("kitten_tts_nano_v0_2.onnx")

    # Modify the opset versions (be careful!)
    replacement_versions = {
        "": 19,  # ai.onnx domain: change from 20 to 19
        "ai.onnx.ml": 4,  # change from 5 to 4
    }

    for entry in model.opset_import:
        # Print the value as loaded, before it is possibly overwritten.
        print(f"Domain: '{entry.domain}', Version: {entry.version}")

        if entry.domain in replacement_versions:
            entry.version = replacement_versions[entry.domain]

    # Save the modified model
    onnx.save(model, "model.fp16.onnx")


if __name__ == "__main__":
    main()
| 1 | +../nano_v0_1/generate_tokens.py |
| 1 | +../nano_v0_1/generate_voices_bin.py |
scripts/kitten-tts/nano_v0_2/run.sh
0 → 100755
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
#
# Download the kitten-tts-nano-0.2 model and voices if they are not already
# present in the current directory, then run the helper scripts in order.

set -ex

base_url=https://huggingface.co/KittenML/kitten-tts-nano-0.2/resolve/main

# Download $base_url/<name> into the current directory unless <name> already
# exists. --fail (-f) makes curl exit non-zero on an HTTP error instead of
# saving the error page, and the temporary name ensures a failed or
# interrupted transfer is never mistaken for a finished download on the
# next run.
fetch() {
  local name=$1
  if [ ! -f "$name" ]; then
    curl -fSL -o "$name.tmp" "$base_url/$name"
    mv "$name.tmp" "$name"
  fi
}

fetch kitten_tts_nano_v0_2.onnx
fetch voices.npz

./generate_voices_bin.py
./generate_tokens.py
./convert_opset.py
./show.py
./add_meta_data.py --model ./model.fp16.onnx
# ./test.py --model ./model.fp16.onnx --tokens ./tokens.txt --voice ./voices.bin
ls -lh
scripts/kitten-tts/nano_v0_2/show.py
0 → 120000
| 1 | +../nano_v0_1/show.py |
scripts/kitten-tts/nano_v0_2/test.py
0 → 120000
| 1 | +../nano_v0_1/test.py |
-
请 注册 或 登录 后发表评论