Committed by
GitHub
Export KittenTTS mini v0.1 to sherpa-onnx (#2578)
正在显示
13 个修改的文件
包含
200 行增加
和
7 行删除
| @@ -20,7 +20,7 @@ jobs: | @@ -20,7 +20,7 @@ jobs: | ||
| 20 | fail-fast: false | 20 | fail-fast: false |
| 21 | matrix: | 21 | matrix: |
| 22 | os: [ubuntu-latest] | 22 | os: [ubuntu-latest] |
| 23 | - version: ["nano_v0_1", "nano_v0_2"] | 23 | + version: ["nano_v0_1", "nano_v0_2", "mini_v0_1"] |
| 24 | python-version: ["3.10"] | 24 | python-version: ["3.10"] |
| 25 | 25 | ||
| 26 | steps: | 26 | steps: |
| @@ -59,6 +59,8 @@ jobs: | @@ -59,6 +59,8 @@ jobs: | ||
| 59 | d=kitten-nano-en-v0_1-fp16 | 59 | d=kitten-nano-en-v0_1-fp16 |
| 60 | elif [[ $version == "nano_v0_2" ]]; then | 60 | elif [[ $version == "nano_v0_2" ]]; then |
| 61 | d=kitten-nano-en-v0_2-fp16 | 61 | d=kitten-nano-en-v0_2-fp16 |
| 62 | + elif [[ $version == "mini_v0_1" ]]; then | ||
| 63 | + d=kitten-mini-en-v0_1-fp16 | ||
| 62 | else | 64 | else |
| 63 | echo "version $version" | 65 | echo "version $version" |
| 64 | exit 1 | 66 | exit 1 |
| @@ -111,6 +113,7 @@ jobs: | @@ -111,6 +113,7 @@ jobs: | ||
| 111 | dirs=( | 113 | dirs=( |
| 112 | kitten-nano-en-v0_1-fp16 | 114 | kitten-nano-en-v0_1-fp16 |
| 113 | kitten-nano-en-v0_2-fp16 | 115 | kitten-nano-en-v0_2-fp16 |
| 116 | + kitten-mini-en-v0_1-fp16 | ||
| 114 | ) | 117 | ) |
| 115 | 118 | ||
| 116 | export GIT_LFS_SKIP_SMUDGE=1 | 119 | export GIT_LFS_SKIP_SMUDGE=1 |
| @@ -32,7 +32,7 @@ jobs: | @@ -32,7 +32,7 @@ jobs: | ||
| 32 | pip install "numpy<=1.26.4" sherpa-onnx soundfile | 32 | pip install "numpy<=1.26.4" sherpa-onnx soundfile |
| 33 | 33 | ||
| 34 | - name: kitten | 34 | - name: kitten |
| 35 | - if: false | 35 | + if: true |
| 36 | shell: bash | 36 | shell: bash |
| 37 | env: | 37 | env: |
| 38 | HF_TOKEN: ${{ secrets.HF_TOKEN }} | 38 | HF_TOKEN: ${{ secrets.HF_TOKEN }} |
| @@ -46,8 +46,9 @@ jobs: | @@ -46,8 +46,9 @@ jobs: | ||
| 46 | export GIT_LFS_SKIP_SMUDGE=1 | 46 | export GIT_LFS_SKIP_SMUDGE=1 |
| 47 | export GIT_CLONE_PROTECTION_ACTIVE=false | 47 | export GIT_CLONE_PROTECTION_ACTIVE=false |
| 48 | git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf | 48 | git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf |
| 49 | - mkdir -p ./hf/kitten/v0.1/mp3 | ||
| 50 | - mkdir -p ./hf/kitten/v0.2/mp3 | 49 | + mkdir -p ./hf/kitten/v0.1-nano/mp3 |
| 50 | + mkdir -p ./hf/kitten/v0.2-nano/mp3 | ||
| 51 | + mkdir -p ./hf/kitten/v0.1-mini/mp3 | ||
| 51 | 52 | ||
| 52 | for v in 1 2; do | 53 | for v in 1 2; do |
| 53 | pushd nano_v0_$v | 54 | pushd nano_v0_$v |
| @@ -61,6 +62,18 @@ jobs: | @@ -61,6 +62,18 @@ jobs: | ||
| 61 | popd | 62 | popd |
| 62 | done | 63 | done |
| 63 | 64 | ||
| 65 | + for v in 1; do | ||
| 66 | + pushd mini_v0_$v | ||
| 67 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-mini-en-v0_$v-fp16.tar.bz2 | ||
| 68 | + tar xf kitten-mini-en-v0_$v-fp16.tar.bz2 | ||
| 69 | + rm kitten-mini-en-v0_$v-fp16.tar.bz2 | ||
| 70 | + | ||
| 71 | + ln -s ../hf . | ||
| 72 | + python3 ./generate_samples.py | ||
| 73 | + rm -rf kitten-mini-en-v0_$v-fp16 | ||
| 74 | + popd | ||
| 75 | + done | ||
| 76 | + | ||
| 64 | pushd hf | 77 | pushd hf |
| 65 | git pull | 78 | git pull |
| 66 | git add . | 79 | git add . |
| @@ -70,7 +83,7 @@ jobs: | @@ -70,7 +83,7 @@ jobs: | ||
| 70 | rm -rf hf | 83 | rm -rf hf |
| 71 | 84 | ||
| 72 | - name: matcha en (ljspeech) | 85 | - name: matcha en (ljspeech) |
| 73 | - if: true | 86 | + if: false |
| 74 | shell: bash | 87 | shell: bash |
| 75 | env: | 88 | env: |
| 76 | HF_TOKEN: ${{ secrets.HF_TOKEN }} | 89 | HF_TOKEN: ${{ secrets.HF_TOKEN }} |
| @@ -524,6 +524,11 @@ def get_kitten_models() -> List[TtsModel]: | @@ -524,6 +524,11 @@ def get_kitten_models() -> List[TtsModel]: | ||
| 524 | model_name="model.fp16.onnx", | 524 | model_name="model.fp16.onnx", |
| 525 | lang="en", | 525 | lang="en", |
| 526 | ), | 526 | ), |
| 527 | + TtsModel( | ||
| 528 | + model_dir="kitten-mini-en-v0_1-fp16", | ||
| 529 | + model_name="model.fp16.onnx", | ||
| 530 | + lang="en", | ||
| 531 | + ), | ||
| 527 | ] | 532 | ] |
| 528 | for m in english_models: | 533 | for m in english_models: |
| 529 | m.data_dir = f"{m.model_dir}/espeak-ng-data" | 534 | m.data_dir = f"{m.model_dir}/espeak-ng-data" |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) | ||
| 3 | + | ||
| 4 | + | ||
| 5 | +import argparse | ||
| 6 | + | ||
| 7 | +import numpy as np | ||
| 8 | +import onnx | ||
| 9 | + | ||
| 10 | +from generate_voices_bin import speaker2id | ||
| 11 | + | ||
| 12 | + | ||
| 13 | +def get_args(): | ||
| 14 | + parser = argparse.ArgumentParser() | ||
| 15 | + parser.add_argument( | ||
| 16 | + "--model", type=str, required=True, help="input and output onnx model" | ||
| 17 | + ) | ||
| 18 | + | ||
| 19 | + return parser.parse_args() | ||
| 20 | + | ||
| 21 | + | ||
| 22 | +def main(): | ||
| 23 | + args = get_args() | ||
| 24 | + print(args.model) | ||
| 25 | + | ||
| 26 | + model = onnx.load(args.model) | ||
| 27 | + | ||
| 28 | + style = np.load("./voices.npz") | ||
| 29 | + style_shape = style[list(style.keys())[0]].shape | ||
| 30 | + | ||
| 31 | + speaker2id_str = "" | ||
| 32 | + id2speaker_str = "" | ||
| 33 | + sep = "" | ||
| 34 | + for s, i in speaker2id.items(): | ||
| 35 | + speaker2id_str += f"{sep}{s}->{i}" | ||
| 36 | + id2speaker_str += f"{sep}{i}->{s}" | ||
| 37 | + sep = "," | ||
| 38 | + | ||
| 39 | + meta_data = { | ||
| 40 | + "model_type": "kitten-tts", | ||
| 41 | + "language": "English", | ||
| 42 | + "has_espeak": 1, | ||
| 43 | + "sample_rate": 24000, | ||
| 44 | + "version": 1, | ||
| 45 | + "voice": "en-us", | ||
| 46 | + "style_dim": ",".join(map(str, style_shape)), | ||
| 47 | + "n_speakers": len(speaker2id), | ||
| 48 | + "speaker2id": speaker2id_str, | ||
| 49 | + "id2speaker": id2speaker_str, | ||
| 50 | + "speaker_names": ",".join(map(str, speaker2id.keys())), | ||
| 51 | + "model_url": "https://huggingface.co/KittenML/kitten-tts-mini-0.1", | |
| 52 | + "see_also": "https://github.com/KittenML/KittenTTS", | |
| 53 | + "maintainer": "k2-fsa", | |
| 54 | + "comment": "This is kitten-tts-mini-0.1 and supports only English", | |
| 55 | + } | ||
| 56 | + | ||
| 57 | + print(model.metadata_props) | ||
| 58 | + | ||
| 59 | + while len(model.metadata_props): | ||
| 60 | + model.metadata_props.pop() | ||
| 61 | + | ||
| 62 | + for key, value in meta_data.items(): | ||
| 63 | + meta = model.metadata_props.add() | ||
| 64 | + meta.key = key | ||
| 65 | + meta.value = str(value) | ||
| 66 | + print("--------------------") | ||
| 67 | + | ||
| 68 | + print(model.metadata_props) | ||
| 69 | + | ||
| 70 | + onnx.save(model, args.model) | ||
| 71 | + | ||
| 72 | + print(f"Please see {args.model}") | ||
| 73 | + | ||
| 74 | + | ||
| 75 | +if __name__ == "__main__": | ||
| 76 | + main() |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) | ||
| 3 | + | ||
| 4 | +""" | ||
| 5 | +Change the model so that it can be run in onnxruntime 1.17.1 | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | +import onnx | ||
| 9 | + | ||
| 10 | + | ||
| 11 | +def main(): | ||
| 12 | + model = onnx.load("kitten_tts_mini_v0_1.onnx") | ||
| 13 | + | ||
| 14 | + # Print current opsets | ||
| 15 | + for opset in model.opset_import: | ||
| 16 | + print(f"Domain: '{opset.domain}', Version: {opset.version}") | ||
| 17 | + | ||
| 18 | + # Modify the opset versions (be careful!) | ||
| 19 | + for opset in model.opset_import: | ||
| 20 | + if opset.domain == "": # ai.onnx domain | ||
| 21 | + opset.version = 19 # change from 20 to 19 | ||
| 22 | + elif opset.domain == "ai.onnx.ml": | ||
| 23 | + opset.version = 4 # change from 5 to 4 | ||
| 24 | + | ||
| 25 | + # Save the modified model | ||
| 26 | + onnx.save(model, "model.fp16.onnx") | ||
| 27 | + | ||
| 28 | + | ||
| 29 | +if __name__ == "__main__": | ||
| 30 | + main() |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) | ||
| 3 | +""" | ||
| 4 | +Generate samples for | ||
| 5 | +https://k2-fsa.github.io/sherpa/onnx/tts/all/ | ||
| 6 | +""" | ||
| 7 | + | ||
| 8 | + | ||
| 9 | +import sherpa_onnx | ||
| 10 | +import soundfile as sf | ||
| 11 | + | ||
| 12 | +from generate_voices_bin import speaker2id | ||
| 13 | + | ||
| 14 | +config = sherpa_onnx.OfflineTtsConfig( | ||
| 15 | + model=sherpa_onnx.OfflineTtsModelConfig( | ||
| 16 | + kitten=sherpa_onnx.OfflineTtsKittenModelConfig( | ||
| 17 | + model="kitten-mini-en-v0_1-fp16/model.fp16.onnx", | ||
| 18 | + voices="kitten-mini-en-v0_1-fp16/voices.bin", | ||
| 19 | + tokens="kitten-mini-en-v0_1-fp16/tokens.txt", | ||
| 20 | + data_dir="kitten-mini-en-v0_1-fp16/espeak-ng-data", | ||
| 21 | + ), | ||
| 22 | + num_threads=2, | ||
| 23 | + ), | ||
| 24 | + max_num_sentences=1, | ||
| 25 | +) | ||
| 26 | + | ||
| 27 | +if not config.validate(): | ||
| 28 | + raise ValueError("Please check your config") | ||
| 29 | + | ||
| 30 | +tts = sherpa_onnx.OfflineTts(config) | ||
| 31 | +text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." | ||
| 32 | + | ||
| 33 | +for s, i in speaker2id.items(): | ||
| 34 | + print(s, i, len(speaker2id)) | ||
| 35 | + audio = tts.generate(text, sid=i, speed=1.0) | ||
| 36 | + | ||
| 37 | + sf.write( | ||
| 38 | + f"./hf/kitten/v0.1-mini/mp3/{i}-{s}.mp3", | ||
| 39 | + audio.samples, | ||
| 40 | + samplerate=audio.sample_rate, | ||
| 41 | + ) |
| 1 | +../nano_v0_1/generate_tokens.py |
| 1 | +../nano_v0_1/generate_voices_bin.py |
scripts/kitten-tts/mini_v0_1/run.sh
0 → 100755
| 1 | +#!/usr/bin/env bash | ||
| 2 | +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) | ||
| 3 | + | ||
| 4 | +set -ex | ||
| 5 | + | ||
| 6 | +if [ ! -f kitten_tts_mini_v0_1.onnx ]; then | ||
| 7 | + curl -SL -O https://huggingface.co/KittenML/kitten-tts-mini-0.1/resolve/main/kitten_tts_mini_v0_1.onnx | ||
| 8 | +fi | ||
| 9 | + | ||
| 10 | +if [ ! -f voices.npz ]; then | ||
| 11 | + curl -SL -O https://huggingface.co/KittenML/kitten-tts-mini-0.1/resolve/main/voices.npz | ||
| 12 | +fi | ||
| 13 | + | ||
| 14 | +./generate_voices_bin.py | ||
| 15 | +./generate_tokens.py | ||
| 16 | + | ||
| 17 | +./convert_opset.py | ||
| 18 | +./show.py | ||
| 19 | +./add_meta_data.py --model ./model.fp16.onnx | ||
| 20 | +# ./test.py --model ./model.fp16.onnx --tokens ./tokens.txt --voice ./voices.bin | ||
| 21 | +ls -lh |
scripts/kitten-tts/mini_v0_1/show.py
0 → 120000
| 1 | +../nano_v0_1/show.py |
scripts/kitten-tts/mini_v0_1/test.py
0 → 120000
| 1 | +../nano_v0_1/test.py |
| @@ -35,7 +35,7 @@ for s, i in speaker2id.items(): | @@ -35,7 +35,7 @@ for s, i in speaker2id.items(): | ||
| 35 | audio = tts.generate(text, sid=i, speed=1.0) | 35 | audio = tts.generate(text, sid=i, speed=1.0) |
| 36 | 36 | ||
| 37 | sf.write( | 37 | sf.write( |
| 38 | - f"./hf/kitten/v0.1/mp3/{i}-{s}.mp3", | 38 | + f"./hf/kitten/v0.1-nano/mp3/{i}-{s}.mp3", |
| 39 | audio.samples, | 39 | audio.samples, |
| 40 | samplerate=audio.sample_rate, | 40 | samplerate=audio.sample_rate, |
| 41 | ) | 41 | ) |
| @@ -35,7 +35,7 @@ for s, i in speaker2id.items(): | @@ -35,7 +35,7 @@ for s, i in speaker2id.items(): | ||
| 35 | audio = tts.generate(text, sid=i, speed=1.0) | 35 | audio = tts.generate(text, sid=i, speed=1.0) |
| 36 | 36 | ||
| 37 | sf.write( | 37 | sf.write( |
| 38 | - f"./hf/kitten/v0.2/mp3/{i}-{s}.mp3", | 38 | + f"./hf/kitten/v0.2-nano/mp3/{i}-{s}.mp3", |
| 39 | audio.samples, | 39 | audio.samples, |
| 40 | samplerate=audio.sample_rate, | 40 | samplerate=audio.sample_rate, |
| 41 | ) | 41 | ) |
-
请 注册 或 登录 后发表评论