Fangjun Kuang
Committed by GitHub

Export KittenTTS mini v0.1 to sherpa-onnx (#2578)

@@ -20,7 +20,7 @@ jobs: @@ -20,7 +20,7 @@ jobs:
20 fail-fast: false 20 fail-fast: false
21 matrix: 21 matrix:
22 os: [ubuntu-latest] 22 os: [ubuntu-latest]
23 - version: ["nano_v0_1", "nano_v0_2"] 23 + version: ["nano_v0_1", "nano_v0_2", "mini_v0_1"]
24 python-version: ["3.10"] 24 python-version: ["3.10"]
25 25
26 steps: 26 steps:
@@ -59,6 +59,8 @@ jobs: @@ -59,6 +59,8 @@ jobs:
59 d=kitten-nano-en-v0_1-fp16 59 d=kitten-nano-en-v0_1-fp16
60 elif [[ $version == "nano_v0_2" ]]; then 60 elif [[ $version == "nano_v0_2" ]]; then
61 d=kitten-nano-en-v0_2-fp16 61 d=kitten-nano-en-v0_2-fp16
  62 + elif [[ $version == "mini_v0_1" ]]; then
  63 + d=kitten-mini-en-v0_1-fp16
62 else 64 else
63 echo "version $version" 65 echo "version $version"
64 exit 1 66 exit 1
@@ -111,6 +113,7 @@ jobs: @@ -111,6 +113,7 @@ jobs:
111 dirs=( 113 dirs=(
112 kitten-nano-en-v0_1-fp16 114 kitten-nano-en-v0_1-fp16
113 kitten-nano-en-v0_2-fp16 115 kitten-nano-en-v0_2-fp16
  116 + kitten-mini-en-v0_1-fp16
114 ) 117 )
115 118
116 export GIT_LFS_SKIP_SMUDGE=1 119 export GIT_LFS_SKIP_SMUDGE=1
@@ -32,7 +32,7 @@ jobs: @@ -32,7 +32,7 @@ jobs:
32 pip install "numpy<=1.26.4" sherpa-onnx soundfile 32 pip install "numpy<=1.26.4" sherpa-onnx soundfile
33 33
34 - name: kitten 34 - name: kitten
35 - if: false 35 + if: true
36 shell: bash 36 shell: bash
37 env: 37 env:
38 HF_TOKEN: ${{ secrets.HF_TOKEN }} 38 HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -46,8 +46,9 @@ jobs: @@ -46,8 +46,9 @@ jobs:
46 export GIT_LFS_SKIP_SMUDGE=1 46 export GIT_LFS_SKIP_SMUDGE=1
47 export GIT_CLONE_PROTECTION_ACTIVE=false 47 export GIT_CLONE_PROTECTION_ACTIVE=false
48 git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf 48 git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
49 - mkdir -p ./hf/kitten/v0.1/mp3  
50 - mkdir -p ./hf/kitten/v0.2/mp3 49 + mkdir -p ./hf/kitten/v0.1-nano/mp3
  50 + mkdir -p ./hf/kitten/v0.2-nano/mp3
  51 + mkdir -p ./hf/kitten/v0.1-mini/mp3
51 52
52 for v in 1 2; do 53 for v in 1 2; do
53 pushd nano_v0_$v 54 pushd nano_v0_$v
@@ -61,6 +62,18 @@ jobs: @@ -61,6 +62,18 @@ jobs:
61 popd 62 popd
62 done 63 done
63 64
  65 + for v in 1; do
  66 + pushd mini_v0_$v
  67 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-mini-en-v0_$v-fp16.tar.bz2
  68 + tar xf kitten-mini-en-v0_$v-fp16.tar.bz2
  69 + rm kitten-mini-en-v0_$v-fp16.tar.bz2
  70 +
  71 + ln -s ../hf .
  72 + python3 ./generate_samples.py
  73 + rm -rf kitten-mini-en-v0_$v-fp16
  74 + popd
  75 + done
  76 +
64 pushd hf 77 pushd hf
65 git pull 78 git pull
66 git add . 79 git add .
@@ -70,7 +83,7 @@ jobs: @@ -70,7 +83,7 @@ jobs:
70 rm -rf hf 83 rm -rf hf
71 84
72 - name: matcha en (ljspeech) 85 - name: matcha en (ljspeech)
73 - if: true 86 + if: false
74 shell: bash 87 shell: bash
75 env: 88 env:
76 HF_TOKEN: ${{ secrets.HF_TOKEN }} 89 HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -524,6 +524,11 @@ def get_kitten_models() -> List[TtsModel]: @@ -524,6 +524,11 @@ def get_kitten_models() -> List[TtsModel]:
524 model_name="model.fp16.onnx", 524 model_name="model.fp16.onnx",
525 lang="en", 525 lang="en",
526 ), 526 ),
  527 + TtsModel(
  528 + model_dir="kitten-mini-en-v0_1-fp16",
  529 + model_name="model.fp16.onnx",
  530 + lang="en",
  531 + ),
527 ] 532 ]
528 for m in english_models: 533 for m in english_models:
529 m.data_dir = f"{m.model_dir}/espeak-ng-data" 534 m.data_dir = f"{m.model_dir}/espeak-ng-data"
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import argparse
  6 +
  7 +import numpy as np
  8 +import onnx
  9 +
  10 +from generate_voices_bin import speaker2id
  11 +
  12 +
def get_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``--model`` (required): the path of the onnx model,
    which is used both as input and as output.
    """
    arg_parser = argparse.ArgumentParser()

    model_option = {
        "type": str,
        "required": True,
        "help": "input and output onnx model",
    }
    arg_parser.add_argument("--model", **model_option)

    parsed = arg_parser.parse_args()
    return parsed
  20 +
  21 +
def main():
    """Replace the metadata of the onnx model given by ``--model``.

    The model is loaded from ``--model`` and saved back to the same path.
    All existing ``metadata_props`` entries are removed and replaced by the
    entries built below. ``./voices.npz`` must exist in the current
    directory; the shape of its first array is stored as ``style_dim``.
    """
    args = get_args()
    print(args.model)

    model = onnx.load(args.model)

    # Only the shape of the first array is read.
    # NOTE(review): presumably all voices share the same shape -- confirm.
    # The context manager closes the underlying .npz file handle.
    with np.load("./voices.npz") as style:
        style_shape = style[list(style.keys())[0]].shape

    # Comma-separated "name->id" and "id->name" mappings.
    speaker2id_str = ",".join(f"{s}->{i}" for s, i in speaker2id.items())
    id2speaker_str = ",".join(f"{i}->{s}" for s, i in speaker2id.items())

    meta_data = {
        "model_type": "kitten-tts",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 1,
        "voice": "en-us",
        "style_dim": ",".join(map(str, style_shape)),
        "n_speakers": len(speaker2id),
        "speaker2id": speaker2id_str,
        "id2speaker": id2speaker_str,
        "speaker_names": ",".join(map(str, speaker2id.keys())),
        # This script exports the mini v0.1 model, so point at its model card
        # (the same repository run.sh downloads from), not at nano-0.2.
        "model_url": "https://huggingface.co/KittenML/kitten-tts-mini-0.1",
        "see_also": "https://github.com/KittenML/KittenTTS",
        "maintainer": "k2-fsa",
        "comment": "This is kitten-tts-mini-0.1 and supports only English",
    }

    # Show the metadata before it is replaced.
    print(model.metadata_props)

    # Drop every existing entry so that only meta_data remains in the model.
    while len(model.metadata_props):
        model.metadata_props.pop()

    # Metadata values are stored as strings.
    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)
    print("--------------------")

    print(model.metadata_props)

    onnx.save(model, args.model)

    print(f"Please see {args.model}")
  73 +
  74 +
  75 +if __name__ == "__main__":
  76 + main()
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +"""
  5 +Change the model so that it can be run in onnxruntime 1.17.1
  6 +"""
  7 +
  8 +import onnx
  9 +
  10 +
def main():
    """Lower the opset versions declared by ``kitten_tts_mini_v0_1.onnx``.

    Reads ``kitten_tts_mini_v0_1.onnx`` from the current directory, edits
    only its ``opset_import`` entries and saves the result as
    ``model.fp16.onnx``. A declared version is only ever lowered; an entry
    that is already at or below the target is left unchanged.
    """
    # Highest version to declare for each opset domain.
    max_versions = {
        "": 19,  # ai.onnx domain; change from 20 to 19
        "ai.onnx.ml": 4,  # change from 5 to 4
    }

    model = onnx.load("kitten_tts_mini_v0_1.onnx")

    # Print current opsets
    for opset in model.opset_import:
        print(f"Domain: '{opset.domain}', Version: {opset.version}")

    # Modify the opset versions (be careful!)
    for opset in model.opset_import:
        limit = max_versions.get(opset.domain)
        # Never raise a version: doing so would claim newer operator
        # semantics than the model was exported with.
        if limit is not None and opset.version > limit:
            opset.version = limit

    # Save the modified model
    onnx.save(model, "model.fp16.onnx")
  27 +
  28 +
  29 +if __name__ == "__main__":
  30 + main()
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +"""
  4 +Generate samples for
  5 +https://k2-fsa.github.io/sherpa/onnx/tts/all/
  6 +"""
  7 +
  8 +
  9 +import sherpa_onnx
  10 +import soundfile as sf
  11 +
  12 +from generate_voices_bin import speaker2id
  13 +
# Build the TTS configuration from the inside out. All paths are relative to
# the current working directory.
kitten_config = sherpa_onnx.OfflineTtsKittenModelConfig(
    model="kitten-mini-en-v0_1-fp16/model.fp16.onnx",
    voices="kitten-mini-en-v0_1-fp16/voices.bin",
    tokens="kitten-mini-en-v0_1-fp16/tokens.txt",
    data_dir="kitten-mini-en-v0_1-fp16/espeak-ng-data",
)
model_config = sherpa_onnx.OfflineTtsModelConfig(
    kitten=kitten_config,
    num_threads=2,
)
config = sherpa_onnx.OfflineTtsConfig(
    model=model_config,
    max_num_sentences=1,
)

if not config.validate():
    raise ValueError("Please check your config")

tts = sherpa_onnx.OfflineTts(config)
text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."

# Generate one sample per speaker, named "<id>-<speaker>.mp3".
num_speakers = len(speaker2id)
for name, sid in speaker2id.items():
    print(name, sid, num_speakers)
    audio = tts.generate(text, sid=sid, speed=1.0)

    output_path = f"./hf/kitten/v0.1-mini/mp3/{sid}-{name}.mp3"
    sf.write(output_path, audio.samples, samplerate=audio.sample_rate)
  1 +../nano_v0_1/generate_tokens.py
  1 +../nano_v0_1/generate_voices_bin.py
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
#
# Download the KittenTTS mini v0.1 model and voices (if not already present)
# and run the conversion scripts in this directory on them.

set -ex

# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page under the expected file name, where the "-f <file>" checks below would
# then treat it as a valid download on the next run.
if [ ! -f kitten_tts_mini_v0_1.onnx ]; then
  curl -fSL -O https://huggingface.co/KittenML/kitten-tts-mini-0.1/resolve/main/kitten_tts_mini_v0_1.onnx
fi

if [ ! -f voices.npz ]; then
  curl -fSL -O https://huggingface.co/KittenML/kitten-tts-mini-0.1/resolve/main/voices.npz
fi

./generate_voices_bin.py
./generate_tokens.py

# convert_opset.py writes model.fp16.onnx, which add_meta_data.py then
# updates in place.
./convert_opset.py
./show.py
./add_meta_data.py --model ./model.fp16.onnx
# ./test.py --model ./model.fp16.onnx --tokens ./tokens.txt --voice ./voices.bin
ls -lh
@@ -35,7 +35,7 @@ for s, i in speaker2id.items(): @@ -35,7 +35,7 @@ for s, i in speaker2id.items():
35 audio = tts.generate(text, sid=i, speed=1.0) 35 audio = tts.generate(text, sid=i, speed=1.0)
36 36
37 sf.write( 37 sf.write(
38 - f"./hf/kitten/v0.1/mp3/{i}-{s}.mp3", 38 + f"./hf/kitten/v0.1-nano/mp3/{i}-{s}.mp3",
39 audio.samples, 39 audio.samples,
40 samplerate=audio.sample_rate, 40 samplerate=audio.sample_rate,
41 ) 41 )
@@ -35,7 +35,7 @@ for s, i in speaker2id.items(): @@ -35,7 +35,7 @@ for s, i in speaker2id.items():
35 audio = tts.generate(text, sid=i, speed=1.0) 35 audio = tts.generate(text, sid=i, speed=1.0)
36 36
37 sf.write( 37 sf.write(
38 - f"./hf/kitten/v0.2/mp3/{i}-{s}.mp3", 38 + f"./hf/kitten/v0.2-nano/mp3/{i}-{s}.mp3",
39 audio.samples, 39 audio.samples,
40 samplerate=audio.sample_rate, 40 samplerate=audio.sample_rate,
41 ) 41 )