Fangjun Kuang
Committed by GitHub

Export KittenTTS mini v0.1 to sherpa-onnx (#2578)

... ... @@ -20,7 +20,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
version: ["nano_v0_1", "nano_v0_2"]
version: ["nano_v0_1", "nano_v0_2", "mini_v0_1"]
python-version: ["3.10"]
steps:
... ... @@ -59,6 +59,8 @@ jobs:
d=kitten-nano-en-v0_1-fp16
elif [[ $version == "nano_v0_2" ]]; then
d=kitten-nano-en-v0_2-fp16
elif [[ $version == "mini_v0_1" ]]; then
d=kitten-mini-en-v0_1-fp16
else
echo "version $version"
exit 1
... ... @@ -111,6 +113,7 @@ jobs:
dirs=(
kitten-nano-en-v0_1-fp16
kitten-nano-en-v0_2-fp16
kitten-mini-en-v0_1-fp16
)
export GIT_LFS_SKIP_SMUDGE=1
... ...
... ... @@ -32,7 +32,7 @@ jobs:
pip install "numpy<=1.26.4" sherpa-onnx soundfile
- name: kitten
if: false
if: true
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
... ... @@ -46,8 +46,9 @@ jobs:
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
mkdir -p ./hf/kitten/v0.1/mp3
mkdir -p ./hf/kitten/v0.2/mp3
mkdir -p ./hf/kitten/v0.1-nano/mp3
mkdir -p ./hf/kitten/v0.2-nano/mp3
mkdir -p ./hf/kitten/v0.1-mini/mp3
for v in 1 2; do
pushd nano_v0_$v
... ... @@ -61,6 +62,18 @@ jobs:
popd
done
for v in 1; do
pushd mini_v0_$v
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-mini-en-v0_$v-fp16.tar.bz2
tar xf kitten-mini-en-v0_$v-fp16.tar.bz2
rm kitten-mini-en-v0_$v-fp16.tar.bz2
ln -s ../hf .
python3 ./generate_samples.py
rm -rf kitten-mini-en-v0_$v-fp16
popd
done
pushd hf
git pull
git add .
... ... @@ -70,7 +83,7 @@ jobs:
rm -rf hf
- name: matcha en (ljspeech)
if: true
if: false
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
... ...
... ... @@ -524,6 +524,11 @@ def get_kitten_models() -> List[TtsModel]:
model_name="model.fp16.onnx",
lang="en",
),
TtsModel(
model_dir="kitten-mini-en-v0_1-fp16",
model_name="model.fp16.onnx",
lang="en",
),
]
for m in english_models:
m.data_dir = f"{m.model_dir}/espeak-ng-data"
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import argparse
import numpy as np
import onnx
from generate_voices_bin import speaker2id
def get_args():
    """Parse the command-line options of this script.

    Returns:
      The namespace produced by argparse. Its ``model`` attribute holds the
      value of the required ``--model`` option.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model",
        required=True,
        type=str,
        help="input and output onnx model",
    )
    parsed = arg_parser.parse_args()
    return parsed
def main():
    """Replace the metadata of the ONNX model given by ``--model``, in place.

    The model is loaded, all of its existing metadata properties are removed,
    the entries of ``meta_data`` below are added (every value converted with
    ``str``), and the model is saved back to the same path.

    ``./voices.npz`` must exist in the current directory; the shape of the
    first array stored in it is written as ``style_dim``.
    """
    args = get_args()
    print(args.model)

    model = onnx.load(args.model)

    style = np.load("./voices.npz")
    # Only the first array of the archive is inspected for its shape.
    style_shape = style[list(style.keys())[0]].shape

    # Comma-separated "name->id" and "id->name" tables, in the iteration
    # order of speaker2id.
    speaker2id_str = ",".join(f"{s}->{i}" for s, i in speaker2id.items())
    id2speaker_str = ",".join(f"{i}->{s}" for s, i in speaker2id.items())

    meta_data = {
        "model_type": "kitten-tts",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 1,
        "voice": "en-us",
        "style_dim": ",".join(map(str, style_shape)),
        "n_speakers": len(speaker2id),
        "speaker2id": speaker2id_str,
        "id2speaker": id2speaker_str,
        "speaker_names": ",".join(map(str, speaker2id.keys())),
        # This script exports kitten-tts-mini-0.1, so the provenance fields
        # must name that model rather than kitten-tts-nano-0.2.
        "model_url": "https://huggingface.co/KittenML/kitten-tts-mini-0.1",
        "see_also": "https://github.com/KittenML/KittenTTS",
        "maintainer": "k2-fsa",
        "comment": "This is kitten-tts-mini-0.1 and supports only English",
    }

    print(model.metadata_props)

    # Drop every metadata entry already stored in the model.
    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)
    print("--------------------")

    print(model.metadata_props)

    # Overwrite the input file with the updated model.
    onnx.save(model, args.model)

    print(f"Please see {args.model}")


if __name__ == "__main__":
    main()
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
"""
Change the model so that it can be run in onnxruntime 1.17.1
"""
import onnx
def main():
    """Rewrite the opset versions declared by ``kitten_tts_mini_v0_1.onnx``.

    The model is read from the current directory, every imported opset is
    printed, the default domain is set to version 19 and ``ai.onnx.ml`` to
    version 4, and the result is written to ``model.fp16.onnx``. Only the
    declared versions are changed; the graph itself is not touched.
    """
    # Versions to force, keyed by opset domain.
    forced_versions = {
        "": 19,  # ai.onnx domain; change from 20 to 19
        "ai.onnx.ml": 4,  # change from 5 to 4
    }

    model = onnx.load("kitten_tts_mini_v0_1.onnx")

    # Show what the model currently declares.
    for entry in model.opset_import:
        print(f"Domain: '{entry.domain}', Version: {entry.version}")

    # Overwrite the versions of the known domains (be careful!); any other
    # domain is left as it is.
    for entry in model.opset_import:
        replacement = forced_versions.get(entry.domain)
        if replacement is not None:
            entry.version = replacement

    onnx.save(model, "model.fp16.onnx")


if __name__ == "__main__":
    main()
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
"""
Generate samples for
https://k2-fsa.github.io/sherpa/onnx/tts/all/
"""
import sherpa_onnx
import soundfile as sf
from generate_voices_bin import speaker2id
# Build the configuration from the inside out. All model files are looked up
# in ./kitten-mini-en-v0_1-fp16 relative to the current directory.
kitten_config = sherpa_onnx.OfflineTtsKittenModelConfig(
    model="kitten-mini-en-v0_1-fp16/model.fp16.onnx",
    voices="kitten-mini-en-v0_1-fp16/voices.bin",
    tokens="kitten-mini-en-v0_1-fp16/tokens.txt",
    data_dir="kitten-mini-en-v0_1-fp16/espeak-ng-data",
)
model_config = sherpa_onnx.OfflineTtsModelConfig(
    kitten=kitten_config,
    num_threads=2,
)
config = sherpa_onnx.OfflineTtsConfig(
    model=model_config,
    max_num_sentences=1,
)

if not config.validate():
    raise ValueError("Please check your config")

tts = sherpa_onnx.OfflineTts(config)

text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."

# Synthesize the same sentence once per speaker and store each result under
# ./hf/kitten/v0.1-mini/mp3, named "<speaker id>-<speaker name>.mp3".
for speaker_name, speaker_id in speaker2id.items():
    print(speaker_name, speaker_id, len(speaker2id))

    audio = tts.generate(text, sid=speaker_id, speed=1.0)

    output_path = f"./hf/kitten/v0.1-mini/mp3/{speaker_id}-{speaker_name}.mp3"
    sf.write(output_path, audio.samples, samplerate=audio.sample_rate)
... ...
../nano_v0_1/generate_tokens.py
\ No newline at end of file
... ...
../nano_v0_1/generate_voices_bin.py
\ No newline at end of file
... ...
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex

# Upstream location of the original model and the voice embeddings.
base_url=https://huggingface.co/KittenML/kitten-tts-mini-0.1/resolve/main

# Fetch each input file unless it already exists in the current directory.
for f in kitten_tts_mini_v0_1.onnx voices.npz; do
  if [ ! -f "$f" ]; then
    # -f makes curl exit with an error on an HTTP failure instead of saving
    # the server's error page under the model's name. The file is written to
    # a temporary name and renamed only after curl succeeds, so an aborted
    # download is not mistaken for a complete file on the next run.
    curl -fSL -o "$f.tmp" "$base_url/$f"
    mv "$f.tmp" "$f"
  fi
done

./generate_voices_bin.py
./generate_tokens.py

# Reads kitten_tts_mini_v0_1.onnx and writes model.fp16.onnx
./convert_opset.py
./show.py

# Replaces the metadata of model.fp16.onnx in place
./add_meta_data.py --model ./model.fp16.onnx

# ./test.py --model ./model.fp16.onnx --tokens ./tokens.txt --voice ./voices.bin
ls -lh
... ...
../nano_v0_1/show.py
\ No newline at end of file
... ...
../nano_v0_1/test.py
\ No newline at end of file
... ...
... ... @@ -35,7 +35,7 @@ for s, i in speaker2id.items():
audio = tts.generate(text, sid=i, speed=1.0)
sf.write(
f"./hf/kitten/v0.1/mp3/{i}-{s}.mp3",
f"./hf/kitten/v0.1-nano/mp3/{i}-{s}.mp3",
audio.samples,
samplerate=audio.sample_rate,
)
... ...
... ... @@ -35,7 +35,7 @@ for s, i in speaker2id.items():
audio = tts.generate(text, sid=i, speed=1.0)
sf.write(
f"./hf/kitten/v0.2/mp3/{i}-{s}.mp3",
f"./hf/kitten/v0.2-nano/mp3/{i}-{s}.mp3",
audio.samples,
samplerate=audio.sample_rate,
)
... ...