Fangjun Kuang
Committed by GitHub

Export https://github.com/KittenML/KittenTTS to sherpa-onnx (#2456)

.github/workflows/export-kitten-to-onnx.yaml (new file)

  1 +name: export-kitten-to-onnx
  2 +
  3 +on:
  4 +  push:
  5 +    branches:
  6 +      - kitten-tts
  7 +
  8 +  workflow_dispatch:
  9 +
 10 +concurrency:
 11 +  group: export-kitten-to-onnx-${{ github.ref }}
 12 +  cancel-in-progress: true
 13 +
 14 +jobs:
 15 +  export-kitten-to-onnx:
 16 +    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
 17 +    name: export kitten ${{ matrix.version }}
 18 +    runs-on: ${{ matrix.os }}
 19 +    strategy:
 20 +      fail-fast: false
 21 +      matrix:
 22 +        os: [ubuntu-latest]
 23 +        python-version: ["3.10"]
 24 +
 25 +    steps:
 26 +      - uses: actions/checkout@v4
 27 +
 28 +      - name: Setup Python ${{ matrix.python-version }}
 29 +        uses: actions/setup-python@v5
 30 +        with:
 31 +          python-version: ${{ matrix.python-version }}
 32 +
 33 +      - name: Install Python dependencies
 34 +        shell: bash
 35 +        run: |
 36 +          pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
 37 +
 38 +      - name: Run
 39 +        env:
 40 +          HF_TOKEN: ${{ secrets.HF_TOKEN }}
 41 +        shell: bash
 42 +        run: |
 43 +          cd scripts/kitten-tts/nano_v0_1
 44 +          ./run.sh
 45 +
 46 +      - name: Collect results
 47 +        shell: bash
 48 +        run: |
 49 +          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
 50 +          tar xf espeak-ng-data.tar.bz2
 51 +          rm espeak-ng-data.tar.bz2
 52 +
 53 +          src=scripts/kitten-tts/nano_v0_1
 54 +
 55 +          d=kitten-nano-en-v0_1-fp16
 56 +
 57 +          mkdir $d
 58 +          cp -a LICENSE $d/LICENSE
 59 +          cp -a espeak-ng-data $d/
 60 +          cp -v $src/model.fp16.onnx $d/model.fp16.onnx
 61 +          cp -v $src/voices.bin $d/
 62 +          cp -v $src/tokens.txt $d/
 63 +          cp -v $src/../README.md $d/README.md
 64 +          ls -lh $d/
 65 +          tar cjfv $d.tar.bz2 $d
 66 +
 67 +          ls -lh $d.tar.bz2
 68 +
 69 +      - name: Release
 70 +        if: github.repository_owner == 'csukuangfj'
 71 +        uses: svenstaro/upload-release-action@v2
 72 +        with:
 73 +          file_glob: true
 74 +          file: ./*.tar.bz2
 75 +          overwrite: true
 76 +          repo_name: k2-fsa/sherpa-onnx
 77 +          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
 78 +          tag: tts-models
 79 +
 80 +      - name: Release
 81 +        if: github.repository_owner == 'k2-fsa'
 82 +        uses: svenstaro/upload-release-action@v2
 83 +        with:
 84 +          file_glob: true
 85 +          file: ./*.tar.bz2
 86 +          overwrite: true
 87 +          tag: tts-models
 88 +
 89 +      - name: Publish to huggingface
 90 +        env:
 91 +          HF_TOKEN: ${{ secrets.HF_TOKEN }}
 92 +        uses: nick-fields/retry@v3
 93 +        with:
 94 +          max_attempts: 20
 95 +          timeout_seconds: 200
 96 +          shell: bash
 97 +          command: |
 98 +            git config --global user.email "csukuangfj@gmail.com"
 99 +            git config --global user.name "Fangjun Kuang"
100 +
101 +            dirs=(
102 +              kitten-nano-en-v0_1-fp16
103 +            )
104 +
105 +            export GIT_LFS_SKIP_SMUDGE=1
106 +            export GIT_CLONE_PROTECTION_ACTIVE=false
107 +
108 +            for d in ${dirs[@]}; do
109 +              rm -rf huggingface
110 +
111 +              git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface
112 +              cd huggingface
113 +              rm -rf ./*
114 +
115 +              git lfs track "*.onnx"
116 +              git lfs track af_dict
117 +              git lfs track ar_dict
118 +              git lfs track cmn_dict
119 +              git lfs track da_dict en_dict fa_dict hu_dict ia_dict it_dict lb_dict phondata ru_dict ta_dict
120 +              git lfs track ur_dict yue_dict
121 +
122 +              cp -a ../$d/* ./
123 +
124 +              git add .
125 +
126 +              ls -lh
127 +
128 +              git status
129 +
130 +              git commit -m "add models"
131 +              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main || true
132 +            done
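
After a successful run, the packaged model can be fetched from the tts-models release in the same way the workflow itself fetches espeak-ng-data, e.g. curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2 (the file name follows from d=kitten-nano-en-v0_1-fp16 in the "Collect results" step above).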
.gitignore

@@ -142,3 +142,5 @@ README-DEV.txt
142  .idea
143  sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
144  dict
145 +*.npz
146 +voices.bin
scripts/kitten-tts/nano_v0_1/add_meta_data.py (new file)

  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import argparse
  6 +
  7 +import numpy as np
  8 +import onnx
  9 +
 10 +from generate_voices_bin import speaker2id
 11 +
 12 +
 13 +def get_args():
 14 +    parser = argparse.ArgumentParser()
 15 +    parser.add_argument(
 16 +        "--model", type=str, required=True, help="input and output onnx model"
 17 +    )
 18 +
 19 +    return parser.parse_args()
 20 +
 21 +
 22 +def main():
 23 +    args = get_args()
 24 +    print(args.model)
 25 +
 26 +    model = onnx.load(args.model)
 27 +
 28 +    style = np.load("./voices.npz")
 29 +    style_shape = style[list(style.keys())[0]].shape
 30 +
 31 +    speaker2id_str = ""
 32 +    id2speaker_str = ""
 33 +    sep = ""
 34 +    for s, i in speaker2id.items():
 35 +        speaker2id_str += f"{sep}{s}->{i}"
 36 +        id2speaker_str += f"{sep}{i}->{s}"
 37 +        sep = ","
 38 +
 39 +    meta_data = {
 40 +        "model_type": "kitten-tts",
 41 +        "language": "English",
 42 +        "has_espeak": 1,
 43 +        "sample_rate": 24000,
 44 +        "version": 1,
 45 +        "voice": "en-us",
 46 +        "style_dim": ",".join(map(str, style_shape)),
 47 +        "n_speakers": len(speaker2id),
 48 +        "speaker2id": speaker2id_str,
 49 +        "id2speaker": id2speaker_str,
 50 +        "speaker_names": ",".join(map(str, speaker2id.keys())),
 51 +        "model_url": "https://huggingface.co/KittenML/kitten-tts-nano-0.1",
 52 +        "see_also": "https://github.com/KittenML/KittenTTS",
 53 +        "maintainer": "k2-fsa",
 54 +        "comment": "This is kitten-tts-nano-0.1 and supports only English",
 55 +    }
 56 +
 57 +    print(model.metadata_props)
 58 +
 59 +    while len(model.metadata_props):
 60 +        model.metadata_props.pop()
 61 +
 62 +    for key, value in meta_data.items():
 63 +        meta = model.metadata_props.add()
 64 +        meta.key = key
 65 +        meta.value = str(value)
 66 +    print("--------------------")
 67 +
 68 +    print(model.metadata_props)
 69 +
 70 +    onnx.save(model, args.model)
 71 +
 72 +    print(f"Please see {args.model}")
 73 +
 74 +
 75 +if __name__ == "__main__":
 76 +    main()
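
The metadata added above is stored in the ONNX file as string key/value pairs, which is what sherpa-onnx reads at load time. A minimal read-back sketch for verification (not part of this commit; it assumes the exported ./model.fp16.onnx is in the current directory):

    import onnxruntime

    sess = onnxruntime.InferenceSession(
        "./model.fp16.onnx", providers=["CPUExecutionProvider"]
    )
    meta = sess.get_modelmeta().custom_metadata_map  # Dict[str, str]
    print(meta["model_type"])  # kitten-tts
    print(meta["sample_rate"])  # "24000" (all values come back as strings)
    print(meta["speaker_names"].split(","))  # the 8 speaker names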
scripts/kitten-tts/nano_v0_1/convert_opset.py (new file)

  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +"""
  5 +Change the model so that it can be run in onnxruntime 1.17.1
  6 +"""
  7 +
  8 +import onnx
  9 +
 10 +
 11 +def main():
 12 +    model = onnx.load("kitten_tts_nano_v0_1.onnx")
 13 +
 14 +    # Print current opsets
 15 +    for opset in model.opset_import:
 16 +        print(f"Domain: '{opset.domain}', Version: {opset.version}")
 17 +
 18 +    # Modify the opset versions (be careful!)
 19 +    for opset in model.opset_import:
 20 +        if opset.domain == "":  # ai.onnx domain
 21 +            opset.version = 19  # change from 20 to 19
 22 +        elif opset.domain == "ai.onnx.ml":
 23 +            opset.version = 4  # change from 5 to 4
 24 +
 25 +    # Save the modified model
 26 +    onnx.save(model, "model.fp16.onnx")
 27 +
 28 +
 29 +if __name__ == "__main__":
 30 +    main()
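
Note that this only patches the declared opset versions; no operators are rewritten. A quick sanity check that the patched model still validates and loads (a sketch, not part of this commit):

    import onnx
    import onnxruntime

    model = onnx.load("model.fp16.onnx")
    onnx.checker.check_model(model)  # raises if the model is structurally invalid

    sess = onnxruntime.InferenceSession(
        "model.fp16.onnx", providers=["CPUExecutionProvider"]
    )
    print([i.name for i in sess.get_inputs()])  # expect: input_ids, style, speed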
scripts/kitten-tts/nano_v0_1/generate_tokens.py (new file)

  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +def get_vocab():
  6 +    # https://github.com/KittenML/KittenTTS/blob/main/kittentts/onnx_model.py#L17
  7 +    _pad = "$"
  8 +    _punctuation = ';:,.!?¡¿—…"«»“” '
  9 +    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 10 +    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
 11 +
 12 +    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
 13 +    dicts = {}
 14 +    for i in range(len(symbols)):
 15 +        dicts[symbols[i]] = i
 16 +    return dicts
 17 +
 18 +
 19 +def main():
 20 +    token2id = get_vocab()
 21 +    with open("tokens.txt", "w", encoding="utf-8") as f:
 22 +        for s, i in token2id.items():
 23 +            f.write(f"{s} {i}\n")
 24 +
 25 +
 26 +if __name__ == "__main__":
 27 +    main()
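
For reference, tokens.txt is a plain "symbol id" list in vocabulary order, so it starts with the pad symbol and the punctuation set:

    $ 0
    ; 1
    : 2
    , 3
    . 4

The space symbol (id 16) is written as a line containing only a space and the id, which is why load_tokens() in test.py below special-cases lines with a single field.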
scripts/kitten-tts/nano_v0_1/generate_voices_bin.py (new file)

  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +from pathlib import Path
  4 +
  5 +import numpy as np
  6 +
  7 +speakers = [
  8 +    "expr-voice-2-m",
  9 +    "expr-voice-2-f",
 10 +    "expr-voice-3-m",
 11 +    "expr-voice-3-f",
 12 +    "expr-voice-4-m",
 13 +    "expr-voice-4-f",
 14 +    "expr-voice-5-m",
 15 +    "expr-voice-5-f",
 16 +]
 17 +
 18 +id2speaker = {idx: speaker for idx, speaker in enumerate(speakers)}
 19 +
 20 +speaker2id = {speaker: idx for idx, speaker in id2speaker.items()}
 21 +
 22 +
 23 +def main():
 24 +    if Path("./voices.bin").is_file():
 25 +        print("./voices.bin exists - skip")
 26 +        return
 27 +
 28 +    voices = np.load("./voices.npz")
 29 +
 30 +    with open("voices.bin", "wb") as f:
 31 +        for speaker in speakers:
 32 +            v = voices[speaker]
 33 +            # v.shape (1, 256)
 34 +            f.write(v.tobytes())
 35 +
 36 +
 37 +if __name__ == "__main__":
 38 +    main()
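
voices.bin is simply the eight style embeddings concatenated as raw float32 bytes. A sketch (not part of this commit) of reading it back and checking it against voices.npz, assuming each embedding has shape (1, 256) as noted in the comment above:

    import numpy as np

    from generate_voices_bin import speakers

    voices = np.load("./voices.npz")
    embedding = np.fromfile("voices.bin", dtype=np.float32).reshape(
        len(speakers), 1, 256
    )
    for i, speaker in enumerate(speakers):
        assert np.array_equal(embedding[i], voices[speaker]), speaker
    print("voices.bin matches voices.npz")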
scripts/kitten-tts/nano_v0_1/run.sh (new file)

  1 +#!/usr/bin/env bash
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +set -ex
  5 +
  6 +if [ ! -f kitten_tts_nano_v0_1.onnx ]; then
  7 +  curl -SL -O https://huggingface.co/KittenML/kitten-tts-nano-0.1/resolve/main/kitten_tts_nano_v0_1.onnx
  8 +fi
  9 +
 10 +if [ ! -f voices.npz ]; then
 11 +  curl -SL -O https://huggingface.co/KittenML/kitten-tts-nano-0.1/resolve/main/voices.npz
 12 +fi
 13 +
 14 +./generate_voices_bin.py
 15 +./generate_tokens.py
 16 +./convert_opset.py
 17 +./show.py
 18 +./add_meta_data.py --model ./model.fp16.onnx
 19 +# ./test.py --model ./model.fp16.onnx --tokens ./tokens.txt --voices-bin ./voices.bin
 20 +ls -lh
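
run.sh can also be executed locally: it needs only the dependencies installed by the workflow above (pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html), and the two curl commands download the upstream model and voices from public Hugging Face URLs, so no HF_TOKEN is required for this step.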
scripts/kitten-tts/nano_v0_1/show.py (new file)

  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +import onnx
  5 +import onnxruntime
  6 +
  7 +"""
  8 +[key: "onnx.infer"
  9 +value: "onnxruntime.quant"
 10 +, key: "onnx.quant.pre_process"
 11 +value: "onnxruntime.quant"
 12 +]
 13 +NodeArg(name='input_ids', type='tensor(int64)', shape=[1, 'sequence_length'])
 14 +NodeArg(name='style', type='tensor(float)', shape=[1, 256])
 15 +NodeArg(name='speed', type='tensor(float)', shape=[1])
 16 +-----
 17 +NodeArg(name='waveform', type='tensor(float)', shape=['num_samples'])
 18 +NodeArg(name='duration', type='tensor(int64)', shape=['Castduration_dim_0'])
 19 +"""
 20 +
 21 +
 22 +def show(filename):
 23 +    model = onnx.load(filename)
 24 +    print(model.metadata_props)
 25 +
 26 +    session_opts = onnxruntime.SessionOptions()
 27 +    session_opts.log_severity_level = 3
 28 +    sess = onnxruntime.InferenceSession(
 29 +        filename, session_opts, providers=["CPUExecutionProvider"]
 30 +    )
 31 +    for i in sess.get_inputs():
 32 +        print(i)
 33 +
 34 +    print("-----")
 35 +
 36 +    for i in sess.get_outputs():
 37 +        print(i)
 38 +
 39 +
 40 +def main():
 41 +    show("./model.fp16.onnx")
 42 +
 43 +
 44 +if __name__ == "__main__":
 45 +    main()
scripts/kitten-tts/nano_v0_1/test.py (new file)

  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +import argparse
  5 +import time
  6 +from pathlib import Path
  7 +from typing import Dict, List
  8 +
  9 +import numpy as np
 10 +
 11 +try:
 12 +    from piper_phonemize import phonemize_espeak
 13 +except Exception as ex:
 14 +    raise RuntimeError(
 15 +        f"{ex}\nPlease run\n"
 16 +        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
 17 +    )
 18 +
 19 +import onnxruntime as ort
 20 +import soundfile as sf
 21 +
 22 +
 23 +def get_args():
 24 +    parser = argparse.ArgumentParser()
 25 +    parser.add_argument(
 26 +        "--model",
 27 +        type=str,
 28 +        required=True,
 29 +        help="Path to the model",
 30 +    )
 31 +
 32 +    parser.add_argument(
 33 +        "--voices-bin",
 34 +        type=str,
 35 +        required=True,
 36 +        help="Path to voices.bin",
 37 +    )
 38 +
 39 +    parser.add_argument(
 40 +        "--tokens",
 41 +        type=str,
 42 +        required=True,
 43 +        help="Path to tokens.txt",
 44 +    )
 45 +    return parser.parse_args()
 46 +
 47 +
 48 +def show(filename):
 49 +    session_opts = ort.SessionOptions()
 50 +    session_opts.log_severity_level = 3
 51 +    sess = ort.InferenceSession(filename, session_opts)
 52 +    for i in sess.get_inputs():
 53 +        print(i)
 54 +
 55 +    print("-----")
 56 +
 57 +    for i in sess.get_outputs():
 58 +        print(i)
 59 +
 60 +
 61 +def load_tokens(filename: str) -> Dict[str, int]:
 62 +    ans = dict()
 63 +    with open(filename, encoding="utf-8") as f:
 64 +        for line in f:
 65 +            fields = line.strip().split()
 66 +            if len(fields) == 2:
 67 +                token, idx = fields
 68 +                ans[token] = int(idx)
 69 +            else:
 70 +                assert len(fields) == 1, (len(fields), line)
 71 +                ans[" "] = int(fields[0])
 72 +    return ans
 73 +
 74 +
 75 +def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
 76 +    embedding = (
 77 +        np.fromfile(voices_bin, dtype="uint8")
 78 +        .view(np.float32)
 79 +        .reshape(len(speaker_names), *dim)
 80 +    )
 81 +    ans = dict()
 82 +    for i in range(len(speaker_names)):
 83 +        ans[speaker_names[i]] = embedding[i]
 84 +
 85 +    return ans
 86 +
 87 +
 88 +class OnnxModel:
 89 +    def __init__(self, model_filename: str, voices_bin: str, tokens: str):
 90 +        session_opts = ort.SessionOptions()
 91 +        session_opts.inter_op_num_threads = 1
 92 +        session_opts.intra_op_num_threads = 1
 93 +
 94 +        self.session_opts = session_opts
 95 +        self.model = ort.InferenceSession(
 96 +            model_filename,
 97 +            sess_options=self.session_opts,
 98 +            providers=["CPUExecutionProvider"],
 99 +        )
100 +        self.token2id = load_tokens(tokens)
101 +
102 +        meta = self.model.get_modelmeta().custom_metadata_map
103 +        print(meta)
104 +        dim = list(map(int, meta["style_dim"].split(",")))
105 +        speaker_names = meta["speaker_names"].split(",")
106 +
107 +        self.voices = load_voices(
108 +            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
109 +        )
110 +
111 +        self.sample_rate = int(meta["sample_rate"])
112 +
113 +    def __call__(self, text: str, voice):
114 +        tokens = phonemize_espeak(text, "en-us")
115 +        # tokens is List[List[str]]
116 +        # Each sentence is a List[str]
117 +        # len(tokens) == number of sentences
118 +
119 +        flatten = []
120 +        for t in tokens:
121 +            flatten.extend(t)
122 +            # we append a space at the end of a sentence so that there is
123 +            # a pause in the generated audio
124 +            flatten.append(" ")
125 +
126 +        tokens = "".join(flatten)
127 +
128 +        tokens = list(tokens)
129 +
130 +        token_ids = [self.token2id[i] for i in tokens]
131 +
132 +        style = self.voices[voice]
133 +
134 +        token_ids = [0, *token_ids, 0]
135 +        token_ids = np.array([token_ids], dtype=np.int64)
136 +
137 +        speed = np.array([1.0], dtype=np.float32)
138 +
139 +        audio = self.model.run(
140 +            [
141 +                self.model.get_outputs()[0].name,
142 +            ],
143 +            {
144 +                self.model.get_inputs()[0].name: token_ids,
145 +                self.model.get_inputs()[1].name: style,
146 +                self.model.get_inputs()[2].name: speed,
147 +            },
148 +        )[0]
149 +        return audio
150 +
151 +
152 +def main():
153 +    args = get_args()
154 +    print(vars(args))
155 +    show(args.model)
156 +
157 +    # tokens = phonemize_espeak("how are you doing?", "en-us")
158 +    # [['h', 'ˌ', 'a', 'ʊ', ' ', 'ɑ', 'ː', 'ɹ', ' ', 'j', 'u', 'ː', ' ', 'd', 'ˈ', 'u', 'ː', 'ɪ', 'ŋ', '?']]
159 +    m = OnnxModel(
160 +        model_filename=args.model, voices_bin=args.voices_bin, tokens=args.tokens
161 +    )
162 +
163 +    text = (
164 +        "Today as always, men fall into two groups: slaves and free men. "
165 +        + " Whoever does not have two-thirds of his day for himself, "
166 +        + "is a slave, whatever he may be: a statesman, a businessman, "
167 +        + "an official, or a scholar."
168 +    )
169 +
170 +    for i, voice in enumerate(m.voices.keys(), 1):
171 +        print(f"Testing {i}/{len(m.voices)} - {voice}/{args.model}")
172 +
173 +        start = time.time()
174 +        audio = m(text, voice=voice)
175 +        end = time.time()
176 +
177 +        elapsed_seconds = end - start
178 +        audio_duration = len(audio) / m.sample_rate
179 +        real_time_factor = elapsed_seconds / audio_duration
180 +
181 +        filename = f"{Path(args.model).stem}-{voice}.wav"
182 +        sf.write(
183 +            filename,
184 +            audio,
185 +            samplerate=m.sample_rate,
186 +            subtype="PCM_16",
187 +        )
188 +        print(f" Saved to {filename}")
189 +        print(f" Elapsed seconds: {elapsed_seconds:.3f}")
190 +        print(f" Audio duration in seconds: {audio_duration:.3f}")
191 +        print(
192 +            f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}"
193 +        )
194 +
195 +
196 +if __name__ == "__main__":
197 +    main()
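
A typical invocation, matching the commented-out line in run.sh, is ./test.py --model ./model.fp16.onnx --voices-bin ./voices.bin --tokens ./tokens.txt; it writes one wav file per speaker (model.fp16-expr-voice-2-m.wav, and so on) and prints the real-time factor for each. The wrapper class can also be used directly; a minimal sketch (not part of this commit; the text and speaker choice are arbitrary):

    import soundfile as sf

    from test import OnnxModel

    m = OnnxModel(
        model_filename="./model.fp16.onnx",
        voices_bin="./voices.bin",
        tokens="./tokens.txt",
    )
    audio = m("Hello from sherpa-onnx.", voice="expr-voice-2-m")
    sf.write("hello.wav", audio, samplerate=m.sample_rate, subtype="PCM_16")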