Fangjun Kuang
Committed by GitHub

Export kokoro to sherpa-onnx (#1713)

  1 +name: export-kokoro-to-onnx
  2 +
  3 +on:
  4 +  push:
  5 +    branches:
  6 +      - export-kokoro
  7 +
  8 +  workflow_dispatch:
  9 +
  10 +concurrency:
  11 +  group: export-kokoro-to-onnx-${{ github.ref }}
  12 +  cancel-in-progress: true
  13 +
  14 +jobs:
  15 +  export-kokoro-to-onnx:
  16 +    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
  17 +    name: export kokoro
  18 +    runs-on: ${{ matrix.os }}
  19 +    strategy:
  20 +      fail-fast: false
  21 +      matrix:
  22 +        os: [ubuntu-latest]
  23 +        python-version: ["3.10"]
  24 +
  25 +    steps:
  26 +      - uses: actions/checkout@v4
  27 +
  28 +      - name: Setup Python ${{ matrix.python-version }}
  29 +        uses: actions/setup-python@v5
  30 +        with:
  31 +          python-version: ${{ matrix.python-version }}
  32 +
  33 +      - name: Install Python dependencies
  34 +        shell: bash
  35 +        run: |
  36 +          pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
  37 +
  38 +      - name: Run
  39 +        shell: bash
  40 +        run: |
  41 +          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
  42 +          tar xf espeak-ng-data.tar.bz2
  43 +          rm espeak-ng-data.tar.bz2
  44 +          cd scripts/kokoro
  45 +          ./run.sh
  46 +
  47 +      - name: Collect results
  48 +        shell: bash
  49 +        run: |
  50 +          src=scripts/kokoro
  51 +
  52 +          d=kokoro-en-v0_19
  53 +          mkdir $d
  54 +          cp -a LICENSE $d/LICENSE
  55 +          cp -a espeak-ng-data $d/
  56 +          cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx
  57 +          cp -v $src/voices.bin $d/
  58 +          cp -v $src/tokens.txt $d/
  59 +          cp -v $src/README-new.md $d/README.md
  60 +          ls -lh $d/
  61 +          tar cjfv $d.tar.bz2 $d
  62 +          rm -rf $d
  63 +
  64 +          ls -lh $d.tar.bz2
  65 +
  66 +      - name: Publish to huggingface
  67 +        env:
  68 +          HF_TOKEN: ${{ secrets.HF_TOKEN }}
  69 +        uses: nick-fields/retry@v3
  70 +        with:
  71 +          max_attempts: 20
  72 +          timeout_seconds: 200
  73 +          shell: bash
  74 +          command: |
  75 +            git config --global user.email "csukuangfj@gmail.com"
  76 +            git config --global user.name "Fangjun Kuang"
  77 +
  78 +            rm -rf huggingface
  79 +            export GIT_LFS_SKIP_SMUDGE=1
  80 +            export GIT_CLONE_PROTECTION_ACTIVE=false
  81 +
  82 +            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 huggingface
  83 +            cd huggingface
  84 +            rm -rf ./*
  85 +            git fetch
  86 +            git pull
  87 +
  88 +            git lfs track "cmn_dict"
  89 +            git lfs track "ru_dict"
  90 +            git lfs track "*.wav"
  91 +
  92 +            cp -a ../espeak-ng-data ./
  93 +            mkdir -p test_wavs
  94 +
  95 +            cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx
  96 +
  97 +            cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/
  98 +
  99 +            cp -v ../scripts/kokoro/tokens.txt .
  100 +            cp -v ../scripts/kokoro/voices.bin .
  101 +            cp -v ../scripts/kokoro/README-new.md ./README.md
  102 +            cp -v ../LICENSE ./
  103 +
  104 +            git lfs track "*.onnx"
  105 +            git add .
  106 +
  107 +            ls -lh
  108 +
  109 +            git status
  110 +
  111 +            git commit -m "add models"
  112 +            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
  113 +
  114 +      - name: Release
  115 +        uses: svenstaro/upload-release-action@v2
  116 +        with:
  117 +          file_glob: true
  118 +          file: ./*.tar.bz2
  119 +          overwrite: true
  120 +          repo_name: k2-fsa/sherpa-onnx
  121 +          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
  122 +          tag: tts-models
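For reference, the archive built by the "Collect results" step above has this layout (read off the cp commands in that step):

kokoro-en-v0_19/
├── LICENSE
├── README.md        (from scripts/kokoro/README-new.md)
├── espeak-ng-data/
├── model.onnx       (renamed from kokoro-v0_19_hf.onnx)
├── tokens.txt
└── voices.bin

The remaining new files in this commit live under scripts/kokoro/; the paths below are inferred from the workflow and run.sh.

scripts/kokoro/.gitignore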
  1 +voices.json
  2 +voices.bin
  3 +README-new.md
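scripts/kokoro/README.md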
  1 +# Introduction
  2 +
  3 +This folder contains scripts for adding metadata to models
  4 +from https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files
  5 +
  6 +See also
  7 +https://huggingface.co/hexgrad/Kokoro-82M/tree/main
  8 +and
  9 +https://huggingface.co/spaces/hexgrad/Kokoro-TTS
  10 +
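Once add-meta-data.py (next file) has run, the injected metadata can be verified independently of these scripts. Below is a minimal sketch using onnxruntime's model-metadata API (the same one test.py uses further down); the filename assumes run.sh has already downloaded and processed the model:

# check-meta.py (illustrative, not part of this commit)
import onnxruntime as ort

sess = ort.InferenceSession("kokoro-v0_19_hf.onnx")
meta = sess.get_modelmeta().custom_metadata_map  # plain dict of str -> str
for key in ("model_type", "sample_rate", "style_dim", "n_speakers", "speaker_names"):
    print(key, "=>", meta.get(key))

scripts/kokoro/add-meta-data.py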
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import argparse
  6 +import json
  7 +from pathlib import Path
  8 +
  9 +import numpy as np
  10 +import onnx
  11 +
  12 +
  13 +def get_args():
  14 +    parser = argparse.ArgumentParser()
  15 +    parser.add_argument(
  16 +        "--model", type=str, required=True, help="input and output onnx model"
  17 +    )
  18 +
  19 +    parser.add_argument("--voices", type=str, required=True, help="Path to voices.json")
  20 +    return parser.parse_args()
  21 +
  22 +
  23 +def load_voices(filename):
  24 +    with open(filename) as f:
  25 +        voices = json.load(f)
  26 +    for key in voices:
  27 +        voices[key] = np.array(voices[key], dtype=np.float32)
  28 +    return voices
  29 +
  30 +
  31 +def get_vocab():
  32 +    _pad = "$"
  33 +    _punctuation = ';:,.!?¡¿—…"«»“” '
  34 +    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  35 +    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
  36 +    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
  37 +    dicts = {}
  38 +    for i, symbol in enumerate(symbols):
  39 +        dicts[symbol] = i
  40 +    return dicts
  41 +
  42 +
  43 +def generate_tokens():
  44 +    token2id = get_vocab()
  45 +    with open("tokens.txt", "w", encoding="utf-8") as f:
  46 +        for s, i in token2id.items():
  47 +            f.write(f"{s} {i}\n")
  48 +
  49 +
  50 +def main():
  51 +    args = get_args()
  52 +    print(args.model, args.voices)
  53 +
  54 +    model = onnx.load(args.model)
  55 +    voices = load_voices(args.voices)
  56 +
  57 +    if Path("./tokens.txt").is_file():
  58 + print("./tokens.txt exist, skip generating it")
  59 +    else:
  60 +        generate_tokens()
  61 +
  62 +    keys = list(voices.keys())
  63 +    print(",".join(keys))
  64 +
  65 +    if Path("./voices.bin").is_file():
  66 +        print("./voices.bin exists, skip generating it")
  67 +    else:
  68 +        with open("voices.bin", "wb") as f:
  69 +            for k in keys:
  70 +                f.write(voices[k].tobytes())
  71 +
  72 +    meta_data = {
  73 +        "model_type": "kokoro",
  74 +        "language": "English",
  75 +        "has_espeak": 1,
  76 +        "sample_rate": 24000,
  77 +        "version": 1,
  78 +        "voice": "en-us",
  79 +        "style_dim": ",".join(map(str, voices[keys[0]].shape)),
  80 +        "n_speakers": len(keys),
  81 +        "speaker_names": ",".join(keys),
  82 +        "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
  83 +        "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
  84 +        "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
  85 +        "maintainer": "k2-fsa",
  86 +    }
  87 +
  88 +    print(model.metadata_props)
  89 +
  90 +    while len(model.metadata_props):
  91 +        model.metadata_props.pop()
  92 +
  93 +    for key, value in meta_data.items():
  94 +        meta = model.metadata_props.add()
  95 +        meta.key = key
  96 +        meta.value = str(value)
  97 +    print("--------------------")
  98 +
  99 +    print(model.metadata_props)
  100 +
  101 +    onnx.save(model, args.model)
  102 +
  103 +    print(f"Please see {args.model}, ./voices.bin, and ./tokens.txt")
  104 +
  105 +
  106 +if __name__ == "__main__":
  107 +    main()
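A note on the voices.bin format produced above: it is just every speaker's float32 style array concatenated in speaker_names order, so numpy alone can read it back. A minimal sketch; the speaker list and the (511, 1, 256) shape are the kokoro v0.19 values recorded in the metadata (test.py below does the same thing, but reads the shape and names out of the model):

# read-voices.py (illustrative, not part of this commit)
import numpy as np

speaker_names = [
    "af", "af_bella", "af_nicole", "af_sarah", "af_sky", "am_adam",
    "am_michael", "bf_emma", "bf_isabella", "bm_george", "bm_lewis",
]
dim = (511, 1, 256)  # style_dim from the model metadata
embedding = np.fromfile("voices.bin", dtype=np.float32).reshape(len(speaker_names), *dim)
voices = {name: embedding[i] for i, name in enumerate(speaker_names)}
print(voices["af"].shape)  # (511, 1, 256)

scripts/kokoro/run.sh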
  1 +#!/usr/bin/env bash
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +set -ex
  5 +
  6 +cat > README-new.md <<EOF
  7 +# Introduction
  8 +
  9 +Files in this folder are from
  10 +https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files
  11 +
  12 +Please see also
  13 +https://huggingface.co/hexgrad/Kokoro-82M
  14 +and
  15 +https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
  16 +EOF
  17 +
  18 +files=(
  19 +kokoro-v0_19_hf.onnx
  20 +# kokoro-v0_19.onnx
  21 +# kokoro-quant.onnx
  22 +# kokoro-quant-convinteger.onnx
  23 +voices.json
  24 +)
  25 +
  26 +for f in ${files[@]}; do
  27 +  if [ ! -f ./$f ]; then
  28 +    curl -SL -O https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/$f
  29 +  fi
  30 +done
  31 +
  32 +models=(
  33 +# kokoro-v0_19
  34 +# kokoro-quant
  35 +# kokoro-quant-convinteger
  36 +kokoro-v0_19_hf
  37 +)
  38 +
  39 +for m in ${models[@]}; do
  40 +  ./add-meta-data.py --model $m.onnx --voices ./voices.json
  41 +done
  42 +
  43 +ls -l
  44 +echo "----------"
  45 +ls -lh
  46 +
  47 +for m in ${models[@]}; do
  48 +  ./test.py --model $m.onnx --voices-bin ./voices.bin --tokens ./tokens.txt
  49 +done
  50 +ls -lh
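scripts/kokoro/test.py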
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +"""
  5 +female (7)
  6 +'af', 'af_bella', 'af_nicole', 'af_sarah', 'af_sky',
  7 +'bf_emma', 'bf_isabella',
  8 +
  9 +male (4)
  10 +'am_adam', 'am_michael', 'bm_george', 'bm_lewis'
  11 +"""
  12 +
  13 +import argparse
  14 +import time
  15 +from pathlib import Path
  16 +from typing import Dict, List
  17 +
  18 +import numpy as np
  19 +
  20 +try:
  21 +    from piper_phonemize import phonemize_espeak
  22 +except Exception as ex:
  23 +    raise RuntimeError(
  24 +        f"{ex}\nPlease run\n"
  25 +        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
  26 +    )
  27 +
  28 +import onnxruntime as ort
  29 +import soundfile as sf
  30 +
  31 +
  32 +def get_args():
  33 +    parser = argparse.ArgumentParser()
  34 +    parser.add_argument(
  35 +        "--model",
  36 +        type=str,
  37 +        required=True,
  38 +        help="Path to the model",
  39 +    )
  40 +
  41 +    parser.add_argument(
  42 +        "--voices-bin",
  43 +        type=str,
  44 +        required=True,
  45 +        help="Path to the voices.bin",
  46 +    )
  47 +
  48 +    parser.add_argument(
  49 +        "--tokens",
  50 +        type=str,
  51 +        required=True,
  52 +        help="Path to tokens.txt",
  53 +    )
  54 +    return parser.parse_args()
  55 +
  56 +
  57 +def show(filename):
  58 +    session_opts = ort.SessionOptions()
  59 +    session_opts.log_severity_level = 3
  60 +    sess = ort.InferenceSession(filename, session_opts)
  61 +    for i in sess.get_inputs():
  62 +        print(i)
  63 +
  64 +    print("-----")
  65 +
  66 +    for i in sess.get_outputs():
  67 +        print(i)
  68 +
  69 +
  70 +# NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])
  71 +# NodeArg(name='style', type='tensor(float)', shape=[1, 256])
  72 +# NodeArg(name='speed', type='tensor(float)', shape=[1])
  73 +# -----
  74 +# NodeArg(name='audio', type='tensor(float)', shape=['audio0'])
  75 +
  76 +
  77 +def load_tokens(filename: str) -> Dict[str, int]:
  78 +    ans = dict()
  79 +    with open(filename, encoding="utf-8") as f:
  80 +        for line in f:
  81 +            fields = line.strip().split()
  82 +            if len(fields) == 2:
  83 +                token, idx = fields
  84 +                ans[token] = int(idx)
  85 +            else:
  86 +                assert len(fields) == 1, (len(fields), line)
  87 +                ans[" "] = int(fields[0])
  88 +    return ans
  89 +
  90 +
  91 +def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
  92 +    embedding = (
  93 +        np.fromfile(voices_bin, dtype="uint8")
  94 +        .view(np.float32)
  95 +        .reshape(len(speaker_names), *dim)
  96 +    )
  97 +    print("embedding.shape", embedding.shape)
  98 +    ans = dict()
  99 +    for i in range(len(speaker_names)):
  100 +        ans[speaker_names[i]] = embedding[i]
  101 +
  102 +    return ans
  103 +
  104 +
  105 +class OnnxModel:
  106 +    def __init__(self, model_filename: str, voices_bin: str, tokens: str):
  107 +        session_opts = ort.SessionOptions()
  108 +        session_opts.inter_op_num_threads = 1
  109 +        session_opts.intra_op_num_threads = 1
  110 +
  111 +        self.session_opts = session_opts
  112 +        self.model = ort.InferenceSession(
  113 +            model_filename,
  114 +            sess_options=self.session_opts,
  115 +            providers=["CPUExecutionProvider"],
  116 +        )
  117 +        self.token2id = load_tokens(tokens)
  118 +
  119 +        meta = self.model.get_modelmeta().custom_metadata_map
  120 +        print(meta)
  121 +        dim = list(map(int, meta["style_dim"].split(",")))
  122 +        speaker_names = meta["speaker_names"].split(",")
  123 +
  124 +        self.voices = load_voices(
  125 +            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
  126 +        )
  127 +
  128 +        self.sample_rate = int(meta["sample_rate"])
  129 +
  130 +        print(list(self.voices.keys()))
  131 +        # ['af', 'af_bella', 'af_nicole', 'af_sarah', 'af_sky', 'am_adam',
  132 +        #  'am_michael', 'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
  133 +        # af -> (511, 1, 256)
  134 +        self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1
  135 +
  136 +    def __call__(self, text: str, voice):
  137 +        tokens = phonemize_espeak(text, "en-us")
  138 +        # tokens is List[List[str]]
  139 +        # Each sentence is a List[str]
  140 +        # len(tokens) == number of sentences
  141 +
  142 +        tokens = sum(tokens, [])  # flatten
  143 +        tokens = "".join(tokens)
  144 +
  145 +        tokens = tokens.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace(
  146 +            "kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ"
  147 +        )
  148 +
  149 +        tokens = list(tokens)
  150 +
  151 +        token_ids = [self.token2id[i] for i in tokens]
  152 +        token_ids = token_ids[: self.max_len]
  153 +
  154 +        style = self.voices[voice][len(token_ids)]
  155 +
  156 +        token_ids = [0, *token_ids, 0]
  157 +        token_ids = np.array([token_ids], dtype=np.int64)
  158 +
  159 +        speed = np.array([1.0], dtype=np.float32)
  160 +
  161 +        audio = self.model.run(
  162 +            [
  163 +                self.model.get_outputs()[0].name,
  164 +            ],
  165 +            {
  166 +                self.model.get_inputs()[0].name: token_ids,
  167 +                self.model.get_inputs()[1].name: style,
  168 +                self.model.get_inputs()[2].name: speed,
  169 +            },
  170 +        )[0]
  171 +        return audio
  172 +
  173 +
  174 +def test(model, voice, text) -> np.ndarray:
  175 +    pass
  176 +
  177 +
  178 +def main():
  179 +    args = get_args()
  180 +    print(vars(args))
  181 +    show(args.model)
  182 +
  183 +    # tokens = phonemize_espeak("how are you doing?", "en-us")
  184 +    # [['h', 'ˌ', 'a', 'ʊ', ' ', 'ɑ', 'ː', 'ɹ', ' ', 'j', 'u', 'ː', ' ', 'd', 'ˈ', 'u', 'ː', 'ɪ', 'ŋ', '?']]
  185 +    m = OnnxModel(
  186 +        model_filename=args.model, voices_bin=args.voices_bin, tokens=args.tokens
  187 +    )
  188 +
  189 +    text = (
  190 +        "Today as always, men fall into two groups: slaves and free men."
  191 +        + " Whoever does not have two-thirds of his day for himself, "
  192 +        + "is a slave, whatever he may be: a statesman, a businessman, "
  193 +        + "an official, or a scholar."
  194 +    )
  195 +
  196 +    for i, voice in enumerate(m.voices.keys(), 1):
  197 +        print(f"Testing {i}/{len(m.voices)} - {voice}/{args.model}")
  198 +
  199 +        start = time.time()
  200 +        audio = m(text, voice=voice)
  201 +        end = time.time()
  202 +
  203 +        elapsed_seconds = end - start
  204 +        audio_duration = len(audio) / m.sample_rate
  205 +        real_time_factor = elapsed_seconds / audio_duration
  206 +
  207 +        filename = f"{Path(args.model).stem}-{voice}.wav"
  208 +        sf.write(
  209 +            filename,
  210 +            audio,
  211 +            samplerate=m.sample_rate,
  212 +            subtype="PCM_16",
  213 +        )
  214 +        print(f" Saved to {filename}")
  215 +        print(f" Elapsed seconds: {elapsed_seconds:.3f}")
  216 +        print(f" Audio duration in seconds: {audio_duration:.3f}")
  217 +        print(
  218 +            f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}"
  219 +        )
  220 +
  221 +
  222 +if __name__ == "__main__":
  223 +    main()
  1 #!/usr/bin/env python3
  2 -# This script export ZH_EN TTS model, which supports both Chinese and English.
  2 +# This script exports ZH_EN TTS model, which supports both Chinese and English.
  3 # This model has only 1 speaker.
  4
  5 from typing import Any, Dict