Fangjun Kuang
Committed by GitHub

Export Kokoro 1.0 to sherpa-onnx (#1788)

... ... @@ -4,6 +4,7 @@ on:
push:
branches:
- export-kokoro
- kokoro-1.0-2
workflow_dispatch:
... ... @@ -14,12 +15,13 @@ concurrency:
jobs:
export-kokoro-to-onnx:
if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
name: export kokoro
name: export kokoro ${{ matrix.version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
version: ["0.19", "1.0"]
python-version: ["3.10"]
steps:
... ... @@ -33,7 +35,7 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
- name: Run
shell: bash
... ... @@ -42,9 +44,16 @@ jobs:
tar xf espeak-ng-data.tar.bz2
rm espeak-ng-data.tar.bz2
cd scripts/kokoro
./run.sh
- name: Collect results
v=${{ matrix.version }}
if [[ $v = "0.19" ]]; then
./run.sh
elif [[ $v == "1.0" ]]; then
cd v1.0
./run.sh
fi
- name: Collect results ${{ matrix.version }}
if: matrix.version == '0.19'
shell: bash
run: |
src=scripts/kokoro
... ... @@ -53,7 +62,7 @@ jobs:
mkdir $d
cp -a LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx
cp -v $src/kokoro-v0_19.onnx $d/model.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/README-new.md $d/README.md
... ... @@ -61,9 +70,31 @@ jobs:
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -h $.tar.bz2
ls -lh $d.tar.bz2
# Package the exported v1.0 model; runs only for the "1.0" matrix entry.
# Stages LICENSE, espeak-ng-data, the ONNX model (copied as model.onnx),
# voices.bin, tokens.txt, every lexicon*.txt and the README into
# kokoro-multi-lang-v1_0/, archives that directory as
# kokoro-multi-lang-v1_0.tar.bz2 and then removes the staging directory.
- name: Collect results ${{ matrix.version }}
if: matrix.version == '1.0'
shell: bash
run: |
src=scripts/kokoro/v1.0
d=kokoro-multi-lang-v1_0
mkdir $d
cp -a LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro.onnx $d/model.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/lexicon*.txt $d/
cp -v $src/README.md $d/README.md
ls -lh $d/
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -lh $d.tar.bz2
- name: Publish to huggingface
- name: Publish to huggingface ${{ matrix.version }}
if: matrix.version == '0.19'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
... ... @@ -92,9 +123,9 @@ jobs:
cp -a ../espeak-ng-data ./
mkdir -p test_wavs
cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx
cp -v ../scripts/kokoro/kokoro-v0_19.onnx ./model.onnx
cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/
cp -v ../scripts/kokoro/kokoro-v0_19-*.wav ./test_wavs/
cp -v ../scripts/kokoro/tokens.txt .
cp -v ../scripts/kokoro/voices.bin .
... ... @@ -111,6 +142,55 @@ jobs:
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
# Upload the v1.0 files to the Hugging Face repository
# csukuangfj/kokoro-multi-lang-v1_0; runs only for the "1.0" matrix entry.
# The command is wrapped in nick-fields/retry (up to 20 attempts, 200 seconds
# each). It clones the repository with GIT_LFS_SKIP_SMUDGE=1, empties the
# working tree, copies in espeak-ng-data, the model (as model.onnx),
# tokens.txt, voices.bin, the lexicon files, README.md and LICENSE, tracks
# cmn_dict, ru_dict, *.wav, lexicon*.txt and *.onnx with git-lfs, then
# commits and pushes to main.
# NOTE(review): the final "git push ... || true" makes a failed push exit
# successfully, so a push failure does not trigger another retry attempt --
# confirm this best-effort behaviour is intended.
- name: Publish to huggingface ${{ matrix.version }}
if: matrix.version == '1.0'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 huggingface
cd huggingface
rm -rf ./*
git fetch
git pull
git lfs track "cmn_dict"
git lfs track "ru_dict"
git lfs track "*.wav"
git lfs track "lexicon*.txt"
cp -a ../espeak-ng-data ./
cp -v ../scripts/kokoro/v1.0/kokoro.onnx ./model.onnx
cp -v ../scripts/kokoro/v1.0/tokens.txt .
cp -v ../scripts/kokoro/v1.0/voices.bin .
cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
cp -v ../scripts/kokoro/v1.0/README.md ./README.md
cp -v ../LICENSE ./
git lfs track "*.onnx"
git add .
ls -lh
git status
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true
- name: Release
uses: svenstaro/upload-release-action@v2
with:
... ...
... ... @@ -128,3 +128,7 @@ harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md
matcha-icefall-zh-baker
matcha-icefall-en_US-ljspeech
kokoro-en-v0_19
*.pt
lexicon.txt
us_gold.json
us_silver.json
... ...
... ... @@ -69,6 +69,14 @@ def main():
for k in keys:
f.write(voices[k].tobytes())
speaker2id_str = ""
id2speaker_str = ""
sep = ""
for i, s in enumerate(keys):
speaker2id_str += f"{sep}{s}->{i}"
id2speaker_str += f"{sep}{i}->{s}"
sep = ","
meta_data = {
"model_type": "kokoro",
"language": "English",
... ... @@ -78,6 +86,8 @@ def main():
"voice": "en-us",
"style_dim": ",".join(map(str, voices[keys[0]].shape)),
"n_speakers": len(keys),
"speaker2id": speaker2id_str,
"id2speaker": id2speaker_str,
"speaker_names": ",".join(keys),
"model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
"see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
... ...
... ... @@ -16,8 +16,8 @@ https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
EOF
files=(
kokoro-v0_19_hf.onnx
# kokoro-v0_19.onnx
# kokoro-v0_19_hf.onnx
kokoro-v0_19.onnx
# kokoro-quant.onnx
# kokoro-quant-convinteger.onnx
voices.json
... ... @@ -30,14 +30,14 @@ for f in ${files[@]}; do
done
models=(
# kokoro-v0_19
kokoro-v0_19
# kokoro-quant
# kokoro-quant-convinteger
kokoro-v0_19_hf
# kokoro-v0_19_hf
)
for m in ${models[@]}; do
./add-meta-data.py --model $m.onnx --voices ./voices.json
./add_meta_data.py --model $m.onnx --voices ./voices.json
done
ls -l
... ...
config.json
*.json
*.txt
.add-meta-data.done
voices
... ...
# Introduction
This directory contains the scripts for exporting Kokoro v1.0 to sherpa-onnx.
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import argparse
import json
from pathlib import Path
import numpy as np
import onnx
import torch
from generate_voices_bin import speaker2id
def main():
    """Replace the metadata stored in ./kokoro.onnx and save it in place.

    The style shape is taken from ./voices/af_alloy.pt and the speaker
    tables are derived from ``speaker2id``. Every existing metadata entry
    of the model is removed before the new entries are added. The old and
    the new metadata are printed for inspection.
    """
    onnx_model = onnx.load("./kokoro.onnx")

    # A single voice file is loaded only to read the shape of a style tensor.
    style = torch.load("./voices/af_alloy.pt", weights_only=True, map_location="cpu")

    speaker2id_str = ",".join(f"{name}->{idx}" for name, idx in speaker2id.items())
    id2speaker_str = ",".join(f"{idx}->{name}" for name, idx in speaker2id.items())

    # Insertion order is kept because it is the order in which the entries
    # are appended to the model below.
    meta_data = dict()
    meta_data["model_type"] = "kokoro"
    meta_data["language"] = "English"
    meta_data["has_espeak"] = 1
    meta_data["sample_rate"] = 24000
    meta_data["version"] = 2
    meta_data["voice"] = "en-us"
    meta_data["style_dim"] = ",".join(str(d) for d in style.shape)
    meta_data["n_speakers"] = len(speaker2id)
    meta_data["id2speaker"] = id2speaker_str
    meta_data["speaker2id"] = speaker2id_str
    meta_data["speaker_names"] = ",".join(str(name) for name in speaker2id.keys())
    meta_data["model_url"] = "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files"
    meta_data["see_also"] = "https://huggingface.co/spaces/hexgrad/Kokoro-TTS"
    meta_data["see_also_2"] = "https://huggingface.co/hexgrad/Kokoro-82M"
    meta_data["maintainer"] = "k2-fsa"
    meta_data["comment"] = "This is Kokoro v1.0, a multilingual TTS model, supporting English, Chinese, French, Japanese etc."

    print(onnx_model.metadata_props)

    # Drop whatever metadata the downloaded model shipped with.
    while len(onnx_model.metadata_props) > 0:
        onnx_model.metadata_props.pop()

    for name, content in meta_data.items():
        entry = onnx_model.metadata_props.add()
        entry.key = name
        entry.value = str(content)

    print("--------------------")
    print(onnx_model.metadata_props)

    onnx.save(onnx_model, "./kokoro.onnx")
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import json
from pypinyin import phrases_dict, pinyin_dict
from misaki import zh
from typing import List, Tuple
def generate_english_lexicon(kind: str):
    """Build the English lexicon of one accent from the misaki dictionaries.

    Reads ``./{kind}_gold.json`` and ``./{kind}_silver.json`` from the
    current directory. All words are lower-cased. For dictionary values that
    are dicts, the ``"DEFAULT"`` pronunciation is used.

    Priority, highest first: user-defined words, gold entries, silver
    entries. The priority is applied to the lower-cased word, so a
    silver-only spelling such as "Abc" never replaces the gold
    pronunciation of "abc".

    Args:
      kind:
        Either "us" or "gb".
    Returns:
      A list of (lower-cased word, phoneme string) tuples. The user-defined
      words come first, followed by the dictionary words.
    """
    assert kind in ("us", "gb"), kind
    # If you want to add new words, please add them to
    # the user_defined dict.
    user_defined = {
        "Kokoro": "kˈOkəɹO",
        "Misaki": "misˈɑki",
    }

    user_defined_lower = {k.lower(): v for k, v in user_defined.items()}

    with open(f"./{kind}_gold.json", encoding="utf-8") as f:
        gold = json.load(f)

    with open(f"./{kind}_silver.json", encoding="utf-8") as f:
        silver = json.load(f)

    # Words in gold have a higher priority than those in silver, so
    # gold is placed after silver below and wins for identical keys.
    english = {**silver, **gold}

    lexicon = dict()
    # Lower-cased words whose current pronunciation came from gold.
    taken_from_gold = set()
    for k, v in english.items():
        k_lower = k.lower()

        if k_lower in user_defined_lower:
            print(f"{k} already exist in the user defined dict. Skip adding")
            continue

        if isinstance(v, str):
            phones = v
        else:
            assert isinstance(v, dict), (k, v)
            assert "DEFAULT" in v, (k, v)
            phones = v["DEFAULT"]

        is_gold = k in gold
        if not is_gold and k_lower in taken_from_gold:
            # A differently-cased silver spelling must not override gold.
            continue

        lexicon[k_lower] = phones
        if is_gold:
            taken_from_gold.add(k_lower)

    return list(user_defined_lower.items()) + list(lexicon.items())
def generate_chinese_lexicon():
    """Build the Chinese lexicon from the pypinyin dictionaries.

    Every code point of ``pinyin_dict.pinyin_dict`` in the range
    0x4E00..0x9FFF becomes a single-character entry, followed by every key
    of ``phrases_dict.phrases_dict``. Each entry is converted with
    ``zh.ZHG2P``.

    Returns:
      A list of (word, tokens) tuples, single characters first.
    """
    g2p = zh.ZHG2P()

    characters = [
        chr(code_point)
        for code_point in pinyin_dict.pinyin_dict
        if 0x4E00 <= code_point <= 0x9FFF
    ]

    entries = [(ch, g2p(ch)) for ch in characters]
    entries.extend((phrase, g2p(phrase)) for phrase in phrases_dict.phrases_dict)

    return entries
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write ``lexicon`` to ``filename`` as UTF-8 text.

    Each entry produces one line: the word, a space, and then the
    characters of its phoneme string separated by single spaces.
    """
    with open(filename, "w", encoding="utf-8") as out:
        # A generator keeps the writes interleaved with the iteration.
        out.writelines(f"{entry} {' '.join(symbols)}\n" for entry, symbols in lexicon)
def main():
    """Generate the US-English, GB-English and Chinese lexicon files.

    All three lexicons are built first; only then are they written to
    lexicon-us-en.txt, lexicon-gb-en.txt and lexicon-zh.txt.
    """
    outputs = [
        ("lexicon-us-en.txt", generate_english_lexicon("us")),
        ("lexicon-gb-en.txt", generate_english_lexicon("gb")),
        ("lexicon-zh.txt", generate_chinese_lexicon()),
    ]

    for filename, entries in outputs:
        save(filename, entries)
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import json
def main():
    """Convert the ``vocab`` table of ./config.json into ./tokens.txt.

    One line is written per vocabulary entry in the form "<token> <id>".
    The vocabulary contains non-ASCII phoneme symbols, so both files are
    opened explicitly as UTF-8 rather than with the platform default
    encoding.
    """
    with open("config.json", encoding="utf-8") as f:
        config = json.load(f)

    vocab = config["vocab"]

    with open("tokens.txt", "w", encoding="utf-8") as f:
        for k, i in vocab.items():
            f.write(f"{k} {i}\n")
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import torch
from pathlib import Path
# Integer speaker id -> Kokoro voice name.
# main() in this file iterates this dict in insertion order when it writes
# voices.bin, so the ids below are also the positions of the voices inside
# voices.bin. The names are the same ones listed in the `voices` array of
# run.sh.
id2speaker = {
0: "af_alloy",
1: "af_aoede",
2: "af_bella",
3: "af_heart",
4: "af_jessica",
5: "af_kore",
6: "af_nicole",
7: "af_nova",
8: "af_river",
9: "af_sarah",
10: "af_sky",
11: "am_adam",
12: "am_echo",
13: "am_eric",
14: "am_fenrir",
15: "am_liam",
16: "am_michael",
17: "am_onyx",
18: "am_puck",
19: "am_santa",
20: "bf_alice",
21: "bf_emma",
22: "bf_isabella",
23: "bf_lily",
24: "bm_daniel",
25: "bm_fable",
26: "bm_george",
27: "bm_lewis",
28: "ef_dora",
29: "em_alex",
30: "ff_siwis",
31: "hf_alpha",
32: "hf_beta",
33: "hm_omega",
34: "hm_psi",
35: "if_sara",
36: "im_nicola",
37: "jf_alpha",
38: "jf_gongitsune",
39: "jf_nezumi",
40: "jf_tebukuro",
41: "jm_kumo",
42: "pf_dora",
43: "pm_alex",
44: "pm_santa",
45: "zf_xiaobei",
46: "zf_xiaoni",
47: "zf_xiaoxiao",
48: "zf_xiaoyi",
49: "zm_yunjian",
50: "zm_yunxi",
51: "zm_yunxia",
52: "zm_yunyang",
}
# Inverse table (voice name -> id); add_meta_data.py imports it to build the
# speaker metadata of the model.
speaker2id = {speaker: idx for idx, speaker in id2speaker.items()}
def main():
    """Concatenate all voice style tensors into ./voices.bin.

    The voices are written in the order of ``id2speaker``. Nothing is done
    if ./voices.bin already exists.

    run.sh downloads the voice files to ``voices/<name>.pt``, so that
    location is tried first; ``<name>.pt`` in the current directory is kept
    as a fallback for existing setups.

    The data is first written to ./voices.bin.tmp and renamed at the end, so
    a failure half-way does not leave a truncated ./voices.bin that later
    runs would skip.
    """
    output = Path("./voices.bin")
    if output.is_file():
        print("./voices.bin exists - skip")
        return

    partial = Path("./voices.bin.tmp")
    with open(partial, "wb") as f:
        for _, speaker in id2speaker.items():
            voice_file = Path("voices") / f"{speaker}.pt"
            if not voice_file.is_file():
                voice_file = Path(f"{speaker}.pt")

            m = torch.load(
                str(voice_file),
                weights_only=True,
                map_location="cpu",
            ).numpy()
            # m.shape (510, 1, 256)

            f.write(m.tobytes())

    partial.replace(output)
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex
# Download the model and its config unless they are already present.
# --fail makes curl exit with an error on an HTTP error response instead of
# saving the server's error page under the target name; such a file would
# pass the "-f" checks below on the next run.
if [ ! -f kokoro.onnx ]; then
  # see https://github.com/taylorchu/kokoro-onnx/releases
  curl --fail -SL -O https://github.com/taylorchu/kokoro-onnx/releases/download/v0.2.0/kokoro.onnx
fi

if [ ! -f config.json ]; then
  # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
  curl --fail -SL -O https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/config.json
fi
# see https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L83
# and
# https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
#
# af -> American female
# am -> American male
# bf -> British female
# bm -> British male
#
# NOTE(review): the remaining prefixes (ef, em, ff, hf, hm, if, im, jf, jm,
# pf, pm, zf, zm) presumably follow the same <language><gender> pattern --
# confirm against the upstream voice list.
#
# Names of the voice files to download. The same names, in the same order,
# are listed in generate_voices_bin.py.
voices=(
af_alloy
af_aoede
af_bella
af_heart
af_jessica
af_kore
af_nicole
af_nova
af_river
af_sarah
af_sky
am_adam
am_echo
am_eric
am_fenrir
am_liam
am_michael
am_onyx
am_puck
am_santa
bf_alice
bf_emma
bf_isabella
bf_lily
bm_daniel
bm_fable
bm_george
bm_lewis
ef_dora
em_alex
ff_siwis
hf_alpha
hf_beta
hm_omega
hm_psi
if_sara
im_nicola
jf_alpha
jf_gongitsune
jf_nezumi
jf_tebukuro
jm_kumo
pf_dora
pm_alex
pm_santa
zf_xiaobei # Northeastern Mandarin (Dongbei) dialect
zf_xiaoni
zf_xiaoxiao
zf_xiaoyi
zm_yunjian
zm_yunxi
zm_yunxia
zm_yunyang
)
# Download every voice listed in the voices array into ./voices/.
mkdir -p voices

for v in "${voices[@]}"; do
  if [ ! -f "voices/$v.pt" ]; then
    # --fail: exit with an error on an HTTP error response instead of saving
    # the error page. The file is downloaded under a temporary name and
    # renamed afterwards, so an interrupted download is not mistaken for a
    # complete voice file on the next run.
    curl --fail -SL --output "voices/$v.pt.tmp" "https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/voices/$v.pt"
    mv "voices/$v.pt.tmp" "voices/$v.pt"
  fi
done
# add_meta_data.py rewrites kokoro.onnx in place; the marker file records
# that it has already been run in this directory.
marker=./.add-meta-data.done
if ! test -f "$marker"; then
  python3 ./add_meta_data.py
  touch "$marker"
fi
# Download the misaki pronunciation dictionaries read by
# generate_lexicon.py. --fail makes curl exit with an error on an HTTP error
# response instead of saving the error page as the JSON file.
for name in us_gold us_silver gb_gold gb_silver; do
  if [ ! -f "$name.json" ]; then
    curl --fail -SL -O "https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/$name.json"
  fi
done
# Generate the derived files that are still missing, then run the test.
if [ ! -f ./tokens.txt ]; then
  ./generate_tokens.py
fi

# generate_lexicon.py writes lexicon-us-en.txt, lexicon-gb-en.txt and
# lexicon-zh.txt (there is no lexicon.txt), so check for those three files.
if [ ! -f ./lexicon-us-en.txt ] || [ ! -f ./lexicon-gb-en.txt ] || [ ! -f ./lexicon-zh.txt ]; then
  ./generate_lexicon.py
fi

if [ ! -f ./voices.bin ]; then
  ./generate_voices_bin.py
fi

./test.py
ls -lh
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import re
import time
from typing import Dict, List
import jieba
import numpy as np
import onnxruntime as ort
import soundfile as sf
import torch
from misaki import zh
try:
from piper_phonemize import phonemize_espeak
except Exception as ex:
raise RuntimeError(
f"{ex}\nPlease run\n"
"pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
)
def show(filename):
    """Print the inputs and then the outputs of the ONNX model ``filename``.

    The two lists are separated by a line containing "-----". The session is
    created with log severity level 3.
    """
    options = ort.SessionOptions()
    options.log_severity_level = 3

    session = ort.InferenceSession(filename, options)

    for node in session.get_inputs():
        print(node)

    print("-----")

    for node in session.get_outputs():
        print(node)
"""
NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
NodeArg(name='style', type='tensor(float)', shape=[1, 256])
NodeArg(name='speed', type='tensor(float)', shape=[1])
-----
NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
"""
def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
    """Load the per-speaker style arrays stored in ``voices_bin``.

    The file is read as raw bytes, reinterpreted as float32 and reshaped to
    ``(len(speaker_names), *dim)``.

    Args:
      speaker_names:
        Speaker names in the order in which the voices are stored.
      dim:
        Shape of the data of a single speaker.
      voices_bin:
        Path to the binary file.
    Returns:
      A dict mapping each speaker name to its slice of the loaded array.
    """
    raw = np.fromfile(voices_bin, dtype="uint8")
    embedding = raw.view(np.float32).reshape(len(speaker_names), *dim)
    print("embedding.shape", embedding.shape)

    return {name: embedding[idx] for idx, name in enumerate(speaker_names)}
def load_tokens(filename: str) -> Dict[str, int]:
    """Read a "<token> <id>" table.

    A line that contains only an id belongs to the space token, whose name
    is lost when the line is stripped. Blank lines are ignored.

    Args:
      filename:
        Path to tokens.txt (UTF-8).
    Returns:
      A dict mapping each token to its integer id.
    """
    ans = dict()
    with open(filename, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank line, e.g. at the end of the file.
                continue

            if len(fields) == 2:
                token, idx = fields
                ans[token] = int(idx)
            else:
                assert len(fields) == 1, (len(fields), line)
                ans[" "] = int(fields[0])
    return ans
def load_lexicon(filename: str) -> Dict[str, str]:
    """Read one or more lexicon files into a single table.

    Each useful line has the form "<word> <p1> <p2> ...". The phoneme
    symbols are joined without separators. When a word appears more than
    once, the last occurrence wins. Blank lines and lines with a word but no
    phonemes are skipped, so that such words take the fallback path of the
    caller instead of raising here.

    Args:
      filename:
        A comma-separated list of lexicon file names.
    Returns:
      A dict mapping each word to its phoneme string.
    """
    ans = dict()
    for lexicon in filename.split(","):
        print(lexicon)
        with open(lexicon, encoding="utf-8") as f:
            for line in f:
                fields = line.strip().split(" ", maxsplit=1)
                if len(fields) != 2:
                    continue

                w, tokens = fields
                ans[w] = "".join(tokens.split())
    return ans
class OnnxModel:
    """Run the exported Kokoro model with onnxruntime on the CPU.

    The text front end is lexicon based: ASCII words are looked up in the
    lexicon and fall back to espeak-ng, Chinese text is segmented with jieba
    and looked up word by word, then character by character.
    """

    # Characters that are passed to the model as punctuation tokens.
    _PUNCTUATIONS = ';:,.!?-…()"“”'

    def __init__(self, model_filename: str, tokens: str, lexicon: str, voices_bin: str):
        """
        Args:
          model_filename:
            Path to the ONNX model. Its metadata must contain style_dim,
            speaker_names and sample_rate.
          tokens:
            Path to tokens.txt.
          lexicon:
            Comma-separated list of lexicon files.
          voices_bin:
            Path to voices.bin.
        """
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts
        self.model = ort.InferenceSession(
            model_filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )
        self.token2id = load_tokens(tokens)
        self.word2tokens = load_lexicon(lexicon)

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)
        dim = list(map(int, meta["style_dim"].split(",")))
        speaker_names = meta["speaker_names"].split(",")

        self.voices = load_voices(
            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
        )
        # The sample rate comes from the model metadata; it is not
        # overridden with a hard-coded value.
        self.sample_rate = int(meta["sample_rate"])

        print(list(self.voices.keys()))

        # The style of an utterance is selected by its number of tokens, so
        # the first dimension of a voice bounds the usable length.
        self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1

    def _english_word_to_tokens(self, w: str) -> str:
        """Convert one whitespace-delimited ASCII word to a token string.

        Leading punctuation marks are emitted one by one, each followed by a
        space. All trailing punctuation marks are split off before the
        lookup, so that "ok?!" is looked up as "ok". A word that is not in
        the lexicon is converted with espeak-ng, with or without trailing
        punctuation.
        """
        ans = ""
        while w and w[0] in self._PUNCTUATIONS:
            ans += w[0] + " "
            w = w[1:]

        if not w:
            return ans

        # w[0] is not a punctuation mark here, so stem is never empty.
        stem = w.rstrip(self._PUNCTUATIONS)
        trailing = w[len(stem):]

        if stem in self.word2tokens:
            ans += self.word2tokens[stem]
        else:
            print(f"Use espeak-ng for word {stem}")
            ans += "".join(phonemize_espeak(stem, "en-us")[0])

        return ans + trailing + " "

    def _chinese_to_tokens(self, t: str) -> str:
        """Convert a run of Chinese characters to a token string.

        Words produced by jieba are looked up first. Unknown words are looked
        up character by character and unknown characters are skipped.
        """
        ans = ""
        for w in jieba.cut(t):
            if w in self.word2tokens:
                ans += self.word2tokens[w]
                continue

            for i in w:
                if i in self.word2tokens:
                    ans += self.word2tokens[i]
                else:
                    print(f"skip {i}")
        return ans

    def __call__(self, text: str, voice: str):
        """Synthesize ``text`` with the speaker ``voice``.

        Args:
          text:
            Input text. It is lower-cased before processing. Only runs of
            ASCII characters and of characters in U+4E00..U+9FFF are used.
          voice:
            A speaker name from the model metadata.
        Returns:
          The first output of the model.
        """
        text = text.lower()

        tokens = ""

        for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text):
            if ord(t[0]) < 0x7F:
                for w in t.split():
                    tokens += self._english_word_to_tokens(w)
            else:
                # Chinese
                tokens += self._chinese_to_tokens(t)

        token_ids = [self.token2id[i] for i in tokens]
        token_ids = token_ids[: self.max_len]

        # The style is indexed by the number of tokens before padding.
        style = self.voices[voice][len(token_ids)]

        # The sequence is padded with id 0 on both sides.
        token_ids = [0, *token_ids, 0]
        token_ids = np.array([token_ids], dtype=np.int64)

        speed = np.array([1.0], dtype=np.float32)

        audio = self.model.run(
            [
                self.model.get_outputs()[0].name,
            ],
            {
                self.model.get_inputs()[0].name: token_ids,
                self.model.get_inputs()[1].name: style,
                self.model.get_inputs()[2].name: speed,
            },
        )[0]
        return audio
def main():
    """Synthesize one mixed Chinese/English sentence and report the timing.

    The audio is written as a 16-bit PCM wave file named after the voice,
    and the elapsed time, the audio duration and the real-time factor are
    printed.
    """
    model = OnnxModel(
        model_filename="./kokoro.onnx",
        tokens="./tokens.txt",
        lexicon="./lexicon-gb-en.txt,./lexicon-zh.txt",
        voices_bin="./voices.bin",
    )

    sentence = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?"
    sentence = sentence.lower()

    voice = "bf_alice"

    begin = time.time()
    samples = model(sentence, voice=voice)
    elapsed_seconds = time.time() - begin

    audio_duration = len(samples) / model.sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    filename = f"kokoro_v1.0_{voice}_zh_en.wav"
    sf.write(filename, samples, samplerate=model.sample_rate, subtype="PCM_16")

    for message in (
        f" Saved to {filename}",
        f" Elapsed seconds: {elapsed_seconds:.3f}",
        f" Audio duration in seconds: {audio_duration:.3f}",
        f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}",
    ):
        print(message)
if __name__ == "__main__":
main()
... ...