Fangjun Kuang
Committed by GitHub

Refactor kokoro export (#2302)

- generate samples for https://k2-fsa.github.io/sherpa/onnx/tts/all/
- provide an int8 model for kokoro v0.19: kokoro-int8-en-v0_19.tar.bz2
@@ -3,7 +3,7 @@ name: export-kokoro-to-onnx @@ -3,7 +3,7 @@ name: export-kokoro-to-onnx
3 on: 3 on:
4 push: 4 push:
5 branches: 5 branches:
6 - - fix-export-kokoro-1.0-2 6 + - refactor-kokoro-2
7 7
8 workflow_dispatch: 8 workflow_dispatch:
9 9
@@ -34,24 +34,94 @@ jobs: @@ -34,24 +34,94 @@ jobs:
34 - name: Install Python dependencies 34 - name: Install Python dependencies
35 shell: bash 35 shell: bash
36 run: | 36 run: |
37 - pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch 37 + pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch sherpa-onnx
38 38
39 - name: Run 39 - name: Run
  40 + env:
  41 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
40 shell: bash 42 shell: bash
41 run: | 43 run: |
42 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2 44 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
43 tar xf espeak-ng-data.tar.bz2 45 tar xf espeak-ng-data.tar.bz2
44 rm espeak-ng-data.tar.bz2 46 rm espeak-ng-data.tar.bz2
  47 + cp -a ./espeak-ng-data ./scripts/kokoro/v0.19
  48 + cp -a ./espeak-ng-data ./scripts/kokoro/v1.0
  49 + cp -a ./espeak-ng-data ./scripts/kokoro/v1.1-zh
  50 +
  51 + git config --global user.email "csukuangfj@gmail.com"
  52 + git config --global user.name "Fangjun Kuang"
  53 +
45 cd scripts/kokoro 54 cd scripts/kokoro
46 v=${{ matrix.version }} 55 v=${{ matrix.version }}
47 if [[ $v = "0.19" ]]; then 56 if [[ $v = "0.19" ]]; then
  57 + cd v0.19
48 ./run.sh 58 ./run.sh
  59 +
  60 + if false; then
  61 + # generate samples
  62 + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
  63 + mkdir -p hf/kokoro/v0.19/mp3
  64 + ./generate_samples.py
  65 + pushd hf
  66 + git pull
  67 + git add .
  68 + git commit -m 'add kokoro samples for v0.19'
  69 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
  70 + popd
  71 + rm -rf hf
  72 + fi
  73 +
49 elif [[ $v == "1.0" ]]; then 74 elif [[ $v == "1.0" ]]; then
50 cd v1.0 75 cd v1.0
51 ./run.sh 76 ./run.sh
  77 +
  78 + if false; then
  79 + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
  80 + mkdir -p hf/kokoro/v1.0/mp3
  81 +
  82 + curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
  83 + tar xvf dict.tar.bz2
  84 + rm dict.tar.bz2
  85 +
  86 + curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
  87 + curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
  88 + curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
  89 +
  90 + ./generate_samples.py
  91 + pushd hf
  92 + git pull
  93 + git add .
  94 + git commit -m 'add kokoro samples for v1.0'
  95 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
  96 + popd
  97 + rm -rf hf
  98 + fi
  99 +
52 elif [[ $v == "1.1-zh" ]]; then 100 elif [[ $v == "1.1-zh" ]]; then
53 cd v1.1-zh 101 cd v1.1-zh
54 ./run.sh 102 ./run.sh
  103 +
  104 + if false; then
  105 + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
  106 + mkdir -p hf/kokoro/v1.1-zh/mp3
  107 +
  108 + curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
  109 + tar xvf dict.tar.bz2
  110 + rm dict.tar.bz2
  111 +
  112 + curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
  113 + curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
  114 + curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
  115 +
  116 + ./generate_samples.py
  117 + pushd hf
  118 + git pull
  119 + git add .
  120 + git commit -m 'add kokoro samples for v1.1-zh'
  121 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
  122 + popd
  123 + rm -rf hf
  124 + fi
55 else 125 else
56 echo "Unknown version $v" 126 echo "Unknown version $v"
57 exit 1 127 exit 1
@@ -61,19 +131,39 @@ jobs: @@ -61,19 +131,39 @@ jobs:
61 if: matrix.version == '0.19' 131 if: matrix.version == '0.19'
62 shell: bash 132 shell: bash
63 run: | 133 run: |
64 - src=scripts/kokoro 134 + src=scripts/kokoro/v0.19
65 135
66 d=kokoro-en-v0_19 136 d=kokoro-en-v0_19
  137 +
67 mkdir $d 138 mkdir $d
68 cp -a LICENSE $d/LICENSE 139 cp -a LICENSE $d/LICENSE
69 cp -a espeak-ng-data $d/ 140 cp -a espeak-ng-data $d/
70 - cp -v $src/kokoro-v0_19.onnx $d/model.onnx 141 + cp -v $src/model.onnx $d/model.onnx
71 cp -v $src/voices.bin $d/ 142 cp -v $src/voices.bin $d/
72 cp -v $src/tokens.txt $d/ 143 cp -v $src/tokens.txt $d/
73 - cp -v $src/README-new.md $d/README.md 144 + cp -v $src/../README.md $d/README.md
  145 + ls -lh $d/
  146 + tar cjfv $d.tar.bz2 $d
  147 +
  148 + ls -lh $d.tar.bz2
  149 +
  150 + - name: Collect results 0.19 (int8)
  151 + if: matrix.version == '0.19'
  152 + shell: bash
  153 + run: |
  154 + src=scripts/kokoro/v0.19
  155 +
  156 + d=kokoro-int8-en-v0_19
  157 +
  158 + mkdir $d
  159 + cp -a LICENSE $d/LICENSE
  160 + cp -a espeak-ng-data $d/
  161 + cp -v $src/model.int8.onnx $d/model.int8.onnx
  162 + cp -v $src/voices.bin $d/
  163 + cp -v $src/tokens.txt $d/
  164 + cp -v $src/../README.md $d/README.md
74 ls -lh $d/ 165 ls -lh $d/
75 tar cjfv $d.tar.bz2 $d 166 tar cjfv $d.tar.bz2 $d
76 - rm -rf $d  
77 167
78 ls -lh $d.tar.bz2 168 ls -lh $d.tar.bz2
79 169
@@ -219,33 +309,30 @@ jobs: @@ -219,33 +309,30 @@ jobs:
219 git config --global user.email "csukuangfj@gmail.com" 309 git config --global user.email "csukuangfj@gmail.com"
220 git config --global user.name "Fangjun Kuang" 310 git config --global user.name "Fangjun Kuang"
221 311
222 - rm -rf huggingface 312 + dirs=(
  313 + kokoro-en-v0_19
  314 + # kokoro-int8-en-v0_19
  315 + )
  316 +
223 export GIT_LFS_SKIP_SMUDGE=1 317 export GIT_LFS_SKIP_SMUDGE=1
224 export GIT_CLONE_PROTECTION_ACTIVE=false 318 export GIT_CLONE_PROTECTION_ACTIVE=false
  319 + for d in ${dirs[@]}; do
  320 + rm -rf huggingface
225 321
226 git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 huggingface 322 git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 huggingface
227 cd huggingface 323 cd huggingface
228 rm -rf ./* 324 rm -rf ./*
229 - git fetch  
230 - git pull  
231 325
232 - git lfs track "cmn_dict"  
233 - git lfs track "ru_dict"  
234 - git lfs track "*.wav"  
235 -  
236 - cp -a ../espeak-ng-data ./  
237 - mkdir -p test_wavs  
238 -  
239 - cp -v ../scripts/kokoro/kokoro-v0_19.onnx ./model.onnx 326 + git lfs track "*.onnx"
  327 + git lfs track af_dict
  328 + git lfs track ar_dict
  329 + git lfs track cmn_dict
  330 + git lfs track da_dict en_dict fa_dict hu_dict ia_dict it_dict lb_dict phondata ru_dict ta_dict
  331 + git lfs track ur_dict yue_dict
240 332
241 - cp -v ../scripts/kokoro/kokoro-v0_19-*.wav ./test_wavs/  
242 333
243 - cp -v ../scripts/kokoro/tokens.txt .  
244 - cp -v ../scripts/kokoro/voices.bin .  
245 - cp -v ../scripts/kokoro/README-new.md ./README.md  
246 - cp -v ../LICENSE ./ 334 + cp -a ../$d ./
247 335
248 - git lfs track "*.onnx"  
249 git add . 336 git add .
250 337
251 ls -lh 338 ls -lh
@@ -254,6 +341,7 @@ jobs: @@ -254,6 +341,7 @@ jobs:
254 341
255 git commit -m "add models" 342 git commit -m "add models"
256 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true 343 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
  344 + done
257 345
258 - name: Publish to huggingface 1.0 float32 346 - name: Publish to huggingface 1.0 float32
259 if: matrix.version == '1.0' 347 if: matrix.version == '1.0'
  1 +espeak-ng-data
1 voices.json 2 voices.json
2 voices.bin 3 voices.bin
3 README-new.md 4 README-new.md
1 # Introduction 1 # Introduction
2 2
3 -This folder contains scripts for adding meta data to models  
4 -from https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files  
5 -  
6 -See also  
7 -https://huggingface.co/hexgrad/Kokoro-82M/tree/main 3 +Please see also
  4 +https://huggingface.co/hexgrad/Kokoro-82M
8 and 5 and
9 -https://huggingface.co/spaces/hexgrad/Kokoro-TTS  
10 - 6 +https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
1 -#!/usr/bin/env bash  
2 -# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)  
3 -  
4 -set -ex  
5 -  
6 -cat > README-new.md <<EOF  
7 -# Introduction  
8 -  
9 -Files in this folder are from  
10 -https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files  
11 -  
12 -Please see also  
13 -https://huggingface.co/hexgrad/Kokoro-82M  
14 -and  
15 -https://huggingface.co/hexgrad/Kokoro-82M/discussions/14  
16 -EOF  
17 -  
18 -files=(  
19 -# kokoro-v0_19_hf.onnx  
20 -kokoro-v0_19.onnx  
21 -# kokoro-quant.onnx  
22 -# kokoro-quant-convinteger.onnx  
23 -voices.json  
24 -)  
25 -  
26 -for f in ${files[@]}; do  
27 - if [ ! -f ./$f ]; then  
28 - curl -SL -O https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/$f  
29 - fi  
30 -done  
31 -  
32 -models=(  
33 -kokoro-v0_19  
34 -# kokoro-quant  
35 -# kokoro-quant-convinteger  
36 -# kokoro-v0_19_hf  
37 -)  
38 -  
39 -for m in ${models[@]}; do  
40 - ./add_meta_data.py --model $m.onnx --voices ./voices.json  
41 -done  
42 -  
43 -ls -l  
44 -echo "----------"  
45 -ls -lh  
46 -  
47 -for m in ${models[@]}; do  
48 - ./test.py --model $m.onnx --voices-bin ./voices.bin --tokens ./tokens.txt  
49 -done  
50 -ls -lh  
@@ -3,11 +3,11 @@ @@ -3,11 +3,11 @@
3 3
4 4
5 import argparse 5 import argparse
6 -import json  
7 -from pathlib import Path  
8 6
9 -import numpy as np  
10 import onnx 7 import onnx
  8 +import torch
  9 +
  10 +from generate_voices_bin import speaker2id
11 11
12 12
13 def get_args(): 13 def get_args():
@@ -16,63 +16,23 @@ def get_args(): @@ -16,63 +16,23 @@ def get_args():
16 "--model", type=str, required=True, help="input and output onnx model" 16 "--model", type=str, required=True, help="input and output onnx model"
17 ) 17 )
18 18
19 - parser.add_argument("--voices", type=str, required=True, help="Path to voices.json")  
20 return parser.parse_args() 19 return parser.parse_args()
21 20
22 21
23 -def load_voices(filename):  
24 - with open(filename) as f:  
25 - voices = json.load(f)  
26 - for key in voices:  
27 - voices[key] = np.array(voices[key], dtype=np.float32)  
28 - return voices  
29 -  
30 -  
31 -def get_vocab():  
32 - _pad = "$"  
33 - _punctuation = ';:,.!?¡¿—…"«»“” '  
34 - _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"  
35 - _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"  
36 - symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)  
37 - dicts = {}  
38 - for i in range(len((symbols))):  
39 - dicts[symbols[i]] = i  
40 - return dicts  
41 -  
42 -  
43 -def generate_tokens():  
44 - token2id = get_vocab()  
45 - with open("tokens.txt", "w", encoding="utf-8") as f:  
46 - for s, i in token2id.items():  
47 - f.write(f"{s} {i}\n")  
48 -  
49 -  
50 def main(): 22 def main():
51 args = get_args() 23 args = get_args()
52 - print(args.model, args.voices) 24 + print(args.model)
53 25
54 model = onnx.load(args.model) 26 model = onnx.load(args.model)
55 - voices = load_voices(args.voices)  
56 -  
57 - if Path("./tokens.txt").is_file():  
58 - print("./tokens.txt exist, skip generating it")  
59 - else:  
60 - generate_tokens()  
61 27
62 - keys = list(voices.keys())  
63 - print(",".join(keys))  
64 -  
65 - if Path("./voices.bin").is_file():  
66 - print("./voices.bin exists, skip generating it")  
67 - else:  
68 - with open("voices.bin", "wb") as f:  
69 - for k in keys:  
70 - f.write(voices[k].tobytes()) 28 + style = torch.load(
  29 + "./kLegacy/v0.19/voices/af.pt", weights_only=True, map_location="cpu"
  30 + )
71 31
72 speaker2id_str = "" 32 speaker2id_str = ""
73 id2speaker_str = "" 33 id2speaker_str = ""
74 sep = "" 34 sep = ""
75 - for i, s in enumerate(keys): 35 + for s, i in speaker2id.items():
76 speaker2id_str += f"{sep}{s}->{i}" 36 speaker2id_str += f"{sep}{s}->{i}"
77 id2speaker_str += f"{sep}{i}->{s}" 37 id2speaker_str += f"{sep}{i}->{s}"
78 sep = "," 38 sep = ","
@@ -84,15 +44,15 @@ def main(): @@ -84,15 +44,15 @@ def main():
84 "sample_rate": 24000, 44 "sample_rate": 24000,
85 "version": 1, 45 "version": 1,
86 "voice": "en-us", 46 "voice": "en-us",
87 - "style_dim": ",".join(map(str, voices[keys[0]].shape)),  
88 - "n_speakers": len(keys), 47 + "style_dim": ",".join(map(str, style.shape)),
  48 + "n_speakers": len(speaker2id),
89 "speaker2id": speaker2id_str, 49 "speaker2id": speaker2id_str,
90 "id2speaker": id2speaker_str, 50 "id2speaker": id2speaker_str,
91 - "speaker_names": ",".join(keys),  
92 - "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files", 51 + "speaker_names": ",".join(map(str, speaker2id.keys())),
  52 + "model_url": "https://huggingface.co/hexgrad/kLegacy/",
93 "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS", 53 "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
94 - "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",  
95 "maintainer": "k2-fsa", 54 "maintainer": "k2-fsa",
  55 + "comment": "This is kokoro v0.19 and supports only English",
96 } 56 }
97 57
98 print(model.metadata_props) 58 print(model.metadata_props)
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +from pathlib import Path
  5 +
  6 +import onnxruntime
  7 +from onnxruntime.quantization import QuantType, quantize_dynamic
  8 +
  9 +
  10 +def show(filename):
  11 + session_opts = onnxruntime.SessionOptions()
  12 + session_opts.log_severity_level = 3
  13 + sess = onnxruntime.InferenceSession(filename, session_opts)
  14 + for i in sess.get_inputs():
  15 + print(i)
  16 +
  17 + print("-----")
  18 +
  19 + for i in sess.get_outputs():
  20 + print(i)
  21 +
  22 +
  23 +"""
  24 +NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])
  25 +NodeArg(name='style', type='tensor(float)', shape=[1, 256])
  26 +NodeArg(name='speed', type='tensor(float)', shape=[1])
  27 +-----
  28 +NodeArg(name='audio', type='tensor(float)', shape=['audio0'])
  29 +"""
  30 +
  31 +
  32 +def main():
  33 + show("./model.onnx")
  34 +
  35 + if not Path("./model.int8.onnx").is_file():
  36 + quantize_dynamic(
  37 + model_input="model.onnx",
  38 + model_output="model.int8.onnx",
  39 + # op_types_to_quantize=["MatMul"],
  40 + weight_type=QuantType.QUInt8,
  41 + )
  42 + else:
  43 + print("./model.int8.onnx exists - skip")
  44 +
  45 +
  46 +if __name__ == "__main__":
  47 + main()
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +"""
  4 +Generate samples for
  5 +https://k2-fsa.github.io/sherpa/onnx/tts/all/
  6 +"""
  7 +
  8 +import sherpa_onnx
  9 +import soundfile as sf
  10 +
  11 +from generate_voices_bin import speaker2id
  12 +
  13 +config = sherpa_onnx.OfflineTtsConfig(
  14 + model=sherpa_onnx.OfflineTtsModelConfig(
  15 + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
  16 + model="./model.onnx",
  17 + voices="./voices.bin",
  18 + tokens="./tokens.txt",
  19 + data_dir="./espeak-ng-data",
  20 + ),
  21 + num_threads=2,
  22 + ),
  23 + max_num_sentences=1,
  24 +)
  25 +
  26 +if not config.validate():
  27 + raise ValueError("Please check your config")
  28 +
  29 +tts = sherpa_onnx.OfflineTts(config)
  30 +text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
  31 +
  32 +for s, i in speaker2id.items():
  33 + print(s, i, len(speaker2id))
  34 + audio = tts.generate(text, sid=i, speed=1.0)
  35 +
  36 + sf.write(
  37 + f"./hf/kokoro/v0.19/mp3/{i}-{s}.mp3",
  38 + audio.samples,
  39 + samplerate=audio.sample_rate,
  40 + )
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +def get_vocab():
  6 + # https://huggingface.co/hexgrad/kLegacy/blob/main/v0.19/kokoro.py#L75
  7 + _pad = "$"
  8 + _punctuation = ';:,.!?¡¿—…"«»“” '
  9 + _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  10 + _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
  11 + symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
  12 + dicts = {}
  13 + for i in range(len((symbols))):
  14 + dicts[symbols[i]] = i
  15 + return dicts
  16 +
  17 +
  18 +def main():
  19 + token2id = get_vocab()
  20 + with open("tokens.txt", "w", encoding="utf-8") as f:
  21 + for s, i in token2id.items():
  22 + f.write(f"{s} {i}\n")
  23 +
  24 +
  25 +if __name__ == "__main__":
  26 + main()
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +import torch
  4 +from pathlib import Path
  5 +
  6 +
  7 +id2speaker = {
  8 + 0: "af",
  9 + 1: "af_bella",
  10 + 2: "af_nicole",
  11 + 3: "af_sarah",
  12 + 4: "af_sky",
  13 + 5: "am_adam",
  14 + 6: "am_michael",
  15 + 7: "bf_emma",
  16 + 8: "bf_isabella",
  17 + 9: "bm_george",
  18 + 10: "bm_lewis",
  19 +}
  20 +
  21 +speaker2id = {speaker: idx for idx, speaker in id2speaker.items()}
  22 +
  23 +
  24 +def main():
  25 + if Path("./voices.bin").is_file():
  26 + print("./voices.bin exists - skip")
  27 + return
  28 +
  29 + with open("voices.bin", "wb") as f:
  30 + for _, speaker in id2speaker.items():
  31 + m = torch.load(
  32 + f"kLegacy/v0.19/voices/{speaker}.pt",
  33 + weights_only=True,
  34 + map_location="cpu",
  35 + ).numpy()
  36 + # m.shape (511, 1, 256)
  37 +
  38 + f.write(m.tobytes())
  39 +
  40 +
  41 +if __name__ == "__main__":
  42 + main()
  1 +#!/usr/bin/env bash
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +set -ex
  5 +
  6 +cat > README-new.md <<EOF
  7 +# Introduction
  8 +
  9 +Files in this folder are from
  10 +git clone https://huggingface.co/hexgrad/kLegacy
  11 +EOF
  12 +
  13 +if [ ! -d kLegacy ]; then
  14 + git clone https://huggingface.co/hexgrad/kLegacy
  15 + pushd kLegacy/v0.19
  16 + git lfs pull
  17 + popd
  18 +fi
  19 +
  20 +if [ ! -f ./voices.bin ]; then
  21 + ./generate_voices_bin.py
  22 +fi
  23 +
  24 +if [ ! -f ./tokens.txt ]; then
  25 + ./generate_tokens.py
  26 +fi
  27 +
  28 +if [ ! -f ./model.onnx ]; then
  29 + mv kLegacy/v0.19/kokoro-v0_19.onnx ./model.onnx
  30 +fi
  31 +
  32 +./add_meta_data.py --model ./model.onnx
  33 +
  34 +if [ ! -f model.int8.onnx ]; then
  35 + ./dynamic_quantization.py
  36 +fi
@@ -67,11 +67,13 @@ def show(filename): @@ -67,11 +67,13 @@ def show(filename):
67 print(i) 67 print(i)
68 68
69 69
70 -# NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])  
71 -# NodeArg(name='style', type='tensor(float)', shape=[1, 256])  
72 -# NodeArg(name='speed', type='tensor(float)', shape=[1])  
73 -# -----  
74 -# NodeArg(name='audio', type='tensor(float)', shape=['audio0']) 70 +"""
  71 +NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])
  72 +NodeArg(name='style', type='tensor(float)', shape=[1, 256])
  73 +NodeArg(name='speed', type='tensor(float)', shape=[1])
  74 +-----
  75 +NodeArg(name='audio', type='tensor(float)', shape=['audio0'])
  76 +"""
75 77
76 78
77 def load_tokens(filename: str) -> Dict[str, int]: 79 def load_tokens(filename: str) -> Dict[str, int]:
@@ -171,10 +173,6 @@ class OnnxModel: @@ -171,10 +173,6 @@ class OnnxModel:
171 return audio 173 return audio
172 174
173 175
174 -def test(model, voice, text) -> np.ndarray:  
175 - pass  
176 -  
177 -  
178 def main(): 176 def main():
179 args = get_args() 177 args = get_args()
180 print(vars(args)) 178 print(vars(args))
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +"""
  4 +Generate samples for
  5 +https://k2-fsa.github.io/sherpa/onnx/tts/all/
  6 +"""
  7 +
  8 +import sherpa_onnx
  9 +import soundfile as sf
  10 +
  11 +from generate_voices_bin import speaker2id
  12 +
  13 +config = sherpa_onnx.OfflineTtsConfig(
  14 + model=sherpa_onnx.OfflineTtsModelConfig(
  15 + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
  16 + model="./kokoro.onnx",
  17 + voices="./voices.bin",
  18 + tokens="./tokens.txt",
  19 + data_dir="./espeak-ng-data",
  20 + dict_dir="./dict",
  21 + lexicon="./lexicon-zh.txt,./lexicon-us-en.txt",
  22 + ),
  23 + num_threads=2,
  24 + debug=True,
  25 + ),
  26 + rule_fsts="./phone-zh.fst,./date-zh.fst,./number-zh.fst",
  27 + max_num_sentences=1,
  28 +)
  29 +
  30 +if not config.validate():
  31 + raise ValueError("Please check your config")
  32 +
  33 +tts = sherpa_onnx.OfflineTts(config)
  34 +text = "This model supports both Chinese and English. 小米的核心价值观是什么?答案是真诚热爱!有困难,请拨打110 或者18601200909。I am learning 机器学习. 我在研究 machine learning。What do you think 中英文说的如何呢? 今天是 2025年6月18号."
  35 +
  36 +print("text", text)
  37 +
  38 +for s, i in speaker2id.items():
  39 + print(s, i, len(speaker2id))
  40 + audio = tts.generate(text, sid=i, speed=1.0)
  41 +
  42 + sf.write(
  43 + f"./hf/kokoro/v1.0/mp3/{i}-{s}.mp3",
  44 + audio.samples,
  45 + samplerate=audio.sample_rate,
  46 + )
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +"""
  4 +Generate samples for
  5 +https://k2-fsa.github.io/sherpa/onnx/tts/all/
  6 +"""
  7 +
  8 +import sherpa_onnx
  9 +import soundfile as sf
  10 +
  11 +from generate_voices_bin import speaker2id
  12 +
  13 +config = sherpa_onnx.OfflineTtsConfig(
  14 + model=sherpa_onnx.OfflineTtsModelConfig(
  15 + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
  16 + model="./kokoro.onnx",
  17 + voices="./voices.bin",
  18 + tokens="./tokens.txt",
  19 + data_dir="./espeak-ng-data",
  20 + dict_dir="./dict",
  21 + lexicon="./lexicon-zh.txt,./lexicon-us-en.txt",
  22 + ),
  23 + num_threads=2,
  24 + debug=True,
  25 + ),
  26 + rule_fsts="./phone-zh.fst,./date-zh.fst,./number-zh.fst",
  27 + max_num_sentences=1,
  28 +)
  29 +
  30 +if not config.validate():
  31 + raise ValueError("Please check your config")
  32 +
  33 +tts = sherpa_onnx.OfflineTts(config)
  34 +text = "This model supports both Chinese and English. 小米的核心价值观是什么?答案是真诚热爱!有困难,请拨打110 或者18601200909。I am learning 机器学习. 我在研究 machine learning。What do you think 中英文说的如何呢? 今天是 2025年6月18号."
  35 +
  36 +print("text", text)
  37 +
  38 +for s, i in speaker2id.items():
  39 + print(s, i, len(speaker2id))
  40 + audio = tts.generate(text, sid=i, speed=1.0)
  41 +
  42 + sf.write(
  43 + f"./hf/kokoro/v1.1-zh/mp3/{i}-{s}.mp3",
  44 + audio.samples,
  45 + samplerate=audio.sample_rate,
  46 + )
@@ -11,6 +11,8 @@ fi @@ -11,6 +11,8 @@ fi
11 if [ ! -f config.json ]; then 11 if [ ! -f config.json ]; then
12 # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json 12 # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
13 curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json 13 curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json
  14 + mkdir -p Kokoro-82M
  15 + cp ./config.json ./Kokoro-82M
14 fi 16 fi
15 17
16 voices=( 18 voices=(
@@ -34,7 +34,7 @@ static void PybindOfflineTtsConfig(py::module *m) { @@ -34,7 +34,7 @@ static void PybindOfflineTtsConfig(py::module *m) {
34 .def(py::init<const OfflineTtsModelConfig &, const std::string &, 34 .def(py::init<const OfflineTtsModelConfig &, const std::string &,
35 const std::string &, int32_t, float>(), 35 const std::string &, int32_t, float>(),
36 py::arg("model"), py::arg("rule_fsts") = "", 36 py::arg("model"), py::arg("rule_fsts") = "",
37 - py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2, 37 + py::arg("rule_fars") = "", py::arg("max_num_sentences") = 1,
38 py::arg("silence_scale") = 0.2) 38 py::arg("silence_scale") = 0.2)
39 .def_readwrite("model", &PyClass::model) 39 .def_readwrite("model", &PyClass::model)
40 .def_readwrite("rule_fsts", &PyClass::rule_fsts) 40 .def_readwrite("rule_fsts", &PyClass::rule_fsts)