Fangjun Kuang
Committed by GitHub

Add Kokoro v1.1-zh (#1942)

@@ -3,7 +3,7 @@ name: export-kokoro-to-onnx @@ -3,7 +3,7 @@ name: export-kokoro-to-onnx
3 on: 3 on:
4 push: 4 push:
5 branches: 5 branches:
6 - - export-kokoro 6 + - export-kokoro-2
7 7
8 workflow_dispatch: 8 workflow_dispatch:
9 9
@@ -20,7 +20,7 @@ jobs: @@ -20,7 +20,7 @@ jobs:
20 fail-fast: false 20 fail-fast: false
21 matrix: 21 matrix:
22 os: [ubuntu-latest] 22 os: [ubuntu-latest]
23 - version: ["0.19", "1.0"] 23 + version: ["0.19", "1.0", "1.1-zh"]
24 python-version: ["3.10"] 24 python-version: ["3.10"]
25 25
26 steps: 26 steps:
@@ -34,7 +34,7 @@ jobs: @@ -34,7 +34,7 @@ jobs:
34 - name: Install Python dependencies 34 - name: Install Python dependencies
35 shell: bash 35 shell: bash
36 run: | 36 run: |
37 - pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch 37 + pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
38 38
39 - name: Run 39 - name: Run
40 shell: bash 40 shell: bash
@@ -49,9 +49,15 @@ jobs: @@ -49,9 +49,15 @@ jobs:
49 elif [[ $v == "1.0" ]]; then 49 elif [[ $v == "1.0" ]]; then
50 cd v1.0 50 cd v1.0
51 ./run.sh 51 ./run.sh
  52 + elif [[ $v == "1.1-zh" ]]; then
  53 + cd v1.1-zh
  54 + ./run.sh
  55 + else
  56 + echo "Unknown version $v"
  57 + exit 1
52 fi 58 fi
53 59
54 - - name: Collect results ${{ matrix.version }} 60 + - name: Collect results 0.19
55 if: matrix.version == '0.19' 61 if: matrix.version == '0.19'
56 shell: bash 62 shell: bash
57 run: | 63 run: |
@@ -71,7 +77,7 @@ jobs: @@ -71,7 +77,7 @@ jobs:
71 77
72 ls -lh $d.tar.bz2 78 ls -lh $d.tar.bz2
73 79
74 - - name: Collect results ${{ matrix.version }} 80 + - name: Collect results 1.0
75 if: matrix.version == '1.0' 81 if: matrix.version == '1.0'
76 shell: bash 82 shell: bash
77 run: | 83 run: |
@@ -87,7 +93,7 @@ jobs: @@ -87,7 +93,7 @@ jobs:
87 93
88 d=kokoro-multi-lang-v1_0 94 d=kokoro-multi-lang-v1_0
89 mkdir $d 95 mkdir $d
90 - cp -a LICENSE $d/LICENSE 96 + cp -v LICENSE $d/LICENSE
91 cp -a espeak-ng-data $d/ 97 cp -a espeak-ng-data $d/
92 cp -v $src/kokoro.onnx $d/model.onnx 98 cp -v $src/kokoro.onnx $d/model.onnx
93 cp -v $src/voices.bin $d/ 99 cp -v $src/voices.bin $d/
@@ -105,7 +111,63 @@ jobs: @@ -105,7 +111,63 @@ jobs:
105 111
106 ls -lh $d.tar.bz2 112 ls -lh $d.tar.bz2
107 113
108 - - name: Publish to huggingface ${{ matrix.version }} 114 + - name: Collect results 1.1-zh
  115 + if: matrix.version == '1.1-zh'
  116 + shell: bash
  117 + run: |
  118 + curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
  119 + tar xvf dict.tar.bz2
  120 + rm dict.tar.bz2
  121 +
  122 + curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
  123 + curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
  124 + curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
  125 +
  126 + src=scripts/kokoro/v1.1-zh
  127 +
  128 + d=kokoro-multi-lang-v1_1
  129 + mkdir $d
  130 + cp -v LICENSE $d/LICENSE
  131 + cp -a espeak-ng-data $d/
  132 + cp -v $src/kokoro.onnx $d/model.onnx
  133 + cp -v $src/voices.bin $d/
  134 + cp -v $src/tokens.txt $d/
  135 + cp -v $src/lexicon*.txt $d/
  136 + cp -v $src/README.md $d/README.md
  137 + cp -av dict $d/
  138 + cp -v ./*.fst $d/
  139 + ls -lh $d/
  140 + echo "---"
  141 + ls -lh $d/dict
  142 +
  143 + tar cjfv $d.tar.bz2 $d
  144 + rm -rf $d
  145 + ls -lh $d.tar.bz2
  146 +
  147 + d=kokoro-int8-multi-lang-v1_1
  148 + mkdir $d
  149 + cp -v LICENSE $d/LICENSE
  150 + cp -a espeak-ng-data $d/
  151 + cp -v $src/kokoro.int8.onnx $d/model.int8.onnx
  152 + cp -v $src/voices.bin $d/
  153 + cp -v $src/tokens.txt $d/
  154 + cp -v $src/lexicon*.txt $d/
  155 + cp -v $src/README.md $d/README.md
  156 + cp -av dict $d/
  157 + cp -v ./*.fst $d/
  158 + ls -lh $d/
  159 + echo "---"
  160 + ls -lh $d/dict
  161 +
  162 + tar cjfv $d.tar.bz2 $d
  163 + rm -rf $d
  164 + ls -lh $d.tar.bz2
  165 +
  166 + echo "---"
  167 + ls -lh *.tar.bz2
  168 +
  169 +
  170 + - name: Publish to huggingface 0.19
109 if: matrix.version == '0.19' 171 if: matrix.version == '0.19'
110 env: 172 env:
111 HF_TOKEN: ${{ secrets.HF_TOKEN }} 173 HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -154,7 +216,7 @@ jobs: @@ -154,7 +216,7 @@ jobs:
154 git commit -m "add models" 216 git commit -m "add models"
155 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true 217 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
156 218
157 - - name: Publish to huggingface ${{ matrix.version }} 219 + - name: Publish to huggingface 1.0
158 if: matrix.version == '1.0' 220 if: matrix.version == '1.0'
159 env: 221 env:
160 HF_TOKEN: ${{ secrets.HF_TOKEN }} 222 HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -205,6 +267,108 @@ jobs: @@ -205,6 +267,108 @@ jobs:
205 git commit -m "add models" 267 git commit -m "add models"
206 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true 268 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true
207 269
  270 + - name: Publish to huggingface 1.1-zh
  271 + if: matrix.version == '1.1-zh'
  272 + env:
  273 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  274 + uses: nick-fields/retry@v3
  275 + with:
  276 + max_attempts: 20
  277 + timeout_seconds: 200
  278 + shell: bash
  279 + command: |
  280 + git config --global user.email "csukuangfj@gmail.com"
  281 + git config --global user.name "Fangjun Kuang"
  282 +
  283 + rm -rf huggingface
  284 + export GIT_LFS_SKIP_SMUDGE=1
  285 + export GIT_CLONE_PROTECTION_ACTIVE=false
  286 +
  287 + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 huggingface
  288 + cd huggingface
  289 + rm -rf ./*
  290 + git fetch
  291 + git pull
  292 +
  293 + git lfs track "cmn_dict"
  294 + git lfs track "ru_dict"
  295 + git lfs track "*.wav"
  296 + git lfs track "lexicon*.txt"
  297 +
  298 + cp -a ../espeak-ng-data ./
  299 +
  300 + cp -v ../scripts/kokoro/v1.1-zh/kokoro.onnx ./model.onnx
  301 +
  302 +
  303 + cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
  304 + cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
  305 + cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
  306 + cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md
  307 + cp -v ../LICENSE ./
  308 + cp -av ../dict ./
  309 + cp -v ../*.fst ./
  310 +
  311 + git lfs track "*.onnx"
  312 + git add .
  313 +
  314 + ls -lh
  315 +
  316 + git status
  317 +
  318 + git commit -m "add models"
  319 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 main || true
  320 +
  321 + - name: Publish to huggingface 1.1-zh-int8
  322 + if: matrix.version == '1.1-zh'
  323 + env:
  324 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  325 + uses: nick-fields/retry@v3
  326 + with:
  327 + max_attempts: 20
  328 + timeout_seconds: 200
  329 + shell: bash
  330 + command: |
  331 + git config --global user.email "csukuangfj@gmail.com"
  332 + git config --global user.name "Fangjun Kuang"
  333 +
  334 + rm -rf huggingface
  335 + export GIT_LFS_SKIP_SMUDGE=1
  336 + export GIT_CLONE_PROTECTION_ACTIVE=false
  337 +
  338 + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 huggingface
  339 + cd huggingface
  340 + rm -rf ./*
  341 + git fetch
  342 + git pull
  343 +
  344 + git lfs track "cmn_dict"
  345 + git lfs track "ru_dict"
  346 + git lfs track "*.wav"
  347 + git lfs track "lexicon*.txt"
  348 +
  349 + cp -a ../espeak-ng-data ./
  350 +
  351 + cp -v ../scripts/kokoro/v1.1-zh/kokoro.int8.onnx ./model.int8.onnx
  352 +
  353 +
  354 + cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
  355 + cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
  356 + cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
  357 + cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md
  358 + cp -v ../LICENSE ./
  359 + cp -av ../dict ./
  360 + cp -v ../*.fst ./
  361 +
  362 + git lfs track "*.onnx"
  363 + git add .
  364 +
  365 + ls -lh
  366 +
  367 + git status
  368 +
  369 + git commit -m "add models"
  370 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 main || true
  371 +
208 - name: Release 372 - name: Release
209 if: github.repository_owner == 'csukuangfj' 373 if: github.repository_owner == 'csukuangfj'
210 uses: svenstaro/upload-release-action@v2 374 uses: svenstaro/upload-release-action@v2
@@ -438,7 +438,17 @@ def get_kokoro_models() -> List[TtsModel]: @@ -438,7 +438,17 @@ def get_kokoro_models() -> List[TtsModel]:
438 model_dir="kokoro-multi-lang-v1_0", 438 model_dir="kokoro-multi-lang-v1_0",
439 model_name="model.onnx", 439 model_name="model.onnx",
440 lang="en", 440 lang="en",
441 - ) 441 + ),
  442 + TtsModel(
  443 + model_dir="kokoro-multi-lang-v1_1",
  444 + model_name="model.onnx",
  445 + lang="en",
  446 + ),
  447 + TtsModel(
  448 + model_dir="kokoro-int8-multi-lang-v1_1",
  449 + model_name="model.int8.onnx",
  450 + lang="en",
  451 + ),
442 ] 452 ]
443 for m in multi_lingual_models: 453 for m in multi_lingual_models:
444 m.data_dir = f"{m.model_dir}/espeak-ng-data" 454 m.data_dir = f"{m.model_dir}/espeak-ng-data"
1 voices.json 1 voices.json
2 voices.bin 2 voices.bin
3 README-new.md 3 README-new.md
  4 +lexicon-*.txt
  5 +config.json
@@ -2,11 +2,6 @@ @@ -2,11 +2,6 @@
2 # Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) 2 # Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
3 3
4 4
5 -import argparse  
6 -import json  
7 -from pathlib import Path  
8 -  
9 -import numpy as np  
10 import onnx 5 import onnx
11 import torch 6 import torch
12 7
@@ -4,19 +4,6 @@ @@ -4,19 +4,6 @@
4 import json 4 import json
5 from typing import List, Tuple 5 from typing import List, Tuple
6 6
7 -from misaki import zh  
8 -from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict  
9 -  
10 -user_dict = {  
11 - "还田": [["huan2"], ["tian2"]],  
12 - "行长": [["hang2"], ["zhang3"]],  
13 - "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],  
14 -}  
15 -  
16 -load_phrases_dict(user_dict)  
17 -  
18 -phrases_dict.phrases_dict.update(**user_dict)  
19 -  
20 7
21 def generate_english_lexicon(kind: str): 8 def generate_english_lexicon(kind: str):
22 assert kind in ("us", "gb"), kind 9 assert kind in ("us", "gb"), kind
@@ -59,28 +46,6 @@ def generate_english_lexicon(kind: str): @@ -59,28 +46,6 @@ def generate_english_lexicon(kind: str):
59 return list(user_defined_lower.items()) + list(lexicon.items()) 46 return list(user_defined_lower.items()) + list(lexicon.items())
60 47
61 48
62 -def generate_chinese_lexicon():  
63 - word_dict = pinyin_dict.pinyin_dict  
64 - phrases = phrases_dict.phrases_dict  
65 -  
66 - g2p = zh.ZHG2P()  
67 - lexicon = []  
68 -  
69 - for key in word_dict:  
70 - if not (0x4E00 <= key <= 0x9FFF):  
71 - continue  
72 - w = chr(key)  
73 - tokens: str = g2p.word2ipa(w)  
74 - tokens = tokens.replace(chr(815), "")  
75 - lexicon.append((w, tokens))  
76 -  
77 - for key in phrases:  
78 - tokens: str = g2p.word2ipa(key)  
79 - tokens = tokens.replace(chr(815), "")  
80 - lexicon.append((key, tokens))  
81 - return lexicon  
82 -  
83 -  
84 def save(filename: str, lexicon: List[Tuple[str, str]]): 49 def save(filename: str, lexicon: List[Tuple[str, str]]):
85 with open(filename, "w", encoding="utf-8") as f: 50 with open(filename, "w", encoding="utf-8") as f:
86 for word, phones in lexicon: 51 for word, phones in lexicon:
@@ -91,11 +56,9 @@ def save(filename: str, lexicon: List[Tuple[str, str]]): @@ -91,11 +56,9 @@ def save(filename: str, lexicon: List[Tuple[str, str]]):
91 def main(): 56 def main():
92 us = generate_english_lexicon("us") 57 us = generate_english_lexicon("us")
93 gb = generate_english_lexicon("gb") 58 gb = generate_english_lexicon("gb")
94 - zh = generate_chinese_lexicon()  
95 59
96 save("lexicon-us-en.txt", us) 60 save("lexicon-us-en.txt", us)
97 save("lexicon-gb-en.txt", gb) 61 save("lexicon-gb-en.txt", gb)
98 - save("lexicon-zh.txt", zh)  
99 62
100 63
101 if __name__ == "__main__": 64 if __name__ == "__main__":
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +from typing import List, Tuple
  5 +
  6 +from misaki import zh
  7 +from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
  8 +
  9 +user_dict = {
  10 + "还田": [["huan2"], ["tian2"]],
  11 + "行长": [["hang2"], ["zhang3"]],
  12 + "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
  13 +}
  14 +
  15 +load_phrases_dict(user_dict)
  16 +
  17 +phrases_dict.phrases_dict.update(**user_dict)
  18 +
  19 +
def generate_chinese_lexicon():
    """Build the Chinese lexicon as a list of ``(word, ipa)`` pairs.

    Single characters come from the keys of pypinyin's ``pinyin_dict`` that
    lie in the range U+4E00..U+9FFF; multi-character entries come from the
    keys of ``phrases_dict``. Characters are listed first, then phrases.
    """
    converter = zh.ZHG2P()

    def to_ipa(text: str) -> str:
        # chr(815) is U+032F; it is removed from the generated IPA string.
        return converter.word2ipa(text).replace(chr(815), "")

    entries = [
        (chr(code_point), to_ipa(chr(code_point)))
        for code_point in pinyin_dict.pinyin_dict
        if 0x4E00 <= code_point <= 0x9FFF
    ]
    entries.extend(
        (phrase, to_ipa(phrase)) for phrase in phrases_dict.phrases_dict
    )
    return entries
  40 +
  41 +
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write the lexicon to ``filename`` (UTF-8), one entry per line.

    Each line is the word, a space, and then the characters of the phone
    string separated by single spaces.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            f"{word} {' '.join(phones)}\n" for word, phones in lexicon
        )
  47 +
  48 +
def main():
    """Generate the Chinese lexicon and write it to lexicon-zh.txt."""
    save("lexicon-zh.txt", generate_chinese_lexicon())
  53 +
  54 +
  55 +if __name__ == "__main__":
  56 + main()
@@ -111,7 +111,11 @@ if [ ! -f ./tokens.txt ]; then @@ -111,7 +111,11 @@ if [ ! -f ./tokens.txt ]; then
111 fi 111 fi
112 112
113 if [ ! -f ./lexicon-zh.txt ]; then 113 if [ ! -f ./lexicon-zh.txt ]; then
114 - ./generate_lexicon.py 114 + ./generate_lexicon_zh.py
  115 +fi
  116 +
  117 +if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
  118 + ./generate_lexicon_en.py
115 fi 119 fi
116 120
117 if [ ! -f ./voices.bin ]; then 121 if [ ! -f ./voices.bin ]; then
@@ -10,8 +10,6 @@ import jieba @@ -10,8 +10,6 @@ import jieba
10 import numpy as np 10 import numpy as np
11 import onnxruntime as ort 11 import onnxruntime as ort
12 import soundfile as sf 12 import soundfile as sf
13 -import torch  
14 -from misaki import zh  
15 13
16 try: 14 try:
17 from piper_phonemize import phonemize_espeak 15 from piper_phonemize import phonemize_espeak
@@ -114,7 +112,6 @@ class OnnxModel: @@ -114,7 +112,6 @@ class OnnxModel:
114 def __call__(self, text: str, voice: str): 112 def __call__(self, text: str, voice: str):
115 punctuations = ';:,.!?-…()"“”' 113 punctuations = ';:,.!?-…()"“”'
116 text = text.lower() 114 text = text.lower()
117 - g2p = zh.ZHG2P()  
118 115
119 tokens = "" 116 tokens = ""
120 117
  1 +# Introduction
  2 +
  3 +This directory contains the scripts used to export Kokoro v1.1-zh to ONNX.
  4 +
  5 +See also https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import onnx
  6 +import torch
  7 +
  8 +from generate_voices_bin import speaker2id
  9 +
  10 +
def main():
    """Replace the metadata of ./kokoro.onnx with the Kokoro v1.1-zh fields.

    The model is loaded, its existing ``metadata_props`` are removed, the
    entries below are added, and the model is saved back to the same path.
    The style dimension is taken from the shape of ./voices/zf_001.pt.
    """
    model = onnx.load("./kokoro.onnx")
    style = torch.load("./voices/zf_001.pt", weights_only=True, map_location="cpu")

    # Comma-separated "name->id" and "id->name" lists, in speaker2id order.
    speaker2id_str = ",".join(f"{name}->{idx}" for name, idx in speaker2id.items())
    id2speaker_str = ",".join(f"{idx}->{name}" for name, idx in speaker2id.items())

    meta_data = {
        "model_type": "kokoro",
        "language": "multi-lang, e.g., English, Chinese",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 2,
        "voice": "en-us",
        "style_dim": ",".join(map(str, style.shape)),
        "n_speakers": len(speaker2id),
        "id2speaker": id2speaker_str,
        "speaker2id": speaker2id_str,
        "speaker_names": ",".join(map(str, speaker2id.keys())),
        "model_url": "https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh",
        "maintainer": "k2-fsa",
        "comment": "This is Kokoro v1.1-zh, a multilingual TTS model, supporting English, Chinese.",
    }

    print(model.metadata_props)

    # Remove every existing entry before adding the new ones.
    for _ in range(len(model.metadata_props)):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = key
        entry.value = str(value)
    print("--------------------")

    print(model.metadata_props)

    onnx.save(model, "./kokoro.onnx")
  54 +
  55 +
  56 +if __name__ == "__main__":
  57 + main()
  1 +#!/usr/bin/env python3
  2 +import argparse
  3 +
  4 +import onnxruntime
  5 +from onnxruntime.quantization import QuantType, quantize_dynamic
  6 +
  7 +
def show(filename):
    """Print the inputs and then the outputs of the ONNX model ``filename``."""
    opts = onnxruntime.SessionOptions()
    opts.log_severity_level = 3
    session = onnxruntime.InferenceSession(filename, opts)

    for node in session.get_inputs():
        print(node)

    print("-----")

    for node in session.get_outputs():
        print(node)
  19 +
  20 +
  21 +"""
  22 +NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
  23 +NodeArg(name='style', type='tensor(float)', shape=[1, 256])
  24 +NodeArg(name='speed', type='tensor(float)', shape=[1])
  25 +-----
  26 +NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
  27 +"""
  28 +
  29 +
def main():
    """Show the model's inputs/outputs, then write kokoro.int8.onnx.

    The quantized model is produced with ``quantize_dynamic`` using
    ``QuantType.QUInt8`` weights.
    """
    show("./kokoro.onnx")

    quantization_args = dict(
        model_input="kokoro.onnx",
        model_output="kokoro.int8.onnx",
        # op_types_to_quantize=["MatMul"],
        weight_type=QuantType.QUInt8,
    )
    quantize_dynamic(**quantization_args)
  39 +
  40 +
  41 +if __name__ == "__main__":
  42 + main()
  1 +#!/usr/bin/env python3
  2 +
  3 +import json
  4 +
  5 +import torch
  6 +from kokoro import KModel
  7 +from kokoro.model import KModelForONNX
  8 +
  9 +
@torch.no_grad()
def main():
    """Export kokoro-v1_1-zh.pth to kokoro.onnx.

    The model is built from config.json, moved to CPU and put in eval mode.
    It is traced with random dummy inputs: a token sequence of 48 random ids
    padded with 0 on both sides, a (1, 256) style tensor and a (1,) speed
    tensor. The token length and the audio length are exported as dynamic
    axes.
    """
    with open("config.json") as f:
        config = json.load(f)

    kmodel = KModel(
        repo_id="not-used-any-value-is-ok",
        model="kokoro-v1_1-zh.pth",
        config=config,
        disable_complex=True,
    )
    kmodel = kmodel.to("cpu").eval()

    # Dummy inputs; the order of the random calls is kept as in the original.
    body = torch.randint(1, 100, (48,)).numpy()
    tokens = torch.LongTensor([[0, *body, 0]])

    style = torch.rand(1, 256, dtype=torch.float32)
    speed = torch.rand(1)

    print(tokens.shape, tokens.dtype)
    print(style.shape, style.dtype)
    print(speed, speed.dtype)

    dynamic_axes = {
        "tokens": {1: "sequence_length"},
        "audio": {0: "audio_length"},
    }

    torch.onnx.export(
        KModelForONNX(kmodel),
        (tokens, style, speed),
        "kokoro.onnx",
        input_names=["tokens", "style", "speed"],
        output_names=["audio"],
        dynamic_axes=dynamic_axes,
        opset_version=14,  # minimum working version for this kokoro model is 14
    )
  50 +
  51 +
  52 +if __name__ == "__main__":
  53 + main()
  1 +../v1.0/generate_lexicon_en.py
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +import re
  5 +from typing import List, Tuple
  6 +
  7 +from misaki import zh
  8 +from misaki.token import MToken
  9 +from misaki.zh_frontend import ZH_MAP
  10 +from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
  11 +
  12 +user_dict = {
  13 + "还田": [["huan2"], ["tian2"]],
  14 + "行长": [["hang2"], ["zhang3"]],
  15 + "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
  16 +}
  17 +
  18 +load_phrases_dict(user_dict)
  19 +
  20 +phrases_dict.phrases_dict.update(**user_dict)
  21 +
  22 +
def process_text(self, text, with_erhua=True):
    """
    Convert ``text`` to a phoneme string without running word segmentation.

    This function is modified from
    https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155

    Note that we have removed jieba.posseg.lcut().

    Args:
      self:
        The frontend object whose helpers are used. In this file it is
        always called with ``g2p.frontend``.
      text:
        The word or phrase to convert. It is handled as a single segment
        tagged "v".
      with_erhua:
        If true, ``self._merge_erhua`` is applied after tone sandhi.

    Returns:
      A tuple ``(result, tokens)``. ``tokens`` is the list of MToken objects
      that were built; ``result`` is the concatenation of each token's
      phonemes (``self.unk`` when a token has none) and its whitespace.
    """
    seg_cut = [(text, "v")]
    # The original called pre_merge_for_modify() twice in a row on this
    # one-element list; a single call is enough.
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    tokens = []
    # pypinyin, g2pM
    for word, pos in seg_cut:
        if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
            pos = "X"
        elif pos != "x" and word in self.punc:
            pos = "x"
        tk = MToken(text=word, tag=pos, whitespace="")
        if pos in ("x", "eng"):
            if not word.isspace():
                if pos == "x" and word in self.punc:
                    tk.phonemes = word
                tokens.append(tk)
            elif tokens:
                tokens[-1].whitespace += word
            continue
        elif (
            tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace
        ):
            tokens[-1].whitespace = "/"

        # g2p
        sub_initials, sub_finals = self._get_initials_finals(word)
        # tone sandhi
        sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
        # er hua
        if with_erhua:
            sub_initials, sub_finals = self._merge_erhua(
                sub_initials, sub_finals, word, pos
            )

        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            if v and (v not in self.punc or v != c):
                phones.append(v)
        phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
        # Insert "_" before every digit so tone numbers become separate items.
        phones = re.sub(r"(?=\d)", "_", phones).split("_")
        tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
        tokens.append(tk)

    result = "".join(
        (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
        for tk in tokens
    )

    return result, tokens
  96 +
  97 +
def generate_chinese_lexicon():
    """Build the Chinese lexicon as a list of ``(word, phonemes)`` pairs.

    Single characters come from the keys of pypinyin's ``pinyin_dict`` that
    lie in the range U+4E00..U+9FFF; multi-character entries come from the
    keys of ``phrases_dict``. Each entry is converted with ``process_text``
    using the frontend of ``zh.ZHG2P(version="1.1")``.
    """
    frontend = zh.ZHG2P(version="1.1").frontend

    def to_phonemes(text: str) -> str:
        # process_text returns (result, tokens); only the string is kept.
        return process_text(frontend, text)[0]

    entries = [
        (chr(code_point), to_phonemes(chr(code_point)))
        for code_point in pinyin_dict.pinyin_dict
        if 0x4E00 <= code_point <= 0x9FFF
    ]
    entries.extend(
        (phrase, to_phonemes(phrase)) for phrase in phrases_dict.phrases_dict
    )
    return entries
  116 +
  117 +
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Store ``lexicon`` in ``filename`` as UTF-8 text.

    Every entry becomes one line: the word, a space, and the characters of
    its phone string joined by single spaces.
    """
    lines = []
    for word, phones in lexicon:
        spaced = " ".join(phones)
        lines.append(f"{word} {spaced}\n")

    with open(filename, "w", encoding="utf-8") as out:
        out.write("".join(lines))
  123 +
  124 +
def main():
    """Generate the Chinese lexicon and write it to lexicon-zh.txt."""
    lexicon = generate_chinese_lexicon()
    save("lexicon-zh.txt", lexicon)
  129 +
  130 +
  131 +if __name__ == "__main__":
  132 + main()
  1 +../v1.0/generate_tokens.py
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +import torch
  4 +from pathlib import Path
  5 +
  6 +
def _existing_voices(prefix: str, first: int, last: int):
    """Return ``{prefix}_NNN`` names in [first, last] that have a voices/*.pt file."""
    candidates = ("{}_{:03d}".format(prefix, n) for n in range(first, last + 1))
    return [name for name in candidates if Path(f"voices/{name}.pt").is_file()]


# The three English voices are always listed; zf_001..zf_099 and
# zm_009..zm_100 are listed only when the voice file is present.
speakers = ["af_maple", "af_sol", "bf_vale"]
speakers += _existing_voices("zf", 1, 99)
speakers += _existing_voices("zm", 9, 100)


# Speaker ids are the positions in ``speakers``.
id2speaker = dict(enumerate(speakers))

speaker2id = {name: idx for idx, name in id2speaker.items()}
  26 +
  27 +
def main():
    """Concatenate all speaker style tensors into voices.bin.

    Nothing is written when ./voices.bin already exists. Otherwise each
    voices/<speaker>.pt is loaded on CPU and its raw bytes are appended in
    speaker-id order.
    """
    if Path("./voices.bin").is_file():
        print("./voices.bin exists - skip")
        return

    with open("voices.bin", "wb") as sink:
        for speaker in id2speaker.values():
            style = torch.load(
                f"voices/{speaker}.pt",
                weights_only=True,
                map_location="cpu",
            )
            # m.shape (510, 1, 256)
            sink.write(style.numpy().tobytes())
  43 +
  44 +
  45 +if __name__ == "__main__":
  46 + main()
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
#
# Download Kokoro v1.1-zh, export it to ONNX, add metadata, quantize it,
# generate tokens/lexicons/voices.bin and finally run ./test.py.
# Every step is skipped when its output file already exists.
set -ex

base_url=https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main

if [ ! -f kokoro-v1_1-zh.pth ]; then
  curl -SL -O "$base_url/kokoro-v1_1-zh.pth"
fi


if [ ! -f config.json ]; then
  # see https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/blob/main/config.json
  curl -SL -O "$base_url/config.json"
fi

voices=(
af_maple
af_sol
bf_vale
)
# zf_001-zf_099
for i in $(seq 1 99); do
  voices+=("$(printf "zf_%03d" "$i")")
done

# zm_009-zm_100
for i in $(seq 9 100); do
  voices+=("$(printf "zm_%03d" "$i")")
done

echo "${voices[@]}" # all elements
echo "${#voices[@]}" # length

mkdir -p voices

for v in "${voices[@]}"; do
  if [ ! -f "voices/$v.pt" ]; then
    curl -SL --output "voices/$v.pt" "$base_url/voices/$v.pt"
  fi
done

pushd voices
# Delete small files (below find's 10k threshold).
# NOTE(review): presumably these are error responses saved for voice ids
# that do not exist upstream - confirm.
find . -type f -size -10k -exec rm -v {} +
ls -lh
du -h -d1 .
popd

if [ ! -f ./kokoro.onnx ]; then
  python3 ./export_onnx.py
fi

# add_meta_data.py rewrites kokoro.onnx in place, so a marker file records
# that it has already run.
if [ ! -f ./.add-meta-data.done ]; then
  python3 ./add_meta_data.py
  touch ./.add-meta-data.done
fi

if [ ! -f ./kokoro.int8.onnx ]; then
  python3 ./dynamic_quantization.py
fi

misaki_data_url=https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data

for name in us_gold us_silver gb_gold gb_silver; do
  if [ ! -f "$name.json" ]; then
    curl -SL -O "$misaki_data_url/$name.json"
  fi
done

if [ ! -f ./tokens.txt ]; then
  ./generate_tokens.py
fi

if [ ! -f ./lexicon-zh.txt ]; then
  ./generate_lexicon_zh.py
fi

if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
  ./generate_lexicon_en.py
fi

if [ ! -f ./voices.bin ]; then
  ./generate_voices_bin.py
fi

./test.py
ls -lh
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import re
  6 +import time
  7 +from typing import Dict, List
  8 +
  9 +import jieba
  10 +import numpy as np
  11 +import onnxruntime as ort
  12 +import soundfile as sf
  13 +
  14 +try:
  15 + from piper_phonemize import phonemize_espeak
  16 +except Exception as ex:
  17 + raise RuntimeError(
  18 + f"{ex}\nPlease run\n"
  19 + "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
  20 + )
  21 +
  22 +
def show(filename):
    """Print the inputs and then the outputs of the ONNX model ``filename``."""
    opts = ort.SessionOptions()
    opts.log_severity_level = 3
    session = ort.InferenceSession(filename, opts)

    for node in session.get_inputs():
        print(node)

    print("-----")

    for node in session.get_outputs():
        print(node)
  34 +
  35 +
  36 +"""
  37 +NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
  38 +NodeArg(name='style', type='tensor(float)', shape=[1, 256])
  39 +NodeArg(name='speed', type='tensor(float)', shape=[1])
  40 +-----
  41 +NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
  42 +"""
  43 +
  44 +
def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
    """Load per-speaker style embeddings from ``voices_bin``.

    Args:
      speaker_names: Speaker names, in the order they are stored in the file.
      dim: Shape of the embedding of a single speaker.
      voices_bin: Path to the binary file; its bytes are viewed as float32.

    Returns:
      A dict mapping each speaker name to its array of shape ``dim``.
    """
    raw = np.fromfile(voices_bin, dtype="uint8")
    embedding = raw.view(np.float32).reshape(len(speaker_names), *dim)
    print("embedding.shape", embedding.shape)

    return {name: embedding[idx] for idx, name in enumerate(speaker_names)}
  57 +
  58 +
def load_tokens(filename: str) -> Dict[str, int]:
    """Read a ``token id`` table from ``filename``.

    A line with two fields maps the token to the integer id. A line with a
    single field is taken to be the id of the space token, because stripping
    and splitting the line removes the space itself. Any other number of
    fields fails the assertion.
    """
    table: Dict[str, int] = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) == 2:
                table[fields[0]] = int(fields[1])
                continue

            assert len(fields) == 1, (len(fields), line)
            table[" "] = int(fields[0])
    return table
  71 +
  72 +
def load_lexicon(filename: str) -> Dict[str, str]:
    """Load one or more lexicon files into a word -> phoneme-string dict.

    Args:
      filename: A single path, or several paths separated by commas. When a
        word occurs more than once, the last occurrence wins.

    Returns:
      A dict mapping each word to its phonemes joined without spaces.

    Blank lines and lines that contain a word but no phonemes are skipped
    instead of raising ValueError. Such lines can be produced by ``save``
    when the phone string of an entry is empty.
    """
    ans = dict()
    for lexicon in filename.split(","):
        print(lexicon)
        with open(lexicon, encoding="utf-8") as f:
            for line in f:
                word, _, rest = line.strip().partition(" ")
                phones = rest.split()
                if not word or not phones:
                    continue
                ans[word] = "".join(phones)
    return ans
  82 +
  83 +
class OnnxModel:
    """Run the exported Kokoro ONNX model on mixed Chinese/English text."""

    def __init__(self, model_filename: str, tokens: str, lexicon: str, voices_bin: str):
        """
        Args:
          model_filename: Path to the ONNX model.
          tokens: Path to tokens.txt.
          lexicon: One lexicon path, or several paths separated by commas.
          voices_bin: Path to voices.bin.
        """
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 3
        session_opts.intra_op_num_threads = 3

        self.session_opts = session_opts
        self.model = ort.InferenceSession(
            model_filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )
        self.token2id = load_tokens(tokens)
        self.word2tokens = load_lexicon(lexicon)

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)
        dim = list(map(int, meta["style_dim"].split(",")))
        speaker_names = meta["speaker_names"].split(",")
        self.voices = load_voices(
            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
        )
        # Use the sample rate stored in the model metadata. The original code
        # overwrote it with a hard-coded 24000 right after reading it.
        self.sample_rate = int(meta["sample_rate"])
        print(list(self.voices.keys()))

        # The style is indexed by the number of tokens, so the longest
        # supported sequence is one less than the first axis of a voice.
        self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1

    def _english_to_tokens(self, word: str, punctuations: str) -> str:
        """Convert one whitespace-delimited ASCII word to a token string.

        Leading punctuation marks are emitted one by one, each followed by a
        space. Trailing punctuation marks are split off, the remaining stem
        is looked up in the lexicon (falling back to espeak-ng), and the
        trailing marks and a space are appended.
        """
        out = ""
        while word and word[0] in punctuations:
            out += word[0] + " "
            word = word[1:]

        if not word:
            return out

        # word[0] is not punctuation here, so the stem is never empty.
        stem = word.rstrip(punctuations)
        trailing = word[len(stem) :]

        if stem in self.word2tokens:
            out += self.word2tokens[stem]
        else:
            # Previously a word followed by punctuation was dropped when it
            # was not in the lexicon; it now uses the same fallback as a
            # bare unknown word.
            print(f"Use espeak-ng for word {stem}")
            out += "".join(phonemize_espeak(stem, "en-us")[0])

        return out + trailing + " "

    def __call__(self, text: str, voice: str):
        """Synthesize ``text`` with the speaker ``voice``.

        Returns:
          The first output of the model for the given tokens, style and a
          speed of 1.0.
        """
        punctuations = ';:,.!?-…()"“”'
        text = text.lower()

        tokens = ""

        # Split the text into runs of CJK characters and runs of ASCII.
        for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text):
            if ord(t[0]) < 0x7F:
                for w in t.split():
                    tokens += self._english_to_tokens(w, punctuations)
            else:
                # Chinese
                for w in jieba.cut(t):
                    if w in self.word2tokens:
                        tokens += self.word2tokens[w]
                    else:
                        # Fall back to looking up each character separately.
                        for i in w:
                            if i in self.word2tokens:
                                tokens += self.word2tokens[i]
                            else:
                                print(f"skip {i}")

        token_ids = [self.token2id[i] for i in tokens]
        token_ids = token_ids[: self.max_len]

        # The style vector is selected by the number of tokens.
        style = self.voices[voice][len(token_ids)]

        token_ids = [0, *token_ids, 0]
        token_ids = np.array([token_ids], dtype=np.int64)

        speed = np.array([1.0], dtype=np.float32)

        audio = self.model.run(
            [
                self.model.get_outputs()[0].name,
            ],
            {
                self.model.get_inputs()[0].name: token_ids,
                self.model.get_inputs()[1].name: style,
                self.model.get_inputs()[2].name: speed,
            },
        )[0]
        return audio
  173 +
  174 +
def main():
    """Synthesize a mixed Chinese/English sentence and report the timing.

    The audio is generated with the voice zf_001, written to a 16-bit PCM
    wave file, and the elapsed time, audio duration and real-time factor
    are printed.
    """
    model = OnnxModel(
        model_filename="./kokoro.onnx",
        tokens="./tokens.txt",
        lexicon="./lexicon-us-en.txt,./lexicon-zh.txt",
        voices_bin="./voices.bin",
    )
    text = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?"

    text = text.lower()

    voice = "zf_001"
    began = time.time()
    audio = model(text, voice=voice)
    elapsed_seconds = time.time() - began

    audio_duration = len(audio) / model.sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    filename = f"kokoro_v1.1_{voice}_zh_en.wav"
    sf.write(
        filename,
        audio,
        samplerate=model.sample_rate,
        subtype="PCM_16",
    )
    print(f" Saved to {filename}")
    print(f" Elapsed seconds: {elapsed_seconds:.3f}")
    print(f" Audio duration in seconds: {audio_duration:.3f}")
    print(f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")
  206 +
  207 +
  208 +if __name__ == "__main__":
  209 + main()
@@ -128,15 +128,19 @@ class KokoroMultiLangLexicon::Impl { @@ -128,15 +128,19 @@ class KokoroMultiLangLexicon::Impl {
128 } 128 }
129 129
130 for (const auto &ids : ids_vec) { 130 for (const auto &ids : ids_vec) {
131 - if (ids.size() > 4) { 131 + if (ids.size() > 10 + 2) {
132 ans.emplace_back(ids); 132 ans.emplace_back(ids);
133 } else { 133 } else {
134 if (ans.empty()) { 134 if (ans.empty()) {
135 ans.emplace_back(ids); 135 ans.emplace_back(ids);
136 } else { 136 } else {
137 - ans.back().tokens.back() = ids[1];  
138 - ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,  
139 - ids.end()); 137 + if (ans.back().tokens.size() + ids.size() < 50) {
  138 + ans.back().tokens.back() = ids[1];
  139 + ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
  140 + ids.end());
  141 + } else {
  142 + ans.emplace_back(ids);
  143 + }
140 } 144 }
141 } 145 }
142 } 146 }
@@ -33,8 +33,8 @@ GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const { @@ -33,8 +33,8 @@ GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
33 if (scale == 1) { 33 if (scale == 1) {
34 return *this; 34 return *this;
35 } 35 }
36 - // if the interval is larger than 0.6 second, then we assume it is a pause  
37 - int32_t threshold = static_cast<int32_t>(sample_rate * 0.6); 36 + // if the interval is larger than 0.2 second, then we assume it is a pause
  37 + int32_t threshold = static_cast<int32_t>(sample_rate * 0.2);
38 38
39 std::vector<SilenceInterval> intervals; 39 std::vector<SilenceInterval> intervals;
40 int32_t num_samples = static_cast<int32_t>(samples.size()); 40 int32_t num_samples = static_cast<int32_t>(samples.size());