Fangjun Kuang
Committed by GitHub

Export Kokoro 1.0 to sherpa-onnx (#1788)

@@ -4,6 +4,7 @@ on: @@ -4,6 +4,7 @@ on:
4 push: 4 push:
5 branches: 5 branches:
6 - export-kokoro 6 - export-kokoro
  7 + - kokoro-1.0-2
7 8
8 workflow_dispatch: 9 workflow_dispatch:
9 10
@@ -14,12 +15,13 @@ concurrency: @@ -14,12 +15,13 @@ concurrency:
14 jobs: 15 jobs:
15 export-kokoro-to-onnx: 16 export-kokoro-to-onnx:
16 if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' 17 if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
17 - name: export kokoro 18 + name: export kokoro ${{ matrix.version }}
18 runs-on: ${{ matrix.os }} 19 runs-on: ${{ matrix.os }}
19 strategy: 20 strategy:
20 fail-fast: false 21 fail-fast: false
21 matrix: 22 matrix:
22 os: [ubuntu-latest] 23 os: [ubuntu-latest]
  24 + version: ["0.19", "1.0"]
23 python-version: ["3.10"] 25 python-version: ["3.10"]
24 26
25 steps: 27 steps:
@@ -33,7 +35,7 @@ jobs: @@ -33,7 +35,7 @@ jobs:
33 - name: Install Python dependencies 35 - name: Install Python dependencies
34 shell: bash 36 shell: bash
35 run: | 37 run: |
36 - pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html 38 + pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
37 39
38 - name: Run 40 - name: Run
39 shell: bash 41 shell: bash
@@ -42,9 +44,16 @@ jobs: @@ -42,9 +44,16 @@ jobs:
42 tar xf espeak-ng-data.tar.bz2 44 tar xf espeak-ng-data.tar.bz2
43 rm espeak-ng-data.tar.bz2 45 rm espeak-ng-data.tar.bz2
44 cd scripts/kokoro 46 cd scripts/kokoro
  47 + v=${{ matrix.version }}
  48 + if [[ $v = "0.19" ]]; then
45 ./run.sh 49 ./run.sh
  50 + elif [[ $v == "1.0" ]]; then
  51 + cd v1.0
  52 + ./run.sh
  53 + fi
46 54
47 - - name: Collect results 55 + - name: Collect results ${{ matrix.version }}
  56 + if: matrix.version == '0.19'
48 shell: bash 57 shell: bash
49 run: | 58 run: |
50 src=scripts/kokoro 59 src=scripts/kokoro
@@ -53,7 +62,7 @@ jobs: @@ -53,7 +62,7 @@ jobs:
53 mkdir $d 62 mkdir $d
54 cp -a LICENSE $d/LICENSE 63 cp -a LICENSE $d/LICENSE
55 cp -a espeak-ng-data $d/ 64 cp -a espeak-ng-data $d/
56 - cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx 65 + cp -v $src/kokoro-v0_19.onnx $d/model.onnx
57 cp -v $src/voices.bin $d/ 66 cp -v $src/voices.bin $d/
58 cp -v $src/tokens.txt $d/ 67 cp -v $src/tokens.txt $d/
59 cp -v $src/README-new.md $d/README.md 68 cp -v $src/README-new.md $d/README.md
@@ -61,9 +70,31 @@ jobs: @@ -61,9 +70,31 @@ jobs:
61 tar cjfv $d.tar.bz2 $d 70 tar cjfv $d.tar.bz2 $d
62 rm -rf $d 71 rm -rf $d
63 72
64 - ls -h $.tar.bz2 73 + ls -lh $d.tar.bz2
  74 +
  75 + - name: Collect results ${{ matrix.version }}
  76 + if: matrix.version == '1.0'
  77 + shell: bash
  78 + run: |
  79 + src=scripts/kokoro/v1.0
  80 +
  81 + d=kokoro-multi-lang-v1_0
  82 + mkdir $d
  83 + cp -a LICENSE $d/LICENSE
  84 + cp -a espeak-ng-data $d/
  85 + cp -v $src/kokoro.onnx $d/model.onnx
  86 + cp -v $src/voices.bin $d/
  87 + cp -v $src/tokens.txt $d/
  88 + cp -v $src/lexicon*.txt $d/
  89 + cp -v $src/README.md $d/README.md
  90 + ls -lh $d/
  91 + tar cjfv $d.tar.bz2 $d
  92 + rm -rf $d
  93 +
  94 + ls -lh $d.tar.bz2
65 95
66 - - name: Publish to huggingface 96 + - name: Publish to huggingface ${{ matrix.version }}
  97 + if: matrix.version == '0.19'
67 env: 98 env:
68 HF_TOKEN: ${{ secrets.HF_TOKEN }} 99 HF_TOKEN: ${{ secrets.HF_TOKEN }}
69 uses: nick-fields/retry@v3 100 uses: nick-fields/retry@v3
@@ -92,9 +123,9 @@ jobs: @@ -92,9 +123,9 @@ jobs:
92 cp -a ../espeak-ng-data ./ 123 cp -a ../espeak-ng-data ./
93 mkdir -p test_wavs 124 mkdir -p test_wavs
94 125
95 - cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx 126 + cp -v ../scripts/kokoro/kokoro-v0_19.onnx ./model.onnx
96 127
97 - cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/ 128 + cp -v ../scripts/kokoro/kokoro-v0_19-*.wav ./test_wavs/
98 129
99 cp -v ../scripts/kokoro/tokens.txt . 130 cp -v ../scripts/kokoro/tokens.txt .
100 cp -v ../scripts/kokoro/voices.bin . 131 cp -v ../scripts/kokoro/voices.bin .
@@ -111,6 +142,55 @@ jobs: @@ -111,6 +142,55 @@ jobs:
111 git commit -m "add models" 142 git commit -m "add models"
112 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true 143 git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
113 144
  145 + - name: Publish to huggingface ${{ matrix.version }}
  146 + if: matrix.version == '1.0'
  147 + env:
  148 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  149 + uses: nick-fields/retry@v3
  150 + with:
  151 + max_attempts: 20
  152 + timeout_seconds: 200
  153 + shell: bash
  154 + command: |
  155 + git config --global user.email "csukuangfj@gmail.com"
  156 + git config --global user.name "Fangjun Kuang"
  157 +
  158 + rm -rf huggingface
  159 + export GIT_LFS_SKIP_SMUDGE=1
  160 + export GIT_CLONE_PROTECTION_ACTIVE=false
  161 +
  162 + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 huggingface
  163 + cd huggingface
  164 + rm -rf ./*
  165 + git fetch
  166 + git pull
  167 +
  168 + git lfs track "cmn_dict"
  169 + git lfs track "ru_dict"
  170 + git lfs track "*.wav"
  171 + git lfs track "lexicon*.txt"
  172 +
  173 + cp -a ../espeak-ng-data ./
  174 +
  175 + cp -v ../scripts/kokoro/v1.0/kokoro.onnx ./model.onnx
  176 +
  177 +
  178 + cp -v ../scripts/kokoro/v1.0/tokens.txt .
  179 + cp -v ../scripts/kokoro/v1.0/voices.bin .
  180 + cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
  181 + cp -v ../scripts/kokoro/v1.0/README.md ./README.md
  182 + cp -v ../LICENSE ./
  183 +
  184 + git lfs track "*.onnx"
  185 + git add .
  186 +
  187 + ls -lh
  188 +
  189 + git status
  190 +
  191 + git commit -m "add models"
  192 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true
  193 +
114 - name: Release 194 - name: Release
115 uses: svenstaro/upload-release-action@v2 195 uses: svenstaro/upload-release-action@v2
116 with: 196 with:
@@ -128,3 +128,7 @@ harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md @@ -128,3 +128,7 @@ harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md
128 matcha-icefall-zh-baker 128 matcha-icefall-zh-baker
129 matcha-icefall-en_US-ljspeech 129 matcha-icefall-en_US-ljspeech
130 kokoro-en-v0_19 130 kokoro-en-v0_19
  131 +*.pt
  132 +lexicon.txt
  133 +us_gold.json
  134 +us_silver.json
@@ -69,6 +69,14 @@ def main(): @@ -69,6 +69,14 @@ def main():
69 for k in keys: 69 for k in keys:
70 f.write(voices[k].tobytes()) 70 f.write(voices[k].tobytes())
71 71
  72 + speaker2id_str = ""
  73 + id2speaker_str = ""
  74 + sep = ""
  75 + for i, s in enumerate(keys):
  76 + speaker2id_str += f"{sep}{s}->{i}"
  77 + id2speaker_str += f"{sep}{i}->{s}"
  78 + sep = ","
  79 +
72 meta_data = { 80 meta_data = {
73 "model_type": "kokoro", 81 "model_type": "kokoro",
74 "language": "English", 82 "language": "English",
@@ -78,6 +86,8 @@ def main(): @@ -78,6 +86,8 @@ def main():
78 "voice": "en-us", 86 "voice": "en-us",
79 "style_dim": ",".join(map(str, voices[keys[0]].shape)), 87 "style_dim": ",".join(map(str, voices[keys[0]].shape)),
80 "n_speakers": len(keys), 88 "n_speakers": len(keys),
  89 + "speaker2id": speaker2id_str,
  90 + "id2speaker": id2speaker_str,
81 "speaker_names": ",".join(keys), 91 "speaker_names": ",".join(keys),
82 "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files", 92 "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
83 "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS", 93 "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
@@ -16,8 +16,8 @@ https://huggingface.co/hexgrad/Kokoro-82M/discussions/14 @@ -16,8 +16,8 @@ https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
16 EOF 16 EOF
17 17
18 files=( 18 files=(
19 -kokoro-v0_19_hf.onnx  
20 -# kokoro-v0_19.onnx 19 +# kokoro-v0_19_hf.onnx
  20 +kokoro-v0_19.onnx
21 # kokoro-quant.onnx 21 # kokoro-quant.onnx
22 # kokoro-quant-convinteger.onnx 22 # kokoro-quant-convinteger.onnx
23 voices.json 23 voices.json
@@ -30,14 +30,14 @@ for f in ${files[@]}; do @@ -30,14 +30,14 @@ for f in ${files[@]}; do
30 done 30 done
31 31
32 models=( 32 models=(
33 -# kokoro-v0_19 33 +kokoro-v0_19
34 # kokoro-quant 34 # kokoro-quant
35 # kokoro-quant-convinteger 35 # kokoro-quant-convinteger
36 -kokoro-v0_19_hf 36 +# kokoro-v0_19_hf
37 ) 37 )
38 38
39 for m in ${models[@]}; do 39 for m in ${models[@]}; do
40 - ./add-meta-data.py --model $m.onnx --voices ./voices.json 40 + ./add_meta_data.py --model $m.onnx --voices ./voices.json
41 done 41 done
42 42
43 ls -l 43 ls -l
  1 +config.json
  2 +*.json
  3 +*.txt
  4 +.add-meta-data.done
  5 +voices
  1 +# Introduction
  2 +
  3 +This directory is for kokoro v1.0
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import argparse
  6 +import json
  7 +from pathlib import Path
  8 +
  9 +import numpy as np
  10 +import onnx
  11 +import torch
  12 +
  13 +from generate_voices_bin import speaker2id
  14 +
  15 +
def main():
    """Rewrite the metadata of ./kokoro.onnx in place.

    The existing metadata entries are printed and removed, then replaced by
    the entries built below. The style dimension is taken from the shape of
    ./voices/af_alloy.pt and the speaker table from
    generate_voices_bin.speaker2id.
    """
    model = onnx.load("./kokoro.onnx")
    style = torch.load("./voices/af_alloy.pt", weights_only=True, map_location="cpu")

    # Comma-separated "name->id" / "id->name" pairs, in speaker2id order.
    speaker2id_str = ",".join(f"{name}->{idx}" for name, idx in speaker2id.items())
    id2speaker_str = ",".join(f"{idx}->{name}" for name, idx in speaker2id.items())

    meta_data = {
        "model_type": "kokoro",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 2,
        "voice": "en-us",
        "style_dim": ",".join(str(n) for n in style.shape),
        "n_speakers": len(speaker2id),
        "id2speaker": id2speaker_str,
        "speaker2id": speaker2id_str,
        "speaker_names": ",".join(str(name) for name in speaker2id),
        "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
        "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
        "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
        "maintainer": "k2-fsa",
        "comment": "This is Kokoro v1.0, a multilingual TTS model, supporting English, Chinese, French, Japanese etc.",
    }

    # Show the metadata the model shipped with before replacing it.
    print(model.metadata_props)

    # Remove every existing entry, last one first.
    while len(model.metadata_props) > 0:
        model.metadata_props.pop()

    # Every value is stored as a string.
    for name in meta_data:
        entry = model.metadata_props.add()
        entry.key = name
        entry.value = str(meta_data[name])
    print("--------------------")

    print(model.metadata_props)

    onnx.save(model, "./kokoro.onnx")
  61 +
  62 +
  63 +if __name__ == "__main__":
  64 + main()
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +import json
  5 +from pypinyin import phrases_dict, pinyin_dict
  6 +from misaki import zh
  7 +from typing import List, Tuple
  8 +
  9 +
def generate_english_lexicon(kind: str):
    """Build the English lexicon for one accent.

    Args:
      kind:
        Either "us" or "gb". Selects ./{kind}_gold.json and
        ./{kind}_silver.json in the current directory.

    Returns:
      A list of (lower-cased word, phoneme string) pairs. The user-defined
      words come first, followed by the words from the JSON files.
    """
    assert kind in ("us", "gb"), kind

    # If you want to add new words, please add them to
    # the user_defined dict.
    user_defined = {
        "Kokoro": "kˈOkəɹO",
        "Misaki": "misˈɑki",
    }
    user_defined_lower = {
        word.lower(): phones for word, phones in user_defined.items()
    }

    def _read_json(path):
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    gold = _read_json(f"./{kind}_gold.json")
    silver = _read_json(f"./{kind}_silver.json")

    # Words in the gold file have a higher priority than those in the silver
    # file, so the gold entries are applied on top of the silver ones.
    english = dict(silver)
    english.update(gold)

    lexicon = {}
    for word, entry in english.items():
        key = word.lower()

        if key in user_defined_lower:
            print(f"{word} already exist in the user defined dict. Skip adding")
            continue

        if not isinstance(entry, str):
            # A non-string entry must be a dict; its "DEFAULT" value is used.
            assert isinstance(entry, dict), (word, entry)
            assert "DEFAULT" in entry, (word, entry)
            entry = entry["DEFAULT"]

        lexicon[key] = entry

    return [*user_defined_lower.items(), *lexicon.items()]
  49 +
  50 +
def generate_chinese_lexicon():
    """Build the Chinese lexicon from pypinyin's dictionaries.

    Returns:
      A list of (text, phoneme string) pairs: first one entry per single
      character whose code point lies in 0x4E00..0x9FFF, then one entry per
      phrase in pypinyin's phrase dictionary. Phonemes come from misaki's
      ZHG2P.
    """
    g2p = zh.ZHG2P()

    # pinyin_dict is keyed by integer code point; keep only 0x4E00..0x9FFF.
    chars = [
        chr(code_point)
        for code_point in pinyin_dict.pinyin_dict
        if 0x4E00 <= code_point <= 0x9FFF
    ]

    entries = [(c, g2p(c)) for c in chars]
    entries.extend((phrase, g2p(phrase)) for phrase in phrases_dict.phrases_dict)
    return entries
  69 +
  70 +
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write a lexicon to a UTF-8 text file, one entry per line.

    Args:
      filename:
        Path of the file to create or overwrite.
      lexicon:
        (word, phones) pairs. Each character of phones is written as a
        separate space-delimited token after the word.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            f"{word} {' '.join(phones)}\n" for word, phones in lexicon
        )
  76 +
  77 +
def main():
    """Generate the US English, GB English and Chinese lexicon files."""
    # All three lexicons are built before any file is written.
    outputs = (
        ("lexicon-us-en.txt", generate_english_lexicon("us")),
        ("lexicon-gb-en.txt", generate_english_lexicon("gb")),
        ("lexicon-zh.txt", generate_chinese_lexicon()),
    )

    for path, entries in outputs:
        save(path, entries)
  86 +
  87 +
  88 +if __name__ == "__main__":
  89 + main()
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import json
  6 +
  7 +
def main():
    """Write tokens.txt from the "vocab" table of config.json.

    Each output line is "<token> <id>". Both files live in the current
    directory.
    """
    # Read the config explicitly as UTF-8, matching how tokens.txt is written,
    # so the result does not depend on the platform's default text encoding.
    with open("config.json", encoding="utf-8") as f:
        vocab = json.load(f)["vocab"]

    with open("tokens.txt", "w", encoding="utf-8") as f:
        for token, idx in vocab.items():
            f.write(f"{token} {idx}\n")
  16 +
  17 +
  18 +if __name__ == "__main__":
  19 + main()
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +import torch
  4 +from pathlib import Path
  5 +
  6 +
# Speaker names in speaker-id order: the position in this tuple is the id.
_SPEAKER_NAMES = (
    "af_alloy",
    "af_aoede",
    "af_bella",
    "af_heart",
    "af_jessica",
    "af_kore",
    "af_nicole",
    "af_nova",
    "af_river",
    "af_sarah",
    "af_sky",
    "am_adam",
    "am_echo",
    "am_eric",
    "am_fenrir",
    "am_liam",
    "am_michael",
    "am_onyx",
    "am_puck",
    "am_santa",
    "bf_alice",
    "bf_emma",
    "bf_isabella",
    "bf_lily",
    "bm_daniel",
    "bm_fable",
    "bm_george",
    "bm_lewis",
    "ef_dora",
    "em_alex",
    "ff_siwis",
    "hf_alpha",
    "hf_beta",
    "hm_omega",
    "hm_psi",
    "if_sara",
    "im_nicola",
    "jf_alpha",
    "jf_gongitsune",
    "jf_nezumi",
    "jf_tebukuro",
    "jm_kumo",
    "pf_dora",
    "pm_alex",
    "pm_santa",
    "zf_xiaobei",
    "zf_xiaoni",
    "zf_xiaoxiao",
    "zf_xiaoyi",
    "zm_yunjian",
    "zm_yunxi",
    "zm_yunxia",
    "zm_yunyang",
)

# Maps speaker id -> speaker name.
id2speaker = dict(enumerate(_SPEAKER_NAMES))

# Inverse table: speaker name -> speaker id.
speaker2id = {name: i for i, name in id2speaker.items()}
  64 +
  65 +
def main():
    """Concatenate all speaker style tensors into ./voices.bin.

    Tensors are written as raw bytes in id2speaker order. Nothing is done
    when ./voices.bin already exists.

    Raises:
      FileNotFoundError: if the .pt file of any speaker cannot be found.
    """
    if Path("./voices.bin").is_file():
        print("./voices.bin exists - skip")
        return

    # Locate every input before creating voices.bin, so a missing file cannot
    # leave a partial voices.bin behind that the check above would later
    # accept. The current directory is tried first (the original location),
    # then ./voices, which is where run.sh downloads the files.
    paths = []
    for speaker in id2speaker.values():
        candidates = (Path(f"{speaker}.pt"), Path("voices") / f"{speaker}.pt")
        found = next((p for p in candidates if p.is_file()), None)
        if found is None:
            raise FileNotFoundError(
                f"Cannot find {speaker}.pt in . or ./voices"
            )
        paths.append(found)

    with open("voices.bin", "wb") as f:
        for path in paths:
            m = torch.load(
                path,
                weights_only=True,
                map_location="cpu",
            ).numpy()
            # m.shape (510, 1, 256)

            f.write(m.tobytes())
  81 +
  82 +
  83 +if __name__ == "__main__":
  84 + main()
  1 +#!/usr/bin/env bash
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
set -ex

# Download kokoro.onnx and config.json unless they are already present.
#
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page under the target name; such a file would otherwise
# satisfy the existence checks below on every later run.
if [ ! -f kokoro.onnx ]; then
  # see https://github.com/taylorchu/kokoro-onnx/releases
  curl --fail -SL -O https://github.com/taylorchu/kokoro-onnx/releases/download/v0.2.0/kokoro.onnx
fi

if [ ! -f config.json ]; then
  # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
  curl --fail -SL -O https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/config.json
fi
  15 +
# see https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L83
# and
# https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
#
# af -> American female
# am -> American male
# bf -> British female
# bm -> British male
voices=(
af_alloy
af_aoede
af_bella
af_heart
af_jessica
af_kore
af_nicole
af_nova
af_river
af_sarah
af_sky
am_adam
am_echo
am_eric
am_fenrir
am_liam
am_michael
am_onyx
am_puck
am_santa
bf_alice
bf_emma
bf_isabella
bf_lily
bm_daniel
bm_fable
bm_george
bm_lewis
ef_dora
em_alex
ff_siwis
hf_alpha
hf_beta
hm_omega
hm_psi
if_sara
im_nicola
jf_alpha
jf_gongitsune
jf_nezumi
jf_tebukuro
jm_kumo
pf_dora
pm_alex
pm_santa
zf_xiaobei # Northeastern Mandarin (Dongbei) dialect
zf_xiaoni
zf_xiaoxiao
zf_xiaoyi
zm_yunjian
zm_yunxi
zm_yunxia
zm_yunyang
)

mkdir -p voices

# Fetch each voice file into ./voices unless it is already there.
# --fail stops curl from saving an HTTP error page as voices/<name>.pt,
# which the existence check would then treat as a finished download.
for v in "${voices[@]}"; do
  if [ ! -f "voices/$v.pt" ]; then
    curl --fail -SL --output "voices/$v.pt" "https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/voices/$v.pt"
  fi
done
  87 +
# Add metadata to kokoro.onnx once; the marker file records that it was done.
if [ ! -f ./.add-meta-data.done ]; then
  python3 ./add_meta_data.py
  touch ./.add-meta-data.done
fi

# Download the misaki gold/silver dictionaries read by generate_lexicon.py.
# --fail stops curl from saving an HTTP error page as the JSON file.
for f in us_gold.json us_silver.json gb_gold.json gb_silver.json; do
  if [ ! -f "$f" ]; then
    curl --fail -SL -O "https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/$f"
  fi
done

if [ ! -f ./tokens.txt ]; then
  ./generate_tokens.py
fi

# generate_lexicon.py writes these three files (it never writes lexicon.txt),
# so they are what decides whether the lexicon has to be regenerated.
if [ ! -f ./lexicon-us-en.txt ] || [ ! -f ./lexicon-gb-en.txt ] || [ ! -f ./lexicon-zh.txt ]; then
  ./generate_lexicon.py
fi

if [ ! -f ./voices.bin ]; then
  ./generate_voices_bin.py
fi

./test.py
ls -lh
  1 +#!/usr/bin/env python3
  2 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
  3 +
  4 +
  5 +import re
  6 +import time
  7 +from typing import Dict, List
  8 +
  9 +import jieba
  10 +import numpy as np
  11 +import onnxruntime as ort
  12 +import soundfile as sf
  13 +import torch
  14 +from misaki import zh
  15 +
  16 +try:
  17 + from piper_phonemize import phonemize_espeak
  18 +except Exception as ex:
  19 + raise RuntimeError(
  20 + f"{ex}\nPlease run\n"
  21 + "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
  22 + )
  23 +
  24 +
def show(filename):
    """Print the inputs and outputs of an ONNX model.

    Args:
      filename:
        Path to the ONNX model file.
    """
    opts = ort.SessionOptions()
    opts.log_severity_level = 3
    session = ort.InferenceSession(filename, opts)

    for node in session.get_inputs():
        print(node)

    print("-----")

    for node in session.get_outputs():
        print(node)
  36 +
  37 +
  38 +"""
  39 +NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
  40 +NodeArg(name='style', type='tensor(float)', shape=[1, 256])
  41 +NodeArg(name='speed', type='tensor(float)', shape=[1])
  42 +-----
  43 +NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
  44 +"""
  45 +
  46 +
def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
    """Load per-speaker style embeddings from a raw binary file.

    Args:
      speaker_names:
        Speaker names, in the order their embeddings are stored in the file.
      dim:
        Shape of the embedding of a single speaker.
      voices_bin:
        Path to the binary file; its bytes are reinterpreted as float32.

    Returns:
      A dict mapping each speaker name to its embedding array of shape dim.
    """
    raw = np.fromfile(voices_bin, dtype="uint8")
    embedding = raw.view(np.float32).reshape(len(speaker_names), *dim)
    print("embedding.shape", embedding.shape)

    return {name: embedding[i] for i, name in enumerate(speaker_names)}
  59 +
  60 +
def load_tokens(filename: str) -> Dict[str, int]:
    """Read a token table with one "<token> <id>" entry per line.

    A line holding only an id is taken to be the entry of the space token,
    since the space itself is removed when the line is split.

    Args:
      filename:
        Path to the UTF-8 encoded token file.

    Returns:
      A dict mapping each token to its integer id.
    """
    token2id = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) != 2:
                # Only the space token may appear without a visible symbol.
                assert len(fields) == 1, (len(fields), line)
                token2id[" "] = int(fields[0])
                continue

            token2id[fields[0]] = int(fields[1])
    return token2id
  73 +
  74 +
def load_lexicon(filename: str) -> Dict[str, str]:
    """Load one or more lexicon files into a word -> phoneme-string table.

    Each line is "<word> <token> <token> ...". The tokens are joined back
    into one string without whitespace. When a word occurs more than once,
    the last occurrence wins.

    Args:
      filename:
        A single path, or several paths separated by commas.

    Returns:
      A dict mapping each word to its phoneme string.
    """
    ans = dict()
    for lexicon in filename.split(","):
        print(lexicon)
        with open(lexicon, encoding="utf-8") as f:
            for line in f:
                fields = line.strip().split(" ", maxsplit=1)
                if len(fields) != 2:
                    # Blank line, or a word without any pronunciation:
                    # skip it instead of failing on the unpacking below.
                    if fields[0]:
                        print(f"Skip {fields[0]}: no pronunciation in {lexicon}")
                    continue
                w, tokens = fields
                ans[w] = "".join(tokens.split())
    return ans
  84 +
  85 +
class OnnxModel:
    """Kokoro text-to-speech wrapper around an ONNX Runtime session.

    Text is turned into phoneme tokens with a lexicon lookup. English words
    missing from the lexicon fall back to espeak-ng, and Chinese text is
    segmented with jieba before the lookup.
    """

    def __init__(self, model_filename: str, tokens: str, lexicon: str, voices_bin: str):
        """
        Args:
          model_filename:
            Path to the ONNX model. Its metadata must provide style_dim,
            speaker_names and sample_rate.
          tokens:
            Path to tokens.txt.
          lexicon:
            One lexicon path, or several separated by commas.
          voices_bin:
            Path to voices.bin holding the speaker style embeddings.
        """
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts
        self.model = ort.InferenceSession(
            model_filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )
        self.token2id = load_tokens(tokens)
        self.word2tokens = load_lexicon(lexicon)

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)
        dim = list(map(int, meta["style_dim"].split(",")))
        speaker_names = meta["speaker_names"].split(",")
        self.voices = load_voices(
            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
        )
        # The sample rate comes from the model metadata only; it is no longer
        # overwritten by a hard-coded constant afterwards.
        self.sample_rate = int(meta["sample_rate"])
        print(list(self.voices.keys()))

        # The style vector is selected by the number of tokens, so the token
        # count must stay a valid index into the first axis of an embedding.
        self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1

    def _english_word_to_tokens(self, w: str, punctuations: str) -> str:
        """Convert one whitespace-delimited ASCII word to a token string.

        Leading punctuation marks are each emitted followed by a space.
        Trailing punctuation is emitted right after the word's phonemes. The
        result ends with a space unless the word was punctuation only.
        """
        out = ""
        while w and w[0] in punctuations:
            out += w[0] + " "
            w = w[1:]

        if not w:
            return out

        # w[0] is not punctuation here, so this loop cannot empty w.
        trailing = ""
        while w[-1] in punctuations:
            trailing = w[-1] + trailing
            w = w[:-1]

        if w in self.word2tokens:
            out += self.word2tokens[w]
        else:
            # Applies to words followed by punctuation as well; they used to
            # be dropped silently when the stem was not in the lexicon.
            print(f"Use espeak-ng for word {w}")
            out += "".join(phonemize_espeak(w, "en-us")[0])

        return out + trailing + " "

    def __call__(self, text: str, voice: str):
        """Synthesize text with the given voice.

        Args:
          text:
            Input text. It is lower-cased; only runs of ASCII characters and
            of characters in U+4E00..U+9FFF are processed.
          voice:
            A speaker name, i.e. a key of self.voices.

        Returns:
          The first output of the model for the generated token sequence.

        Raises:
          KeyError: if voice is unknown or a produced token is not in the
            token table.
        """
        punctuations = ';:,.!?-…()"“”'
        text = text.lower()

        tokens = ""

        for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text):
            if ord(t[0]) < 0x7F:
                for w in t.split():
                    tokens += self._english_word_to_tokens(w, punctuations)
            else:
                # Chinese
                for w in jieba.cut(t):
                    if w in self.word2tokens:
                        tokens += self.word2tokens[w]
                    else:
                        # Fall back to a character-by-character lookup.
                        for i in w:
                            if i in self.word2tokens:
                                tokens += self.word2tokens[i]
                            else:
                                print(f"skip {i}")

        token_ids = [self.token2id[i] for i in tokens]
        token_ids = token_ids[: self.max_len]

        # Pick the style vector matching the (truncated) number of tokens.
        style = self.voices[voice][len(token_ids)]

        # Token id 0 is added at both ends of the sequence.
        token_ids = [0, *token_ids, 0]
        token_ids = np.array([token_ids], dtype=np.int64)

        speed = np.array([1.0], dtype=np.float32)

        audio = self.model.run(
            [
                self.model.get_outputs()[0].name,
            ],
            {
                self.model.get_inputs()[0].name: token_ids,
                self.model.get_inputs()[1].name: style,
                self.model.get_inputs()[2].name: speed,
            },
        )[0]
        return audio
  176 +
  177 +
def main():
    """Synthesize a mixed Chinese/English sentence and save it as a wav file.

    Loads the model, tokens, lexicons and voices from the current directory,
    runs the "bf_alice" voice once, writes the audio as 16-bit PCM and prints
    the elapsed time, the audio duration and the real-time factor.
    """
    model = OnnxModel(
        model_filename="./kokoro.onnx",
        tokens="./tokens.txt",
        lexicon="./lexicon-gb-en.txt,./lexicon-zh.txt",
        voices_bin="./voices.bin",
    )
    sentence = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?"
    sentence = sentence.lower()

    voice = "bf_alice"

    # Time the synthesis call only.
    t0 = time.time()
    audio = model(sentence, voice=voice)
    t1 = time.time()

    elapsed_seconds = t1 - t0
    audio_duration = len(audio) / model.sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    filename = f"kokoro_v1.0_{voice}_zh_en.wav"
    sf.write(
        filename,
        audio,
        samplerate=model.sample_rate,
        subtype="PCM_16",
    )

    print(f" Saved to {filename}")
    print(f" Elapsed seconds: {elapsed_seconds:.3f}")
    print(f" Audio duration in seconds: {audio_duration:.3f}")
    print(f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")
  209 +
  210 +
  211 +if __name__ == "__main__":
  212 + main()