Fangjun Kuang
Committed by GitHub

Add Kokoro v1.1-zh (#1942)

... ... @@ -3,7 +3,7 @@ name: export-kokoro-to-onnx
on:
push:
branches:
- export-kokoro
- export-kokoro-2
workflow_dispatch:
... ... @@ -20,7 +20,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
version: ["0.19", "1.0"]
version: ["0.19", "1.0", "1.1-zh"]
python-version: ["3.10"]
steps:
... ... @@ -34,7 +34,7 @@ jobs:
- name: Install Python dependencies
shell: bash
run: |
pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
- name: Run
shell: bash
... ... @@ -49,9 +49,15 @@ jobs:
elif [[ $v == "1.0" ]]; then
cd v1.0
./run.sh
elif [[ $v == "1.1-zh" ]]; then
cd v1.1-zh
./run.sh
else
echo "Unknown version $v"
exit 1
fi
- name: Collect results ${{ matrix.version }}
- name: Collect results 0.19
if: matrix.version == '0.19'
shell: bash
run: |
... ... @@ -71,7 +77,7 @@ jobs:
ls -lh $d.tar.bz2
- name: Collect results ${{ matrix.version }}
- name: Collect results 1.0
if: matrix.version == '1.0'
shell: bash
run: |
... ... @@ -87,7 +93,7 @@ jobs:
d=kokoro-multi-lang-v1_0
mkdir $d
cp -a LICENSE $d/LICENSE
cp -v LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro.onnx $d/model.onnx
cp -v $src/voices.bin $d/
... ... @@ -105,7 +111,63 @@ jobs:
ls -lh $d.tar.bz2
- name: Publish to huggingface ${{ matrix.version }}
- name: Collect results 1.1-zh
if: matrix.version == '1.1-zh'
shell: bash
run: |
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
tar xvf dict.tar.bz2
rm dict.tar.bz2
curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
src=scripts/kokoro/v1.1-zh
d=kokoro-multi-lang-v1_1
mkdir $d
cp -v LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro.onnx $d/model.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/lexicon*.txt $d/
cp -v $src/README.md $d/README.md
cp -av dict $d/
cp -v ./*.fst $d/
ls -lh $d/
echo "---"
ls -lh $d/dict
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -lh $d.tar.bz2
d=kokoro-int8-multi-lang-v1_1
mkdir $d
cp -v LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro.int8.onnx $d/model.int8.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/lexicon*.txt $d/
cp -v $src/README.md $d/README.md
cp -av dict $d/
cp -v ./*.fst $d/
ls -lh $d/
echo "---"
ls -lh $d/dict
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -lh $d.tar.bz2
echo "---"
ls -lh *.tar.bz2
- name: Publish to huggingface 0.19
if: matrix.version == '0.19'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
... ... @@ -154,7 +216,7 @@ jobs:
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
- name: Publish to huggingface ${{ matrix.version }}
- name: Publish to huggingface 1.0
if: matrix.version == '1.0'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
... ... @@ -205,6 +267,108 @@ jobs:
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true
- name: Publish to huggingface 1.1-zh
if: matrix.version == '1.1-zh'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 huggingface
cd huggingface
rm -rf ./*
git fetch
git pull
git lfs track "cmn_dict"
git lfs track "ru_dict"
git lfs track "*.wav"
git lfs track "lexicon*.txt"
cp -a ../espeak-ng-data ./
cp -v ../scripts/kokoro/v1.1-zh/kokoro.onnx ./model.onnx
cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md
cp -v ../LICENSE ./
cp -av ../dict ./
cp -v ../*.fst ./
git lfs track "*.onnx"
git add .
ls -lh
git status
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 main || true
- name: Publish to huggingface 1.1-zh-int8
if: matrix.version == '1.1-zh'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 huggingface
cd huggingface
rm -rf ./*
git fetch
git pull
git lfs track "cmn_dict"
git lfs track "ru_dict"
git lfs track "*.wav"
git lfs track "lexicon*.txt"
cp -a ../espeak-ng-data ./
cp -v ../scripts/kokoro/v1.1-zh/kokoro.int8.onnx ./model.int8.onnx
cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md
cp -v ../LICENSE ./
cp -av ../dict ./
cp -v ../*.fst ./
git lfs track "*.onnx"
git add .
ls -lh
git status
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 main || true
- name: Release
if: github.repository_owner == 'csukuangfj'
uses: svenstaro/upload-release-action@v2
... ...
... ... @@ -438,7 +438,17 @@ def get_kokoro_models() -> List[TtsModel]:
model_dir="kokoro-multi-lang-v1_0",
model_name="model.onnx",
lang="en",
)
),
TtsModel(
model_dir="kokoro-multi-lang-v1_1",
model_name="model.onnx",
lang="en",
),
TtsModel(
model_dir="kokoro-int8-multi-lang-v1_1",
model_name="model.int8.onnx",
lang="en",
),
]
for m in multi_lingual_models:
m.data_dir = f"{m.model_dir}/espeak-ng-data"
... ...
voices.json
voices.bin
README-new.md
lexicon-*.txt
config.json
... ...
... ... @@ -2,11 +2,6 @@
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import argparse
import json
from pathlib import Path
import numpy as np
import onnx
import torch
... ...
... ... @@ -4,19 +4,6 @@
import json
from typing import List, Tuple
from misaki import zh
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
user_dict = {
"还田": [["huan2"], ["tian2"]],
"行长": [["hang2"], ["zhang3"]],
"银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
load_phrases_dict(user_dict)
phrases_dict.phrases_dict.update(**user_dict)
def generate_english_lexicon(kind: str):
assert kind in ("us", "gb"), kind
... ... @@ -59,28 +46,6 @@ def generate_english_lexicon(kind: str):
return list(user_defined_lower.items()) + list(lexicon.items())
def generate_chinese_lexicon():
word_dict = pinyin_dict.pinyin_dict
phrases = phrases_dict.phrases_dict
g2p = zh.ZHG2P()
lexicon = []
for key in word_dict:
if not (0x4E00 <= key <= 0x9FFF):
continue
w = chr(key)
tokens: str = g2p.word2ipa(w)
tokens = tokens.replace(chr(815), "")
lexicon.append((w, tokens))
for key in phrases:
tokens: str = g2p.word2ipa(key)
tokens = tokens.replace(chr(815), "")
lexicon.append((key, tokens))
return lexicon
def save(filename: str, lexicon: List[Tuple[str, str]]):
with open(filename, "w", encoding="utf-8") as f:
for word, phones in lexicon:
... ... @@ -91,11 +56,9 @@ def save(filename: str, lexicon: List[Tuple[str, str]]):
def main():
us = generate_english_lexicon("us")
gb = generate_english_lexicon("gb")
zh = generate_chinese_lexicon()
save("lexicon-us-en.txt", us)
save("lexicon-gb-en.txt", gb)
save("lexicon-zh.txt", zh)
if __name__ == "__main__":
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
from typing import List, Tuple
from misaki import zh
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
user_dict = {
"还田": [["huan2"], ["tian2"]],
"行长": [["hang2"], ["zhang3"]],
"银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
load_phrases_dict(user_dict)
phrases_dict.phrases_dict.update(**user_dict)
def generate_chinese_lexicon():
word_dict = pinyin_dict.pinyin_dict
phrases = phrases_dict.phrases_dict
g2p = zh.ZHG2P()
lexicon = []
for key in word_dict:
if not (0x4E00 <= key <= 0x9FFF):
continue
w = chr(key)
tokens: str = g2p.word2ipa(w)
tokens = tokens.replace(chr(815), "")
lexicon.append((w, tokens))
for key in phrases:
tokens: str = g2p.word2ipa(key)
tokens = tokens.replace(chr(815), "")
lexicon.append((key, tokens))
return lexicon
def save(filename: str, lexicon: List[Tuple[str, str]]):
with open(filename, "w", encoding="utf-8") as f:
for word, phones in lexicon:
tokens = " ".join(list(phones))
f.write(f"{word} {tokens}\n")
def main():
zh = generate_chinese_lexicon()
save("lexicon-zh.txt", zh)
if __name__ == "__main__":
main()
... ...
... ... @@ -111,7 +111,11 @@ if [ ! -f ./tokens.txt ]; then
fi
if [ ! -f ./lexicon-zh.txt ]; then
./generate_lexicon.py
./generate_lexicon_zh.py
fi
if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
./generate_lexicon_en.py
fi
if [ ! -f ./voices.bin ]; then
... ...
... ... @@ -10,8 +10,6 @@ import jieba
import numpy as np
import onnxruntime as ort
import soundfile as sf
import torch
from misaki import zh
try:
from piper_phonemize import phonemize_espeak
... ... @@ -114,7 +112,6 @@ class OnnxModel:
def __call__(self, text: str, voice: str):
punctuations = ';:,.!?-…()"“”'
text = text.lower()
g2p = zh.ZHG2P()
tokens = ""
... ...
# Introduction
This directory contains scripts for exporting Kokoro v1.1-zh to ONNX and for testing the exported model.
See also https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh
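A minimal sketch of building everything locally, assuming the Python dependencies installed by the CI workflow (export-kokoro-to-onnx) are available:

```bash
cd scripts/kokoro/v1.1-zh
# downloads the checkpoint and voice files, exports kokoro.onnx, adds metadata,
# creates kokoro.int8.onnx, and generates tokens.txt, lexicon*.txt and voices.bin
./run.sh
```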
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import onnx
import torch
from generate_voices_bin import speaker2id
def main():
model = onnx.load("./kokoro.onnx")
style = torch.load("./voices/zf_001.pt", weights_only=True, map_location="cpu")
id2speaker_str = ""
speaker2id_str = ""
sep = ""
for s, i in speaker2id.items():
speaker2id_str += f"{sep}{s}->{i}"
id2speaker_str += f"{sep}{i}->{s}"
sep = ","
meta_data = {
"model_type": "kokoro",
"language": "multi-lang, e.g., English, Chinese",
"has_espeak": 1,
"sample_rate": 24000,
"version": 2,
"voice": "en-us",
"style_dim": ",".join(map(str, style.shape)),
"n_speakers": len(speaker2id),
"id2speaker": id2speaker_str,
"speaker2id": speaker2id_str,
"speaker_names": ",".join(map(str, speaker2id.keys())),
"model_url": "https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh",
"maintainer": "k2-fsa",
"comment": "This is Kokoro v1.1-zh, a multilingual TTS model, supporting English, Chinese.",
}
print(model.metadata_props)
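# drop any metadata entries that are already present before adding ours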
while len(model.metadata_props):
model.metadata_props.pop()
for key, value in meta_data.items():
meta = model.metadata_props.add()
meta.key = key
meta.value = str(value)
print("--------------------")
print(model.metadata_props)
onnx.save(model, "./kokoro.onnx")
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env python3
import argparse
import onnxruntime
from onnxruntime.quantization import QuantType, quantize_dynamic
def show(filename):
session_opts = onnxruntime.SessionOptions()
session_opts.log_severity_level = 3
sess = onnxruntime.InferenceSession(filename, session_opts)
for i in sess.get_inputs():
print(i)
print("-----")
for i in sess.get_outputs():
print(i)
"""
NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
NodeArg(name='style', type='tensor(float)', shape=[1, 256])
NodeArg(name='speed', type='tensor(float)', shape=[1])
-----
NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
"""
def main():
show("./kokoro.onnx")
quantize_dynamic(
model_input="kokoro.onnx",
model_output="kokoro.int8.onnx",
# op_types_to_quantize=["MatMul"],
weight_type=QuantType.QUInt8,
)
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env python3
import json
import torch
from kokoro import KModel
from kokoro.model import KModelForONNX
@torch.no_grad()
def main():
with open("config.json") as f:
config = json.load(f)
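# disable_complex=True selects a real-valued STFT implementation, since ONNX
# export cannot handle complex tensors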
model = (
KModel(
repo_id="not-used-any-value-is-ok",
model="kokoro-v1_1-zh.pth",
config=config,
disable_complex=True,
)
.to("cpu")
.eval()
)
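# dummy inputs for export: 48 random token ids padded with 0 on both sides,
# a random 256-dim style vector, and a random speed value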
x = torch.randint(1, 100, (48,)).numpy()
x = torch.LongTensor([[0, *x, 0]])
style = torch.rand(1, 256, dtype=torch.float32)
speed = torch.rand(1)
print(x.shape, x.dtype)
print(style.shape, style.dtype)
print(speed, speed.dtype)
model2 = KModelForONNX(model)
torch.onnx.export(
model2,
(x, style, speed),
"kokoro.onnx",
input_names=["tokens", "style", "speed"],
output_names=["audio"],
dynamic_axes={
"tokens": {1: "sequence_length"},
"audio": {0: "audio_length"},
},
opset_version=14, # minimum working version for this kokoro model is 14
)
if __name__ == "__main__":
main()
... ...
../v1.0/generate_lexicon_en.py
\ No newline at end of file
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import re
from typing import List, Tuple
from misaki import zh
from misaki.token import MToken
from misaki.zh_frontend import ZH_MAP
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
user_dict = {
"还田": [["huan2"], ["tian2"]],
"行长": [["hang2"], ["zhang3"]],
"银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
load_phrases_dict(user_dict)
phrases_dict.phrases_dict.update(**user_dict)
def process_text(self, text, with_erhua=True):
"""
This function is modified from
https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155
Note that we have removed jieba.posseg.lcut().
"""
seg_cut = [(text, "v")]
seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
tokens = []
seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
initials = []
finals = []
# pypinyin, g2pM
for word, pos in seg_cut:
if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
pos = "X"
elif pos != "x" and word in self.punc:
pos = "x"
tk = MToken(text=word, tag=pos, whitespace="")
if pos in ("x", "eng"):
if not word.isspace():
if pos == "x" and word in self.punc:
tk.phonemes = word
tokens.append(tk)
elif tokens:
tokens[-1].whitespace += word
continue
elif (
tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace
):
tokens[-1].whitespace = "/"
# g2p
sub_initials, sub_finals = self._get_initials_finals(word)
# tone sandhi
sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
# er hua
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos
)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
# sum(iterable[, start])
# initials = sum(initials, [])
# finals = sum(finals, [])
phones = []
for c, v in zip(sub_initials, sub_finals):
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c:
phones.append(c)
# replace punctuation by ` `
# if c and c in self.punc:
# phones.append(c)
if v and (v not in self.punc or v != c): # and v not in self.rhy_phns:
phones.append(v)
phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
phones = re.sub(r"(?=\d)", "_", phones).split("_")
tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
tokens.append(tk)
result = "".join(
(self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
for tk in tokens
)
return result, tokens
def generate_chinese_lexicon():
word_dict = pinyin_dict.pinyin_dict
phrases = phrases_dict.phrases_dict
g2p = zh.ZHG2P(version="1.1")
lexicon = []
for key in word_dict:
if not (0x4E00 <= key <= 0x9FFF):
continue
w = chr(key)
tokens: str = process_text(g2p.frontend, w)[0]
lexicon.append((w, tokens))
for key in phrases:
tokens: str = process_text(g2p.frontend, key)[0]
lexicon.append((key, tokens))
return lexicon
def save(filename: str, lexicon: List[Tuple[str, str]]):
with open(filename, "w", encoding="utf-8") as f:
for word, phones in lexicon:
tokens = " ".join(list(phones))
f.write(f"{word} {tokens}\n")
def main():
zh = generate_chinese_lexicon()
save("lexicon-zh.txt", zh)
if __name__ == "__main__":
main()
... ...
../v1.0/generate_tokens.py
\ No newline at end of file
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import torch
from pathlib import Path
speakers = [
"af_maple",
"af_sol",
"bf_vale",
]
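# zf_001-zf_099 and zm_009-zm_100: keep only the voice files that were
# actually downloaded by run.sh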
for i in range(1, 99 + 1):
name = "zf_{:03d}".format(i)
if Path(f"voices/{name}.pt").is_file():
speakers.append(name)
for i in range(9, 100 + 1):
name = "zm_{:03d}".format(i)
if Path(f"voices/{name}.pt").is_file():
speakers.append(name)
id2speaker = {index: value for index, value in enumerate(speakers)}
speaker2id = {speaker: idx for idx, speaker in id2speaker.items()}
def main():
if Path("./voices.bin").is_file():
print("./voices.bin exists - skip")
return
with open("voices.bin", "wb") as f:
for _, speaker in id2speaker.items():
m = torch.load(
f"voices/{speaker}.pt",
weights_only=True,
map_location="cpu",
).numpy()
# m.shape (510, 1, 256)
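# voices.bin is the raw float32 bytes of all speaker embeddings concatenated
# in speaker-id order; test.py reads it back with np.fromfile()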
f.write(m.tobytes())
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
#
set -ex
if [ ! -f kokoro-v1_1-zh.pth ]; then
curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/kokoro-v1_1-zh.pth
fi
if [ ! -f config.json ]; then
# see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json
fi
voices=(
af_maple
af_sol
bf_vale
)
# zf_001-zf_099
for i in $(seq 1 99); do
a=$(printf "zf_%03d" $i)
voices+=($a)
done
# zm_009-zm_100
for i in $(seq 9 100); do
a=$(printf "zm_%03d" $i)
voices+=($a)
done
echo ${voices[@]} # all elements
echo ${#voices[@]} # length
mkdir -p voices
for v in ${voices[@]}; do
if [ ! -f voices/$v.pt ]; then
curl -SL --output voices/$v.pt https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/voices/$v.pt
fi
done
pushd voices
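# some zf_*/zm_* indices may not exist upstream; a failed download leaves a
# tiny file behind, so remove anything smaller than 10k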
find . -type f -size -10k -exec rm -v {} +
ls -lh
du -h -d1 .
popd
if [ ! -f ./kokoro.onnx ]; then
python3 ./export_onnx.py
fi
if [ ! -f ./.add-meta-data.done ]; then
python3 ./add_meta_data.py
touch ./.add-meta-data.done
fi
if [ ! -f ./kokoro.int8.onnx ]; then
python3 ./dynamic_quantization.py
fi
if [ ! -f us_gold.json ]; then
curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_gold.json
fi
if [ ! -f us_silver.json ]; then
curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_silver.json
fi
if [ ! -f gb_gold.json ]; then
curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_gold.json
fi
if [ ! -f gb_silver.json ]; then
curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_silver.json
fi
if [ ! -f ./tokens.txt ]; then
./generate_tokens.py
fi
if [ ! -f ./lexicon-zh.txt ]; then
./generate_lexicon_zh.py
fi
if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
./generate_lexicon_en.py
fi
if [ ! -f ./voices.bin ]; then
./generate_voices_bin.py
fi
./test.py
ls -lh
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import re
import time
from typing import Dict, List
import jieba
import numpy as np
import onnxruntime as ort
import soundfile as sf
try:
from piper_phonemize import phonemize_espeak
except Exception as ex:
raise RuntimeError(
f"{ex}\nPlease run\n"
"pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
)
def show(filename):
session_opts = ort.SessionOptions()
session_opts.log_severity_level = 3
sess = ort.InferenceSession(filename, session_opts)
for i in sess.get_inputs():
print(i)
print("-----")
for i in sess.get_outputs():
print(i)
"""
NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
NodeArg(name='style', type='tensor(float)', shape=[1, 256])
NodeArg(name='speed', type='tensor(float)', shape=[1])
-----
NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
"""
def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
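# voices.bin holds one float32 embedding of shape `dim` per speaker,
# concatenated in speaker-id order (see generate_voices_bin.py)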
embedding = (
np.fromfile(voices_bin, dtype="uint8")
.view(np.float32)
.reshape(len(speaker_names), *dim)
)
print("embedding.shape", embedding.shape)
ans = dict()
for i in range(len(speaker_names)):
ans[speaker_names[i]] = embedding[i]
return ans
def load_tokens(filename: str) -> Dict[str, int]:
ans = dict()
with open(filename, encoding="utf-8") as f:
for line in f:
fields = line.strip().split()
if len(fields) == 2:
token, idx = fields
ans[token] = int(idx)
else:
assert len(fields) == 1, (len(fields), line)
ans[" "] = int(fields[0])
return ans
def load_lexicon(filename: str) -> Dict[str, List[str]]:
ans = dict()
for lexicon in filename.split(","):
print(lexicon)
with open(lexicon, encoding="utf-8") as f:
for line in f:
w, tokens = line.strip().split(" ", maxsplit=1)
ans[w] = "".join(tokens.split())
return ans
class OnnxModel:
def __init__(self, model_filename: str, tokens: str, lexicon: str, voices_bin: str):
session_opts = ort.SessionOptions()
session_opts.inter_op_num_threads = 3
session_opts.intra_op_num_threads = 3
self.session_opts = session_opts
self.model = ort.InferenceSession(
model_filename,
sess_options=self.session_opts,
providers=["CPUExecutionProvider"],
)
self.token2id = load_tokens(tokens)
self.word2tokens = load_lexicon(lexicon)
meta = self.model.get_modelmeta().custom_metadata_map
print(meta)
dim = list(map(int, meta["style_dim"].split(",")))
speaker_names = meta["speaker_names"].split(",")
self.voices = load_voices(
speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
)
self.sample_rate = int(meta["sample_rate"])
print(list(self.voices.keys()))
self.sample_rate = 24000
self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1
def __call__(self, text: str, voice: str):
punctuations = ';:,.!?-…()"“”'
text = text.lower()
tokens = ""
for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text):
if ord(t[0]) < 0x7F:
for w in t.split():
while w:
if w[0] in punctuations:
tokens += w[0] + " "
w = w[1:]
continue
if w[-1] in punctuations:
if w[:-1] in self.word2tokens:
tokens += self.word2tokens[w[:-1]]
tokens += w[-1]
else:
if w in self.word2tokens:
tokens += self.word2tokens[w]
else:
print(f"Use espeak-ng for word {w}")
tokens += "".join(phonemize_espeak(w, "en-us")[0])
tokens += " "
break
else:
# Chinese
for w in jieba.cut(t):
if w in self.word2tokens:
tokens += self.word2tokens[w]
else:
for i in w:
if i in self.word2tokens:
tokens += self.word2tokens[i]
else:
print(f"skip {i}")
token_ids = [self.token2id[i] for i in tokens]
token_ids = token_ids[: self.max_len]
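# the voice pack provides a style vector for each input length; pick the one
# matching the current number of tokens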
style = self.voices[voice][len(token_ids)]
token_ids = [0, *token_ids, 0]
token_ids = np.array([token_ids], dtype=np.int64)
speed = np.array([1.0], dtype=np.float32)
audio = self.model.run(
[
self.model.get_outputs()[0].name,
],
{
self.model.get_inputs()[0].name: token_ids,
self.model.get_inputs()[1].name: style,
self.model.get_inputs()[2].name: speed,
},
)[0]
return audio
def main():
m = OnnxModel(
model_filename="./kokoro.onnx",
tokens="./tokens.txt",
lexicon="./lexicon-us-en.txt,./lexicon-zh.txt",
voices_bin="./voices.bin",
)
text = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?"
text = text.lower()
voice = "zf_001"
start = time.time()
audio = m(text, voice=voice)
end = time.time()
elapsed_seconds = end - start
audio_duration = len(audio) / m.sample_rate
real_time_factor = elapsed_seconds / audio_duration
filename = f"kokoro_v1.1_{voice}_zh_en.wav"
sf.write(
filename,
audio,
samplerate=m.sample_rate,
subtype="PCM_16",
)
print(f" Saved to {filename}")
print(f" Elapsed seconds: {elapsed_seconds:.3f}")
print(f" Audio duration in seconds: {audio_duration:.3f}")
print(f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")
if __name__ == "__main__":
main()
... ...
... ... @@ -128,15 +128,19 @@ class KokoroMultiLangLexicon::Impl {
}
for (const auto &ids : ids_vec) {
if (ids.size() > 4) {
if (ids.size() > 10 + 2) {
ans.emplace_back(ids);
} else {
if (ans.empty()) {
ans.emplace_back(ids);
} else {
if (ans.back().tokens.size() + ids.size() < 50) {
ans.back().tokens.back() = ids[1];
ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
ids.end());
} else {
ans.emplace_back(ids);
}
}
}
}
... ...
... ... @@ -33,8 +33,8 @@ GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
if (scale == 1) {
return *this;
}
// if the interval is larger than 0.6 second, then we assume it is a pause
int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);
// if the interval is larger than 0.2 second, then we assume it is a pause
int32_t threshold = static_cast<int32_t>(sample_rate * 0.2);
std::vector<SilenceInterval> intervals;
int32_t num_samples = static_cast<int32_t>(samples.size());
... ...