Export kokoro to sherpa-onnx (#1713)

Fangjun Kuang · GitHub
Commit 9efe26a64624ed0ab71cb0dd98e23b53f4874a47 9efe26a6 1 parent ce71b632
.github/workflows/export-kokoro.yaml
scripts/kokoro/.gitignore
scripts/kokoro/README.md
scripts/kokoro/add-meta-data.py
scripts/kokoro/run.sh
scripts/kokoro/test.py
scripts/melo-tts/export-onnx.py
--- a/.github/workflows/export-kokoro.yaml 0 → 100644
查看文件 @9efe26a
+++ b/.github/workflows/export-kokoro.yaml 0 → 100644
查看文件 @9efe26a
+ # Exports the Kokoro TTS model for sherpa-onnx: runs scripts/kokoro/run.sh,
+ # then packages and publishes the results in the later steps of this job.
+ name: export-kokoro-to-onnx
+ 
+ # Triggered by a push to the export-kokoro branch or by a manual
+ # workflow_dispatch.
+ on:
+   push:
+     branches:
+       - export-kokoro
+ 
+   workflow_dispatch:
+ 
+ # One concurrency group per ref; a newer run cancels the one in progress.
+ concurrency:
+   group: export-kokoro-to-onnx-${{ github.ref }}
+   cancel-in-progress: true
+ 
+ jobs:
+   export-kokoro-to-onnx:
+     # Skipped unless the repository owner is k2-fsa or csukuangfj.
+     if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+     name: export kokoro
+     runs-on: ${{ matrix.os }}
+     strategy:
+       fail-fast: false
+       matrix:
+         os: [ubuntu-latest]
+         python-version: ["3.10"]
+ 
+     steps:
+       - uses: actions/checkout@v4
+ 
+       - name: Setup Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+ 
+       # numpy is capped at 1.26.4; onnx and onnxruntime are pinned. The -f URL is
+       # the same find-links page that scripts/kokoro/test.py recommends for
+       # installing piper_phonemize.
+       - name: Install Python dependencies
+         shell: bash
+         run: |
+           pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+ 
+       # espeak-ng-data is unpacked in the repository root (the later steps copy it
+       # from there); the export itself runs inside scripts/kokoro.
+       - name: Run
+         shell: bash
+         run: |
+           curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+           tar xf espeak-ng-data.tar.bz2
+           rm espeak-ng-data.tar.bz2
+           cd scripts/kokoro
+           ./run.sh
+ 
+       # Bundles LICENSE, espeak-ng-data, the model (renamed to model.onnx),
+       # voices.bin, tokens.txt and the generated README into
+       # kokoro-en-v0_19.tar.bz2 in the repository root.
+       - name: Collect results
+         shell: bash
+         run: |
+           src=scripts/kokoro
+ 
+           d=kokoro-en-v0_19
+           mkdir $d
+           cp -a LICENSE $d/LICENSE
+           cp -a espeak-ng-data $d/
+           cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx
+           cp -v $src/voices.bin $d/
+           cp -v $src/tokens.txt $d/
+           cp -v $src/README-new.md $d/README.md
+           ls -lh $d/
+           tar cjfv $d.tar.bz2 $d
+           rm -rf $d
+ 
+           # Show the archive that was just created.
+           ls -lh $d.tar.bz2
+ 
+       # Copies the exported files into a clone of the Hugging Face repository
+       # csukuangfj/kokoro-en-v0_19, commits them and pushes to main. The whole
+       # command is run through nick-fields/retry (max_attempts: 20,
+       # timeout_seconds: 200).
+       # NOTE(review): the final `git push ... || true` always exits 0, so a failed
+       # push is neither reported nor retried - confirm this best-effort behaviour
+       # is intended.
+       - name: Publish to huggingface
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         uses: nick-fields/retry@v3
+         with:
+           max_attempts: 20
+           timeout_seconds: 200
+           shell: bash
+           command: |
+             git config --global user.email "csukuangfj@gmail.com"
+             git config --global user.name "Fangjun Kuang"
+ 
+             rm -rf huggingface
+             export GIT_LFS_SKIP_SMUDGE=1
+             export GIT_CLONE_PROTECTION_ACTIVE=false
+ 
+             git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 huggingface
+             cd huggingface
+             rm -rf ./*
+             git fetch
+             git pull
+ 
+             git lfs track "cmn_dict"
+             git lfs track "ru_dict"
+             git lfs track "*.wav"
+ 
+             cp -a ../espeak-ng-data ./
+             mkdir -p test_wavs
+ 
+             cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx
+ 
+             cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/
+ 
+             cp -v ../scripts/kokoro/tokens.txt .
+             cp -v ../scripts/kokoro/voices.bin .
+             cp -v ../scripts/kokoro/README-new.md ./README.md
+             cp -v ../LICENSE ./
+ 
+             git lfs track "*.onnx"
+             git add .
+ 
+             ls -lh
+ 
+             git status
+ 
+             git commit -m "add models"
+             git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
+ 
+       # Uploads every *.tar.bz2 in the repository root to the tts-models release
+       # tag of k2-fsa/sherpa-onnx; overwrite is enabled for the upload.
+       - name: Release
+         uses: svenstaro/upload-release-action@v2
+         with:
+           file_glob: true
+           file: ./*.tar.bz2
+           overwrite: true
+           repo_name: k2-fsa/sherpa-onnx
+           repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+           tag: tts-models
--- a/scripts/kokoro/.gitignore 0 → 100644
查看文件 @9efe26a
+++ b/scripts/kokoro/.gitignore 0 → 100644
查看文件 @9efe26a
+ voices.json
+ voices.bin
+ README-new.md
--- a/scripts/kokoro/README.md 0 → 100644
查看文件 @9efe26a
+++ b/scripts/kokoro/README.md 0 → 100644
查看文件 @9efe26a
+ # Introduction
+ 
+ This folder contains scripts for adding metadata to the models
+ from https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files
+ 
+ See also
+ https://huggingface.co/hexgrad/Kokoro-82M/tree/main
+ and
+ https://huggingface.co/spaces/hexgrad/Kokoro-TTS
+ 
--- a/scripts/kokoro/add-meta-data.py 0 → 100755
查看文件 @9efe26a
+++ b/scripts/kokoro/add-meta-data.py 0 → 100755
查看文件 @9efe26a
+ #!/usr/bin/env python3
+ # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+ 
+ 
+ import argparse
+ import json
+ from pathlib import Path
+ 
+ import numpy as np
+ import onnx
+ 
+ 
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model", type=str, required=True, help="input and output onnx model"
+     )
+ 
+     parser.add_argument("--voices", type=str, required=True, help="Path to voices.json")
+     return parser.parse_args()
+ 
+ 
+ def load_voices(filename):
+     with open(filename) as f:
+         voices = json.load(f)
+     for key in voices:
+         voices[key] = np.array(voices[key], dtype=np.float32)
+     return voices
+ 
+ 
+ def get_vocab():
+     _pad = "$"
+     _punctuation = ';:,.!?¡¿—…"«»“” '
+     _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+     _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+     symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+     dicts = {}
+     for i in range(len((symbols))):
+         dicts[symbols[i]] = i
+     return dicts
+ 
+ 
+ def generate_tokens():
+     token2id = get_vocab()
+     with open("tokens.txt", "w", encoding="utf-8") as f:
+         for s, i in token2id.items():
+             f.write(f"{s} {i}\n")
+ 
+ 
+ def main():
+     args = get_args()
+     print(args.model, args.voices)
+ 
+     model = onnx.load(args.model)
+     voices = load_voices(args.voices)
+ 
+     if Path("./tokens.txt").is_file():
+         print("./tokens.txt exist, skip generating it")
+     else:
+         generate_tokens()
+ 
+     keys = list(voices.keys())
+     print(",".join(keys))
+ 
+     if Path("./voices.bin").is_file():
+         print("./voices.bin exists, skip generating it")
+     else:
+         with open("voices.bin", "wb") as f:
+             for k in keys:
+                 f.write(voices[k].tobytes())
+ 
+     meta_data = {
+         "model_type": "kokoro",
+         "language": "English",
+         "has_espeak": 1,
+         "sample_rate": 24000,
+         "version": 1,
+         "voice": "en-us",
+         "style_dim": ",".join(map(str, voices[keys[0]].shape)),
+         "n_speakers": len(keys),
+         "speaker_names": ",".join(keys),
+         "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
+         "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
+         "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
+         "maintainer": "k2-fsa",
+     }
+ 
+     print(model.metadata_props)
+ 
+     while len(model.metadata_props):
+         model.metadata_props.pop()
+ 
+     for key, value in meta_data.items():
+         meta = model.metadata_props.add()
+         meta.key = key
+         meta.value = str(value)
+     print("--------------------")
+ 
+     print(model.metadata_props)
+ 
+     onnx.save(model, args.model)
+ 
+     print(f"Please see {args.model}, ./voices.bin, and ./tokens.txt")
+ 
+ 
+ if __name__ == "__main__":
+     main()
--- a/scripts/kokoro/run.sh 0 → 100755
查看文件 @9efe26a
+++ b/scripts/kokoro/run.sh 0 → 100755
查看文件 @9efe26a
+ #!/usr/bin/env bash
+ # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+ #
+ # Downloads the Kokoro ONNX model and voices.json (unless already present),
+ # runs add-meta-data.py on each model, and then runs test.py on the result.
+ 
+ set -ex
+ 
+ cat > README-new.md <<EOF
+ # Introduction
+ 
+ Files in this folder are from
+ https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files
+ 
+ Please see also
+ https://huggingface.co/hexgrad/Kokoro-82M
+ and
+ https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
+ EOF
+ 
+ files=(
+ kokoro-v0_19_hf.onnx
+ # kokoro-v0_19.onnx
+ # kokoro-quant.onnx
+ # kokoro-quant-convinteger.onnx
+ voices.json
+ )
+ 
+ for f in "${files[@]}"; do
+   if [ ! -f "./$f" ]; then
+     # -f: fail on an HTTP error instead of saving the error page under the
+     # model's file name, which the existence check above would then accept.
+     curl -fSL -O "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/$f"
+   fi
+ done
+ 
+ models=(
+ # kokoro-v0_19
+ # kokoro-quant
+ # kokoro-quant-convinteger
+ kokoro-v0_19_hf
+ )
+ 
+ for m in "${models[@]}"; do
+   ./add-meta-data.py --model "$m.onnx" --voices ./voices.json
+ done
+ 
+ ls -l
+ echo "----------"
+ ls -lh
+ 
+ for m in "${models[@]}"; do
+   ./test.py --model "$m.onnx" --voices-bin ./voices.bin --tokens ./tokens.txt
+ done
+ ls -lh
--- a/scripts/kokoro/test.py 0 → 100755
查看文件 @9efe26a
+++ b/scripts/kokoro/test.py 0 → 100755
查看文件 @9efe26a
+ #!/usr/bin/env python3
+ # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+ 
+ """
+ female (7)
+ 'af', 'af_bella', 'af_nicole','af_sarah', 'af_sky',
+ 'bf_emma', 'bf_isabella',
+ 
+ male (4)
+ 'am_adam',  'am_michael', 'bm_george', 'bm_lewis'
+ """
+ 
+ import argparse
+ import time
+ from pathlib import Path
+ from typing import Dict, List
+ 
+ import numpy as np
+ 
+ try:
+     from piper_phonemize import phonemize_espeak
+ except Exception as ex:
+     raise RuntimeError(
+         f"{ex}\nPlease run\n"
+         "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
+     )
+ 
+ import onnxruntime as ort
+ import soundfile as sf
+ 
+ 
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model",
+         type=str,
+         required=True,
+         help="Path to the model",
+     )
+ 
+     parser.add_argument(
+         "--voices-bin",
+         type=str,
+         required=True,
+         help="Path to the voices.bin",
+     )
+ 
+     parser.add_argument(
+         "--tokens",
+         type=str,
+         required=True,
+         help="Path to tokens.txt",
+     )
+     return parser.parse_args()
+ 
+ 
+ def show(filename):
+     session_opts = ort.SessionOptions()
+     session_opts.log_severity_level = 3
+     sess = ort.InferenceSession(filename, session_opts)
+     for i in sess.get_inputs():
+         print(i)
+ 
+     print("-----")
+ 
+     for i in sess.get_outputs():
+         print(i)
+ 
+ 
+ #  NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])
+ #  NodeArg(name='style', type='tensor(float)', shape=[1, 256])
+ #  NodeArg(name='speed', type='tensor(float)', shape=[1])
+ #  -----
+ #  NodeArg(name='audio', type='tensor(float)', shape=['audio0'])
+ 
+ 
+ def load_tokens(filename: str) -> Dict[str, int]:
+     ans = dict()
+     with open(filename, encoding="utf-8") as f:
+         for line in f:
+             fields = line.strip().split()
+             if len(fields) == 2:
+                 token, idx = fields
+                 ans[token] = int(idx)
+             else:
+                 assert len(fields) == 1, (len(fields), line)
+                 ans[" "] = int(fields[0])
+     return ans
+ 
+ 
+ def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
+     embedding = (
+         np.fromfile(voices_bin, dtype="uint8")
+         .view(np.float32)
+         .reshape(len(speaker_names), *dim)
+     )
+     print("embedding.shape", embedding.shape)
+     ans = dict()
+     for i in range(len(speaker_names)):
+         ans[speaker_names[i]] = embedding[i]
+ 
+     return ans
+ 
+ 
+ class OnnxModel:
+     def __init__(self, model_filename: str, voices_bin: str, tokens: str):
+         session_opts = ort.SessionOptions()
+         session_opts.inter_op_num_threads = 1
+         session_opts.intra_op_num_threads = 1
+ 
+         self.session_opts = session_opts
+         self.model = ort.InferenceSession(
+             model_filename,
+             sess_options=self.session_opts,
+             providers=["CPUExecutionProvider"],
+         )
+         self.token2id = load_tokens(tokens)
+ 
+         meta = self.model.get_modelmeta().custom_metadata_map
+         print(meta)
+         dim = list(map(int, meta["style_dim"].split(",")))
+         speaker_names = meta["speaker_names"].split(",")
+ 
+         self.voices = load_voices(
+             speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
+         )
+ 
+         self.sample_rate = int(meta["sample_rate"])
+ 
+         print(list(self.voices.keys()))
+         # ['af', 'af_bella', 'af_nicole', 'af_sarah', 'af_sky', 'am_adam',
+         # 'am_michael', 'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
+         # af -> (511, 1, 256)
+         self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1
+ 
+     def __call__(self, text: str, voice):
+         tokens = phonemize_espeak(text, "en-us")
+         # tokens is List[List[str]]
+         # Each sentence is a List[str]
+         # len(tokens) == number of sentences
+ 
+         tokens = sum(tokens, [])  # flatten
+         tokens = "".join(tokens)
+ 
+         tokens = tokens.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace(
+             "kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ"
+         )
+ 
+         tokens = list(tokens)
+ 
+         token_ids = [self.token2id[i] for i in tokens]
+         token_ids = token_ids[: self.max_len]
+ 
+         style = self.voices[voice][len(token_ids)]
+ 
+         token_ids = [0, *token_ids, 0]
+         token_ids = np.array([token_ids], dtype=np.int64)
+ 
+         speed = np.array([1.0], dtype=np.float32)
+ 
+         audio = self.model.run(
+             [
+                 self.model.get_outputs()[0].name,
+             ],
+             {
+                 self.model.get_inputs()[0].name: token_ids,
+                 self.model.get_inputs()[1].name: style,
+                 self.model.get_inputs()[2].name: speed,
+             },
+         )[0]
+         return audio
+ 
+ 
+ def test(model, voice, text) -> np.ndarray:
+     pass
+ 
+ 
+ def main():
+     args = get_args()
+     print(vars(args))
+     show(args.model)
+ 
+     #  tokens = phonemize_espeak("how are you doing?", "en-us")
+     # [['h', 'ˌ', 'a', 'ʊ', ' ', 'ɑ', 'ː', 'ɹ', ' ', 'j', 'u', 'ː', ' ', 'd', 'ˈ', 'u', 'ː', 'ɪ', 'ŋ', '?']]
+     m = OnnxModel(
+         model_filename=args.model, voices_bin=args.voices_bin, tokens=args.tokens
+     )
+ 
+     text = (
+         "Today as always, men fall into two groups: slaves and free men."
+         + " Whoever does not have two-thirds of his day for himself, "
+         + "is a slave, whatever he may be: a statesman, a businessman, "
+         + "an official, or a scholar."
+     )
+ 
+     for i, voice in enumerate(m.voices.keys(), 1):
+         print(f"Testing {i}/{len(m.voices)} - {voice}/{args.model}")
+ 
+         start = time.time()
+         audio = m(text, voice=voice)
+         end = time.time()
+ 
+         elapsed_seconds = end - start
+         audio_duration = len(audio) / m.sample_rate
+         real_time_factor = elapsed_seconds / audio_duration
+ 
+         filename = f"{Path(args.model).stem}-{voice}.wav"
+         sf.write(
+             filename,
+             audio,
+             samplerate=m.sample_rate,
+             subtype="PCM_16",
+         )
+         print(f" Saved to {filename}")
+         print(f" Elapsed seconds: {elapsed_seconds:.3f}")
+         print(f" Audio duration in seconds: {audio_duration:.3f}")
+         print(
+             f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}"
+         )
+ 
+ 
+ if __name__ == "__main__":
+     main()
--- a/scripts/melo-tts/export-onnx.py
查看文件 @9efe26a
+++ b/scripts/melo-tts/export-onnx.py
查看文件 @9efe26a
 #!/usr/bin/env python3
- # This script export ZH_EN TTS model, which supports both Chinese and English.
+ # This script exports ZH_EN TTS model, which supports both Chinese and English.
 # This model has only 1 speaker.
 
 from typing import Any, Dict