Scripts to generate tts samples (#2513)

Fangjun Kuang · GitHub
Commit ba13109c8006db5d899b1561537a7d0663f58376 ba13109c 1 parent 866cbe49
.github/workflows/export-kitten.yaml
.github/workflows/export-matcha-fa-en.yaml
.github/workflows/generate-tts-samples.yaml
README.md
scripts/kitten-tts/nano_v0_1/generate_samples.py
scripts/kitten-tts/nano_v0_2/generate_samples.py
scripts/matcha-tts/zh/generate_samples.py
--- a/.github/workflows/export-kitten.yaml
查看文件 @ba13109
+++ b/.github/workflows/export-kitten.yaml
查看文件 @ba13109
@@ -117,9 +117,13 @@ jobs:
             export GIT_CLONE_PROTECTION_ACTIVE=false
 
             for d in ${dirs[@]}; do
-               if [ ! -d ../$d ]]; then
+               echo "d $d"
+               if [[ ! -d $d ]]; then
+                 echo "$d does not exist"
                 continue
               fi
+ 
+               echo "$d exists"
               rm -rf huggingface
 
               git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface
--- a/.github/workflows/export-matcha-fa-en.yaml
查看文件 @ba13109
+++ b/.github/workflows/export-matcha-fa-en.yaml
查看文件 @ba13109
@@ -3,7 +3,7 @@ name: export-matcha-fa-en-to-onnx
 on:
   push:
     branches:
-       - fix-ci
+       - tts-matcha-samples
 
   workflow_dispatch:
 
@@ -33,15 +33,48 @@ jobs:
       - name: Install Python dependencies
         shell: bash
         run: |
-           pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+           pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html sherpa-onnx
 
       - name: Run
+         if: false
         shell: bash
         run: |
           cd scripts/matcha-tts/fa-en
           ./run.sh
 
+       - name: Generate samples  # matcha-icefall-zh-baker sample, pushed to huggingface.co
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         shell: bash
+         run: |
+           cd scripts/matcha-tts/zh
+ 
+           git config --global user.email "csukuangfj@gmail.com"
+           git config --global user.name "Fangjun Kuang"
+ 
+           curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+           tar xvf matcha-icefall-zh-baker.tar.bz2
+           rm matcha-icefall-zh-baker.tar.bz2
+ 
+           curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
+ 
+           git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
+           mkdir -p ./hf/matcha/icefall-zh/mp3  # generate_samples.py writes ./hf/matcha/icefall-zh/mp3/0.mp3
+ 
+           ./generate_samples.py
+           # Commit only when something is staged: a bare `git commit` exits non-zero on a clean tree.
+           pushd hf
+           git pull
+           git add .
+           git diff --cached --quiet || git commit -m 'add matcha tts zh samples'
+           git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
+           popd
+           rm -rf hf
+ 
+           ls -lh
+ 
       - name: Collect results ${{ matrix.version }}
+         if: false
         shell: bash
         run: |
           curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
@@ -73,6 +106,7 @@ jobs:
           ls -lh $dst2.tar.bz2
 
       - name: Publish to huggingface male (musa)
+         if: false
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         uses: nick-fields/retry@v3
@@ -110,6 +144,7 @@ jobs:
             git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-musa main || true
 
       - name: Publish to huggingface female (khadijah)
+         if: false
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         uses: nick-fields/retry@v3
@@ -147,7 +182,8 @@ jobs:
             git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/matcha-tts-fa_en-khadijah main || true
 
       - name: Release
-         if: github.repository_owner == 'csukuangfj'
+         # if: github.repository_owner == 'csukuangfj'
+         if: false
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
@@ -158,7 +194,8 @@ jobs:
           tag: tts-models
 
       - name: Release
-         if: github.repository_owner == 'k2-fsa'
+         # if: github.repository_owner == 'k2-fsa'
+         if: false
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
--- a/.github/workflows/generate-tts-samples.yaml 0 → 100644
查看文件 @ba13109
+++ b/.github/workflows/generate-tts-samples.yaml 0 → 100644
查看文件 @ba13109
+ # Generate audio samples with the released kitten TTS models and push them to
+ # https://huggingface.co/csukuangfj/sherpa-onnx-tts-samples
+ name: generate-tts-samples
+ 
+ on:
+   push:
+     branches:
+       - tts-samples-2
+ 
+   workflow_dispatch:
+ 
+ concurrency:
+   group: generate-tts-samples-${{ github.ref }}
+   cancel-in-progress: true
+ 
+ jobs:
+   generate_tts_samples:
+     name: ${{ matrix.os }}
+     runs-on: ${{ matrix.os }}
+     strategy:
+       fail-fast: false
+       matrix:
+         os: [ubuntu-latest]
+         python-version: ["3.10"]
+ 
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+ 
+       # Consume matrix.python-version; without this step the matrix entry is
+       # dead and the job runs on whatever python3 the runner image ships.
+       - name: Setup Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+ 
+       - name: Install Python dependencies
+         shell: bash
+         run: |
+           pip install "numpy<=1.26.4" sherpa-onnx soundfile
+ 
+       - name: kitten
+         if: true
+         shell: bash
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           git config --global user.email "csukuangfj@gmail.com"
+           git config --global user.name "Fangjun Kuang"
+ 
+           cd scripts/kitten-tts
+ 
+           export GIT_LFS_SKIP_SMUDGE=1
+           export GIT_CLONE_PROTECTION_ACTIVE=false
+           git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
+           mkdir -p ./hf/kitten/v0.1/mp3
+           mkdir -p ./hf/kitten/v0.2/mp3
+ 
+           # One pass per model version: nano_v0_1 and nano_v0_2.
+           for v in 1 2; do
+             pushd nano_v0_$v
+             curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_$v-fp16.tar.bz2
+             tar xf kitten-nano-en-v0_$v-fp16.tar.bz2
+             rm kitten-nano-en-v0_$v-fp16.tar.bz2
+ 
+             # generate_samples.py writes to ./hf/...; point that at the shared clone.
+             ln -s ../hf .
+             python3 ./generate_samples.py
+             rm -rf kitten-nano-en-v0_$v-fp16
+             popd
+           done
+ 
+           pushd hf
+           git pull
+           git add .
+           # A bare `git commit` exits non-zero when nothing is staged, which
+           # would fail the step on a re-run that produces no changes.
+           git diff --cached --quiet || git commit -m 'add kitten tts samples'
+           git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
+           popd
+           rm -rf hf
--- a/README.md
查看文件 @ba13109
+++ b/README.md
查看文件 @ba13109
@@ -45,6 +45,9 @@ For Rust support, please see [sherpa-rs][sherpa-rs]
 
 It also supports WebAssembly.
 
+ [Join our Discord](https://discord.gg/fJdxzg2VbG)
+ 
+ 
 ## Introduction
 
 This repository supports running the following functions **locally**
--- a/scripts/kitten-tts/nano_v0_1/generate_samples.py 0 → 100755
查看文件 @ba13109
+++ b/scripts/kitten-tts/nano_v0_1/generate_samples.py 0 → 100755
查看文件 @ba13109
+ #!/usr/bin/env python3
+ # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+ """
+ Generate samples for
+ https://k2-fsa.github.io/sherpa/onnx/tts/all/
+ 
+ The same sentence is synthesized once per entry of ``speaker2id`` and each
+ result is written to ``./hf/kitten/v0.1/mp3/<id>-<key>.mp3``.
+ """
+ 
+ 
+ import os
+ 
+ import sherpa_onnx
+ import soundfile as sf
+ 
+ from generate_voices_bin import speaker2id
+ 
+ # Directory holding the model files, relative to the current working directory.
+ model_dir = "kitten-nano-en-v0_1-fp16"
+ 
+ # Directory that receives the generated files.
+ out_dir = "./hf/kitten/v0.1/mp3"
+ 
+ config = sherpa_onnx.OfflineTtsConfig(
+     model=sherpa_onnx.OfflineTtsModelConfig(
+         kitten=sherpa_onnx.OfflineTtsKittenModelConfig(
+             model=f"{model_dir}/model.fp16.onnx",
+             voices=f"{model_dir}/voices.bin",
+             tokens=f"{model_dir}/tokens.txt",
+             data_dir=f"{model_dir}/espeak-ng-data",
+         ),
+         num_threads=2,
+     ),
+     max_num_sentences=1,
+ )
+ 
+ if not config.validate():
+     raise ValueError("Please check your config")
+ 
+ tts = sherpa_onnx.OfflineTts(config)
+ text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
+ 
+ # Do not depend on the caller having created the output directory.
+ os.makedirs(out_dir, exist_ok=True)
+ 
+ # The dict value is passed to generate() as sid; the key (presumably the
+ # speaker name) only appears in the log line and the file name.
+ for s, i in speaker2id.items():
+     print(s, i, len(speaker2id))
+     audio = tts.generate(text, sid=i, speed=1.0)
+ 
+     sf.write(
+         f"{out_dir}/{i}-{s}.mp3",
+         audio.samples,
+         samplerate=audio.sample_rate,
+     )
--- a/scripts/kitten-tts/nano_v0_2/generate_samples.py 0 → 100755
查看文件 @ba13109
+++ b/scripts/kitten-tts/nano_v0_2/generate_samples.py 0 → 100755
查看文件 @ba13109
+ #!/usr/bin/env python3
+ # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+ """
+ Generate samples for
+ https://k2-fsa.github.io/sherpa/onnx/tts/all/
+ 
+ The same sentence is synthesized once per entry of ``speaker2id`` and each
+ result is written to ``./hf/kitten/v0.2/mp3/<id>-<key>.mp3``.
+ """
+ 
+ 
+ import os
+ 
+ import sherpa_onnx
+ import soundfile as sf
+ 
+ from generate_voices_bin import speaker2id
+ 
+ # Directory holding the model files, relative to the current working directory.
+ model_dir = "kitten-nano-en-v0_2-fp16"
+ 
+ # Directory that receives the generated files.
+ out_dir = "./hf/kitten/v0.2/mp3"
+ 
+ config = sherpa_onnx.OfflineTtsConfig(
+     model=sherpa_onnx.OfflineTtsModelConfig(
+         kitten=sherpa_onnx.OfflineTtsKittenModelConfig(
+             model=f"{model_dir}/model.fp16.onnx",
+             voices=f"{model_dir}/voices.bin",
+             tokens=f"{model_dir}/tokens.txt",
+             data_dir=f"{model_dir}/espeak-ng-data",
+         ),
+         num_threads=2,
+     ),
+     max_num_sentences=1,
+ )
+ 
+ if not config.validate():
+     raise ValueError("Please check your config")
+ 
+ tts = sherpa_onnx.OfflineTts(config)
+ text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
+ 
+ # Do not depend on the caller having created the output directory.
+ os.makedirs(out_dir, exist_ok=True)
+ 
+ # The dict value is passed to generate() as sid; the key (presumably the
+ # speaker name) only appears in the log line and the file name.
+ for s, i in speaker2id.items():
+     print(s, i, len(speaker2id))
+     audio = tts.generate(text, sid=i, speed=1.0)
+ 
+     sf.write(
+         f"{out_dir}/{i}-{s}.mp3",
+         audio.samples,
+         samplerate=audio.sample_rate,
+     )
--- a/scripts/matcha-tts/zh/generate_samples.py 0 → 100755
查看文件 @ba13109
+++ b/scripts/matcha-tts/zh/generate_samples.py 0 → 100755
查看文件 @ba13109
+ #!/usr/bin/env python3
+ # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+ """
+ Generate samples for
+ https://k2-fsa.github.io/sherpa/onnx/tts/all/
+ 
+ A single sentence is synthesized with sid=0 and written to
+ ``./hf/matcha/icefall-zh/mp3/0.mp3``.
+ """
+ 
+ 
+ import os
+ 
+ import sherpa_onnx
+ import soundfile as sf
+ 
+ # Directory holding the acoustic model files, relative to the current
+ # working directory. The vocoder is expected next to it, not inside it.
+ model_dir = "matcha-icefall-zh-baker"
+ 
+ # Directory that receives the generated file.
+ out_dir = "./hf/matcha/icefall-zh/mp3"
+ 
+ config = sherpa_onnx.OfflineTtsConfig(
+     model=sherpa_onnx.OfflineTtsModelConfig(
+         matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
+             acoustic_model=f"{model_dir}/model-steps-3.onnx",
+             vocoder="vocos-22khz-univ.onnx",
+             lexicon=f"{model_dir}/lexicon.txt",
+             tokens=f"{model_dir}/tokens.txt",
+             dict_dir=f"{model_dir}/dict",
+         ),
+         num_threads=2,
+     ),
+     max_num_sentences=1,
+     # Comma-separated list, in the order phone, date, number.
+     rule_fsts=",".join(
+         f"./{model_dir}/{name}.fst" for name in ("phone", "date", "number")
+     ),
+ )
+ 
+ if not config.validate():
+     raise ValueError("Please check your config")
+ 
+ tts = sherpa_onnx.OfflineTts(config)
+ text = "某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。当夜幕降临，星光点点，伴随着微风拂面，我在静谧中感受着时光的流转，思念如涟漪荡漾，梦境如画卷展开，我与自然融为一体，沉静在这片宁静的美丽之中，感受着生命的奇迹与温柔."
+ 
+ 
+ audio = tts.generate(text, sid=0, speed=1.0)
+ 
+ # Do not depend on the caller having created the output directory.
+ os.makedirs(out_dir, exist_ok=True)
+ 
+ sf.write(
+     f"{out_dir}/0.mp3",
+     audio.samples,
+     samplerate=audio.sample_rate,
+ )