export sense-voice to onnx (#1144)

Fangjun Kuang · GitHub
Commit 346f419f39dc97fa2871c0009b8714f25e39fb46 346f419f 1 parent 4198d9a1
.github/workflows/export-melo-tts-to-onnx.yaml
.github/workflows/export-sense-voice-to-onnx.yaml
scripts/melo-tts/run.sh
scripts/sense-voice/README.md
scripts/sense-voice/export-onnx.py
scripts/sense-voice/run.sh
scripts/sense-voice/show-info.py
--- a/.github/workflows/export-melo-tts-to-onnx.yaml
查看文件 @346f419
+++ b/.github/workflows/export-melo-tts-to-onnx.yaml
查看文件 @346f419
@@ -40,7 +40,7 @@ jobs:
           name: test.wav
           path: scripts/melo-tts/test.wav
 
-       - name: Publish to huggingface (aishell)
+       - name: Publish to huggingface
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         uses: nick-fields/retry@v3
--- a/.github/workflows/export-sense-voice-to-onnx.yaml 0 → 100644
查看文件 @346f419
+++ b/.github/workflows/export-sense-voice-to-onnx.yaml 0 → 100644
查看文件 @346f419
+ # Exports the SenseVoice model to ONNX, publishes the result to Hugging Face,
+ # and uploads a tarball of it to a GitHub release.
+ name: export-sense-voice-to-onnx
+ 
+ # Manual trigger only.
+ on:
+   workflow_dispatch:
+ 
+ # A newer run for the same ref cancels the one that is still in progress.
+ concurrency:
+   group: export-sense-voice-to-onnx-${{ github.ref }}
+   cancel-in-progress: true
+ 
+ jobs:
+   export-sense-voice-to-onnx:
+     # Skip the job unless the repository owner is k2-fsa or csukuangfj.
+     if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+     name: export sense-voice
+     runs-on: ${{ matrix.os }}
+     strategy:
+       fail-fast: false
+       matrix:
+         os: [ubuntu-latest]
+         python-version: ["3.10"]
+ 
+     steps:
+       - uses: actions/checkout@v4
+ 
+       - name: Setup Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+ 
+       # Fetch the example recordings of the five languages and convert them
+       # from MP3 to 16 kHz WAV. The WAV files stay in the workspace root and
+       # are copied into test_wavs/ by the publish step.
+       - name: Download test_wavs
+         shell: bash
+         run: |
+           sudo apt-get install -y -qq sox libsox-fmt-mp3
+           curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/zh.mp3
+           curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/en.mp3
+           curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/ja.mp3
+           curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/ko.mp3
+           curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/yue.mp3
+ 
+           soxi *.mp3
+ 
+           sox zh.mp3 -r 16k zh.wav
+           sox en.mp3 -r 16k en.wav
+           sox ja.mp3 -r 16k ja.wav
+           sox ko.mp3 -r 16k ko.wav
+           sox yue.mp3 -r 16k yue.wav
+ 
+       # run.sh installs the dependencies and then runs export-onnx.py and
+       # show-info.py inside scripts/sense-voice.
+       - name: Run
+         shell: bash
+         run: |
+           cd scripts/sense-voice
+           ./run.sh
+ 
+       # Commit the exported files to the Hugging Face model repository and
+       # pack the same files into a .tar.bz2 for the release step. The whole
+       # command is retried by nick-fields/retry on failure or timeout.
+       - name: Publish to huggingface
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         uses: nick-fields/retry@v3
+         with:
+           max_attempts: 20
+           timeout_seconds: 200
+           shell: bash
+           command: |
+             git config --global user.email "csukuangfj@gmail.com"
+             git config --global user.name "Fangjun Kuang"
+ 
+             # Every attempt starts from a fresh clone. LFS objects are not
+             # downloaded; only the pointer files are checked out.
+             rm -rf huggingface
+             export GIT_LFS_SKIP_SMUDGE=1
+             export GIT_CLONE_PROTECTION_ACTIVE=false
+ 
+             git clone https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 huggingface
+             cd huggingface
+             git fetch
+             git pull
+             echo "pwd: $PWD"
+             ls -lh ../scripts/sense-voice
+ 
+             # Remove the previously published files. `rm -rf ./` does nothing
+             # because rm refuses to remove `.`; the glob removes the working
+             # tree content while leaving dot entries such as .git and
+             # .gitattributes in place.
+             rm -rf ./*
+ 
+             cp -v ../scripts/sense-voice/*.onnx .
+             cp -v ../scripts/sense-voice/tokens.txt .
+             cp -v ../scripts/sense-voice/README.md .
+             cp -v ../scripts/sense-voice/export-onnx.py .
+ 
+             # -p: do not fail if the directory already exists.
+             mkdir -p test_wavs
+             cp -v ../*.wav ./test_wavs/
+ 
+             curl -SL -O https://raw.githubusercontent.com/FunAudioLLM/SenseVoice/main/LICENSE
+ 
+             git lfs track "*.onnx"
+             git add .
+ 
+             ls -lh
+ 
+             git status
+ 
+             git commit -m "add models"
+             # A failed push is tolerated so that the tarball below is still
+             # produced.
+             git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 main || true
+ 
+             cd ..
+ 
+             # Strip the git metadata, then archive the directory under its
+             # release name.
+             rm -rf huggingface/.git*
+             dst=sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+ 
+             mv huggingface $dst
+ 
+             tar cjvf $dst.tar.bz2 $dst
+             rm -rf $dst
+ 
+       # Upload every .tar.bz2 in the workspace root to the `asr-models`
+       # release of k2-fsa/sherpa-onnx. `overwrite: true` replaces an asset
+       # that already has the same name.
+       - name: Release
+         uses: svenstaro/upload-release-action@v2
+         with:
+           file_glob: true
+           file: ./*.tar.bz2
+           overwrite: true
+           repo_name: k2-fsa/sherpa-onnx
+           repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+           tag: asr-models
--- a/scripts/melo-tts/run.sh
查看文件 @346f419
+++ b/scripts/melo-tts/run.sh
查看文件 @346f419
@@ -2,8 +2,6 @@
 
 set -ex
 
- 
- 
 function install() {
   pip install torch==2.3.1+cpu torchaudio==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
 
--- a/scripts/sense-voice/README.md 0 → 100644
查看文件 @346f419
+++ b/scripts/sense-voice/README.md 0 → 100644
查看文件 @346f419
+ # Introduction
+ 
+ This directory contains models converted from
+ https://github.com/FunAudioLLM/SenseVoice
--- a/scripts/sense-voice/export-onnx.py 0 → 100755
查看文件 @346f419
+++ b/scripts/sense-voice/export-onnx.py 0 → 100755
查看文件 @346f419
+ #!/usr/bin/env python3
+ # Copyright      2024  Xiaomi Corp.        (authors: Fangjun Kuang)
+ 
+ """
+ We use
+ https://hf-mirror.com/yuekai/model_repo_sense_voice_small/blob/main/export_onnx.py
+ as a reference while writing this file.
+ 
+ Thanks to https://github.com/yuekaizhang for making the file public.
+ """
+ 
+ import os
+ from typing import Any, Dict, Tuple
+ 
+ import onnx
+ import torch
+ from model import SenseVoiceSmall
+ from onnxruntime.quantization import QuantType, quantize_dynamic
+ 
+ 
+ def add_meta_data(filename: str, meta_data: Dict[str, Any]):
+     """Replace the metadata of an ONNX model file with the given entries.
+ 
+     The model is loaded from ``filename``, all of its existing metadata
+     properties are removed, one property is added per item of ``meta_data``,
+     and the model is saved back to the same path.
+ 
+     Args:
+       filename:
+         Path of the ONNX model. The file is overwritten.
+       meta_data:
+         Key-value pairs. Every value is stored as ``str(value)``.
+     """
+     onnx_model = onnx.load(filename)
+     props = onnx_model.metadata_props
+ 
+     # Clear all existing entries first.
+     for _ in range(len(props)):
+         props.pop()
+ 
+     for name in meta_data:
+         entry = props.add()
+         entry.key = name
+         entry.value = str(meta_data[name])
+ 
+     onnx.save(onnx_model, filename)
+ 
+ 
+ def modified_forward(
+     self,
+     x: torch.Tensor,
+     x_length: torch.Tensor,
+     language: torch.Tensor,
+     text_norm: torch.Tensor,
+ ):
+     """Forward pass used for ONNX export; returns only the CTC logits.
+ 
+     Four embedded query frames (language, two event/emotion frames,
+     text-norm) are put in front of ``x`` along the time axis before the
+     encoder is run.
+ 
+     Args:
+       x:
+         A 3-D tensor of shape (N, T, C) with dtype torch.float32
+       x_length:
+         A 1-D tensor of shape (N,) with dtype torch.int32. It is not
+         modified by this function.
+       language:
+         A 1-D tensor of shape (N,) with dtype torch.int32
+         See also https://github.com/FunAudioLLM/SenseVoice/blob/a80e676461b24419cf1130a33d4dd2f04053e5cc/model.py#L640
+       text_norm:
+         A 1-D tensor of shape (N,) with dtype torch.int32
+         See also https://github.com/FunAudioLLM/SenseVoice/blob/a80e676461b24419cf1130a33d4dd2f04053e5cc/model.py#L642
+ 
+     Returns:
+       The output of ``self.ctc.ctc_lo`` applied to the encoder output.
+       NOTE(review): presumably of shape (N, T + 4, vocab_size) if the
+       encoder keeps the number of frames -- confirm.
+     """
+     language_query = self.embed(language).unsqueeze(1)
+     text_norm_query = self.embed(text_norm).unsqueeze(1)
+ 
+     # Ids 1 and 2 are embedded once and repeated for every batch item.
+     event_emo_query = self.embed(torch.LongTensor([[1, 2]])).repeat(x.size(0), 1, 1)
+ 
+     x = torch.cat((language_query, event_emo_query, text_norm_query, x), dim=1)
+ 
+     # 4 = 1 language frame + 2 event/emotion frames + 1 text-norm frame.
+     # Computed out-of-place so that the caller's tensor is left untouched.
+     x_length = x_length + 4
+ 
+     encoder_out, _ = self.encoder(x, x_length)
+     if isinstance(encoder_out, tuple):
+         encoder_out = encoder_out[0]
+ 
+     ctc_logits = self.ctc.ctc_lo(encoder_out)
+ 
+     return ctc_logits
+ 
+ 
+ def load_cmvn(filename) -> Tuple[str, str]:
+     """Read the two CMVN vectors from ``filename`` as comma-separated strings.
+ 
+     Only lines that start with ``<LearnRateCoef>`` are used. For each such
+     line the first three whitespace-separated fields and the last field are
+     dropped and the remaining fields are joined with ``,``.
+ 
+     Returns:
+       A pair ``(neg_mean, inv_stddev)``. ``neg_mean`` comes from the first
+       matching line and ``inv_stddev`` from the last of any later matching
+       lines. An entry is ``None`` if there is no line for it.
+     """
+     vectors = []
+ 
+     with open(filename) as f:
+         for line in f:
+             if line.startswith("<LearnRateCoef>"):
+                 vectors.append(",".join(line.split()[3:-1]))
+ 
+     neg_mean = vectors[0] if vectors else None
+     inv_stddev = vectors[-1] if len(vectors) > 1 else None
+ 
+     return neg_mean, inv_stddev
+ 
+ 
+ def generate_tokens(params):
+     """Write the tokenizer vocabulary to ``tokens.txt`` in the current directory.
+ 
+     One line is written per id from 0 to ``vocab_size - 1``, holding the
+     piece and its id separated by a space. Afterwards the beginning and the
+     end of the file are printed with ``head`` and ``tail``.
+ 
+     Args:
+       params:
+         A dict whose ``"tokenizer"`` entry has an ``sp`` attribute providing
+         ``vocab_size()`` and ``id_to_piece(i)``.
+     """
+     tokenizer = params["tokenizer"].sp
+ 
+     with open("tokens.txt", "w", encoding="utf-8") as f:
+         f.writelines(
+             f"{tokenizer.id_to_piece(token_id)} {token_id}\n"
+             for token_id in range(tokenizer.vocab_size())
+         )
+ 
+     os.system("head tokens.txt; tail -n200 tokens.txt")
+ 
+ 
+ def _print_file(path):
+     """Print the content of the text file ``path``, or a note if it is unreadable."""
+     try:
+         with open(str(path), encoding="utf-8", errors="replace") as f:
+             print(f.read(), end="")
+     except OSError as e:
+         # Best effort: this output is only informational.
+         print(f"Failed to read {path}: {e}")
+ 
+ 
+ def display_params(params):
+     """Print the model parameters and the files they point to.
+ 
+     The files are read in Python rather than through ``cat`` in a shell, so
+     paths containing spaces or shell metacharacters work, and the file
+     content goes through the same stdout buffer as the headers and therefore
+     appears in order even when stdout is a pipe.
+ 
+     Args:
+       params:
+         A dict with at least ``"frontend_conf"`` (a dict containing
+         ``"cmvn_file"``, a file path) and ``"config"`` (a file path).
+     """
+     print("----------params----------")
+     print(params)
+ 
+     print("----------frontend_conf----------")
+     print(params["frontend_conf"])
+ 
+     _print_file(params["frontend_conf"]["cmvn_file"])
+ 
+     print("----------config----------")
+     print(params["config"])
+ 
+     _print_file(params["config"])
+ 
+ 
+ def main():
+     """Export SenseVoiceSmall to ONNX and write an int8-quantized copy.
+ 
+     Writes ``tokens.txt``, ``model.onnx`` (with metadata attached) and
+     ``model.int8.onnx`` to the current directory.
+     """
+     model, params = SenseVoiceSmall.from_pretrained(model="iic/SenseVoiceSmall")
+     display_params(params)
+ 
+     generate_tokens(params)
+ 
+     # Replace forward on the class so that the export below traces
+     # modified_forward: inputs (x, x_length, language, text_norm) and the
+     # CTC logits as the only output.
+     model.__class__.forward = modified_forward
+ 
+     # Example inputs used for the export; the batch and time axes are
+     # declared dynamic in dynamic_axes below.
+     # NOTE(review): the language / text_norm ids are presumably valid
+     # entries of model.lid_dict / model.textnorm_dict -- confirm.
+     x = torch.randn(2, 100, 560, dtype=torch.float32)
+     x_length = torch.tensor([80, 100], dtype=torch.int32)
+     language = torch.tensor([0, 3], dtype=torch.int32)
+     text_norm = torch.tensor([14, 15], dtype=torch.int32)
+ 
+     opset_version = 13
+     filename = "model.onnx"
+     torch.onnx.export(
+         model,
+         (x, x_length, language, text_norm),
+         filename,
+         opset_version=opset_version,
+         input_names=["x", "x_length", "language", "text_norm"],
+         output_names=["logits"],
+         dynamic_axes={
+             "x": {0: "N", 1: "T"},
+             "x_length": {0: "N"},
+             "language": {0: "N"},
+             "text_norm": {0: "N"},
+             "logits": {0: "N", 1: "T"},
+         },
+     )
+ 
+     lfr_window_size = params["frontend_conf"]["lfr_m"]
+     lfr_window_shift = params["frontend_conf"]["lfr_n"]
+ 
+     # Both are comma-separated strings read from the CMVN file.
+     neg_mean, inv_stddev = load_cmvn(params["frontend_conf"]["cmvn_file"])
+     vocab_size = params["tokenizer"].sp.vocab_size()
+ 
+     # Metadata embedded into model.onnx. add_meta_data() converts every
+     # value to str before storing it.
+     meta_data = {
+         "lfr_window_size": lfr_window_size,
+         "lfr_window_shift": lfr_window_shift,
+         "neg_mean": neg_mean,
+         "inv_stddev": inv_stddev,
+         "model_type": "sense_voice_ctc",
+         "version": "1",
+         "model_author": "iic",
+         "maintainer": "k2-fsa",
+         "vocab_size": vocab_size,
+         "comment": "iic/SenseVoiceSmall",
+         "lang_auto": model.lid_dict["auto"],
+         "lang_zh": model.lid_dict["zh"],
+         "lang_en": model.lid_dict["en"],
+         "lang_yue": model.lid_dict["yue"],  # cantonese
+         "lang_ja": model.lid_dict["ja"],
+         "lang_ko": model.lid_dict["ko"],
+         "lang_nospeech": model.lid_dict["nospeech"],
+         "with_itn": model.textnorm_dict["withitn"],
+         "without_itn": model.textnorm_dict["woitn"],
+         "url": "https://huggingface.co/FunAudioLLM/SenseVoiceSmall",
+     }
+     add_meta_data(filename=filename, meta_data=meta_data)
+ 
+     # The int8 model is produced from model.onnx after the metadata has been
+     # added. Only MatMul nodes are quantized.
+     filename_int8 = "model.int8.onnx"
+     quantize_dynamic(
+         model_input=filename,
+         model_output=filename_int8,
+         op_types_to_quantize=["MatMul"],
+         weight_type=QuantType.QInt8,
+     )
+ 
+ 
+ if __name__ == "__main__":
+     # Seed torch so that the torch.randn input created in main() is the same
+     # on every run.
+     torch.manual_seed(20240717)
+     main()
--- a/scripts/sense-voice/run.sh 0 → 100755
查看文件 @346f419
+++ b/scripts/sense-voice/run.sh 0 → 100755
查看文件 @346f419
+ #!/usr/bin/env bash
+ 
+ set -ex
+ 
+ 
+ # Install the CPU build of torch/torchaudio, FunASR (editable, from a
+ # checkout in /tmp), the requirements of SenseVoice (also checked out in
+ # /tmp), and the packages used for the ONNX export.
+ function install() {
+   pip install torch==2.3.1+cpu torchaudio==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ 
+   pushd /tmp
+ 
+   # `git clone` fails when the target directory already exists and is not
+   # empty, which would abort the script under `set -e`. Skip the clone in
+   # that case so that the script can be re-run.
+   if [ ! -d FunASR ]; then
+     git clone https://github.com/alibaba/FunASR.git
+   fi
+   cd FunASR
+   pip3 install -qq -e ./
+   cd ..
+ 
+   if [ ! -d SenseVoice ]; then
+     git clone https://github.com/FunAudioLLM/SenseVoice
+   fi
+   cd SenseVoice
+   pip install -qq -r ./requirements.txt
+   cd ..
+ 
+   pip install soundfile onnx onnxruntime kaldi-native-fbank librosa
+ 
+   popd
+ }
+ 
+ install
+ 
+ # Make the two checkouts under /tmp importable. export-onnx.py imports
+ # SenseVoiceSmall from a module named `model`, presumably model.py of the
+ # SenseVoice checkout.
+ export PYTHONPATH=/tmp/FunASR:$PYTHONPATH
+ export PYTHONPATH=/tmp/SenseVoice:$PYTHONPATH
+ 
+ echo "pwd: $PWD"
+ 
+ # Writes tokens.txt, model.onnx and model.int8.onnx to the current directory.
+ ./export-onnx.py
+ 
+ # Prints the inputs, outputs and metadata of ./model.onnx.
+ ./show-info.py
+ 
+ ls -lh
--- a/scripts/sense-voice/show-info.py 0 → 100755
查看文件 @346f419
+++ b/scripts/sense-voice/show-info.py 0 → 100755
查看文件 @346f419
+ #!/usr/bin/env python3
+ # Copyright      2024  Xiaomi Corp.        (authors: Fangjun Kuang)
+ 
+ import onnxruntime
+ 
+ 
+ def show(filename):
+     """Print the inputs, outputs and custom metadata of an ONNX model.
+ 
+     Args:
+       filename:
+         Path of the ONNX model to load with onnxruntime.
+     """
+     options = onnxruntime.SessionOptions()
+     options.log_severity_level = 3
+     session = onnxruntime.InferenceSession(filename, options)
+ 
+     for node in session.get_inputs():
+         print(node)
+ 
+     print("-----")
+ 
+     for node in session.get_outputs():
+         print(node)
+ 
+     custom_meta = session.get_modelmeta().custom_metadata_map
+     print("*****************************************")
+     print("meta\n", custom_meta)
+ 
+ 
+ def main():
+     """Print a banner followed by the description of ``./model.onnx``."""
+     banner = "=========model=========="
+     print(banner)
+     show(filename="./model.onnx")
+ 
+ 
+ if __name__ == "__main__":
+     main()
+ """
+ =========model==========
+ NodeArg(name='x', type='tensor(float)', shape=['N', 'T', 560])
+ NodeArg(name='x_length', type='tensor(int32)', shape=['N'])
+ NodeArg(name='language', type='tensor(int32)', shape=['N'])
+ NodeArg(name='text_norm', type='tensor(int32)', shape=['N'])
+ -----
+ NodeArg(name='logits', type='tensor(float)', shape=['N', 'T', 25055])
+ *****************************************
+ """