name: export-moonshine-to-onnx

on:
  workflow_dispatch:

concurrency:
  group: export-moonshine-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-moonshine-to-onnx:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export moonshine models to ONNX
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [macos-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        shell: bash
        run: |
          pip install -q onnx onnxruntime librosa tokenizers soundfile

      - name: Run
        shell: bash
        run: |
          pushd scripts/moonshine
          ./run.sh
          popd

          mv -v scripts/moonshine/*.tar.bz2 .
          mv -v scripts/moonshine/sherpa-onnx-* ./

      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models

      - name: Publish to huggingface (tiny)
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            d=sherpa-onnx-moonshine-tiny-en-int8
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false
            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface
            mv -v $d/* ./huggingface
            cd huggingface
            git lfs track "*.onnx"
            git lfs track "*.wav"
            git status
            git add .
            git status
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
            cd ..
            rm -rf huggingface

      - name: Publish to huggingface (base)
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            d=sherpa-onnx-moonshine-base-en-int8
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false
            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface
            mv -v $d/* ./huggingface
            cd huggingface
            git lfs track "*.onnx"
            git lfs track "*.wav"
            git status
            git add .
            git status
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
            cd ..
            rm -rf huggingface
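After a successful run, the archives are attached to the `asr-models` release of k2-fsa/sherpa-onnx. A sketch of fetching one (the URL follows GitHub's standard release-asset layout; the archive name comes from run.sh below):

```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
ls sherpa-onnx-moonshine-tiny-en-int8
```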
# Introduction

This directory contains scripts for downloading, quantizing, and packaging
models from
https://github.com/usefulsensors/moonshine

See the upstream license at
https://github.com/usefulsensors/moonshine/blob/main/LICENSE
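To reproduce the export locally, something along these lines should work
(the dependency list is taken from the CI workflow above):

```bash
# Dependencies used by the CI workflow
pip install onnx onnxruntime librosa tokenizers soundfile

# Download, quantize, test, and package the tiny and base models
./run.sh

# The resulting archives are left in the current directory
ls -lh sherpa-onnx-moonshine-*-en-int8.tar.bz2
```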
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

from pathlib import Path

import tokenizers
from onnxruntime.quantization import QuantType, quantize_dynamic


def generate_tokens():
    """Generate tokens.txt from tokenizer.json.

    Each line contains a token and its ID, separated by a tab.
    """
    if Path("./tokens.txt").is_file():
        return
    print("Generating tokens.txt")
    tokenizer = tokenizers.Tokenizer.from_file("./tokenizer.json")
    vocab_size = tokenizer.get_vocab_size()
    with open("tokens.txt", "w", encoding="utf-8") as f:
        for i in range(vocab_size):
            s = tokenizer.id_to_token(i).strip()
            f.write(f"{s}\t{i}\n")


def main():
    generate_tokens()

    # Note(fangjun): Don't use int8 for the preprocessor, since quantizing
    # it has a larger impact on the accuracy.
    for f in ["uncached_decode", "cached_decode", "encode"]:
        if Path(f"{f}.int8.onnx").is_file():
            continue

        print("processing", f)
        quantize_dynamic(
            model_input=f"{f}.onnx",
            model_output=f"{f}.int8.onnx",
            weight_type=QuantType.QInt8,
        )


if __name__ == "__main__":
    main()
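export-onnx.py can also be invoked on its own. A minimal sketch, assuming the fp32 models and tokenizer.json (downloaded by run.sh below) are already in the current directory:

```bash
# Inputs read from the current directory:
#   tokenizer.json  encode.onnx  uncached_decode.onnx  cached_decode.onnx
./export-onnx.py

# Outputs: tokens.txt plus the int8 models; preprocess.onnx is deliberately
# kept as float32 (see the note in main() above)
ls tokens.txt encode.int8.onnx uncached_decode.int8.onnx cached_decode.int8.onnx
```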
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex

cat >LICENSE <<EOF
MIT License

Copyright (c) 2024 Useful Sensors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
EOF

# Download the fp32 ONNX models, the test waves, and the tokenizer.
function download_files() {
  for d in tiny base; do
    mkdir $d

    pushd $d
    curl -SL -O https://huggingface.co/UsefulSensors/moonshine/resolve/main/onnx/$d/preprocess.onnx
    curl -SL -O https://huggingface.co/UsefulSensors/moonshine/resolve/main/onnx/$d/encode.onnx
    curl -SL -O https://huggingface.co/UsefulSensors/moonshine/resolve/main/onnx/$d/uncached_decode.onnx
    curl -SL -O https://huggingface.co/UsefulSensors/moonshine/resolve/main/onnx/$d/cached_decode.onnx
    popd
  done

  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-base/resolve/main/test_wavs/0.wav
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-base/resolve/main/test_wavs/1.wav
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-base/resolve/main/test_wavs/8k.wav
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-base/resolve/main/test_wavs/trans.txt

  curl -SL -O https://raw.githubusercontent.com/usefulsensors/moonshine/refs/heads/main/moonshine/assets/tokenizer.json
}

# Quantize the encoder and the decoders to int8, generate tokens.txt,
# and run test.py as a sanity check.
function quantize() {
  for d in tiny base; do
    echo "==========$d=========="
    ls -lh
    mv $d/*.onnx .
    ./export-onnx.py
    rm cached_decode.onnx
    rm uncached_decode.onnx
    rm encode.onnx
    ls -lh

    ./test.py

    mv *.onnx $d
    mv tokens.txt $d
    ls -lh $d
  done
}

# Package each model directory into a .tar.bz2 archive together with
# the test waves, the LICENSE, and the README.
function zip() {
  for d in tiny base; do
    s=sherpa-onnx-moonshine-$d-en-int8
    mv $d $s

    mkdir $s/test_wavs

    cp -v *.wav $s/test_wavs
    cp trans.txt $s/test_wavs
    cp LICENSE $s/
    cp ./README.md $s

    ls -lh $s
    tar cjfv $s.tar.bz2 $s
  done
}

download_files
quantize
zip

ls -lh
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
import datetime as dt

import librosa
import numpy as np
import onnxruntime as ort
import soundfile as sf


def display(sess, name):
    print(f"=========={name} Input==========")
    for i in sess.get_inputs():
        print(i)
    print(f"=========={name} Output==========")
    for i in sess.get_outputs():
        print(i)


class OnnxModel:
    def __init__(
        self,
        preprocess: str,
        encode: str,
        uncached_decode: str,
        cached_decode: str,
    ):
        self.init_preprocess(preprocess)
        display(self.preprocess, "preprocess")

        self.init_encode(encode)
        display(self.encode, "encode")

        self.init_uncached_decode(uncached_decode)
        display(self.uncached_decode, "uncached_decode")

        self.init_cached_decode(cached_decode)
        display(self.cached_decode, "cached_decode")

    def init_preprocess(self, preprocess):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.preprocess = ort.InferenceSession(
            preprocess,
            sess_options=session_opts,
            providers=["CPUExecutionProvider"],
        )

    def init_encode(self, encode):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.encode = ort.InferenceSession(
            encode,
            sess_options=session_opts,
            providers=["CPUExecutionProvider"],
        )

    def init_uncached_decode(self, uncached_decode):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.uncached_decode = ort.InferenceSession(
            uncached_decode,
            sess_options=session_opts,
            providers=["CPUExecutionProvider"],
        )

    def init_cached_decode(self, cached_decode):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.cached_decode = ort.InferenceSession(
            cached_decode,
            sess_options=session_opts,
            providers=["CPUExecutionProvider"],
        )

    def run_preprocess(self, audio):
        """
        Args:
          audio: (batch_size, num_samples), float32
        Returns:
          A tensor of shape (batch_size, T, dim), float32
        """
        return self.preprocess.run(
            [
                self.preprocess.get_outputs()[0].name,
            ],
            {
                self.preprocess.get_inputs()[0].name: audio,
            },
        )[0]

    def run_encode(self, features):
        """
        Args:
          features: (batch_size, T, dim)
        Returns:
          A tensor of shape (batch_size, T, dim)
        """
        features_len = np.array([features.shape[1]], dtype=np.int32)

        return self.encode.run(
            [
                self.encode.get_outputs()[0].name,
            ],
            {
                self.encode.get_inputs()[0].name: features,
                self.encode.get_inputs()[1].name: features_len,
            },
        )[0]

    def run_uncached_decode(self, token: int, token_len: int, encoder_out: np.ndarray):
        """
        Args:
          token: The current token
          token_len: Number of predicted tokens so far
          encoder_out: A tensor of shape (batch_size, T, dim)
        Returns:
          A tuple:
            - a tensor of shape (batch_size, 1, vocab_size)
            - a list of states
        """
        token_tensor = np.array([[token]], dtype=np.int32)
        token_len_tensor = np.array([token_len], dtype=np.int32)

        num_outs = len(self.uncached_decode.get_outputs())
        out_names = [
            self.uncached_decode.get_outputs()[i].name for i in range(num_outs)
        ]

        out = self.uncached_decode.run(
            out_names,
            {
                self.uncached_decode.get_inputs()[0].name: token_tensor,
                self.uncached_decode.get_inputs()[1].name: encoder_out,
                self.uncached_decode.get_inputs()[2].name: token_len_tensor,
            },
        )

        logits = out[0]
        states = out[1:]

        return logits, states

    def run_cached_decode(
        self, token: int, token_len: int, encoder_out: np.ndarray, states
    ):
        """
        Args:
          token: The current token
          token_len: Number of predicted tokens so far
          encoder_out: A tensor of shape (batch_size, T, dim)
          states: previous states
        Returns:
          A tuple:
            - a tensor of shape (batch_size, 1, vocab_size)
            - a list of states
        """
        token_tensor = np.array([[token]], dtype=np.int32)
        token_len_tensor = np.array([token_len], dtype=np.int32)

        num_outs = len(self.cached_decode.get_outputs())
        out_names = [self.cached_decode.get_outputs()[i].name for i in range(num_outs)]

        # Inputs 0-2 are the token, the encoder output, and the token length;
        # the remaining inputs are the decoder states from the previous step.
        states_inputs = {}
        for i in range(3, len(self.cached_decode.get_inputs())):
            name = self.cached_decode.get_inputs()[i].name
            states_inputs[name] = states[i - 3]

        out = self.cached_decode.run(
            out_names,
            {
                self.cached_decode.get_inputs()[0].name: token_tensor,
                self.cached_decode.get_inputs()[1].name: encoder_out,
                self.cached_decode.get_inputs()[2].name: token_len_tensor,
                **states_inputs,
            },
        )

        logits = out[0]
        states = out[1:]

        return logits, states


def main():
    wave = "./1.wav"
    id2token = dict()
    token2id = dict()
    with open("./tokens.txt", encoding="utf-8") as f:
        for line in f:
            t, idx = line.split("\t")
            id2token[int(idx)] = t
            token2id[t] = int(idx)

    model = OnnxModel(
        preprocess="./preprocess.onnx",
        encode="./encode.int8.onnx",
        uncached_decode="./uncached_decode.int8.onnx",
        cached_decode="./cached_decode.int8.onnx",
    )

    audio, sample_rate = sf.read(wave, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != 16000:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=16000,
        )
        sample_rate = 16000
    audio = audio[None]  # (1, num_samples)
    print("audio.shape", audio.shape)  # (1, 159414)

    start_t = dt.datetime.now()

    features = model.run_preprocess(audio)  # (1, 413, 288)
    print("features", features.shape)

    sos = token2id["<s>"]
    eos = token2id["</s>"]

    tokens = [sos]

    encoder_out = model.run_encode(features)
    print("encoder_out.shape", encoder_out.shape)  # (1, 413, 288)

    # The first step uses the uncached decoder; later steps feed the
    # returned states back into the cached decoder.
    logits, states = model.run_uncached_decode(
        token=tokens[-1],
        token_len=len(tokens),
        encoder_out=encoder_out,
    )

    print("logits.shape", logits.shape)  # (1, 1, 32768)
    print("len(states)", len(states))  # 24

    # Cap decoding at 6 tokens per second of audio
    max_len = int((audio.shape[-1] / 16000) * 6)

    for i in range(max_len):
        token = logits.squeeze().argmax()
        if token == eos:
            break
        tokens.append(token)

        logits, states = model.run_cached_decode(
            token=tokens[-1],
            token_len=len(tokens),
            encoder_out=encoder_out,
            states=states,
        )

    tokens = tokens[1:]  # remove sos
    words = [id2token[i] for i in tokens]
    underline = "▁"
    # underline = b"\xe2\x96\x81".decode()
    text = "".join(words).replace(underline, " ").strip()

    end_t = dt.datetime.now()
    t = (end_t - start_t).total_seconds()
    rtf = t * 16000 / audio.shape[-1]

    print(text)
    print("RTF:", rtf)


if __name__ == "__main__":
    main()