Support distil-small.en whisper (#472)

Fangjun Kuang · GitHub
Commit 868c339e5e78e0bafd318c95a2cd598474103fd8 868c339e 1 parent 3ae984f1
.github/scripts/test-offline-whisper.sh
.github/workflows/export-whisper-to-onnx.yaml
.github/workflows/test-python-offline-websocket-server.yaml
CMakeLists.txt
scripts/whisper/export-onnx.py
scripts/whisper/test.py
sherpa-onnx/csrc/offline-recognizer-whisper-impl.h
--- a/.github/scripts/test-offline-whisper.sh
查看文件 @868c339
+++ b/.github/scripts/test-offline-whisper.sh
查看文件 @868c339
@@ -22,6 +22,8 @@ tiny
 base
 small
 medium
+distil-medium.en
+distil-small.en
 )
 for name in ${names[@]}; do
--- a/.github/workflows/export-whisper-to-onnx.yaml
查看文件 @868c339
+++ b/.github/workflows/export-whisper-to-onnx.yaml
查看文件 @868c339
@@ -15,8 +15,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest]
-        model: ["distil-medium.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
+        os: [macos-latest]
+        # model: ["distil-medium.en", "distil-small.en",  "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "large-v1", "large-v2", "distil-large-v2"]
+        model: ["distil-medium.en", "distil-small.en",  "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium"]
         python-version: ["3.8"]
     steps:
@@ -42,23 +43,33 @@ jobs:
           if [[ $model == distil-medium.en ]]; then
             wget -q -O distil-medium-en-original-model.bin https://huggingface.co/distil-whisper/distil-medium.en/resolve/main/original-model.bin
             ls -lh
+          elif [[ $model == distil-large-v2 ]]; then
+            wget -q -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin
+            ls -lh
+          elif [[ $model == distil-small.en ]]; then
+            wget -q -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin
+            ls -lh
           fi
           python3 ./export-onnx.py --model ${{ matrix.model }}
           # python3 -m onnxruntime.tools.convert_onnx_models_to_ort --optimization_style=Fixed ./
           ls -lh
-          if [[ $model != distil-medium.en ]]; then
-            ls -lh ~/.cache/whisper
-          fi
+          ls -lh ~/.cache/whisper || true
+          ls -lh distil*original-model.bin || true
+          rm -rf ~/.cache/whisper
+          rm -f distil*original-model.bin
           src=sherpa-onnx-whisper-${{ matrix.model }}
-          mkdir $src
-          cp *.onnx $src/
-          cp *tokens.txt $src
+          cd ..
+          mv whisper $src
+
+          echo "------------------------------"
           cd $src
+          du -h -d1 .
+          ls -lh
           mkdir -p test_wavs
           cd test_wavs
           wget -q https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/0.wav
@@ -66,21 +77,32 @@ jobs:
           wget -q https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/8k.wav
           wget -q https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/trans.txt
           cd ../..
-          mv $src ../..
+          mv $src ../
+          echo "pwd: $PWD"
-          cd ../..
+          cd ../
           echo "--------------------"
           ls -lh
           ls -lh $src
           echo "--------------------"
-          tar cjvf ./$src.tar.bz2 $src
+          if [[ $model == large || $model == large-v1 || $model == large-v2 || $model == distil-large-v2 ]]; then
+            #tar cvjf - $src | split --bytes=1024MB - $src.tar.bz2.
+            tar cvjf $src.tar.bz2 $src
+            split -b 1G $src.tar.bz2 $src.tar.bz2.
+            rm $src.tar.bz2
+            # cat $src.tar.gz.* | tar xjf -
+          else
+            tar cvjf $src.tar.bz2 $src
+          fi
+          ls -lh
+
       - name: Release
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
-          file: ./*.tar.bz2
+          file: ./*.tar*
           overwrite: true
           repo_name: k2-fsa/sherpa-onnx
           repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
@@ -99,14 +121,21 @@ jobs:
           GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-${{ matrix.model }} huggingface
           rm -rf huggingface/*
-          cp -av $src/* ./huggingface/
+          if [[ $model == large || $model == large-v1 || $model == large-v2 || $model == distil-large-v2 ]]; then
+            mv $src.tar* ./huggingface
+          else
+            cp -v $src/*.onnx ./huggingface
+            cp -v $src/*tokens* ./huggingface
+            cp -av $src/test_wavs ./huggingface
+          fi
           cd huggingface
           git status
           ls -lh
-          git lfs track "*.onnx"
-          # git lfs track "*.ort"
+          git lfs track "*gz*"
+          git lfs track "*onnx*"
+
           git add .
           git commit -m "upload ${{ matrix.model }}"
           git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-whisper-${{ matrix.model }} main
--- a/.github/workflows/test-python-offline-websocket-server.yaml
查看文件 @868c339
+++ b/.github/workflows/test-python-offline-websocket-server.yaml
查看文件 @868c339
@@ -90,7 +90,7 @@ jobs:
             ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav
       - name: Start server for paraformer models
-        if: matrix.model_type == 'paraformer'
+        if: matrix.model_type == 'paraformer' && matrix.os != 'windows-latest'
         shell: bash
         run: |
           GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-bilingual-zh-en
@@ -106,7 +106,7 @@ jobs:
           sleep 10
       - name: Start client for paraformer models
-        if: matrix.model_type == 'paraformer'
+        if: matrix.model_type == 'paraformer' && matrix.os != 'windows-latest'
         shell: bash
         run: |
           python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py \
--- a/CMakeLists.txt
查看文件 @868c339
+++ b/CMakeLists.txt
查看文件 @868c339
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)
-set(SHERPA_ONNX_VERSION "1.9.0")
+set(SHERPA_ONNX_VERSION "1.9.1")
 # Disable warning about
 #
--- a/scripts/whisper/export-onnx.py
查看文件 @868c339
+++ b/scripts/whisper/export-onnx.py
查看文件 @868c339
@@ -44,7 +44,7 @@ def get_args():
             "tiny", "tiny.en", "base", "base.en",
             "small", "small.en", "medium", "medium.en",
             "large", "large-v1", "large-v2",
-            "distil-medium.en",
+            "distil-medium.en", "distil-small.en", "distil-large-v2"
             ],
         # fmt: on
     )
@@ -314,6 +314,32 @@ def main():
             """
             )
         model = whisper.load_model(filename)
+    elif name == "distil-large-v2":
+        filename = "./distil-large-v2-original-model.bin"
+        if not Path(filename).is_file():
+            raise ValueError(
+                """
+                Please go to https://huggingface.co/distil-whisper/distil-large-v2
+                to download original-model.bin
+                You can use the following command to do that:
+
+                wget -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin
+            """
+            )
+        model = whisper.load_model(filename)
+    elif name == "distil-small.en":
+        filename = "./distil-small-en-original-model.bin"
+        if not Path(filename).is_file():
+            raise ValueError(
+                """
+                Please go to https://huggingface.co/distil-whisper/distil-small.en
+                to download original-model.bin
+                You can use the following command to do that:
+
+                wget -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin
+            """
+            )
+        model = whisper.load_model(filename)
     else:
         model = whisper.load_model(name)
     print(model.dims)
--- a/scripts/whisper/test.py
查看文件 @868c339
+++ b/scripts/whisper/test.py
查看文件 @868c339
@@ -209,7 +209,7 @@ class OnnxModel:
         logits = logits.reshape(-1)
         mask = torch.ones(logits.shape[0], dtype=torch.int64)
         mask[self.all_language_tokens] = 0
-        logits[mask] = float("-inf")
+        logits[mask != 0] = float("-inf")
         lang_id = logits.argmax().item()
         print("detected language: ", self.id2lang[lang_id])
         return lang_id
@@ -263,7 +263,9 @@ def compute_features(filename: str) -> torch.Tensor:
     target = 3000
     if mel.shape[0] > target:
-        mel = mel[:target]
+        # -50 so that there are some zero tail paddings.
+        mel = mel[: target - 50]
+        mel = torch.nn.functional.pad(mel, (0, 0, 0, 50), "constant", 0)
     # We don't need to pad it to 30 seconds now!
     #  mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
--- a/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h
查看文件 @868c339
+++ b/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h
查看文件 @868c339
@@ -106,11 +106,12 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
     std::vector<float> f = s->GetFrames();
     int32_t num_frames = f.size() / feat_dim;
-    if (num_frames > max_num_frames) {
+    // we use 50 here so that there will be some zero tail paddings
+    if (num_frames >= max_num_frames - 50) {
       SHERPA_ONNX_LOGE(
           "Only waves less than 30 seconds are supported. We process only the "
           "first 30 seconds and discard the remaining data");
-      num_frames = max_num_frames;
+      num_frames = max_num_frames - 50;
     }
     NormalizeFeatures(f.data(), num_frames, feat_dim);
@@ -140,7 +141,7 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
     Ort::Value mel = Ort::Value::CreateTensor<float>(
         model_->Allocator(), shape.data(), shape.size());
     float *p_mel = mel.GetTensorMutableData<float>();
-    std::copy(f.begin(), f.end(), p_mel);
+    std::copy(f.data(), f.data() + actual_frames * feat_dim, p_mel);
     memset(p_mel + f.size(), 0,
            (actual_frames - num_frames) * feat_dim * sizeof(float));