Fangjun Kuang

Add int8 quantized whisper large models (#1126)

@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [macos-latest]
-        model: ["distil-medium.en", "distil-small.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "medium-aishell", "large", "large-v1", "large-v2", "distil-large-v2"]
+        model: ["distil-medium.en", "distil-small.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "medium-aishell", "large", "large-v1", "large-v2", "large-v3", "distil-large-v2"]
         # model: ["large", "large-v1", "large-v2", "large-v3", "distil-large-v2"]
         python-version: ["3.8"]
@@ -56,11 +56,7 @@ jobs:
           python3 ./export-onnx.py --model ${{ matrix.model }}
           # python3 -m onnxruntime.tools.convert_onnx_models_to_ort --optimization_style=Fixed ./
           #
-          if [[ $model == medium-aishell ]]; then
-            ls -lh *.onnx
-            rm -fv medium-aishell-encoder.onnx
-            rm -fv medium-aishell-decoder.onnx
-          fi
+

           ls -lh

@@ -97,16 +93,34 @@ jobs:
           ls -lh $src
           echo "--------------------"

-          if [[ $model == large || $model == large-v1 || $model == large-v2 || $model == distil-large-v2 ]]; then
-            echo "Don't release model to github for large models. $model"
+          if [[ $model == medium-aishell ]]; then
+            ls -lh *.onnx # the float32 onnx model for medium-aishell is too large to be uploaded to GitHub
+            mkdir -p bak
+            mv -v $src/$model-encoder.onnx ./bak
+            mv -v $src/$model-decoder.onnx ./bak
+            ls -lh $src
+
+            tar cvjf $src.tar.bz2 $src
+            mv -v ./bak/* $src/
+            rm -rf bak
+          elif [[ -f $src/$model-encoder.weights ]]; then
+            # we only publish int8 models to GitHub for large Whisper models
+            mkdir -p bak
+            mv -v $src/*weights ./bak
+            mv -v $src/$model-encoder.onnx ./bak
+            mv -v $src/$model-decoder.onnx ./bak
+            ls -lh $src
+
+            tar cvjf $src.tar.bz2 $src
+            mv -v ./bak/* $src/
+            rm -rf bak
           else
             tar cvjf $src.tar.bz2 $src
           fi

-          ls -lh
+          ls -lh *.tar.bz2

       - name: Release
-        if: matrix.model != 'large' && matrix.model != 'large-v1' && matrix.model != 'large-v2' && matrix.model != 'large-v3' && matrix.model != 'distil-large-v2'
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
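A note on the stash-and-restore dance above: GitHub caps the size of release assets (roughly 2 GB per file), so the float32 encoder/decoder models, and for the large variants their external .weights files, are parked in ./bak while the tarball is built, then moved back so the later Hugging Face step can still upload everything. The same packaging effect can be sketched without moving files by filtering them out of the archive; the Python below is illustrative only (the exclude_float32 helper and the hard-coded model name are not part of this commit):

#!/usr/bin/env python3
# Sketch: build a release tarball that skips the float32 artifacts instead
# of stashing them in ./bak and restoring them afterwards. The directory
# layout mirrors the workflow above; the helper itself is illustrative.
import tarfile
from pathlib import Path

model = "large-v3"
src = Path(f"sherpa-onnx-whisper-{model}")

# Float32 files that are too large for a GitHub release asset; the int8
# models and everything else still go into the tarball.
skip = {
    f"{model}-encoder.onnx",
    f"{model}-decoder.onnx",
    f"{model}-encoder.weights",
    f"{model}-decoder.weights",
}

def exclude_float32(info: tarfile.TarInfo):
    # Returning None drops the member from the archive.
    return None if Path(info.name).name in skip else info

with tarfile.open(f"{src}.tar.bz2", "w:bz2") as tar:
    tar.add(src, filter=exclude_float32)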
@@ -132,9 +146,7 @@ jobs:

           git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-${{ matrix.model }} huggingface

-          if [[ $model != medium-aishell ]]; then
-            rm -rf huggingface/*
-          fi
+          rm -rf huggingface/*

           cp -av $src/* ./huggingface/

@@ -149,11 +161,10 @@ jobs:
           git commit -m "upload ${{ matrix.model }}"
           git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-whisper-${{ matrix.model }} main

-      - name: Test ${{ matrix.model }}
+      - name: Test float32 ${{ matrix.model }}
         shell: bash
         run: |
           python3 -m pip install kaldi-native-fbank
-          git checkout .
           model=${{ matrix.model }}
           src=sherpa-onnx-whisper-$model
           time python3 scripts/whisper/test.py \
@@ -161,3 +172,14 @@ jobs:
             --decoder $src/$model-decoder.onnx \
             --tokens $src/$model-tokens.txt \
             $src/test_wavs/0.wav
+
+      - name: Test int8 ${{ matrix.model }}
+        shell: bash
+        run: |
+          model=${{ matrix.model }}
+          src=sherpa-onnx-whisper-$model
+          time python3 scripts/whisper/test.py \
+            --encoder $src/$model-encoder.int8.onnx \
+            --decoder $src/$model-decoder.int8.onnx \
+            --tokens $src/$model-tokens.txt \
+            $src/test_wavs/0.wav
@@ -582,9 +582,6 @@ def main():
         location=decoder_external_filename + ".weights",
     )

-    if "large" in args.model:
-        # it causes errors for large models, so skip it.
-        return
     # Generate int8 quantization models
     # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection

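With the early return gone, the export script (presumably scripts/whisper/export-onnx.py) now falls through to the "Generate int8 quantization models" step for the large variants as well. Going by the linked onnxruntime docs, that step is dynamic weight quantization; a minimal sketch of such a call, with illustrative file names rather than the script's actual arguments:

# Sketch of the int8 step the script now runs for large models too,
# following the linked onnxruntime quantization docs. File names are
# assumptions for illustration; the script's real arguments may differ.
from onnxruntime.quantization import QuantType, quantize_dynamic

for part in ("encoder", "decoder"):
    quantize_dynamic(
        model_input=f"large-v3-{part}.onnx",
        model_output=f"large-v3-{part}.int8.onnx",
        # Weights are stored as int8; activations stay float and are
        # quantized on the fly at run time (dynamic quantization).
        weight_type=QuantType.QInt8,
    )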
@@ -90,6 +90,7 @@ class OnnxModel:
         self.n_text_layer = int(meta["n_text_layer"])
         self.n_text_ctx = int(meta["n_text_ctx"])
         self.n_text_state = int(meta["n_text_state"])
+        self.n_mels = int(meta["n_mels"])
         self.sot = int(meta["sot"])
         self.eot = int(meta["eot"])
         self.translate = int(meta["translate"])
@@ -294,8 +295,9 @@ def main():
     args = get_args()

     model = OnnxModel(args.encoder, args.decoder)
-    dim = 80 if "large-v3" not in args.encoder else 128
-    mel = compute_features(args.sound_file, dim=dim)
+    n_mels = model.n_mels
+
+    mel = compute_features(args.sound_file, dim=n_mels)

     n_layer_cross_k, n_layer_cross_v = model.run_encoder(mel)

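Reading n_mels from the model metadata written at export time replaces the fragile file-name check, so the same test script handles large-v3 (128 mel bins) and the earlier 80-bin models alike. For reference, this is how such custom metadata can be read back with onnxruntime; the model path below is illustrative:

# Standalone sketch: read custom metadata such as n_mels back from an
# exported ONNX file with onnxruntime. The model path is illustrative.
import onnxruntime as ort

sess = ort.InferenceSession(
    "sherpa-onnx-whisper-large-v3/large-v3-encoder.int8.onnx",
    providers=["CPUExecutionProvider"],
)
meta = sess.get_modelmeta().custom_metadata_map  # dict of str -> str
n_mels = int(meta["n_mels"])  # 128 for large-v3, 80 for earlier models
print("feature dim:", n_mels)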