Fangjun Kuang
Committed by GitHub

Export whisper distil-large-v3 and distil-large-v3.5 to sherpa-onnx (#2506)

This PR adds support for exporting two new whisper distil models (distil-large-v3 and distil-large-v3.5) to ONNX format for use with sherpa-onnx. The changes enable these models to be processed through the existing export pipeline.

- Added support for distil-large-v3 and distil-large-v3.5 models in the export script
- Updated GitHub workflow to include the new models in the CI matrix
- Configured proper n_mels parameter (128) for the new distil models
@@ -16,8 +16,9 @@ jobs:
       fail-fast: false
       matrix:
         os: [macos-latest]
-        model: ["turbo", "distil-medium.en", "distil-small.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "medium-aishell", "large", "large-v1", "large-v2", "large-v3", "distil-large-v2"]
+        model: ["turbo", "distil-medium.en", "distil-small.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "medium-aishell", "large", "large-v1", "large-v2", "large-v3", "distil-large-v2", "distil-large-v3", "distil-large-v3.5"]
         # model: ["large", "large-v1", "large-v2", "large-v3", "distil-large-v2"]
+        # model: ["distil-large-v3.5", "distil-large-v3"]
         python-version: ["3.8"]

     steps:
@@ -47,6 +48,12 @@ jobs:
           elif [[ $model == distil-large-v2 ]]; then
             wget -q -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin
             ls -lh
+          elif [[ $model == distil-large-v3 ]]; then
+            wget -q -O distil-large-v3-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3-openai/resolve/main/model.bin
+            ls -lh
+          elif [[ $model == distil-large-v3.5 ]]; then
+            wget -q -O distil-large-v3.5-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3.5-openai/resolve/main/model.bin
+            ls -lh
           elif [[ $model == distil-small.en ]]; then
             wget -q -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin
             ls -lh
@@ -155,6 +162,7 @@ jobs:

           git status
           ls -lh
+          git lfs track "*.wav*"
           git lfs track "*onnx*"
           git lfs track "*weights*"

@@ -49,7 +49,8 @@ def get_args():
             "large-v1", "large-v2",
             "large", "large-v3", "turbo",  # these three have feature dim 128
             "distil-medium.en", "distil-small.en", "distil-large-v2",
-            # "distil-large-v3",  # distil-large-v3 is not supported!
+            "distil-large-v3",
+            "distil-large-v3.5",
             # for fine-tuned models from icefall
             "medium-aishell",
         ],
@@ -348,6 +349,32 @@ def main():
             """
         )
         model = whisper.load_model(filename)
+    elif name == "distil-large-v3":
+        filename = "./distil-large-v3-original-model.bin"
+        if not Path(filename).is_file():
+            raise ValueError(
+                """
+                Please go to https://huggingface.co/distil-whisper/distil-large-v3-openai
+                to download model.bin
+                You can use the following command to do that:
+
+                wget -O distil-large-v3-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3-openai/resolve/main/model.bin
+                """
+            )
+        model = whisper.load_model(filename)
+    elif name == "distil-large-v3.5":
+        filename = "./distil-large-v3.5-original-model.bin"
+        if not Path(filename).is_file():
+            raise ValueError(
+                """
+                Please go to https://huggingface.co/distil-whisper/distil-large-v3.5-openai/
+                to download model.bin
+                You can use the following command to do that:
+
+                wget -O distil-large-v3.5-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3.5-openai/resolve/main/model.bin
+                """
+            )
+        model = whisper.load_model(filename)
     elif name == "distil-small.en":
         filename = "./distil-small-en-original-model.bin"
         if not Path(filename).is_file():
@@ -405,10 +432,17 @@ def main():
     audio = whisper.pad_or_trim(audio)
     assert audio.shape == (16000 * 30,), audio.shape

-    if args.model in ("large", "large-v3", "turbo"):
+    if args.model in ("distil-large-v3", "distil-large-v3.5"):
+        n_mels = 128
+    elif args.model in (
+        "large",
+        "large-v3",
+        "turbo",
+    ):
         n_mels = 128
     else:
         n_mels = 80
+
     mel = (
         whisper.log_mel_spectrogram(audio, n_mels=n_mels).to(model.device).unsqueeze(0)
     )