Committed by
GitHub
Export whisper distil-large-v3 and distil-large-v3.5 to sherpa-onnx (#2506)
This PR adds support for exporting two new whisper distil models (distil-large-v3 and distil-large-v3.5) to ONNX format for use with sherpa-onnx. The changes enable these models to be processed through the existing export pipeline. - Added support for distil-large-v3 and distil-large-v3.5 models in the export script - Updated GitHub workflow to include the new models in the CI matrix - Configured proper n_mels parameter (128) for the new distil models
Showing 2 changed files with 45 additions and 3 deletions.
| @@ -16,8 +16,9 @@ jobs: | @@ -16,8 +16,9 @@ jobs: | ||
| 16 | fail-fast: false | 16 | fail-fast: false |
| 17 | matrix: | 17 | matrix: |
| 18 | os: [macos-latest] | 18 | os: [macos-latest] |
| 19 | - model: ["turbo", "distil-medium.en", "distil-small.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "medium-aishell", "large", "large-v1", "large-v2", "large-v3", "distil-large-v2"] | 19 | + model: ["turbo", "distil-medium.en", "distil-small.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "medium-aishell", "large", "large-v1", "large-v2", "large-v3", "distil-large-v2", "distil-large-v3", "distil-large-v3.5"] |
| 20 | # model: ["large", "large-v1", "large-v2", "large-v3", "distil-large-v2"] | 20 | # model: ["large", "large-v1", "large-v2", "large-v3", "distil-large-v2"] |
| 21 | + # model: ["distil-large-v3.5", "distil-large-v3"] | ||
| 21 | python-version: ["3.8"] | 22 | python-version: ["3.8"] |
| 22 | 23 | ||
| 23 | steps: | 24 | steps: |
| @@ -47,6 +48,12 @@ jobs: | @@ -47,6 +48,12 @@ jobs: | ||
| 47 | elif [[ $model == distil-large-v2 ]]; then | 48 | elif [[ $model == distil-large-v2 ]]; then |
| 48 | wget -q -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin | 49 | wget -q -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin |
| 49 | ls -lh | 50 | ls -lh |
| 51 | + elif [[ $model == distil-large-v3 ]]; then | ||
| 52 | + wget -q -O distil-large-v3-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3-openai/resolve/main/model.bin | ||
| 53 | + ls -lh | ||
| 54 | + elif [[ $model == distil-large-v3.5 ]]; then | ||
| 55 | + wget -q -O distil-large-v3.5-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3.5-openai/resolve/main/model.bin | ||
| 56 | + ls -lh | ||
| 50 | elif [[ $model == distil-small.en ]]; then | 57 | elif [[ $model == distil-small.en ]]; then |
| 51 | wget -q -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin | 58 | wget -q -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin |
| 52 | ls -lh | 59 | ls -lh |
| @@ -155,6 +162,7 @@ jobs: | @@ -155,6 +162,7 @@ jobs: | ||
| 155 | 162 | ||
| 156 | git status | 163 | git status |
| 157 | ls -lh | 164 | ls -lh |
| 165 | + git lfs track "*.wav*" | ||
| 158 | git lfs track "*onnx*" | 166 | git lfs track "*onnx*" |
| 159 | git lfs track "*weights*" | 167 | git lfs track "*weights*" |
| 160 | 168 |
| @@ -49,7 +49,8 @@ def get_args(): | @@ -49,7 +49,8 @@ def get_args(): | ||
| 49 | "large-v1", "large-v2", | 49 | "large-v1", "large-v2", |
| 50 | "large", "large-v3", "turbo", # these three have feature dim 128 | 50 | "large", "large-v3", "turbo", # these three have feature dim 128 |
| 51 | "distil-medium.en", "distil-small.en", "distil-large-v2", | 51 | "distil-medium.en", "distil-small.en", "distil-large-v2", |
| 52 | - # "distil-large-v3", # distil-large-v3 is not supported! | 52 | + "distil-large-v3", |
| 53 | + "distil-large-v3.5", | ||
| 53 | # for fine-tuned models from icefall | 54 | # for fine-tuned models from icefall |
| 54 | "medium-aishell", | 55 | "medium-aishell", |
| 55 | ], | 56 | ], |
| @@ -348,6 +349,32 @@ def main(): | @@ -348,6 +349,32 @@ def main(): | ||
| 348 | """ | 349 | """ |
| 349 | ) | 350 | ) |
| 350 | model = whisper.load_model(filename) | 351 | model = whisper.load_model(filename) |
| 352 | + elif name == "distil-large-v3": | ||
| 353 | + filename = "./distil-large-v3-original-model.bin" | ||
| 354 | + if not Path(filename).is_file(): | ||
| 355 | + raise ValueError( | ||
| 356 | + """ | ||
| 357 | + Please go to https://huggingface.co/distil-whisper/distil-large-v3-openai | ||
| 358 | + to download model.bin | ||
| 359 | + You can use the following command to do that: | ||
| 360 | + | ||
| 361 | + wget -O distil-large-v3-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3-openai/resolve/main/model.bin | ||
| 362 | + """ | ||
| 363 | + ) | ||
| 364 | + model = whisper.load_model(filename) | ||
| 365 | + elif name == "distil-large-v3.5": | ||
| 366 | + filename = "./distil-large-v3.5-original-model.bin" | ||
| 367 | + if not Path(filename).is_file(): | ||
| 368 | + raise ValueError( | ||
| 369 | + """ | ||
| 370 | + Please go to https://huggingface.co/distil-whisper/distil-large-v3.5-openai/ | ||
| 371 | + to download model.bin | ||
| 372 | + You can use the following command to do that: | ||
| 373 | + | ||
| 374 | + wget -O distil-large-v3.5-original-model.bin https://huggingface.co/distil-whisper/distil-large-v3.5-openai/resolve/main/model.bin | ||
| 375 | + """ | ||
| 376 | + ) | ||
| 377 | + model = whisper.load_model(filename) | ||
| 351 | elif name == "distil-small.en": | 378 | elif name == "distil-small.en": |
| 352 | filename = "./distil-small-en-original-model.bin" | 379 | filename = "./distil-small-en-original-model.bin" |
| 353 | if not Path(filename).is_file(): | 380 | if not Path(filename).is_file(): |
| @@ -405,10 +432,17 @@ def main(): | @@ -405,10 +432,17 @@ def main(): | ||
| 405 | audio = whisper.pad_or_trim(audio) | 432 | audio = whisper.pad_or_trim(audio) |
| 406 | assert audio.shape == (16000 * 30,), audio.shape | 433 | assert audio.shape == (16000 * 30,), audio.shape |
| 407 | 434 | ||
| 408 | - if args.model in ("large", "large-v3", "turbo"): | 435 | + if args.model in ("distil-large-v3", "distil-large-v3.5"): |
| 436 | + n_mels = 128 | ||
| 437 | + elif args.model in ( | ||
| 438 | + "large", | ||
| 439 | + "large-v3", | ||
| 440 | + "turbo", | ||
| 441 | + ): | ||
| 409 | n_mels = 128 | 442 | n_mels = 128 |
| 410 | else: | 443 | else: |
| 411 | n_mels = 80 | 444 | n_mels = 80 |
| 445 | + | ||
| 412 | mel = ( | 446 | mel = ( |
| 413 | whisper.log_mel_spectrogram(audio, n_mels=n_mels).to(model.device).unsqueeze(0) | 447 | whisper.log_mel_spectrogram(audio, n_mels=n_mels).to(model.device).unsqueeze(0) |
| 414 | ) | 448 | ) |
Please register or log in to post a comment.