Fangjun Kuang
Committed by GitHub

Export https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3 to sherpa-onnx (#2500)

This PR adds support for the newer version (v3) of NVIDIA's parakeet-tdt-0.6b model by exporting it to sherpa-onnx format. The v3 model supports 25 languages, maintaining the same usage pattern as v2 but with improved language coverage.
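For context, here is a minimal decoding sketch for the exported int8 model via the sherpa-onnx Python API. It is not part of this diff; it assumes pip install sherpa-onnx soundfile, the extracted sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8 directory produced by the workflow below, and that OfflineRecognizer.from_transducer with model_type="nemo_transducer" is the right entry point, as used for other NeMo transducer models.

#!/usr/bin/env python3
# Hypothetical usage sketch, not part of this PR.
import sherpa_onnx
import soundfile as sf

d = "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8"
recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder=f"{d}/encoder.int8.onnx",
    decoder=f"{d}/decoder.int8.onnx",
    joiner=f"{d}/joiner.int8.onnx",
    tokens=f"{d}/tokens.txt",
    model_type="nemo_transducer",
    num_threads=2,
)

# Decode one of the test wavs bundled with the release directory.
samples, sample_rate = sf.read(f"{d}/test_wavs/de.wav", dtype="float32")
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples)
recognizer.decode_stream(stream)
print(stream.result.text)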
-name: export-nemo-parakeet-tdt-0.6b-v2
+name: export-nemo-parakeet-tdt-0.6b
 
 on:
   push:
@@ -10,81 +10,111 @@ concurrency:
   group: export-nemo-parakeet-tdt-0.6b-v2-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  HF_HUB_ENABLE_HF_TRANSFER: "0"
+
 jobs:
-  export-nemo-parakeet-tdt-0_6b-v2:
+  export-nemo-parakeet-tdt-0_6b:
     if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
-    name: parakeet tdt 0.6b v2
+    name: parakeet tdt 0.6b ${{ matrix.version }}
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
         os: [macos-latest]
         python-version: ["3.10"]
+        version: ["v2", "v3"]
 
     steps:
       - uses: actions/checkout@v4
 
+      - name: Show disk space
+        run: |
+          df -h
+
+      # See https://github.com/vlayer-xyz/vlayer/pull/543/files
+      # Free up disk space as the macOS runners end up using most for Xcode
+      # versions we don't need and use iOS simulators.
+      - name: Free up disk space
+        run: |
+          echo '*** Delete iOS simulators and their caches'
+          xcrun simctl delete all
+          sudo rm -rf ~/Library/Developer/CoreSimulator/Caches/*
+
+      - name: Show disk space
+        run: |
+          df -h
+
       - name: Setup Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Run
+      - name: Run ${{ matrix.version }}
+        if: matrix.version == 'v2'
         shell: bash
        run: |
          cd scripts/nemo/parakeet-tdt-0.6b-v2
          ./run.sh
 
          ls -lh *.onnx
+          ls -lh *.weights
+
          mv -v *.onnx ../../..
+          mv -v *.weights ../../..
          mv -v tokens.txt ../../..
          mv 2086-149220-0033.wav ../../../0.wav
 
-      - name: Collect files (fp32)
+      - name: Run ${{ matrix.version }}
+        if: matrix.version == 'v3'
         shell: bash
         run: |
-          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
-          mkdir -p $d
-          cp encoder.int8.onnx $d
-          cp decoder.onnx $d
-          cp joiner.onnx $d
-          cp tokens.txt $d
-
-          mkdir $d/test_wavs
-          cp 0.wav $d/test_wavs
+          cd scripts/nemo/parakeet-tdt-0.6b-v3
+          ./run.sh
 
-          tar cjfv $d.tar.bz2 $d
+          ls -lh *.onnx
+          mv -v *.onnx ../../..
+          mv -v *.weights ../../..
+          mv -v tokens.txt ../../..
+          mv *.wav ../../../
 
-      - name: Collect files (int8)
+      - name: Collect files (fp32)
         shell: bash
         run: |
-          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
+          version=${{ matrix.version }}
+          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-$version
           mkdir -p $d
-          cp encoder.int8.onnx $d
-          cp decoder.int8.onnx $d
-          cp joiner.int8.onnx $d
-          cp tokens.txt $d
+          cp -v encoder.onnx $d
+          cp -v encoder.weights $d
+          cp -v decoder.onnx $d
+          cp -v joiner.onnx $d
+          cp -v tokens.txt $d
 
           mkdir $d/test_wavs
-          cp 0.wav $d/test_wavs
+          cp -v *.wav $d/test_wavs
 
-          tar cjfv $d.tar.bz2 $d
+          # tar cjfv $d.tar.bz2 $d
+
+          # ls -lh *.tar.bz2
 
-      - name: Collect files (fp16)
+      - name: Collect files (int8)
         shell: bash
         run: |
-          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
+          version=${{ matrix.version }}
+          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-$version-int8
           mkdir -p $d
-          cp encoder.fp16.onnx $d
-          cp decoder.fp16.onnx $d
-          cp joiner.fp16.onnx $d
-          cp tokens.txt $d
+          cp -v encoder.int8.onnx $d
+          cp -v decoder.int8.onnx $d
+          cp -v joiner.int8.onnx $d
+          cp -v tokens.txt $d
 
           mkdir $d/test_wavs
-          cp 0.wav $d/test_wavs
+          cp -v *.wav $d/test_wavs
 
           tar cjfv $d.tar.bz2 $d
 
+          ls -lh *.tar.bz2
+
       - name: Publish to huggingface
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -94,13 +124,13 @@ jobs:
           timeout_seconds: 200
           shell: bash
           command: |
+            version=${{ matrix.version }}
             git config --global user.email "csukuangfj@gmail.com"
             git config --global user.name "Fangjun Kuang"
 
             models=(
-              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
-              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
-              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
+              sherpa-onnx-nemo-parakeet-tdt-0.6b-$version
+              sherpa-onnx-nemo-parakeet-tdt-0.6b-$version-int8
             )
 
             for m in ${models[@]}; do
@@ -112,6 +142,7 @@ jobs:
               cd huggingface
               git lfs track "*.onnx"
               git lfs track "*.wav"
+              git lfs track "*.weights"
               git status
               git add .
               git status
@@ -681,6 +681,22 @@ def get_models():
             popd
             """,
         ),
+        Model(
+            model_name="sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8",
+            idx=40,
+            lang="multi",
+            lang2="25_languages",
+            short_name="parakeet_tdt_0.6b_v3",
+            cmd="""
+            pushd $model_name
+
+            rm -rfv test_wavs
+
+            ls -lh
+
+            popd
+            """,
+        ),
     ]
     return models
 
 #!/usr/bin/env python3
 # Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
 
+import os
 from pathlib import Path
 from typing import Dict
-import os
 
 import nemo.collections.asr as nemo_asr
 import onnx
-import onnxmltools
 import torch
-from onnxmltools.utils.float16_converter import (
-    convert_float_to_float16,
-    convert_float_to_float16_model_path,
-)
 from onnxruntime.quantization import QuantType, quantize_dynamic
 
 
-def export_onnx_fp16(onnx_fp32_path, onnx_fp16_path):
-    onnx_fp32_model = onnxmltools.utils.load_model(onnx_fp32_path)
-    onnx_fp16_model = convert_float_to_float16(onnx_fp32_model, keep_io_types=True)
-    onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
-
-
-def export_onnx_fp16_large_2gb(onnx_fp32_path, onnx_fp16_path):
-    onnx_fp16_model = convert_float_to_float16_model_path(
-        onnx_fp32_path, keep_io_types=True
-    )
-    onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
-
-
 def add_meta_data(filename: str, meta_data: Dict[str, str]):
     """Add meta data to an ONNX model. It is changed in-place.
 
@@ -47,14 +29,29 @@ def add_meta_data(filename: str, meta_data: Dict[str, str]):
         meta.key = key
         meta.value = str(value)
 
-    onnx.save(model, filename)
+    if filename == "encoder.onnx":
+        external_filename = "encoder"
+        onnx.save(
+            model,
+            filename,
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location=external_filename + ".weights",
+        )
+    else:
+        onnx.save(model, filename)
 
 
 @torch.no_grad()
 def main():
-    asr_model = nemo_asr.models.ASRModel.from_pretrained(
-        model_name="nvidia/parakeet-tdt-0.6b-v2"
-    )
+    if Path("./parakeet-tdt-0.6b-v2.nemo").is_file():
+        asr_model = nemo_asr.models.ASRModel.restore_from(
+            restore_path="./parakeet-tdt-0.6b-v2.nemo"
+        )
+    else:
+        asr_model = nemo_asr.models.ASRModel.from_pretrained(
+            model_name="nvidia/parakeet-tdt-0.6b-v2"
+        )
 
     asr_model.eval()
 
@@ -95,13 +92,8 @@ def main():
         )
         os.system("ls -lh *.onnx")
 
-        if m == "encoder":
-            export_onnx_fp16_large_2gb(f"{m}.onnx", f"{m}.fp16.onnx")
-        else:
-            export_onnx_fp16(f"{m}.onnx", f"{m}.fp16.onnx")
-
     add_meta_data("encoder.int8.onnx", meta_data)
-    add_meta_data("encoder.fp16.onnx", meta_data)
+    add_meta_data("encoder.onnx", meta_data)
     print("meta_data", meta_data)
 
 
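A note on the add_meta_data change above: ONNX models are protobuf files with a 2 GB single-file limit, so instead of producing fp16 copies, the fp32 encoder is now re-saved with its tensors in a separate encoder.weights file. A rough sketch of how such a split model can be inspected and loaded follows; the onnxruntime session options shown are illustrative assumptions, not taken from this PR.

#!/usr/bin/env python3
# Illustration only: load an ONNX model whose tensors live in external data.
import onnx
import onnxruntime as ort

# The metadata written by add_meta_data() lives in the small .onnx file,
# so skip pulling in the large external weights when inspecting it.
model = onnx.load("encoder.onnx", load_external_data=False)
print({p.key: p.value for p in model.metadata_props})

# onnxruntime resolves encoder.weights relative to encoder.onnx, so both
# files must stay in the same directory (which is why the workflow copies
# encoder.weights alongside encoder.onnx and tracks *.weights with git lfs).
sess = ort.InferenceSession("encoder.onnx", providers=["CPUExecutionProvider"])
print([i.name for i in sess.get_inputs()])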
@@ -9,8 +9,9 @@ log() {
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 
-curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
+curl -SL -O https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2/resolve/main/parakeet-tdt-0.6b-v2.nemo
 
+curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
 
 
 pip install \
@@ -20,7 +21,6 @@ pip install \
   kaldi-native-fbank \
   librosa \
   onnx==1.17.0 \
-  onnxmltools \
   onnxruntime==1.17.1 \
   soundfile
 
@@ -42,11 +42,3 @@ python3 ./test_onnx.py \
   --joiner ./joiner.int8.onnx \
   --tokens ./tokens.txt \
   --wav 2086-149220-0033.wav
-
-echo "---fp16----"
-python3 ./test_onnx.py \
-  --encoder ./encoder.fp16.onnx \
-  --decoder ./decoder.fp16.onnx \
-  --joiner ./joiner.fp16.onnx \
-  --tokens ./tokens.txt \
-  --wav 2086-149220-0033.wav
+#!/usr/bin/env python3
+# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
+
+from pathlib import Path
+from typing import Dict
+import os
+
+import nemo.collections.asr as nemo_asr
+import onnx
+import torch
+from onnxruntime.quantization import QuantType, quantize_dynamic
+
+
+def add_meta_data(filename: str, meta_data: Dict[str, str]):
+    """Add meta data to an ONNX model. It is changed in-place.
+
+    Args:
+      filename:
+        Filename of the ONNX model to be changed.
+      meta_data:
+        Key-value pairs.
+    """
+    model = onnx.load(filename)
+    while len(model.metadata_props):
+        model.metadata_props.pop()
+
+    for key, value in meta_data.items():
+        meta = model.metadata_props.add()
+        meta.key = key
+        meta.value = str(value)
+
+    if filename == "encoder.onnx":
+        external_filename = "encoder"
+        onnx.save(
+            model,
+            filename,
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location=external_filename + ".weights",
+        )
+    else:
+        onnx.save(model, filename)
+
+
+@torch.no_grad()
+def main():
+    if Path("./parakeet-tdt-0.6b-v3.nemo").is_file():
+        asr_model = nemo_asr.models.ASRModel.restore_from(
+            restore_path="./parakeet-tdt-0.6b-v3.nemo"
+        )
+    else:
+        asr_model = nemo_asr.models.ASRModel.from_pretrained(
+            model_name="nvidia/parakeet-tdt-0.6b-v3"
+        )
+
+    asr_model.eval()
+
+    with open("./tokens.txt", "w", encoding="utf-8") as f:
+        for i, s in enumerate(asr_model.joint.vocabulary):
+            f.write(f"{s} {i}\n")
+        f.write(f"<blk> {i+1}\n")
+    print("Saved to tokens.txt")
+
+    asr_model.encoder.export("encoder.onnx")
+    asr_model.decoder.export("decoder.onnx")
+    asr_model.joint.export("joiner.onnx")
+    os.system("ls -lh *.onnx")
+
+    normalize_type = asr_model.cfg.preprocessor.normalize
+    if normalize_type == "NA":
+        normalize_type = ""
+
+    meta_data = {
+        "vocab_size": asr_model.decoder.vocab_size,
+        "normalize_type": normalize_type,
+        "pred_rnn_layers": asr_model.decoder.pred_rnn_layers,
+        "pred_hidden": asr_model.decoder.pred_hidden,
+        "subsampling_factor": 8,
+        "model_type": "EncDecRNNTBPEModel",
+        "version": "2",
+        "model_author": "NeMo",
+        "url": "https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3",
+        "comment": "Only the transducer branch is exported",
+        "feat_dim": 128,
+    }
+
+    for m in ["encoder", "decoder", "joiner"]:
+        quantize_dynamic(
+            model_input=f"./{m}.onnx",
+            model_output=f"./{m}.int8.onnx",
+            weight_type=QuantType.QUInt8 if m == "encoder" else QuantType.QInt8,
+        )
+        os.system("ls -lh *.onnx")
+
+    add_meta_data("encoder.int8.onnx", meta_data)
+    add_meta_data("encoder.onnx", meta_data)
+    print("meta_data", meta_data)
+
+
+if __name__ == "__main__":
+    main()
+#!/usr/bin/env bash
+# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
+
+set -ex
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+curl -SL -O https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3/resolve/main/parakeet-tdt-0.6b-v3.nemo
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/en.wav
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/fr.wav
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/es.wav
+
+ls -lh
+
+
+pip install \
+  nemo_toolkit['asr'] \
+  "numpy<2" \
+  ipython \
+  kaldi-native-fbank \
+  librosa \
+  onnx==1.17.0 \
+  onnxruntime==1.17.1 \
+  soundfile
+
+python3 ./export_onnx.py
+ls -lh *.onnx
+
+for w in en.wav de.wav fr.wav es.wav; do
+  echo "---fp32----"
+  python3 ./test_onnx.py \
+    --encoder ./encoder.int8.onnx \
+    --decoder ./decoder.onnx \
+    --joiner ./joiner.onnx \
+    --tokens ./tokens.txt \
+    --wav $w
+
+  echo "---int8----"
+  python3 ./test_onnx.py \
+    --encoder ./encoder.int8.onnx \
+    --decoder ./decoder.int8.onnx \
+    --joiner ./joiner.int8.onnx \
+    --tokens ./tokens.txt \
+    --wav $w
+done
+../parakeet-tdt-0.6b-v2/test_onnx.py
@@ -46,7 +46,7 @@ bool OfflineTtsModelConfig::Validate() const {
     return kitten.Validate();
   }
 
-  SHERPA_ONNX_LOGE("Please provide at exactly one tts model.");
+  SHERPA_ONNX_LOGE("Please provide exactly one tts model.");
 
   return false;
 }
@@ -65,7 +65,7 @@ struct GeneratedAudio {
 
 class OfflineTtsImpl;
 
-// If the callback returns 0, then it stop generating
+// If the callback returns 0, then it stops generating
 // if the callback returns 1, then it keeps generating
 using GeneratedAudioCallback = std::function<int32_t(
     const float * /*samples*/, int32_t /*n*/, float /*progress*/)>;
@@ -677,6 +677,19 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
                 tokens = "$modelDir/tokens.txt",
             )
         }
+
+        40 -> {
+            val modelDir = "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8"
+            return OfflineModelConfig(
+                transducer = OfflineTransducerModelConfig(
+                    encoder = "$modelDir/encoder.int8.onnx",
+                    decoder = "$modelDir/decoder.int8.onnx",
+                    joiner = "$modelDir/joiner.int8.onnx",
+                ),
+                tokens = "$modelDir/tokens.txt",
+                modelType = "nemo_transducer",
+            )
+        }
     }
     return null
 }