Fangjun Kuang
Committed by GitHub

Export https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3 to sherpa-onnx (#2500)

This PR adds support for the newer version (v3) of NVIDIA's parakeet-tdt-0.6b model by exporting it to sherpa-onnx format. The v3 model extends language coverage to 25 languages while keeping the same usage pattern as v2.
name: export-nemo-parakeet-tdt-0.6b-v2
name: export-nemo-parakeet-tdt-0.6b
on:
push:
... ... @@ -10,81 +10,111 @@ concurrency:
group: export-nemo-parakeet-tdt-0.6b-v2-${{ github.ref }}
cancel-in-progress: true
env:
HF_HUB_ENABLE_HF_TRANSFER: "0"
jobs:
export-nemo-parakeet-tdt-0_6b-v2:
export-nemo-parakeet-tdt-0_6b:
if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
name: parakeet tdt 0.6b v2
name: parakeet tdt 0.6b ${{ matrix.version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [macos-latest]
python-version: ["3.10"]
version: ["v2", "v3"]
steps:
- uses: actions/checkout@v4
- name: Show disk space
run: |
df -h
# See https://github.com/vlayer-xyz/vlayer/pull/543/files
# Free up disk space as the macOS runners end up using most for Xcode
# versions we don't need and use iOS simulators.
- name: Free up disk space
run: |
echo '*** Delete iOS simulators and their caches'
xcrun simctl delete all
sudo rm -rf ~/Library/Developer/CoreSimulator/Caches/*
- name: Show disk space
run: |
df -h
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Run
- name: Run ${{ matrix.version }}
if: matrix.version == 'v2'
shell: bash
run: |
cd scripts/nemo/parakeet-tdt-0.6b-v2
./run.sh
ls -lh *.onnx
ls -lh *.weights
mv -v *.onnx ../../..
mv -v *.weights ../../..
mv -v tokens.txt ../../..
mv 2086-149220-0033.wav ../../../0.wav
- name: Collect files (fp32)
- name: Run ${{ matrix.version }}
if: matrix.version == 'v3'
shell: bash
run: |
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
mkdir -p $d
cp encoder.int8.onnx $d
cp decoder.onnx $d
cp joiner.onnx $d
cp tokens.txt $d
mkdir $d/test_wavs
cp 0.wav $d/test_wavs
cd scripts/nemo/parakeet-tdt-0.6b-v3
./run.sh
tar cjfv $d.tar.bz2 $d
ls -lh *.onnx
mv -v *.onnx ../../..
mv -v *.weights ../../..
mv -v tokens.txt ../../..
mv *.wav ../../../
- name: Collect files (int8)
- name: Collect files (fp32)
shell: bash
run: |
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
version=${{ matrix.version }}
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-$version
mkdir -p $d
cp encoder.int8.onnx $d
cp decoder.int8.onnx $d
cp joiner.int8.onnx $d
cp tokens.txt $d
cp -v encoder.onnx $d
cp -v encoder.weights $d
cp -v decoder.onnx $d
cp -v joiner.onnx $d
cp -v tokens.txt $d
mkdir $d/test_wavs
cp 0.wav $d/test_wavs
cp -v *.wav $d/test_wavs
tar cjfv $d.tar.bz2 $d
# tar cjfv $d.tar.bz2 $d
# ls -lh *.tar.bz2
- name: Collect files (fp16)
- name: Collect files (int8)
shell: bash
run: |
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
version=${{ matrix.version }}
d=sherpa-onnx-nemo-parakeet-tdt-0.6b-$version-int8
mkdir -p $d
cp encoder.fp16.onnx $d
cp decoder.fp16.onnx $d
cp joiner.fp16.onnx $d
cp tokens.txt $d
cp -v encoder.int8.onnx $d
cp -v decoder.int8.onnx $d
cp -v joiner.int8.onnx $d
cp -v tokens.txt $d
mkdir $d/test_wavs
cp 0.wav $d/test_wavs
cp -v *.wav $d/test_wavs
tar cjfv $d.tar.bz2 $d
ls -lh *.tar.bz2
- name: Publish to huggingface
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
... ... @@ -94,13 +124,13 @@ jobs:
timeout_seconds: 200
shell: bash
command: |
version=${{ matrix.version }}
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
models=(
sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
sherpa-onnx-nemo-parakeet-tdt-0.6b-$version
sherpa-onnx-nemo-parakeet-tdt-0.6b-$version-int8
)
for m in ${models[@]}; do
... ... @@ -112,6 +142,7 @@ jobs:
cd huggingface
git lfs track "*.onnx"
git lfs track "*.wav"
git lfs track "*.weights"
git status
git add .
git status
... ...
... ... @@ -681,6 +681,22 @@ def get_models():
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8",
idx=40,
lang="multi",
lang2="25_languages",
short_name="parakeet_tdt_0.6b_v3",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
]
return models
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import os
from pathlib import Path
from typing import Dict
import os
import nemo.collections.asr as nemo_asr
import onnx
import onnxmltools
import torch
from onnxmltools.utils.float16_converter import (
convert_float_to_float16,
convert_float_to_float16_model_path,
)
from onnxruntime.quantization import QuantType, quantize_dynamic
def export_onnx_fp16(onnx_fp32_path, onnx_fp16_path):
onnx_fp32_model = onnxmltools.utils.load_model(onnx_fp32_path)
onnx_fp16_model = convert_float_to_float16(onnx_fp32_model, keep_io_types=True)
onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
def export_onnx_fp16_large_2gb(onnx_fp32_path, onnx_fp16_path):
onnx_fp16_model = convert_float_to_float16_model_path(
onnx_fp32_path, keep_io_types=True
)
onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
def add_meta_data(filename: str, meta_data: Dict[str, str]):
"""Add meta data to an ONNX model. It is changed in-place.
... ... @@ -47,11 +29,26 @@ def add_meta_data(filename: str, meta_data: Dict[str, str]):
meta.key = key
meta.value = str(value)
if filename == "encoder.onnx":
external_filename = "encoder"
onnx.save(
model,
filename,
save_as_external_data=True,
all_tensors_to_one_file=True,
location=external_filename + ".weights",
)
else:
onnx.save(model, filename)
@torch.no_grad()
def main():
if Path("./parakeet-tdt-0.6b-v2.nemo").is_file():
asr_model = nemo_asr.models.ASRModel.restore_from(
restore_path="./parakeet-tdt-0.6b-v2.nemo"
)
else:
asr_model = nemo_asr.models.ASRModel.from_pretrained(
model_name="nvidia/parakeet-tdt-0.6b-v2"
)
... ... @@ -95,13 +92,8 @@ def main():
)
os.system("ls -lh *.onnx")
if m == "encoder":
export_onnx_fp16_large_2gb(f"{m}.onnx", f"{m}.fp16.onnx")
else:
export_onnx_fp16(f"{m}.onnx", f"{m}.fp16.onnx")
add_meta_data("encoder.int8.onnx", meta_data)
add_meta_data("encoder.fp16.onnx", meta_data)
add_meta_data("encoder.onnx", meta_data)
print("meta_data", meta_data)
... ...
... ... @@ -9,8 +9,9 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
curl -SL -O https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2/resolve/main/parakeet-tdt-0.6b-v2.nemo
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
pip install \
... ... @@ -20,7 +21,6 @@ pip install \
kaldi-native-fbank \
librosa \
onnx==1.17.0 \
onnxmltools \
onnxruntime==1.17.1 \
soundfile
... ... @@ -42,11 +42,3 @@ python3 ./test_onnx.py \
--joiner ./joiner.int8.onnx \
--tokens ./tokens.txt \
--wav 2086-149220-0033.wav
echo "---fp16----"
python3 ./test_onnx.py \
--encoder ./encoder.fp16.onnx \
--decoder ./decoder.fp16.onnx \
--joiner ./joiner.fp16.onnx \
--tokens ./tokens.txt \
--wav 2086-149220-0033.wav
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
from pathlib import Path
from typing import Dict
import os
import nemo.collections.asr as nemo_asr
import onnx
import torch
from onnxruntime.quantization import QuantType, quantize_dynamic
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs to store in the model's metadata_props.
    """
    model = onnx.load(filename)

    # Discard any metadata already present so only the new entries remain.
    del model.metadata_props[:]

    for key, value in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = key
        entry.value = str(value)

    if filename == "encoder.onnx":
        # The fp32 encoder is saved with all tensors in a single external
        # file, "encoder.weights", placed next to the model file
        # (presumably because the encoder is too large for one protobuf —
        # TODO confirm).
        onnx.save(
            model,
            filename,
            save_as_external_data=True,
            all_tensors_to_one_file=True,
            location="encoder.weights",
        )
    else:
        onnx.save(model, filename)
@torch.no_grad()
def main():
    """Export nvidia/parakeet-tdt-0.6b-v3 to ONNX files for sherpa-onnx.

    Produces tokens.txt, fp32 encoder/decoder/joiner ONNX models, their
    int8 dynamically quantized copies, and attaches metadata to the fp32
    and int8 encoder files.
    """
    # Use a local .nemo checkpoint when available (run.sh downloads it);
    # otherwise let NeMo fetch the model from HuggingFace.
    if Path("./parakeet-tdt-0.6b-v3.nemo").is_file():
        asr_model = nemo_asr.models.ASRModel.restore_from(
            restore_path="./parakeet-tdt-0.6b-v3.nemo"
        )
    else:
        asr_model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v3"
        )
    asr_model.eval()

    # Write one "<token> <id>" pair per line; the <blk> entry is appended
    # with the next free id.
    # NOTE(review): reuses the loop variable `i` after the loop, so this
    # assumes a non-empty vocabulary.
    with open("./tokens.txt", "w", encoding="utf-8") as f:
        for i, s in enumerate(asr_model.joint.vocabulary):
            f.write(f"{s} {i}\n")
        f.write(f"<blk> {i+1}\n")
    print("Saved to tokens.txt")

    # Export the three transducer components via NeMo's ONNX exporter.
    asr_model.encoder.export("encoder.onnx")
    asr_model.decoder.export("decoder.onnx")
    asr_model.joint.export("joiner.onnx")
    os.system("ls -lh *.onnx")

    normalize_type = asr_model.cfg.preprocessor.normalize
    if normalize_type == "NA":
        # Map NeMo's "NA" to an empty string — presumably what sherpa-onnx
        # expects when no feature normalization is used; confirm.
        normalize_type = ""

    meta_data = {
        "vocab_size": asr_model.decoder.vocab_size,
        "normalize_type": normalize_type,
        "pred_rnn_layers": asr_model.decoder.pred_rnn_layers,
        "pred_hidden": asr_model.decoder.pred_hidden,
        "subsampling_factor": 8,
        "model_type": "EncDecRNNTBPEModel",
        # NOTE(review): "version" stays "2" even though this exports the
        # v3 model — presumably a metadata-format version rather than the
        # model version; confirm against sherpa-onnx's model loader.
        "version": "2",
        "model_author": "NeMo",
        "url": "https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3",
        "comment": "Only the transducer branch is exported",
        "feat_dim": 128,
    }

    # Dynamic int8 quantization: the encoder uses unsigned int8 weights,
    # the decoder and joiner use signed int8 weights.
    for m in ["encoder", "decoder", "joiner"]:
        quantize_dynamic(
            model_input=f"./{m}.onnx",
            model_output=f"./{m}.int8.onnx",
            weight_type=QuantType.QUInt8 if m == "encoder" else QuantType.QInt8,
        )
    os.system("ls -lh *.onnx")

    # Only the encoder files carry the metadata.
    add_meta_data("encoder.int8.onnx", meta_data)
    add_meta_data("encoder.onnx", meta_data)
    print("meta_data", meta_data)


if __name__ == "__main__":
    main()
... ...
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
#
# Export nvidia/parakeet-tdt-0.6b-v3 to ONNX and smoke-test the exported
# models against one test wave per language (en/de/fr/es).

set -ex

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Download the NeMo checkpoint and the multilingual test waves.
curl -SL -O https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3/resolve/main/parakeet-tdt-0.6b-v3.nemo
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/en.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/fr.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/es.wav

ls -lh

# Pinned onnx/onnxruntime versions; "numpy<2" for NeMo compatibility —
# TODO confirm the numpy pin is still required.
pip install \
  nemo_toolkit['asr'] \
  "numpy<2" \
  ipython \
  kaldi-native-fbank \
  librosa \
  onnx==1.17.0 \
  onnxruntime==1.17.1 \
  soundfile

python3 ./export_onnx.py
ls -lh *.onnx

# Decode every test wave with both exported variants.
for w in en.wav de.wav fr.wav es.wav; do
  echo "---fp32----"
  # NOTE(review): this "fp32" run uses encoder.int8.onnx together with the
  # fp32 decoder/joiner — confirm whether the int8 encoder is intended here
  # (e.g. because the fp32 encoder relies on an external .weights file).
  python3 ./test_onnx.py \
    --encoder ./encoder.int8.onnx \
    --decoder ./decoder.onnx \
    --joiner ./joiner.onnx \
    --tokens ./tokens.txt \
    --wav $w

  echo "---int8----"
  python3 ./test_onnx.py \
    --encoder ./encoder.int8.onnx \
    --decoder ./decoder.int8.onnx \
    --joiner ./joiner.int8.onnx \
    --tokens ./tokens.txt \
    --wav $w
done
... ...
../parakeet-tdt-0.6b-v2/test_onnx.py
\ No newline at end of file
... ...
... ... @@ -46,7 +46,7 @@ bool OfflineTtsModelConfig::Validate() const {
return kitten.Validate();
}
SHERPA_ONNX_LOGE("Please provide at exactly one tts model.");
SHERPA_ONNX_LOGE("Please provide exactly one tts model.");
return false;
}
... ...
... ... @@ -65,7 +65,7 @@ struct GeneratedAudio {
class OfflineTtsImpl;
// If the callback returns 0, then it stop generating
// If the callback returns 0, then it stops generating
// if the callback returns 1, then it keeps generating
using GeneratedAudioCallback = std::function<int32_t(
const float * /*samples*/, int32_t /*n*/, float /*progress*/)>;
... ...
... ... @@ -677,6 +677,19 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
tokens = "$modelDir/tokens.txt",
)
}
40 -> {
val modelDir = "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8"
return OfflineModelConfig(
transducer = OfflineTransducerModelConfig(
encoder = "$modelDir/encoder.int8.onnx",
decoder = "$modelDir/decoder.int8.onnx",
joiner = "$modelDir/joiner.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "nemo_transducer",
)
}
}
return null
}
... ...