Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2025-05-06 16:32:59 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2025-05-06 16:32:59 +0800
Commit
d660d5adea2fb76ac168538fb551e947cf666b1d
d660d5ad
1 parent
abc4daa4
export parakeet-tdt-0.6b-v2 to sherpa-onnx (#2180)
隐藏空白字符变更
内嵌
并排对比
正在显示
4 个修改的文件
包含
570 行增加
和
0 行删除
.github/workflows/export-nemo-parakeet-tdt-0.6b-v2.yaml
scripts/nemo/parakeet-tdt-0.6b-v2/export_onnx.py
scripts/nemo/parakeet-tdt-0.6b-v2/run.sh
scripts/nemo/parakeet-tdt-0.6b-v2/test_onnx.py
.github/workflows/export-nemo-parakeet-tdt-0.6b-v2.yaml
0 → 100644
查看文件 @
d660d5a
# Exports nvidia/parakeet-tdt-0.6b-v2 to ONNX via scripts/nemo/parakeet-tdt-0.6b-v2/run.sh,
# packages three model bundles, pushes them to Hugging Face and attaches the
# tarballs to the `asr-models` release of k2-fsa/sherpa-onnx.
name: export-nemo-parakeet-tdt-0.6b-v2

# Runs on pushes to the dedicated export branch or when triggered manually.
on:
  push:
    branches:
      - export-nemo-parakeet-tdt-0.6b-v2
  workflow_dispatch:

# Only one run per ref; a newer run cancels the one in progress.
concurrency:
  group: export-nemo-parakeet-tdt-0.6b-v2-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-nemo-parakeet-tdt-0_6b-v2:
    # Skip on forks owned by anyone else (they do not have the required secrets).
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: parakeet tdt 0.6b v2
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [macos-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # Export + smoke-test the models, then move the artifacts to the repo root
      # where the following steps expect them. The test wave is renamed to 0.wav.
      - name: Run
        shell: bash
        run: |
          cd scripts/nemo/parakeet-tdt-0.6b-v2
          ./run.sh
          ls -lh *.onnx
          mv -v *.onnx ../../..
          mv -v tokens.txt ../../..
          mv 2086-149220-0033.wav ../../../0.wav

      # NOTE(review): the "fp32" bundle ships encoder.int8.onnx together with the
      # fp32 decoder/joiner. Presumably intentional, since export_onnx.py only
      # attaches metadata to the int8 and fp16 encoders - confirm.
      - name: Collect files (fp32)
        shell: bash
        run: |
          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
          mkdir -p $d
          cp encoder.int8.onnx $d
          cp decoder.onnx $d
          cp joiner.onnx $d
          cp tokens.txt $d
          mkdir $d/test_wavs
          cp 0.wav $d/test_wavs
          tar cjfv $d.tar.bz2 $d

      - name: Collect files (int8)
        shell: bash
        run: |
          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
          mkdir -p $d
          cp encoder.int8.onnx $d
          cp decoder.int8.onnx $d
          cp joiner.int8.onnx $d
          cp tokens.txt $d
          mkdir $d/test_wavs
          cp 0.wav $d/test_wavs
          tar cjfv $d.tar.bz2 $d

      - name: Collect files (fp16)
        shell: bash
        run: |
          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
          mkdir -p $d
          cp encoder.fp16.onnx $d
          cp decoder.fp16.onnx $d
          cp joiner.fp16.onnx $d
          cp tokens.txt $d
          mkdir $d/test_wavs
          cp 0.wav $d/test_wavs
          tar cjfv $d.tar.bz2 $d

      # Each attempt re-clones every model repo from scratch (rm -rf huggingface),
      # copies the bundle into it and pushes to `main`. The whole command is
      # retried up to 20 times with a 200 s timeout per attempt.
      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"
            models=(
              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
            )
            for m in ${models[@]}; do
              rm -rf huggingface
              export GIT_LFS_SKIP_SMUDGE=1
              export GIT_CLONE_PROTECTION_ACTIVE=false
              git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
              cp -av $m/* huggingface
              cd huggingface
              git lfs track "*.onnx"
              git lfs track "*.wav"
              git status
              git add .
              git status
              git commit -m "first commit"
              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
              cd ..
            done

      # Upload every tarball produced by the "Collect files" steps, replacing
      # assets of the same name that already exist on the release.
      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models
...
...
scripts/nemo/parakeet-tdt-0.6b-v2/export_onnx.py
0 → 100755
查看文件 @
d660d5a
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
from
pathlib
import
Path
from
typing
import
Dict
import
os
import
nemo.collections.asr
as
nemo_asr
import
onnx
import
onnxmltools
import
torch
from
onnxmltools.utils.float16_converter
import
(
convert_float_to_float16
,
convert_float_to_float16_model_path
,
)
from
onnxruntime.quantization
import
QuantType
,
quantize_dynamic
def export_onnx_fp16(onnx_fp32_path, onnx_fp16_path):
    """Convert an fp32 ONNX model file to fp16 and save the result.

    The model is loaded fully into memory before conversion. Inputs and
    outputs keep their original types (``keep_io_types=True``).

    Args:
      onnx_fp32_path:
        Path of the fp32 ONNX model to read.
      onnx_fp16_path:
        Path where the converted fp16 model is written.
    """
    fp16_model = convert_float_to_float16(
        onnxmltools.utils.load_model(onnx_fp32_path),
        keep_io_types=True,
    )
    onnxmltools.utils.save_model(fp16_model, onnx_fp16_path)
def export_onnx_fp16_large_2gb(onnx_fp32_path, onnx_fp16_path):
    """Convert an fp32 ONNX model to fp16, converting directly from its path.

    Unlike ``export_onnx_fp16``, the fp32 model is not loaded by this function;
    the path is handed to ``convert_float_to_float16_model_path``. Inputs and
    outputs keep their original types (``keep_io_types=True``).

    Args:
      onnx_fp32_path:
        Path of the fp32 ONNX model to convert.
      onnx_fp16_path:
        Path where the converted fp16 model is written.
    """
    converted = convert_float_to_float16_model_path(
        onnx_fp32_path,
        keep_io_types=True,
    )
    onnxmltools.utils.save_model(converted, onnx_fp16_path)
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Replace the metadata of an ONNX model file. The file is rewritten in-place.

    Any metadata already stored in the model is discarded first.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs. Values are converted with ``str()`` before storing.
    """
    model = onnx.load(filename)

    # Drop every existing entry so only ``meta_data`` remains afterwards.
    props = model.metadata_props
    while props:
        props.pop()

    for name, raw_value in meta_data.items():
        entry = props.add()
        entry.key = name
        entry.value = str(raw_value)

    onnx.save(model, filename)
@torch.no_grad()
def main():
    """Export nvidia/parakeet-tdt-0.6b-v2 to ONNX in fp32, int8 and fp16.

    Files written to the current directory:

      - tokens.txt: one "<token> <id>" line per vocabulary entry, followed
        by "<blk>" with the next free id.
      - {encoder,decoder,joiner}.onnx: fp32 exports.
      - {encoder,decoder,joiner}.int8.onnx: dynamically quantized models.
      - {encoder,decoder,joiner}.fp16.onnx: fp16 conversions.

    Metadata is attached only to encoder.int8.onnx and encoder.fp16.onnx.
    """
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_name="nvidia/parakeet-tdt-0.6b-v2"
    )
    asr_model.eval()

    vocabulary = asr_model.joint.vocabulary
    with open("./tokens.txt", "w", encoding="utf-8") as f:
        for i, s in enumerate(vocabulary):
            f.write(f"{s} {i}\n")
        # The blank symbol gets the id right after the last vocabulary entry.
        # Use len() instead of the leaked loop variable so that an empty
        # vocabulary does not raise NameError.
        f.write(f"<blk> {len(vocabulary)}\n")
        print("Saved to tokens.txt")

    asr_model.encoder.export("encoder.onnx")
    asr_model.decoder.export("decoder.onnx")
    asr_model.joint.export("joiner.onnx")
    os.system("ls -lh *.onnx")

    # "NA" is stored as an empty string in the metadata.
    normalize_type = asr_model.cfg.preprocessor.normalize
    if normalize_type == "NA":
        normalize_type = ""

    meta_data = {
        "vocab_size": asr_model.decoder.vocab_size,
        "normalize_type": normalize_type,
        "pred_rnn_layers": asr_model.decoder.pred_rnn_layers,
        "pred_hidden": asr_model.decoder.pred_hidden,
        "subsampling_factor": 8,
        "model_type": "EncDecRNNTBPEModel",
        "version": "2",
        "model_author": "NeMo",
        "url": "https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2",
        "comment": "Only the transducer branch is exported",
        "feat_dim": 128,
    }

    for m in ["encoder", "decoder", "joiner"]:
        # The encoder is quantized with unsigned 8-bit weights, the
        # decoder and joiner with signed 8-bit weights.
        quantize_dynamic(
            model_input=f"./{m}.onnx",
            model_output=f"./{m}.int8.onnx",
            weight_type=QuantType.QUInt8 if m == "encoder" else QuantType.QInt8,
        )
        os.system("ls -lh *.onnx")

        # The encoder is converted from its path instead of being loaded
        # first (see export_onnx_fp16_large_2gb).
        if m == "encoder":
            export_onnx_fp16_large_2gb(f"{m}.onnx", f"{m}.fp16.onnx")
        else:
            export_onnx_fp16(f"{m}.onnx", f"{m}.fp16.onnx")

    add_meta_data("encoder.int8.onnx", meta_data)
    add_meta_data("encoder.fp16.onnx", meta_data)
    print("meta_data", meta_data)
# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    main()
...
...
scripts/nemo/parakeet-tdt-0.6b-v2/run.sh
0 → 100755
查看文件 @
d660d5a
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
set
-ex
# Print a message prefixed with a timestamp and the caller's location:
#   "<YYYY-mm-dd HH:MM:SS> (<file>:<line>:<function>) <message>"
# Arguments: the message words ($*). Backslash escapes are interpreted (echo -e).
log() {
  # This function is from espnet
  # Basename of the file that called log (directory part stripped).
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
pip install
\
nemo_toolkit[
'asr'
]
\
"numpy<2"
\
ipython
\
kaldi-native-fbank
\
librosa
\
onnx
==
1.17.0
\
onnxmltools
\
onnxruntime
==
1.17.1
\
soundfile
python3 ./export_onnx.py
ls -lh
*
.onnx
echo
"---fp32----"
python3 ./test_onnx.py
\
--encoder ./encoder.int8.onnx
\
--decoder ./decoder.onnx
\
--joiner ./joiner.onnx
\
--tokens ./tokens.txt
\
--wav 2086-149220-0033.wav
echo
"---int8----"
python3 ./test_onnx.py
\
--encoder ./encoder.int8.onnx
\
--decoder ./decoder.int8.onnx
\
--joiner ./joiner.int8.onnx
\
--tokens ./tokens.txt
\
--wav 2086-149220-0033.wav
echo
"---fp16----"
python3 ./test_onnx.py
\
--encoder ./encoder.fp16.onnx
\
--decoder ./decoder.fp16.onnx
\
--joiner ./joiner.fp16.onnx
\
--tokens ./tokens.txt
\
--wav 2086-149220-0033.wav
...
...
scripts/nemo/parakeet-tdt-0.6b-v2/test_onnx.py
0 → 100755
查看文件 @
d660d5a
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import
argparse
from
pathlib
import
Path
import
kaldi_native_fbank
as
knf
import
librosa
import
numpy
as
np
import
onnxruntime
as
ort
import
soundfile
as
sf
import
torch
import
time
def get_args():
    """Parse the command line of the test script.

    All five options are required strings.

    Returns:
      An ``argparse.Namespace`` with the attributes ``encoder``, ``decoder``,
      ``joiner``, ``tokens`` and ``wav``.
    """
    required_options = (
        ("--encoder", "Path to encoder.onnx"),
        ("--decoder", "Path to decoder.onnx"),
        ("--joiner", "Path to joiner.onnx"),
        ("--tokens", "Path to tokens.txt"),
        ("--wav", "Path to test.wav"),
    )

    parser = argparse.ArgumentParser()
    for flag, description in required_options:
        parser.add_argument(flag, type=str, required=True, help=description)
    return parser.parse_args()
def create_fbank():
    """Build the kaldi-native-fbank online feature extractor used for testing.

    Configuration: no dithering, no DC-offset removal, "hann" window,
    128 mel bins starting at 0 Hz, librosa-compatible mel filters.

    Returns:
      A ``knf.OnlineFbank`` created from those options.
    """
    opts = knf.FbankOptions()

    frame = opts.frame_opts
    frame.dither = 0
    frame.remove_dc_offset = False
    frame.window_type = "hann"

    mel = opts.mel_opts
    mel.low_freq = 0
    mel.num_bins = 128
    mel.is_librosa = True

    return knf.OnlineFbank(opts)
def compute_features(audio, fbank):
    """Feed a waveform to ``fbank`` and collect all frames that are ready.

    Args:
      audio:
        1-D array of samples. It is passed to ``fbank`` with a sample rate
        of 16000.
      fbank:
        Feature extractor providing ``accept_waveform``,
        ``num_frames_ready`` and ``get_frame``.

    Returns:
      A 2-D array with one row per ready frame.
    """
    assert len(audio.shape) == 1, audio.shape

    fbank.accept_waveform(16000, audio)
    frames = [
        np.array(fbank.get_frame(k)) for k in range(fbank.num_frames_ready)
    ]
    return np.stack(frames)
def display(sess, model):
    """Print the input and output nodes of a session under labelled headers.

    Args:
      sess:
        Object providing ``get_inputs()`` and ``get_outputs()``; each
        returned item is printed on its own line.
      model:
        Label used in the headers, e.g. "encoder".
    """
    print(f"=========={model} Input==========")
    for node in sess.get_inputs():
        print(node)

    # The original f-string was "{model }Output", which dropped the space
    # between the label and "Output".
    print(f"=========={model} Output==========")
    for node in sess.get_outputs():
        print(node)
class OnnxModel:
    """Encoder, decoder and joiner ONNX Runtime sessions of a transducer model.

    Every session runs on the CPU execution provider with a single inter-op
    and a single intra-op thread. The encoder's custom metadata supplies
    ``normalize_type``, ``pred_rnn_layers`` and ``pred_hidden``.
    """

    def __init__(
        self,
        encoder: str,
        decoder: str,
        joiner: str,
    ):
        """Load the three models and print their input/output nodes.

        Args:
          encoder:
            Path to the encoder model.
          decoder:
            Path to the decoder model.
          joiner:
            Path to the joiner model.
        """
        self.init_encoder(encoder)
        display(self.encoder, "encoder")

        self.init_decoder(decoder)
        display(self.decoder, "decoder")

        self.init_joiner(joiner)
        display(self.joiner, "joiner")

    @staticmethod
    def _new_cpu_session(model_path):
        """Create a single-threaded, CPU-only inference session."""
        options = ort.SessionOptions()
        options.inter_op_num_threads = 1
        options.intra_op_num_threads = 1
        return ort.InferenceSession(
            model_path,
            sess_options=options,
            providers=["CPUExecutionProvider"],
        )

    def init_encoder(self, encoder):
        """Create the encoder session and read its custom metadata."""
        self.encoder = self._new_cpu_session(encoder)

        meta = self.encoder.get_modelmeta().custom_metadata_map
        self.normalize_type = meta["normalize_type"]
        print(meta)

        # Metadata values are strings; these two are needed as integers.
        self.pred_rnn_layers = int(meta["pred_rnn_layers"])
        self.pred_hidden = int(meta["pred_hidden"])

    def init_decoder(self, decoder):
        """Create the decoder session."""
        self.decoder = self._new_cpu_session(decoder)

    def init_joiner(self, joiner):
        """Create the joiner session."""
        self.joiner = self._new_cpu_session(joiner)

    def get_decoder_state(self):
        """Return two all-zero decoder states for a batch of one.

        Returns:
          A pair of arrays, each of shape
          (pred_rnn_layers, 1, pred_hidden).
        """
        batch_size = 1
        shape = (self.pred_rnn_layers, batch_size, self.pred_hidden)
        first = torch.zeros(*shape).numpy()
        second = torch.zeros(*shape).numpy()
        return first, second

    def run_encoder(self, x: np.ndarray):
        """Run the encoder on one utterance.

        Args:
          x:
            Features of shape (T, C).

        Returns:
          The first encoder output; the second one is discarded.
        """
        # x: (T, C)
        feats = torch.from_numpy(x).t().unsqueeze(0)
        # feats: [1, C, T]
        feats_len = torch.tensor([feats.shape[-1]], dtype=torch.int64)

        outs = self.encoder.get_outputs()
        ins = self.encoder.get_inputs()
        encoder_out, _ = self.encoder.run(
            [outs[0].name, outs[1].name],
            {
                ins[0].name: feats.numpy(),
                ins[1].name: feats_len.numpy(),
            },
        )
        # [batch_size, dim, T]
        return encoder_out

    def run_decoder(
        self,
        token: int,
        state0: np.ndarray,
        state1: np.ndarray,
    ):
        """Run the decoder for a single token.

        Args:
          token:
            Id of the token fed to the decoder.
          state0:
            First decoder state.
          state1:
            Second decoder state.

        Returns:
          A tuple (decoder_out, state0_next, state1_next). The decoder's
          second output is discarded.
        """
        target = torch.tensor([[token]], dtype=torch.int32).numpy()
        target_len = torch.tensor([1], dtype=torch.int32).numpy()

        outs = self.decoder.get_outputs()
        ins = self.decoder.get_inputs()
        decoder_out, _, state0_next, state1_next = self.decoder.run(
            [outs[0].name, outs[1].name, outs[2].name, outs[3].name],
            {
                ins[0].name: target,
                ins[1].name: target_len,
                ins[2].name: state0,
                ins[3].name: state1,
            },
        )
        return decoder_out, state0_next, state1_next

    def run_joiner(
        self,
        encoder_out: np.ndarray,
        decoder_out: np.ndarray,
    ):
        """Run the joiner on one encoder frame and one decoder output.

        Returns:
          The first joiner output.
        """
        # encoder_out: [batch_size, dim, 1]
        # decoder_out: [batch_size, dim, 1]
        outs = self.joiner.get_outputs()
        ins = self.joiner.get_inputs()
        results = self.joiner.run(
            [outs[0].name],
            {
                ins[0].name: encoder_out,
                ins[1].name: decoder_out,
            },
        )
        # logit: [batch_size, 1, 1, vocab_size]
        return results[0]
def _normalize_per_feature(features: np.ndarray) -> np.ndarray:
    """Normalize every feature dimension to zero mean and unit variance.

    Args:
      features:
        Array of shape (T, C): one row per frame, one column per feature.

    Returns:
      An array of the same shape where each column has been normalized
      with statistics computed over all T frames.
    """
    features = torch.from_numpy(features)
    # dim=0 is the time axis, so the statistics are per feature (column).
    # The previous code reduced over dim=1, which normalized each frame
    # across its features instead.
    mean = features.mean(dim=0, keepdims=True)
    stddev = features.std(dim=0, keepdims=True) + 1e-5
    return ((features - mean) / stddev).numpy()


def main():
    """Decode one wave file with the exported models and print the result.

    Steps: load the models and the token table, read the first channel of
    the wave (resampled to 16 kHz if needed), append 2 seconds of silence,
    compute features, run the encoder once and then a greedy search that
    emits at most one token per encoder frame. Finally the token ids, the
    wave path, the decoded text and the real-time factor are printed.
    """
    args = get_args()
    assert Path(args.encoder).is_file(), args.encoder
    assert Path(args.decoder).is_file(), args.decoder
    assert Path(args.joiner).is_file(), args.joiner
    assert Path(args.tokens).is_file(), args.tokens
    assert Path(args.wav).is_file(), args.wav

    print(vars(args))

    model = OnnxModel(args.encoder, args.decoder, args.joiner)

    # Each line of the tokens file is "<token> <id>".
    id2token = dict()
    with open(args.tokens, encoding="utf-8") as f:
        for line in f:
            token, idx = line.split()
            id2token[int(idx)] = token

    start = time.time()
    fbank = create_fbank()
    audio, sample_rate = sf.read(args.wav, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != 16000:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=16000,
        )
        sample_rate = 16000

    # Use the audio's dtype for the padding; a default (float64) zeros array
    # would silently upcast the whole waveform in np.concatenate.
    tail_padding = np.zeros(sample_rate * 2, dtype=audio.dtype)
    audio = np.concatenate([audio, tail_padding])

    # The blank token is the last entry of the token table.
    blank = len(id2token) - 1
    ans = [blank]
    state0, state1 = model.get_decoder_state()
    decoder_out, state0_next, state1_next = model.run_decoder(ans[-1], state0, state1)

    features = compute_features(audio, fbank)
    if model.normalize_type != "":
        assert model.normalize_type == "per_feature", model.normalize_type
        features = _normalize_per_feature(features)
    print(audio.shape)
    print("features.shape", features.shape)

    encoder_out = model.run_encoder(features)
    # encoder_out:[batch_size, dim, T)
    for t in range(encoder_out.shape[2]):
        encoder_out_t = encoder_out[:, :, t : t + 1]
        logits = model.run_joiner(encoder_out_t, decoder_out)
        logits = torch.from_numpy(logits)
        logits = logits.squeeze()
        idx = torch.argmax(logits, dim=-1).item()
        if idx != blank:
            # A token was emitted: commit the pending decoder states and
            # advance the decoder with the new token.
            ans.append(idx)
            state0 = state0_next
            state1 = state1_next
            decoder_out, state0_next, state1_next = model.run_decoder(
                ans[-1], state0, state1
            )

    end = time.time()

    elapsed_seconds = end - start
    # The duration includes the 2 seconds of tail padding.
    audio_duration = audio.shape[0] / 16000
    real_time_factor = elapsed_seconds / audio_duration

    ans = ans[1:]  # remove the first blank
    tokens = [id2token[i] for i in ans]
    underline = "▁"
    # underline = b"\xe2\x96\x81".decode()
    text = "".join(tokens).replace(underline, " ").strip()
    print(ans)
    print(args.wav)
    print(text)
    print(f"RTF: {real_time_factor}")
# Run the decoding test only when this file is executed as a script.
if __name__ == "__main__":
    main()
...
...
请
注册
或
登录
后发表评论