Export NeMo FastConformer Hybrid Transducer-CTC Large Streaming to ONNX. (#843)

Commit a9f936e92b01355a9ee4f5b02f800257bfd2d89f (a9f936e9), 1 parent dbaa26ff
Authored by Fangjun Kuang, 2024-05-08 12:33:46 +0800
Committed by GitHub, 2024-05-08 12:33:46 +0800
Showing 5 changed files with 431 additions and 0 deletions:
.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
scripts/nemo/fast-conformer-hybrid-transducer-ctc/export-onnx-ctc.py
scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc.sh
scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc.py
.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml (new file, mode 100644)
name: export-nemo-fast-conformer-hybrid-transducer-ctc-to-onnx

on:
  workflow_dispatch:

concurrency:
  group: export-nemo-fast-conformer-hybrid-transducer-ctc-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-nemo-fast-conformer-hybrid-transducer-ctc-to-onnx:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export NeMo fast conformer
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [macos-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install NeMo
        shell: bash
        run: |
          BRANCH='main'
          pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
          pip install onnxruntime
          pip install kaldi-native-fbank
          pip install soundfile librosa

      - name: Run
        shell: bash
        run: |
          cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
          ./run-ctc.sh
          mv -v sherpa-onnx-nemo* ../../..

      - name: Download test waves
        shell: bash
        run: |
          mkdir test_wavs
          pushd test_wavs
          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/0.wav
          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/1.wav
          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/8k.wav
          curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-nemo-ctc-en-conformer-small/resolve/main/test_wavs/trans.txt
          popd

          cp -av test_wavs ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-80ms
          cp -av test_wavs ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-480ms
          cp -av test_wavs ./sherpa-onnx-nemo-streaming-fast-conformer-ctc-1040ms

          tar cjvf sherpa-onnx-nemo-streaming-fast-conformer-ctc-80ms.tar.bz2 sherpa-onnx-nemo-streaming-fast-conformer-ctc-80ms
          tar cjvf sherpa-onnx-nemo-streaming-fast-conformer-ctc-480ms.tar.bz2 sherpa-onnx-nemo-streaming-fast-conformer-ctc-480ms
          tar cjvf sherpa-onnx-nemo-streaming-fast-conformer-ctc-1040ms.tar.bz2 sherpa-onnx-nemo-streaming-fast-conformer-ctc-1040ms

      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models
scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md (new file, mode 100644)
# Introduction

This folder contains scripts for exporting models from

- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_80ms
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_480ms
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_streaming_1040ms

to `sherpa-onnx`.
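
As a quick sanity check before running the full export, any of these checkpoints can be pulled from NGC directly. A minimal sketch, assuming `nemo_toolkit[asr]` is installed (as in the CI workflow):

    import nemo.collections.asr as nemo_asr

    # Downloads the 80 ms streaming checkpoint from NGC on first use.
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_name="stt_en_fastconformer_hybrid_large_streaming_80ms"
    )
    print(asr_model.cfg)  # inspect the model configuration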
scripts/nemo/fast-conformer-hybrid-transducer-ctc/export-onnx-ctc.py (new file, mode 100755)
#!/usr/bin/env python3
import argparse
from typing import Dict

import nemo.collections.asr as nemo_asr
import onnx
import torch


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        choices=["80", "480", "1040"],
    )
    return parser.parse_args()


def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs.
    """
    model = onnx.load(filename)
    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)

    onnx.save(model, filename)


@torch.no_grad()
def main():
    args = get_args()

    model_name = f"stt_en_fastconformer_hybrid_large_streaming_{args.model}ms"
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)

    with open("./tokens.txt", "w", encoding="utf-8") as f:
        for i, s in enumerate(asr_model.joint.vocabulary):
            f.write(f"{s} {i}\n")
        f.write(f"<blk> {i+1}\n")
    print("Saved to tokens.txt")

    decoder_type = "ctc"
    asr_model.change_decoding_strategy(decoder_type=decoder_type)
    asr_model.eval()

    assert asr_model.encoder.streaming_cfg is not None

    if isinstance(asr_model.encoder.streaming_cfg.chunk_size, list):
        chunk_size = asr_model.encoder.streaming_cfg.chunk_size[1]
    else:
        chunk_size = asr_model.encoder.streaming_cfg.chunk_size

    if isinstance(asr_model.encoder.streaming_cfg.pre_encode_cache_size, list):
        pre_encode_cache_size = asr_model.encoder.streaming_cfg.pre_encode_cache_size[1]
    else:
        pre_encode_cache_size = asr_model.encoder.streaming_cfg.pre_encode_cache_size

    window_size = chunk_size + pre_encode_cache_size

    print("chunk_size", chunk_size)
    print("pre_encode_cache_size", pre_encode_cache_size)
    print("window_size", window_size)

    chunk_shift = chunk_size

    # cache_last_channel: (batch_size, dim1, dim2, dim3)
    cache_last_channel_dim1 = len(asr_model.encoder.layers)
    cache_last_channel_dim2 = asr_model.encoder.streaming_cfg.last_channel_cache_size
    cache_last_channel_dim3 = asr_model.encoder.d_model

    # cache_last_time: (batch_size, dim1, dim2, dim3)
    cache_last_time_dim1 = len(asr_model.encoder.layers)
    cache_last_time_dim2 = asr_model.encoder.d_model
    cache_last_time_dim3 = asr_model.encoder.conv_context_size[0]

    asr_model.set_export_config({"decoder_type": "ctc", "cache_support": True})

    filename = "model.onnx"
    asr_model.export(filename)

    meta_data = {
        "vocab_size": asr_model.decoder.vocab_size,
        "window_size": window_size,
        "chunk_shift": chunk_shift,
        "normalize_type": "None",
        "cache_last_channel_dim1": cache_last_channel_dim1,
        "cache_last_channel_dim2": cache_last_channel_dim2,
        "cache_last_channel_dim3": cache_last_channel_dim3,
        "cache_last_time_dim1": cache_last_time_dim1,
        "cache_last_time_dim2": cache_last_time_dim2,
        "cache_last_time_dim3": cache_last_time_dim3,
        "subsampling_factor": 8,
        "model_type": "EncDecHybridRNNTCTCBPEModel",
        "version": "1",
        "model_author": "NeMo",
        "url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
        "comment": "Only the CTC branch is exported",
    }
    add_meta_data(filename, meta_data)
    print(meta_data)


if __name__ == "__main__":
    main()
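
To confirm that add_meta_data() persisted the key-value pairs, the exported model can be reloaded and its metadata_props dumped. A small sketch using the same onnx API as the script above:

    import onnx

    # Reload the exported model and print the metadata written by add_meta_data().
    model = onnx.load("model.onnx")
    for prop in model.metadata_props:
        print(f"{prop.key} = {prop.value}")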
scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc.sh (new file, mode 100755)
#!/usr/bin/env bash

set -ex

if [ ! -e ./0.wav ]; then
  # curl -SL -O https://hf-mirror.com/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18/resolve/main/test_wavs/0.wav
  curl -SL -O https://huggingface.co/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18/resolve/main/test_wavs/0.wav
fi

ms=(
80
480
1040
)

for m in ${ms[@]}; do
  ./export-onnx-ctc.py --model $m

  d=sherpa-onnx-nemo-streaming-fast-conformer-ctc-${m}ms
  if [ ! -f $d/model.onnx ]; then
    mkdir -p $d
    mv -v model.onnx $d/
    mv -v tokens.txt $d/
    ls -lh $d
  fi
done

# Now test the exported models

for m in ${ms[@]}; do
  d=sherpa-onnx-nemo-streaming-fast-conformer-ctc-${m}ms
  python3 ./test-onnx-ctc.py \
    --model $d/model.onnx \
    --tokens $d/tokens.txt \
    --wav ./0.wav
done
scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc.py (new file, mode 100755)
#!/usr/bin/env python3
import argparse
from pathlib import Path

import kaldi_native_fbank as knf
import numpy as np
import onnxruntime as ort
import torch
import soundfile as sf
import librosa


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="Path to model.onnx")
    parser.add_argument("--tokens", type=str, required=True, help="Path to tokens.txt")
    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")
    return parser.parse_args()


def create_fbank():
    opts = knf.FbankOptions()
    opts.frame_opts.dither = 0
    opts.frame_opts.remove_dc_offset = False
    opts.frame_opts.window_type = "hann"

    opts.mel_opts.low_freq = 0
    opts.mel_opts.num_bins = 80
    opts.mel_opts.is_librosa = True

    fbank = knf.OnlineFbank(opts)
    return fbank


def compute_features(audio, fbank):
    assert len(audio.shape) == 1, audio.shape
    fbank.accept_waveform(16000, audio)
    ans = []
    processed = 0
    while processed < fbank.num_frames_ready:
        ans.append(np.array(fbank.get_frame(processed)))
        processed += 1
    ans = np.stack(ans)
    return ans


class OnnxModel:
    def __init__(
        self,
        filename: str,
    ):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)

        self.window_size = int(meta["window_size"])
        self.chunk_shift = int(meta["chunk_shift"])

        self.cache_last_channel_dim1 = int(meta["cache_last_channel_dim1"])
        self.cache_last_channel_dim2 = int(meta["cache_last_channel_dim2"])
        self.cache_last_channel_dim3 = int(meta["cache_last_channel_dim3"])

        self.cache_last_time_dim1 = int(meta["cache_last_time_dim1"])
        self.cache_last_time_dim2 = int(meta["cache_last_time_dim2"])
        self.cache_last_time_dim3 = int(meta["cache_last_time_dim3"])

        self.init_cache_state()

    def init_cache_state(self):
        self.cache_last_channel = torch.zeros(
            1,
            self.cache_last_channel_dim1,
            self.cache_last_channel_dim2,
            self.cache_last_channel_dim3,
            dtype=torch.float32,
        ).numpy()

        self.cache_last_time = torch.zeros(
            1,
            self.cache_last_time_dim1,
            self.cache_last_time_dim2,
            self.cache_last_time_dim3,
            dtype=torch.float32,
        ).numpy()

        self.cache_last_channel_len = torch.ones([1], dtype=torch.int64).numpy()

    def __call__(self, x: np.ndarray):
        # x: (T, C)
        x = torch.from_numpy(x)
        x = x.t().unsqueeze(0)
        # x: [1, C, T]
        x_lens = torch.tensor([x.shape[-1]], dtype=torch.int64)

        (
            log_probs,
            log_probs_len,
            cache_last_channel_next,
            cache_last_time_next,
            cache_last_channel_len_next,
        ) = self.model.run(
            [
                self.model.get_outputs()[0].name,
                self.model.get_outputs()[1].name,
                self.model.get_outputs()[2].name,
                self.model.get_outputs()[3].name,
                self.model.get_outputs()[4].name,
            ],
            {
                self.model.get_inputs()[0].name: x.numpy(),
                self.model.get_inputs()[1].name: x_lens.numpy(),
                self.model.get_inputs()[2].name: self.cache_last_channel,
                self.model.get_inputs()[3].name: self.cache_last_time,
                self.model.get_inputs()[4].name: self.cache_last_channel_len,
            },
        )
        # Carry the encoder cache state over to the next chunk.
        self.cache_last_channel = cache_last_channel_next
        self.cache_last_time = cache_last_time_next
        self.cache_last_channel_len = cache_last_channel_len_next

        # [T, vocab_size]
        return torch.from_numpy(log_probs).squeeze(0)


def main():
    args = get_args()
    assert Path(args.model).is_file(), args.model
    assert Path(args.tokens).is_file(), args.tokens
    assert Path(args.wav).is_file(), args.wav
    print(vars(args))

    model = OnnxModel(args.model)

    id2token = dict()
    with open(args.tokens, encoding="utf-8") as f:
        for line in f:
            t, idx = line.split()
            id2token[int(idx)] = t

    fbank = create_fbank()
    audio, sample_rate = sf.read(args.wav, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != 16000:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=16000,
        )
        sample_rate = 16000

    window_size = model.window_size
    chunk_shift = model.chunk_shift

    # The blank token <blk> is written last in tokens.txt by export-onnx-ctc.py.
    blank = len(id2token) - 1
    prev = -1
    ans = []
    features = compute_features(audio, fbank)
    num_chunks = (features.shape[0] - window_size) // chunk_shift + 1
    for i in range(num_chunks):
        start = i * chunk_shift
        end = start + window_size
        chunk = features[start:end, :]

        log_probs = model(chunk)
        ids = torch.argmax(log_probs, dim=1).tolist()
        # Greedy CTC decoding: drop blanks and collapse consecutive repeats.
        for idx in ids:
            if idx != blank and idx != prev:
                ans.append(idx)
            prev = idx

    tokens = [id2token[i] for i in ans]
    underline = "▁"
    # underline = b"\xe2\x96\x81".decode()
    text = "".join(tokens).replace(underline, " ").strip()
    print(args.wav)
    print(text)


main()
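
The __call__() method above wires inputs and outputs purely by position. To see which tensor names, shapes, and types sit at those positions for a given export, onnxruntime can list them. A standalone sketch ("model.onnx" stands for whichever exported model you are testing):

    import onnxruntime as ort

    sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    for i in sess.get_inputs():
        print("input:", i.name, i.shape, i.type)
    for o in sess.get_outputs():
        print("output:", o.name, o.shape, o.type)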