Fangjun Kuang
Committed by GitHub

Support Portuguese and German ASR models from NeMo (#2394)

... ... @@ -39,6 +39,7 @@ jobs:
shell: bash
run: |
cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
./run-ctc-non-streaming-2.sh
./run-ctc-non-streaming.sh
mv -v sherpa-onnx-nemo* ../../..
... ... @@ -66,6 +67,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000-int8
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
)
for m in ${models[@]}; do
... ... @@ -75,7 +80,7 @@ jobs:
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
cp -av $m/* huggingface
cd huggingface
git lfs track "*.onnx"
git lfs track "*.onnx" "*.wav"
git status
git add .
git status
... ... @@ -99,6 +104,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000-int8
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
)
for d in ${dirs[@]}; do
tar cjvf ${d}.tar.bz2 ./$d
... ...
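Note: once one of the CTC archives above is unpacked, it can be decoded from Python with the sherpa-onnx offline API. The snippet below is a minimal sketch and not part of this PR; it assumes the standard OfflineRecognizer.from_nemo_ctc() factory and the pt_br.wav test file that the export script bundles under test_wavs/.

# Minimal sketch (not part of this PR): decode the bundled Portuguese test wav
# with the int8 CTC export via the sherpa-onnx Python API.
import sherpa_onnx
import soundfile as sf

d = "sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8"

recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
    model=f"{d}/model.int8.onnx",
    tokens=f"{d}/tokens.txt",
    num_threads=2,
)

samples, sample_rate = sf.read(f"{d}/test_wavs/pt_br.wav", dtype="float32", always_2d=True)
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples[:, 0])  # first channel only
recognizer.decode_stream(stream)
print(stream.result.text)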
... ... @@ -39,6 +39,7 @@ jobs:
shell: bash
run: |
cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
./run-transducer-non-streaming-2.sh
./run-transducer-non-streaming.sh
mv -v sherpa-onnx-nemo* ../../..
... ... @@ -66,6 +67,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000-int8
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
)
for m in ${models[@]}; do
... ... @@ -75,7 +80,7 @@ jobs:
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
cp -av $m/* huggingface
cd huggingface
git lfs track "*.onnx"
git lfs track "*.onnx" "*.wav"
git status
git add .
git status
... ... @@ -98,6 +103,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000-int8
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
)
for d in ${dirs[@]}; do
tar cjvf ${d}.tar.bz2 ./$d
... ...
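The transducer archives can be decoded the same way via OfflineRecognizer.from_transducer(); the model_type="nemo_transducer" argument mirrors the Kotlin config added further down in this PR. Again a minimal sketch assuming the standard sherpa-onnx Python API, not part of this change:

# Minimal sketch (not part of this PR): decode with the int8 transducer export.
import sherpa_onnx
import soundfile as sf

d = "sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8"

recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder=f"{d}/encoder.int8.onnx",
    decoder=f"{d}/decoder.int8.onnx",
    joiner=f"{d}/joiner.int8.onnx",
    tokens=f"{d}/tokens.txt",
    model_type="nemo_transducer",
)

samples, sample_rate = sf.read(f"{d}/test_wavs/de.wav", dtype="float32", always_2d=True)
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples[:, 0])
recognizer.decode_stream(stream)
print(stream.result.text)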
... ... @@ -600,6 +600,70 @@ def get_models():
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8",
idx=35,
lang="pt",
lang2="Portuguese",
short_name="stt_pt_fastconformer_hybrid_large_pc_transducer_int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8",
idx=36,
lang="pt",
lang2="Portuguese",
short_name="stt_pt_fastconformer_hybrid_large_pc_ctc-int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8",
idx=37,
lang="de",
lang2="German",
short_name="stt_de_fastconformer_hybrid_large_pc_transducer_int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8",
idx=38,
lang="de",
lang2="German",
short_name="stt_de_fastconformer_hybrid_large_pc_ctc-int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
]
return models
... ...
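The four new entries above follow the existing pattern: model_name matches both the directory produced by the export workflow and the csukuangfj Hugging Face repo it is uploaded to, and idx lines up with the new cases added to getOfflineModelConfig() in the Kotlin change below. As a rough, hypothetical illustration of how an entry ties those pieces together (the real Model dataclass and build script have more fields and logic than shown here):

# Hypothetical illustration only; the real Model dataclass in this repo has
# additional fields and is consumed by the APK build scripts.
from dataclasses import dataclass

@dataclass
class Model:
    model_name: str  # also the repo name under https://huggingface.co/csukuangfj/
    idx: int         # matches the case number in getOfflineModelConfig()
    lang: str        # e.g. "pt" or "de"
    lang2: str       # e.g. "Portuguese" or "German"
    short_name: str  # used when naming the generated per-model artifact
    cmd: str         # shell snippet run while preparing the model directory

m = Model(
    model_name="sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8",
    idx=36,
    lang="pt",
    lang2="Portuguese",
    short_name="stt_pt_fastconformer_hybrid_large_pc_ctc-int8",
    cmd="",
)
print(f"https://huggingface.co/csukuangfj/{m.model_name}")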
... ... @@ -24,5 +24,7 @@ This folder contains scripts for exporting models from
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
- https://huggingface.co/nvidia/parakeet-tdt_ctc-0.6b-ja
- https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
- https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
to `sherpa-onnx`.
... ...
... ... @@ -81,7 +81,9 @@ def main():
"model_type": "EncDecHybridRNNTCTCBPEModel",
"version": "1",
"model_author": "NeMo",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}"
if "/" in model_name
else f"https://huggingface.co/{model_name}",
"comment": "Only the CTC branch is exported",
"doc": args.doc,
}
... ...
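For reference, this meta_data dict is written into the exported ONNX file as metadata_props, which is where the packaged models carry their doc/url strings. A minimal sketch of that pattern (the helper name here is illustrative; the export script defines its own):

# Illustrative sketch: embed a dict of strings into an ONNX file's metadata_props.
import onnx

def add_meta_data(filename: str, meta_data: dict) -> None:
    model = onnx.load(filename)
    for key, value in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = key
        entry.value = str(value)
    onnx.save(model, filename)

add_meta_data(
    "model.onnx",
    {
        "model_type": "EncDecHybridRNNTCTCBPEModel",
        "url": "https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc",
        "comment": "Only the CTC branch is exported",
    },
)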
... ... @@ -85,7 +85,9 @@ def main():
"model_type": "EncDecHybridRNNTCTCBPEModel",
"version": "1",
"model_author": "NeMo",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}"
if "/" in model_name
else f"https://huggingface.co/{model_name}",
"comment": "Only the transducer branch is exported",
"doc": args.doc,
}
... ...
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
# 2200 hours of Portuguese speech
url=https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="STT PT FastConformer Hybrid Transducer-CTC Large transcribes text in upper and lower case Portuguese alphabet along with spaces, period, comma, question mark. This collection contains the Brazilian Portuguese FastConformer Hybrid (Transducer and CTC) Large model (around 115M parameters) with punctuation and capitalization trained on around 2200h hours of Portuguese speech. "
log "Process $name at $url"
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v model.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/pt_br.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v model.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-ctc-non-streaming.py \
--model $d/model.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/pt_br.wav
# 2500 hours of German speech
url=https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="This model transcribes speech in upper and lower case German alphabet along with spaces, periods, commas, and question marks. It is a 'large' version of FastConformer Transducer-CTC (around 115M parameters) model. This is a hybrid model trained on two losses: Transducer (default) and CTC."
log "Process $name at $url"
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v model.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v model.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-ctc-non-streaming.py \
--model $d/model.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/de.wav
... ...
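A quick way to confirm that the metadata written by export-onnx-ctc-non-streaming.py survived in the packaged int8 model is to read it back with onnxruntime (a sketch, separate from the test script invoked above):

# Sketch: print the custom metadata stored in the exported int8 CTC model.
import onnxruntime as ort

sess = ort.InferenceSession(
    "sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8/model.int8.onnx",
    providers=["CPUExecutionProvider"],
)
for key, value in sess.get_modelmeta().custom_metadata_map.items():
    print(f"{key}: {value}")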
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
# 2200 hours of Portuguese speech
url=https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="STT PT FastConformer Hybrid Transducer-CTC Large transcribes text in upper and lower case Portuguese alphabet along with spaces, period, comma, question mark. This collection contains the Brazilian Portuguese FastConformer Hybrid (Transducer and CTC) Large model (around 115M parameters) with punctuation and capitalization trained on around 2200h hours of Portuguese speech. "
log "Process $name at $url"
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v encoder.onnx $d/
mv -v decoder.onnx $d/
mv -v joiner.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/pt_br.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v encoder.int8.onnx $d/
mv -v decoder.int8.onnx $d/
mv -v joiner.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-transducer-non-streaming.py \
--encoder $d/encoder.int8.onnx \
--decoder $d/decoder.int8.onnx \
--joiner $d/joiner.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/pt_br.wav
# 2500 hours of German speech
url=https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="This model transcribes speech in upper and lower case German alphabet along with spaces, periods, commas, and question marks. It is a 'large' version of FastConformer Transducer-CTC (around 115M parameters) model. This is a hybrid model trained on two losses: Transducer (default) and CTC."
log "Process $name at $url"
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v encoder.onnx $d/
mv -v decoder.onnx $d/
mv -v joiner.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v encoder.int8.onnx $d/
mv -v decoder.int8.onnx $d/
mv -v joiner.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-transducer-non-streaming.py \
--encoder $d/encoder.int8.onnx \
--decoder $d/decoder.int8.onnx \
--joiner $d/joiner.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/de.wav
... ...
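To sanity-check the three-file transducer export, the input and output names of the int8 encoder, decoder, and joiner can be listed with onnxruntime (again only a sketch):

# Sketch: list the inputs/outputs of the three transducer components.
import onnxruntime as ort

d = "sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8"
for name in ("encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx"):
    sess = ort.InferenceSession(f"{d}/{name}", providers=["CPUExecutionProvider"])
    print(
        name,
        "inputs:", [i.name for i in sess.get_inputs()],
        "outputs:", [o.name for o in sess.get_outputs()],
    )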
... ... @@ -621,6 +621,52 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
tokens = "$modelDir/tokens.txt",
)
}
35 -> {
val modelDir = "sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
transducer = OfflineTransducerModelConfig(
encoder = "$modelDir/encoder.int8.onnx",
decoder = "$modelDir/decoder.int8.onnx",
joiner = "$modelDir/joiner.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "nemo_transducer",
)
}
36 -> {
val modelDir = "sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
nemo = OfflineNemoEncDecCtcModelConfig(
model = "$modelDir/model.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
)
}
37 -> {
val modelDir = "sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
transducer = OfflineTransducerModelConfig(
encoder = "$modelDir/encoder.int8.onnx",
decoder = "$modelDir/decoder.int8.onnx",
joiner = "$modelDir/joiner.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "nemo_transducer",
)
}
38 -> {
val modelDir = "sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
nemo = OfflineNemoEncDecCtcModelConfig(
model = "$modelDir/model.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
)
}
}
return null
}
... ...