Fangjun Kuang
Committed by GitHub

Support Portuguese and German ASR models from NeMo (#2394)

... ... @@ -39,6 +39,7 @@ jobs:
shell: bash
run: |
cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
./run-ctc-non-streaming-2.sh
./run-ctc-non-streaming.sh
mv -v sherpa-onnx-nemo* ../../..
... ... @@ -66,6 +67,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000-int8
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
)
for m in ${models[@]}; do
... ... @@ -75,7 +80,7 @@ jobs:
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
cp -av $m/* huggingface
cd huggingface
git lfs track "*.onnx"
git lfs track "*.onnx" "*.wav"
git status
git add .
git status
... ... @@ -99,6 +104,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000-int8
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
)
for d in ${dirs[@]}; do
tar cjvf ${d}.tar.bz2 ./$d
... ...
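Note: once one of the CTC archives above is unpacked, it can be decoded from Python with the sherpa-onnx offline API. The snippet below is a minimal sketch and not part of this PR; it assumes the standard OfflineRecognizer.from_nemo_ctc() factory and the pt_br.wav test file that the export script bundles under test_wavs/.

# Minimal sketch (not part of this PR): decode the bundled Portuguese test wav
# with the int8 CTC export via the sherpa-onnx Python API.
import sherpa_onnx
import soundfile as sf

d = "sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8"

recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
    model=f"{d}/model.int8.onnx",
    tokens=f"{d}/tokens.txt",
    num_threads=2,
)

samples, sample_rate = sf.read(f"{d}/test_wavs/pt_br.wav", dtype="float32", always_2d=True)
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples[:, 0])  # first channel only
recognizer.decode_stream(stream)
print(stream.result.text)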
... ... @@ -39,6 +39,7 @@ jobs:
shell: bash
run: |
cd scripts/nemo/fast-conformer-hybrid-transducer-ctc
./run-transducer-non-streaming-2.sh
./run-transducer-non-streaming.sh
mv -v sherpa-onnx-nemo* ../../..
... ... @@ -66,6 +67,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000-int8
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
)
for m in ${models[@]}; do
... ... @@ -75,7 +80,7 @@ jobs:
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
cp -av $m/* huggingface
cd huggingface
git lfs track "*.onnx"
git lfs track "*.onnx" "*.wav"
git status
git add .
git status
... ... @@ -98,6 +103,10 @@ jobs:
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288-int8
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k-int8
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000-int8
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
)
for d in ${dirs[@]}; do
tar cjvf ${d}.tar.bz2 ./$d
... ...
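The transducer archives can be decoded the same way via OfflineRecognizer.from_transducer(); the model_type="nemo_transducer" argument mirrors the Kotlin config added further down in this PR. Again a minimal sketch assuming the standard sherpa-onnx Python API, not part of this change:

# Minimal sketch (not part of this PR): decode with the int8 transducer export.
import sherpa_onnx
import soundfile as sf

d = "sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8"

recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder=f"{d}/encoder.int8.onnx",
    decoder=f"{d}/decoder.int8.onnx",
    joiner=f"{d}/joiner.int8.onnx",
    tokens=f"{d}/tokens.txt",
    model_type="nemo_transducer",
)

samples, sample_rate = sf.read(f"{d}/test_wavs/de.wav", dtype="float32", always_2d=True)
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples[:, 0])
recognizer.decode_stream(stream)
print(stream.result.text)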
... ... @@ -600,6 +600,70 @@ def get_models():
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8",
idx=35,
lang="pt",
lang2="Portuguese",
short_name="stt_pt_fastconformer_hybrid_large_pc_transducer_int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8",
idx=36,
lang="pt",
lang2="Portuguese",
short_name="stt_pt_fastconformer_hybrid_large_pc_ctc-int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8",
idx=37,
lang="de",
lang2="German",
short_name="stt_de_fastconformer_hybrid_large_pc_transducer_int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8",
idx=38,
lang="de",
lang2="German",
short_name="stt_de_fastconformer_hybrid_large_pc_ctc-int8",
cmd="""
pushd $model_name
rm -rfv test_wavs
ls -lh
popd
""",
),
]
return models
... ...
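The four new entries above follow the existing pattern: model_name matches both the directory produced by the export workflow and the csukuangfj Hugging Face repo it is uploaded to, and idx lines up with the new cases added to getOfflineModelConfig() in the Kotlin change below. As a rough, hypothetical illustration of how an entry ties those pieces together (the real Model dataclass and build script have more fields and logic than shown here):

# Hypothetical illustration only; the real Model dataclass in this repo has
# additional fields and is consumed by the APK build scripts.
from dataclasses import dataclass

@dataclass
class Model:
    model_name: str  # also the repo name under https://huggingface.co/csukuangfj/
    idx: int         # matches the case number in getOfflineModelConfig()
    lang: str        # e.g. "pt" or "de"
    lang2: str       # e.g. "Portuguese" or "German"
    short_name: str  # used when naming the generated per-model artifact
    cmd: str         # shell snippet run while preparing the model directory

m = Model(
    model_name="sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8",
    idx=36,
    lang="pt",
    lang2="Portuguese",
    short_name="stt_pt_fastconformer_hybrid_large_pc_ctc-int8",
    cmd="",
)
print(f"https://huggingface.co/csukuangfj/{m.model_name}")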
... ... @@ -24,5 +24,7 @@ This folder contains scripts for exporting models from
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
- https://huggingface.co/nvidia/parakeet-tdt_ctc-0.6b-ja
- https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
- https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
to `sherpa-onnx`.
... ...
... ... @@ -81,7 +81,9 @@ def main():
"model_type": "EncDecHybridRNNTCTCBPEModel",
"version": "1",
"model_author": "NeMo",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}"
if "/" in model_name
else f"https://huggingface.co/{model_name}",
"comment": "Only the CTC branch is exported",
"doc": args.doc,
}
... ...
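For reference, this meta_data dict is written into the exported ONNX file as metadata_props, which is where the packaged models carry their doc/url strings. A minimal sketch of that pattern (the helper name here is illustrative; the export script defines its own):

# Illustrative sketch: embed a dict of strings into an ONNX file's metadata_props.
import onnx

def add_meta_data(filename: str, meta_data: dict) -> None:
    model = onnx.load(filename)
    for key, value in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = key
        entry.value = str(value)
    onnx.save(model, filename)

add_meta_data(
    "model.onnx",
    {
        "model_type": "EncDecHybridRNNTCTCBPEModel",
        "url": "https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc",
        "comment": "Only the CTC branch is exported",
    },
)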
... ... @@ -85,7 +85,9 @@ def main():
"model_type": "EncDecHybridRNNTCTCBPEModel",
"version": "1",
"model_author": "NeMo",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}",
"url": f"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/{model_name}"
if "/" in model_name
else f"https://huggingface.co/{model_name}",
"comment": "Only the transducer branch is exported",
"doc": args.doc,
}
... ...
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
# 2200 hours of Portuguese speech
url=https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="STT PT FastConformer Hybrid Transducer-CTC Large transcribes text in upper and lower case Portuguese alphabet along with spaces, period, comma, question mark. This collection contains the Brazilian Portuguese FastConformer Hybrid (Transducer and CTC) Large model (around 115M parameters) with punctuation and capitalization trained on around 2200h hours of Portuguese speech. "
log "Process $name at $url"
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v model.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/pt_br.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v model.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-ctc-non-streaming.py \
--model $d/model.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/pt_br.wav
# 2500 hours of German speech
url=https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="This model transcribes speech in upper and lower case German alphabet along with spaces, periods, commas, and question marks. It is a 'large' version of FastConformer Transducer-CTC (around 115M parameters) model. This is a hybrid model trained on two losses: Transducer (default) and CTC."
log "Process $name at $url"
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v model.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v model.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-ctc-non-streaming.py \
--model $d/model.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/de.wav
... ...
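A quick way to confirm that the metadata written by export-onnx-ctc-non-streaming.py survived in the packaged int8 model is to read it back with onnxruntime (a sketch, separate from the test script invoked above):

# Sketch: print the custom metadata stored in the exported int8 CTC model.
import onnxruntime as ort

sess = ort.InferenceSession(
    "sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8/model.int8.onnx",
    providers=["CPUExecutionProvider"],
)
for key, value in sess.get_modelmeta().custom_metadata_map.items():
    print(f"{key}: {value}")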
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
# 2200 hours of Portuguese speech
url=https://huggingface.co/nvidia/stt_pt_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="STT PT FastConformer Hybrid Transducer-CTC Large transcribes text in upper and lower case Portuguese alphabet along with spaces, period, comma, question mark. This collection contains the Brazilian Portuguese FastConformer Hybrid (Transducer and CTC) Large model (around 115M parameters) with punctuation and capitalization trained on around 2200h hours of Portuguese speech. "
log "Process $name at $url"
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v encoder.onnx $d/
mv -v decoder.onnx $d/
mv -v joiner.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/pt_br.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v encoder.int8.onnx $d/
mv -v decoder.int8.onnx $d/
mv -v joiner.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-transducer-non-streaming.py \
--encoder $d/encoder.int8.onnx \
--decoder $d/decoder.int8.onnx \
--joiner $d/joiner.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/pt_br.wav
# 2500 hours of German speech
url=https://huggingface.co/nvidia/stt_de_fastconformer_hybrid_large_pc
name=$(basename $url)
name="nvidia/$name"
doc="This model transcribes speech in upper and lower case German alphabet along with spaces, periods, commas, and question marks. It is a 'large' version of FastConformer Transducer-CTC (around 115M parameters) model. This is a hybrid model trained on two losses: Transducer (default) and CTC."
log "Process $name at $url"
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc
mkdir -p $d
mv -v encoder.onnx $d/
mv -v decoder.onnx $d/
mv -v joiner.onnx $d/
cp -v tokens.txt $d/
ls -lh $d
mkdir test_wavs
pushd test_wavs
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/de.wav
popd
cp -a test_wavs $d
d=sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8
mkdir -p $d
mv -v encoder.int8.onnx $d/
mv -v decoder.int8.onnx $d/
mv -v joiner.int8.onnx $d/
mv -v tokens.txt $d/
ls -lh $d
mv test_wavs $d
python3 ./test-onnx-transducer-non-streaming.py \
--encoder $d/encoder.int8.onnx \
--decoder $d/decoder.int8.onnx \
--joiner $d/joiner.int8.onnx \
--tokens $d/tokens.txt \
--wav $d/test_wavs/de.wav
... ...
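To sanity-check the three-file transducer export, the input and output names of the int8 encoder, decoder, and joiner can be listed with onnxruntime (again only a sketch):

# Sketch: list the inputs/outputs of the three transducer components.
import onnxruntime as ort

d = "sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8"
for name in ("encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx"):
    sess = ort.InferenceSession(f"{d}/{name}", providers=["CPUExecutionProvider"])
    print(
        name,
        "inputs:", [i.name for i in sess.get_inputs()],
        "outputs:", [o.name for o in sess.get_outputs()],
    )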
... ... @@ -621,6 +621,52 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
tokens = "$modelDir/tokens.txt",
)
}
35 -> {
val modelDir = "sherpa-onnx-nemo-transducer-stt_pt_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
transducer = OfflineTransducerModelConfig(
encoder = "$modelDir/encoder.int8.onnx",
decoder = "$modelDir/decoder.int8.onnx",
joiner = "$modelDir/joiner.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "nemo_transducer",
)
}
36 -> {
val modelDir = "sherpa-onnx-nemo-stt_pt_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
nemo = OfflineNemoEncDecCtcModelConfig(
model = "$modelDir/model.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
)
}
37 -> {
val modelDir = "sherpa-onnx-nemo-transducer-stt_de_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
transducer = OfflineTransducerModelConfig(
encoder = "$modelDir/encoder.int8.onnx",
decoder = "$modelDir/decoder.int8.onnx",
joiner = "$modelDir/joiner.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
modelType = "nemo_transducer",
)
}
38 -> {
val modelDir = "sherpa-onnx-nemo-stt_de_fastconformer_hybrid_large_pc-int8"
return OfflineModelConfig(
nemo = OfflineNemoEncDecCtcModelConfig(
model = "$modelDir/model.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
)
}
}
return null
}
... ...