Support Parakeet models from NeMo (#1381)

Fangjun Kuang · GitHub
Commit 11f0cb7e1cad02419912593b476a7a9b0acc2c48 11f0cb7e 1 parent 12d04ce8
.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml
.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml
.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml
.github/workflows/test-build-wheel.yaml
.github/workflows/test-pip-install.yaml
scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh
scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh
scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py
scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py
sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml
查看文件 @11f0cb7
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml
查看文件 @11f0cb7
@@ -31,7 +31,7 @@ jobs:
         run: |
           BRANCH='main'
           pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
           pip install kaldi-native-fbank
           pip install soundfile librosa
@@ -43,6 +43,43 @@ jobs:
           mv -v sherpa-onnx-nemo* ../../..
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            models=(
+              sherpa-onnx-nemo-fast-conformer-ctc-en-24500
+              sherpa-onnx-nemo-fast-conformer-ctc-es-1424
+              sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
+              sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+              sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
+            )
+
+            for m in ${models[@]}; do
+              rm -rf huggingface
+              export GIT_LFS_SKIP_SMUDGE=1
+              export GIT_CLONE_PROTECTION_ACTIVE=false
+              git clone https://huggingface.co/csukuangfj/$m huggingface
+              cp -av $m/* huggingface
+              cd huggingface
+              git lfs track "*.onnx"
+              git status
+              git add .
+              git status
+              git commit -m "first commit"
+              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
+              cd ..
+              rm -rf huggingface
+            done
+
       - name: Compress files
         shell: bash
         run: |
@@ -51,6 +88,7 @@ jobs:
             sherpa-onnx-nemo-fast-conformer-ctc-es-1424
             sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
             sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+            sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
           )
           for d in ${dirs[@]}; do
             tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +103,5 @@ jobs:
           repo_name: k2-fsa/sherpa-onnx
           repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
           tag: asr-models
+
+
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
查看文件 @11f0cb7
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
查看文件 @11f0cb7
@@ -31,7 +31,7 @@ jobs:
         run: |
           BRANCH='main'
           pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
           pip install kaldi-native-fbank
           pip install soundfile librosa
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml
查看文件 @11f0cb7
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml
查看文件 @11f0cb7
@@ -31,7 +31,7 @@ jobs:
         run: |
           BRANCH='main'
           pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
           pip install kaldi-native-fbank
           pip install soundfile librosa
@@ -43,6 +43,42 @@ jobs:
           mv -v sherpa-onnx-nemo* ../../..
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            models=(
+              sherpa-onnx-nemo-fast-conformer-transducer-en-24500
+              sherpa-onnx-nemo-fast-conformer-transducer-es-1424
+              sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
+              sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+              sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
+            )
+
+            for m in ${models[@]}; do
+              rm -rf huggingface
+              export GIT_LFS_SKIP_SMUDGE=1
+              export GIT_CLONE_PROTECTION_ACTIVE=false
+              git clone https://huggingface.co/csukuangfj/$m huggingface
+              cp -av $m/* huggingface
+              cd huggingface
+              git lfs track "*.onnx"
+              git status
+              git add .
+              git status
+              git commit -m "first commit"
+              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
+              cd ..
+            done
+
       - name: Compress files
         shell: bash
         run: |
@@ -51,6 +87,7 @@ jobs:
             sherpa-onnx-nemo-fast-conformer-transducer-es-1424
             sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
             sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+            sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
           )
           for d in ${dirs[@]}; do
             tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +102,5 @@ jobs:
           repo_name: k2-fsa/sherpa-onnx
           repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
           tag: asr-models
+
+
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml
查看文件 @11f0cb7
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml
查看文件 @11f0cb7
@@ -31,7 +31,7 @@ jobs:
         run: |
           BRANCH='main'
           pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
           pip install kaldi-native-fbank
           pip install soundfile librosa
--- a/.github/workflows/test-build-wheel.yaml
查看文件 @11f0cb7
+++ b/.github/workflows/test-build-wheel.yaml
查看文件 @11f0cb7
@@ -139,7 +139,7 @@ jobs:
           export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-          export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
+          export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
           which sherpa-onnx
           sherpa-onnx --help
--- a/.github/workflows/test-pip-install.yaml
查看文件 @11f0cb7
+++ b/.github/workflows/test-pip-install.yaml
查看文件 @11f0cb7
@@ -104,7 +104,7 @@ jobs:
           export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-          export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
+          export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
           sherpa-onnx --help
           sherpa-onnx-keyword-spotter --help
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
查看文件 @11f0cb7
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
查看文件 @11f0cb7
@@ -22,4 +22,6 @@ This folder contains scripts for exporting models from
   - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu
   - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
+  - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
+
 to `sherpa-onnx`.
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh
查看文件 @11f0cb7
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh
查看文件 @11f0cb7
@@ -9,6 +9,19 @@ log() {
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
+# 36000 hours of English data
+url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
+name=$(basename $url)
+doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
+
+log "Process $name at $url"
+./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
+d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
+mkdir -p $d
+mv -v model.onnx $d/
+mv -v tokens.txt $d/
+ls -lh $d
+
 # 8500 hours of English speech
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
 name=$(basename $url)
@@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
 rm spoken-language-identification-test-wavs.tar.bz2
 data=spoken-language-identification-test-wavs
+curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
+mv 2086-149220-0033.wav en.wav
+
+d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
+python3 ./test-onnx-ctc-non-streaming.py \
+  --model $d/model.onnx \
+  --tokens $d/tokens.txt \
+  --wav $data/en-english.wav
+mkdir -p $d/test_wavs
+
+cp en.wav $d/test_wavs/0.wav
+cp -v $data/en-english.wav $d/test_wavs/1.wav
+
 d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500
 python3 ./test-onnx-ctc-non-streaming.py \
   --model $d/model.onnx \
   --tokens $d/tokens.txt \
   --wav $data/en-english.wav
 mkdir -p $d/test_wavs
+cp en.wav $d/test_wavs/0.wav
 cp -v $data/en-english.wav $d/test_wavs
 d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh
查看文件 @11f0cb7
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh
查看文件 @11f0cb7
@@ -9,6 +9,19 @@ log() {
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
+# 36000 hours of English data
+url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
+name=$(basename $url)
+doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
+
+log "Process $name at $url"
+./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
+d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
+mkdir -p $d
+mv -v *.onnx $d/
+mv -v tokens.txt $d/
+ls -lh $d
+
 # 8500 hours of English speech
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
 name=$(basename $url)
@@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
 rm spoken-language-identification-test-wavs.tar.bz2
 data=spoken-language-identification-test-wavs
+curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
+mv 2086-149220-0033.wav en.wav
+
+d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
+python3 ./test-onnx-transducer-non-streaming.py \
+  --encoder $d/encoder.onnx \
+  --decoder $d/decoder.onnx \
+  --joiner $d/joiner.onnx \
+  --tokens $d/tokens.txt \
+  --wav $data/en-english.wav
+
+python3 ./test-onnx-transducer-non-streaming.py \
+  --encoder $d/encoder.onnx \
+  --decoder $d/decoder.onnx \
+  --joiner $d/joiner.onnx \
+  --tokens $d/tokens.txt \
+  --wav ./en.wav
+
+mkdir -p $d/test_wavs
+cp en.wav $d/test_wavs/0.wav
+cp -v $data/en-english.wav $d/test_wavs
+
 d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500
 python3 ./test-onnx-transducer-non-streaming.py \
   --encoder $d/encoder.onnx \
@@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \
   --tokens $d/tokens.txt \
   --wav $data/en-english.wav
 mkdir -p $d/test_wavs
+cp en.wav $d/test_wavs/0.wav
 cp -v $data/en-english.wav $d/test_wavs
 d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py
查看文件 @11f0cb7
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py
查看文件 @11f0cb7
@@ -141,7 +141,7 @@ def main():
         assert model.normalize_type == "per_feature", model.normalize_type
         features = torch.from_numpy(features)
         mean = features.mean(dim=1, keepdims=True)
-        stddev = features.std(dim=1, keepdims=True)
+        stddev = features.std(dim=1, keepdims=True) + 1e-5
         features = (features - mean) / stddev
         features = features.numpy()
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py
查看文件 @11f0cb7
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py
查看文件 @11f0cb7
@@ -268,7 +268,7 @@ def main():
         assert model.normalize_type == "per_feature", model.normalize_type
         features = torch.from_numpy(features)
         mean = features.mean(dim=1, keepdims=True)
-        stddev = features.std(dim=1, keepdims=True)
+        stddev = features.std(dim=1, keepdims=True) + 1e-5
         features = (features - mean) / stddev
         features = features.numpy()
     print(audio.shape)
--- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
查看文件 @11f0cb7
+++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
查看文件 @11f0cb7
@@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
       config_.feat_config.is_mfcc = true;
     }
+    if (!config_.model_config.nemo_ctc.model.empty()) {
+      config_.feat_config.low_freq = 0;
+      config_.feat_config.high_freq = 0;
+      config_.feat_config.is_librosa = true;
+      config_.feat_config.remove_dc_offset = false;
+      config_.feat_config.window_type = "hann";
+    }
+
     if (!config_.model_config.wenet_ctc.model.empty()) {
       // WeNet CTC models assume input samples are in the range
       // [-32768, 32767], so we set normalize_samples to false