Fangjun Kuang
Committed by GitHub

Support Parakeet models from NeMo (#1381)

@@ -31,7 +31,7 @@ jobs: @@ -31,7 +31,7 @@ jobs:
31 run: | 31 run: |
32 BRANCH='main' 32 BRANCH='main'
33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] 33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
34 - pip install onnxruntime 34 + pip install onnxruntime ipython
35 pip install kaldi-native-fbank 35 pip install kaldi-native-fbank
36 pip install soundfile librosa 36 pip install soundfile librosa
37 37
@@ -43,6 +43,43 @@ jobs: @@ -43,6 +43,43 @@ jobs:
43 43
44 mv -v sherpa-onnx-nemo* ../../.. 44 mv -v sherpa-onnx-nemo* ../../..
45 45
  46 + - name: Publish to huggingface
  47 + env:
  48 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  49 + uses: nick-fields/retry@v3
  50 + with:
  51 + max_attempts: 20
  52 + timeout_seconds: 200
  53 + shell: bash
  54 + command: |
  55 + git config --global user.email "csukuangfj@gmail.com"
  56 + git config --global user.name "Fangjun Kuang"
  57 +
  58 + models=(
  59 + sherpa-onnx-nemo-fast-conformer-ctc-en-24500
  60 + sherpa-onnx-nemo-fast-conformer-ctc-es-1424
  61 + sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
  62 + sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
  63 + sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
  64 + )
  65 +
  66 + for m in ${models[@]}; do
  67 + rm -rf huggingface
  68 + export GIT_LFS_SKIP_SMUDGE=1
  69 + export GIT_CLONE_PROTECTION_ACTIVE=false
  70 + git clone https://huggingface.co/csukuangfj/$m huggingface
  71 + cp -av $m/* huggingface
  72 + cd huggingface
  73 + git lfs track "*.onnx"
  74 + git status
  75 + git add .
  76 + git status
  77 + git commit -m "first commit"
  78 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
  79 + cd ..
  80 + rm -rf huggingface
  81 + done
  82 +
46 - name: Compress files 83 - name: Compress files
47 shell: bash 84 shell: bash
48 run: | 85 run: |
@@ -51,6 +88,7 @@ jobs: @@ -51,6 +88,7 @@ jobs:
51 sherpa-onnx-nemo-fast-conformer-ctc-es-1424 88 sherpa-onnx-nemo-fast-conformer-ctc-es-1424
52 sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288 89 sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
53 sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k 90 sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
  91 + sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
54 ) 92 )
55 for d in ${dirs[@]}; do 93 for d in ${dirs[@]}; do
56 tar cjvf ${d}.tar.bz2 ./$d 94 tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +103,5 @@ jobs: @@ -65,3 +103,5 @@ jobs:
65 repo_name: k2-fsa/sherpa-onnx 103 repo_name: k2-fsa/sherpa-onnx
66 repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} 104 repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
67 tag: asr-models 105 tag: asr-models
  106 +
  107 +
@@ -31,7 +31,7 @@ jobs: @@ -31,7 +31,7 @@ jobs:
31 run: | 31 run: |
32 BRANCH='main' 32 BRANCH='main'
33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] 33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
34 - pip install onnxruntime 34 + pip install onnxruntime ipython
35 pip install kaldi-native-fbank 35 pip install kaldi-native-fbank
36 pip install soundfile librosa 36 pip install soundfile librosa
37 37
@@ -31,7 +31,7 @@ jobs: @@ -31,7 +31,7 @@ jobs:
31 run: | 31 run: |
32 BRANCH='main' 32 BRANCH='main'
33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] 33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
34 - pip install onnxruntime 34 + pip install onnxruntime ipython
35 pip install kaldi-native-fbank 35 pip install kaldi-native-fbank
36 pip install soundfile librosa 36 pip install soundfile librosa
37 37
@@ -43,6 +43,42 @@ jobs: @@ -43,6 +43,42 @@ jobs:
43 43
44 mv -v sherpa-onnx-nemo* ../../.. 44 mv -v sherpa-onnx-nemo* ../../..
45 45
  46 + - name: Publish to huggingface
  47 + env:
  48 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  49 + uses: nick-fields/retry@v3
  50 + with:
  51 + max_attempts: 20
  52 + timeout_seconds: 200
  53 + shell: bash
  54 + command: |
  55 + git config --global user.email "csukuangfj@gmail.com"
  56 + git config --global user.name "Fangjun Kuang"
  57 +
  58 + models=(
  59 + sherpa-onnx-nemo-fast-conformer-transducer-en-24500
  60 + sherpa-onnx-nemo-fast-conformer-transducer-es-1424
  61 + sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
  62 + sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
  63 + sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
  64 + )
  65 +
  66 + for m in ${models[@]}; do
  67 + rm -rf huggingface
  68 + export GIT_LFS_SKIP_SMUDGE=1
  69 + export GIT_CLONE_PROTECTION_ACTIVE=false
  70 + git clone https://huggingface.co/csukuangfj/$m huggingface
  71 + cp -av $m/* huggingface
  72 + cd huggingface
  73 + git lfs track "*.onnx"
  74 + git status
  75 + git add .
  76 + git status
  77 + git commit -m "first commit"
  78 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
  79 + cd ..
  80 + done
  81 +
46 - name: Compress files 82 - name: Compress files
47 shell: bash 83 shell: bash
48 run: | 84 run: |
@@ -51,6 +87,7 @@ jobs: @@ -51,6 +87,7 @@ jobs:
51 sherpa-onnx-nemo-fast-conformer-transducer-es-1424 87 sherpa-onnx-nemo-fast-conformer-transducer-es-1424
52 sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288 88 sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
53 sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k 89 sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
  90 + sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
54 ) 91 )
55 for d in ${dirs[@]}; do 92 for d in ${dirs[@]}; do
56 tar cjvf ${d}.tar.bz2 ./$d 93 tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +102,5 @@ jobs: @@ -65,3 +102,5 @@ jobs:
65 repo_name: k2-fsa/sherpa-onnx 102 repo_name: k2-fsa/sherpa-onnx
66 repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} 103 repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
67 tag: asr-models 104 tag: asr-models
  105 +
  106 +
@@ -31,7 +31,7 @@ jobs: @@ -31,7 +31,7 @@ jobs:
31 run: | 31 run: |
32 BRANCH='main' 32 BRANCH='main'
33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] 33 pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
34 - pip install onnxruntime 34 + pip install onnxruntime ipython
35 pip install kaldi-native-fbank 35 pip install kaldi-native-fbank
36 pip install soundfile librosa 36 pip install soundfile librosa
37 37
@@ -139,7 +139,7 @@ jobs: @@ -139,7 +139,7 @@ jobs:
139 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH 139 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
140 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH 140 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
141 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH 141 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
142 - export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH 142 + export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
143 143
144 which sherpa-onnx 144 which sherpa-onnx
145 sherpa-onnx --help 145 sherpa-onnx --help
@@ -104,7 +104,7 @@ jobs: @@ -104,7 +104,7 @@ jobs:
104 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH 104 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
105 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH 105 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
106 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH 106 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
107 - export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH 107 + export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
108 108
109 sherpa-onnx --help 109 sherpa-onnx --help
110 sherpa-onnx-keyword-spotter --help 110 sherpa-onnx-keyword-spotter --help
@@ -22,4 +22,6 @@ This folder contains scripts for exporting models from @@ -22,4 +22,6 @@ This folder contains scripts for exporting models from
22 - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu 22 - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu
23 - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc 23 - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
24 24
  25 + - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
  26 +
25 to `sherpa-onnx`. 27 to `sherpa-onnx`.
@@ -9,6 +9,19 @@ log() { @@ -9,6 +9,19 @@ log() {
9 echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
10 } 10 }
11 11
  12 +# 36000 hours of English data
  13 +url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
  14 +name=$(basename $url)
  15 +doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
  16 +
  17 +log "Process $name at $url"
  18 +./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
  19 +d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
  20 +mkdir -p $d
  21 +mv -v model.onnx $d/
  22 +mv -v tokens.txt $d/
  23 +ls -lh $d
  24 +
12 # 8500 hours of English speech 25 # 8500 hours of English speech
13 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc 26 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
14 name=$(basename $url) 27 name=$(basename $url)
@@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2 @@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
66 rm spoken-language-identification-test-wavs.tar.bz2 79 rm spoken-language-identification-test-wavs.tar.bz2
67 data=spoken-language-identification-test-wavs 80 data=spoken-language-identification-test-wavs
68 81
  82 +curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
  83 +mv 2086-149220-0033.wav en.wav
  84 +
  85 +d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
  86 +python3 ./test-onnx-ctc-non-streaming.py \
  87 + --model $d/model.onnx \
  88 + --tokens $d/tokens.txt \
  89 + --wav $data/en-english.wav
  90 +mkdir -p $d/test_wavs
  91 +
  92 +cp en.wav $d/test_wavs/0.wav
  93 +cp -v $data/en-english.wav $d/test_wavs/1.wav
  94 +
69 d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500 95 d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500
70 python3 ./test-onnx-ctc-non-streaming.py \ 96 python3 ./test-onnx-ctc-non-streaming.py \
71 --model $d/model.onnx \ 97 --model $d/model.onnx \
72 --tokens $d/tokens.txt \ 98 --tokens $d/tokens.txt \
73 --wav $data/en-english.wav 99 --wav $data/en-english.wav
74 mkdir -p $d/test_wavs 100 mkdir -p $d/test_wavs
  101 +cp en.wav $d/test_wavs/0.wav
75 cp -v $data/en-english.wav $d/test_wavs 102 cp -v $data/en-english.wav $d/test_wavs
76 103
77 d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424 104 d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424
@@ -9,6 +9,19 @@ log() { @@ -9,6 +9,19 @@ log() {
9 echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 9 echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
10 } 10 }
11 11
  12 +# 36000 hours of English data
  13 +url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
  14 +name=$(basename $url)
  15 +doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
  16 +
  17 +log "Process $name at $url"
  18 +./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
  19 +d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
  20 +mkdir -p $d
  21 +mv -v *.onnx $d/
  22 +mv -v tokens.txt $d/
  23 +ls -lh $d
  24 +
12 # 8500 hours of English speech 25 # 8500 hours of English speech
13 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc 26 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
14 name=$(basename $url) 27 name=$(basename $url)
@@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2 @@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
66 rm spoken-language-identification-test-wavs.tar.bz2 79 rm spoken-language-identification-test-wavs.tar.bz2
67 data=spoken-language-identification-test-wavs 80 data=spoken-language-identification-test-wavs
68 81
  82 +curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
  83 +mv 2086-149220-0033.wav en.wav
  84 +
  85 +d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
  86 +python3 ./test-onnx-transducer-non-streaming.py \
  87 + --encoder $d/encoder.onnx \
  88 + --decoder $d/decoder.onnx \
  89 + --joiner $d/joiner.onnx \
  90 + --tokens $d/tokens.txt \
  91 + --wav $data/en-english.wav
  92 +
  93 +python3 ./test-onnx-transducer-non-streaming.py \
  94 + --encoder $d/encoder.onnx \
  95 + --decoder $d/decoder.onnx \
  96 + --joiner $d/joiner.onnx \
  97 + --tokens $d/tokens.txt \
  98 + --wav ./en.wav
  99 +
  100 +mkdir -p $d/test_wavs
  101 +cp en.wav $d/test_wavs/0.wav
  102 +cp -v $data/en-english.wav $d/test_wavs
  103 +
69 d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500 104 d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500
70 python3 ./test-onnx-transducer-non-streaming.py \ 105 python3 ./test-onnx-transducer-non-streaming.py \
71 --encoder $d/encoder.onnx \ 106 --encoder $d/encoder.onnx \
@@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \ @@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \
74 --tokens $d/tokens.txt \ 109 --tokens $d/tokens.txt \
75 --wav $data/en-english.wav 110 --wav $data/en-english.wav
76 mkdir -p $d/test_wavs 111 mkdir -p $d/test_wavs
  112 +cp en.wav $d/test_wavs/0.wav
77 cp -v $data/en-english.wav $d/test_wavs 113 cp -v $data/en-english.wav $d/test_wavs
78 114
79 d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424 115 d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424
@@ -141,7 +141,7 @@ def main(): @@ -141,7 +141,7 @@ def main():
141 assert model.normalize_type == "per_feature", model.normalize_type 141 assert model.normalize_type == "per_feature", model.normalize_type
142 features = torch.from_numpy(features) 142 features = torch.from_numpy(features)
143 mean = features.mean(dim=1, keepdims=True) 143 mean = features.mean(dim=1, keepdims=True)
144 - stddev = features.std(dim=1, keepdims=True) 144 + stddev = features.std(dim=1, keepdims=True) + 1e-5
145 features = (features - mean) / stddev 145 features = (features - mean) / stddev
146 features = features.numpy() 146 features = features.numpy()
147 147
@@ -268,7 +268,7 @@ def main(): @@ -268,7 +268,7 @@ def main():
268 assert model.normalize_type == "per_feature", model.normalize_type 268 assert model.normalize_type == "per_feature", model.normalize_type
269 features = torch.from_numpy(features) 269 features = torch.from_numpy(features)
270 mean = features.mean(dim=1, keepdims=True) 270 mean = features.mean(dim=1, keepdims=True)
271 - stddev = features.std(dim=1, keepdims=True) 271 + stddev = features.std(dim=1, keepdims=True) + 1e-5
272 features = (features - mean) / stddev 272 features = (features - mean) / stddev
273 features = features.numpy() 273 features = features.numpy()
274 print(audio.shape) 274 print(audio.shape)
@@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { @@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
103 config_.feat_config.is_mfcc = true; 103 config_.feat_config.is_mfcc = true;
104 } 104 }
105 105
  106 + if (!config_.model_config.nemo_ctc.model.empty()) {
  107 + config_.feat_config.low_freq = 0;
  108 + config_.feat_config.high_freq = 0;
  109 + config_.feat_config.is_librosa = true;
  110 + config_.feat_config.remove_dc_offset = false;
  111 + config_.feat_config.window_type = "hann";
  112 + }
  113 +
106 if (!config_.model_config.wenet_ctc.model.empty()) { 114 if (!config_.model_config.wenet_ctc.model.empty()) {
107 // WeNet CTC models assume input samples are in the range 115 // WeNet CTC models assume input samples are in the range
108 // [-32768, 32767], so we set normalize_samples to false 116 // [-32768, 32767], so we set normalize_samples to false