Support Parakeet models from NeMo (#1381)
Committed by GitHub
Showing 12 changed files with 160 additions and 8 deletions.
The CTC export workflow installs ipython alongside onnxruntime, adds a step that publishes the exported models (now including the Parakeet CTC model) to Hugging Face, and includes the new model directory in the compressed release artifacts:

@@ -31,7 +31,7 @@ jobs:
       run: |
         BRANCH='main'
         pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-        pip install onnxruntime
+        pip install onnxruntime ipython
         pip install kaldi-native-fbank
         pip install soundfile librosa
 
@@ -43,6 +43,43 @@ jobs:
 
         mv -v sherpa-onnx-nemo* ../../..
 
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            models=(
+              sherpa-onnx-nemo-fast-conformer-ctc-en-24500
+              sherpa-onnx-nemo-fast-conformer-ctc-es-1424
+              sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
+              sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+              sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
+            )
+
+            for m in ${models[@]}; do
+              rm -rf huggingface
+              export GIT_LFS_SKIP_SMUDGE=1
+              export GIT_CLONE_PROTECTION_ACTIVE=false
+              git clone https://huggingface.co/csukuangfj/$m huggingface
+              cp -av $m/* huggingface
+              cd huggingface
+              git lfs track "*.onnx"
+              git status
+              git add .
+              git status
+              git commit -m "first commit"
+              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
+              cd ..
+              rm -rf huggingface
+            done
+
       - name: Compress files
         shell: bash
         run: |
@@ -51,6 +88,7 @@ jobs:
           sherpa-onnx-nemo-fast-conformer-ctc-es-1424
           sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
           sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+          sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
         )
         for d in ${dirs[@]}; do
           tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +103,5 @@ jobs:
           repo_name: k2-fsa/sherpa-onnx
           repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
           tag: asr-models
+
+
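The publish step above drives git and git-lfs directly and retries the whole clone/commit/push cycle. For reference, an equivalent upload could also be scripted with the `huggingface_hub` Python package; the following is only an illustrative sketch (it is not part of this commit) and assumes the target repositories already exist under the `csukuangfj` namespace and that `HF_TOKEN` is set in the environment.

```python
# Hypothetical alternative to the git-based publish step above.
# Uploads each exported model directory to an existing Hugging Face repo.
import os
from huggingface_hub import HfApi

models = [
    "sherpa-onnx-nemo-fast-conformer-ctc-en-24500",
    "sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000",
]

api = HfApi(token=os.environ["HF_TOKEN"])
for m in models:
    api.upload_folder(
        folder_path=m,              # local directory produced by the export step
        repo_id=f"csukuangfj/{m}",  # destination model repository
        repo_type="model",
        commit_message="first commit",
    )
```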
A second workflow file receives the same dependency change:

@@ -31,7 +31,7 @@ jobs:
       run: |
         BRANCH='main'
         pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-        pip install onnxruntime
+        pip install onnxruntime ipython
         pip install kaldi-native-fbank
         pip install soundfile librosa
The transducer export workflow gets the analogous changes, with the transducer model names:

@@ -31,7 +31,7 @@ jobs:
       run: |
         BRANCH='main'
         pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-        pip install onnxruntime
+        pip install onnxruntime ipython
         pip install kaldi-native-fbank
         pip install soundfile librosa
 
@@ -43,6 +43,42 @@ jobs:
 
         mv -v sherpa-onnx-nemo* ../../..
 
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            models=(
+              sherpa-onnx-nemo-fast-conformer-transducer-en-24500
+              sherpa-onnx-nemo-fast-conformer-transducer-es-1424
+              sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
+              sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+              sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
+            )
+
+            for m in ${models[@]}; do
+              rm -rf huggingface
+              export GIT_LFS_SKIP_SMUDGE=1
+              export GIT_CLONE_PROTECTION_ACTIVE=false
+              git clone https://huggingface.co/csukuangfj/$m huggingface
+              cp -av $m/* huggingface
+              cd huggingface
+              git lfs track "*.onnx"
+              git status
+              git add .
+              git status
+              git commit -m "first commit"
+              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
+              cd ..
+            done
+
       - name: Compress files
         shell: bash
         run: |
@@ -51,6 +87,7 @@ jobs:
           sherpa-onnx-nemo-fast-conformer-transducer-es-1424
           sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
           sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
+          sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
         )
         for d in ${dirs[@]}; do
           tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +102,5 @@ jobs:
           repo_name: k2-fsa/sherpa-onnx
           repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
           tag: asr-models
+
+
A further workflow file receives the same dependency change:

@@ -31,7 +31,7 @@ jobs:
       run: |
         BRANCH='main'
         pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-        pip install onnxruntime
+        pip install onnxruntime ipython
         pip install kaldi-native-fbank
         pip install soundfile librosa
A Windows CI job updates the hosted-toolcache Python 3.12 path from 3.12.5 to 3.12.6:

@@ -139,7 +139,7 @@ jobs:
         export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
         export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
         export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-        export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
+        export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
 
         which sherpa-onnx
         sherpa-onnx --help
The same version bump in a second Windows job:

@@ -104,7 +104,7 @@ jobs:
         export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
         export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
         export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-        export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
+        export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
 
         sherpa-onnx --help
         sherpa-onnx-keyword-spotter --help
The README for the NeMo export scripts adds the new model to the list:

@@ -22,4 +22,6 @@ This folder contains scripts for exporting models from
 - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu
 - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
 
+- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
+
 to `sherpa-onnx`.
The CTC run script downloads the parakeet-tdt_ctc-110m checkpoint, exports its CTC branch to ONNX, and decodes an additional English test wave:

@@ -9,6 +9,19 @@ log() {
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 
+# 36000 hours of English data
+url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
+name=$(basename $url)
+doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
+
+log "Process $name at $url"
+./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
+d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
+mkdir -p $d
+mv -v model.onnx $d/
+mv -v tokens.txt $d/
+ls -lh $d
+
 # 8500 hours of English speech
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
 name=$(basename $url)
@@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
 rm spoken-language-identification-test-wavs.tar.bz2
 data=spoken-language-identification-test-wavs
 
+curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
+mv 2086-149220-0033.wav en.wav
+
+d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
+python3 ./test-onnx-ctc-non-streaming.py \
+  --model $d/model.onnx \
+  --tokens $d/tokens.txt \
+  --wav $data/en-english.wav
+mkdir -p $d/test_wavs
+
+cp en.wav $d/test_wavs/0.wav
+cp -v $data/en-english.wav $d/test_wavs/1.wav
+
 d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500
 python3 ./test-onnx-ctc-non-streaming.py \
   --model $d/model.onnx \
   --tokens $d/tokens.txt \
   --wav $data/en-english.wav
 mkdir -p $d/test_wavs
+cp en.wav $d/test_wavs/0.wav
 cp -v $data/en-english.wav $d/test_wavs
 
 d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424
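Once the export step above has produced `model.onnx` and `tokens.txt`, the model can also be decoded through the sherpa-onnx Python API. The snippet below is a minimal sketch rather than part of this commit; it assumes `pip install sherpa-onnx soundfile` and a 16 kHz test wave such as the `en.wav` downloaded above.

```python
# Minimal sketch: decode a wave with the exported NeMo/Parakeet CTC model
# using the sherpa-onnx Python API.
import sherpa_onnx
import soundfile as sf

d = "sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000"
recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
    model=f"{d}/model.onnx",
    tokens=f"{d}/tokens.txt",
)

samples, sample_rate = sf.read("en.wav", dtype="float32")
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples)
recognizer.decode_stream(stream)
print(stream.result.text)
```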
The transducer run script does the same for the transducer branch of the Parakeet model:

@@ -9,6 +9,19 @@ log() {
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 
+# 36000 hours of English data
+url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
+name=$(basename $url)
+doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
+
+log "Process $name at $url"
+./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
+d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
+mkdir -p $d
+mv -v *.onnx $d/
+mv -v tokens.txt $d/
+ls -lh $d
+
 # 8500 hours of English speech
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
 name=$(basename $url)
@@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
 rm spoken-language-identification-test-wavs.tar.bz2
 data=spoken-language-identification-test-wavs
 
+curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
+mv 2086-149220-0033.wav en.wav
+
+d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
+python3 ./test-onnx-transducer-non-streaming.py \
+  --encoder $d/encoder.onnx \
+  --decoder $d/decoder.onnx \
+  --joiner $d/joiner.onnx \
+  --tokens $d/tokens.txt \
+  --wav $data/en-english.wav
+
+python3 ./test-onnx-transducer-non-streaming.py \
+  --encoder $d/encoder.onnx \
+  --decoder $d/decoder.onnx \
+  --joiner $d/joiner.onnx \
+  --tokens $d/tokens.txt \
+  --wav ./en.wav
+
+mkdir -p $d/test_wavs
+cp en.wav $d/test_wavs/0.wav
+cp -v $data/en-english.wav $d/test_wavs
+
 d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500
 python3 ./test-onnx-transducer-non-streaming.py \
   --encoder $d/encoder.onnx \
@@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \
   --tokens $d/tokens.txt \
   --wav $data/en-english.wav
 mkdir -p $d/test_wavs
+cp en.wav $d/test_wavs/0.wav
 cp -v $data/en-english.wav $d/test_wavs
 
 d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424
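The transducer export splits the model into encoder, decoder, and joiner, so decoding it through the sherpa-onnx Python API takes three model files. Again a sketch, not part of this commit; the `model_type` value is what selects NeMo-style transducer decoding.

```python
# Minimal sketch: decode with the exported Parakeet transducer model
# via the sherpa-onnx Python API.
import sherpa_onnx
import soundfile as sf

d = "sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000"
recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder=f"{d}/encoder.onnx",
    decoder=f"{d}/decoder.onnx",
    joiner=f"{d}/joiner.onnx",
    tokens=f"{d}/tokens.txt",
    model_type="nemo_transducer",  # NeMo-style transducer decoding
)

samples, sample_rate = sf.read("en.wav", dtype="float32")
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples)
recognizer.decode_stream(stream)
print(stream.result.text)
```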
In the Python code that applies NeMo's per-feature normalization, a small epsilon is added to the standard deviation:

@@ -141,7 +141,7 @@ def main():
     assert model.normalize_type == "per_feature", model.normalize_type
     features = torch.from_numpy(features)
     mean = features.mean(dim=1, keepdims=True)
-    stddev = features.std(dim=1, keepdims=True)
+    stddev = features.std(dim=1, keepdims=True) + 1e-5
     features = (features - mean) / stddev
     features = features.numpy()
 
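The added `1e-5` keeps the division well defined when a feature dimension is nearly constant over time (for example on silent input), where the standard deviation would otherwise be zero. A standalone sketch of the computation, using a dummy tensor:

```python
# Per-feature normalization with an epsilon guard, as in the change above.
# Each feature dimension is normalized over the time axis (dim=1 here).
import torch

features = torch.randn(1, 100, 80)  # dummy (batch, num_frames, feature_dim)

mean = features.mean(dim=1, keepdim=True)
stddev = features.std(dim=1, keepdim=True) + 1e-5  # epsilon avoids division by zero
features = (features - mean) / stddev
print(features.shape)  # torch.Size([1, 100, 80])
```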
The same fix is applied in a second script:

@@ -268,7 +268,7 @@ def main():
     assert model.normalize_type == "per_feature", model.normalize_type
     features = torch.from_numpy(features)
     mean = features.mean(dim=1, keepdims=True)
-    stddev = features.std(dim=1, keepdims=True)
+    stddev = features.std(dim=1, keepdims=True) + 1e-5
     features = (features - mean) / stddev
     features = features.numpy()
     print(audio.shape)
In the C++ offline CTC recognizer, NeMo CTC models now configure the feature extractor with librosa-compatible mel filterbanks, a Hann window, no DC-offset removal, and the full frequency band:

@@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
       config_.feat_config.is_mfcc = true;
     }
 
+    if (!config_.model_config.nemo_ctc.model.empty()) {
+      config_.feat_config.low_freq = 0;
+      config_.feat_config.high_freq = 0;
+      config_.feat_config.is_librosa = true;
+      config_.feat_config.remove_dc_offset = false;
+      config_.feat_config.window_type = "hann";
+    }
+
     if (!config_.model_config.wenet_ctc.model.empty()) {
       // WeNet CTC models assume input samples are in the range
       // [-32768, 32767], so we set normalize_samples to false
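Here `high_freq = 0` means the upper mel-filter edge defaults to the Nyquist frequency, and `is_librosa` selects librosa-compatible mel filterbanks. For illustration only, roughly comparable log-mel features could be computed in Python as below; this is not the code sherpa-onnx runs, and the frame parameters (80 mel bins, 25 ms window, 10 ms hop) are assumptions rather than values taken from this commit.

```python
# Illustration of the feature settings above using librosa:
# Hann window, fmin=0, fmax=Nyquist, librosa-style (Slaney) mel filterbanks.
import librosa
import numpy as np

samples, sr = librosa.load("en.wav", sr=16000)

mel = librosa.feature.melspectrogram(
    y=samples,
    sr=sr,
    n_fft=512,
    win_length=400,   # 25 ms window (assumed)
    hop_length=160,   # 10 ms hop (assumed)
    window="hann",    # window_type = "hann"
    n_mels=80,        # assumed feature dimension
    fmin=0,           # low_freq = 0
    fmax=sr / 2,      # high_freq = 0 -> Nyquist
    htk=False,        # librosa/Slaney mel filterbanks (is_librosa = true)
)
log_mel = np.log(mel + 1e-10)
print(log_mel.shape)  # (80, num_frames)
```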