Committed by
GitHub
Add C++ runtime for Tele-AI/TeleSpeech-ASR (#970)
正在显示 52 个修改的文件，包含 1019 行增加和 112 行删除
| @@ -2,7 +2,16 @@ | @@ -2,7 +2,16 @@ | ||
| 2 | 2 | ||
| 3 | cd dotnet-examples/ | 3 | cd dotnet-examples/ |
| 4 | 4 | ||
| 5 | -cd vad-non-streaming-asr-paraformer | 5 | +cd ./offline-decode-files |
| 6 | +./run-telespeech-ctc.sh | ||
| 7 | +./run-nemo-ctc.sh | ||
| 8 | +./run-paraformer.sh | ||
| 9 | +./run-zipformer.sh | ||
| 10 | +./run-hotwords.sh | ||
| 11 | +./run-whisper.sh | ||
| 12 | +./run-tdnn-yesno.sh | ||
| 13 | + | ||
| 14 | +cd ../vad-non-streaming-asr-paraformer | ||
| 6 | ./run.sh | 15 | ./run.sh |
| 7 | 16 | ||
| 8 | cd ../offline-punctuation | 17 | cd ../offline-punctuation |
| @@ -22,14 +31,6 @@ cd ../online-decode-files | @@ -22,14 +31,6 @@ cd ../online-decode-files | ||
| 22 | ./run-transducer.sh | 31 | ./run-transducer.sh |
| 23 | ./run-paraformer.sh | 32 | ./run-paraformer.sh |
| 24 | 33 | ||
| 25 | -cd ../offline-decode-files | ||
| 26 | -./run-nemo-ctc.sh | ||
| 27 | -./run-paraformer.sh | ||
| 28 | -./run-zipformer.sh | ||
| 29 | -./run-hotwords.sh | ||
| 30 | -./run-whisper.sh | ||
| 31 | -./run-tdnn-yesno.sh | ||
| 32 | - | ||
| 33 | cd ../offline-tts | 34 | cd ../offline-tts |
| 34 | ./run-aishell3.sh | 35 | ./run-aishell3.sh |
| 35 | ./run-piper.sh | 36 | ./run-piper.sh |
| @@ -15,6 +15,39 @@ echo "PATH: $PATH" | @@ -15,6 +15,39 @@ echo "PATH: $PATH" | ||
| 15 | 15 | ||
| 16 | which $EXE | 16 | which $EXE |
| 17 | 17 | ||
| 18 | +log "test offline TeleSpeech CTC" | ||
| 19 | +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 20 | +name=$(basename $url) | ||
| 21 | +repo=$(basename -s .tar.bz2 $name) | ||
| 22 | + | ||
| 23 | +curl -SL -O $url | ||
| 24 | +tar xvf $name | ||
| 25 | +rm $name | ||
| 26 | +ls -lh $repo | ||
| 27 | + | ||
| 28 | +test_wavs=( | ||
| 29 | +3-sichuan.wav | ||
| 30 | +4-tianjin.wav | ||
| 31 | +5-henan.wav | ||
| 32 | +) | ||
| 33 | +for w in ${test_wavs[@]}; do | ||
| 34 | + time $EXE \ | ||
| 35 | + --tokens=$repo/tokens.txt \ | ||
| 36 | + --telespeech-ctc=$repo/model.int8.onnx \ | ||
| 37 | + --debug=1 \ | ||
| 38 | + $repo/test_wavs/$w | ||
| 39 | +done | ||
| 40 | + | ||
| 41 | +time $EXE \ | ||
| 42 | + --tokens=$repo/tokens.txt \ | ||
| 43 | + --telespeech-ctc=$repo/model.int8.onnx \ | ||
| 44 | + --debug=1 \ | ||
| 45 | + $repo/test_wavs/3-sichuan.wav \ | ||
| 46 | + $repo/test_wavs/4-tianjin.wav \ | ||
| 47 | + $repo/test_wavs/5-henan.wav | ||
| 48 | + | ||
| 49 | +rm -rf $repo | ||
| 50 | + | ||
| 18 | log "-----------------------------------------------------------------" | 51 | log "-----------------------------------------------------------------" |
| 19 | log "Run Nemo fast conformer hybrid transducer ctc models (CTC branch)" | 52 | log "Run Nemo fast conformer hybrid transducer ctc models (CTC branch)" |
| 20 | log "-----------------------------------------------------------------" | 53 | log "-----------------------------------------------------------------" |
| @@ -10,6 +10,18 @@ log() { | @@ -10,6 +10,18 @@ log() { | ||
| 10 | 10 | ||
| 11 | export GIT_CLONE_PROTECTION_ACTIVE=false | 11 | export GIT_CLONE_PROTECTION_ACTIVE=false |
| 12 | 12 | ||
| 13 | +log "test offline TeleSpeech CTC" | ||
| 14 | +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 15 | +name=$(basename $url) | ||
| 16 | +repo=$(basename -s .tar.bz2 $name) | ||
| 17 | + | ||
| 18 | +curl -SL -O $url | ||
| 19 | +tar xvf $name | ||
| 20 | +rm $name | ||
| 21 | +ls -lh $repo | ||
| 22 | +python3 ./python-api-examples/offline-telespeech-ctc-decode-files.py | ||
| 23 | +rm -rf $repo | ||
| 24 | + | ||
| 13 | log "test online NeMo CTC" | 25 | log "test online NeMo CTC" |
| 14 | 26 | ||
| 15 | url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2 | 27 | url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2 |
| @@ -82,7 +82,7 @@ jobs: | @@ -82,7 +82,7 @@ jobs: | ||
| 82 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} | 82 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} |
| 83 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} | 83 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} |
| 84 | run: | | 84 | run: | |
| 85 | - python3 -m pip install --upgrade pip | ||
| 86 | - python3 -m pip install wheel twine setuptools | 85 | + python3 -m pip install --break-system-packages --upgrade pip |
| 86 | + python3 -m pip install --break-system-packages wheel twine setuptools | ||
| 87 | 87 | ||
| 88 | twine upload ./wheelhouse/*.whl | 88 | twine upload ./wheelhouse/*.whl |
| 1 | +name: build-wheels-macos-universal2 | ||
| 2 | + | ||
| 3 | +on: | ||
| 4 | + push: | ||
| 5 | + branches: | ||
| 6 | + - wheel | ||
| 7 | + tags: | ||
| 8 | + - '*' | ||
| 9 | + workflow_dispatch: | ||
| 10 | + | ||
| 11 | +env: | ||
| 12 | + SHERPA_ONNX_IS_IN_GITHUB_ACTIONS: 1 | ||
| 13 | + | ||
| 14 | +concurrency: | ||
| 15 | + group: build-wheels-macos-universal2-${{ github.ref }} | ||
| 16 | + cancel-in-progress: true | ||
| 17 | + | ||
| 18 | +jobs: | ||
| 19 | + build_wheels_macos_universal2: | ||
| 20 | + name: ${{ matrix.python-version }} | ||
| 21 | + runs-on: ${{ matrix.os }} | ||
| 22 | + strategy: | ||
| 23 | + fail-fast: false | ||
| 24 | + matrix: | ||
| 25 | + os: [macos-latest] | ||
| 26 | + python-version: ["cp38", "cp39", "cp310", "cp311", "cp312"] | ||
| 27 | + | ||
| 28 | + steps: | ||
| 29 | + - uses: actions/checkout@v4 | ||
| 30 | + | ||
| 31 | + - name: Build wheels | ||
| 32 | + uses: pypa/cibuildwheel@v2.15.0 | ||
| 33 | + env: | ||
| 34 | + CIBW_BUILD: "${{ matrix.python-version}}-* " | ||
| 35 | + CIBW_ENVIRONMENT: SHERPA_ONNX_CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES='arm64;x86_64'" | ||
| 36 | + CIBW_ARCHS: "universal2" | ||
| 37 | + CIBW_BUILD_VERBOSITY: 3 | ||
| 38 | + | ||
| 39 | + # Don't repair macOS wheels | ||
| 40 | + CIBW_REPAIR_WHEEL_COMMAND_MACOS: "" | ||
| 41 | + | ||
| 42 | + - name: Display wheels | ||
| 43 | + shell: bash | ||
| 44 | + run: | | ||
| 45 | + ls -lh ./wheelhouse/ | ||
| 46 | + | ||
| 47 | + - uses: actions/upload-artifact@v4 | ||
| 48 | + with: | ||
| 49 | + name: wheel-${{ matrix.python-version }} | ||
| 50 | + path: ./wheelhouse/*.whl | ||
| 51 | + | ||
| 52 | + - name: Publish to huggingface | ||
| 53 | + if: matrix.python-version == 'cp38' | ||
| 54 | + env: | ||
| 55 | + HF_TOKEN: ${{ secrets.HF_TOKEN }} | ||
| 56 | + uses: nick-fields/retry@v3 | ||
| 57 | + with: | ||
| 58 | + max_attempts: 20 | ||
| 59 | + timeout_seconds: 200 | ||
| 60 | + shell: bash | ||
| 61 | + command: | | ||
| 62 | + git config --global user.email "csukuangfj@gmail.com" | ||
| 63 | + git config --global user.name "Fangjun Kuang" | ||
| 64 | + | ||
| 65 | + rm -rf huggingface | ||
| 66 | + export GIT_LFS_SKIP_SMUDGE=1 | ||
| 67 | + export GIT_CLONE_PROTECTION_ACTIVE=false | ||
| 68 | + | ||
| 69 | + git clone https://huggingface.co/csukuangfj/sherpa-onnx-wheels huggingface | ||
| 70 | + cd huggingface | ||
| 71 | + git fetch | ||
| 72 | + git pull | ||
| 73 | + git merge -m "merge remote" --ff origin main | ||
| 74 | + | ||
| 75 | + cp -v ../wheelhouse/*.whl . | ||
| 76 | + | ||
| 77 | + git status | ||
| 78 | + git add . | ||
| 79 | + git commit -m "add more wheels" | ||
| 80 | + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-wheels main | ||
| 81 | + | ||
| 82 | + - name: Publish wheels to PyPI | ||
| 83 | + env: | ||
| 84 | + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} | ||
| 85 | + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} | ||
| 86 | + run: | | ||
| 87 | + python3 -m pip install --break-system-packages --upgrade pip | ||
| 88 | + python3 -m pip install --break-system-packages wheel twine setuptools | ||
| 89 | + | ||
| 90 | + twine upload ./wheelhouse/*.whl |
| @@ -99,7 +99,7 @@ jobs: | @@ -99,7 +99,7 @@ jobs: | ||
| 99 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} | 99 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} |
| 100 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} | 100 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} |
| 101 | run: | | 101 | run: | |
| 102 | - python3 -m pip install --upgrade pip | ||
| 103 | - python3 -m pip install wheel twine setuptools | 102 | + python3 -m pip install --break-system-packages --upgrade pip |
| 103 | + python3 -m pip install --break-system-packages wheel twine setuptools | ||
| 104 | 104 | ||
| 105 | twine upload ./wheelhouse/*.whl | 105 | twine upload ./wheelhouse/*.whl |
| @@ -48,3 +48,49 @@ jobs: | @@ -48,3 +48,49 @@ jobs: | ||
| 48 | repo_name: k2-fsa/sherpa-onnx | 48 | repo_name: k2-fsa/sherpa-onnx |
| 49 | repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} | 49 | repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} |
| 50 | tag: asr-models | 50 | tag: asr-models |
| 51 | + | ||
| 52 | + - name: Publish float32 model to huggingface | ||
| 53 | + shell: bash | ||
| 54 | + env: | ||
| 55 | + HF_TOKEN: ${{ secrets.HF_TOKEN }} | ||
| 56 | + run: | | ||
| 57 | + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-zh-2024-06-04 | ||
| 58 | + git config --global user.email "csukuangfj@gmail.com" | ||
| 59 | + git config --global user.name "Fangjun Kuang" | ||
| 60 | + | ||
| 61 | + export GIT_CLONE_PROTECTION_ACTIVE=false | ||
| 62 | + | ||
| 63 | + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 hf | ||
| 64 | + cp -a $src/* hf/ | ||
| 65 | + cd hf | ||
| 66 | + git lfs track "*.pdf" | ||
| 67 | + git lfs track "*.onnx" | ||
| 68 | + git add . | ||
| 69 | + git commit -m 'add model files' || true | ||
| 70 | + git status | ||
| 71 | + ls -lh | ||
| 72 | + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 main || true | ||
| 73 | + rm -rf hf | ||
| 74 | + | ||
| 75 | + - name: Publish int8 model to huggingface | ||
| 76 | + shell: bash | ||
| 77 | + env: | ||
| 78 | + HF_TOKEN: ${{ secrets.HF_TOKEN }} | ||
| 79 | + run: | | ||
| 80 | + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 | ||
| 81 | + git config --global user.email "csukuangfj@gmail.com" | ||
| 82 | + git config --global user.name "Fangjun Kuang" | ||
| 83 | + | ||
| 84 | + export GIT_CLONE_PROTECTION_ACTIVE=false | ||
| 85 | + | ||
| 86 | + rm -rf hf | ||
| 87 | + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 hf | ||
| 88 | + cp -a $src/* hf/ | ||
| 89 | + cd hf | ||
| 90 | + git lfs track "*.pdf" | ||
| 91 | + git lfs track "*.onnx" | ||
| 92 | + git add . | ||
| 93 | + git commit -m 'add model files' || true | ||
| 94 | + git status | ||
| 95 | + ls -lh | ||
| 96 | + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 main || true |
| @@ -130,34 +130,34 @@ jobs: | @@ -130,34 +130,34 @@ jobs: | ||
| 130 | name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} | 130 | name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} |
| 131 | path: install/* | 131 | path: install/* |
| 132 | 132 | ||
| 133 | - - name: Test online transducer | 133 | + - name: Test offline CTC |
| 134 | shell: bash | 134 | shell: bash |
| 135 | run: | | 135 | run: | |
| 136 | du -h -d1 . | 136 | du -h -d1 . |
| 137 | export PATH=$PWD/build/bin:$PATH | 137 | export PATH=$PWD/build/bin:$PATH |
| 138 | - export EXE=sherpa-onnx | 138 | + export EXE=sherpa-onnx-offline |
| 139 | 139 | ||
| 140 | - .github/scripts/test-online-transducer.sh | 140 | + .github/scripts/test-offline-ctc.sh |
| 141 | du -h -d1 . | 141 | du -h -d1 . |
| 142 | 142 | ||
| 143 | - - name: Test online transducer (C API) | 143 | + - name: Test online transducer |
| 144 | shell: bash | 144 | shell: bash |
| 145 | run: | | 145 | run: | |
| 146 | du -h -d1 . | 146 | du -h -d1 . |
| 147 | export PATH=$PWD/build/bin:$PATH | 147 | export PATH=$PWD/build/bin:$PATH |
| 148 | - export EXE=decode-file-c-api | 148 | + export EXE=sherpa-onnx |
| 149 | 149 | ||
| 150 | .github/scripts/test-online-transducer.sh | 150 | .github/scripts/test-online-transducer.sh |
| 151 | du -h -d1 . | 151 | du -h -d1 . |
| 152 | 152 | ||
| 153 | - - name: Test offline CTC | 153 | + - name: Test online transducer (C API) |
| 154 | shell: bash | 154 | shell: bash |
| 155 | run: | | 155 | run: | |
| 156 | du -h -d1 . | 156 | du -h -d1 . |
| 157 | export PATH=$PWD/build/bin:$PATH | 157 | export PATH=$PWD/build/bin:$PATH |
| 158 | - export EXE=sherpa-onnx-offline | 158 | + export EXE=decode-file-c-api |
| 159 | 159 | ||
| 160 | - .github/scripts/test-offline-ctc.sh | 160 | + .github/scripts/test-online-transducer.sh |
| 161 | du -h -d1 . | 161 | du -h -d1 . |
| 162 | 162 | ||
| 163 | - name: Test spoken language identification (C++ API) | 163 | - name: Test spoken language identification (C++ API) |
| @@ -107,6 +107,14 @@ jobs: | @@ -107,6 +107,14 @@ jobs: | ||
| 107 | otool -L build/bin/sherpa-onnx | 107 | otool -L build/bin/sherpa-onnx |
| 108 | otool -l build/bin/sherpa-onnx | 108 | otool -l build/bin/sherpa-onnx |
| 109 | 109 | ||
| 110 | + - name: Test offline CTC | ||
| 111 | + shell: bash | ||
| 112 | + run: | | ||
| 113 | + export PATH=$PWD/build/bin:$PATH | ||
| 114 | + export EXE=sherpa-onnx-offline | ||
| 115 | + | ||
| 116 | + .github/scripts/test-offline-ctc.sh | ||
| 117 | + | ||
| 110 | - name: Test offline transducer | 118 | - name: Test offline transducer |
| 111 | shell: bash | 119 | shell: bash |
| 112 | run: | | 120 | run: | |
| @@ -192,13 +200,7 @@ jobs: | @@ -192,13 +200,7 @@ jobs: | ||
| 192 | 200 | ||
| 193 | .github/scripts/test-offline-whisper.sh | 201 | .github/scripts/test-offline-whisper.sh |
| 194 | 202 | ||
| 195 | - - name: Test offline CTC | ||
| 196 | - shell: bash | ||
| 197 | - run: | | ||
| 198 | - export PATH=$PWD/build/bin:$PATH | ||
| 199 | - export EXE=sherpa-onnx-offline | ||
| 200 | 203 | ||
| 201 | - .github/scripts/test-offline-ctc.sh | ||
| 202 | 204 | ||
| 203 | - name: Test online transducer | 205 | - name: Test online transducer |
| 204 | shell: bash | 206 | shell: bash |
| @@ -30,14 +30,12 @@ concurrency: | @@ -30,14 +30,12 @@ concurrency: | ||
| 30 | 30 | ||
| 31 | jobs: | 31 | jobs: |
| 32 | test-go: | 32 | test-go: |
| 33 | - name: ${{ matrix.os }} ${{matrix.arch }} | 33 | + name: ${{ matrix.os }} |
| 34 | runs-on: ${{ matrix.os }} | 34 | runs-on: ${{ matrix.os }} |
| 35 | strategy: | 35 | strategy: |
| 36 | fail-fast: false | 36 | fail-fast: false |
| 37 | matrix: | 37 | matrix: |
| 38 | - include: | ||
| 39 | - - os: macos-latest | ||
| 40 | - arch: amd64 | 38 | + os: [macos-latest, macos-14] |
| 41 | 39 | ||
| 42 | steps: | 40 | steps: |
| 43 | - uses: actions/checkout@v4 | 41 | - uses: actions/checkout@v4 |
| @@ -47,7 +45,7 @@ jobs: | @@ -47,7 +45,7 @@ jobs: | ||
| 47 | - name: ccache | 45 | - name: ccache |
| 48 | uses: hendrikmuhs/ccache-action@v1.2 | 46 | uses: hendrikmuhs/ccache-action@v1.2 |
| 49 | with: | 47 | with: |
| 50 | - key: ${{ matrix.os }}-${{ matrix.arch }} | 48 | + key: ${{ matrix.os }}-go |
| 51 | 49 | ||
| 52 | - uses: actions/setup-go@v5 | 50 | - uses: actions/setup-go@v5 |
| 53 | with: | 51 | with: |
| @@ -109,8 +107,6 @@ jobs: | @@ -109,8 +107,6 @@ jobs: | ||
| 109 | go build | 107 | go build |
| 110 | ls -lh | 108 | ls -lh |
| 111 | 109 | ||
| 112 | - git lfs install | ||
| 113 | - | ||
| 114 | echo "Test vits-ljs" | 110 | echo "Test vits-ljs" |
| 115 | ./run-vits-ljs.sh | 111 | ./run-vits-ljs.sh |
| 116 | rm -rf vits-ljs | 112 | rm -rf vits-ljs |
| @@ -144,7 +140,13 @@ jobs: | @@ -144,7 +140,13 @@ jobs: | ||
| 144 | go build | 140 | go build |
| 145 | ls -lh | 141 | ls -lh |
| 146 | 142 | ||
| 147 | - git lfs install | 143 | + echo "Test telespeech ctc" |
| 144 | + ./run-telespeech-ctc.sh | ||
| 145 | + rm -rf sherpa-onnx-telespeech-ctc-* | ||
| 146 | + | ||
| 147 | + echo "Test transducer" | ||
| 148 | + ./run-transducer.sh | ||
| 149 | + rm -rf sherpa-onnx-zipformer-en-2023-06-26 | ||
| 148 | 150 | ||
| 149 | echo "Test transducer" | 151 | echo "Test transducer" |
| 150 | ./run-transducer.sh | 152 | ./run-transducer.sh |
| @@ -57,7 +57,7 @@ jobs: | @@ -57,7 +57,7 @@ jobs: | ||
| 57 | 57 | ||
| 58 | mkdir build | 58 | mkdir build |
| 59 | cd build | 59 | cd build |
| 60 | - cmake -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install .. | 60 | + cmake -DSHERPA_ONNX_ENABLE_EPSEAK_NG_EXE=ON -DBUILD_ESPEAK_NG_EXE=ON -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install .. |
| 61 | 61 | ||
| 62 | - name: Build | 62 | - name: Build |
| 63 | shell: bash | 63 | shell: bash |
| @@ -6,7 +6,7 @@ set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment ve | @@ -6,7 +6,7 @@ set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment ve | ||
| 6 | 6 | ||
| 7 | project(sherpa-onnx) | 7 | project(sherpa-onnx) |
| 8 | 8 | ||
| 9 | -set(SHERPA_ONNX_VERSION "1.9.26") | 9 | +set(SHERPA_ONNX_VERSION "1.9.27") |
| 10 | 10 | ||
| 11 | # Disable warning about | 11 | # Disable warning about |
| 12 | # | 12 | # |
| @@ -14,7 +14,9 @@ function(download_espeak_ng_for_piper) | @@ -14,7 +14,9 @@ function(download_espeak_ng_for_piper) | ||
| 14 | set(USE_SPEECHPLAYER OFF CACHE BOOL "" FORCE) | 14 | set(USE_SPEECHPLAYER OFF CACHE BOOL "" FORCE) |
| 15 | set(EXTRA_cmn ON CACHE BOOL "" FORCE) | 15 | set(EXTRA_cmn ON CACHE BOOL "" FORCE) |
| 16 | set(EXTRA_ru ON CACHE BOOL "" FORCE) | 16 | set(EXTRA_ru ON CACHE BOOL "" FORCE) |
| 17 | + if (NOT SHERPA_ONNX_ENABLE_EPSEAK_NG_EXE) | ||
| 17 | set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE) | 18 | set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE) |
| 19 | + endif() | ||
| 18 | 20 | ||
| 19 | # If you don't have access to the Internet, | 21 | # If you don't have access to the Internet, |
| 20 | # please pre-download kaldi-decoder | 22 | # please pre-download kaldi-decoder |
| 1 | function(download_kaldi_native_fbank) | 1 | function(download_kaldi_native_fbank) |
| 2 | include(FetchContent) | 2 | include(FetchContent) |
| 3 | 3 | ||
| 4 | - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz") | ||
| 5 | - set(kaldi_native_fbank_URL2 "https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz") | ||
| 6 | - set(kaldi_native_fbank_HASH "SHA256=0cae8cbb9ea42916b214e088912f9e8f2f648f54756b305f93f552382f31f904") | 4 | + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz") |
| 5 | + set(kaldi_native_fbank_URL2 "https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz") | ||
| 6 | + set(kaldi_native_fbank_HASH "SHA256=335fe1daf1b9bfb2a7b6bf03b64c4c4686c39077c57fb8058c02611981676638") | ||
| 7 | 7 | ||
| 8 | set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) | 8 | set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) |
| 9 | set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) | 9 | set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) |
| @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) | @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) | ||
| 12 | # If you don't have access to the Internet, | 12 | # If you don't have access to the Internet, |
| 13 | # please pre-download kaldi-native-fbank | 13 | # please pre-download kaldi-native-fbank |
| 14 | set(possible_file_locations | 14 | set(possible_file_locations |
| 15 | - $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.1.tar.gz | ||
| 16 | - ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.1.tar.gz | ||
| 17 | - ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.1.tar.gz | ||
| 18 | - /tmp/kaldi-native-fbank-1.19.1.tar.gz | ||
| 19 | - /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.1.tar.gz | 15 | + $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.3.tar.gz |
| 16 | + ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.3.tar.gz | ||
| 17 | + ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.3.tar.gz | ||
| 18 | + /tmp/kaldi-native-fbank-1.19.3.tar.gz | ||
| 19 | + /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.3.tar.gz | ||
| 20 | ) | 20 | ) |
| 21 | 21 | ||
| 22 | foreach(f IN LISTS possible_file_locations) | 22 | foreach(f IN LISTS possible_file_locations) |
| @@ -34,6 +34,9 @@ class OfflineDecodeFiles | @@ -34,6 +34,9 @@ class OfflineDecodeFiles | ||
| 34 | [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] | 34 | [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] |
| 35 | public string Joiner { get; set; } | 35 | public string Joiner { get; set; } |
| 36 | 36 | ||
| 37 | + [Option("model-type", Required = false, Default = "", HelpText = "model type")] | ||
| 38 | + public string ModelType { get; set; } | ||
| 39 | + | ||
| 37 | [Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")] | 40 | [Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")] |
| 38 | public string WhisperEncoder { get; set; } | 41 | public string WhisperEncoder { get; set; } |
| 39 | 42 | ||
| @@ -56,6 +59,9 @@ class OfflineDecodeFiles | @@ -56,6 +59,9 @@ class OfflineDecodeFiles | ||
| 56 | [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] | 59 | [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] |
| 57 | public string NeMoCtc { get; set; } | 60 | public string NeMoCtc { get; set; } |
| 58 | 61 | ||
| 62 | + [Option("telespeech-ctc", Required = false, HelpText = "Path to model.onnx. Used only for TeleSpeech CTC models")] | ||
| 63 | + public string TeleSpeechCtc { get; set; } | ||
| 64 | + | ||
| 59 | [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")] | 65 | [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")] |
| 60 | public int NumThreads { get; set; } | 66 | public int NumThreads { get; set; } |
| 61 | 67 | ||
| @@ -201,6 +207,10 @@ to download pre-trained Tdnn models. | @@ -201,6 +207,10 @@ to download pre-trained Tdnn models. | ||
| 201 | { | 207 | { |
| 202 | config.ModelConfig.NeMoCtc.Model = options.NeMoCtc; | 208 | config.ModelConfig.NeMoCtc.Model = options.NeMoCtc; |
| 203 | } | 209 | } |
| 210 | + else if (!String.IsNullOrEmpty(options.TeleSpeechCtc)) | ||
| 211 | + { | ||
| 212 | + config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc; | ||
| 213 | + } | ||
| 204 | else if (!String.IsNullOrEmpty(options.WhisperEncoder)) | 214 | else if (!String.IsNullOrEmpty(options.WhisperEncoder)) |
| 205 | { | 215 | { |
| 206 | config.ModelConfig.Whisper.Encoder = options.WhisperEncoder; | 216 | config.ModelConfig.Whisper.Encoder = options.WhisperEncoder; |
| @@ -218,6 +228,7 @@ to download pre-trained Tdnn models. | @@ -218,6 +228,7 @@ to download pre-trained Tdnn models. | ||
| 218 | return; | 228 | return; |
| 219 | } | 229 | } |
| 220 | 230 | ||
| 231 | + config.ModelConfig.ModelType = options.ModelType; | ||
| 221 | config.DecodingMethod = options.DecodingMethod; | 232 | config.DecodingMethod = options.DecodingMethod; |
| 222 | config.MaxActivePaths = options.MaxActivePaths; | 233 | config.MaxActivePaths = options.MaxActivePaths; |
| 223 | config.HotwordsFile = options.HotwordsFile; | 234 | config.HotwordsFile = options.HotwordsFile; |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then | ||
| 6 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 7 | + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 8 | + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 9 | +fi | ||
| 10 | + | ||
| 11 | +dotnet run \ | ||
| 12 | + --telespeech-ctc=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ | ||
| 13 | + --tokens=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ | ||
| 14 | + --model-type=telespeech-ctc \ | ||
| 15 | + --files ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav |
| @@ -40,6 +40,9 @@ func main() { | @@ -40,6 +40,9 @@ func main() { | ||
| 40 | flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") | 40 | flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") |
| 41 | flag.StringVar(&config.ModelConfig.ModelType, "model-type", "", "Optional. Used for loading the model in a faster way") | 41 | flag.StringVar(&config.ModelConfig.ModelType, "model-type", "", "Optional. Used for loading the model in a faster way") |
| 42 | flag.StringVar(&config.ModelConfig.Provider, "provider", "cpu", "Provider to use") | 42 | flag.StringVar(&config.ModelConfig.Provider, "provider", "cpu", "Provider to use") |
| 43 | + flag.StringVar(&config.ModelConfig.ModelingUnit, "modeling-unit", "cjkchar", "cjkchar, bpe, cjkchar+bpe, or leave it to empty") | ||
| 44 | + flag.StringVar(&config.ModelConfig.BpeVocab, "bpe-vocab", "", "") | ||
| 45 | + flag.StringVar(&config.ModelConfig.TeleSpeechCtc, "telespeech-ctc", "", "Used for TeleSpeechCtc model") | ||
| 43 | flag.StringVar(&config.LmConfig.Model, "lm-model", "", "Optional. Path to the LM model") | 46 | flag.StringVar(&config.LmConfig.Model, "lm-model", "", "Optional. Path to the LM model") |
| 44 | flag.Float32Var(&config.LmConfig.Scale, "lm-scale", 1.0, "Optional. Scale for the LM model") | 47 | flag.Float32Var(&config.LmConfig.Scale, "lm-scale", 1.0, "Optional. Scale for the LM model") |
| 45 | 48 |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then | ||
| 6 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 7 | + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 8 | + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 9 | +fi | ||
| 10 | + | ||
| 11 | +go mod tidy | ||
| 12 | +go build | ||
| 13 | + | ||
| 14 | +./non-streaming-decode-files \ | ||
| 15 | + --telespeech-ctc ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ | ||
| 16 | + --tokens ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ | ||
| 17 | + --model-type telespeech-ctc \ | ||
| 18 | + --debug 0 \ | ||
| 19 | + ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav |
| @@ -4,7 +4,7 @@ | @@ -4,7 +4,7 @@ | ||
| 4 | // to decode files. | 4 | // to decode files. |
| 5 | import com.k2fsa.sherpa.onnx.*; | 5 | import com.k2fsa.sherpa.onnx.*; |
| 6 | 6 | ||
| 7 | -public class NonStreamingDecodeFileTransducer { | 7 | +public class NonStreamingDecodeFileParaformer { |
| 8 | public static void main(String[] args) { | 8 | public static void main(String[] args) { |
| 9 | // please refer to | 9 | // please refer to |
| 10 | // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english | 10 | // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english |
| 1 | +// Copyright 2024 Xiaomi Corporation | ||
| 2 | + | ||
| 3 | +// This file shows how to use an offline TeleSpeech CTC model | ||
| 4 | +// to decode files. | ||
| 5 | +import com.k2fsa.sherpa.onnx.*; | ||
| 6 | + | ||
| 7 | +public class NonStreamingDecodeFileTeleSpeechCtc { | ||
| 8 | + public static void main(String[] args) { | ||
| 9 | + // please refer to | ||
| 10 | + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 11 | + // to download model files | ||
| 12 | + String model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx"; | ||
| 13 | + String tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt"; | ||
| 14 | + | ||
| 15 | + String waveFilename = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav"; | ||
| 16 | + | ||
| 17 | + WaveReader reader = new WaveReader(waveFilename); | ||
| 18 | + | ||
| 19 | + OfflineModelConfig modelConfig = | ||
| 20 | + OfflineModelConfig.builder() | ||
| 21 | + .setTeleSpeech(model) | ||
| 22 | + .setTokens(tokens) | ||
| 23 | + .setNumThreads(1) | ||
| 24 | + .setDebug(true) | ||
| 25 | + .setModelType("telespeech_ctc") | ||
| 26 | + .build(); | ||
| 27 | + | ||
| 28 | + OfflineRecognizerConfig config = | ||
| 29 | + OfflineRecognizerConfig.builder() | ||
| 30 | + .setOfflineModelConfig(modelConfig) | ||
| 31 | + .setDecodingMethod("greedy_search") | ||
| 32 | + .build(); | ||
| 33 | + | ||
| 34 | + OfflineRecognizer recognizer = new OfflineRecognizer(config); | ||
| 35 | + OfflineStream stream = recognizer.createStream(); | ||
| 36 | + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); | ||
| 37 | + | ||
| 38 | + recognizer.decode(stream); | ||
| 39 | + | ||
| 40 | + String text = recognizer.getResult(stream).getText(); | ||
| 41 | + | ||
| 42 | + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); | ||
| 43 | + | ||
| 44 | + stream.release(); | ||
| 45 | + recognizer.release(); | ||
| 46 | + } | ||
| 47 | +} |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then | ||
| 6 | + mkdir -p ../build | ||
| 7 | + pushd ../build | ||
| 8 | + cmake \ | ||
| 9 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 10 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 11 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 12 | + -DBUILD_SHARED_LIBS=ON \ | ||
| 13 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 14 | + -DSHERPA_ONNX_ENABLE_JNI=ON \ | ||
| 15 | + .. | ||
| 16 | + | ||
| 17 | + make -j4 | ||
| 18 | + ls -lh lib | ||
| 19 | + popd | ||
| 20 | +fi | ||
| 21 | + | ||
| 22 | +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then | ||
| 23 | + pushd ../sherpa-onnx/java-api | ||
| 24 | + make | ||
| 25 | + popd | ||
| 26 | +fi | ||
| 27 | + | ||
| 28 | +if [ ! -f ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt ]; then | ||
| 29 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 30 | + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 31 | + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 | ||
| 32 | +fi | ||
| 33 | + | ||
| 34 | +java \ | ||
| 35 | + -Djava.library.path=$PWD/../build/lib \ | ||
| 36 | + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ | ||
| 37 | + ./NonStreamingDecodeFileTeleSpeechCtc.java |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +This file shows how to use a non-streaming CTC model from | ||
| 5 | +https://github.com/Tele-AI/TeleSpeech-ASR | ||
| 6 | +to decode files. | ||
| 7 | + | ||
| 8 | +Please download model files from | ||
| 9 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 10 | + | ||
| 11 | + | ||
| 12 | +""" | ||
| 13 | + | ||
| 14 | +from pathlib import Path | ||
| 15 | + | ||
| 16 | +import sherpa_onnx | ||
| 17 | +import soundfile as sf | ||
| 18 | + | ||
| 19 | + | ||
| 20 | +def create_recognizer(): | ||
| 21 | + model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx" | ||
| 22 | + tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt" | ||
| 23 | + test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav" | ||
| 24 | + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/4-tianjin.wav" | ||
| 25 | + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/5-henan.wav" | ||
| 26 | + | ||
| 27 | + if not Path(model).is_file() or not Path(test_wav).is_file(): | ||
| 28 | + raise ValueError( | ||
| 29 | + """Please download model files from | ||
| 30 | + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 31 | + """ | ||
| 32 | + ) | ||
| 33 | + return ( | ||
| 34 | + sherpa_onnx.OfflineRecognizer.from_telespeech_ctc( | ||
| 35 | + model=model, | ||
| 36 | + tokens=tokens, | ||
| 37 | + debug=True, | ||
| 38 | + ), | ||
| 39 | + test_wav, | ||
| 40 | + ) | ||
| 41 | + | ||
| 42 | + | ||
| 43 | +def main(): | ||
| 44 | + recognizer, wave_filename = create_recognizer() | ||
| 45 | + | ||
| 46 | + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) | ||
| 47 | + audio = audio[:, 0] # only use the first channel | ||
| 48 | + | ||
| 49 | + # audio is a 1-D float32 numpy array normalized to the range [-1, 1] | ||
| 50 | + # sample_rate does not need to be 16000 Hz | ||
| 51 | + | ||
| 52 | + stream = recognizer.create_stream() | ||
| 53 | + stream.accept_waveform(sample_rate, audio) | ||
| 54 | + recognizer.decode_stream(stream) | ||
| 55 | + print(wave_filename) | ||
| 56 | + print(stream.result) | ||
| 57 | + | ||
| 58 | + | ||
| 59 | +if __name__ == "__main__": | ||
| 60 | + main() |
| @@ -166,6 +166,22 @@ def get_models(): | @@ -166,6 +166,22 @@ def get_models(): | ||
| 166 | popd | 166 | popd |
| 167 | """, | 167 | """, |
| 168 | ), | 168 | ), |
| 169 | + Model( | ||
| 170 | + model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04", | ||
| 171 | + idx=11, | ||
| 172 | + lang="zh", | ||
| 173 | + short_name="telespeech", | ||
| 174 | + cmd=""" | ||
| 175 | + pushd $model_name | ||
| 176 | + | ||
| 177 | + rm -rfv test_wavs | ||
| 178 | + rm test.py | ||
| 179 | + | ||
| 180 | + ls -lh | ||
| 181 | + | ||
| 182 | + popd | ||
| 183 | + """, | ||
| 184 | + ), | ||
| 169 | ] | 185 | ] |
| 170 | return models | 186 | return models |
| 171 | 187 |
| @@ -25,6 +25,7 @@ namespace SherpaOnnx | @@ -25,6 +25,7 @@ namespace SherpaOnnx | ||
| 25 | ModelType = ""; | 25 | ModelType = ""; |
| 26 | ModelingUnit = "cjkchar"; | 26 | ModelingUnit = "cjkchar"; |
| 27 | BpeVocab = ""; | 27 | BpeVocab = ""; |
| 28 | + TeleSpeechCtc = ""; | ||
| 28 | } | 29 | } |
| 29 | public OfflineTransducerModelConfig Transducer; | 30 | public OfflineTransducerModelConfig Transducer; |
| 30 | public OfflineParaformerModelConfig Paraformer; | 31 | public OfflineParaformerModelConfig Paraformer; |
| @@ -50,5 +51,8 @@ namespace SherpaOnnx | @@ -50,5 +51,8 @@ namespace SherpaOnnx | ||
| 50 | 51 | ||
| 51 | [MarshalAs(UnmanagedType.LPStr)] | 52 | [MarshalAs(UnmanagedType.LPStr)] |
| 52 | public string BpeVocab; | 53 | public string BpeVocab; |
| 54 | + | ||
| 55 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 56 | + public string TeleSpeechCtc; | ||
| 53 | } | 57 | } |
| 54 | } | 58 | } |
| @@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86 | @@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86 | ||
| 30 | linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl | 30 | linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl |
| 31 | linux_wheel=$src_dir/$linux_wheel_filename | 31 | linux_wheel=$src_dir/$linux_wheel_filename |
| 32 | 32 | ||
| 33 | -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl | 33 | +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_universal2.whl |
| 34 | macos_wheel=$src_dir/$macos_wheel_filename | 34 | macos_wheel=$src_dir/$macos_wheel_filename |
| 35 | 35 | ||
| 36 | windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl | 36 | windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl |
| @@ -61,7 +61,7 @@ if [ ! -f $src_dir/linux/libsherpa-onnx-core.so ]; then | @@ -61,7 +61,7 @@ if [ ! -f $src_dir/linux/libsherpa-onnx-core.so ]; then | ||
| 61 | fi | 61 | fi |
| 62 | 62 | ||
| 63 | if [ ! -f $src_dir/macos/libsherpa-onnx-core.dylib ]; then | 63 | if [ ! -f $src_dir/macos/libsherpa-onnx-core.dylib ]; then |
| 64 | - echo "---macOS x86_64---" | 64 | + echo "--- macOS x86_64/arm64 universal2---" |
| 65 | cd macos | 65 | cd macos |
| 66 | mkdir -p wheel | 66 | mkdir -p wheel |
| 67 | cd wheel | 67 | cd wheel |
| 1 | +../../../../go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh |
| @@ -383,6 +383,7 @@ type OfflineModelConfig struct { | @@ -383,6 +383,7 @@ type OfflineModelConfig struct { | ||
| 383 | 383 | ||
| 384 | ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe | 384 | ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe |
| 385 | BpeVocab string // Optional. | 385 | BpeVocab string // Optional. |
| 386 | + TeleSpeechCtc string // Optional. | ||
| 386 | } | 387 | } |
| 387 | 388 | ||
| 388 | // Configuration for the offline/non-streaming recognizer. | 389 | // Configuration for the offline/non-streaming recognizer. |
| @@ -477,6 +478,9 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { | @@ -477,6 +478,9 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { | ||
| 477 | c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab) | 478 | c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab) |
| 478 | defer C.free(unsafe.Pointer(c.model_config.bpe_vocab)) | 479 | defer C.free(unsafe.Pointer(c.model_config.bpe_vocab)) |
| 479 | 480 | ||
| 481 | + c.model_config.telespeech_ctc = C.CString(config.ModelConfig.TeleSpeechCtc) | ||
| 482 | + defer C.free(unsafe.Pointer(c.model_config.telespeech_ctc)) | ||
| 483 | + | ||
| 480 | c.lm_config.model = C.CString(config.LmConfig.Model) | 484 | c.lm_config.model = C.CString(config.LmConfig.Model) |
| 481 | defer C.free(unsafe.Pointer(c.lm_config.model)) | 485 | defer C.free(unsafe.Pointer(c.lm_config.model)) |
| 482 | 486 |
| @@ -128,6 +128,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { | @@ -128,6 +128,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { | ||
| 128 | SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType); | 128 | SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType); |
| 129 | SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit); | 129 | SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit); |
| 130 | SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, bpeVocab); | 130 | SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, bpeVocab); |
| 131 | + SHERPA_ONNX_ASSIGN_ATTR_STR(telespeech_ctc, teleSpeechCtc); | ||
| 131 | 132 | ||
| 132 | return c; | 133 | return c; |
| 133 | } | 134 | } |
| @@ -242,6 +243,10 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { | @@ -242,6 +243,10 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { | ||
| 242 | delete[] c.model_config.bpe_vocab; | 243 | delete[] c.model_config.bpe_vocab; |
| 243 | } | 244 | } |
| 244 | 245 | ||
| 246 | + if (c.model_config.telespeech_ctc) { | ||
| 247 | + delete[] c.model_config.telespeech_ctc; | ||
| 248 | + } | ||
| 249 | + | ||
| 245 | if (c.lm_config.model) { | 250 | if (c.lm_config.model) { |
| 246 | delete[] c.lm_config.model; | 251 | delete[] c.lm_config.model; |
| 247 | } | 252 | } |
| @@ -366,6 +366,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( | @@ -366,6 +366,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( | ||
| 366 | recognizer_config.model_config.bpe_vocab = | 366 | recognizer_config.model_config.bpe_vocab = |
| 367 | SHERPA_ONNX_OR(config->model_config.bpe_vocab, ""); | 367 | SHERPA_ONNX_OR(config->model_config.bpe_vocab, ""); |
| 368 | 368 | ||
| 369 | + recognizer_config.model_config.telespeech_ctc = | ||
| 370 | + SHERPA_ONNX_OR(config->model_config.telespeech_ctc, ""); | ||
| 371 | + | ||
| 369 | recognizer_config.lm_config.model = | 372 | recognizer_config.lm_config.model = |
| 370 | SHERPA_ONNX_OR(config->lm_config.model, ""); | 373 | SHERPA_ONNX_OR(config->lm_config.model, ""); |
| 371 | recognizer_config.lm_config.scale = | 374 | recognizer_config.lm_config.scale = |
| @@ -395,6 +395,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { | @@ -395,6 +395,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { | ||
| 395 | // - cjkchar+bpe | 395 | // - cjkchar+bpe |
| 396 | const char *modeling_unit; | 396 | const char *modeling_unit; |
| 397 | const char *bpe_vocab; | 397 | const char *bpe_vocab; |
| 398 | + const char *telespeech_ctc; | ||
| 398 | } SherpaOnnxOfflineModelConfig; | 399 | } SherpaOnnxOfflineModelConfig; |
| 399 | 400 | ||
| 400 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { | 401 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { |
| @@ -39,6 +39,7 @@ set(sources | @@ -39,6 +39,7 @@ set(sources | ||
| 39 | offline-stream.cc | 39 | offline-stream.cc |
| 40 | offline-tdnn-ctc-model.cc | 40 | offline-tdnn-ctc-model.cc |
| 41 | offline-tdnn-model-config.cc | 41 | offline-tdnn-model-config.cc |
| 42 | + offline-telespeech-ctc-model.cc | ||
| 42 | offline-transducer-greedy-search-decoder.cc | 43 | offline-transducer-greedy-search-decoder.cc |
| 43 | offline-transducer-greedy-search-nemo-decoder.cc | 44 | offline-transducer-greedy-search-nemo-decoder.cc |
| 44 | offline-transducer-model-config.cc | 45 | offline-transducer-model-config.cc |
| @@ -56,22 +56,11 @@ std::string FeatureExtractorConfig::ToString() const { | @@ -56,22 +56,11 @@ std::string FeatureExtractorConfig::ToString() const { | ||
| 56 | class FeatureExtractor::Impl { | 56 | class FeatureExtractor::Impl { |
| 57 | public: | 57 | public: |
| 58 | explicit Impl(const FeatureExtractorConfig &config) : config_(config) { | 58 | explicit Impl(const FeatureExtractorConfig &config) : config_(config) { |
| 59 | - opts_.frame_opts.dither = config.dither; | ||
| 60 | - opts_.frame_opts.snip_edges = config.snip_edges; | ||
| 61 | - opts_.frame_opts.samp_freq = config.sampling_rate; | ||
| 62 | - opts_.frame_opts.frame_shift_ms = config.frame_shift_ms; | ||
| 63 | - opts_.frame_opts.frame_length_ms = config.frame_length_ms; | ||
| 64 | - opts_.frame_opts.remove_dc_offset = config.remove_dc_offset; | ||
| 65 | - opts_.frame_opts.window_type = config.window_type; | ||
| 66 | - | ||
| 67 | - opts_.mel_opts.num_bins = config.feature_dim; | ||
| 68 | - | ||
| 69 | - opts_.mel_opts.high_freq = config.high_freq; | ||
| 70 | - opts_.mel_opts.low_freq = config.low_freq; | ||
| 71 | - | ||
| 72 | - opts_.mel_opts.is_librosa = config.is_librosa; | ||
| 73 | - | ||
| 74 | - fbank_ = std::make_unique<knf::OnlineFbank>(opts_); | 59 | + if (config_.is_mfcc) { |
| 60 | + InitMfcc(); | ||
| 61 | + } else { | ||
| 62 | + InitFbank(); | ||
| 63 | + } | ||
| 75 | } | 64 | } |
| 76 | 65 | ||
| 77 | void AcceptWaveform(int32_t sampling_rate, const float *waveform, int32_t n) { | 66 | void AcceptWaveform(int32_t sampling_rate, const float *waveform, int32_t n) { |
| @@ -101,35 +90,48 @@ class FeatureExtractor::Impl { | @@ -101,35 +90,48 @@ class FeatureExtractor::Impl { | ||
| 101 | 90 | ||
| 102 | std::vector<float> samples; | 91 | std::vector<float> samples; |
| 103 | resampler_->Resample(waveform, n, false, &samples); | 92 | resampler_->Resample(waveform, n, false, &samples); |
| 104 | - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), | 93 | + if (fbank_) { |
| 94 | + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), | ||
| 105 | samples.size()); | 95 | samples.size()); |
| 96 | + } else { | ||
| 97 | + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), | ||
| 98 | + samples.size()); | ||
| 99 | + } | ||
| 106 | return; | 100 | return; |
| 107 | } | 101 | } |
| 108 | 102 | ||
| 109 | - if (sampling_rate != opts_.frame_opts.samp_freq) { | 103 | + if (sampling_rate != config_.sampling_rate) { |
| 110 | SHERPA_ONNX_LOGE( | 104 | SHERPA_ONNX_LOGE( |
| 111 | "Creating a resampler:\n" | 105 | "Creating a resampler:\n" |
| 112 | " in_sample_rate: %d\n" | 106 | " in_sample_rate: %d\n" |
| 113 | " output_sample_rate: %d\n", | 107 | " output_sample_rate: %d\n", |
| 114 | - sampling_rate, static_cast<int32_t>(opts_.frame_opts.samp_freq)); | 108 | + sampling_rate, static_cast<int32_t>(config_.sampling_rate)); |
| 115 | 109 | ||
| 116 | - float min_freq = | ||
| 117 | - std::min<int32_t>(sampling_rate, opts_.frame_opts.samp_freq); | 110 | + float min_freq = std::min<int32_t>(sampling_rate, config_.sampling_rate); |
| 118 | float lowpass_cutoff = 0.99 * 0.5 * min_freq; | 111 | float lowpass_cutoff = 0.99 * 0.5 * min_freq; |
| 119 | 112 | ||
| 120 | int32_t lowpass_filter_width = 6; | 113 | int32_t lowpass_filter_width = 6; |
| 121 | resampler_ = std::make_unique<LinearResample>( | 114 | resampler_ = std::make_unique<LinearResample>( |
| 122 | - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, | 115 | + sampling_rate, config_.sampling_rate, lowpass_cutoff, |
| 123 | lowpass_filter_width); | 116 | lowpass_filter_width); |
| 124 | 117 | ||
| 125 | std::vector<float> samples; | 118 | std::vector<float> samples; |
| 126 | resampler_->Resample(waveform, n, false, &samples); | 119 | resampler_->Resample(waveform, n, false, &samples); |
| 127 | - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), | 120 | + if (fbank_) { |
| 121 | + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), | ||
| 122 | + samples.size()); | ||
| 123 | + } else { | ||
| 124 | + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), | ||
| 128 | samples.size()); | 125 | samples.size()); |
| 126 | + } | ||
| 129 | return; | 127 | return; |
| 130 | } | 128 | } |
| 131 | 129 | ||
| 130 | + if (fbank_) { | ||
| 132 | fbank_->AcceptWaveform(sampling_rate, waveform, n); | 131 | fbank_->AcceptWaveform(sampling_rate, waveform, n); |
| 132 | + } else { | ||
| 133 | + mfcc_->AcceptWaveform(sampling_rate, waveform, n); | ||
| 134 | + } | ||
| 133 | } | 135 | } |
| 134 | 136 | ||
| 135 | void InputFinished() const { | 137 | void InputFinished() const { |
| @@ -179,11 +181,56 @@ class FeatureExtractor::Impl { | @@ -179,11 +181,56 @@ class FeatureExtractor::Impl { | ||
| 179 | return features; | 181 | return features; |
| 180 | } | 182 | } |
| 181 | 183 | ||
| 182 | - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } | 184 | + int32_t FeatureDim() const { |
| 185 | + return mfcc_ ? mfcc_opts_.num_ceps : opts_.mel_opts.num_bins; | ||
| 186 | + } | ||
| 187 | + | ||
| 188 | + private: | ||
| 189 | + void InitFbank() { | ||
| 190 | + opts_.frame_opts.dither = config_.dither; | ||
| 191 | + opts_.frame_opts.snip_edges = config_.snip_edges; | ||
| 192 | + opts_.frame_opts.samp_freq = config_.sampling_rate; | ||
| 193 | + opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; | ||
| 194 | + opts_.frame_opts.frame_length_ms = config_.frame_length_ms; | ||
| 195 | + opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; | ||
| 196 | + opts_.frame_opts.window_type = config_.window_type; | ||
| 197 | + | ||
| 198 | + opts_.mel_opts.num_bins = config_.feature_dim; | ||
| 199 | + | ||
| 200 | + opts_.mel_opts.high_freq = config_.high_freq; | ||
| 201 | + opts_.mel_opts.low_freq = config_.low_freq; | ||
| 202 | + | ||
| 203 | + opts_.mel_opts.is_librosa = config_.is_librosa; | ||
| 204 | + | ||
| 205 | + fbank_ = std::make_unique<knf::OnlineFbank>(opts_); | ||
| 206 | + } | ||
| 207 | + void InitMfcc() { | ||
| 208 | + mfcc_opts_.frame_opts.dither = config_.dither; | ||
| 209 | + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges; | ||
| 210 | + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate; | ||
| 211 | + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; | ||
| 212 | + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms; | ||
| 213 | + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; | ||
| 214 | + mfcc_opts_.frame_opts.window_type = config_.window_type; | ||
| 215 | + | ||
| 216 | + mfcc_opts_.mel_opts.num_bins = config_.feature_dim; | ||
| 217 | + | ||
| 218 | + mfcc_opts_.mel_opts.high_freq = config_.high_freq; | ||
| 219 | + mfcc_opts_.mel_opts.low_freq = config_.low_freq; | ||
| 220 | + | ||
| 221 | + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa; | ||
| 222 | + | ||
| 223 | + mfcc_opts_.num_ceps = config_.num_ceps; | ||
| 224 | + mfcc_opts_.use_energy = config_.use_energy; | ||
| 225 | + | ||
| 226 | + mfcc_ = std::make_unique<knf::OnlineMfcc>(mfcc_opts_); | ||
| 227 | + } | ||
| 183 | 228 | ||
| 184 | private: | 229 | private: |
| 185 | std::unique_ptr<knf::OnlineFbank> fbank_; | 230 | std::unique_ptr<knf::OnlineFbank> fbank_; |
| 231 | + std::unique_ptr<knf::OnlineMfcc> mfcc_; | ||
| 186 | knf::FbankOptions opts_; | 232 | knf::FbankOptions opts_; |
| 233 | + knf::MfccOptions mfcc_opts_; | ||
| 187 | FeatureExtractorConfig config_; | 234 | FeatureExtractorConfig config_; |
| 188 | mutable std::mutex mutex_; | 235 | mutable std::mutex mutex_; |
| 189 | std::unique_ptr<LinearResample> resampler_; | 236 | std::unique_ptr<LinearResample> resampler_; |
| @@ -18,7 +18,10 @@ struct FeatureExtractorConfig { | @@ -18,7 +18,10 @@ struct FeatureExtractorConfig { | ||
| 18 | // the sampling rate of the input waveform, we will do resampling inside. | 18 | // the sampling rate of the input waveform, we will do resampling inside. |
| 19 | int32_t sampling_rate = 16000; | 19 | int32_t sampling_rate = 16000; |
| 20 | 20 | ||
| 21 | - // Feature dimension | 21 | + // num_mel_bins |
| 22 | + // | ||
| 23 | + // Note: for MFCC, this value is still the number of mel bins. | ||
| 24 | + // The actual feature dimension is num_ceps. | ||
| 22 | int32_t feature_dim = 80; | 25 | int32_t feature_dim = 80; |
| 23 | 26 | ||
| 24 | // minimal frequency for Mel-filterbank, in Hz | 27 | // minimal frequency for Mel-filterbank, in Hz |
| @@ -69,6 +72,12 @@ struct FeatureExtractorConfig { | @@ -69,6 +72,12 @@ struct FeatureExtractorConfig { | ||
| 69 | // for details | 72 | // for details |
| 70 | std::string nemo_normalize_type; | 73 | std::string nemo_normalize_type; |
| 71 | 74 | ||
| 75 | + // for MFCC | ||
| 76 | + int32_t num_ceps = 13; | ||
| 77 | + bool use_energy = true; | ||
| 78 | + | ||
| 79 | + bool is_mfcc = false; | ||
| 80 | + | ||
| 72 | std::string ToString() const; | 81 | std::string ToString() const; |
| 73 | 82 | ||
| 74 | void Register(ParseOptions *po); | 83 | void Register(ParseOptions *po); |
| @@ -12,6 +12,7 @@ | @@ -12,6 +12,7 @@ | ||
| 12 | #include "sherpa-onnx/csrc/macros.h" | 12 | #include "sherpa-onnx/csrc/macros.h" |
| 13 | #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h" | 13 | #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h" |
| 14 | #include "sherpa-onnx/csrc/offline-tdnn-ctc-model.h" | 14 | #include "sherpa-onnx/csrc/offline-tdnn-ctc-model.h" |
| 15 | +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h" | ||
| 15 | #include "sherpa-onnx/csrc/offline-wenet-ctc-model.h" | 16 | #include "sherpa-onnx/csrc/offline-wenet-ctc-model.h" |
| 16 | #include "sherpa-onnx/csrc/offline-zipformer-ctc-model.h" | 17 | #include "sherpa-onnx/csrc/offline-zipformer-ctc-model.h" |
| 17 | #include "sherpa-onnx/csrc/onnx-utils.h" | 18 | #include "sherpa-onnx/csrc/onnx-utils.h" |
| @@ -24,6 +25,7 @@ enum class ModelType { | @@ -24,6 +25,7 @@ enum class ModelType { | ||
| 24 | kTdnn, | 25 | kTdnn, |
| 25 | kZipformerCtc, | 26 | kZipformerCtc, |
| 26 | kWenetCtc, | 27 | kWenetCtc, |
| 28 | + kTeleSpeechCtc, | ||
| 27 | kUnknown, | 29 | kUnknown, |
| 28 | }; | 30 | }; |
| 29 | 31 | ||
| @@ -63,6 +65,9 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, | @@ -63,6 +65,9 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, | ||
| 63 | "If you are using models from WeNet, please refer to\n" | 65 | "If you are using models from WeNet, please refer to\n" |
| 64 | "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/" | 66 | "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/" |
| 65 | "run.sh\n" | 67 | "run.sh\n" |
| 68 | + "If you are using models from TeleSpeech, please refer to\n" | ||
| 69 | + "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/" | ||
| 70 | + "add-metadata.py" | ||
| 66 | "\n" | 71 | "\n" |
| 67 | "for how to add metadta to model.onnx\n"); | 72 | "for how to add metadta to model.onnx\n"); |
| 68 | return ModelType::kUnknown; | 73 | return ModelType::kUnknown; |
| @@ -78,6 +83,8 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, | @@ -78,6 +83,8 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, | ||
| 78 | return ModelType::kZipformerCtc; | 83 | return ModelType::kZipformerCtc; |
| 79 | } else if (model_type.get() == std::string("wenet_ctc")) { | 84 | } else if (model_type.get() == std::string("wenet_ctc")) { |
| 80 | return ModelType::kWenetCtc; | 85 | return ModelType::kWenetCtc; |
| 86 | + } else if (model_type.get() == std::string("telespeech_ctc")) { | ||
| 87 | + return ModelType::kTeleSpeechCtc; | ||
| 81 | } else { | 88 | } else { |
| 82 | SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.get()); | 89 | SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.get()); |
| 83 | return ModelType::kUnknown; | 90 | return ModelType::kUnknown; |
| @@ -97,6 +104,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | @@ -97,6 +104,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | ||
| 97 | filename = config.zipformer_ctc.model; | 104 | filename = config.zipformer_ctc.model; |
| 98 | } else if (!config.wenet_ctc.model.empty()) { | 105 | } else if (!config.wenet_ctc.model.empty()) { |
| 99 | filename = config.wenet_ctc.model; | 106 | filename = config.wenet_ctc.model; |
| 107 | + } else if (!config.telespeech_ctc.empty()) { | ||
| 108 | + filename = config.telespeech_ctc; | ||
| 100 | } else { | 109 | } else { |
| 101 | SHERPA_ONNX_LOGE("Please specify a CTC model"); | 110 | SHERPA_ONNX_LOGE("Please specify a CTC model"); |
| 102 | exit(-1); | 111 | exit(-1); |
| @@ -124,6 +133,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | @@ -124,6 +133,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | ||
| 124 | case ModelType::kWenetCtc: | 133 | case ModelType::kWenetCtc: |
| 125 | return std::make_unique<OfflineWenetCtcModel>(config); | 134 | return std::make_unique<OfflineWenetCtcModel>(config); |
| 126 | break; | 135 | break; |
| 136 | + case ModelType::kTeleSpeechCtc: | ||
| 137 | + return std::make_unique<OfflineTeleSpeechCtcModel>(config); | ||
| 138 | + break; | ||
| 127 | case ModelType::kUnknown: | 139 | case ModelType::kUnknown: |
| 128 | SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); | 140 | SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); |
| 129 | return nullptr; | 141 | return nullptr; |
| @@ -147,6 +159,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | @@ -147,6 +159,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | ||
| 147 | filename = config.zipformer_ctc.model; | 159 | filename = config.zipformer_ctc.model; |
| 148 | } else if (!config.wenet_ctc.model.empty()) { | 160 | } else if (!config.wenet_ctc.model.empty()) { |
| 149 | filename = config.wenet_ctc.model; | 161 | filename = config.wenet_ctc.model; |
| 162 | + } else if (!config.telespeech_ctc.empty()) { | ||
| 163 | + filename = config.telespeech_ctc; | ||
| 150 | } else { | 164 | } else { |
| 151 | SHERPA_ONNX_LOGE("Please specify a CTC model"); | 165 | SHERPA_ONNX_LOGE("Please specify a CTC model"); |
| 152 | exit(-1); | 166 | exit(-1); |
| @@ -175,6 +189,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | @@ -175,6 +189,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( | ||
| 175 | case ModelType::kWenetCtc: | 189 | case ModelType::kWenetCtc: |
| 176 | return std::make_unique<OfflineWenetCtcModel>(mgr, config); | 190 | return std::make_unique<OfflineWenetCtcModel>(mgr, config); |
| 177 | break; | 191 | break; |
| 192 | + case ModelType::kTeleSpeechCtc: | ||
| 193 | + return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config); | ||
| 194 | + break; | ||
| 178 | case ModelType::kUnknown: | 195 | case ModelType::kUnknown: |
| 179 | SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); | 196 | SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); |
| 180 | return nullptr; | 197 | return nullptr; |
| @@ -19,6 +19,9 @@ void OfflineModelConfig::Register(ParseOptions *po) { | @@ -19,6 +19,9 @@ void OfflineModelConfig::Register(ParseOptions *po) { | ||
| 19 | zipformer_ctc.Register(po); | 19 | zipformer_ctc.Register(po); |
| 20 | wenet_ctc.Register(po); | 20 | wenet_ctc.Register(po); |
| 21 | 21 | ||
| 22 | + po->Register("telespeech-ctc", &telespeech_ctc, | ||
| 23 | + "Path to model.onnx for telespeech ctc"); | ||
| 24 | + | ||
| 22 | po->Register("tokens", &tokens, "Path to tokens.txt"); | 25 | po->Register("tokens", &tokens, "Path to tokens.txt"); |
| 23 | 26 | ||
| 24 | po->Register("num-threads", &num_threads, | 27 | po->Register("num-threads", &num_threads, |
| @@ -33,7 +36,7 @@ void OfflineModelConfig::Register(ParseOptions *po) { | @@ -33,7 +36,7 @@ void OfflineModelConfig::Register(ParseOptions *po) { | ||
| 33 | po->Register("model-type", &model_type, | 36 | po->Register("model-type", &model_type, |
| 34 | "Specify it to reduce model initialization time. " | 37 | "Specify it to reduce model initialization time. " |
| 35 | "Valid values are: transducer, paraformer, nemo_ctc, whisper, " | 38 | "Valid values are: transducer, paraformer, nemo_ctc, whisper, " |
| 36 | - "tdnn, zipformer2_ctc" | 39 | + "tdnn, zipformer2_ctc, telespeech_ctc." |
| 37 | "All other values lead to loading the model twice."); | 40 | "All other values lead to loading the model twice."); |
| 38 | po->Register("modeling-unit", &modeling_unit, | 41 | po->Register("modeling-unit", &modeling_unit, |
| 39 | "The modeling unit of the model, commonly used units are bpe, " | 42 | "The modeling unit of the model, commonly used units are bpe, " |
| @@ -55,14 +58,14 @@ bool OfflineModelConfig::Validate() const { | @@ -55,14 +58,14 @@ bool OfflineModelConfig::Validate() const { | ||
| 55 | } | 58 | } |
| 56 | 59 | ||
| 57 | if (!FileExists(tokens)) { | 60 | if (!FileExists(tokens)) { |
| 58 | - SHERPA_ONNX_LOGE("tokens: %s does not exist", tokens.c_str()); | 61 | + SHERPA_ONNX_LOGE("tokens: '%s' does not exist", tokens.c_str()); |
| 59 | return false; | 62 | return false; |
| 60 | } | 63 | } |
| 61 | 64 | ||
| 62 | if (!modeling_unit.empty() && | 65 | if (!modeling_unit.empty() && |
| 63 | (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { | 66 | (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { |
| 64 | if (!FileExists(bpe_vocab)) { | 67 | if (!FileExists(bpe_vocab)) { |
| 65 | - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); | 68 | + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str()); |
| 66 | return false; | 69 | return false; |
| 67 | } | 70 | } |
| 68 | } | 71 | } |
| @@ -91,6 +94,14 @@ bool OfflineModelConfig::Validate() const { | @@ -91,6 +94,14 @@ bool OfflineModelConfig::Validate() const { | ||
| 91 | return wenet_ctc.Validate(); | 94 | return wenet_ctc.Validate(); |
| 92 | } | 95 | } |
| 93 | 96 | ||
| 97 | + if (!telespeech_ctc.empty() && !FileExists(telespeech_ctc)) { | ||
| 98 | + SHERPA_ONNX_LOGE("telespeech_ctc: '%s' does not exist", | ||
| 99 | + telespeech_ctc.c_str()); | ||
| 100 | + return false; | ||
| 101 | + } else { | ||
| 102 | + return true; | ||
| 103 | + } | ||
| 104 | + | ||
| 94 | return transducer.Validate(); | 105 | return transducer.Validate(); |
| 95 | } | 106 | } |
| 96 | 107 | ||
| @@ -105,6 +116,7 @@ std::string OfflineModelConfig::ToString() const { | @@ -105,6 +116,7 @@ std::string OfflineModelConfig::ToString() const { | ||
| 105 | os << "tdnn=" << tdnn.ToString() << ", "; | 116 | os << "tdnn=" << tdnn.ToString() << ", "; |
| 106 | os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", "; | 117 | os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", "; |
| 107 | os << "wenet_ctc=" << wenet_ctc.ToString() << ", "; | 118 | os << "wenet_ctc=" << wenet_ctc.ToString() << ", "; |
| 119 | + os << "telespeech_ctc=\"" << telespeech_ctc << "\", "; | ||
| 108 | os << "tokens=\"" << tokens << "\", "; | 120 | os << "tokens=\"" << tokens << "\", "; |
| 109 | os << "num_threads=" << num_threads << ", "; | 121 | os << "num_threads=" << num_threads << ", "; |
| 110 | os << "debug=" << (debug ? "True" : "False") << ", "; | 122 | os << "debug=" << (debug ? "True" : "False") << ", "; |
| @@ -24,6 +24,7 @@ struct OfflineModelConfig { | @@ -24,6 +24,7 @@ struct OfflineModelConfig { | ||
| 24 | OfflineTdnnModelConfig tdnn; | 24 | OfflineTdnnModelConfig tdnn; |
| 25 | OfflineZipformerCtcModelConfig zipformer_ctc; | 25 | OfflineZipformerCtcModelConfig zipformer_ctc; |
| 26 | OfflineWenetCtcModelConfig wenet_ctc; | 26 | OfflineWenetCtcModelConfig wenet_ctc; |
| 27 | + std::string telespeech_ctc; | ||
| 27 | 28 | ||
| 28 | std::string tokens; | 29 | std::string tokens; |
| 29 | int32_t num_threads = 2; | 30 | int32_t num_threads = 2; |
| @@ -52,6 +53,7 @@ struct OfflineModelConfig { | @@ -52,6 +53,7 @@ struct OfflineModelConfig { | ||
| 52 | const OfflineTdnnModelConfig &tdnn, | 53 | const OfflineTdnnModelConfig &tdnn, |
| 53 | const OfflineZipformerCtcModelConfig &zipformer_ctc, | 54 | const OfflineZipformerCtcModelConfig &zipformer_ctc, |
| 54 | const OfflineWenetCtcModelConfig &wenet_ctc, | 55 | const OfflineWenetCtcModelConfig &wenet_ctc, |
| 56 | + const std::string &telespeech_ctc, | ||
| 55 | const std::string &tokens, int32_t num_threads, bool debug, | 57 | const std::string &tokens, int32_t num_threads, bool debug, |
| 56 | const std::string &provider, const std::string &model_type, | 58 | const std::string &provider, const std::string &model_type, |
| 57 | const std::string &modeling_unit, | 59 | const std::string &modeling_unit, |
| @@ -63,6 +65,7 @@ struct OfflineModelConfig { | @@ -63,6 +65,7 @@ struct OfflineModelConfig { | ||
| 63 | tdnn(tdnn), | 65 | tdnn(tdnn), |
| 64 | zipformer_ctc(zipformer_ctc), | 66 | zipformer_ctc(zipformer_ctc), |
| 65 | wenet_ctc(wenet_ctc), | 67 | wenet_ctc(wenet_ctc), |
| 68 | + telespeech_ctc(telespeech_ctc), | ||
| 66 | tokens(tokens), | 69 | tokens(tokens), |
| 67 | num_threads(num_threads), | 70 | num_threads(num_threads), |
| 68 | debug(debug), | 71 | debug(debug), |
| @@ -88,6 +88,17 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { | @@ -88,6 +88,17 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { | ||
| 88 | #endif | 88 | #endif |
| 89 | 89 | ||
| 90 | void Init() { | 90 | void Init() { |
| 91 | + if (!config_.model_config.telespeech_ctc.empty()) { | ||
| 92 | + config_.feat_config.snip_edges = true; | ||
| 93 | + config_.feat_config.num_ceps = 40; | ||
| 94 | + config_.feat_config.feature_dim = 40; | ||
| 95 | + config_.feat_config.low_freq = 40; | ||
| 96 | + config_.feat_config.high_freq = -200; | ||
| 97 | + config_.feat_config.use_energy = false; | ||
| 98 | + config_.feat_config.normalize_samples = false; | ||
| 99 | + config_.feat_config.is_mfcc = true; | ||
| 100 | + } | ||
| 101 | + | ||
| 91 | if (!config_.model_config.wenet_ctc.model.empty()) { | 102 | if (!config_.model_config.wenet_ctc.model.empty()) { |
| 92 | // WeNet CTC models assume input samples are in the range | 103 | // WeNet CTC models assume input samples are in the range |
| 93 | // [-32768, 32767], so we set normalize_samples to false | 104 | // [-32768, 32767], so we set normalize_samples to false |
| @@ -29,7 +29,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -29,7 +29,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 29 | } else if (model_type == "paraformer") { | 29 | } else if (model_type == "paraformer") { |
| 30 | return std::make_unique<OfflineRecognizerParaformerImpl>(config); | 30 | return std::make_unique<OfflineRecognizerParaformerImpl>(config); |
| 31 | } else if (model_type == "nemo_ctc" || model_type == "tdnn" || | 31 | } else if (model_type == "nemo_ctc" || model_type == "tdnn" || |
| 32 | - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { | 32 | + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || |
| 33 | + model_type == "telespeech_ctc") { | ||
| 33 | return std::make_unique<OfflineRecognizerCtcImpl>(config); | 34 | return std::make_unique<OfflineRecognizerCtcImpl>(config); |
| 34 | } else if (model_type == "whisper") { | 35 | } else if (model_type == "whisper") { |
| 35 | return std::make_unique<OfflineRecognizerWhisperImpl>(config); | 36 | return std::make_unique<OfflineRecognizerWhisperImpl>(config); |
| @@ -53,6 +54,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -53,6 +54,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 53 | model_filename = config.model_config.paraformer.model; | 54 | model_filename = config.model_config.paraformer.model; |
| 54 | } else if (!config.model_config.nemo_ctc.model.empty()) { | 55 | } else if (!config.model_config.nemo_ctc.model.empty()) { |
| 55 | model_filename = config.model_config.nemo_ctc.model; | 56 | model_filename = config.model_config.nemo_ctc.model; |
| 57 | + } else if (!config.model_config.telespeech_ctc.empty()) { | ||
| 58 | + model_filename = config.model_config.telespeech_ctc; | ||
| 56 | } else if (!config.model_config.tdnn.model.empty()) { | 59 | } else if (!config.model_config.tdnn.model.empty()) { |
| 57 | model_filename = config.model_config.tdnn.model; | 60 | model_filename = config.model_config.tdnn.model; |
| 58 | } else if (!config.model_config.zipformer_ctc.model.empty()) { | 61 | } else if (!config.model_config.zipformer_ctc.model.empty()) { |
| @@ -111,6 +114,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -111,6 +114,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 111 | "\n " | 114 | "\n " |
| 112 | "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" | 115 | "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" |
| 113 | "\n" | 116 | "\n" |
| 117 | + "(7) CTC models from TeleSpeech" | ||
| 118 | + "\n " | ||
| 119 | + "https://github.com/Tele-AI/TeleSpeech-ASR" | ||
| 120 | + "\n" | ||
| 114 | "\n"); | 121 | "\n"); |
| 115 | exit(-1); | 122 | exit(-1); |
| 116 | } | 123 | } |
| @@ -133,7 +140,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -133,7 +140,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 133 | 140 | ||
| 134 | if (model_type == "EncDecCTCModelBPE" || | 141 | if (model_type == "EncDecCTCModelBPE" || |
| 135 | model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || | 142 | model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || |
| 136 | - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { | 143 | + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || |
| 144 | + model_type == "telespeech_ctc") { | ||
| 137 | return std::make_unique<OfflineRecognizerCtcImpl>(config); | 145 | return std::make_unique<OfflineRecognizerCtcImpl>(config); |
| 138 | } | 146 | } |
| 139 | 147 | ||
| @@ -151,7 +159,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -151,7 +159,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 151 | " - Whisper models\n" | 159 | " - Whisper models\n" |
| 152 | " - Tdnn models\n" | 160 | " - Tdnn models\n" |
| 153 | " - Zipformer CTC models\n" | 161 | " - Zipformer CTC models\n" |
| 154 | - " - WeNet CTC models\n", | 162 | + " - WeNet CTC models\n" |
| 163 | + " - TeleSpeech CTC models\n", | ||
| 155 | model_type.c_str()); | 164 | model_type.c_str()); |
| 156 | 165 | ||
| 157 | exit(-1); | 166 | exit(-1); |
| @@ -169,7 +178,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -169,7 +178,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 169 | } else if (model_type == "paraformer") { | 178 | } else if (model_type == "paraformer") { |
| 170 | return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config); | 179 | return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config); |
| 171 | } else if (model_type == "nemo_ctc" || model_type == "tdnn" || | 180 | } else if (model_type == "nemo_ctc" || model_type == "tdnn" || |
| 172 | - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { | 181 | + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || |
| 182 | + model_type == "telespeech_ctc") { | ||
| 173 | return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config); | 183 | return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config); |
| 174 | } else if (model_type == "whisper") { | 184 | } else if (model_type == "whisper") { |
| 175 | return std::make_unique<OfflineRecognizerWhisperImpl>(mgr, config); | 185 | return std::make_unique<OfflineRecognizerWhisperImpl>(mgr, config); |
| @@ -199,6 +209,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -199,6 +209,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 199 | model_filename = config.model_config.zipformer_ctc.model; | 209 | model_filename = config.model_config.zipformer_ctc.model; |
| 200 | } else if (!config.model_config.wenet_ctc.model.empty()) { | 210 | } else if (!config.model_config.wenet_ctc.model.empty()) { |
| 201 | model_filename = config.model_config.wenet_ctc.model; | 211 | model_filename = config.model_config.wenet_ctc.model; |
| 212 | + } else if (!config.model_config.telespeech_ctc.empty()) { | ||
| 213 | + model_filename = config.model_config.telespeech_ctc; | ||
| 202 | } else if (!config.model_config.whisper.encoder.empty()) { | 214 | } else if (!config.model_config.whisper.encoder.empty()) { |
| 203 | model_filename = config.model_config.whisper.encoder; | 215 | model_filename = config.model_config.whisper.encoder; |
| 204 | } else { | 216 | } else { |
| @@ -251,6 +263,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -251,6 +263,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 251 | "\n " | 263 | "\n " |
| 252 | "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" | 264 | "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" |
| 253 | "\n" | 265 | "\n" |
| 266 | + "(7) CTC models from TeleSpeech" | ||
| 267 | + "\n " | ||
| 268 | + "https://github.com/Tele-AI/TeleSpeech-ASR" | ||
| 269 | + "\n" | ||
| 254 | "\n"); | 270 | "\n"); |
| 255 | exit(-1); | 271 | exit(-1); |
| 256 | } | 272 | } |
| @@ -273,7 +289,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -273,7 +289,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 273 | 289 | ||
| 274 | if (model_type == "EncDecCTCModelBPE" || | 290 | if (model_type == "EncDecCTCModelBPE" || |
| 275 | model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || | 291 | model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || |
| 276 | - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { | 292 | + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || |
| 293 | + model_type == "telespeech_ctc") { | ||
| 277 | return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config); | 294 | return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config); |
| 278 | } | 295 | } |
| 279 | 296 | ||
| @@ -291,7 +308,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | @@ -291,7 +308,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( | ||
| 291 | " - Whisper models\n" | 308 | " - Whisper models\n" |
| 292 | " - Tdnn models\n" | 309 | " - Tdnn models\n" |
| 293 | " - Zipformer CTC models\n" | 310 | " - Zipformer CTC models\n" |
| 294 | - " - WeNet CTC models\n", | 311 | + " - WeNet CTC models\n" |
| 312 | + " - TeleSpeech CTC models\n", | ||
| 295 | model_type.c_str()); | 313 | model_type.c_str()); |
| 296 | 314 | ||
| 297 | exit(-1); | 315 | exit(-1); |
| @@ -57,6 +57,27 @@ class OfflineStream::Impl { | @@ -57,6 +57,27 @@ class OfflineStream::Impl { | ||
| 57 | explicit Impl(const FeatureExtractorConfig &config, | 57 | explicit Impl(const FeatureExtractorConfig &config, |
| 58 | ContextGraphPtr context_graph) | 58 | ContextGraphPtr context_graph) |
| 59 | : config_(config), context_graph_(context_graph) { | 59 | : config_(config), context_graph_(context_graph) { |
| 60 | + if (config.is_mfcc) { | ||
| 61 | + mfcc_opts_.frame_opts.dither = config_.dither; | ||
| 62 | + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges; | ||
| 63 | + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate; | ||
| 64 | + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; | ||
| 65 | + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms; | ||
| 66 | + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; | ||
| 67 | + mfcc_opts_.frame_opts.window_type = config_.window_type; | ||
| 68 | + | ||
| 69 | + mfcc_opts_.mel_opts.num_bins = config_.feature_dim; | ||
| 70 | + | ||
| 71 | + mfcc_opts_.mel_opts.high_freq = config_.high_freq; | ||
| 72 | + mfcc_opts_.mel_opts.low_freq = config_.low_freq; | ||
| 73 | + | ||
| 74 | + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa; | ||
| 75 | + | ||
| 76 | + mfcc_opts_.num_ceps = config_.num_ceps; | ||
| 77 | + mfcc_opts_.use_energy = config_.use_energy; | ||
| 78 | + | ||
| 79 | + mfcc_ = std::make_unique<knf::OnlineMfcc>(mfcc_opts_); | ||
| 80 | + } else { | ||
| 60 | opts_.frame_opts.dither = config.dither; | 81 | opts_.frame_opts.dither = config.dither; |
| 61 | opts_.frame_opts.snip_edges = config.snip_edges; | 82 | opts_.frame_opts.snip_edges = config.snip_edges; |
| 62 | opts_.frame_opts.samp_freq = config.sampling_rate; | 83 | opts_.frame_opts.samp_freq = config.sampling_rate; |
| @@ -74,6 +95,7 @@ class OfflineStream::Impl { | @@ -74,6 +95,7 @@ class OfflineStream::Impl { | ||
| 74 | 95 | ||
| 75 | fbank_ = std::make_unique<knf::OnlineFbank>(opts_); | 96 | fbank_ = std::make_unique<knf::OnlineFbank>(opts_); |
| 76 | } | 97 | } |
| 98 | + } | ||
| 77 | 99 | ||
| 78 | explicit Impl(WhisperTag /*tag*/) { | 100 | explicit Impl(WhisperTag /*tag*/) { |
| 79 | config_.normalize_samples = true; | 101 | config_.normalize_samples = true; |
| @@ -81,6 +103,7 @@ class OfflineStream::Impl { | @@ -81,6 +103,7 @@ class OfflineStream::Impl { | ||
| 81 | opts_.mel_opts.num_bins = 80; // not used | 103 | opts_.mel_opts.num_bins = 80; // not used |
| 82 | whisper_fbank_ = | 104 | whisper_fbank_ = |
| 83 | std::make_unique<knf::OnlineWhisperFbank>(opts_.frame_opts); | 105 | std::make_unique<knf::OnlineWhisperFbank>(opts_.frame_opts); |
| 106 | + config_.sampling_rate = opts_.frame_opts.samp_freq; | ||
| 84 | } | 107 | } |
| 85 | 108 | ||
| 86 | explicit Impl(CEDTag /*tag*/) { | 109 | explicit Impl(CEDTag /*tag*/) { |
| @@ -98,6 +121,8 @@ class OfflineStream::Impl { | @@ -98,6 +121,8 @@ class OfflineStream::Impl { | ||
| 98 | opts_.mel_opts.num_bins = 64; | 121 | opts_.mel_opts.num_bins = 64; |
| 99 | opts_.mel_opts.high_freq = 8000; | 122 | opts_.mel_opts.high_freq = 8000; |
| 100 | 123 | ||
| 124 | + config_.sampling_rate = opts_.frame_opts.samp_freq; | ||
| 125 | + | ||
| 101 | fbank_ = std::make_unique<knf::OnlineFbank>(opts_); | 126 | fbank_ = std::make_unique<knf::OnlineFbank>(opts_); |
| 102 | } | 127 | } |
| 103 | 128 | ||
| @@ -115,52 +140,60 @@ class OfflineStream::Impl { | @@ -115,52 +140,60 @@ class OfflineStream::Impl { | ||
| 115 | 140 | ||
| 116 | void AcceptWaveformImpl(int32_t sampling_rate, const float *waveform, | 141 | void AcceptWaveformImpl(int32_t sampling_rate, const float *waveform, |
| 117 | int32_t n) { | 142 | int32_t n) { |
| 118 | - if (sampling_rate != opts_.frame_opts.samp_freq) { | 143 | + if (sampling_rate != config_.sampling_rate) { |
| 119 | SHERPA_ONNX_LOGE( | 144 | SHERPA_ONNX_LOGE( |
| 120 | "Creating a resampler:\n" | 145 | "Creating a resampler:\n" |
| 121 | " in_sample_rate: %d\n" | 146 | " in_sample_rate: %d\n" |
| 122 | " output_sample_rate: %d\n", | 147 | " output_sample_rate: %d\n", |
| 123 | - sampling_rate, static_cast<int32_t>(opts_.frame_opts.samp_freq)); | 148 | + sampling_rate, static_cast<int32_t>(config_.sampling_rate)); |
| 124 | 149 | ||
| 125 | - float min_freq = | ||
| 126 | - std::min<int32_t>(sampling_rate, opts_.frame_opts.samp_freq); | 150 | + float min_freq = std::min<int32_t>(sampling_rate, config_.sampling_rate); |
| 127 | float lowpass_cutoff = 0.99 * 0.5 * min_freq; | 151 | float lowpass_cutoff = 0.99 * 0.5 * min_freq; |
| 128 | 152 | ||
| 129 | int32_t lowpass_filter_width = 6; | 153 | int32_t lowpass_filter_width = 6; |
| 130 | auto resampler = std::make_unique<LinearResample>( | 154 | auto resampler = std::make_unique<LinearResample>( |
| 131 | - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, | 155 | + sampling_rate, config_.sampling_rate, lowpass_cutoff, |
| 132 | lowpass_filter_width); | 156 | lowpass_filter_width); |
| 133 | std::vector<float> samples; | 157 | std::vector<float> samples; |
| 134 | resampler->Resample(waveform, n, true, &samples); | 158 | resampler->Resample(waveform, n, true, &samples); |
| 135 | 159 | ||
| 136 | if (fbank_) { | 160 | if (fbank_) { |
| 137 | - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), | 161 | + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), |
| 138 | samples.size()); | 162 | samples.size()); |
| 139 | fbank_->InputFinished(); | 163 | fbank_->InputFinished(); |
| 164 | + } else if (mfcc_) { | ||
| 165 | + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), | ||
| 166 | + samples.size()); | ||
| 167 | + mfcc_->InputFinished(); | ||
| 140 | } else { | 168 | } else { |
| 141 | - whisper_fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, | ||
| 142 | - samples.data(), samples.size()); | 169 | + whisper_fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), |
| 170 | + samples.size()); | ||
| 143 | whisper_fbank_->InputFinished(); | 171 | whisper_fbank_->InputFinished(); |
| 144 | } | 172 | } |
| 145 | 173 | ||
| 146 | return; | 174 | return; |
| 147 | - } // if (sampling_rate != opts_.frame_opts.samp_freq) | 175 | + } // if (sampling_rate != config_.sampling_rate) |
| 148 | 176 | ||
| 149 | if (fbank_) { | 177 | if (fbank_) { |
| 150 | fbank_->AcceptWaveform(sampling_rate, waveform, n); | 178 | fbank_->AcceptWaveform(sampling_rate, waveform, n); |
| 151 | fbank_->InputFinished(); | 179 | fbank_->InputFinished(); |
| 180 | + } else if (mfcc_) { | ||
| 181 | + mfcc_->AcceptWaveform(sampling_rate, waveform, n); | ||
| 182 | + mfcc_->InputFinished(); | ||
| 152 | } else { | 183 | } else { |
| 153 | whisper_fbank_->AcceptWaveform(sampling_rate, waveform, n); | 184 | whisper_fbank_->AcceptWaveform(sampling_rate, waveform, n); |
| 154 | whisper_fbank_->InputFinished(); | 185 | whisper_fbank_->InputFinished(); |
| 155 | } | 186 | } |
| 156 | } | 187 | } |
| 157 | 188 | ||
| 158 | - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } | 189 | + int32_t FeatureDim() const { |
| 190 | + return mfcc_ ? mfcc_opts_.num_ceps : opts_.mel_opts.num_bins; | ||
| 191 | + } | ||
| 159 | 192 | ||
| 160 | std::vector<float> GetFrames() const { | 193 | std::vector<float> GetFrames() const { |
| 161 | - int32_t n = | ||
| 162 | - fbank_ ? fbank_->NumFramesReady() : whisper_fbank_->NumFramesReady(); | ||
| 163 | - | 194 | + int32_t n = fbank_ ? fbank_->NumFramesReady() |
| 195 | + : mfcc_ ? mfcc_->NumFramesReady() | ||
| 196 | + : whisper_fbank_->NumFramesReady(); | ||
| 164 | assert(n > 0 && "Please first call AcceptWaveform()"); | 197 | assert(n > 0 && "Please first call AcceptWaveform()"); |
| 165 | 198 | ||
| 166 | int32_t feature_dim = FeatureDim(); | 199 | int32_t feature_dim = FeatureDim(); |
| @@ -170,8 +203,9 @@ class OfflineStream::Impl { | @@ -170,8 +203,9 @@ class OfflineStream::Impl { | ||
| 170 | float *p = features.data(); | 203 | float *p = features.data(); |
| 171 | 204 | ||
| 172 | for (int32_t i = 0; i != n; ++i) { | 205 | for (int32_t i = 0; i != n; ++i) { |
| 173 | - const float *f = | ||
| 174 | - fbank_ ? fbank_->GetFrame(i) : whisper_fbank_->GetFrame(i); | 206 | + const float *f = fbank_ ? fbank_->GetFrame(i) |
| 207 | + : mfcc_ ? mfcc_->GetFrame(i) | ||
| 208 | + : whisper_fbank_->GetFrame(i); | ||
| 175 | std::copy(f, f + feature_dim, p); | 209 | std::copy(f, f + feature_dim, p); |
| 176 | p += feature_dim; | 210 | p += feature_dim; |
| 177 | } | 211 | } |
| @@ -222,8 +256,10 @@ class OfflineStream::Impl { | @@ -222,8 +256,10 @@ class OfflineStream::Impl { | ||
| 222 | private: | 256 | private: |
| 223 | FeatureExtractorConfig config_; | 257 | FeatureExtractorConfig config_; |
| 224 | std::unique_ptr<knf::OnlineFbank> fbank_; | 258 | std::unique_ptr<knf::OnlineFbank> fbank_; |
| 259 | + std::unique_ptr<knf::OnlineMfcc> mfcc_; | ||
| 225 | std::unique_ptr<knf::OnlineWhisperFbank> whisper_fbank_; | 260 | std::unique_ptr<knf::OnlineWhisperFbank> whisper_fbank_; |
| 226 | knf::FbankOptions opts_; | 261 | knf::FbankOptions opts_; |
| 262 | + knf::MfccOptions mfcc_opts_; | ||
| 227 | OfflineRecognitionResult r_; | 263 | OfflineRecognitionResult r_; |
| 228 | ContextGraphPtr context_graph_; | 264 | ContextGraphPtr context_graph_; |
| 229 | }; | 265 | }; |
| 1 | +// sherpa-onnx/csrc/offline-telespeech-ctc-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023-2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h" | ||
| 6 | + | ||
| 7 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 8 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 9 | +#include "sherpa-onnx/csrc/session.h" | ||
| 10 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 11 | +#include "sherpa-onnx/csrc/transpose.h" | ||
| 12 | + | ||
| 13 | +namespace sherpa_onnx { | ||
| 14 | + | ||
| 15 | +class OfflineTeleSpeechCtcModel::Impl { | ||
| 16 | + public: | ||
| 17 | + explicit Impl(const OfflineModelConfig &config) | ||
| 18 | + : config_(config), | ||
| 19 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 20 | + sess_opts_(GetSessionOptions(config)), | ||
| 21 | + allocator_{} { | ||
| 22 | + auto buf = ReadFile(config_.telespeech_ctc); | ||
| 23 | + Init(buf.data(), buf.size()); | ||
| 24 | + } | ||
| 25 | + | ||
| 26 | +#if __ANDROID_API__ >= 9 | ||
| 27 | + Impl(AAssetManager *mgr, const OfflineModelConfig &config) | ||
| 28 | + : config_(config), | ||
| 29 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 30 | + sess_opts_(GetSessionOptions(config)), | ||
| 31 | + allocator_{} { | ||
| 32 | + auto buf = ReadFile(mgr, config_.telespeech_ctc); | ||
| 33 | + Init(buf.data(), buf.size()); | ||
| 34 | + } | ||
| 35 | +#endif | ||
| 36 | + | ||
| 37 | + std::vector<Ort::Value> Forward(Ort::Value features, | ||
| 38 | + Ort::Value /*features_length*/) { | ||
| 39 | + std::vector<int64_t> shape = | ||
| 40 | + features.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 41 | + | ||
| 42 | + if (static_cast<int32_t>(shape[0]) != 1) { | ||
| 43 | + SHERPA_ONNX_LOGE("This model supports only batch size 1. Given %d", | ||
| 44 | + static_cast<int32_t>(shape[0])); | ||
| 45 | + } | ||
| 46 | + | ||
| 47 | + auto out = sess_->Run({}, input_names_ptr_.data(), &features, 1, | ||
| 48 | + output_names_ptr_.data(), output_names_ptr_.size()); | ||
| 49 | + | ||
| 50 | + std::vector<int64_t> logits_shape = {1}; | ||
| 51 | + Ort::Value logits_length = Ort::Value::CreateTensor<int64_t>( | ||
| 52 | + allocator_, logits_shape.data(), logits_shape.size()); | ||
| 53 | + | ||
| 54 | + int64_t *dst = logits_length.GetTensorMutableData<int64_t>(); | ||
| 55 | + dst[0] = out[0].GetTensorTypeAndShapeInfo().GetShape()[0]; | ||
| 56 | + | ||
| 57 | + // (T, B, C) -> (B, T, C) | ||
| 58 | + Ort::Value logits = Transpose01(allocator_, &out[0]); | ||
| 59 | + | ||
| 60 | + std::vector<Ort::Value> ans; | ||
| 61 | + ans.reserve(2); | ||
| 62 | + ans.push_back(std::move(logits)); | ||
| 63 | + ans.push_back(std::move(logits_length)); | ||
| 64 | + | ||
| 65 | + return ans; | ||
| 66 | + } | ||
| 67 | + | ||
| 68 | + int32_t VocabSize() const { return vocab_size_; } | ||
| 69 | + | ||
| 70 | + int32_t SubsamplingFactor() const { return subsampling_factor_; } | ||
| 71 | + | ||
| 72 | + OrtAllocator *Allocator() const { return allocator_; } | ||
| 73 | + | ||
| 74 | + private: | ||
| 75 | + void Init(void *model_data, size_t model_data_length) { | ||
| 76 | + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | ||
| 77 | + sess_opts_); | ||
| 78 | + | ||
| 79 | + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); | ||
| 80 | + | ||
| 81 | + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); | ||
| 82 | + | ||
| 83 | + // get meta data | ||
| 84 | + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); | ||
| 85 | + if (config_.debug) { | ||
| 86 | + std::ostringstream os; | ||
| 87 | + PrintModelMetadata(os, meta_data); | ||
| 88 | + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + { | ||
| 92 | + auto shape = | ||
| 93 | + sess_->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); | ||
| 94 | + vocab_size_ = shape[2]; | ||
| 95 | + } | ||
| 96 | + } | ||
| 97 | + | ||
| 98 | + private: | ||
| 99 | + OfflineModelConfig config_; | ||
| 100 | + Ort::Env env_; | ||
| 101 | + Ort::SessionOptions sess_opts_; | ||
| 102 | + Ort::AllocatorWithDefaultOptions allocator_; | ||
| 103 | + | ||
| 104 | + std::unique_ptr<Ort::Session> sess_; | ||
| 105 | + | ||
| 106 | + std::vector<std::string> input_names_; | ||
| 107 | + std::vector<const char *> input_names_ptr_; | ||
| 108 | + | ||
| 109 | + std::vector<std::string> output_names_; | ||
| 110 | + std::vector<const char *> output_names_ptr_; | ||
| 111 | + | ||
| 112 | + int32_t vocab_size_ = 0; | ||
| 113 | + int32_t subsampling_factor_ = 4; | ||
| 114 | +}; | ||
| 115 | + | ||
| 116 | +OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel( | ||
| 117 | + const OfflineModelConfig &config) | ||
| 118 | + : impl_(std::make_unique<Impl>(config)) {} | ||
| 119 | + | ||
| 120 | +#if __ANDROID_API__ >= 9 | ||
| 121 | +OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel( | ||
| 122 | + AAssetManager *mgr, const OfflineModelConfig &config) | ||
| 123 | + : impl_(std::make_unique<Impl>(mgr, config)) {} | ||
| 124 | +#endif | ||
| 125 | + | ||
| 126 | +OfflineTeleSpeechCtcModel::~OfflineTeleSpeechCtcModel() = default; | ||
| 127 | + | ||
| 128 | +std::vector<Ort::Value> OfflineTeleSpeechCtcModel::Forward( | ||
| 129 | + Ort::Value features, Ort::Value features_length) { | ||
| 130 | + return impl_->Forward(std::move(features), std::move(features_length)); | ||
| 131 | +} | ||
| 132 | + | ||
| 133 | +int32_t OfflineTeleSpeechCtcModel::VocabSize() const { | ||
| 134 | + return impl_->VocabSize(); | ||
| 135 | +} | ||
| 136 | +int32_t OfflineTeleSpeechCtcModel::SubsamplingFactor() const { | ||
| 137 | + return impl_->SubsamplingFactor(); | ||
| 138 | +} | ||
| 139 | + | ||
| 140 | +OrtAllocator *OfflineTeleSpeechCtcModel::Allocator() const { | ||
| 141 | + return impl_->Allocator(); | ||
| 142 | +} | ||
| 143 | + | ||
| 144 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/csrc/offline-telespeech-ctc-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ | ||
| 6 | +#include <memory> | ||
| 7 | +#include <string> | ||
| 8 | +#include <utility> | ||
| 9 | +#include <vector> | ||
| 10 | + | ||
| 11 | +#if __ANDROID_API__ >= 9 | ||
| 12 | +#include "android/asset_manager.h" | ||
| 13 | +#include "android/asset_manager_jni.h" | ||
| 14 | +#endif | ||
| 15 | + | ||
| 16 | +#include "onnxruntime_cxx_api.h" // NOLINT | ||
| 17 | +#include "sherpa-onnx/csrc/offline-ctc-model.h" | ||
| 18 | +#include "sherpa-onnx/csrc/offline-model-config.h" | ||
| 19 | + | ||
| 20 | +namespace sherpa_onnx { | ||
| 21 | + | ||
| 22 | +/** This class implements the CTC model from | ||
| 23 | + * https://github.com/Tele-AI/TeleSpeech-ASR. | ||
| 24 | + * | ||
| 25 | + * See | ||
| 26 | + * https://github.com/lovemefan/telespeech-asr-python/blob/main/telespeechasr/onnx/onnx_infer.py | ||
| 27 | + * and | ||
| 28 | + * https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/test.py | ||
| 29 | + */ | ||
| 30 | +class OfflineTeleSpeechCtcModel : public OfflineCtcModel { | ||
| 31 | + public: | ||
| 32 | + explicit OfflineTeleSpeechCtcModel(const OfflineModelConfig &config); | ||
| 33 | + | ||
| 34 | +#if __ANDROID_API__ >= 9 | ||
| 35 | + OfflineTeleSpeechCtcModel(AAssetManager *mgr, | ||
| 36 | + const OfflineModelConfig &config); | ||
| 37 | +#endif | ||
| 38 | + | ||
| 39 | + ~OfflineTeleSpeechCtcModel() override; | ||
| 40 | + | ||
| 41 | + /** Run the forward method of the model. | ||
| 42 | + * | ||
| 43 | + * @param features A tensor of shape (N, T, C). | ||
| 44 | + * @param features_length A 1-D tensor of shape (N,) containing number of | ||
| 45 | + * valid frames in `features` before padding. | ||
| 46 | + * Its dtype is int64_t. | ||
| 47 | + * | ||
| 48 | + * @return Return a vector containing: | ||
| 49 | + * - log_probs: A 3-D tensor of shape (N, T', vocab_size). | ||
| 50 | + * - log_probs_length A 1-D tensor of shape (N,). Its dtype is int64_t | ||
| 51 | + */ | ||
| 52 | + std::vector<Ort::Value> Forward(Ort::Value features, | ||
| 53 | + Ort::Value features_length) override; | ||
| 54 | + | ||
| 55 | + /** Return the vocabulary size of the model | ||
| 56 | + */ | ||
| 57 | + int32_t VocabSize() const override; | ||
| 58 | + | ||
| 59 | + /** SubsamplingFactor of the model | ||
| 60 | + */ | ||
| 61 | + int32_t SubsamplingFactor() const override; | ||
| 62 | + | ||
| 63 | + /** Return an allocator for allocating memory | ||
| 64 | + */ | ||
| 65 | + OrtAllocator *Allocator() const override; | ||
| 66 | + | ||
| 67 | + // TeleSpeech CTC models do not support batch size > 1 | ||
| 68 | + bool SupportBatchProcessing() const override { return false; } | ||
| 69 | + | ||
| 70 | + std::string FeatureNormalizationMethod() const override { | ||
| 71 | + return "per_feature"; | ||
| 72 | + } | ||
| 73 | + | ||
| 74 | + private: | ||
| 75 | + class Impl; | ||
| 76 | + std::unique_ptr<Impl> impl_; | ||
| 77 | +}; | ||
| 78 | + | ||
| 79 | +} // namespace sherpa_onnx | ||
| 80 | + | ||
| 81 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ |
| @@ -66,7 +66,7 @@ bool OnlineModelConfig::Validate() const { | @@ -66,7 +66,7 @@ bool OnlineModelConfig::Validate() const { | ||
| 66 | if (!modeling_unit.empty() && | 66 | if (!modeling_unit.empty() && |
| 67 | (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { | 67 | (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { |
| 68 | if (!FileExists(bpe_vocab)) { | 68 | if (!FileExists(bpe_vocab)) { |
| 69 | - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); | 69 | + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str()); |
| 70 | return false; | 70 | return false; |
| 71 | } | 71 | } |
| 72 | } | 72 | } |
| @@ -7,6 +7,7 @@ public class OfflineModelConfig { | @@ -7,6 +7,7 @@ public class OfflineModelConfig { | ||
| 7 | private final OfflineParaformerModelConfig paraformer; | 7 | private final OfflineParaformerModelConfig paraformer; |
| 8 | private final OfflineWhisperModelConfig whisper; | 8 | private final OfflineWhisperModelConfig whisper; |
| 9 | private final OfflineNemoEncDecCtcModelConfig nemo; | 9 | private final OfflineNemoEncDecCtcModelConfig nemo; |
| 10 | + private final String teleSpeech; | ||
| 10 | private final String tokens; | 11 | private final String tokens; |
| 11 | private final int numThreads; | 12 | private final int numThreads; |
| 12 | private final boolean debug; | 13 | private final boolean debug; |
| @@ -21,6 +22,7 @@ public class OfflineModelConfig { | @@ -21,6 +22,7 @@ public class OfflineModelConfig { | ||
| 21 | this.paraformer = builder.paraformer; | 22 | this.paraformer = builder.paraformer; |
| 22 | this.whisper = builder.whisper; | 23 | this.whisper = builder.whisper; |
| 23 | this.nemo = builder.nemo; | 24 | this.nemo = builder.nemo; |
| 25 | + this.teleSpeech = builder.teleSpeech; | ||
| 24 | this.tokens = builder.tokens; | 26 | this.tokens = builder.tokens; |
| 25 | this.numThreads = builder.numThreads; | 27 | this.numThreads = builder.numThreads; |
| 26 | this.debug = builder.debug; | 28 | this.debug = builder.debug; |
| @@ -74,11 +76,16 @@ public class OfflineModelConfig { | @@ -74,11 +76,16 @@ public class OfflineModelConfig { | ||
| 74 | return bpeVocab; | 76 | return bpeVocab; |
| 75 | } | 77 | } |
| 76 | 78 | ||
| 79 | + public String getTeleSpeech() { | ||
| 80 | + return teleSpeech; | ||
| 81 | + } | ||
| 82 | + | ||
| 77 | public static class Builder { | 83 | public static class Builder { |
| 78 | private OfflineParaformerModelConfig paraformer = OfflineParaformerModelConfig.builder().build(); | 84 | private OfflineParaformerModelConfig paraformer = OfflineParaformerModelConfig.builder().build(); |
| 79 | private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build(); | 85 | private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build(); |
| 80 | private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build(); | 86 | private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build(); |
| 81 | private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); | 87 | private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); |
| 88 | + private String teleSpeech = ""; | ||
| 82 | private String tokens = ""; | 89 | private String tokens = ""; |
| 83 | private int numThreads = 1; | 90 | private int numThreads = 1; |
| 84 | private boolean debug = true; | 91 | private boolean debug = true; |
| @@ -106,6 +113,12 @@ public class OfflineModelConfig { | @@ -106,6 +113,12 @@ public class OfflineModelConfig { | ||
| 106 | return this; | 113 | return this; |
| 107 | } | 114 | } |
| 108 | 115 | ||
| 116 | + | ||
| 117 | + public Builder setTeleSpeech(String teleSpeech) { | ||
| 118 | + this.teleSpeech = teleSpeech; | ||
| 119 | + return this; | ||
| 120 | + } | ||
| 121 | + | ||
| 109 | public Builder setWhisper(OfflineWhisperModelConfig whisper) { | 122 | public Builder setWhisper(OfflineWhisperModelConfig whisper) { |
| 110 | this.whisper = whisper; | 123 | this.whisper = whisper; |
| 111 | return this; | 124 | return this; |
| @@ -172,6 +172,12 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { | @@ -172,6 +172,12 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { | ||
| 172 | ans.model_config.nemo_ctc.model = p; | 172 | ans.model_config.nemo_ctc.model = p; |
| 173 | env->ReleaseStringUTFChars(s, p); | 173 | env->ReleaseStringUTFChars(s, p); |
| 174 | 174 | ||
| 175 | + fid = env->GetFieldID(model_config_cls, "teleSpeech", "Ljava/lang/String;"); | ||
| 176 | + s = (jstring)env->GetObjectField(model_config, fid); | ||
| 177 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 178 | + ans.model_config.telespeech_ctc = p; | ||
| 179 | + env->ReleaseStringUTFChars(s, p); | ||
| 180 | + | ||
| 175 | return ans; | 181 | return ans; |
| 176 | } | 182 | } |
| 177 | 183 |
| @@ -35,6 +35,7 @@ data class OfflineModelConfig( | @@ -35,6 +35,7 @@ data class OfflineModelConfig( | ||
| 35 | var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(), | 35 | var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(), |
| 36 | var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(), | 36 | var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(), |
| 37 | var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), | 37 | var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), |
| 38 | + var teleSpeech: String = "", | ||
| 38 | var numThreads: Int = 1, | 39 | var numThreads: Int = 1, |
| 39 | var debug: Boolean = false, | 40 | var debug: Boolean = false, |
| 40 | var provider: String = "cpu", | 41 | var provider: String = "cpu", |
| @@ -272,6 +273,15 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? { | @@ -272,6 +273,15 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? { | ||
| 272 | tokens = "$modelDir/tokens.txt", | 273 | tokens = "$modelDir/tokens.txt", |
| 273 | ) | 274 | ) |
| 274 | } | 275 | } |
| 276 | + | ||
| 277 | + 11 -> { | ||
| 278 | + val modelDir = "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04" | ||
| 279 | + return OfflineModelConfig( | ||
| 280 | + teleSpeech = "$modelDir/model.int8.onnx", | ||
| 281 | + tokens = "$modelDir/tokens.txt", | ||
| 282 | + modelType = "tele_speech", | ||
| 283 | + ) | ||
| 284 | + } | ||
| 275 | } | 285 | } |
| 276 | return null | 286 | return null |
| 277 | } | 287 | } |
| @@ -29,15 +29,16 @@ void PybindOfflineModelConfig(py::module *m) { | @@ -29,15 +29,16 @@ void PybindOfflineModelConfig(py::module *m) { | ||
| 29 | 29 | ||
| 30 | using PyClass = OfflineModelConfig; | 30 | using PyClass = OfflineModelConfig; |
| 31 | py::class_<PyClass>(*m, "OfflineModelConfig") | 31 | py::class_<PyClass>(*m, "OfflineModelConfig") |
| 32 | - .def(py::init<const OfflineTransducerModelConfig &, | 32 | + .def( |
| 33 | + py::init< | ||
| 34 | + const OfflineTransducerModelConfig &, | ||
| 33 | const OfflineParaformerModelConfig &, | 35 | const OfflineParaformerModelConfig &, |
| 34 | const OfflineNemoEncDecCtcModelConfig &, | 36 | const OfflineNemoEncDecCtcModelConfig &, |
| 35 | - const OfflineWhisperModelConfig &, | ||
| 36 | - const OfflineTdnnModelConfig &, | 37 | + const OfflineWhisperModelConfig &, const OfflineTdnnModelConfig &, |
| 37 | const OfflineZipformerCtcModelConfig &, | 38 | const OfflineZipformerCtcModelConfig &, |
| 38 | const OfflineWenetCtcModelConfig &, const std::string &, | 39 | const OfflineWenetCtcModelConfig &, const std::string &, |
| 39 | - int32_t, bool, const std::string &, const std::string &, | ||
| 40 | - const std::string &, const std::string &>(), | 40 | + const std::string &, int32_t, bool, const std::string &, |
| 41 | + const std::string &, const std::string &, const std::string &>(), | ||
| 41 | py::arg("transducer") = OfflineTransducerModelConfig(), | 42 | py::arg("transducer") = OfflineTransducerModelConfig(), |
| 42 | py::arg("paraformer") = OfflineParaformerModelConfig(), | 43 | py::arg("paraformer") = OfflineParaformerModelConfig(), |
| 43 | py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(), | 44 | py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(), |
| @@ -45,7 +46,8 @@ void PybindOfflineModelConfig(py::module *m) { | @@ -45,7 +46,8 @@ void PybindOfflineModelConfig(py::module *m) { | ||
| 45 | py::arg("tdnn") = OfflineTdnnModelConfig(), | 46 | py::arg("tdnn") = OfflineTdnnModelConfig(), |
| 46 | py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(), | 47 | py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(), |
| 47 | py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), | 48 | py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), |
| 48 | - py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false, | 49 | + py::arg("telespeech_ctc") = "", py::arg("tokens"), |
| 50 | + py::arg("num_threads"), py::arg("debug") = false, | ||
| 49 | py::arg("provider") = "cpu", py::arg("model_type") = "", | 51 | py::arg("provider") = "cpu", py::arg("model_type") = "", |
| 50 | py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "") | 52 | py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "") |
| 51 | .def_readwrite("transducer", &PyClass::transducer) | 53 | .def_readwrite("transducer", &PyClass::transducer) |
| @@ -55,6 +57,7 @@ void PybindOfflineModelConfig(py::module *m) { | @@ -55,6 +57,7 @@ void PybindOfflineModelConfig(py::module *m) { | ||
| 55 | .def_readwrite("tdnn", &PyClass::tdnn) | 57 | .def_readwrite("tdnn", &PyClass::tdnn) |
| 56 | .def_readwrite("zipformer_ctc", &PyClass::zipformer_ctc) | 58 | .def_readwrite("zipformer_ctc", &PyClass::zipformer_ctc) |
| 57 | .def_readwrite("wenet_ctc", &PyClass::wenet_ctc) | 59 | .def_readwrite("wenet_ctc", &PyClass::wenet_ctc) |
| 60 | + .def_readwrite("telespeech_ctc", &PyClass::telespeech_ctc) | ||
| 58 | .def_readwrite("tokens", &PyClass::tokens) | 61 | .def_readwrite("tokens", &PyClass::tokens) |
| 59 | .def_readwrite("num_threads", &PyClass::num_threads) | 62 | .def_readwrite("num_threads", &PyClass::num_threads) |
| 60 | .def_readwrite("debug", &PyClass::debug) | 63 | .def_readwrite("debug", &PyClass::debug) |
| @@ -212,6 +212,71 @@ class OfflineRecognizer(object): | @@ -212,6 +212,71 @@ class OfflineRecognizer(object): | ||
| 212 | return self | 212 | return self |
| 213 | 213 | ||
| 214 | @classmethod | 214 | @classmethod |
| 215 | + def from_telespeech_ctc( | ||
| 216 | + cls, | ||
| 217 | + model: str, | ||
| 218 | + tokens: str, | ||
| 219 | + num_threads: int = 1, | ||
| 220 | + sample_rate: int = 16000, | ||
| 221 | + feature_dim: int = 40, | ||
| 222 | + decoding_method: str = "greedy_search", | ||
| 223 | + debug: bool = False, | ||
| 224 | + provider: str = "cpu", | ||
| 225 | + ): | ||
| 226 | + """ | ||
| 227 | + Please refer to | ||
| 228 | + `<https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models>`_ | ||
| 229 | + to download pre-trained models. | ||
| 230 | + | ||
| 231 | + Args: | ||
| 232 | + model: | ||
| 233 | + Path to ``model.onnx``. | ||
| 234 | + tokens: | ||
| 235 | + Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two | ||
| 236 | + columns:: | ||
| 237 | + | ||
| 238 | + symbol integer_id | ||
| 239 | + | ||
| 240 | + num_threads: | ||
| 241 | + Number of threads for neural network computation. | ||
| 242 | + sample_rate: | ||
| 243 | + Sample rate of the training data used to train the model. It is | ||
| 244 | + ignored and is hard-coded in C++. | ||
| 245 | + feature_dim: | ||
| 246 | + Dimension of the feature used to train the model. It is ignored | ||
| 247 | + and is hard-coded in C++ to 40. | ||
| 248 | + decoding_method: | ||
| 249 | + The only valid value is greedy_search. | ||
| 250 | + debug: | ||
| 251 | + True to show debug messages. | ||
| 252 | + provider: | ||
| 253 | + onnxruntime execution providers. Valid values are: cpu, cuda, coreml. | ||
| 254 | + """ | ||
| 255 | + self = cls.__new__(cls) | ||
| 256 | + model_config = OfflineModelConfig( | ||
| 257 | + telespeech_ctc=model, | ||
| 258 | + tokens=tokens, | ||
| 259 | + num_threads=num_threads, | ||
| 260 | + debug=debug, | ||
| 261 | + provider=provider, | ||
| 262 | + model_type="nemo_ctc", | ||
| 263 | + ) | ||
| 264 | + | ||
| 265 | + feat_config = FeatureExtractorConfig( | ||
| 266 | + sampling_rate=sample_rate, | ||
| 267 | + feature_dim=feature_dim, | ||
| 268 | + ) | ||
| 269 | + | ||
| 270 | + recognizer_config = OfflineRecognizerConfig( | ||
| 271 | + feat_config=feat_config, | ||
| 272 | + model_config=model_config, | ||
| 273 | + decoding_method=decoding_method, | ||
| 274 | + ) | ||
| 275 | + self.recognizer = _Recognizer(recognizer_config) | ||
| 276 | + self.config = recognizer_config | ||
| 277 | + return self | ||
| 278 | + | ||
| 279 | + @classmethod | ||
| 215 | def from_nemo_ctc( | 280 | def from_nemo_ctc( |
| 216 | cls, | 281 | cls, |
| 217 | model: str, | 282 | model: str, |
| @@ -102,7 +102,7 @@ func sherpaOnnxOnlineModelConfig( | @@ -102,7 +102,7 @@ func sherpaOnnxOnlineModelConfig( | ||
| 102 | debug: Int32(debug), | 102 | debug: Int32(debug), |
| 103 | model_type: toCPointer(modelType), | 103 | model_type: toCPointer(modelType), |
| 104 | modeling_unit: toCPointer(modelingUnit), | 104 | modeling_unit: toCPointer(modelingUnit), |
| 105 | - bpeVocab: toCPointer(bpeVocab) | 105 | + bpe_vocab: toCPointer(bpeVocab) |
| 106 | ) | 106 | ) |
| 107 | } | 107 | } |
| 108 | 108 | ||
| @@ -360,7 +360,8 @@ func sherpaOnnxOfflineModelConfig( | @@ -360,7 +360,8 @@ func sherpaOnnxOfflineModelConfig( | ||
| 360 | debug: Int = 0, | 360 | debug: Int = 0, |
| 361 | modelType: String = "", | 361 | modelType: String = "", |
| 362 | modelingUnit: String = "cjkchar", | 362 | modelingUnit: String = "cjkchar", |
| 363 | - bpeVocab: String = "" | 363 | + bpeVocab: String = "", |
| 364 | + teleSpeechCtc: String = "" | ||
| 364 | ) -> SherpaOnnxOfflineModelConfig { | 365 | ) -> SherpaOnnxOfflineModelConfig { |
| 365 | return SherpaOnnxOfflineModelConfig( | 366 | return SherpaOnnxOfflineModelConfig( |
| 366 | transducer: transducer, | 367 | transducer: transducer, |
| @@ -374,7 +375,8 @@ func sherpaOnnxOfflineModelConfig( | @@ -374,7 +375,8 @@ func sherpaOnnxOfflineModelConfig( | ||
| 374 | provider: toCPointer(provider), | 375 | provider: toCPointer(provider), |
| 375 | model_type: toCPointer(modelType), | 376 | model_type: toCPointer(modelType), |
| 376 | modeling_unit: toCPointer(modelingUnit), | 377 | modeling_unit: toCPointer(modelingUnit), |
| 377 | - bpeVocab: toCPointer(bpeVocab) | 378 | + bpe_vocab: toCPointer(bpeVocab), |
| 379 | + telespeech_ctc: toCPointer(teleSpeechCtc) | ||
| 378 | ) | 380 | ) |
| 379 | } | 381 | } |
| 380 | 382 |
| @@ -529,7 +529,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | @@ -529,7 +529,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | ||
| 529 | const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); | 529 | const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); |
| 530 | 530 | ||
| 531 | const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + | 531 | const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + |
| 532 | - tdnn.len + 7 * 4; | 532 | + tdnn.len + 8 * 4; |
| 533 | const ptr = Module._malloc(len); | 533 | const ptr = Module._malloc(len); |
| 534 | 534 | ||
| 535 | let offset = 0; | 535 | let offset = 0; |
| @@ -553,9 +553,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | @@ -553,9 +553,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | ||
| 553 | const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; | 553 | const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; |
| 554 | const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; | 554 | const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; |
| 555 | const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; | 555 | const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; |
| 556 | + const teleSpeechCtcLen = | ||
| 557 | + Module.lengthBytesUTF8(config.teleSpeechCtc || '') + 1; | ||
| 556 | 558 | ||
| 557 | - const bufferLen = | ||
| 558 | - tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen; | 559 | + const bufferLen = tokensLen + providerLen + modelTypeLen + modelingUnitLen + |
| 560 | + bpeVocabLen + teleSpeechCtcLen; | ||
| 559 | const buffer = Module._malloc(bufferLen); | 561 | const buffer = Module._malloc(bufferLen); |
| 560 | 562 | ||
| 561 | offset = 0; | 563 | offset = 0; |
| @@ -575,6 +577,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | @@ -575,6 +577,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | ||
| 575 | Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); | 577 | Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); |
| 576 | offset += bpeVocabLen; | 578 | offset += bpeVocabLen; |
| 577 | 579 | ||
| 580 | + Module.stringToUTF8( | ||
| 581 | + config.teleSpeechCtc || '', buffer + offset, teleSpeechCtcLen); | ||
| 582 | + offset += teleSpeechCtcLen; | ||
| 583 | + | ||
| 578 | offset = | 584 | offset = |
| 579 | transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len; | 585 | transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len; |
| 580 | Module.setValue(ptr + offset, buffer, 'i8*'); // tokens | 586 | Module.setValue(ptr + offset, buffer, 'i8*'); // tokens |
| @@ -604,6 +610,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | @@ -604,6 +610,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { | ||
| 604 | 'i8*'); // bpeVocab | 610 | 'i8*'); // bpeVocab |
| 605 | offset += 4; | 611 | offset += 4; |
| 606 | 612 | ||
| 613 | + Module.setValue( | ||
| 614 | + ptr + offset, | ||
| 615 | + buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen + | ||
| 616 | + bpeVocabLen, | ||
| 617 | + 'i8*'); // teleSpeechCtc | ||
| 618 | + offset += 4; | ||
| 619 | + | ||
| 607 | return { | 620 | return { |
| 608 | buffer: buffer, ptr: ptr, len: len, transducer: transducer, | 621 | buffer: buffer, ptr: ptr, len: len, transducer: transducer, |
| 609 | paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn | 622 | paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn |
| @@ -23,7 +23,7 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == | @@ -23,7 +23,7 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == | ||
| 23 | sizeof(SherpaOnnxOfflineParaformerModelConfig) + | 23 | sizeof(SherpaOnnxOfflineParaformerModelConfig) + |
| 24 | sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + | 24 | sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + |
| 25 | sizeof(SherpaOnnxOfflineWhisperModelConfig) + | 25 | sizeof(SherpaOnnxOfflineWhisperModelConfig) + |
| 26 | - sizeof(SherpaOnnxOfflineTdnnModelConfig) + 7 * 4, | 26 | + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4, |
| 27 | ""); | 27 | ""); |
| 28 | static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); | 28 | static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); |
| 29 | static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == | 29 | static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == |
| @@ -92,6 +92,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { | @@ -92,6 +92,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { | ||
| 92 | fprintf(stdout, "model type: %s\n", model_config->model_type); | 92 | fprintf(stdout, "model type: %s\n", model_config->model_type); |
| 93 | fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit); | 93 | fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit); |
| 94 | fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab); | 94 | fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab); |
| 95 | + fprintf(stdout, "telespeech_ctc: %s\n", model_config->telespeech_ctc); | ||
| 95 | 96 | ||
| 96 | fprintf(stdout, "----------feat config----------\n"); | 97 | fprintf(stdout, "----------feat config----------\n"); |
| 97 | fprintf(stdout, "sample rate: %d\n", feat->sample_rate); | 98 | fprintf(stdout, "sample rate: %d\n", feat->sample_rate); |
-
请 注册 或 登录 后发表评论