Fangjun Kuang
Committed by GitHub

Add C++ runtime for Tele-AI/TeleSpeech-ASR (#970)

正在显示 52 个修改的文件 包含 1019 行增加112 行删除
@@ -2,7 +2,16 @@ @@ -2,7 +2,16 @@
2 2
3 cd dotnet-examples/ 3 cd dotnet-examples/
4 4
5 -cd vad-non-streaming-asr-paraformer 5 +cd ./offline-decode-files
  6 +./run-telespeech-ctc.sh
  7 +./run-nemo-ctc.sh
  8 +./run-paraformer.sh
  9 +./run-zipformer.sh
  10 +./run-hotwords.sh
  11 +./run-whisper.sh
  12 +./run-tdnn-yesno.sh
  13 +
  14 +cd ../vad-non-streaming-asr-paraformer
6 ./run.sh 15 ./run.sh
7 16
8 cd ../offline-punctuation 17 cd ../offline-punctuation
@@ -22,14 +31,6 @@ cd ../online-decode-files @@ -22,14 +31,6 @@ cd ../online-decode-files
22 ./run-transducer.sh 31 ./run-transducer.sh
23 ./run-paraformer.sh 32 ./run-paraformer.sh
24 33
25 -cd ../offline-decode-files  
26 -./run-nemo-ctc.sh  
27 -./run-paraformer.sh  
28 -./run-zipformer.sh  
29 -./run-hotwords.sh  
30 -./run-whisper.sh  
31 -./run-tdnn-yesno.sh  
32 -  
33 cd ../offline-tts 34 cd ../offline-tts
34 ./run-aishell3.sh 35 ./run-aishell3.sh
35 ./run-piper.sh 36 ./run-piper.sh
@@ -15,6 +15,39 @@ echo "PATH: $PATH" @@ -15,6 +15,39 @@ echo "PATH: $PATH"
15 15
16 which $EXE 16 which $EXE
17 17
  18 +log "test offline TeleSpeech CTC"
  19 +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  20 +name=$(basename $url)
  21 +repo=$(basename -s .tar.bz2 $name)
  22 +
  23 +curl -SL -O $url
  24 +tar xvf $name
  25 +rm $name
  26 +ls -lh $repo
  27 +
  28 +test_wavs=(
  29 +3-sichuan.wav
  30 +4-tianjin.wav
  31 +5-henan.wav
  32 +)
  33 +for w in ${test_wavs[@]}; do
  34 + time $EXE \
  35 + --tokens=$repo/tokens.txt \
  36 + --telespeech-ctc=$repo/model.int8.onnx \
  37 + --debug=1 \
  38 + $repo/test_wavs/$w
  39 +done
  40 +
  41 +time $EXE \
  42 + --tokens=$repo/tokens.txt \
  43 + --telespeech-ctc=$repo/model.int8.onnx \
  44 + --debug=1 \
  45 + $repo/test_wavs/3-sichuan.wav \
  46 + $repo/test_wavs/4-tianjin.wav \
  47 + $repo/test_wavs/5-henan.wav
  48 +
  49 +rm -rf $repo
  50 +
18 log "-----------------------------------------------------------------" 51 log "-----------------------------------------------------------------"
19 log "Run Nemo fast conformer hybrid transducer ctc models (CTC branch)" 52 log "Run Nemo fast conformer hybrid transducer ctc models (CTC branch)"
20 log "-----------------------------------------------------------------" 53 log "-----------------------------------------------------------------"
@@ -10,6 +10,18 @@ log() { @@ -10,6 +10,18 @@ log() {
10 10
11 export GIT_CLONE_PROTECTION_ACTIVE=false 11 export GIT_CLONE_PROTECTION_ACTIVE=false
12 12
  13 +log "test offline TeleSpeech CTC"
  14 +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  15 +name=$(basename $url)
  16 +repo=$(basename -s .tar.bz2 $name)
  17 +
  18 +curl -SL -O $url
  19 +tar xvf $name
  20 +rm $name
  21 +ls -lh $repo
  22 +python3 ./python-api-examples/offline-telespeech-ctc-decode-files.py
  23 +rm -rf $repo
  24 +
13 log "test online NeMo CTC" 25 log "test online NeMo CTC"
14 26
15 url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2 27 url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2
@@ -82,7 +82,7 @@ jobs: @@ -82,7 +82,7 @@ jobs:
82 TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 82 TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
83 TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 83 TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
84 run: | 84 run: |
85 - python3 -m pip install --upgrade pip  
86 - python3 -m pip install wheel twine setuptools 85 + python3 -m pip install --break-system-packages --upgrade pip
  86 + python3 -m pip install --break-system-packages wheel twine setuptools
87 87
88 twine upload ./wheelhouse/*.whl 88 twine upload ./wheelhouse/*.whl
  1 +name: build-wheels-macos-universal2
  2 +
  3 +on:
  4 + push:
  5 + branches:
  6 + - wheel
  7 + tags:
  8 + - '*'
  9 + workflow_dispatch:
  10 +
  11 +env:
  12 + SHERPA_ONNX_IS_IN_GITHUB_ACTIONS: 1
  13 +
  14 +concurrency:
  15 + group: build-wheels-macos-universal2-${{ github.ref }}
  16 + cancel-in-progress: true
  17 +
  18 +jobs:
  19 + build_wheels_macos_universal2:
  20 + name: ${{ matrix.python-version }}
  21 + runs-on: ${{ matrix.os }}
  22 + strategy:
  23 + fail-fast: false
  24 + matrix:
  25 + os: [macos-latest]
  26 + python-version: ["cp38", "cp39", "cp310", "cp311", "cp312"]
  27 +
  28 + steps:
  29 + - uses: actions/checkout@v4
  30 +
  31 + - name: Build wheels
  32 + uses: pypa/cibuildwheel@v2.15.0
  33 + env:
  34 + CIBW_BUILD: "${{ matrix.python-version}}-* "
  35 + CIBW_ENVIRONMENT: SHERPA_ONNX_CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES='arm64;x86_64'"
  36 + CIBW_ARCHS: "universal2"
  37 + CIBW_BUILD_VERBOSITY: 3
  38 +
  39 + # Don't repair macOS wheels
  40 + CIBW_REPAIR_WHEEL_COMMAND_MACOS: ""
  41 +
  42 + - name: Display wheels
  43 + shell: bash
  44 + run: |
  45 + ls -lh ./wheelhouse/
  46 +
  47 + - uses: actions/upload-artifact@v4
  48 + with:
  49 + name: wheel-${{ matrix.python-version }}
  50 + path: ./wheelhouse/*.whl
  51 +
  52 + - name: Publish to huggingface
  53 + if: matrix.python-version == 'cp38'
  54 + env:
  55 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  56 + uses: nick-fields/retry@v3
  57 + with:
  58 + max_attempts: 20
  59 + timeout_seconds: 200
  60 + shell: bash
  61 + command: |
  62 + git config --global user.email "csukuangfj@gmail.com"
  63 + git config --global user.name "Fangjun Kuang"
  64 +
  65 + rm -rf huggingface
  66 + export GIT_LFS_SKIP_SMUDGE=1
  67 + export GIT_CLONE_PROTECTION_ACTIVE=false
  68 +
  69 + git clone https://huggingface.co/csukuangfj/sherpa-onnx-wheels huggingface
  70 + cd huggingface
  71 + git fetch
  72 + git pull
  73 + git merge -m "merge remote" --ff origin main
  74 +
  75 + cp -v ../wheelhouse/*.whl .
  76 +
  77 + git status
  78 + git add .
  79 + git commit -m "add more wheels"
  80 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-wheels main
  81 +
  82 + - name: Publish wheels to PyPI
  83 + env:
  84 + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
  85 + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
  86 + run: |
  87 + python3 -m pip install --break-system-packages --upgrade pip
  88 + python3 -m pip install --break-system-packages wheel twine setuptools
  89 +
  90 + twine upload ./wheelhouse/*.whl
@@ -99,7 +99,7 @@ jobs: @@ -99,7 +99,7 @@ jobs:
99 TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 99 TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
100 TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 100 TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
101 run: | 101 run: |
102 - python3 -m pip install --upgrade pip  
103 - python3 -m pip install wheel twine setuptools 102 + python3 -m pip install --break-system-packages --upgrade pip
  103 + python3 -m pip install --break-system-packages wheel twine setuptools
104 104
105 twine upload ./wheelhouse/*.whl 105 twine upload ./wheelhouse/*.whl
@@ -48,3 +48,49 @@ jobs: @@ -48,3 +48,49 @@ jobs:
48 repo_name: k2-fsa/sherpa-onnx 48 repo_name: k2-fsa/sherpa-onnx
49 repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} 49 repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
50 tag: asr-models 50 tag: asr-models
  51 +
  52 + - name: Publish float32 model to huggingface
  53 + shell: bash
  54 + env:
  55 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  56 + run: |
  57 + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-zh-2024-06-04
  58 + git config --global user.email "csukuangfj@gmail.com"
  59 + git config --global user.name "Fangjun Kuang"
  60 +
  61 + export GIT_CLONE_PROTECTION_ACTIVE=false
  62 +
  63 + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 hf
  64 + cp -a $src/* hf/
  65 + cd hf
  66 + git lfs track "*.pdf"
  67 + git lfs track "*.onnx"
  68 + git add .
  69 + git commit -m 'add model files' || true
  70 + git status
  71 + ls -lh
  72 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 main || true
  73 + rm -rf hf
  74 +
  75 + - name: Publish int8 model to huggingface
  76 + shell: bash
  77 + env:
  78 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  79 + run: |
  80 + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04
  81 + git config --global user.email "csukuangfj@gmail.com"
  82 + git config --global user.name "Fangjun Kuang"
  83 +
  84 + export GIT_CLONE_PROTECTION_ACTIVE=false
  85 +
  86 + rm -rf hf
  87 + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 hf
  88 + cp -a $src/* hf/
  89 + cd hf
  90 + git lfs track "*.pdf"
  91 + git lfs track "*.onnx"
  92 + git add .
  93 + git commit -m 'add model files' || true
  94 + git status
  95 + ls -lh
  96 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 main || true
@@ -130,34 +130,34 @@ jobs: @@ -130,34 +130,34 @@ jobs:
130 name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} 130 name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
131 path: install/* 131 path: install/*
132 132
133 - - name: Test online transducer 133 + - name: Test offline CTC
134 shell: bash 134 shell: bash
135 run: | 135 run: |
136 du -h -d1 . 136 du -h -d1 .
137 export PATH=$PWD/build/bin:$PATH 137 export PATH=$PWD/build/bin:$PATH
138 - export EXE=sherpa-onnx 138 + export EXE=sherpa-onnx-offline
139 139
140 - .github/scripts/test-online-transducer.sh 140 + .github/scripts/test-offline-ctc.sh
141 du -h -d1 . 141 du -h -d1 .
142 142
143 - - name: Test online transducer (C API) 143 + - name: Test online transducer
144 shell: bash 144 shell: bash
145 run: | 145 run: |
146 du -h -d1 . 146 du -h -d1 .
147 export PATH=$PWD/build/bin:$PATH 147 export PATH=$PWD/build/bin:$PATH
148 - export EXE=decode-file-c-api 148 + export EXE=sherpa-onnx
149 149
150 .github/scripts/test-online-transducer.sh 150 .github/scripts/test-online-transducer.sh
151 du -h -d1 . 151 du -h -d1 .
152 152
153 - - name: Test offline CTC 153 + - name: Test online transducer (C API)
154 shell: bash 154 shell: bash
155 run: | 155 run: |
156 du -h -d1 . 156 du -h -d1 .
157 export PATH=$PWD/build/bin:$PATH 157 export PATH=$PWD/build/bin:$PATH
158 - export EXE=sherpa-onnx-offline 158 + export EXE=decode-file-c-api
159 159
160 - .github/scripts/test-offline-ctc.sh 160 + .github/scripts/test-online-transducer.sh
161 du -h -d1 . 161 du -h -d1 .
162 162
163 - name: Test spoken language identification (C++ API) 163 - name: Test spoken language identification (C++ API)
@@ -107,6 +107,14 @@ jobs: @@ -107,6 +107,14 @@ jobs:
107 otool -L build/bin/sherpa-onnx 107 otool -L build/bin/sherpa-onnx
108 otool -l build/bin/sherpa-onnx 108 otool -l build/bin/sherpa-onnx
109 109
  110 + - name: Test offline CTC
  111 + shell: bash
  112 + run: |
  113 + export PATH=$PWD/build/bin:$PATH
  114 + export EXE=sherpa-onnx-offline
  115 +
  116 + .github/scripts/test-offline-ctc.sh
  117 +
110 - name: Test offline transducer 118 - name: Test offline transducer
111 shell: bash 119 shell: bash
112 run: | 120 run: |
@@ -192,13 +200,7 @@ jobs: @@ -192,13 +200,7 @@ jobs:
192 200
193 .github/scripts/test-offline-whisper.sh 201 .github/scripts/test-offline-whisper.sh
194 202
195 - - name: Test offline CTC  
196 - shell: bash  
197 - run: |  
198 - export PATH=$PWD/build/bin:$PATH  
199 - export EXE=sherpa-onnx-offline  
200 203
201 - .github/scripts/test-offline-ctc.sh  
202 204
203 - name: Test online transducer 205 - name: Test online transducer
204 shell: bash 206 shell: bash
@@ -39,7 +39,7 @@ jobs: @@ -39,7 +39,7 @@ jobs:
39 strategy: 39 strategy:
40 fail-fast: false 40 fail-fast: false
41 matrix: 41 matrix:
42 - os: [macos-13] 42 + os: [macos-latest, macos-14]
43 43
44 steps: 44 steps:
45 - uses: actions/checkout@v4 45 - uses: actions/checkout@v4
@@ -30,14 +30,12 @@ concurrency: @@ -30,14 +30,12 @@ concurrency:
30 30
31 jobs: 31 jobs:
32 test-go: 32 test-go:
33 - name: ${{ matrix.os }} ${{matrix.arch }} 33 + name: ${{ matrix.os }}
34 runs-on: ${{ matrix.os }} 34 runs-on: ${{ matrix.os }}
35 strategy: 35 strategy:
36 fail-fast: false 36 fail-fast: false
37 matrix: 37 matrix:
38 - include:  
39 - - os: macos-latest  
40 - arch: amd64 38 + os: [macos-latest, macos-14]
41 39
42 steps: 40 steps:
43 - uses: actions/checkout@v4 41 - uses: actions/checkout@v4
@@ -47,7 +45,7 @@ jobs: @@ -47,7 +45,7 @@ jobs:
47 - name: ccache 45 - name: ccache
48 uses: hendrikmuhs/ccache-action@v1.2 46 uses: hendrikmuhs/ccache-action@v1.2
49 with: 47 with:
50 - key: ${{ matrix.os }}-${{ matrix.arch }} 48 + key: ${{ matrix.os }}-go
51 49
52 - uses: actions/setup-go@v5 50 - uses: actions/setup-go@v5
53 with: 51 with:
@@ -109,8 +107,6 @@ jobs: @@ -109,8 +107,6 @@ jobs:
109 go build 107 go build
110 ls -lh 108 ls -lh
111 109
112 - git lfs install  
113 -  
114 echo "Test vits-ljs" 110 echo "Test vits-ljs"
115 ./run-vits-ljs.sh 111 ./run-vits-ljs.sh
116 rm -rf vits-ljs 112 rm -rf vits-ljs
@@ -144,7 +140,13 @@ jobs: @@ -144,7 +140,13 @@ jobs:
144 go build 140 go build
145 ls -lh 141 ls -lh
146 142
147 - git lfs install 143 + echo "Test telespeech ctc"
  144 + ./run-telespeech-ctc.sh
  145 + rm -rf sherpa-onnx-telespeech-ctc-*
  146 +
  147 + echo "Test transducer"
  148 + ./run-transducer.sh
  149 + rm -rf sherpa-onnx-zipformer-en-2023-06-26
148 150
149 echo "Test transducer" 151 echo "Test transducer"
150 ./run-transducer.sh 152 ./run-transducer.sh
@@ -57,7 +57,7 @@ jobs: @@ -57,7 +57,7 @@ jobs:
57 57
58 mkdir build 58 mkdir build
59 cd build 59 cd build
60 - cmake -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install .. 60 + cmake -DSHERPA_ONNX_ENABLE_EPSEAK_NG_EXE=ON -DBUILD_ESPEAK_NG_EXE=ON -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install ..
61 61
62 - name: Build 62 - name: Build
63 shell: bash 63 shell: bash
@@ -106,3 +106,4 @@ node_modules @@ -106,3 +106,4 @@ node_modules
106 package-lock.json 106 package-lock.json
107 sherpa-onnx-nemo-* 107 sherpa-onnx-nemo-*
108 sherpa-onnx-vits-* 108 sherpa-onnx-vits-*
  109 +sherpa-onnx-telespeech-ctc-*
@@ -6,7 +6,7 @@ set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment ve @@ -6,7 +6,7 @@ set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment ve
6 6
7 project(sherpa-onnx) 7 project(sherpa-onnx)
8 8
9 -set(SHERPA_ONNX_VERSION "1.9.26") 9 +set(SHERPA_ONNX_VERSION "1.9.27")
10 10
11 # Disable warning about 11 # Disable warning about
12 # 12 #
@@ -14,7 +14,9 @@ function(download_espeak_ng_for_piper) @@ -14,7 +14,9 @@ function(download_espeak_ng_for_piper)
14 set(USE_SPEECHPLAYER OFF CACHE BOOL "" FORCE) 14 set(USE_SPEECHPLAYER OFF CACHE BOOL "" FORCE)
15 set(EXTRA_cmn ON CACHE BOOL "" FORCE) 15 set(EXTRA_cmn ON CACHE BOOL "" FORCE)
16 set(EXTRA_ru ON CACHE BOOL "" FORCE) 16 set(EXTRA_ru ON CACHE BOOL "" FORCE)
  17 + if (NOT SHERPA_ONNX_ENABLE_EPSEAK_NG_EXE)
17 set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE) 18 set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE)
  19 + endif()
18 20
19 # If you don't have access to the Internet, 21 # If you don't have access to the Internet,
20 # please pre-download kaldi-decoder 22 # please pre-download kaldi-decoder
1 function(download_kaldi_native_fbank) 1 function(download_kaldi_native_fbank)
2 include(FetchContent) 2 include(FetchContent)
3 3
4 - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz")  
5 - set(kaldi_native_fbank_URL2 "https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz")  
6 - set(kaldi_native_fbank_HASH "SHA256=0cae8cbb9ea42916b214e088912f9e8f2f648f54756b305f93f552382f31f904") 4 + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz")
  5 + set(kaldi_native_fbank_URL2 "https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz")
  6 + set(kaldi_native_fbank_HASH "SHA256=335fe1daf1b9bfb2a7b6bf03b64c4c4686c39077c57fb8058c02611981676638")
7 7
8 set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) 8 set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
9 set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) 9 set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
@@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
12 # If you don't have access to the Internet, 12 # If you don't have access to the Internet,
13 # please pre-download kaldi-native-fbank 13 # please pre-download kaldi-native-fbank
14 set(possible_file_locations 14 set(possible_file_locations
15 - $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.1.tar.gz  
16 - ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.1.tar.gz  
17 - ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.1.tar.gz  
18 - /tmp/kaldi-native-fbank-1.19.1.tar.gz  
19 - /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.1.tar.gz 15 + $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.3.tar.gz
  16 + ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.3.tar.gz
  17 + ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.3.tar.gz
  18 + /tmp/kaldi-native-fbank-1.19.3.tar.gz
  19 + /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.3.tar.gz
20 ) 20 )
21 21
22 foreach(f IN LISTS possible_file_locations) 22 foreach(f IN LISTS possible_file_locations)
@@ -34,6 +34,9 @@ class OfflineDecodeFiles @@ -34,6 +34,9 @@ class OfflineDecodeFiles
34 [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] 34 [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")]
35 public string Joiner { get; set; } 35 public string Joiner { get; set; }
36 36
  37 + [Option("model-type", Required = false, Default = "", HelpText = "model type")]
  38 + public string ModelType { get; set; }
  39 +
37 [Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")] 40 [Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")]
38 public string WhisperEncoder { get; set; } 41 public string WhisperEncoder { get; set; }
39 42
@@ -56,6 +59,9 @@ class OfflineDecodeFiles @@ -56,6 +59,9 @@ class OfflineDecodeFiles
56 [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] 59 [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")]
57 public string NeMoCtc { get; set; } 60 public string NeMoCtc { get; set; }
58 61
  62 + [Option("telespeech-ctc", Required = false, HelpText = "Path to model.onnx. Used only for TeleSpeech CTC models")]
  63 + public string TeleSpeechCtc { get; set; }
  64 +
59 [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")] 65 [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
60 public int NumThreads { get; set; } 66 public int NumThreads { get; set; }
61 67
@@ -201,6 +207,10 @@ to download pre-trained Tdnn models. @@ -201,6 +207,10 @@ to download pre-trained Tdnn models.
201 { 207 {
202 config.ModelConfig.NeMoCtc.Model = options.NeMoCtc; 208 config.ModelConfig.NeMoCtc.Model = options.NeMoCtc;
203 } 209 }
  210 + else if (!String.IsNullOrEmpty(options.TeleSpeechCtc))
  211 + {
  212 + config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc;
  213 + }
204 else if (!String.IsNullOrEmpty(options.WhisperEncoder)) 214 else if (!String.IsNullOrEmpty(options.WhisperEncoder))
205 { 215 {
206 config.ModelConfig.Whisper.Encoder = options.WhisperEncoder; 216 config.ModelConfig.Whisper.Encoder = options.WhisperEncoder;
@@ -218,6 +228,7 @@ to download pre-trained Tdnn models. @@ -218,6 +228,7 @@ to download pre-trained Tdnn models.
218 return; 228 return;
219 } 229 }
220 230
  231 + config.ModelConfig.ModelType = options.ModelType;
221 config.DecodingMethod = options.DecodingMethod; 232 config.DecodingMethod = options.DecodingMethod;
222 config.MaxActivePaths = options.MaxActivePaths; 233 config.MaxActivePaths = options.MaxActivePaths;
223 config.HotwordsFile = options.HotwordsFile; 234 config.HotwordsFile = options.HotwordsFile;
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  7 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  8 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  9 +fi
  10 +
  11 +dotnet run \
  12 + --telespeech-ctc=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \
  13 + --tokens=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \
  14 + --model-type=telespeech-ctc \
  15 + --files ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav
@@ -40,6 +40,9 @@ func main() { @@ -40,6 +40,9 @@ func main() {
40 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") 40 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message")
41 flag.StringVar(&config.ModelConfig.ModelType, "model-type", "", "Optional. Used for loading the model in a faster way") 41 flag.StringVar(&config.ModelConfig.ModelType, "model-type", "", "Optional. Used for loading the model in a faster way")
42 flag.StringVar(&config.ModelConfig.Provider, "provider", "cpu", "Provider to use") 42 flag.StringVar(&config.ModelConfig.Provider, "provider", "cpu", "Provider to use")
  43 + flag.StringVar(&config.ModelConfig.ModelingUnit, "modeling-unit", "cjkchar", "cjkchar, bpe, cjkchar+bpe, or leave it to empty")
  44 + flag.StringVar(&config.ModelConfig.BpeVocab, "bpe-vocab", "", "")
  45 + flag.StringVar(&config.ModelConfig.TeleSpeechCtc, "telespeech-ctc", "", "Used for TeleSpeechCtc model")
43 flag.StringVar(&config.LmConfig.Model, "lm-model", "", "Optional. Path to the LM model") 46 flag.StringVar(&config.LmConfig.Model, "lm-model", "", "Optional. Path to the LM model")
44 flag.Float32Var(&config.LmConfig.Scale, "lm-scale", 1.0, "Optional. Scale for the LM model") 47 flag.Float32Var(&config.LmConfig.Scale, "lm-scale", 1.0, "Optional. Scale for the LM model")
45 48
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  7 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  8 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +./non-streaming-decode-files \
  15 + --telespeech-ctc ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \
  16 + --tokens ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \
  17 + --model-type telespeech-ctc \
  18 + --debug 0 \
  19 + ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav
@@ -4,7 +4,7 @@ @@ -4,7 +4,7 @@
4 // to decode files. 4 // to decode files.
5 import com.k2fsa.sherpa.onnx.*; 5 import com.k2fsa.sherpa.onnx.*;
6 6
7 -public class NonStreamingDecodeFileTransducer { 7 +public class NonStreamingDecodeFileParaformer {
8 public static void main(String[] args) { 8 public static void main(String[] args) {
9 // please refer to 9 // please refer to
10 // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english 10 // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +// This file shows how to use an offline TeleSpeech CTC model
  4 +// to decode files.
  5 +import com.k2fsa.sherpa.onnx.*;
  6 +
  7 +public class NonStreamingDecodeFileTeleSpeechCtc {
  8 + public static void main(String[] args) {
  9 + // please refer to
  10 + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english
  11 + // to download model files
  12 + String model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx";
  13 + String tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt";
  14 +
  15 + String waveFilename = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav";
  16 +
  17 + WaveReader reader = new WaveReader(waveFilename);
  18 +
  19 + OfflineModelConfig modelConfig =
  20 + OfflineModelConfig.builder()
  21 + .setTeleSpeech(model)
  22 + .setTokens(tokens)
  23 + .setNumThreads(1)
  24 + .setDebug(true)
  25 + .setModelType("telespeech_ctc")
  26 + .build();
  27 +
  28 + OfflineRecognizerConfig config =
  29 + OfflineRecognizerConfig.builder()
  30 + .setOfflineModelConfig(modelConfig)
  31 + .setDecodingMethod("greedy_search")
  32 + .build();
  33 +
  34 + OfflineRecognizer recognizer = new OfflineRecognizer(config);
  35 + OfflineStream stream = recognizer.createStream();
  36 + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
  37 +
  38 + recognizer.decode(stream);
  39 +
  40 + String text = recognizer.getResult(stream).getText();
  41 +
  42 + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text);
  43 +
  44 + stream.release();
  45 + recognizer.release();
  46 + }
  47 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  6 + mkdir -p ../build
  7 + pushd ../build
  8 + cmake \
  9 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  10 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  11 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  12 + -DBUILD_SHARED_LIBS=ON \
  13 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  14 + -DSHERPA_ONNX_ENABLE_JNI=ON \
  15 + ..
  16 +
  17 + make -j4
  18 + ls -lh lib
  19 + popd
  20 +fi
  21 +
  22 +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  23 + pushd ../sherpa-onnx/java-api
  24 + make
  25 + popd
  26 +fi
  27 +
  28 +if [ ! -f ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt ]; then
  29 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  30 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  31 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
  32 +fi
  33 +
  34 +java \
  35 + -Djava.library.path=$PWD/../build/lib \
  36 + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  37 + ./NonStreamingDecodeFileTeleSpeechCtc.java
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +This file shows how to use a non-streaming CTC model from
  5 +https://github.com/Tele-AI/TeleSpeech-ASR
  6 +to decode files.
  7 +
  8 +Please download model files from
  9 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  10 +
  11 +
  12 +"""
  13 +
  14 +from pathlib import Path
  15 +
  16 +import sherpa_onnx
  17 +import soundfile as sf
  18 +
  19 +
  20 +def create_recognizer():
  21 + model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx"
  22 + tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt"
  23 + test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav"
  24 + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/4-tianjin.wav"
  25 + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/5-henan.wav"
  26 +
  27 + if not Path(model).is_file() or not Path(test_wav).is_file():
  28 + raise ValueError(
  29 + """Please download model files from
  30 + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  31 + """
  32 + )
  33 + return (
  34 + sherpa_onnx.OfflineRecognizer.from_telespeech_ctc(
  35 + model=model,
  36 + tokens=tokens,
  37 + debug=True,
  38 + ),
  39 + test_wav,
  40 + )
  41 +
  42 +
  43 +def main():
  44 + recognizer, wave_filename = create_recognizer()
  45 +
  46 + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
  47 + audio = audio[:, 0] # only use the first channel
  48 +
  49 + # audio is a 1-D float32 numpy array normalized to the range [-1, 1]
  50 + # sample_rate does not need to be 16000 Hz
  51 +
  52 + stream = recognizer.create_stream()
  53 + stream.accept_waveform(sample_rate, audio)
  54 + recognizer.decode_stream(stream)
  55 + print(wave_filename)
  56 + print(stream.result)
  57 +
  58 +
  59 +if __name__ == "__main__":
  60 + main()
@@ -166,6 +166,22 @@ def get_models(): @@ -166,6 +166,22 @@ def get_models():
166 popd 166 popd
167 """, 167 """,
168 ), 168 ),
  169 + Model(
  170 + model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04",
  171 + idx=11,
  172 + lang="zh",
  173 + short_name="telespeech",
  174 + cmd="""
  175 + pushd $model_name
  176 +
  177 + rm -rfv test_wavs
  178 + rm test.py
  179 +
  180 + ls -lh
  181 +
  182 + popd
  183 + """,
  184 + ),
169 ] 185 ]
170 return models 186 return models
171 187
@@ -25,6 +25,7 @@ namespace SherpaOnnx @@ -25,6 +25,7 @@ namespace SherpaOnnx
25 ModelType = ""; 25 ModelType = "";
26 ModelingUnit = "cjkchar"; 26 ModelingUnit = "cjkchar";
27 BpeVocab = ""; 27 BpeVocab = "";
  28 + TeleSpeechCtc = "";
28 } 29 }
29 public OfflineTransducerModelConfig Transducer; 30 public OfflineTransducerModelConfig Transducer;
30 public OfflineParaformerModelConfig Paraformer; 31 public OfflineParaformerModelConfig Paraformer;
@@ -50,5 +51,8 @@ namespace SherpaOnnx @@ -50,5 +51,8 @@ namespace SherpaOnnx
50 51
51 [MarshalAs(UnmanagedType.LPStr)] 52 [MarshalAs(UnmanagedType.LPStr)]
52 public string BpeVocab; 53 public string BpeVocab;
  54 +
  55 + [MarshalAs(UnmanagedType.LPStr)]
  56 + public string TeleSpeechCtc;
53 } 57 }
54 } 58 }
@@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86 @@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86
30 linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 30 linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
31 linux_wheel=$src_dir/$linux_wheel_filename 31 linux_wheel=$src_dir/$linux_wheel_filename
32 32
33 -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl 33 +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_universal2.whl
34 macos_wheel=$src_dir/$macos_wheel_filename 34 macos_wheel=$src_dir/$macos_wheel_filename
35 35
36 windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl 36 windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl
@@ -61,7 +61,7 @@ if [ ! -f $src_dir/linux/libsherpa-onnx-core.so ]; then @@ -61,7 +61,7 @@ if [ ! -f $src_dir/linux/libsherpa-onnx-core.so ]; then
61 fi 61 fi
62 62
63 if [ ! -f $src_dir/macos/libsherpa-onnx-core.dylib ]; then 63 if [ ! -f $src_dir/macos/libsherpa-onnx-core.dylib ]; then
64 - echo "---macOS x86_64---" 64 + echo "--- macOS x86_64/arm64 universal2---"
65 cd macos 65 cd macos
66 mkdir -p wheel 66 mkdir -p wheel
67 cd wheel 67 cd wheel
  1 +../../../../go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh
@@ -383,6 +383,7 @@ type OfflineModelConfig struct { @@ -383,6 +383,7 @@ type OfflineModelConfig struct {
383 383
384 ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe 384 ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe
385 BpeVocab string // Optional. 385 BpeVocab string // Optional.
  386 + TeleSpeechCtc string // Optional.
386 } 387 }
387 388
388 // Configuration for the offline/non-streaming recognizer. 389 // Configuration for the offline/non-streaming recognizer.
@@ -477,6 +478,9 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { @@ -477,6 +478,9 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer {
477 c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab) 478 c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab)
478 defer C.free(unsafe.Pointer(c.model_config.bpe_vocab)) 479 defer C.free(unsafe.Pointer(c.model_config.bpe_vocab))
479 480
  481 + c.model_config.telespeech_ctc = C.CString(config.ModelConfig.TeleSpeechCtc)
  482 + defer C.free(unsafe.Pointer(c.model_config.telespeech_ctc))
  483 +
480 c.lm_config.model = C.CString(config.LmConfig.Model) 484 c.lm_config.model = C.CString(config.LmConfig.Model)
481 defer C.free(unsafe.Pointer(c.lm_config.model)) 485 defer C.free(unsafe.Pointer(c.lm_config.model))
482 486
@@ -128,6 +128,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { @@ -128,6 +128,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
128 SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType); 128 SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType);
129 SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit); 129 SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit);
130 SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, bpeVocab); 130 SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, bpeVocab);
  131 + SHERPA_ONNX_ASSIGN_ATTR_STR(telespeech_ctc, teleSpeechCtc);
131 132
132 return c; 133 return c;
133 } 134 }
@@ -242,6 +243,10 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { @@ -242,6 +243,10 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
242 delete[] c.model_config.bpe_vocab; 243 delete[] c.model_config.bpe_vocab;
243 } 244 }
244 245
  246 + if (c.model_config.telespeech_ctc) {
  247 + delete[] c.model_config.telespeech_ctc;
  248 + }
  249 +
245 if (c.lm_config.model) { 250 if (c.lm_config.model) {
246 delete[] c.lm_config.model; 251 delete[] c.lm_config.model;
247 } 252 }
@@ -366,6 +366,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( @@ -366,6 +366,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
366 recognizer_config.model_config.bpe_vocab = 366 recognizer_config.model_config.bpe_vocab =
367 SHERPA_ONNX_OR(config->model_config.bpe_vocab, ""); 367 SHERPA_ONNX_OR(config->model_config.bpe_vocab, "");
368 368
  369 + recognizer_config.model_config.telespeech_ctc =
  370 + SHERPA_ONNX_OR(config->model_config.telespeech_ctc, "");
  371 +
369 recognizer_config.lm_config.model = 372 recognizer_config.lm_config.model =
370 SHERPA_ONNX_OR(config->lm_config.model, ""); 373 SHERPA_ONNX_OR(config->lm_config.model, "");
371 recognizer_config.lm_config.scale = 374 recognizer_config.lm_config.scale =
@@ -395,6 +395,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { @@ -395,6 +395,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
395 // - cjkchar+bpe 395 // - cjkchar+bpe
396 const char *modeling_unit; 396 const char *modeling_unit;
397 const char *bpe_vocab; 397 const char *bpe_vocab;
  398 + const char *telespeech_ctc;
398 } SherpaOnnxOfflineModelConfig; 399 } SherpaOnnxOfflineModelConfig;
399 400
400 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { 401 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
@@ -39,6 +39,7 @@ set(sources @@ -39,6 +39,7 @@ set(sources
39 offline-stream.cc 39 offline-stream.cc
40 offline-tdnn-ctc-model.cc 40 offline-tdnn-ctc-model.cc
41 offline-tdnn-model-config.cc 41 offline-tdnn-model-config.cc
  42 + offline-telespeech-ctc-model.cc
42 offline-transducer-greedy-search-decoder.cc 43 offline-transducer-greedy-search-decoder.cc
43 offline-transducer-greedy-search-nemo-decoder.cc 44 offline-transducer-greedy-search-nemo-decoder.cc
44 offline-transducer-model-config.cc 45 offline-transducer-model-config.cc
@@ -56,22 +56,11 @@ std::string FeatureExtractorConfig::ToString() const { @@ -56,22 +56,11 @@ std::string FeatureExtractorConfig::ToString() const {
56 class FeatureExtractor::Impl { 56 class FeatureExtractor::Impl {
57 public: 57 public:
58 explicit Impl(const FeatureExtractorConfig &config) : config_(config) { 58 explicit Impl(const FeatureExtractorConfig &config) : config_(config) {
59 - opts_.frame_opts.dither = config.dither;  
60 - opts_.frame_opts.snip_edges = config.snip_edges;  
61 - opts_.frame_opts.samp_freq = config.sampling_rate;  
62 - opts_.frame_opts.frame_shift_ms = config.frame_shift_ms;  
63 - opts_.frame_opts.frame_length_ms = config.frame_length_ms;  
64 - opts_.frame_opts.remove_dc_offset = config.remove_dc_offset;  
65 - opts_.frame_opts.window_type = config.window_type;  
66 -  
67 - opts_.mel_opts.num_bins = config.feature_dim;  
68 -  
69 - opts_.mel_opts.high_freq = config.high_freq;  
70 - opts_.mel_opts.low_freq = config.low_freq;  
71 -  
72 - opts_.mel_opts.is_librosa = config.is_librosa;  
73 -  
74 - fbank_ = std::make_unique<knf::OnlineFbank>(opts_); 59 + if (config_.is_mfcc) {
  60 + InitMfcc();
  61 + } else {
  62 + InitFbank();
  63 + }
75 } 64 }
76 65
77 void AcceptWaveform(int32_t sampling_rate, const float *waveform, int32_t n) { 66 void AcceptWaveform(int32_t sampling_rate, const float *waveform, int32_t n) {
@@ -101,35 +90,48 @@ class FeatureExtractor::Impl { @@ -101,35 +90,48 @@ class FeatureExtractor::Impl {
101 90
102 std::vector<float> samples; 91 std::vector<float> samples;
103 resampler_->Resample(waveform, n, false, &samples); 92 resampler_->Resample(waveform, n, false, &samples);
104 - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), 93 + if (fbank_) {
  94 + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(),
105 samples.size()); 95 samples.size());
  96 + } else {
  97 + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(),
  98 + samples.size());
  99 + }
106 return; 100 return;
107 } 101 }
108 102
109 - if (sampling_rate != opts_.frame_opts.samp_freq) { 103 + if (sampling_rate != config_.sampling_rate) {
110 SHERPA_ONNX_LOGE( 104 SHERPA_ONNX_LOGE(
111 "Creating a resampler:\n" 105 "Creating a resampler:\n"
112 " in_sample_rate: %d\n" 106 " in_sample_rate: %d\n"
113 " output_sample_rate: %d\n", 107 " output_sample_rate: %d\n",
114 - sampling_rate, static_cast<int32_t>(opts_.frame_opts.samp_freq)); 108 + sampling_rate, static_cast<int32_t>(config_.sampling_rate));
115 109
116 - float min_freq =  
117 - std::min<int32_t>(sampling_rate, opts_.frame_opts.samp_freq); 110 + float min_freq = std::min<int32_t>(sampling_rate, config_.sampling_rate);
118 float lowpass_cutoff = 0.99 * 0.5 * min_freq; 111 float lowpass_cutoff = 0.99 * 0.5 * min_freq;
119 112
120 int32_t lowpass_filter_width = 6; 113 int32_t lowpass_filter_width = 6;
121 resampler_ = std::make_unique<LinearResample>( 114 resampler_ = std::make_unique<LinearResample>(
122 - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, 115 + sampling_rate, config_.sampling_rate, lowpass_cutoff,
123 lowpass_filter_width); 116 lowpass_filter_width);
124 117
125 std::vector<float> samples; 118 std::vector<float> samples;
126 resampler_->Resample(waveform, n, false, &samples); 119 resampler_->Resample(waveform, n, false, &samples);
127 - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), 120 + if (fbank_) {
  121 + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(),
  122 + samples.size());
  123 + } else {
  124 + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(),
128 samples.size()); 125 samples.size());
  126 + }
129 return; 127 return;
130 } 128 }
131 129
  130 + if (fbank_) {
132 fbank_->AcceptWaveform(sampling_rate, waveform, n); 131 fbank_->AcceptWaveform(sampling_rate, waveform, n);
  132 + } else {
  133 + mfcc_->AcceptWaveform(sampling_rate, waveform, n);
  134 + }
133 } 135 }
134 136
135 void InputFinished() const { 137 void InputFinished() const {
@@ -179,11 +181,56 @@ class FeatureExtractor::Impl { @@ -179,11 +181,56 @@ class FeatureExtractor::Impl {
179 return features; 181 return features;
180 } 182 }
181 183
182 - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } 184 + int32_t FeatureDim() const {
  185 + return mfcc_ ? mfcc_opts_.num_ceps : opts_.mel_opts.num_bins;
  186 + }
  187 +
  188 + private:
  189 + void InitFbank() {
  190 + opts_.frame_opts.dither = config_.dither;
  191 + opts_.frame_opts.snip_edges = config_.snip_edges;
  192 + opts_.frame_opts.samp_freq = config_.sampling_rate;
  193 + opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms;
  194 + opts_.frame_opts.frame_length_ms = config_.frame_length_ms;
  195 + opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset;
  196 + opts_.frame_opts.window_type = config_.window_type;
  197 +
  198 + opts_.mel_opts.num_bins = config_.feature_dim;
  199 +
  200 + opts_.mel_opts.high_freq = config_.high_freq;
  201 + opts_.mel_opts.low_freq = config_.low_freq;
  202 +
  203 + opts_.mel_opts.is_librosa = config_.is_librosa;
  204 +
  205 + fbank_ = std::make_unique<knf::OnlineFbank>(opts_);
  206 + }
  207 + void InitMfcc() {
  208 + mfcc_opts_.frame_opts.dither = config_.dither;
  209 + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges;
  210 + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate;
  211 + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms;
  212 + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms;
  213 + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset;
  214 + mfcc_opts_.frame_opts.window_type = config_.window_type;
  215 +
  216 + mfcc_opts_.mel_opts.num_bins = config_.feature_dim;
  217 +
  218 + mfcc_opts_.mel_opts.high_freq = config_.high_freq;
  219 + mfcc_opts_.mel_opts.low_freq = config_.low_freq;
  220 +
  221 + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa;
  222 +
  223 + mfcc_opts_.num_ceps = config_.num_ceps;
  224 + mfcc_opts_.use_energy = config_.use_energy;
  225 +
  226 + mfcc_ = std::make_unique<knf::OnlineMfcc>(mfcc_opts_);
  227 + }
183 228
184 private: 229 private:
185 std::unique_ptr<knf::OnlineFbank> fbank_; 230 std::unique_ptr<knf::OnlineFbank> fbank_;
  231 + std::unique_ptr<knf::OnlineMfcc> mfcc_;
186 knf::FbankOptions opts_; 232 knf::FbankOptions opts_;
  233 + knf::MfccOptions mfcc_opts_;
187 FeatureExtractorConfig config_; 234 FeatureExtractorConfig config_;
188 mutable std::mutex mutex_; 235 mutable std::mutex mutex_;
189 std::unique_ptr<LinearResample> resampler_; 236 std::unique_ptr<LinearResample> resampler_;
@@ -18,7 +18,10 @@ struct FeatureExtractorConfig { @@ -18,7 +18,10 @@ struct FeatureExtractorConfig {
18 // the sampling rate of the input waveform, we will do resampling inside. 18 // the sampling rate of the input waveform, we will do resampling inside.
19 int32_t sampling_rate = 16000; 19 int32_t sampling_rate = 16000;
20 20
21 - // Feature dimension 21 + // num_mel_bins
  22 + //
  23 + // Note: for mfcc, this value is also for num_mel_bins.
  24 + // The actual feature dimension is actuall num_ceps
22 int32_t feature_dim = 80; 25 int32_t feature_dim = 80;
23 26
24 // minimal frequency for Mel-filterbank, in Hz 27 // minimal frequency for Mel-filterbank, in Hz
@@ -69,6 +72,12 @@ struct FeatureExtractorConfig { @@ -69,6 +72,12 @@ struct FeatureExtractorConfig {
69 // for details 72 // for details
70 std::string nemo_normalize_type; 73 std::string nemo_normalize_type;
71 74
  75 + // for MFCC
  76 + int32_t num_ceps = 13;
  77 + bool use_energy = true;
  78 +
  79 + bool is_mfcc = false;
  80 +
72 std::string ToString() const; 81 std::string ToString() const;
73 82
74 void Register(ParseOptions *po); 83 void Register(ParseOptions *po);
@@ -12,6 +12,7 @@ @@ -12,6 +12,7 @@
12 #include "sherpa-onnx/csrc/macros.h" 12 #include "sherpa-onnx/csrc/macros.h"
13 #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h" 13 #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h"
14 #include "sherpa-onnx/csrc/offline-tdnn-ctc-model.h" 14 #include "sherpa-onnx/csrc/offline-tdnn-ctc-model.h"
  15 +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h"
15 #include "sherpa-onnx/csrc/offline-wenet-ctc-model.h" 16 #include "sherpa-onnx/csrc/offline-wenet-ctc-model.h"
16 #include "sherpa-onnx/csrc/offline-zipformer-ctc-model.h" 17 #include "sherpa-onnx/csrc/offline-zipformer-ctc-model.h"
17 #include "sherpa-onnx/csrc/onnx-utils.h" 18 #include "sherpa-onnx/csrc/onnx-utils.h"
@@ -24,6 +25,7 @@ enum class ModelType { @@ -24,6 +25,7 @@ enum class ModelType {
24 kTdnn, 25 kTdnn,
25 kZipformerCtc, 26 kZipformerCtc,
26 kWenetCtc, 27 kWenetCtc,
  28 + kTeleSpeechCtc,
27 kUnknown, 29 kUnknown,
28 }; 30 };
29 31
@@ -63,6 +65,9 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, @@ -63,6 +65,9 @@ static ModelType GetModelType(char *model_data, size_t model_data_length,
63 "If you are using models from WeNet, please refer to\n" 65 "If you are using models from WeNet, please refer to\n"
64 "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/" 66 "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/"
65 "run.sh\n" 67 "run.sh\n"
  68 + "If you are using models from TeleSpeech, please refer to\n"
  69 + "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/"
  70 + "add-metadata.py"
66 "\n" 71 "\n"
67 "for how to add metadta to model.onnx\n"); 72 "for how to add metadta to model.onnx\n");
68 return ModelType::kUnknown; 73 return ModelType::kUnknown;
@@ -78,6 +83,8 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, @@ -78,6 +83,8 @@ static ModelType GetModelType(char *model_data, size_t model_data_length,
78 return ModelType::kZipformerCtc; 83 return ModelType::kZipformerCtc;
79 } else if (model_type.get() == std::string("wenet_ctc")) { 84 } else if (model_type.get() == std::string("wenet_ctc")) {
80 return ModelType::kWenetCtc; 85 return ModelType::kWenetCtc;
  86 + } else if (model_type.get() == std::string("telespeech_ctc")) {
  87 + return ModelType::kTeleSpeechCtc;
81 } else { 88 } else {
82 SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.get()); 89 SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.get());
83 return ModelType::kUnknown; 90 return ModelType::kUnknown;
@@ -97,6 +104,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( @@ -97,6 +104,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
97 filename = config.zipformer_ctc.model; 104 filename = config.zipformer_ctc.model;
98 } else if (!config.wenet_ctc.model.empty()) { 105 } else if (!config.wenet_ctc.model.empty()) {
99 filename = config.wenet_ctc.model; 106 filename = config.wenet_ctc.model;
  107 + } else if (!config.telespeech_ctc.empty()) {
  108 + filename = config.telespeech_ctc;
100 } else { 109 } else {
101 SHERPA_ONNX_LOGE("Please specify a CTC model"); 110 SHERPA_ONNX_LOGE("Please specify a CTC model");
102 exit(-1); 111 exit(-1);
@@ -124,6 +133,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( @@ -124,6 +133,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
124 case ModelType::kWenetCtc: 133 case ModelType::kWenetCtc:
125 return std::make_unique<OfflineWenetCtcModel>(config); 134 return std::make_unique<OfflineWenetCtcModel>(config);
126 break; 135 break;
  136 + case ModelType::kTeleSpeechCtc:
  137 + return std::make_unique<OfflineTeleSpeechCtcModel>(config);
  138 + break;
127 case ModelType::kUnknown: 139 case ModelType::kUnknown:
128 SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); 140 SHERPA_ONNX_LOGE("Unknown model type in offline CTC!");
129 return nullptr; 141 return nullptr;
@@ -147,6 +159,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( @@ -147,6 +159,8 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
147 filename = config.zipformer_ctc.model; 159 filename = config.zipformer_ctc.model;
148 } else if (!config.wenet_ctc.model.empty()) { 160 } else if (!config.wenet_ctc.model.empty()) {
149 filename = config.wenet_ctc.model; 161 filename = config.wenet_ctc.model;
  162 + } else if (!config.telespeech_ctc.empty()) {
  163 + filename = config.telespeech_ctc;
150 } else { 164 } else {
151 SHERPA_ONNX_LOGE("Please specify a CTC model"); 165 SHERPA_ONNX_LOGE("Please specify a CTC model");
152 exit(-1); 166 exit(-1);
@@ -175,6 +189,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create( @@ -175,6 +189,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
175 case ModelType::kWenetCtc: 189 case ModelType::kWenetCtc:
176 return std::make_unique<OfflineWenetCtcModel>(mgr, config); 190 return std::make_unique<OfflineWenetCtcModel>(mgr, config);
177 break; 191 break;
  192 + case ModelType::kTeleSpeechCtc:
  193 + return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config);
  194 + break;
178 case ModelType::kUnknown: 195 case ModelType::kUnknown:
179 SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); 196 SHERPA_ONNX_LOGE("Unknown model type in offline CTC!");
180 return nullptr; 197 return nullptr;
@@ -19,6 +19,9 @@ void OfflineModelConfig::Register(ParseOptions *po) { @@ -19,6 +19,9 @@ void OfflineModelConfig::Register(ParseOptions *po) {
19 zipformer_ctc.Register(po); 19 zipformer_ctc.Register(po);
20 wenet_ctc.Register(po); 20 wenet_ctc.Register(po);
21 21
  22 + po->Register("telespeech-ctc", &telespeech_ctc,
  23 + "Path to model.onnx for telespeech ctc");
  24 +
22 po->Register("tokens", &tokens, "Path to tokens.txt"); 25 po->Register("tokens", &tokens, "Path to tokens.txt");
23 26
24 po->Register("num-threads", &num_threads, 27 po->Register("num-threads", &num_threads,
@@ -33,7 +36,7 @@ void OfflineModelConfig::Register(ParseOptions *po) { @@ -33,7 +36,7 @@ void OfflineModelConfig::Register(ParseOptions *po) {
33 po->Register("model-type", &model_type, 36 po->Register("model-type", &model_type,
34 "Specify it to reduce model initialization time. " 37 "Specify it to reduce model initialization time. "
35 "Valid values are: transducer, paraformer, nemo_ctc, whisper, " 38 "Valid values are: transducer, paraformer, nemo_ctc, whisper, "
36 - "tdnn, zipformer2_ctc" 39 + "tdnn, zipformer2_ctc, telespeech_ctc."
37 "All other values lead to loading the model twice."); 40 "All other values lead to loading the model twice.");
38 po->Register("modeling-unit", &modeling_unit, 41 po->Register("modeling-unit", &modeling_unit,
39 "The modeling unit of the model, commonly used units are bpe, " 42 "The modeling unit of the model, commonly used units are bpe, "
@@ -55,14 +58,14 @@ bool OfflineModelConfig::Validate() const { @@ -55,14 +58,14 @@ bool OfflineModelConfig::Validate() const {
55 } 58 }
56 59
57 if (!FileExists(tokens)) { 60 if (!FileExists(tokens)) {
58 - SHERPA_ONNX_LOGE("tokens: %s does not exist", tokens.c_str()); 61 + SHERPA_ONNX_LOGE("tokens: '%s' does not exist", tokens.c_str());
59 return false; 62 return false;
60 } 63 }
61 64
62 if (!modeling_unit.empty() && 65 if (!modeling_unit.empty() &&
63 (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { 66 (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) {
64 if (!FileExists(bpe_vocab)) { 67 if (!FileExists(bpe_vocab)) {
65 - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); 68 + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str());
66 return false; 69 return false;
67 } 70 }
68 } 71 }
@@ -91,6 +94,14 @@ bool OfflineModelConfig::Validate() const { @@ -91,6 +94,14 @@ bool OfflineModelConfig::Validate() const {
91 return wenet_ctc.Validate(); 94 return wenet_ctc.Validate();
92 } 95 }
93 96
  97 + if (!telespeech_ctc.empty() && !FileExists(telespeech_ctc)) {
  98 + SHERPA_ONNX_LOGE("telespeech_ctc: '%s' does not exist",
  99 + telespeech_ctc.c_str());
  100 + return false;
  101 + } else {
  102 + return true;
  103 + }
  104 +
94 return transducer.Validate(); 105 return transducer.Validate();
95 } 106 }
96 107
@@ -105,6 +116,7 @@ std::string OfflineModelConfig::ToString() const { @@ -105,6 +116,7 @@ std::string OfflineModelConfig::ToString() const {
105 os << "tdnn=" << tdnn.ToString() << ", "; 116 os << "tdnn=" << tdnn.ToString() << ", ";
106 os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", "; 117 os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", ";
107 os << "wenet_ctc=" << wenet_ctc.ToString() << ", "; 118 os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
  119 + os << "telespeech_ctc=\"" << telespeech_ctc << "\", ";
108 os << "tokens=\"" << tokens << "\", "; 120 os << "tokens=\"" << tokens << "\", ";
109 os << "num_threads=" << num_threads << ", "; 121 os << "num_threads=" << num_threads << ", ";
110 os << "debug=" << (debug ? "True" : "False") << ", "; 122 os << "debug=" << (debug ? "True" : "False") << ", ";
@@ -24,6 +24,7 @@ struct OfflineModelConfig { @@ -24,6 +24,7 @@ struct OfflineModelConfig {
24 OfflineTdnnModelConfig tdnn; 24 OfflineTdnnModelConfig tdnn;
25 OfflineZipformerCtcModelConfig zipformer_ctc; 25 OfflineZipformerCtcModelConfig zipformer_ctc;
26 OfflineWenetCtcModelConfig wenet_ctc; 26 OfflineWenetCtcModelConfig wenet_ctc;
  27 + std::string telespeech_ctc;
27 28
28 std::string tokens; 29 std::string tokens;
29 int32_t num_threads = 2; 30 int32_t num_threads = 2;
@@ -52,6 +53,7 @@ struct OfflineModelConfig { @@ -52,6 +53,7 @@ struct OfflineModelConfig {
52 const OfflineTdnnModelConfig &tdnn, 53 const OfflineTdnnModelConfig &tdnn,
53 const OfflineZipformerCtcModelConfig &zipformer_ctc, 54 const OfflineZipformerCtcModelConfig &zipformer_ctc,
54 const OfflineWenetCtcModelConfig &wenet_ctc, 55 const OfflineWenetCtcModelConfig &wenet_ctc,
  56 + const std::string &telespeech_ctc,
55 const std::string &tokens, int32_t num_threads, bool debug, 57 const std::string &tokens, int32_t num_threads, bool debug,
56 const std::string &provider, const std::string &model_type, 58 const std::string &provider, const std::string &model_type,
57 const std::string &modeling_unit, 59 const std::string &modeling_unit,
@@ -63,6 +65,7 @@ struct OfflineModelConfig { @@ -63,6 +65,7 @@ struct OfflineModelConfig {
63 tdnn(tdnn), 65 tdnn(tdnn),
64 zipformer_ctc(zipformer_ctc), 66 zipformer_ctc(zipformer_ctc),
65 wenet_ctc(wenet_ctc), 67 wenet_ctc(wenet_ctc),
  68 + telespeech_ctc(telespeech_ctc),
66 tokens(tokens), 69 tokens(tokens),
67 num_threads(num_threads), 70 num_threads(num_threads),
68 debug(debug), 71 debug(debug),
@@ -88,6 +88,17 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { @@ -88,6 +88,17 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
88 #endif 88 #endif
89 89
90 void Init() { 90 void Init() {
  91 + if (!config_.model_config.telespeech_ctc.empty()) {
  92 + config_.feat_config.snip_edges = true;
  93 + config_.feat_config.num_ceps = 40;
  94 + config_.feat_config.feature_dim = 40;
  95 + config_.feat_config.low_freq = 40;
  96 + config_.feat_config.high_freq = -200;
  97 + config_.feat_config.use_energy = false;
  98 + config_.feat_config.normalize_samples = false;
  99 + config_.feat_config.is_mfcc = true;
  100 + }
  101 +
91 if (!config_.model_config.wenet_ctc.model.empty()) { 102 if (!config_.model_config.wenet_ctc.model.empty()) {
92 // WeNet CTC models assume input samples are in the range 103 // WeNet CTC models assume input samples are in the range
93 // [-32768, 32767], so we set normalize_samples to false 104 // [-32768, 32767], so we set normalize_samples to false
@@ -29,7 +29,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -29,7 +29,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
29 } else if (model_type == "paraformer") { 29 } else if (model_type == "paraformer") {
30 return std::make_unique<OfflineRecognizerParaformerImpl>(config); 30 return std::make_unique<OfflineRecognizerParaformerImpl>(config);
31 } else if (model_type == "nemo_ctc" || model_type == "tdnn" || 31 } else if (model_type == "nemo_ctc" || model_type == "tdnn" ||
32 - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { 32 + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" ||
  33 + model_type == "telespeech_ctc") {
33 return std::make_unique<OfflineRecognizerCtcImpl>(config); 34 return std::make_unique<OfflineRecognizerCtcImpl>(config);
34 } else if (model_type == "whisper") { 35 } else if (model_type == "whisper") {
35 return std::make_unique<OfflineRecognizerWhisperImpl>(config); 36 return std::make_unique<OfflineRecognizerWhisperImpl>(config);
@@ -53,6 +54,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -53,6 +54,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
53 model_filename = config.model_config.paraformer.model; 54 model_filename = config.model_config.paraformer.model;
54 } else if (!config.model_config.nemo_ctc.model.empty()) { 55 } else if (!config.model_config.nemo_ctc.model.empty()) {
55 model_filename = config.model_config.nemo_ctc.model; 56 model_filename = config.model_config.nemo_ctc.model;
  57 + } else if (!config.model_config.telespeech_ctc.empty()) {
  58 + model_filename = config.model_config.telespeech_ctc;
56 } else if (!config.model_config.tdnn.model.empty()) { 59 } else if (!config.model_config.tdnn.model.empty()) {
57 model_filename = config.model_config.tdnn.model; 60 model_filename = config.model_config.tdnn.model;
58 } else if (!config.model_config.zipformer_ctc.model.empty()) { 61 } else if (!config.model_config.zipformer_ctc.model.empty()) {
@@ -111,6 +114,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -111,6 +114,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
111 "\n " 114 "\n "
112 "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" 115 "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh"
113 "\n" 116 "\n"
  117 + "(7) CTC models from TeleSpeech"
  118 + "\n "
  119 + "https://github.com/Tele-AI/TeleSpeech-ASR"
  120 + "\n"
114 "\n"); 121 "\n");
115 exit(-1); 122 exit(-1);
116 } 123 }
@@ -133,7 +140,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -133,7 +140,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
133 140
134 if (model_type == "EncDecCTCModelBPE" || 141 if (model_type == "EncDecCTCModelBPE" ||
135 model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || 142 model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" ||
136 - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { 143 + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" ||
  144 + model_type == "telespeech_ctc") {
137 return std::make_unique<OfflineRecognizerCtcImpl>(config); 145 return std::make_unique<OfflineRecognizerCtcImpl>(config);
138 } 146 }
139 147
@@ -151,7 +159,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -151,7 +159,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
151 " - Whisper models\n" 159 " - Whisper models\n"
152 " - Tdnn models\n" 160 " - Tdnn models\n"
153 " - Zipformer CTC models\n" 161 " - Zipformer CTC models\n"
154 - " - WeNet CTC models\n", 162 + " - WeNet CTC models\n"
  163 + " - TeleSpeech CTC models\n",
155 model_type.c_str()); 164 model_type.c_str());
156 165
157 exit(-1); 166 exit(-1);
@@ -169,7 +178,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -169,7 +178,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
169 } else if (model_type == "paraformer") { 178 } else if (model_type == "paraformer") {
170 return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config); 179 return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config);
171 } else if (model_type == "nemo_ctc" || model_type == "tdnn" || 180 } else if (model_type == "nemo_ctc" || model_type == "tdnn" ||
172 - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { 181 + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" ||
  182 + model_type == "telespeech_ctc") {
173 return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config); 183 return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config);
174 } else if (model_type == "whisper") { 184 } else if (model_type == "whisper") {
175 return std::make_unique<OfflineRecognizerWhisperImpl>(mgr, config); 185 return std::make_unique<OfflineRecognizerWhisperImpl>(mgr, config);
@@ -199,6 +209,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -199,6 +209,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
199 model_filename = config.model_config.zipformer_ctc.model; 209 model_filename = config.model_config.zipformer_ctc.model;
200 } else if (!config.model_config.wenet_ctc.model.empty()) { 210 } else if (!config.model_config.wenet_ctc.model.empty()) {
201 model_filename = config.model_config.wenet_ctc.model; 211 model_filename = config.model_config.wenet_ctc.model;
  212 + } else if (!config.model_config.telespeech_ctc.empty()) {
  213 + model_filename = config.model_config.telespeech_ctc;
202 } else if (!config.model_config.whisper.encoder.empty()) { 214 } else if (!config.model_config.whisper.encoder.empty()) {
203 model_filename = config.model_config.whisper.encoder; 215 model_filename = config.model_config.whisper.encoder;
204 } else { 216 } else {
@@ -251,6 +263,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -251,6 +263,10 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
251 "\n " 263 "\n "
252 "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" 264 "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh"
253 "\n" 265 "\n"
  266 + "(7) CTC models from TeleSpeech"
  267 + "\n "
  268 + "https://github.com/Tele-AI/TeleSpeech-ASR"
  269 + "\n"
254 "\n"); 270 "\n");
255 exit(-1); 271 exit(-1);
256 } 272 }
@@ -273,7 +289,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -273,7 +289,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
273 289
274 if (model_type == "EncDecCTCModelBPE" || 290 if (model_type == "EncDecCTCModelBPE" ||
275 model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || 291 model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" ||
276 - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { 292 + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" ||
  293 + model_type == "telespeech_ctc") {
277 return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config); 294 return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config);
278 } 295 }
279 296
@@ -291,7 +308,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create( @@ -291,7 +308,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
291 " - Whisper models\n" 308 " - Whisper models\n"
292 " - Tdnn models\n" 309 " - Tdnn models\n"
293 " - Zipformer CTC models\n" 310 " - Zipformer CTC models\n"
294 - " - WeNet CTC models\n", 311 + " - WeNet CTC models\n"
  312 + " - TeleSpeech CTC models\n",
295 model_type.c_str()); 313 model_type.c_str());
296 314
297 exit(-1); 315 exit(-1);
@@ -57,6 +57,27 @@ class OfflineStream::Impl { @@ -57,6 +57,27 @@ class OfflineStream::Impl {
57 explicit Impl(const FeatureExtractorConfig &config, 57 explicit Impl(const FeatureExtractorConfig &config,
58 ContextGraphPtr context_graph) 58 ContextGraphPtr context_graph)
59 : config_(config), context_graph_(context_graph) { 59 : config_(config), context_graph_(context_graph) {
  60 + if (config.is_mfcc) {
  61 + mfcc_opts_.frame_opts.dither = config_.dither;
  62 + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges;
  63 + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate;
  64 + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms;
  65 + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms;
  66 + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset;
  67 + mfcc_opts_.frame_opts.window_type = config_.window_type;
  68 +
  69 + mfcc_opts_.mel_opts.num_bins = config_.feature_dim;
  70 +
  71 + mfcc_opts_.mel_opts.high_freq = config_.high_freq;
  72 + mfcc_opts_.mel_opts.low_freq = config_.low_freq;
  73 +
  74 + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa;
  75 +
  76 + mfcc_opts_.num_ceps = config_.num_ceps;
  77 + mfcc_opts_.use_energy = config_.use_energy;
  78 +
  79 + mfcc_ = std::make_unique<knf::OnlineMfcc>(mfcc_opts_);
  80 + } else {
60 opts_.frame_opts.dither = config.dither; 81 opts_.frame_opts.dither = config.dither;
61 opts_.frame_opts.snip_edges = config.snip_edges; 82 opts_.frame_opts.snip_edges = config.snip_edges;
62 opts_.frame_opts.samp_freq = config.sampling_rate; 83 opts_.frame_opts.samp_freq = config.sampling_rate;
@@ -74,6 +95,7 @@ class OfflineStream::Impl { @@ -74,6 +95,7 @@ class OfflineStream::Impl {
74 95
75 fbank_ = std::make_unique<knf::OnlineFbank>(opts_); 96 fbank_ = std::make_unique<knf::OnlineFbank>(opts_);
76 } 97 }
  98 + }
77 99
78 explicit Impl(WhisperTag /*tag*/) { 100 explicit Impl(WhisperTag /*tag*/) {
79 config_.normalize_samples = true; 101 config_.normalize_samples = true;
@@ -81,6 +103,7 @@ class OfflineStream::Impl { @@ -81,6 +103,7 @@ class OfflineStream::Impl {
81 opts_.mel_opts.num_bins = 80; // not used 103 opts_.mel_opts.num_bins = 80; // not used
82 whisper_fbank_ = 104 whisper_fbank_ =
83 std::make_unique<knf::OnlineWhisperFbank>(opts_.frame_opts); 105 std::make_unique<knf::OnlineWhisperFbank>(opts_.frame_opts);
  106 + config_.sampling_rate = opts_.frame_opts.samp_freq;
84 } 107 }
85 108
86 explicit Impl(CEDTag /*tag*/) { 109 explicit Impl(CEDTag /*tag*/) {
@@ -98,6 +121,8 @@ class OfflineStream::Impl { @@ -98,6 +121,8 @@ class OfflineStream::Impl {
98 opts_.mel_opts.num_bins = 64; 121 opts_.mel_opts.num_bins = 64;
99 opts_.mel_opts.high_freq = 8000; 122 opts_.mel_opts.high_freq = 8000;
100 123
  124 + config_.sampling_rate = opts_.frame_opts.samp_freq;
  125 +
101 fbank_ = std::make_unique<knf::OnlineFbank>(opts_); 126 fbank_ = std::make_unique<knf::OnlineFbank>(opts_);
102 } 127 }
103 128
@@ -115,52 +140,60 @@ class OfflineStream::Impl { @@ -115,52 +140,60 @@ class OfflineStream::Impl {
115 140
116 void AcceptWaveformImpl(int32_t sampling_rate, const float *waveform, 141 void AcceptWaveformImpl(int32_t sampling_rate, const float *waveform,
117 int32_t n) { 142 int32_t n) {
118 - if (sampling_rate != opts_.frame_opts.samp_freq) { 143 + if (sampling_rate != config_.sampling_rate) {
119 SHERPA_ONNX_LOGE( 144 SHERPA_ONNX_LOGE(
120 "Creating a resampler:\n" 145 "Creating a resampler:\n"
121 " in_sample_rate: %d\n" 146 " in_sample_rate: %d\n"
122 " output_sample_rate: %d\n", 147 " output_sample_rate: %d\n",
123 - sampling_rate, static_cast<int32_t>(opts_.frame_opts.samp_freq)); 148 + sampling_rate, static_cast<int32_t>(config_.sampling_rate));
124 149
125 - float min_freq =  
126 - std::min<int32_t>(sampling_rate, opts_.frame_opts.samp_freq); 150 + float min_freq = std::min<int32_t>(sampling_rate, config_.sampling_rate);
127 float lowpass_cutoff = 0.99 * 0.5 * min_freq; 151 float lowpass_cutoff = 0.99 * 0.5 * min_freq;
128 152
129 int32_t lowpass_filter_width = 6; 153 int32_t lowpass_filter_width = 6;
130 auto resampler = std::make_unique<LinearResample>( 154 auto resampler = std::make_unique<LinearResample>(
131 - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, 155 + sampling_rate, config_.sampling_rate, lowpass_cutoff,
132 lowpass_filter_width); 156 lowpass_filter_width);
133 std::vector<float> samples; 157 std::vector<float> samples;
134 resampler->Resample(waveform, n, true, &samples); 158 resampler->Resample(waveform, n, true, &samples);
135 159
136 if (fbank_) { 160 if (fbank_) {
137 - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), 161 + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(),
138 samples.size()); 162 samples.size());
139 fbank_->InputFinished(); 163 fbank_->InputFinished();
  164 + } else if (mfcc_) {
  165 + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(),
  166 + samples.size());
  167 + mfcc_->InputFinished();
140 } else { 168 } else {
141 - whisper_fbank_->AcceptWaveform(opts_.frame_opts.samp_freq,  
142 - samples.data(), samples.size()); 169 + whisper_fbank_->AcceptWaveform(config_.sampling_rate, samples.data(),
  170 + samples.size());
143 whisper_fbank_->InputFinished(); 171 whisper_fbank_->InputFinished();
144 } 172 }
145 173
146 return; 174 return;
147 - } // if (sampling_rate != opts_.frame_opts.samp_freq) 175 + } // if (sampling_rate != config_.sampling_rate)
148 176
149 if (fbank_) { 177 if (fbank_) {
150 fbank_->AcceptWaveform(sampling_rate, waveform, n); 178 fbank_->AcceptWaveform(sampling_rate, waveform, n);
151 fbank_->InputFinished(); 179 fbank_->InputFinished();
  180 + } else if (mfcc_) {
  181 + mfcc_->AcceptWaveform(sampling_rate, waveform, n);
  182 + mfcc_->InputFinished();
152 } else { 183 } else {
153 whisper_fbank_->AcceptWaveform(sampling_rate, waveform, n); 184 whisper_fbank_->AcceptWaveform(sampling_rate, waveform, n);
154 whisper_fbank_->InputFinished(); 185 whisper_fbank_->InputFinished();
155 } 186 }
156 } 187 }
157 188
158 - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } 189 + int32_t FeatureDim() const {
  190 + return mfcc_ ? mfcc_opts_.num_ceps : opts_.mel_opts.num_bins;
  191 + }
159 192
160 std::vector<float> GetFrames() const { 193 std::vector<float> GetFrames() const {
161 - int32_t n =  
162 - fbank_ ? fbank_->NumFramesReady() : whisper_fbank_->NumFramesReady();  
163 - 194 + int32_t n = fbank_ ? fbank_->NumFramesReady()
  195 + : mfcc_ ? mfcc_->NumFramesReady()
  196 + : whisper_fbank_->NumFramesReady();
164 assert(n > 0 && "Please first call AcceptWaveform()"); 197 assert(n > 0 && "Please first call AcceptWaveform()");
165 198
166 int32_t feature_dim = FeatureDim(); 199 int32_t feature_dim = FeatureDim();
@@ -170,8 +203,9 @@ class OfflineStream::Impl { @@ -170,8 +203,9 @@ class OfflineStream::Impl {
170 float *p = features.data(); 203 float *p = features.data();
171 204
172 for (int32_t i = 0; i != n; ++i) { 205 for (int32_t i = 0; i != n; ++i) {
173 - const float *f =  
174 - fbank_ ? fbank_->GetFrame(i) : whisper_fbank_->GetFrame(i); 206 + const float *f = fbank_ ? fbank_->GetFrame(i)
  207 + : mfcc_ ? mfcc_->GetFrame(i)
  208 + : whisper_fbank_->GetFrame(i);
175 std::copy(f, f + feature_dim, p); 209 std::copy(f, f + feature_dim, p);
176 p += feature_dim; 210 p += feature_dim;
177 } 211 }
@@ -222,8 +256,10 @@ class OfflineStream::Impl { @@ -222,8 +256,10 @@ class OfflineStream::Impl {
222 private: 256 private:
223 FeatureExtractorConfig config_; 257 FeatureExtractorConfig config_;
224 std::unique_ptr<knf::OnlineFbank> fbank_; 258 std::unique_ptr<knf::OnlineFbank> fbank_;
  259 + std::unique_ptr<knf::OnlineMfcc> mfcc_;
225 std::unique_ptr<knf::OnlineWhisperFbank> whisper_fbank_; 260 std::unique_ptr<knf::OnlineWhisperFbank> whisper_fbank_;
226 knf::FbankOptions opts_; 261 knf::FbankOptions opts_;
  262 + knf::MfccOptions mfcc_opts_;
227 OfflineRecognitionResult r_; 263 OfflineRecognitionResult r_;
228 ContextGraphPtr context_graph_; 264 ContextGraphPtr context_graph_;
229 }; 265 };
  1 +// sherpa-onnx/csrc/offline-telespeech-ctc-model.cc
  2 +//
  3 +// Copyright (c) 2023-2024 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h"
  6 +
  7 +#include "sherpa-onnx/csrc/macros.h"
  8 +#include "sherpa-onnx/csrc/onnx-utils.h"
  9 +#include "sherpa-onnx/csrc/session.h"
  10 +#include "sherpa-onnx/csrc/text-utils.h"
  11 +#include "sherpa-onnx/csrc/transpose.h"
  12 +
  13 +namespace sherpa_onnx {
  14 +
  15 +class OfflineTeleSpeechCtcModel::Impl {
  16 + public:
  17 + explicit Impl(const OfflineModelConfig &config)
  18 + : config_(config),
  19 + env_(ORT_LOGGING_LEVEL_ERROR),
  20 + sess_opts_(GetSessionOptions(config)),
  21 + allocator_{} {
  22 + auto buf = ReadFile(config_.telespeech_ctc);
  23 + Init(buf.data(), buf.size());
  24 + }
  25 +
  26 +#if __ANDROID_API__ >= 9
  27 + Impl(AAssetManager *mgr, const OfflineModelConfig &config)
  28 + : config_(config),
  29 + env_(ORT_LOGGING_LEVEL_ERROR),
  30 + sess_opts_(GetSessionOptions(config)),
  31 + allocator_{} {
  32 + auto buf = ReadFile(mgr, config_.telespeech_ctc);
  33 + Init(buf.data(), buf.size());
  34 + }
  35 +#endif
  36 +
  37 + std::vector<Ort::Value> Forward(Ort::Value features,
  38 + Ort::Value /*features_length*/) {
  39 + std::vector<int64_t> shape =
  40 + features.GetTensorTypeAndShapeInfo().GetShape();
  41 +
  42 + if (static_cast<int32_t>(shape[0]) != 1) {
  43 + SHERPA_ONNX_LOGE("This model supports only batch size 1. Given %d",
  44 + static_cast<int32_t>(shape[0]));
  45 + }
  46 +
  47 + auto out = sess_->Run({}, input_names_ptr_.data(), &features, 1,
  48 + output_names_ptr_.data(), output_names_ptr_.size());
  49 +
  50 + std::vector<int64_t> logits_shape = {1};
  51 + Ort::Value logits_length = Ort::Value::CreateTensor<int64_t>(
  52 + allocator_, logits_shape.data(), logits_shape.size());
  53 +
  54 + int64_t *dst = logits_length.GetTensorMutableData<int64_t>();
  55 + dst[0] = out[0].GetTensorTypeAndShapeInfo().GetShape()[0];
  56 +
  57 + // (T, B, C) -> (B, T, C)
  58 + Ort::Value logits = Transpose01(allocator_, &out[0]);
  59 +
  60 + std::vector<Ort::Value> ans;
  61 + ans.reserve(2);
  62 + ans.push_back(std::move(logits));
  63 + ans.push_back(std::move(logits_length));
  64 +
  65 + return ans;
  66 + }
  67 +
  68 + int32_t VocabSize() const { return vocab_size_; }
  69 +
  70 + int32_t SubsamplingFactor() const { return subsampling_factor_; }
  71 +
  72 + OrtAllocator *Allocator() const { return allocator_; }
  73 +
  74 + private:
  75 + void Init(void *model_data, size_t model_data_length) {
  76 + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
  77 + sess_opts_);
  78 +
  79 + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
  80 +
  81 + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
  82 +
  83 + // get meta data
  84 + Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
  85 + if (config_.debug) {
  86 + std::ostringstream os;
  87 + PrintModelMetadata(os, meta_data);
  88 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
  89 + }
  90 +
  91 + {
  92 + auto shape =
  93 + sess_->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
  94 + vocab_size_ = shape[2];
  95 + }
  96 + }
  97 +
  98 + private:
  99 + OfflineModelConfig config_;
  100 + Ort::Env env_;
  101 + Ort::SessionOptions sess_opts_;
  102 + Ort::AllocatorWithDefaultOptions allocator_;
  103 +
  104 + std::unique_ptr<Ort::Session> sess_;
  105 +
  106 + std::vector<std::string> input_names_;
  107 + std::vector<const char *> input_names_ptr_;
  108 +
  109 + std::vector<std::string> output_names_;
  110 + std::vector<const char *> output_names_ptr_;
  111 +
  112 + int32_t vocab_size_ = 0;
  113 + int32_t subsampling_factor_ = 4;
  114 +};
  115 +
  116 +OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel(
  117 + const OfflineModelConfig &config)
  118 + : impl_(std::make_unique<Impl>(config)) {}
  119 +
  120 +#if __ANDROID_API__ >= 9
  121 +OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel(
  122 + AAssetManager *mgr, const OfflineModelConfig &config)
  123 + : impl_(std::make_unique<Impl>(mgr, config)) {}
  124 +#endif
  125 +
  126 +OfflineTeleSpeechCtcModel::~OfflineTeleSpeechCtcModel() = default;
  127 +
  128 +std::vector<Ort::Value> OfflineTeleSpeechCtcModel::Forward(
  129 + Ort::Value features, Ort::Value features_length) {
  130 + return impl_->Forward(std::move(features), std::move(features_length));
  131 +}
  132 +
  133 +int32_t OfflineTeleSpeechCtcModel::VocabSize() const {
  134 + return impl_->VocabSize();
  135 +}
  136 +int32_t OfflineTeleSpeechCtcModel::SubsamplingFactor() const {
  137 + return impl_->SubsamplingFactor();
  138 +}
  139 +
  140 +OrtAllocator *OfflineTeleSpeechCtcModel::Allocator() const {
  141 + return impl_->Allocator();
  142 +}
  143 +
  144 +} // namespace sherpa_onnx
// sherpa-onnx/csrc/offline-telespeech-ctc-model.h
//
// Copyright (c)  2024  Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_
#define SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_
#include <memory>
#include <string>
#include <utility>
#include <vector>

#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif

#include "onnxruntime_cxx_api.h"  // NOLINT
#include "sherpa-onnx/csrc/offline-ctc-model.h"
#include "sherpa-onnx/csrc/offline-model-config.h"

namespace sherpa_onnx {

/** This class implements the CTC model from
 * https://github.com/Tele-AI/TeleSpeech-ASR.
 *
 * See
 * https://github.com/lovemefan/telespeech-asr-python/blob/main/telespeechasr/onnx/onnx_infer.py
 * and
 * https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/test.py
 *
 * It plugs into the generic OfflineCtcModel interface so that the shared
 * CTC decoding code can be reused for TeleSpeech models.
 */
class OfflineTeleSpeechCtcModel : public OfflineCtcModel {
 public:
  /// Load the model from the path given in config.telespeech_ctc.
  explicit OfflineTeleSpeechCtcModel(const OfflineModelConfig &config);

#if __ANDROID_API__ >= 9
  /// Android variant: load the model file via the APK asset manager.
  OfflineTeleSpeechCtcModel(AAssetManager *mgr,
                            const OfflineModelConfig &config);
#endif

  ~OfflineTeleSpeechCtcModel() override;

  /** Run the forward method of the model.
   *
   * @param features  A tensor of shape (N, T, C). Note: only N == 1 is
   *                  supported by this model.
   * @param features_length  A 1-D tensor of shape (N,) containing number of
   *                         valid frames in `features` before padding.
   *                         Its dtype is int64_t.
   *
   * @return Return a vector containing:
   *  - log_probs: A 3-D tensor of shape (N, T', vocab_size).
   *  - log_probs_length A 1-D tensor of shape (N,). Its dtype is int64_t
   */
  std::vector<Ort::Value> Forward(Ort::Value features,
                                  Ort::Value features_length) override;

  /** Return the vocabulary size of the model
   */
  int32_t VocabSize() const override;

  /** SubsamplingFactor of the model
   */
  int32_t SubsamplingFactor() const override;

  /** Return an allocator for allocating memory
   */
  OrtAllocator *Allocator() const override;

  // TeleSpeech CTC models do not support batch size > 1
  bool SupportBatchProcessing() const override { return false; }

  // Input features are normalized per feature dimension (zero mean,
  // unit variance) before being fed to the model.
  std::string FeatureNormalizationMethod() const override {
    return "per_feature";
  }

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;  // pimpl: hides onnxruntime details
};

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_
@@ -66,7 +66,7 @@ bool OnlineModelConfig::Validate() const { @@ -66,7 +66,7 @@ bool OnlineModelConfig::Validate() const {
66 if (!modeling_unit.empty() && 66 if (!modeling_unit.empty() &&
67 (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { 67 (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) {
68 if (!FileExists(bpe_vocab)) { 68 if (!FileExists(bpe_vocab)) {
69 - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); 69 + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str());
70 return false; 70 return false;
71 } 71 }
72 } 72 }
@@ -7,6 +7,7 @@ public class OfflineModelConfig { @@ -7,6 +7,7 @@ public class OfflineModelConfig {
7 private final OfflineParaformerModelConfig paraformer; 7 private final OfflineParaformerModelConfig paraformer;
8 private final OfflineWhisperModelConfig whisper; 8 private final OfflineWhisperModelConfig whisper;
9 private final OfflineNemoEncDecCtcModelConfig nemo; 9 private final OfflineNemoEncDecCtcModelConfig nemo;
  10 + private final String teleSpeech;
10 private final String tokens; 11 private final String tokens;
11 private final int numThreads; 12 private final int numThreads;
12 private final boolean debug; 13 private final boolean debug;
@@ -21,6 +22,7 @@ public class OfflineModelConfig { @@ -21,6 +22,7 @@ public class OfflineModelConfig {
21 this.paraformer = builder.paraformer; 22 this.paraformer = builder.paraformer;
22 this.whisper = builder.whisper; 23 this.whisper = builder.whisper;
23 this.nemo = builder.nemo; 24 this.nemo = builder.nemo;
  25 + this.teleSpeech = builder.teleSpeech;
24 this.tokens = builder.tokens; 26 this.tokens = builder.tokens;
25 this.numThreads = builder.numThreads; 27 this.numThreads = builder.numThreads;
26 this.debug = builder.debug; 28 this.debug = builder.debug;
@@ -74,11 +76,16 @@ public class OfflineModelConfig { @@ -74,11 +76,16 @@ public class OfflineModelConfig {
74 return bpeVocab; 76 return bpeVocab;
75 } 77 }
76 78
  79 + public String getTeleSpeech() {
  80 + return teleSpeech;
  81 + }
  82 +
77 public static class Builder { 83 public static class Builder {
78 private OfflineParaformerModelConfig paraformer = OfflineParaformerModelConfig.builder().build(); 84 private OfflineParaformerModelConfig paraformer = OfflineParaformerModelConfig.builder().build();
79 private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build(); 85 private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build();
80 private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build(); 86 private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build();
81 private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); 87 private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
  88 + private String teleSpeech = "";
82 private String tokens = ""; 89 private String tokens = "";
83 private int numThreads = 1; 90 private int numThreads = 1;
84 private boolean debug = true; 91 private boolean debug = true;
@@ -106,6 +113,12 @@ public class OfflineModelConfig { @@ -106,6 +113,12 @@ public class OfflineModelConfig {
106 return this; 113 return this;
107 } 114 }
108 115
  116 +
  117 + public Builder setTeleSpeech(String teleSpeech) {
  118 + this.teleSpeech = teleSpeech;
  119 + return this;
  120 + }
  121 +
109 public Builder setWhisper(OfflineWhisperModelConfig whisper) { 122 public Builder setWhisper(OfflineWhisperModelConfig whisper) {
110 this.whisper = whisper; 123 this.whisper = whisper;
111 return this; 124 return this;
@@ -172,6 +172,12 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { @@ -172,6 +172,12 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
172 ans.model_config.nemo_ctc.model = p; 172 ans.model_config.nemo_ctc.model = p;
173 env->ReleaseStringUTFChars(s, p); 173 env->ReleaseStringUTFChars(s, p);
174 174
  175 + fid = env->GetFieldID(model_config_cls, "teleSpeech", "Ljava/lang/String;");
  176 + s = (jstring)env->GetObjectField(model_config, fid);
  177 + p = env->GetStringUTFChars(s, nullptr);
  178 + ans.model_config.telespeech_ctc = p;
  179 + env->ReleaseStringUTFChars(s, p);
  180 +
175 return ans; 181 return ans;
176 } 182 }
177 183
@@ -35,6 +35,7 @@ data class OfflineModelConfig( @@ -35,6 +35,7 @@ data class OfflineModelConfig(
35 var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(), 35 var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(),
36 var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(), 36 var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(),
37 var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), 37 var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
  38 + var teleSpeech: String = "",
38 var numThreads: Int = 1, 39 var numThreads: Int = 1,
39 var debug: Boolean = false, 40 var debug: Boolean = false,
40 var provider: String = "cpu", 41 var provider: String = "cpu",
@@ -272,6 +273,15 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? { @@ -272,6 +273,15 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
272 tokens = "$modelDir/tokens.txt", 273 tokens = "$modelDir/tokens.txt",
273 ) 274 )
274 } 275 }
  276 +
  277 + 11 -> {
  278 + val modelDir = "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04"
  279 + return OfflineModelConfig(
  280 + teleSpeech = "$modelDir/model.int8.onnx",
  281 + tokens = "$modelDir/tokens.txt",
  282 + modelType = "tele_speech",
  283 + )
  284 + }
275 } 285 }
276 return null 286 return null
277 } 287 }
@@ -29,15 +29,16 @@ void PybindOfflineModelConfig(py::module *m) { @@ -29,15 +29,16 @@ void PybindOfflineModelConfig(py::module *m) {
29 29
30 using PyClass = OfflineModelConfig; 30 using PyClass = OfflineModelConfig;
31 py::class_<PyClass>(*m, "OfflineModelConfig") 31 py::class_<PyClass>(*m, "OfflineModelConfig")
32 - .def(py::init<const OfflineTransducerModelConfig &, 32 + .def(
  33 + py::init<
  34 + const OfflineTransducerModelConfig &,
33 const OfflineParaformerModelConfig &, 35 const OfflineParaformerModelConfig &,
34 const OfflineNemoEncDecCtcModelConfig &, 36 const OfflineNemoEncDecCtcModelConfig &,
35 - const OfflineWhisperModelConfig &,  
36 - const OfflineTdnnModelConfig &, 37 + const OfflineWhisperModelConfig &, const OfflineTdnnModelConfig &,
37 const OfflineZipformerCtcModelConfig &, 38 const OfflineZipformerCtcModelConfig &,
38 const OfflineWenetCtcModelConfig &, const std::string &, 39 const OfflineWenetCtcModelConfig &, const std::string &,
39 - int32_t, bool, const std::string &, const std::string &,  
40 - const std::string &, const std::string &>(), 40 + const std::string &, int32_t, bool, const std::string &,
  41 + const std::string &, const std::string &, const std::string &>(),
41 py::arg("transducer") = OfflineTransducerModelConfig(), 42 py::arg("transducer") = OfflineTransducerModelConfig(),
42 py::arg("paraformer") = OfflineParaformerModelConfig(), 43 py::arg("paraformer") = OfflineParaformerModelConfig(),
43 py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(), 44 py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(),
@@ -45,7 +46,8 @@ void PybindOfflineModelConfig(py::module *m) { @@ -45,7 +46,8 @@ void PybindOfflineModelConfig(py::module *m) {
45 py::arg("tdnn") = OfflineTdnnModelConfig(), 46 py::arg("tdnn") = OfflineTdnnModelConfig(),
46 py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(), 47 py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(),
47 py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), 48 py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(),
48 - py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false, 49 + py::arg("telespeech_ctc") = "", py::arg("tokens"),
  50 + py::arg("num_threads"), py::arg("debug") = false,
49 py::arg("provider") = "cpu", py::arg("model_type") = "", 51 py::arg("provider") = "cpu", py::arg("model_type") = "",
50 py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "") 52 py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "")
51 .def_readwrite("transducer", &PyClass::transducer) 53 .def_readwrite("transducer", &PyClass::transducer)
@@ -55,6 +57,7 @@ void PybindOfflineModelConfig(py::module *m) { @@ -55,6 +57,7 @@ void PybindOfflineModelConfig(py::module *m) {
55 .def_readwrite("tdnn", &PyClass::tdnn) 57 .def_readwrite("tdnn", &PyClass::tdnn)
56 .def_readwrite("zipformer_ctc", &PyClass::zipformer_ctc) 58 .def_readwrite("zipformer_ctc", &PyClass::zipformer_ctc)
57 .def_readwrite("wenet_ctc", &PyClass::wenet_ctc) 59 .def_readwrite("wenet_ctc", &PyClass::wenet_ctc)
  60 + .def_readwrite("telespeech_ctc", &PyClass::telespeech_ctc)
58 .def_readwrite("tokens", &PyClass::tokens) 61 .def_readwrite("tokens", &PyClass::tokens)
59 .def_readwrite("num_threads", &PyClass::num_threads) 62 .def_readwrite("num_threads", &PyClass::num_threads)
60 .def_readwrite("debug", &PyClass::debug) 63 .def_readwrite("debug", &PyClass::debug)
@@ -212,6 +212,71 @@ class OfflineRecognizer(object): @@ -212,6 +212,71 @@ class OfflineRecognizer(object):
212 return self 212 return self
213 213
214 @classmethod 214 @classmethod
  215 + def from_telespeech_ctc(
  216 + cls,
  217 + model: str,
  218 + tokens: str,
  219 + num_threads: int = 1,
  220 + sample_rate: int = 16000,
  221 + feature_dim: int = 40,
  222 + decoding_method: str = "greedy_search",
  223 + debug: bool = False,
  224 + provider: str = "cpu",
  225 + ):
  226 + """
  227 + Please refer to
  228 + `<https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models>`_
  229 + to download pre-trained models.
  230 +
  231 + Args:
  232 + model:
  233 + Path to ``model.onnx``.
  234 + tokens:
  235 + Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
  236 + columns::
  237 +
  238 + symbol integer_id
  239 +
  240 + num_threads:
  241 + Number of threads for neural network computation.
  242 + sample_rate:
  243 + Sample rate of the training data used to train the model. It is
  244 + ignored and is hard-coded in C++ to 40.
  245 + feature_dim:
  246 + Dimension of the feature used to train the model. It is ignored
  247 + and is hard-coded in C++ to 40.
  248 + decoding_method:
  249 + Valid values are greedy_search.
  250 + debug:
  251 + True to show debug messages.
  252 + provider:
  253 + onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
  254 + """
  255 + self = cls.__new__(cls)
  256 + model_config = OfflineModelConfig(
  257 + telespeech_ctc=model,
  258 + tokens=tokens,
  259 + num_threads=num_threads,
  260 + debug=debug,
  261 + provider=provider,
  262 + model_type="nemo_ctc",
  263 + )
  264 +
  265 + feat_config = FeatureExtractorConfig(
  266 + sampling_rate=sample_rate,
  267 + feature_dim=feature_dim,
  268 + )
  269 +
  270 + recognizer_config = OfflineRecognizerConfig(
  271 + feat_config=feat_config,
  272 + model_config=model_config,
  273 + decoding_method=decoding_method,
  274 + )
  275 + self.recognizer = _Recognizer(recognizer_config)
  276 + self.config = recognizer_config
  277 + return self
  278 +
  279 + @classmethod
215 def from_nemo_ctc( 280 def from_nemo_ctc(
216 cls, 281 cls,
217 model: str, 282 model: str,
@@ -102,7 +102,7 @@ func sherpaOnnxOnlineModelConfig( @@ -102,7 +102,7 @@ func sherpaOnnxOnlineModelConfig(
102 debug: Int32(debug), 102 debug: Int32(debug),
103 model_type: toCPointer(modelType), 103 model_type: toCPointer(modelType),
104 modeling_unit: toCPointer(modelingUnit), 104 modeling_unit: toCPointer(modelingUnit),
105 - bpeVocab: toCPointer(bpeVocab) 105 + bpe_vocab: toCPointer(bpeVocab)
106 ) 106 )
107 } 107 }
108 108
@@ -360,7 +360,8 @@ func sherpaOnnxOfflineModelConfig( @@ -360,7 +360,8 @@ func sherpaOnnxOfflineModelConfig(
360 debug: Int = 0, 360 debug: Int = 0,
361 modelType: String = "", 361 modelType: String = "",
362 modelingUnit: String = "cjkchar", 362 modelingUnit: String = "cjkchar",
363 - bpeVocab: String = "" 363 + bpeVocab: String = "",
  364 + teleSpeechCtc: String = ""
364 ) -> SherpaOnnxOfflineModelConfig { 365 ) -> SherpaOnnxOfflineModelConfig {
365 return SherpaOnnxOfflineModelConfig( 366 return SherpaOnnxOfflineModelConfig(
366 transducer: transducer, 367 transducer: transducer,
@@ -374,7 +375,8 @@ func sherpaOnnxOfflineModelConfig( @@ -374,7 +375,8 @@ func sherpaOnnxOfflineModelConfig(
374 provider: toCPointer(provider), 375 provider: toCPointer(provider),
375 model_type: toCPointer(modelType), 376 model_type: toCPointer(modelType),
376 modeling_unit: toCPointer(modelingUnit), 377 modeling_unit: toCPointer(modelingUnit),
377 - bpeVocab: toCPointer(bpeVocab) 378 + bpe_vocab: toCPointer(bpeVocab),
  379 + telespeech_ctc: toCPointer(teleSpeechCtc)
378 ) 380 )
379 } 381 }
380 382
@@ -529,7 +529,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { @@ -529,7 +529,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
529 const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); 529 const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module);
530 530
531 const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + 531 const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
532 - tdnn.len + 7 * 4; 532 + tdnn.len + 8 * 4;
533 const ptr = Module._malloc(len); 533 const ptr = Module._malloc(len);
534 534
535 let offset = 0; 535 let offset = 0;
@@ -553,9 +553,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { @@ -553,9 +553,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
553 const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; 553 const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1;
554 const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; 554 const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1;
555 const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; 555 const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1;
  556 + const teleSpeechCtcLen =
  557 + Module.lengthBytesUTF8(config.teleSpeechCtc || '') + 1;
556 558
557 - const bufferLen =  
558 - tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen; 559 + const bufferLen = tokensLen + providerLen + modelTypeLen + modelingUnitLen +
  560 + bpeVocabLen + teleSpeechCtcLen;
559 const buffer = Module._malloc(bufferLen); 561 const buffer = Module._malloc(bufferLen);
560 562
561 offset = 0; 563 offset = 0;
@@ -575,6 +577,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { @@ -575,6 +577,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
575 Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); 577 Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen);
576 offset += bpeVocabLen; 578 offset += bpeVocabLen;
577 579
  580 + Module.stringToUTF8(
  581 + config.teleSpeechCtc || '', buffer + offset, teleSpeechCtcLen);
  582 + offset += teleSpeechCtcLen;
  583 +
578 offset = 584 offset =
579 transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len; 585 transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len;
580 Module.setValue(ptr + offset, buffer, 'i8*'); // tokens 586 Module.setValue(ptr + offset, buffer, 'i8*'); // tokens
@@ -604,6 +610,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { @@ -604,6 +610,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
604 'i8*'); // bpeVocab 610 'i8*'); // bpeVocab
605 offset += 4; 611 offset += 4;
606 612
  613 + Module.setValue(
  614 + ptr + offset,
  615 + buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen +
  616 + bpeVocabLen,
  617 + 'i8*'); // teleSpeechCtc
  618 + offset += 4;
  619 +
607 return { 620 return {
608 buffer: buffer, ptr: ptr, len: len, transducer: transducer, 621 buffer: buffer, ptr: ptr, len: len, transducer: transducer,
609 paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn 622 paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn
@@ -23,7 +23,7 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == @@ -23,7 +23,7 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
23 sizeof(SherpaOnnxOfflineParaformerModelConfig) + 23 sizeof(SherpaOnnxOfflineParaformerModelConfig) +
24 sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + 24 sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) +
25 sizeof(SherpaOnnxOfflineWhisperModelConfig) + 25 sizeof(SherpaOnnxOfflineWhisperModelConfig) +
26 - sizeof(SherpaOnnxOfflineTdnnModelConfig) + 7 * 4, 26 + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4,
27 ""); 27 "");
28 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); 28 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
29 static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == 29 static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) ==
@@ -92,6 +92,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { @@ -92,6 +92,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
92 fprintf(stdout, "model type: %s\n", model_config->model_type); 92 fprintf(stdout, "model type: %s\n", model_config->model_type);
93 fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit); 93 fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit);
94 fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab); 94 fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab);
  95 + fprintf(stdout, "telespeech_ctc: %s\n", model_config->telespeech_ctc);
95 96
96 fprintf(stdout, "----------feat config----------\n"); 97 fprintf(stdout, "----------feat config----------\n");
97 fprintf(stdout, "sample rate: %d\n", feat->sample_rate); 98 fprintf(stdout, "sample rate: %d\n", feat->sample_rate);