Committed by
GitHub
WebAssembly example for VAD + Non-streaming ASR (#1284)
正在显示
29 个修改的文件
包含
1281 行增加
和
70 行删除
| @@ -25,8 +25,12 @@ jobs: | @@ -25,8 +25,12 @@ jobs: | ||
| 25 | - uses: actions/checkout@v4 | 25 | - uses: actions/checkout@v4 |
| 26 | with: | 26 | with: |
| 27 | fetch-depth: 0 | 27 | fetch-depth: 0 |
| 28 | + | ||
| 28 | - name: Install emsdk | 29 | - name: Install emsdk |
| 29 | uses: mymindstorm/setup-emsdk@v14 | 30 | uses: mymindstorm/setup-emsdk@v14 |
| 31 | + with: | ||
| 32 | + version: 3.1.51 | ||
| 33 | + actions-cache-folder: 'emsdk-cache' | ||
| 30 | 34 | ||
| 31 | - name: View emsdk version | 35 | - name: View emsdk version |
| 32 | shell: bash | 36 | shell: bash |
| @@ -27,6 +27,9 @@ jobs: | @@ -27,6 +27,9 @@ jobs: | ||
| 27 | fetch-depth: 0 | 27 | fetch-depth: 0 |
| 28 | - name: Install emsdk | 28 | - name: Install emsdk |
| 29 | uses: mymindstorm/setup-emsdk@v14 | 29 | uses: mymindstorm/setup-emsdk@v14 |
| 30 | + with: | ||
| 31 | + version: 3.1.51 | ||
| 32 | + actions-cache-folder: 'emsdk-cache' | ||
| 30 | 33 | ||
| 31 | - name: View emsdk version | 34 | - name: View emsdk version |
| 32 | shell: bash | 35 | shell: bash |
| @@ -25,8 +25,12 @@ jobs: | @@ -25,8 +25,12 @@ jobs: | ||
| 25 | - uses: actions/checkout@v4 | 25 | - uses: actions/checkout@v4 |
| 26 | with: | 26 | with: |
| 27 | fetch-depth: 0 | 27 | fetch-depth: 0 |
| 28 | + | ||
| 28 | - name: Install emsdk | 29 | - name: Install emsdk |
| 29 | uses: mymindstorm/setup-emsdk@v14 | 30 | uses: mymindstorm/setup-emsdk@v14 |
| 31 | + with: | ||
| 32 | + version: 3.1.51 | ||
| 33 | + actions-cache-folder: 'emsdk-cache' | ||
| 30 | 34 | ||
| 31 | - name: View emsdk version | 35 | - name: View emsdk version |
| 32 | shell: bash | 36 | shell: bash |
| @@ -25,6 +25,7 @@ jobs: | @@ -25,6 +25,7 @@ jobs: | ||
| 25 | - uses: actions/checkout@v4 | 25 | - uses: actions/checkout@v4 |
| 26 | with: | 26 | with: |
| 27 | fetch-depth: 0 | 27 | fetch-depth: 0 |
| 28 | + | ||
| 28 | - name: Install emsdk | 29 | - name: Install emsdk |
| 29 | uses: mymindstorm/setup-emsdk@v14 | 30 | uses: mymindstorm/setup-emsdk@v14 |
| 30 | with: | 31 | with: |
| 1 | +name: wasm-simd-hf-space-vad-asr | ||
| 2 | + | ||
| 3 | +on: | ||
| 4 | + push: | ||
| 5 | + branches: | ||
| 6 | + - wasm | ||
| 7 | + tags: | ||
| 8 | + - 'v[0-9]+.[0-9]+.[0-9]+*' | ||
| 9 | + | ||
| 10 | + workflow_dispatch: | ||
| 11 | + | ||
| 12 | +concurrency: | ||
| 13 | + group: wasm-simd-hf-space-vad-asr${{ github.ref }} | ||
| 14 | + cancel-in-progress: true | ||
| 15 | + | ||
| 16 | +jobs: | ||
| 17 | + wasm-simd-hf-space-vad-asr: | ||
| 18 | + name: ${{ matrix.index }}/${{ matrix.total }} | ||
| 19 | + runs-on: ${{ matrix.os }} | ||
| 20 | + strategy: | ||
| 21 | + fail-fast: false | ||
| 22 | + matrix: | ||
| 23 | + os: [ubuntu-latest] | ||
| 24 | + total: ["8"] | ||
| 25 | + index: ["0", "1", "2", "3", "4", "5", "6", "7"] | ||
| 26 | + | ||
| 27 | + steps: | ||
| 28 | + - uses: actions/checkout@v4 | ||
| 29 | + with: | ||
| 30 | + fetch-depth: 0 | ||
| 31 | + | ||
| 32 | + - name: Install Python dependencies | ||
| 33 | + shell: bash | ||
| 34 | + run: | | ||
| 35 | + python3 -m pip install --upgrade pip jinja2 | ||
| 36 | + | ||
| 37 | + - name: Install emsdk | ||
| 38 | + uses: mymindstorm/setup-emsdk@v14 | ||
| 39 | + with: | ||
| 40 | + version: 3.1.51 | ||
| 41 | + actions-cache-folder: 'emsdk-cache' | ||
| 42 | + | ||
| 43 | + - name: View emsdk version | ||
| 44 | + shell: bash | ||
| 45 | + run: | | ||
| 46 | + emcc -v | ||
| 47 | + echo "--------------------" | ||
| 48 | + emcc --check | ||
| 49 | + | ||
| 50 | + - name: Generate build script | ||
| 51 | + shell: bash | ||
| 52 | + run: | | ||
| 53 | + cd scripts/wasm | ||
| 54 | + | ||
| 55 | + total=${{ matrix.total }} | ||
| 56 | + index=${{ matrix.index }} | ||
| 57 | + | ||
| 58 | + ./generate-vad-asr.py --total $total --index $index | ||
| 59 | + | ||
| 60 | + chmod +x run-vad-asr.sh | ||
| 61 | + mv -v ./run-vad-asr.sh ../.. | ||
| 62 | + | ||
| 63 | + - name: Show build scripts | ||
| 64 | + shell: bash | ||
| 65 | + run: | | ||
| 66 | + cat ./run-vad-asr.sh | ||
| 67 | + | ||
| 68 | + - uses: actions/upload-artifact@v4 | ||
| 69 | + with: | ||
| 70 | + name: run-vad-asr-${{ matrix.index }} | ||
| 71 | + path: ./run-vad-asr.sh | ||
| 72 | + | ||
| 73 | + - name: Build sherpa-onnx for WebAssembly | ||
| 74 | + shell: bash | ||
| 75 | + env: | ||
| 76 | + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }} | ||
| 77 | + HF_TOKEN: ${{ secrets.HF_TOKEN }} | ||
| 78 | + run: | | ||
| 79 | + ./run-vad-asr.sh | ||
| 80 | + | ||
| 81 | + - name: Release jar | ||
| 82 | + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') | ||
| 83 | + uses: svenstaro/upload-release-action@v2 | ||
| 84 | + with: | ||
| 85 | + file_glob: true | ||
| 86 | + overwrite: true | ||
| 87 | + file: ./*.tar.bz2 | ||
| 88 | + | ||
| 89 | + - name: Upload wasm files | ||
| 90 | + uses: actions/upload-artifact@v4 | ||
| 91 | + with: | ||
| 92 | + name: sherpa-onnx-wasm-simd-vad-asr-${{ matrix.index }} | ||
| 93 | + path: ./sherpa-onnx-wasm-simd-*.tar.bz2 |
| @@ -25,8 +25,12 @@ jobs: | @@ -25,8 +25,12 @@ jobs: | ||
| 25 | - uses: actions/checkout@v4 | 25 | - uses: actions/checkout@v4 |
| 26 | with: | 26 | with: |
| 27 | fetch-depth: 0 | 27 | fetch-depth: 0 |
| 28 | + | ||
| 28 | - name: Install emsdk | 29 | - name: Install emsdk |
| 29 | uses: mymindstorm/setup-emsdk@v14 | 30 | uses: mymindstorm/setup-emsdk@v14 |
| 31 | + with: | ||
| 32 | + version: 3.1.51 | ||
| 33 | + actions-cache-folder: 'emsdk-cache' | ||
| 30 | 34 | ||
| 31 | - name: View emsdk version | 35 | - name: View emsdk version |
| 32 | shell: bash | 36 | shell: bash |
| @@ -25,8 +25,12 @@ jobs: | @@ -25,8 +25,12 @@ jobs: | ||
| 25 | - uses: actions/checkout@v4 | 25 | - uses: actions/checkout@v4 |
| 26 | with: | 26 | with: |
| 27 | fetch-depth: 0 | 27 | fetch-depth: 0 |
| 28 | + | ||
| 28 | - name: Install emsdk | 29 | - name: Install emsdk |
| 29 | uses: mymindstorm/setup-emsdk@v14 | 30 | uses: mymindstorm/setup-emsdk@v14 |
| 31 | + with: | ||
| 32 | + version: 3.1.51 | ||
| 33 | + actions-cache-folder: 'emsdk-cache' | ||
| 30 | 34 | ||
| 31 | - name: View emsdk version | 35 | - name: View emsdk version |
| 32 | shell: bash | 36 | shell: bash |
| @@ -25,8 +25,12 @@ jobs: | @@ -25,8 +25,12 @@ jobs: | ||
| 25 | - uses: actions/checkout@v4 | 25 | - uses: actions/checkout@v4 |
| 26 | with: | 26 | with: |
| 27 | fetch-depth: 0 | 27 | fetch-depth: 0 |
| 28 | + | ||
| 28 | - name: Install emsdk | 29 | - name: Install emsdk |
| 29 | uses: mymindstorm/setup-emsdk@v14 | 30 | uses: mymindstorm/setup-emsdk@v14 |
| 31 | + with: | ||
| 32 | + version: 3.1.51 | ||
| 33 | + actions-cache-folder: 'emsdk-cache' | ||
| 30 | 34 | ||
| 31 | - name: View emsdk version | 35 | - name: View emsdk version |
| 32 | shell: bash | 36 | shell: bash |
| @@ -36,6 +36,7 @@ option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) | @@ -36,6 +36,7 @@ option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) | ||
| 36 | option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) | 36 | option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) |
| 37 | option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) | 37 | option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) |
| 38 | option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF) | 38 | option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF) |
| 39 | +option(SHERPA_ONNX_ENABLE_WASM_VAD_ASR "Whether to enable WASM for VAD+ASR" OFF) | ||
| 39 | option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) | 40 | option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) |
| 40 | option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) | 41 | option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) |
| 41 | option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON) | 42 | option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON) |
| @@ -137,6 +138,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") | @@ -137,6 +138,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") | ||
| 137 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") | 138 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") |
| 138 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") | 139 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") |
| 139 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}") | 140 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}") |
| 141 | +message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD_ASR ${SHERPA_ONNX_ENABLE_WASM_VAD_ASR}") | ||
| 140 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") | 142 | message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") |
| 141 | message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}") | 143 | message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}") |
| 142 | message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}") | 144 | message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}") |
| @@ -211,11 +213,22 @@ if(SHERPA_ONNX_ENABLE_WASM) | @@ -211,11 +213,22 @@ if(SHERPA_ONNX_ENABLE_WASM) | ||
| 211 | endif() | 213 | endif() |
| 212 | 214 | ||
| 213 | if(SHERPA_ONNX_ENABLE_WASM_KWS) | 215 | if(SHERPA_ONNX_ENABLE_WASM_KWS) |
| 216 | + if(NOT SHERPA_ONNX_ENABLE_WASM) | ||
| 217 | + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for KWS") | ||
| 218 | + endif() | ||
| 214 | add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) | 219 | add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) |
| 215 | endif() | 220 | endif() |
| 216 | 221 | ||
| 217 | if(SHERPA_ONNX_ENABLE_WASM_VAD) | 222 | if(SHERPA_ONNX_ENABLE_WASM_VAD) |
| 218 | - add_definitions(-DSHERPA_ONNX_ENABLE_WASM_VAD=1) | 223 | + if(NOT SHERPA_ONNX_ENABLE_WASM) |
| 224 | + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD") | ||
| 225 | + endif() | ||
| 226 | +endif() | ||
| 227 | + | ||
| 228 | +if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR) | ||
| 229 | + if(NOT SHERPA_ONNX_ENABLE_WASM) | ||
| 230 | + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD+ASR") | ||
| 231 | + endif() | ||
| 219 | endif() | 232 | endif() |
| 220 | 233 | ||
| 221 | if(NOT CMAKE_CXX_STANDARD) | 234 | if(NOT CMAKE_CXX_STANDARD) |
| @@ -14,13 +14,13 @@ | @@ -14,13 +14,13 @@ | ||
| 14 | 14 | ||
| 15 | ### Supported platforms | 15 | ### Supported platforms |
| 16 | 16 | ||
| 17 | -|Architecture| Android | iOS | Windows | macOS | linux | | ||
| 18 | -|------------|------------------|---------------|------------|-------|-------| | ||
| 19 | -| x64 | ✔️ | | ✔️ | ✔️ | ✔️ | | ||
| 20 | -| x86 | ✔️ | | ✔️ | | | | ||
| 21 | -| arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | | ||
| 22 | -| arm32 | ✔️ | | | | ✔️ | | ||
| 23 | -| riscv64 | | | | | ✔️ | | 17 | +|Architecture| Android | iOS | Windows | macOS | linux | |
| 18 | +|------------|---------|---------|------------|-------|-------| | ||
| 19 | +| x64 | ✔️ | | ✔️ | ✔️ | ✔️ | | ||
| 20 | +| x86 | ✔️ | | ✔️ | | | | ||
| 21 | +| arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | | ||
| 22 | +| arm32 | ✔️ | | | | ✔️ | | ||
| 23 | +| riscv64 | | | | | ✔️ | | ||
| 24 | 24 | ||
| 25 | 25 | ||
| 26 | ### Supported programming languages | 26 | ### Supported programming languages |
| @@ -37,7 +37,7 @@ | @@ -37,7 +37,7 @@ | ||
| 37 | |-------|----------|----------|------------| | 37 | |-------|----------|----------|------------| |
| 38 | | ✔️ | ✔️ | ✔️ | ✔️ | | 38 | | ✔️ | ✔️ | ✔️ | ✔️ | |
| 39 | 39 | ||
| 40 | -For Rust support, please see https://github.com/thewh1teagle/sherpa-rs | 40 | +For Rust support, please see [sherpa-rs][sherpa-rs] |
| 41 | 41 | ||
| 42 | It also supports WebAssembly. | 42 | It also supports WebAssembly. |
| 43 | 43 | ||
| @@ -51,7 +51,7 @@ This repository supports running the following functions **locally** | @@ -51,7 +51,7 @@ This repository supports running the following functions **locally** | ||
| 51 | - Speaker verification | 51 | - Speaker verification |
| 52 | - Spoken language identification | 52 | - Spoken language identification |
| 53 | - Audio tagging | 53 | - Audio tagging |
| 54 | - - VAD (e.g., [silero-vad](https://github.com/snakers4/silero-vad)) | 54 | + - VAD (e.g., [silero-vad][silero-vad]) |
| 55 | - Keyword spotting | 55 | - Keyword spotting |
| 56 | 56 | ||
| 57 | on the following platforms and operating systems: | 57 | on the following platforms and operating systems: |
| @@ -62,11 +62,12 @@ on the following platforms and operating systems: | @@ -62,11 +62,12 @@ on the following platforms and operating systems: | ||
| 62 | - iOS | 62 | - iOS |
| 63 | - NodeJS | 63 | - NodeJS |
| 64 | - WebAssembly | 64 | - WebAssembly |
| 65 | - - [Raspberry Pi](https://www.raspberrypi.com/) | ||
| 66 | - - [RV1126](https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf) | ||
| 67 | - - [LicheePi4A](https://sipeed.com/licheepi4a) | ||
| 68 | - - [VisionFive 2](https://www.starfivetech.com/en/site/boards) | ||
| 69 | - - [旭日X3派](https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html) | 65 | + - [Raspberry Pi][Raspberry Pi] |
| 66 | + - [RV1126][RV1126] | ||
| 67 | + - [LicheePi4A][LicheePi4A] | ||
| 68 | + - [VisionFive 2][VisionFive 2] | ||
| 69 | + - [旭日X3派][旭日X3派] | ||
| 70 | + - [爱芯派][爱芯派] | ||
| 70 | - etc | 71 | - etc |
| 71 | 72 | ||
| 72 | with the following APIs | 73 | with the following APIs |
| @@ -81,59 +82,68 @@ with the following APIs | @@ -81,59 +82,68 @@ with the following APIs | ||
| 81 | You can visit the following Huggingface spaces to try `sherpa-onnx` without | 82 | You can visit the following Huggingface spaces to try `sherpa-onnx` without |
| 82 | installing anything. All you need is a browser. | 83 | installing anything. All you need is a browser. |
| 83 | 84 | ||
| 84 | -| Description | URL | | ||
| 85 | -|---|---| | ||
| 86 | -| Speech recognition | [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition)| | ||
| 87 | -| Speech recognition with [Whisper](https://github.com/openai/whisper)| [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper)| | ||
| 88 | -| Speech synthesis | [Click me](https://huggingface.co/spaces/k2-fsa/text-to-speech)| | ||
| 89 | -| Generate subtitles| [Click me](https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos)| | ||
| 90 | -|Audio tagging| [Click me](https://huggingface.co/spaces/k2-fsa/audio-tagging)| | ||
| 91 | -|Spoken language identification with [Whisper](https://github.com/openai/whisper)|[Click me](https://huggingface.co/spaces/k2-fsa/spoken-language-identification)| | 85 | +| Description | URL | |
| 86 | +|-------------------------------------------------------|------------------------------------| | ||
| 87 | +| Speech recognition | [Click me][hf-space-asr] | | ||
| 88 | +| Speech recognition with [Whisper][Whisper] | [Click me][hf-space-asr-whisper] | | ||
| 89 | +| Speech synthesis | [Click me][hf-space-tts] | | ||
| 90 | +| Generate subtitles | [Click me][hf-space-subtitle] | | ||
| 91 | +| Audio tagging | [Click me][hf-space-audio-tagging] | | ||
| 92 | +| Spoken language identification with [Whisper][Whisper]| [Click me][hf-space-slid-whisper] | | ||
| 92 | 93 | ||
| 93 | We also have spaces built using WebAssembly. The are listed below: | 94 | We also have spaces built using WebAssembly. The are listed below: |
| 94 | 95 | ||
| 95 | -| Description | URL| Chinese users| | ||
| 96 | -|---|---|---| | ||
| 97 | -|Voice activity detection with [silero-vad](https://github.com/snakers4/silero-vad)| [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx)|[地址](https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx)| | ||
| 98 | -|Real-time speech recognition (Chinese + English) with Zipformer | [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)| | ||
| 99 | -|Real-time speech recognition (Chinese + English) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| | ||
| 100 | -|Real-time speech recognition (Chinese + English + Cantonese) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| | ||
| 101 | -|Real-time speech recognition (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en)| | ||
| 102 | -|Speech synthesis (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en)| | ||
| 103 | -|Speech synthesis (German)|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de)| | 96 | +| Description | Huggingface space| ModelScope space| |
| 97 | +|------------------------------------------------------------------------------------------|------------------|-----------------| | ||
| 98 | +|Voice activity detection with [silero-vad][silero-vad] | [Click me][wasm-hf-vad]|[地址][wasm-ms-vad]| | ||
| 99 | +|Real-time speech recognition (Chinese + English) with Zipformer | [Click me][wasm-hf-streaming-asr-zh-en-zipformer]|[地址][wasm-hf-streaming-asr-zh-en-zipformer]| | ||
| 100 | +|Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]| | ||
| 101 | +|Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]| | ||
| 102 | +|Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]| | ||
| 103 | +|VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]| | ||
| 104 | +|VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]| | ||
| 105 | +|VAD + speech recognition (English) with Zipformer trained with [GigaSpeech][GigaSpeech] |[Click me][wasm-hf-vad-asr-en-zipformer-gigaspeech]| [地址][wasm-ms-vad-asr-en-zipformer-gigaspeech]| | ||
| 106 | +|VAD + speech recognition (Chinese) with Zipformer trained with [WenetSpeech][WenetSpeech] |[Click me][wasm-hf-vad-asr-zh-zipformer-wenetspeech]| [地址][wasm-ms-vad-asr-zh-zipformer-wenetspeech]| | ||
| 107 | +|VAD + speech recognition (Japanese) with Zipformer trained with [ReazonSpeech][ReazonSpeech]|[Click me][wasm-hf-vad-asr-ja-zipformer-reazonspeech]| [地址][wasm-ms-vad-asr-ja-zipformer-reazonspeech]| | ||
| 108 | +|VAD + speech recognition (Thai) with Zipformer trained with [GigaSpeech2][GigaSpeech2] |[Click me][wasm-hf-vad-asr-th-zipformer-gigaspeech2]| [地址][wasm-ms-vad-asr-th-zipformer-gigaspeech2]| | ||
| 109 | +|VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]| | ||
| 110 | +|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]| | ||
| 111 | +|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]| | ||
| 112 | +|Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]| | ||
| 113 | +|Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]| | ||
| 104 | 114 | ||
| 105 | ### Links for pre-built Android APKs | 115 | ### Links for pre-built Android APKs |
| 106 | 116 | ||
| 107 | -| Description | URL | 中国用户 | | ||
| 108 | -|--------------------------------|-----------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| | ||
| 109 | -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html) | | ||
| 110 | -| Text-to-speech | [Address](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html) | | ||
| 111 | -|Voice activity detection (VAD) | [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html)| | ||
| 112 | -|VAD + non-streaming speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html)| | ||
| 113 | -|Two-pass speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html)| | ||
| 114 | -| Audio tagging | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html) | | ||
| 115 | -| Audio tagging (WearOS) | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html) | | ||
| 116 | -| Speaker identification | [Address](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html) | | ||
| 117 | -| Spoken language identification | [Address](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html) | | ||
| 118 | -|Keyword spotting| [Address](https://k2-fsa.github.io/sherpa/onnx/kws/apk.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html)| | 117 | +| Description | URL | 中国用户 | |
| 118 | +|----------------------------------------|------------------------------|-----------------------------| | ||
| 119 | +| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn]| | ||
| 120 | +| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | | ||
| 121 | +| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | | ||
| 122 | +| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | | ||
| 123 | +| Two-pass speech recognition | [Address][apk-2pass] | [点此][apk-2pass-cn] | | ||
| 124 | +| Audio tagging | [Address][apk-at] | [点此][apk-at-cn] | | ||
| 125 | +| Audio tagging (WearOS) | [Address][apk-at-wearos] | [点此][apk-at-wearos-cn] | | ||
| 126 | +| Speaker identification | [Address][apk-sid] | [点此][apk-sid-cn] | | ||
| 127 | +| Spoken language identification | [Address][apk-slid] | [点此][apk-slid-cn] | | ||
| 128 | +| Keyword spotting | [Address][apk-kws] | [点此][apk-kws-cn] | | ||
| 119 | 129 | ||
| 120 | ### Links for pre-built Flutter APPs | 130 | ### Links for pre-built Flutter APPs |
| 121 | 131 | ||
| 122 | #### Real-time speech recognition | 132 | #### Real-time speech recognition |
| 123 | 133 | ||
| 124 | -| Description | URL | 中国用户 | | ||
| 125 | -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------| | ||
| 126 | -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html)| | 134 | +| Description | URL | 中国用户 | |
| 135 | +|--------------------------------|-------------------------------------|-------------------------------------| | ||
| 136 | +| Streaming speech recognition | [Address][apk-flutter-streaming-asr]| [点此][apk-flutter-streaming-asr-cn]| | ||
| 127 | 137 | ||
| 128 | #### Text-to-speech | 138 | #### Text-to-speech |
| 129 | 139 | ||
| 130 | -| Description | URL | 中国用户 | | ||
| 131 | -|--------------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------| | ||
| 132 | -| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html)| | ||
| 133 | -| Linux (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html) | | ||
| 134 | -| macOS (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html) | | ||
| 135 | -| macOS (arm64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html)| | ||
| 136 | -| Windows (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html) | | 140 | +| Description | URL | 中国用户 | |
| 141 | +|------------------------------------------|------------------------------------|------------------------------------| | ||
| 142 | +| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address][flutter-tts-android] | [点此][flutter-tts-android-cn] | | ||
| 143 | +| Linux (x64) | [Address][flutter-tts-linux] | [点此][flutter-tts-linux-cn] | | ||
| 144 | +| macOS (x64) | [Address][flutter-tts-macos-x64] | [点此][flutter-tts-macos-arm64-cn] | | ||
| 145 | +| macOS (arm64) | [Address][flutter-tts-macos-arm64] | [点此][flutter-tts-macos-x64-cn] | | ||
| 146 | +| Windows (x64) | [Address][flutter-tts-win-x64] | [点此][flutter-tts-win-x64-cn] | | ||
| 137 | 147 | ||
| 138 | > Note: You need to build from source for iOS. | 148 | > Note: You need to build from source for iOS. |
| 139 | 149 | ||
| @@ -141,23 +151,23 @@ We also have spaces built using WebAssembly. The are listed below: | @@ -141,23 +151,23 @@ We also have spaces built using WebAssembly. The are listed below: | ||
| 141 | 151 | ||
| 142 | #### Generating subtitles | 152 | #### Generating subtitles |
| 143 | 153 | ||
| 144 | -| Description | URL | 中国用户 | | ||
| 145 | -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------| | ||
| 146 | -| Generate subtitles (生成字幕) | [Address](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html)| | 154 | +| Description | URL | 中国用户 | |
| 155 | +|--------------------------------|----------------------------|----------------------------| | ||
| 156 | +| Generate subtitles (生成字幕) | [Address][lazarus-subtitle]| [点此][lazarus-subtitle-cn]| | ||
| 147 | 157 | ||
| 148 | 158 | ||
| 149 | ### Links for pre-trained models | 159 | ### Links for pre-trained models |
| 150 | 160 | ||
| 151 | -| Description | URL | | ||
| 152 | -|--------------------------------|--------------------------------------------------------------------------------------------------------------------------------| | ||
| 153 | -| Speech recognition (speech to text, ASR) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | | ||
| 154 | -| Text-to-speech (TTS) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) | | ||
| 155 | -| VAD | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx)| | ||
| 156 | -| Keyword spotting |[Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models)| | ||
| 157 | -| Audio tagging | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models)| | ||
| 158 | -| Speaker identification (Speaker ID) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models)| | ||
| 159 | -| Spoken language identification (Language ID) | See multi-lingual [Whisper](https://github.com/openai/whisper) ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | | ||
| 160 | -| Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)| | 161 | +| Description | URL | |
| 162 | +|---------------------------------------------|---------------------------------------------------------------------------------------| | ||
| 163 | +| Speech recognition (speech to text, ASR) | [Address][asr-models] | | ||
| 164 | +| Text-to-speech (TTS) | [Address][tts-models] | | ||
| 165 | +| VAD | [Address][vad-models] | | ||
| 166 | +| Keyword spotting | [Address][kws-models] | | ||
| 167 | +| Audio tagging | [Address][at-models] | | ||
| 168 | +| Speaker identification (Speaker ID) | [Address][sid-models] | | ||
| 169 | +| Spoken language identification (Language ID)| See multi-lingual [Whisper][Whisper] ASR models from [Speech recognition][asr-models]| | ||
| 170 | +| Punctuation | [Address][punct-models] | | ||
| 161 | 171 | ||
| 162 | ### Useful links | 172 | ### Useful links |
| 163 | 173 | ||
| @@ -169,3 +179,100 @@ We also have spaces built using WebAssembly. The are listed below: | @@ -169,3 +179,100 @@ We also have spaces built using WebAssembly. The are listed below: | ||
| 169 | Please see | 179 | Please see |
| 170 | https://k2-fsa.github.io/sherpa/social-groups.html | 180 | https://k2-fsa.github.io/sherpa/social-groups.html |
| 171 | for 新一代 Kaldi **微信交流群** and **QQ 交流群**. | 181 | for 新一代 Kaldi **微信交流群** and **QQ 交流群**. |
| 182 | + | ||
| 183 | +[sherpa-rs]: https://github.com/thewh1teagle/sherpa-rs | ||
| 184 | +[silero-vad]: https://github.com/snakers4/silero-vad | ||
| 185 | +[Raspberry Pi]: https://www.raspberrypi.com/ | ||
| 186 | +[RV1126]: https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf | ||
| 187 | +[LicheePi4A]: https://sipeed.com/licheepi4a | ||
| 188 | +[VisionFive 2]: https://www.starfivetech.com/en/site/boards | ||
| 189 | +[旭日X3派]: https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html | ||
| 190 | +[爱芯派]: https://wiki.sipeed.com/hardware/zh/maixIII/ax-pi/axpi.html | ||
| 191 | +[hf-space-asr]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition | ||
| 192 | +[Whisper]: https://github.com/openai/whisper | ||
| 193 | +[hf-space-asr-whisper]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper | ||
| 194 | +[hf-space-tts]: https://huggingface.co/spaces/k2-fsa/text-to-speech | ||
| 195 | +[hf-space-subtitle]: https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos | ||
| 196 | +[hf-space-audio-tagging]: https://huggingface.co/spaces/k2-fsa/audio-tagging | ||
| 197 | +[hf-space-slid-whisper]: https://huggingface.co/spaces/k2-fsa/spoken-language-identification | ||
| 198 | +[wasm-hf-vad]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx | ||
| 199 | +[wasm-ms-vad]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx | ||
| 200 | +[wasm-hf-streaming-asr-zh-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en | ||
| 201 | +[wasm-ms-streaming-asr-zh-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en | ||
| 202 | +[wasm-hf-streaming-asr-zh-en-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer | ||
| 203 | +[wasm-ms-streaming-asr-zh-en-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer | ||
| 204 | +[Paraformer-large]: https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary | ||
| 205 | +[wasm-hf-streaming-asr-zh-en-yue-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer | ||
| 206 | +[wasm-ms-streaming-asr-zh-en-yue-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer | ||
| 207 | +[wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en | ||
| 208 | +[wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en | ||
| 209 | +[SenseVoice]: https://github.com/FunAudioLLM/SenseVoice | ||
| 210 | +[wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice | ||
| 211 | +[wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice | ||
| 212 | +[wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny | ||
| 213 | +[wasm-ms-vad-asr-en-whisper-tiny-en]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny | ||
| 214 | +[wasm-hf-vad-asr-en-zipformer-gigaspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech | ||
| 215 | +[wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech | ||
| 216 | +[wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech | ||
| 217 | +[wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech | ||
| 218 | +[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf | ||
| 219 | +[wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer | ||
| 220 | +[wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer | ||
| 221 | +[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2 | ||
| 222 | +[wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer | ||
| 223 | +[wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer | ||
| 224 | +[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR | ||
| 225 | +[wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech | ||
| 226 | +[wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech | ||
| 227 | +[wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer | ||
| 228 | +[wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer | ||
| 229 | +[wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small | ||
| 230 | +[wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small | ||
| 231 | +[wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en | ||
| 232 | +[wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en | ||
| 233 | +[wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de | ||
| 234 | +[wasm-ms-tts-piper-de]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de | ||
| 235 | +[apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html | ||
| 236 | +[apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html | ||
| 237 | +[apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html | ||
| 238 | +[apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html | ||
| 239 | +[apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html | ||
| 240 | +[apk-vad-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html | ||
| 241 | +[apk-vad-asr]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html | ||
| 242 | +[apk-vad-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html | ||
| 243 | +[apk-2pass]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html | ||
| 244 | +[apk-2pass-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html | ||
| 245 | +[apk-at]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html | ||
| 246 | +[apk-at-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html | ||
| 247 | +[apk-at-wearos]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html | ||
| 248 | +[apk-at-wearos-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html | ||
| 249 | +[apk-sid]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html | ||
| 250 | +[apk-sid-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html | ||
| 251 | +[apk-slid]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html | ||
| 252 | +[apk-slid-cn]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html | ||
| 253 | +[apk-kws]: https://k2-fsa.github.io/sherpa/onnx/kws/apk.html | ||
| 254 | +[apk-kws-cn]: https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html | ||
| 255 | +[apk-flutter-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html | ||
| 256 | +[apk-flutter-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html | ||
| 257 | +[flutter-tts-android]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html | ||
| 258 | +[flutter-tts-android-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html | ||
| 259 | +[flutter-tts-linux]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html | ||
| 260 | +[flutter-tts-linux-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html | ||
| 261 | +[flutter-tts-macos-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html | ||
| 262 | +[flutter-tts-macos-arm64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html | ||
| 263 | +[flutter-tts-macos-arm64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html | ||
| 264 | +[flutter-tts-macos-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html | ||
| 265 | +[flutter-tts-win-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html | ||
| 266 | +[flutter-tts-win-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html | ||
| 267 | +[lazarus-subtitle]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html | ||
| 268 | +[lazarus-subtitle-cn]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html | ||
| 269 | +[asr-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | ||
| 270 | +[tts-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | ||
| 271 | +[vad-models]: https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 272 | +[kws-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models | ||
| 273 | +[at-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models | ||
| 274 | +[sid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 275 | +[slid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 276 | +[punct-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models | ||
| 277 | +[GigaSpeech]: https://github.com/SpeechColab/GigaSpeech | ||
| 278 | +[WenetSpeech]: https://github.com/wenet-e2e/WenetSpeech |
build-wasm-simd-vad-asr.sh
0 → 100755
| 1 | +#!/usr/bin/env bash | ||
| 2 | +# Copyright (c) 2024 Xiaomi Corporation | ||
| 3 | +# | ||
| 4 | +# This script is to build sherpa-onnx for WebAssembly (VAD+ASR) | ||
| 5 | +# Note: ASR here means non-streaming ASR | ||
| 6 | + | ||
| 7 | +set -ex | ||
| 8 | + | ||
| 9 | +if [ x"$EMSCRIPTEN" == x"" ]; then | ||
| 10 | + if ! command -v emcc &> /dev/null; then | ||
| 11 | + echo "Please install emscripten first" | ||
| 12 | + echo "" | ||
| 13 | + echo "You can use the following commands to install it:" | ||
| 14 | + echo "" | ||
| 15 | + echo "git clone https://github.com/emscripten-core/emsdk.git" | ||
| 16 | + echo "cd emsdk" | ||
| 17 | + echo "git pull" | ||
| 18 | + echo "./emsdk install latest" | ||
| 19 | + echo "./emsdk activate latest" | ||
| 20 | + echo "source ./emsdk_env.sh" | ||
| 21 | + exit 1 | ||
| 22 | + else | ||
| 23 | + EMSCRIPTEN=$(dirname $(realpath $(which emcc))) | ||
| 24 | + fi | ||
| 25 | +fi | ||
| 26 | + | ||
| 27 | +export EMSCRIPTEN=$EMSCRIPTEN | ||
| 28 | +echo "EMSCRIPTEN: $EMSCRIPTEN" | ||
| 29 | +if [ ! -f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then | ||
| 30 | + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" | ||
| 31 | + echo "Please make sure you have installed emsdk correctly" | ||
| 32 | + exit 1 | ||
| 33 | +fi | ||
| 34 | + | ||
| 35 | +mkdir -p build-wasm-simd-vad-asr | ||
| 36 | +pushd build-wasm-simd-vad-asr | ||
| 37 | + | ||
| 38 | +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON | ||
| 39 | + | ||
| 40 | +cmake \ | ||
| 41 | + -DCMAKE_INSTALL_PREFIX=./install \ | ||
| 42 | + -DCMAKE_BUILD_TYPE=Release \ | ||
| 43 | + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \ | ||
| 44 | + \ | ||
| 45 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 46 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 47 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 48 | + -DBUILD_SHARED_LIBS=OFF \ | ||
| 49 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 50 | + -DSHERPA_ONNX_ENABLE_JNI=OFF \ | ||
| 51 | + -DSHERPA_ONNX_ENABLE_TTS=OFF \ | ||
| 52 | + -DSHERPA_ONNX_ENABLE_C_API=ON \ | ||
| 53 | + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ | ||
| 54 | + -DSHERPA_ONNX_ENABLE_GPU=OFF \ | ||
| 55 | + -DSHERPA_ONNX_ENABLE_WASM=ON \ | ||
| 56 | + -DSHERPA_ONNX_ENABLE_WASM_VAD_ASR=ON \ | ||
| 57 | + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ | ||
| 58 | + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \ | ||
| 59 | + .. | ||
| 60 | +make -j2 | ||
| 61 | +make install | ||
| 62 | + | ||
| 63 | +echo "pwd: $PWD" | ||
| 64 | + | ||
| 65 | +cp -fv ../wasm/vad/sherpa-onnx-vad.js ./install/bin/wasm/vad-asr/ | ||
| 66 | +cp -fv ../wasm/asr/sherpa-onnx-asr.js ./install/bin/wasm/vad-asr/ | ||
| 67 | + | ||
| 68 | +ls -lh install/bin/wasm/vad-asr |
scripts/wasm/generate-vad-asr.py
0 → 100755
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +import argparse | ||
| 4 | +from dataclasses import dataclass | ||
| 5 | +from typing import List, Optional | ||
| 6 | + | ||
| 7 | +import jinja2 | ||
| 8 | + | ||
| 9 | + | ||
def get_args():
    """Parse command-line arguments used to shard work across CI runners."""
    p = argparse.ArgumentParser()
    p.add_argument("--total", type=int, default=1, help="Number of runners")
    p.add_argument(
        "--index", type=int, default=0, help="Index of the current runner"
    )
    return p.parse_args()
| 25 | + | ||
| 26 | + | ||
@dataclass
class Model:
    """Describes one non-streaming ASR model and where to deploy it."""

    # Name of the model directory extracted from the release tarball.
    model_name: str
    hf: str  # huggingface space name
    ms: str  # modelscope space name
    # Short identifier embedded in the generated artifact's file name.
    short_name: str
    # Shell snippet (expanded into run-vad-asr.sh) that renames the model
    # files to the fixed names expected by the WASM app and patches
    # index.html with a human-readable model description.
    cmd: str = ""
| 34 | + | ||
| 35 | + | ||
def get_models():
    """Return the list of all VAD+ASR models to build and deploy.

    Each entry's ``cmd`` runs inside wasm/vad-asr/assets after the model
    tarball has been extracted; it moves the model files up one level under
    the fixed names the WASM app expects, removes the extracted directory,
    and rewrites the model description shown in index.html.
    """
    models = [
        # Whisper tiny.en (English)
        Model(
            model_name="sherpa-onnx-whisper-tiny.en",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny",
            short_name="vad-asr-en-whisper_tiny",
            cmd="""
            pushd $model_name
            mv -v tiny.en-encoder.int8.onnx ../whisper-encoder.onnx
            mv -v tiny.en-decoder.int8.onnx ../whisper-decoder.onnx
            mv -v tiny.en-tokens.txt ../tokens.txt
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/Whisper tiny.en supporting English 英文/g' ../index.html
            git diff
            """,
        ),
        # SenseVoice Small (Chinese, English, Japanese, Korean, Cantonese)
        Model(
            model_name="sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice",
            short_name="vad-asr-zh_en_ja_ko_cantonese-sense_voice_small",
            cmd="""
            pushd $model_name
            mv -v model.int8.onnx ../sense-voice.onnx
            mv -v tokens.txt ../
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/SenseVoice Small supporting English, Chinese, Japanese, Korean, Cantonese 中英日韩粤/g' ../index.html
            git diff
            """,
        ),
        # Paraformer-large (Chinese, English)
        Model(
            model_name="sherpa-onnx-paraformer-zh-2023-09-14",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer",
            short_name="vad-asr-zh_en-paraformer_large",
            cmd="""
            pushd $model_name
            mv -v model.int8.onnx ../paraformer.onnx
            mv -v tokens.txt ../
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/Paraformer supporting Chinese, English 中英/g' ../index.html
            git diff
            """,
        ),
        # Paraformer-small (Chinese, English)
        Model(
            model_name="sherpa-onnx-paraformer-zh-small-2024-03-09",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small",
            short_name="vad-asr-zh_en-paraformer_small",
            cmd="""
            pushd $model_name
            mv -v model.int8.onnx ../paraformer.onnx
            mv -v tokens.txt ../
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/Paraformer-small supporting Chinese, English 中英文/g' ../index.html
            git diff
            """,
        ),
        # Zipformer transducer trained on GigaSpeech (English)
        Model(
            model_name="sherpa-onnx-zipformer-gigaspeech-2023-12-12",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech",
            short_name="vad-asr-en-zipformer_gigaspeech",
            cmd="""
            pushd $model_name
            mv encoder-epoch-30-avg-1.int8.onnx ../transducer-encoder.onnx
            mv decoder-epoch-30-avg-1.onnx ../transducer-decoder.onnx
            mv joiner-epoch-30-avg-1.int8.onnx ../transducer-joiner.onnx
            mv tokens.txt ../
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/Zipformer supporting English 英语/g' ../index.html
            git diff
            """,
        ),
        # Zipformer transducer trained on WenetSpeech (Chinese); note the
        # model files live under data/ and exp/ inside the tarball.
        Model(
            model_name="icefall-asr-zipformer-wenetspeech-20230615",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech",
            short_name="vad-asr-zh-zipformer_wenetspeech",
            cmd="""
            pushd $model_name
            mv -v data/lang_char/tokens.txt ../
            mv -v exp/encoder-epoch-12-avg-4.int8.onnx ../transducer-encoder.onnx
            mv -v exp/decoder-epoch-12-avg-4.onnx ../transducer-decoder.onnx
            mv -v exp/joiner-epoch-12-avg-4.int8.onnx ../transducer-joiner.onnx
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/Zipformer supporting Chinese 中文/g' ../index.html
            git diff
            """,
        ),
        # Zipformer transducer trained on ReazonSpeech (Japanese)
        Model(
            model_name="sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer",
            short_name="vad-asr-ja-zipformer_reazonspeech",
            cmd="""
            pushd $model_name
            mv encoder-epoch-99-avg-1.int8.onnx ../transducer-encoder.onnx
            mv decoder-epoch-99-avg-1.onnx ../transducer-decoder.onnx
            mv joiner-epoch-99-avg-1.int8.onnx ../transducer-joiner.onnx
            mv tokens.txt ../
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/Zipformer supporting Japanese 日语/g' ../index.html
            git diff
            """,
        ),
        # Zipformer transducer trained on GigaSpeech2 (Thai)
        Model(
            model_name="sherpa-onnx-zipformer-thai-2024-06-20",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer",
            short_name="vad-asr-th-zipformer_gigaspeech2",
            cmd="""
            pushd $model_name
            mv encoder-epoch-12-avg-5.int8.onnx ../transducer-encoder.onnx
            mv decoder-epoch-12-avg-5.onnx ../transducer-decoder.onnx
            mv joiner-epoch-12-avg-5.int8.onnx ../transducer-joiner.onnx
            mv tokens.txt ../
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/Zipformer supporting Thai 泰语/g' ../index.html
            git diff
            """,
        ),
        # TeleSpeech-ASR CTC (Chinese dialects)
        Model(
            model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech",
            short_name="vad-asr-zh-telespeech",
            cmd="""
            pushd $model_name
            mv model.int8.onnx ../telespeech.onnx
            mv tokens.txt ../
            popd
            rm -rf $model_name
            sed -i.bak 's/Zipformer/TeleSpeech-ASR supporting Chinese 多种中文方言/g' ../index.html
            git diff
            """,
        ),
    ]
    return models
| 184 | + | ||
| 185 | + | ||
def main():
    """Render run-vad-asr.sh from its Jinja2 template for this runner's shard.

    The full model list is split evenly across ``--total`` runners; runner
    ``--index`` gets a contiguous slice plus at most one of the leftover
    models, so every model is handled exactly once across all runners.

    Raises:
        ValueError: if there are more runners than models.
    """
    args = get_args()
    index = args.index
    total = args.total
    assert 0 <= index < total, (index, total)

    all_model_list = get_models()

    num_models = len(all_model_list)

    num_per_runner = num_models // total
    if num_per_runner <= 0:
        raise ValueError(f"num_models: {num_models}, num_runners: {total}")

    start = index * num_per_runner
    end = start + num_per_runner

    # Models left over after the even split; runner i takes leftover i
    # when i < remaining.
    remaining = num_models - total * num_per_runner

    print(f"{index}/{total}: {start}-{end}/{num_models}")

    d = dict()
    d["model_list"] = all_model_list[start:end]
    if index < remaining:
        s = total * num_per_runner + index
        d["model_list"].append(all_model_list[s])
        print(f"{s}/{num_models}")

    filename_list = [
        "./run-vad-asr.sh",
    ]
    # The environment is loop-invariant; create it once.
    environment = jinja2.Environment()
    for filename in filename_list:
        # Each output <name> is rendered from its template <name>.in
        # (fix: the template path must be derived from the loop variable).
        with open(f"{filename}.in") as f:
            s = f.read()
        template = environment.from_string(s)

        s = template.render(**d)
        with open(filename, "w") as f:
            print(s, file=f)


if __name__ == "__main__":
    main()
scripts/wasm/run-vad-asr.sh.in
0 → 100644
#!/usr/bin/env bash
#
# Build WebAssembly APPs for huggingface spaces and modelscope spaces
#
# NOTE: This file is a Jinja2 template. scripts/wasm/generate-vad-asr.py
# renders it into run-vad-asr.sh, expanding the {% raw %}{{ ... }}{% endraw %}
# placeholders once per model in model_list. Pushing requires MS_TOKEN
# (ModelScope) and HF_TOKEN (Hugging Face) in the environment.

set -ex

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Extract the version string from: set(SHERPA_ONNX_VERSION "x.y.z")
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)


{% for model in model_list %}
model_name={{ model.model_name }}
short_name={{ model.short_name }}
hf_name={{ model.hf }}
ms_name={{ model.ms }}

# Download the VAD model and this ASR model into wasm/vad-asr/assets, then
# run the model-specific renaming commands (model.cmd).
pushd wasm/vad-asr
git checkout .
rm -rf assets
mkdir assets
cd assets
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${model_name}.tar.bz2
tar xvf ${model_name}.tar.bz2
rm ${model_name}.tar.bz2

{{ model.cmd }}

popd

ls -lh wasm/vad-asr/assets

# Remove the previous model's build outputs so stale artifacts are not
# packaged into this model's bundle.
rm -rf build-wasm-simd-vad-asr/install
rm -rf build-wasm-simd-vad-asr/wasm

./build-wasm-simd-vad-asr.sh

# Package the installed app into a versioned, model-specific tarball.
dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-${short_name}
mv build-wasm-simd-vad-asr/install/bin/wasm/vad-asr $dst
ls -lh $dst
tar cjfv $dst.tar.bz2 ./$dst
ls -lh *.tar.bz2

git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"

# Skip downloading LFS payloads when cloning; they are overwritten anyway.
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false

# Deploy to the ModelScope space.
rm -rf ms
git clone https://www.modelscope.cn/studios/$ms_name.git ms

cd ms
cp -v ../$dst/* .

git status
git lfs track "*.data"
git lfs track "*.wasm"
ls -lh

git add .
git commit -m "update model"
git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/$ms_name.git
cd ..
rm -rf ms

# Deploy to the Hugging Face space.
rm -rf huggingface

git clone https://huggingface.co/spaces/$hf_name huggingface
cd huggingface
cp -v ../$dst/* .

git status
git lfs track "*.data"
git lfs track "*.wasm"
ls -lh

git add .
git commit -m "update model"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/$hf_name main
cd ..
rm -rf huggingface
rm -rf $dst

ls -lh *.tar.bz2

{% endfor %}
| @@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
| 13 | #include "sherpa-onnx/csrc/audio-tagging.h" | 13 | #include "sherpa-onnx/csrc/audio-tagging.h" |
| 14 | #include "sherpa-onnx/csrc/circular-buffer.h" | 14 | #include "sherpa-onnx/csrc/circular-buffer.h" |
| 15 | #include "sherpa-onnx/csrc/display.h" | 15 | #include "sherpa-onnx/csrc/display.h" |
| 16 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 16 | #include "sherpa-onnx/csrc/keyword-spotter.h" | 17 | #include "sherpa-onnx/csrc/keyword-spotter.h" |
| 17 | #include "sherpa-onnx/csrc/macros.h" | 18 | #include "sherpa-onnx/csrc/macros.h" |
| 18 | #include "sherpa-onnx/csrc/offline-punctuation.h" | 19 | #include "sherpa-onnx/csrc/offline-punctuation.h" |
| @@ -1638,3 +1639,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( | @@ -1638,3 +1639,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( | ||
| 1638 | void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { | 1639 | void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { |
| 1639 | p->impl->Reset(); | 1640 | p->impl->Reset(); |
| 1640 | } | 1641 | } |
| 1642 | + | ||
// Return 1 if |filename| exists; return 0 otherwise (see the declaration's
// comment in the header). Thin C wrapper over sherpa_onnx::FileExists() so
// the check is callable through the C API (e.g., from JS in WASM builds).
int32_t SherpaOnnxFileExists(const char *filename) {
  return sherpa_onnx::FileExists(filename);
}
| @@ -1361,6 +1361,9 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( | @@ -1361,6 +1361,9 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( | ||
| 1361 | SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( | 1361 | SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( |
| 1362 | const SherpaOnnxLinearResampler *p); | 1362 | const SherpaOnnxLinearResampler *p); |
| 1363 | 1363 | ||
| 1364 | +// Return 1 if the file exists; return 0 if the file does not exist. | ||
| 1365 | +SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); | ||
| 1366 | + | ||
| 1364 | #if defined(__GNUC__) | 1367 | #if defined(__GNUC__) |
| 1365 | #pragma GCC diagnostic pop | 1368 | #pragma GCC diagnostic pop |
| 1366 | #endif | 1369 | #endif |
| @@ -14,6 +14,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD) | @@ -14,6 +14,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD) | ||
| 14 | add_subdirectory(vad) | 14 | add_subdirectory(vad) |
| 15 | endif() | 15 | endif() |
| 16 | 16 | ||
| 17 | +if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR) | ||
| 18 | + add_subdirectory(vad-asr) | ||
| 19 | +endif() | ||
| 20 | + | ||
| 17 | if(SHERPA_ONNX_ENABLE_WASM_NODEJS) | 21 | if(SHERPA_ONNX_ENABLE_WASM_NODEJS) |
| 18 | add_subdirectory(nodejs) | 22 | add_subdirectory(nodejs) |
| 19 | endif() | 23 | endif() |
| @@ -80,3 +80,10 @@ assets fangjun$ tree -L 1 | @@ -80,3 +80,10 @@ assets fangjun$ tree -L 1 | ||
| 80 | 80 | ||
| 81 | 0 directories, 4 files | 81 | 0 directories, 4 files |
| 82 | ``` | 82 | ``` |
| 83 | + | ||
| 84 | +You can find example build scripts at: | ||
| 85 | + | ||
| 86 | + - Streaming Zipformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml | ||
| 87 | + - Streaming Zipformer (English): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml | ||
| 88 | + - Streaming Paraformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml | ||
| 89 | + - Streaming Paraformer (English + Chinese + Cantonese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml |
| @@ -3,7 +3,7 @@ | @@ -3,7 +3,7 @@ | ||
| 3 | <head> | 3 | <head> |
| 4 | <meta charset="utf-8"> | 4 | <meta charset="utf-8"> |
| 5 | <meta name="viewport" content="width=device-width" /> | 5 | <meta name="viewport" content="width=device-width" /> |
| 6 | - <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech</title> | 6 | + <title>Next-gen Kaldi WebAssembly with sherpa-onnx for ASR</title> |
| 7 | <style> | 7 | <style> |
| 8 | h1,div { | 8 | h1,div { |
| 9 | text-align: center; | 9 | text-align: center; |
| @@ -30,3 +30,8 @@ assets fangjun$ tree -L 1 | @@ -30,3 +30,8 @@ assets fangjun$ tree -L 1 | ||
| 30 | 30 | ||
| 31 | 1 directory, 3 files | 31 | 1 directory, 3 files |
| 32 | ``` | 32 | ``` |
| 33 | + | ||
| 34 | +You can find example build scripts at: | ||
| 35 | + | ||
| 36 | + - English TTS: https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-en-tts.yaml | ||
| 37 | + - German TTS: https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-de-tts.yaml |
wasm/vad-asr/CMakeLists.txt
0 → 100644
| 1 | +if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH}) | ||
| 2 | + message(FATAL_ERROR "Please use ./build-wasm-simd-vad.sh to build for wasm VAD") | ||
| 3 | +endif() | ||
| 4 | + | ||
| 5 | +if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/silero_vad.onnx" OR NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/tokens.txt") | ||
| 6 | + message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue") | ||
| 7 | +endif() | ||
| 8 | + | ||
| 9 | +set(exported_functions | ||
| 10 | + # VAD | ||
| 11 | + SherpaOnnxCreateCircularBuffer | ||
| 12 | + SherpaOnnxDestroyCircularBuffer | ||
| 13 | + SherpaOnnxCircularBufferPush | ||
| 14 | + SherpaOnnxCircularBufferGet | ||
| 15 | + SherpaOnnxCircularBufferFree | ||
| 16 | + SherpaOnnxCircularBufferPop | ||
| 17 | + SherpaOnnxCircularBufferSize | ||
| 18 | + SherpaOnnxCircularBufferHead | ||
| 19 | + SherpaOnnxCircularBufferReset | ||
| 20 | + SherpaOnnxCreateVoiceActivityDetector | ||
| 21 | + SherpaOnnxDestroyVoiceActivityDetector | ||
| 22 | + SherpaOnnxVoiceActivityDetectorAcceptWaveform | ||
| 23 | + SherpaOnnxVoiceActivityDetectorEmpty | ||
| 24 | + SherpaOnnxVoiceActivityDetectorDetected | ||
| 25 | + SherpaOnnxVoiceActivityDetectorPop | ||
| 26 | + SherpaOnnxVoiceActivityDetectorClear | ||
| 27 | + SherpaOnnxVoiceActivityDetectorFront | ||
| 28 | + SherpaOnnxDestroySpeechSegment | ||
| 29 | + SherpaOnnxVoiceActivityDetectorReset | ||
| 30 | + SherpaOnnxVoiceActivityDetectorFlush | ||
| 31 | + # non-streaming ASR | ||
| 32 | + SherpaOnnxAcceptWaveformOffline | ||
| 33 | + SherpaOnnxCreateOfflineRecognizer | ||
| 34 | + SherpaOnnxCreateOfflineStream | ||
| 35 | + SherpaOnnxDecodeMultipleOfflineStreams | ||
| 36 | + SherpaOnnxDecodeOfflineStream | ||
| 37 | + SherpaOnnxDestroyOfflineRecognizer | ||
| 38 | + SherpaOnnxDestroyOfflineRecognizerResult | ||
| 39 | + SherpaOnnxDestroyOfflineStream | ||
| 40 | + SherpaOnnxDestroyOfflineStreamResultJson | ||
| 41 | + SherpaOnnxGetOfflineStreamResult | ||
| 42 | + SherpaOnnxGetOfflineStreamResultAsJson | ||
| 43 | + # | ||
| 44 | + SherpaOnnxFileExists | ||
| 45 | +) | ||
| 46 | +set(mangled_exported_functions) | ||
| 47 | +foreach(x IN LISTS exported_functions) | ||
| 48 | + list(APPEND mangled_exported_functions "_${x}") | ||
| 49 | +endforeach() | ||
| 50 | +list(JOIN mangled_exported_functions "," all_exported_functions) | ||
| 51 | + | ||
| 52 | +include_directories(${CMAKE_SOURCE_DIR}) | ||
| 53 | +set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=512MB -s ALLOW_MEMORY_GROWTH=1") | ||
| 54 | +string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB | ||
| 55 | +string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ") | ||
| 56 | +string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ") | ||
| 57 | +string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ") | ||
| 58 | + | ||
| 59 | +message(STATUS "MY_FLAGS: ${MY_FLAGS}") | ||
| 60 | + | ||
| 61 | +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}") | ||
| 62 | +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}") | ||
| 63 | +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MY_FLAGS}") | ||
| 64 | + | ||
| 65 | +if (NOT CMAKE_EXECUTABLE_SUFFIX STREQUAL ".js") | ||
| 66 | + message(FATAL_ERROR "The default suffix for building executables should be .js!") | ||
| 67 | +endif() | ||
| 68 | +# set(CMAKE_EXECUTABLE_SUFFIX ".html") | ||
| 69 | + | ||
| 70 | +add_executable(sherpa-onnx-wasm-main-vad-asr sherpa-onnx-wasm-main-vad-asr.cc) | ||
| 71 | +target_link_libraries(sherpa-onnx-wasm-main-vad-asr sherpa-onnx-c-api) | ||
| 72 | +install(TARGETS sherpa-onnx-wasm-main-vad-asr DESTINATION bin/wasm/vad-asr) | ||
| 73 | + | ||
| 74 | +install( | ||
| 75 | + FILES | ||
| 76 | + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad-asr>/sherpa-onnx-wasm-main-vad-asr.js" | ||
| 77 | + "index.html" | ||
| 78 | + "app-vad-asr.js" | ||
| 79 | + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad-asr>/sherpa-onnx-wasm-main-vad-asr.wasm" | ||
| 80 | + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad-asr>/sherpa-onnx-wasm-main-vad-asr.data" | ||
| 81 | + DESTINATION | ||
| 82 | + bin/wasm/vad-asr | ||
| 83 | +) |
wasm/vad-asr/app-vad-asr.js
0 → 100644
| 1 | +// This file copies and modifies code | ||
| 2 | +// from https://mdn.github.io/web-dictaphone/scripts/app.js | ||
| 3 | +// and https://gist.github.com/meziantou/edb7217fddfbb70e899e | ||
| 4 | + | ||
| 5 | +const startBtn = document.getElementById('startBtn'); | ||
| 6 | +const stopBtn = document.getElementById('stopBtn'); | ||
| 7 | +const clearBtn = document.getElementById('clearBtn'); | ||
| 8 | +const hint = document.getElementById('hint'); | ||
| 9 | +const soundClips = document.getElementById('sound-clips'); | ||
| 10 | + | ||
| 11 | +let textArea = document.getElementById('results'); | ||
| 12 | + | ||
| 13 | +let lastResult = ''; | ||
| 14 | +let resultList = []; | ||
| 15 | + | ||
| 16 | +clearBtn.onclick = function() { | ||
| 17 | + resultList = []; | ||
| 18 | + textArea.value = getDisplayResult(); | ||
| 19 | + textArea.scrollTop = textArea.scrollHeight; // auto scroll | ||
| 20 | +}; | ||
| 21 | + | ||
| 22 | +function getDisplayResult() { | ||
| 23 | + let i = 0; | ||
| 24 | + let ans = ''; | ||
| 25 | + for (let s in resultList) { | ||
| 26 | + if (resultList[s] == '') { | ||
| 27 | + continue; | ||
| 28 | + } | ||
| 29 | + | ||
| 30 | + if (resultList[s] == 'Speech detected') { | ||
| 31 | + ans += '' + i + ': ' + resultList[s]; | ||
| 32 | + i += 1; | ||
| 33 | + } else { | ||
| 34 | + ans += ', ' + resultList[s] + '\n'; | ||
| 35 | + } | ||
| 36 | + } | ||
| 37 | + | ||
| 38 | + if (lastResult.length > 0) { | ||
| 39 | + ans += '' + i + ': ' + lastResult + '\n'; | ||
| 40 | + } | ||
| 41 | + return ans; | ||
| 42 | +} | ||
| 43 | + | ||
| 44 | + | ||
| 45 | + | ||
| 46 | +Module = {}; | ||
| 47 | + | ||
| 48 | +let audioCtx; | ||
| 49 | +let mediaStream; | ||
| 50 | + | ||
| 51 | +let expectedSampleRate = 16000; | ||
| 52 | +let recordSampleRate; // the sampleRate of the microphone | ||
| 53 | +let recorder = null; // the microphone | ||
| 54 | +let leftchannel = []; // TODO: Use a single channel | ||
| 55 | + | ||
| 56 | +let recordingLength = 0; // number of samples so far | ||
| 57 | + | ||
| 58 | +let vad = null; | ||
| 59 | +let buffer = null; | ||
| 60 | +let recognizer = null; | ||
| 61 | +let printed = false; | ||
| 62 | + | ||
| 63 | +function fileExists(filename) { | ||
| 64 | + const filenameLen = Module.lengthBytesUTF8(filename) + 1; | ||
| 65 | + const buffer = Module._malloc(filenameLen); | ||
| 66 | + Module.stringToUTF8(filename, buffer, filenameLen); | ||
| 67 | + | ||
| 68 | + let exists = Module._SherpaOnnxFileExists(buffer); | ||
| 69 | + | ||
| 70 | + Module._free(buffer); | ||
| 71 | + | ||
| 72 | + return exists; | ||
| 73 | +} | ||
| 74 | + | ||
| 75 | +function createOfflineRecognizerSenseVoice() {} | ||
| 76 | + | ||
| 77 | +function initOfflineRecognizer() { | ||
| 78 | + let config = { | ||
| 79 | + modelConfig: { | ||
| 80 | + debug: 1, | ||
| 81 | + tokens: './tokens.txt', | ||
| 82 | + }, | ||
| 83 | + }; | ||
| 84 | + if (fileExists('sense-voice.onnx') == 1) { | ||
| 85 | + config.modelConfig.senseVoice = { | ||
| 86 | + model: './sense-voice.onnx', | ||
| 87 | + useInverseTextNormalization: 1, | ||
| 88 | + }; | ||
| 89 | + } else if (fileExists('whisper-encoder.onnx')) { | ||
| 90 | + config.modelConfig.whisper = { | ||
| 91 | + encoder: './whisper-encoder.onnx', | ||
| 92 | + decoder: './whisper-decoder.onnx', | ||
| 93 | + }; | ||
| 94 | + } else if (fileExists('transducer-encoder.onnx')) { | ||
| 95 | + config.modelConfig.transducer = { | ||
| 96 | + encoder: './transducer-encoder.onnx', | ||
| 97 | + decoder: './transducer-decoder.onnx', | ||
| 98 | + joiner: './transducer-joiner.onnx', | ||
| 99 | + }; | ||
| 100 | + config.modelConfig.modelType = 'transducer'; | ||
| 101 | + } else if (fileExists('nemo-transducer-encoder.onnx')) { | ||
| 102 | + config.modelConfig.transducer = { | ||
| 103 | + encoder: './nemo-transducer-encoder.onnx', | ||
| 104 | + decoder: './nemo-transducer-decoder.onnx', | ||
| 105 | + joiner: './nemo-transducer-joiner.onnx', | ||
| 106 | + }; | ||
| 107 | + config.modelConfig.modelType = 'nemo_transducer'; | ||
| 108 | + } else if (fileExists('paraformer.onnx')) { | ||
| 109 | + config.modelConfig.paraformer = { | ||
| 110 | + model: './paraformer.onnx', | ||
| 111 | + }; | ||
| 112 | + } else if (fileExists('telespeech.onnx')) { | ||
| 113 | + config.modelConfig.telespeechCtc = './telespeech.onnx'; | ||
| 114 | + } else { | ||
| 115 | + console.log('Please specify a model.'); | ||
| 116 | + alert('Please specify a model.'); | ||
| 117 | + } | ||
| 118 | + | ||
| 119 | + recognizer = new OfflineRecognizer(config, Module); | ||
| 120 | +} | ||
| 121 | + | ||
| 122 | +Module.onRuntimeInitialized = function() { | ||
| 123 | + console.log('inited!'); | ||
| 124 | + hint.innerText = 'Model loaded! Please click start'; | ||
| 125 | + | ||
| 126 | + startBtn.disabled = false; | ||
| 127 | + | ||
| 128 | + vad = createVad(Module); | ||
| 129 | + console.log('vad is created!', vad); | ||
| 130 | + | ||
| 131 | + buffer = new CircularBuffer(30 * 16000, Module); | ||
| 132 | + console.log('CircularBuffer is created!', buffer); | ||
| 133 | + | ||
| 134 | + initOfflineRecognizer(); | ||
| 135 | +}; | ||
| 136 | + | ||
| 137 | + | ||
| 138 | + | ||
| 139 | +if (navigator.mediaDevices.getUserMedia) { | ||
| 140 | + console.log('getUserMedia supported.'); | ||
| 141 | + | ||
| 142 | + // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia | ||
| 143 | + const constraints = {audio: true}; | ||
| 144 | + | ||
| 145 | + let onSuccess = function(stream) { | ||
| 146 | + if (!audioCtx) { | ||
| 147 | + audioCtx = new AudioContext({sampleRate: expectedSampleRate}); | ||
| 148 | + } | ||
| 149 | + console.log(audioCtx); | ||
| 150 | + recordSampleRate = audioCtx.sampleRate; | ||
| 151 | + console.log('sample rate ' + recordSampleRate); | ||
| 152 | + | ||
| 153 | + // creates an audio node from the microphone incoming stream | ||
| 154 | + mediaStream = audioCtx.createMediaStreamSource(stream); | ||
| 155 | + console.log('media stream', mediaStream); | ||
| 156 | + | ||
| 157 | + // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor | ||
| 158 | + // bufferSize: the onaudioprocess event is called when the buffer is full | ||
| 159 | + var bufferSize = 4096; | ||
| 160 | + var numberOfInputChannels = 1; | ||
| 161 | + var numberOfOutputChannels = 2; | ||
| 162 | + if (audioCtx.createScriptProcessor) { | ||
| 163 | + recorder = audioCtx.createScriptProcessor( | ||
| 164 | + bufferSize, numberOfInputChannels, numberOfOutputChannels); | ||
| 165 | + } else { | ||
| 166 | + recorder = audioCtx.createJavaScriptNode( | ||
| 167 | + bufferSize, numberOfInputChannels, numberOfOutputChannels); | ||
| 168 | + } | ||
| 169 | + console.log('recorder', recorder); | ||
| 170 | + | ||
| 171 | + recorder.onaudioprocess = function(e) { | ||
| 172 | + let samples = new Float32Array(e.inputBuffer.getChannelData(0)) | ||
| 173 | + samples = downsampleBuffer(samples, expectedSampleRate); | ||
| 174 | + buffer.push(samples); | ||
| 175 | + while (buffer.size() > vad.config.sileroVad.windowSize) { | ||
| 176 | + const s = buffer.get(buffer.head(), vad.config.sileroVad.windowSize); | ||
| 177 | + vad.acceptWaveform(s); | ||
| 178 | + buffer.pop(vad.config.sileroVad.windowSize); | ||
| 179 | + | ||
| 180 | + if (vad.isDetected() && !printed) { | ||
| 181 | + printed = true; | ||
| 182 | + lastResult = 'Speech detected'; | ||
| 183 | + } | ||
| 184 | + | ||
| 185 | + if (!vad.isDetected()) { | ||
| 186 | + printed = false; | ||
| 187 | + if (lastResult != '') { | ||
| 188 | + resultList.push(lastResult); | ||
| 189 | + } | ||
| 190 | + lastResult = ''; | ||
| 191 | + } | ||
| 192 | + | ||
| 193 | + while (!vad.isEmpty()) { | ||
| 194 | + const segment = vad.front(); | ||
| 195 | + const duration = segment.samples.length / expectedSampleRate; | ||
| 196 | + let durationStr = `Duration: ${duration.toFixed(3)} seconds`; | ||
| 197 | + vad.pop(); | ||
| 198 | + | ||
| 199 | + // non-streaming asr | ||
| 200 | + const stream = recognizer.createStream(); | ||
| 201 | + stream.acceptWaveform(expectedSampleRate, segment.samples); | ||
| 202 | + recognizer.decode(stream); | ||
| 203 | + let recognitionResult = recognizer.getResult(stream); | ||
| 204 | + console.log(recognitionResult); | ||
| 205 | + let text = recognitionResult.text; | ||
| 206 | + stream.free(); | ||
| 207 | + console.log(text); | ||
| 208 | + | ||
| 209 | + if (text != '') { | ||
| 210 | + durationStr += `. Result: ${text}`; | ||
| 211 | + } | ||
| 212 | + | ||
| 213 | + resultList.push(durationStr); | ||
| 214 | + | ||
| 215 | + | ||
| 216 | + // now save the segment to a wav file | ||
| 217 | + let buf = new Int16Array(segment.samples.length); | ||
| 218 | + for (var i = 0; i < segment.samples.length; ++i) { | ||
| 219 | + let s = segment.samples[i]; | ||
| 220 | + if (s >= 1) | ||
| 221 | + s = 1; | ||
| 222 | + else if (s <= -1) | ||
| 223 | + s = -1; | ||
| 224 | + | ||
| 225 | + buf[i] = s * 32767; | ||
| 226 | + } | ||
| 227 | + | ||
| 228 | + let clipName = new Date().toISOString() + '--' + durationStr; | ||
| 229 | + | ||
| 230 | + const clipContainer = document.createElement('article'); | ||
| 231 | + const clipLabel = document.createElement('p'); | ||
| 232 | + const audio = document.createElement('audio'); | ||
| 233 | + const deleteButton = document.createElement('button'); | ||
| 234 | + | ||
| 235 | + clipContainer.classList.add('clip'); | ||
| 236 | + audio.setAttribute('controls', ''); | ||
| 237 | + deleteButton.textContent = 'Delete'; | ||
| 238 | + deleteButton.className = 'delete'; | ||
| 239 | + | ||
| 240 | + clipLabel.textContent = clipName; | ||
| 241 | + | ||
| 242 | + clipContainer.appendChild(audio); | ||
| 243 | + | ||
| 244 | + clipContainer.appendChild(clipLabel); | ||
| 245 | + clipContainer.appendChild(deleteButton); | ||
| 246 | + soundClips.appendChild(clipContainer); | ||
| 247 | + | ||
| 248 | + audio.controls = true; | ||
| 249 | + const blob = toWav(buf); | ||
| 250 | + | ||
| 251 | + leftchannel = []; | ||
| 252 | + const audioURL = window.URL.createObjectURL(blob); | ||
| 253 | + audio.src = audioURL; | ||
| 254 | + | ||
| 255 | + deleteButton.onclick = function(e) { | ||
| 256 | + let evtTgt = e.target; | ||
| 257 | + evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode); | ||
| 258 | + }; | ||
| 259 | + | ||
| 260 | + clipLabel.onclick = function() { | ||
| 261 | + const existingName = clipLabel.textContent; | ||
| 262 | + const newClipName = prompt('Enter a new name for your sound clip?'); | ||
| 263 | + if (newClipName === null) { | ||
| 264 | + clipLabel.textContent = existingName; | ||
| 265 | + } else { | ||
| 266 | + clipLabel.textContent = newClipName; | ||
| 267 | + } | ||
| 268 | + }; | ||
| 269 | + } | ||
| 270 | + } | ||
| 271 | + | ||
| 272 | + textArea.value = getDisplayResult(); | ||
| 273 | + textArea.scrollTop = textArea.scrollHeight; // auto scroll | ||
| 274 | + }; | ||
| 275 | + | ||
| 276 | + startBtn.onclick = function() { | ||
| 277 | + mediaStream.connect(recorder); | ||
| 278 | + recorder.connect(audioCtx.destination); | ||
| 279 | + | ||
| 280 | + console.log('recorder started'); | ||
| 281 | + | ||
| 282 | + stopBtn.disabled = false; | ||
| 283 | + startBtn.disabled = true; | ||
| 284 | + }; | ||
| 285 | + | ||
| 286 | + stopBtn.onclick = function() { | ||
| 287 | + vad.reset(); | ||
| 288 | + buffer.reset(); | ||
| 289 | + console.log('recorder stopped'); | ||
| 290 | + | ||
| 291 | + // stop recording | ||
| 292 | + recorder.disconnect(audioCtx.destination); | ||
| 293 | + mediaStream.disconnect(recorder); | ||
| 294 | + | ||
| 295 | + startBtn.style.background = ''; | ||
| 296 | + startBtn.style.color = ''; | ||
| 297 | + // mediaRecorder.requestData(); | ||
| 298 | + | ||
| 299 | + stopBtn.disabled = true; | ||
| 300 | + startBtn.disabled = false; | ||
| 301 | + }; | ||
| 302 | + }; | ||
| 303 | + | ||
| 304 | + let onError = function(err) { | ||
| 305 | + console.log('The following error occured: ' + err); | ||
| 306 | + }; | ||
| 307 | + | ||
| 308 | + navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError); | ||
| 309 | +} else { | ||
| 310 | + console.log('getUserMedia not supported on your browser!'); | ||
| 311 | + alert('getUserMedia not supported on your browser!'); | ||
| 312 | +} | ||
| 313 | + | ||
| 314 | + | ||
| 315 | +// this function is copied/modified from | ||
| 316 | +// https://gist.github.com/meziantou/edb7217fddfbb70e899e | ||
| 317 | +function flatten(listOfSamples) { | ||
| 318 | + let n = 0; | ||
| 319 | + for (let i = 0; i < listOfSamples.length; ++i) { | ||
| 320 | + n += listOfSamples[i].length; | ||
| 321 | + } | ||
| 322 | + let ans = new Int16Array(n); | ||
| 323 | + | ||
| 324 | + let offset = 0; | ||
| 325 | + for (let i = 0; i < listOfSamples.length; ++i) { | ||
| 326 | + ans.set(listOfSamples[i], offset); | ||
| 327 | + offset += listOfSamples[i].length; | ||
| 328 | + } | ||
| 329 | + return ans; | ||
| 330 | +} | ||
| 331 | + | ||
| 332 | +// this function is copied/modified from | ||
| 333 | +// https://gist.github.com/meziantou/edb7217fddfbb70e899e | ||
| 334 | +function toWav(samples) { | ||
| 335 | + let buf = new ArrayBuffer(44 + samples.length * 2); | ||
| 336 | + var view = new DataView(buf); | ||
| 337 | + | ||
| 338 | + // http://soundfile.sapp.org/doc/WaveFormat/ | ||
| 339 | + // F F I R | ||
| 340 | + view.setUint32(0, 0x46464952, true); // chunkID | ||
| 341 | + view.setUint32(4, 36 + samples.length * 2, true); // chunkSize | ||
| 342 | + // E V A W | ||
| 343 | + view.setUint32(8, 0x45564157, true); // format | ||
| 344 | + // | ||
| 345 | + // t m f | ||
| 346 | + view.setUint32(12, 0x20746d66, true); // subchunk1ID | ||
| 347 | + view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM | ||
| 348 | + view.setUint32(20, 1, true); // audioFormat, 1 for PCM | ||
| 349 | + view.setUint16(22, 1, true); // numChannels: 1 channel | ||
| 350 | + view.setUint32(24, expectedSampleRate, true); // sampleRate | ||
| 351 | + view.setUint32(28, expectedSampleRate * 2, true); // byteRate | ||
| 352 | + view.setUint16(32, 2, true); // blockAlign | ||
| 353 | + view.setUint16(34, 16, true); // bitsPerSample | ||
| 354 | + view.setUint32(36, 0x61746164, true); // Subchunk2ID | ||
| 355 | + view.setUint32(40, samples.length * 2, true); // subchunk2Size | ||
| 356 | + | ||
| 357 | + let offset = 44; | ||
| 358 | + for (let i = 0; i < samples.length; ++i) { | ||
| 359 | + view.setInt16(offset, samples[i], true); | ||
| 360 | + offset += 2; | ||
| 361 | + } | ||
| 362 | + | ||
| 363 | + return new Blob([view], {type: 'audio/wav'}); | ||
| 364 | +} | ||
| 365 | + | ||
| 366 | +// this function is copied from | ||
| 367 | +// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46 | ||
| 368 | +function downsampleBuffer(buffer, exportSampleRate) { | ||
| 369 | + if (exportSampleRate === recordSampleRate) { | ||
| 370 | + return buffer; | ||
| 371 | + } | ||
| 372 | + var sampleRateRatio = recordSampleRate / exportSampleRate; | ||
| 373 | + var newLength = Math.round(buffer.length / sampleRateRatio); | ||
| 374 | + var result = new Float32Array(newLength); | ||
| 375 | + var offsetResult = 0; | ||
| 376 | + var offsetBuffer = 0; | ||
| 377 | + while (offsetResult < result.length) { | ||
| 378 | + var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio); | ||
| 379 | + var accum = 0, count = 0; | ||
| 380 | + for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) { | ||
| 381 | + accum += buffer[i]; | ||
| 382 | + count++; | ||
| 383 | + } | ||
| 384 | + result[offsetResult] = accum / count; | ||
| 385 | + offsetResult++; | ||
| 386 | + offsetBuffer = nextOffsetBuffer; | ||
| 387 | + } | ||
| 388 | + return result; | ||
| 389 | +}; |
wasm/vad-asr/assets/README.md
0 → 100644
| 1 | +# Introduction | ||
| 2 | + | ||
| 3 | +## Download VAD models | ||
| 4 | + | ||
| 5 | +Please download | ||
| 6 | +https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 7 | +and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad-asr/assets`. | ||
| 8 | + | ||
| 9 | +## Download non-streaming ASR models | ||
| 10 | + | ||
| 11 | +Please refer to | ||
| 12 | +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html | ||
| 13 | +to download a non-streaming ASR model, i.e., an offline ASR model. | ||
| 14 | + | ||
| 15 | +After downloading, you should rename the model files. | ||
| 16 | + | ||
| 17 | +Please refer to | ||
| 18 | +https://k2-fsa.github.io/sherpa/onnx/lazarus/generate-subtitles.html#download-a-speech-recognition-model | ||
| 19 | +for how to rename. | ||
| 20 | + | ||
| 21 | +You can find example build scripts at the following address: | ||
| 22 | + | ||
| 23 | + https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-vad-asr.yaml |
wasm/vad-asr/index.html
0 → 100644
| 1 | +<html lang="en"> | ||
| 2 | + | ||
| 3 | +<head> | ||
| 4 | + <meta charset="utf-8"> | ||
| 5 | + <meta name="viewport" content="width=device-width" /> | ||
| 6 | + <title>Next-gen Kaldi WebAssembly with sherpa-onnx for VAD + ASR</title> | ||
| 7 | + <style> | ||
| 8 | + h1,div { | ||
| 9 | + text-align: center; | ||
| 10 | + } | ||
| 11 | + textarea { | ||
| 12 | + width:100%; | ||
| 13 | + } | ||
| 14 | + </style> | ||
| 15 | +</head> | ||
| 16 | + | ||
| 17 | +<body> | ||
| 18 | + <h1> | ||
| 19 | + Next-gen Kaldi + WebAssembly<br/> | ||
| 20 | + VAD+ASR Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a><br/> | ||
| 21 | + (with Zipformer) | ||
| 22 | + </h1> | ||
| 23 | + | ||
| 24 | + <div> | ||
| 25 | + <span id="hint">Loading model ... ...</span> | ||
| 26 | + <br/> | ||
| 27 | + <br/> | ||
| 28 | + <button id="startBtn" disabled>Start</button> | ||
| 29 | + <button id="stopBtn" disabled>Stop</button> | ||
| 30 | + <button id="clearBtn">Clear</button> | ||
| 31 | + <br/> | ||
| 32 | + <br/> | ||
| 33 | + <textarea id="results" rows="10" readonly></textarea> | ||
| 34 | + </div> | ||
| 35 | + | ||
| 36 | + <section flex="1" overflow="auto" id="sound-clips"> | ||
| 37 | + </section> | ||
| 38 | + | ||
| 39 | + <script src="sherpa-onnx-asr.js"></script> | ||
| 40 | + <script src="sherpa-onnx-vad.js"></script> | ||
| 41 | + <script src="app-vad-asr.js"></script> | ||
| 42 | + <script src="sherpa-onnx-wasm-main-vad-asr.js"></script> | ||
| 43 | +</body> |
wasm/vad-asr/sherpa-onnx-asr.js
0 → 120000
| 1 | +../asr/sherpa-onnx-asr.js |
wasm/vad-asr/sherpa-onnx-vad.js
0 → 120000
| 1 | +../vad/sherpa-onnx-vad.js |
| 1 | +// wasm/sherpa-onnx-wasm-main-vad-asr.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | +#include <stdio.h> | ||
| 5 | + | ||
| 6 | +#include <algorithm> | ||
| 7 | +#include <memory> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/c-api/c-api.h" | ||
| 10 | + | ||
| 11 | +// see also | ||
| 12 | +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html | ||
| 13 | + | ||
| 14 | +extern "C" { | ||
| 15 | + | ||
| 16 | +void CopyHeap(const char *src, int32_t num_bytes, char *dst) { | ||
| 17 | + std::copy(src, src + num_bytes, dst); | ||
| 18 | +} | ||
| 19 | +} |
| @@ -3,3 +3,6 @@ | @@ -3,3 +3,6 @@ | ||
| 3 | Please download | 3 | Please download |
| 4 | https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | 4 | https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx |
| 5 | and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`. | 5 | and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`. |
| 6 | + | ||
| 7 | +You can find example build script at | ||
| 8 | +https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-silero-vad.yaml |
| @@ -3,7 +3,7 @@ | @@ -3,7 +3,7 @@ | ||
| 3 | <head> | 3 | <head> |
| 4 | <meta charset="utf-8"> | 4 | <meta charset="utf-8"> |
| 5 | <meta name="viewport" content="width=device-width" /> | 5 | <meta name="viewport" content="width=device-width" /> |
| 6 | - <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech</title> | 6 | + <title>Next-gen Kaldi WebAssembly with sherpa-onnx for VAD</title> |
| 7 | <style> | 7 | <style> |
| 8 | h1,div { | 8 | h1,div { |
| 9 | text-align: center; | 9 | text-align: center; |
| @@ -172,7 +172,6 @@ class Vad { | @@ -172,7 +172,6 @@ class Vad { | ||
| 172 | constructor(configObj, Module) { | 172 | constructor(configObj, Module) { |
| 173 | this.config = configObj; | 173 | this.config = configObj; |
| 174 | const config = initSherpaOnnxVadModelConfig(configObj, Module); | 174 | const config = initSherpaOnnxVadModelConfig(configObj, Module); |
| 175 | - Module._MyPrint(config.ptr); | ||
| 176 | const handle = Module._SherpaOnnxCreateVoiceActivityDetector( | 175 | const handle = Module._SherpaOnnxCreateVoiceActivityDetector( |
| 177 | config.ptr, configObj.bufferSizeInSeconds || 30); | 176 | config.ptr, configObj.bufferSizeInSeconds || 30); |
| 178 | freeConfig(config, Module); | 177 | freeConfig(config, Module); |
-
请 注册 或 登录 后发表评论