Fangjun Kuang
Committed by GitHub

WebAssembly example for VAD + Non-streaming ASR (#1284)

@@ -25,8 +25,12 @@ jobs: @@ -25,8 +25,12 @@ jobs:
25 - uses: actions/checkout@v4 25 - uses: actions/checkout@v4
26 with: 26 with:
27 fetch-depth: 0 27 fetch-depth: 0
  28 +
28 - name: Install emsdk 29 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
  31 + with:
  32 + version: 3.1.51
  33 + actions-cache-folder: 'emsdk-cache'
30 34
31 - name: View emsdk version 35 - name: View emsdk version
32 shell: bash 36 shell: bash
@@ -27,6 +27,9 @@ jobs: @@ -27,6 +27,9 @@ jobs:
27 fetch-depth: 0 27 fetch-depth: 0
28 - name: Install emsdk 28 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 29 uses: mymindstorm/setup-emsdk@v14
  30 + with:
  31 + version: 3.1.51
  32 + actions-cache-folder: 'emsdk-cache'
30 33
31 - name: View emsdk version 34 - name: View emsdk version
32 shell: bash 35 shell: bash
@@ -25,8 +25,12 @@ jobs: @@ -25,8 +25,12 @@ jobs:
25 - uses: actions/checkout@v4 25 - uses: actions/checkout@v4
26 with: 26 with:
27 fetch-depth: 0 27 fetch-depth: 0
  28 +
28 - name: Install emsdk 29 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
  31 + with:
  32 + version: 3.1.51
  33 + actions-cache-folder: 'emsdk-cache'
30 34
31 - name: View emsdk version 35 - name: View emsdk version
32 shell: bash 36 shell: bash
@@ -25,6 +25,7 @@ jobs: @@ -25,6 +25,7 @@ jobs:
25 - uses: actions/checkout@v4 25 - uses: actions/checkout@v4
26 with: 26 with:
27 fetch-depth: 0 27 fetch-depth: 0
  28 +
28 - name: Install emsdk 29 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
30 with: 31 with:
  1 +name: wasm-simd-hf-space-vad-asr
  2 +
  3 +on:
  4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
  9 +
  10 + workflow_dispatch:
  11 +
  12 +concurrency:
  13 + group: wasm-simd-hf-space-vad-asr${{ github.ref }}
  14 + cancel-in-progress: true
  15 +
  16 +jobs:
  17 + wasm-simd-hf-space-vad-asr:
  18 + name: ${{ matrix.index }}/${{ matrix.total }}
  19 + runs-on: ${{ matrix.os }}
  20 + strategy:
  21 + fail-fast: false
  22 + matrix:
  23 + os: [ubuntu-latest]
  24 + total: ["8"]
  25 + index: ["0", "1", "2", "3", "4", "5", "6", "7"]
  26 +
  27 + steps:
  28 + - uses: actions/checkout@v4
  29 + with:
  30 + fetch-depth: 0
  31 +
  32 + - name: Install Python dependencies
  33 + shell: bash
  34 + run: |
  35 + python3 -m pip install --upgrade pip jinja2
  36 +
  37 + - name: Install emsdk
  38 + uses: mymindstorm/setup-emsdk@v14
  39 + with:
  40 + version: 3.1.51
  41 + actions-cache-folder: 'emsdk-cache'
  42 +
  43 + - name: View emsdk version
  44 + shell: bash
  45 + run: |
  46 + emcc -v
  47 + echo "--------------------"
  48 + emcc --check
  49 +
  50 + - name: Generate build script
  51 + shell: bash
  52 + run: |
  53 + cd scripts/wasm
  54 +
  55 + total=${{ matrix.total }}
  56 + index=${{ matrix.index }}
  57 +
  58 + ./generate-vad-asr.py --total $total --index $index
  59 +
  60 + chmod +x run-vad-asr.sh
  61 + mv -v ./run-vad-asr.sh ../..
  62 +
  63 + - name: Show build scripts
  64 + shell: bash
  65 + run: |
  66 + cat ./run-vad-asr.sh
  67 +
  68 + - uses: actions/upload-artifact@v4
  69 + with:
  70 + name: run-vad-asr-${{ matrix.index }}
  71 + path: ./run-vad-asr.sh
  72 +
  73 + - name: Build sherpa-onnx for WebAssembly
  74 + shell: bash
  75 + env:
  76 + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }}
  77 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  78 + run: |
  79 + ./run-vad-asr.sh
  80 +
  81 + - name: Release wasm files
  82 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  83 + uses: svenstaro/upload-release-action@v2
  84 + with:
  85 + file_glob: true
  86 + overwrite: true
  87 + file: ./*.tar.bz2
  88 +
  89 + - name: Upload wasm files
  90 + uses: actions/upload-artifact@v4
  91 + with:
  92 + name: sherpa-onnx-wasm-simd-vad-asr-${{ matrix.index }}
  93 + path: ./sherpa-onnx-wasm-simd-*.tar.bz2
@@ -25,8 +25,12 @@ jobs: @@ -25,8 +25,12 @@ jobs:
25 - uses: actions/checkout@v4 25 - uses: actions/checkout@v4
26 with: 26 with:
27 fetch-depth: 0 27 fetch-depth: 0
  28 +
28 - name: Install emsdk 29 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
  31 + with:
  32 + version: 3.1.51
  33 + actions-cache-folder: 'emsdk-cache'
30 34
31 - name: View emsdk version 35 - name: View emsdk version
32 shell: bash 36 shell: bash
@@ -25,8 +25,12 @@ jobs: @@ -25,8 +25,12 @@ jobs:
25 - uses: actions/checkout@v4 25 - uses: actions/checkout@v4
26 with: 26 with:
27 fetch-depth: 0 27 fetch-depth: 0
  28 +
28 - name: Install emsdk 29 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
  31 + with:
  32 + version: 3.1.51
  33 + actions-cache-folder: 'emsdk-cache'
30 34
31 - name: View emsdk version 35 - name: View emsdk version
32 shell: bash 36 shell: bash
@@ -25,8 +25,12 @@ jobs: @@ -25,8 +25,12 @@ jobs:
25 - uses: actions/checkout@v4 25 - uses: actions/checkout@v4
26 with: 26 with:
27 fetch-depth: 0 27 fetch-depth: 0
  28 +
28 - name: Install emsdk 29 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
  31 + with:
  32 + version: 3.1.51
  33 + actions-cache-folder: 'emsdk-cache'
30 34
31 - name: View emsdk version 35 - name: View emsdk version
32 shell: bash 36 shell: bash
@@ -36,6 +36,7 @@ option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) @@ -36,6 +36,7 @@ option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF)
36 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) 36 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF)
37 option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) 37 option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF)
38 option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF) 38 option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF)
  39 +option(SHERPA_ONNX_ENABLE_WASM_VAD_ASR "Whether to enable WASM for VAD+ASR" OFF)
39 option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) 40 option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF)
40 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) 41 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON)
41 option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON) 42 option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON)
@@ -137,6 +138,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") @@ -137,6 +138,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}")
137 message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") 138 message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}")
138 message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") 139 message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}")
139 message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}") 140 message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}")
  141 +message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD_ASR ${SHERPA_ONNX_ENABLE_WASM_VAD_ASR}")
140 message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") 142 message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}")
141 message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}") 143 message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}")
142 message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}") 144 message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}")
@@ -211,11 +213,22 @@ if(SHERPA_ONNX_ENABLE_WASM) @@ -211,11 +213,22 @@ if(SHERPA_ONNX_ENABLE_WASM)
211 endif() 213 endif()
212 214
213 if(SHERPA_ONNX_ENABLE_WASM_KWS) 215 if(SHERPA_ONNX_ENABLE_WASM_KWS)
  216 + if(NOT SHERPA_ONNX_ENABLE_WASM)
  217 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for KWS")
  218 + endif()
214 add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) 219 add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1)
215 endif() 220 endif()
216 221
217 if(SHERPA_ONNX_ENABLE_WASM_VAD) 222 if(SHERPA_ONNX_ENABLE_WASM_VAD)
218 - add_definitions(-DSHERPA_ONNX_ENABLE_WASM_VAD=1) 223 + if(NOT SHERPA_ONNX_ENABLE_WASM)
  224 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD")
  225 + endif()
  226 +endif()
  227 +
  228 +if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR)
  229 + if(NOT SHERPA_ONNX_ENABLE_WASM)
  230 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD+ASR")
  231 + endif()
219 endif() 232 endif()
220 233
221 if(NOT CMAKE_CXX_STANDARD) 234 if(NOT CMAKE_CXX_STANDARD)
@@ -15,7 +15,7 @@ @@ -15,7 +15,7 @@
15 ### Supported platforms 15 ### Supported platforms
16 16
17 |Architecture| Android | iOS | Windows | macOS | linux | 17 |Architecture| Android | iOS | Windows | macOS | linux |
18 -|------------|------------------|---------------|------------|-------|-------| 18 +|------------|---------|---------|------------|-------|-------|
19 | x64 | ✔️ | | ✔️ | ✔️ | ✔️ | 19 | x64 | ✔️ | | ✔️ | ✔️ | ✔️ |
20 | x86 | ✔️ | | ✔️ | | | 20 | x86 | ✔️ | | ✔️ | | |
21 | arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | 21 | arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ |
@@ -37,7 +37,7 @@ @@ -37,7 +37,7 @@
37 |-------|----------|----------|------------| 37 |-------|----------|----------|------------|
38 | ✔️ | ✔️ | ✔️ | ✔️ | 38 | ✔️ | ✔️ | ✔️ | ✔️ |
39 39
40 -For Rust support, please see https://github.com/thewh1teagle/sherpa-rs 40 +For Rust support, please see [sherpa-rs][sherpa-rs]
41 41
42 It also supports WebAssembly. 42 It also supports WebAssembly.
43 43
@@ -51,7 +51,7 @@ This repository supports running the following functions **locally** @@ -51,7 +51,7 @@ This repository supports running the following functions **locally**
51 - Speaker verification 51 - Speaker verification
52 - Spoken language identification 52 - Spoken language identification
53 - Audio tagging 53 - Audio tagging
54 - - VAD (e.g., [silero-vad](https://github.com/snakers4/silero-vad)) 54 + - VAD (e.g., [silero-vad][silero-vad])
55 - Keyword spotting 55 - Keyword spotting
56 56
57 on the following platforms and operating systems: 57 on the following platforms and operating systems:
@@ -62,11 +62,12 @@ on the following platforms and operating systems: @@ -62,11 +62,12 @@ on the following platforms and operating systems:
62 - iOS 62 - iOS
63 - NodeJS 63 - NodeJS
64 - WebAssembly 64 - WebAssembly
65 - - [Raspberry Pi](https://www.raspberrypi.com/)  
66 - - [RV1126](https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf)  
67 - - [LicheePi4A](https://sipeed.com/licheepi4a)  
68 - - [VisionFive 2](https://www.starfivetech.com/en/site/boards)  
69 - - [旭日X3派](https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html) 65 + - [Raspberry Pi][Raspberry Pi]
  66 + - [RV1126][RV1126]
  67 + - [LicheePi4A][LicheePi4A]
  68 + - [VisionFive 2][VisionFive 2]
  69 + - [旭日X3派][旭日X3派]
  70 + - [爱芯派][爱芯派]
70 - etc 71 - etc
71 72
72 with the following APIs 73 with the following APIs
@@ -82,58 +83,67 @@ You can visit the following Huggingface spaces to try `sherpa-onnx` without @@ -82,58 +83,67 @@ You can visit the following Huggingface spaces to try `sherpa-onnx` without
82 installing anything. All you need is a browser. 83 installing anything. All you need is a browser.
83 84
84 | Description | URL | 85 | Description | URL |
85 -|---|---|  
86 -| Speech recognition | [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition)|  
87 -| Speech recognition with [Whisper](https://github.com/openai/whisper)| [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper)|  
88 -| Speech synthesis | [Click me](https://huggingface.co/spaces/k2-fsa/text-to-speech)|  
89 -| Generate subtitles| [Click me](https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos)|  
90 -|Audio tagging| [Click me](https://huggingface.co/spaces/k2-fsa/audio-tagging)|  
91 -|Spoken language identification with [Whisper](https://github.com/openai/whisper)|[Click me](https://huggingface.co/spaces/k2-fsa/spoken-language-identification)| 86 +|-------------------------------------------------------|------------------------------------|
  87 +| Speech recognition | [Click me][hf-space-asr] |
  88 +| Speech recognition with [Whisper][Whisper] | [Click me][hf-space-asr-whisper] |
  89 +| Speech synthesis | [Click me][hf-space-tts] |
  90 +| Generate subtitles | [Click me][hf-space-subtitle] |
  91 +| Audio tagging | [Click me][hf-space-audio-tagging] |
  92 +| Spoken language identification with [Whisper][Whisper]| [Click me][hf-space-slid-whisper] |
92 93
93 We also have spaces built using WebAssembly. They are listed below: 94 We also have spaces built using WebAssembly. They are listed below:
94 95
95 -| Description | URL| Chinese users|  
96 -|---|---|---|  
97 -|Voice activity detection with [silero-vad](https://github.com/snakers4/silero-vad)| [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx)|[地址](https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx)|  
98 -|Real-time speech recognition (Chinese + English) with Zipformer | [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|  
99 -|Real-time speech recognition (Chinese + English) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)|  
100 -|Real-time speech recognition (Chinese + English + Cantonese) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)|  
101 -|Real-time speech recognition (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en)|  
102 -|Speech synthesis (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en)|  
103 -|Speech synthesis (German)|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de)| 96 +| Description | Huggingface space| ModelScope space|
  97 +|------------------------------------------------------------------------------------------|------------------|-----------------|
  98 +|Voice activity detection with [silero-vad][silero-vad] | [Click me][wasm-hf-vad]|[地址][wasm-ms-vad]|
  99 +|Real-time speech recognition (Chinese + English) with Zipformer | [Click me][wasm-hf-streaming-asr-zh-en-zipformer]|[地址][wasm-ms-streaming-asr-zh-en-zipformer]|
  100 +|Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]|
  101 +|Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]|
  102 +|Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]|
  103 +|VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]|
  104 +|VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]|
  105 +|VAD + speech recognition (English) with Zipformer trained with [GigaSpeech][GigaSpeech] |[Click me][wasm-hf-vad-asr-en-zipformer-gigaspeech]| [地址][wasm-ms-vad-asr-en-zipformer-gigaspeech]|
  106 +|VAD + speech recognition (Chinese) with Zipformer trained with [WenetSpeech][WenetSpeech] |[Click me][wasm-hf-vad-asr-zh-zipformer-wenetspeech]| [地址][wasm-ms-vad-asr-zh-zipformer-wenetspeech]|
  107 +|VAD + speech recognition (Japanese) with Zipformer trained with [ReazonSpeech][ReazonSpeech]|[Click me][wasm-hf-vad-asr-ja-zipformer-reazonspeech]| [地址][wasm-ms-vad-asr-ja-zipformer-reazonspeech]|
  108 +|VAD + speech recognition (Thai) with Zipformer trained with [GigaSpeech2][GigaSpeech2] |[Click me][wasm-hf-vad-asr-th-zipformer-gigaspeech2]| [地址][wasm-ms-vad-asr-th-zipformer-gigaspeech2]|
  109 +|VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]|
  110 +|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]|
  111 +|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]|
  112 +|Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]|
  113 +|Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]|
104 114
105 ### Links for pre-built Android APKs 115 ### Links for pre-built Android APKs
106 116
107 | Description | URL | 中国用户 | 117 | Description | URL | 中国用户 |
108 -|--------------------------------|-----------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------|  
109 -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html) |  
110 -| Text-to-speech | [Address](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html) |  
111 -|Voice activity detection (VAD) | [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html)|  
112 -|VAD + non-streaming speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html)|  
113 -|Two-pass speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html)|  
114 -| Audio tagging | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html) |  
115 -| Audio tagging (WearOS) | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html) |  
116 -| Speaker identification | [Address](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html) |  
117 -| Spoken language identification | [Address](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html) |  
118 -|Keyword spotting| [Address](https://k2-fsa.github.io/sherpa/onnx/kws/apk.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html)| 118 +|----------------------------------------|------------------------------|-----------------------------|
  119 +| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn]|
  120 +| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] |
  121 +| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] |
  122 +| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] |
  123 +| Two-pass speech recognition | [Address][apk-2pass] | [点此][apk-2pass-cn] |
  124 +| Audio tagging | [Address][apk-at] | [点此][apk-at-cn] |
  125 +| Audio tagging (WearOS) | [Address][apk-at-wearos] | [点此][apk-at-wearos-cn] |
  126 +| Speaker identification | [Address][apk-sid] | [点此][apk-sid-cn] |
  127 +| Spoken language identification | [Address][apk-slid] | [点此][apk-slid-cn] |
  128 +| Keyword spotting | [Address][apk-kws] | [点此][apk-kws-cn] |
119 129
120 ### Links for pre-built Flutter APPs 130 ### Links for pre-built Flutter APPs
121 131
122 #### Real-time speech recognition 132 #### Real-time speech recognition
123 133
124 | Description | URL | 中国用户 | 134 | Description | URL | 中国用户 |
125 -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------|  
126 -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html)| 135 +|--------------------------------|-------------------------------------|-------------------------------------|
  136 +| Streaming speech recognition | [Address][apk-flutter-streaming-asr]| [点此][apk-flutter-streaming-asr-cn]|
127 137
128 #### Text-to-speech 138 #### Text-to-speech
129 139
130 | Description | URL | 中国用户 | 140 | Description | URL | 中国用户 |
131 -|--------------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------|  
132 -| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html)|  
133 -| Linux (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html) |  
134 -| macOS (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html) |  
135 -| macOS (arm64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html)|  
136 -| Windows (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html) | 141 +|------------------------------------------|------------------------------------|------------------------------------|
  142 +| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address][flutter-tts-android] | [点此][flutter-tts-android-cn] |
  143 +| Linux (x64) | [Address][flutter-tts-linux] | [点此][flutter-tts-linux-cn] |
  144 +| macOS (x64) | [Address][flutter-tts-macos-x64] | [点此][flutter-tts-macos-x64-cn] |
  145 +| macOS (arm64) | [Address][flutter-tts-macos-arm64] | [点此][flutter-tts-macos-arm64-cn] |
  146 +| Windows (x64) | [Address][flutter-tts-win-x64] | [点此][flutter-tts-win-x64-cn] |
137 147
138 > Note: You need to build from source for iOS. 148 > Note: You need to build from source for iOS.
139 149
@@ -142,22 +152,22 @@ We also have spaces built using WebAssembly. The are listed below: @@ -142,22 +152,22 @@ We also have spaces built using WebAssembly. The are listed below:
142 #### Generating subtitles 152 #### Generating subtitles
143 153
144 | Description | URL | 中国用户 | 154 | Description | URL | 中国用户 |
145 -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------|  
146 -| Generate subtitles (生成字幕) | [Address](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html)| 155 +|--------------------------------|----------------------------|----------------------------|
  156 +| Generate subtitles (生成字幕) | [Address][lazarus-subtitle]| [点此][lazarus-subtitle-cn]|
147 157
148 158
149 ### Links for pre-trained models 159 ### Links for pre-trained models
150 160
151 | Description | URL | 161 | Description | URL |
152 -|--------------------------------|--------------------------------------------------------------------------------------------------------------------------------|  
153 -| Speech recognition (speech to text, ASR) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) |  
154 -| Text-to-speech (TTS) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) |  
155 -| VAD | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx)|  
156 -| Keyword spotting |[Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models)|  
157 -| Audio tagging | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models)|  
158 -| Speaker identification (Speaker ID) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models)|  
159 -| Spoken language identification (Language ID) | See multi-lingual [Whisper](https://github.com/openai/whisper) ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) |  
160 -| Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)| 162 +|---------------------------------------------|---------------------------------------------------------------------------------------|
  163 +| Speech recognition (speech to text, ASR) | [Address][asr-models] |
  164 +| Text-to-speech (TTS) | [Address][tts-models] |
  165 +| VAD | [Address][vad-models] |
  166 +| Keyword spotting | [Address][kws-models] |
  167 +| Audio tagging | [Address][at-models] |
  168 +| Speaker identification (Speaker ID) | [Address][sid-models] |
  169 +| Spoken language identification (Language ID)| See multi-lingual [Whisper][Whisper] ASR models from [Speech recognition][asr-models]|
  170 +| Punctuation | [Address][punct-models] |
161 171
162 ### Useful links 172 ### Useful links
163 173
@@ -169,3 +179,100 @@ We also have spaces built using WebAssembly. The are listed below: @@ -169,3 +179,100 @@ We also have spaces built using WebAssembly. The are listed below:
169 Please see 179 Please see
170 https://k2-fsa.github.io/sherpa/social-groups.html 180 https://k2-fsa.github.io/sherpa/social-groups.html
171 for 新一代 Kaldi **微信交流群** and **QQ 交流群**. 181 for 新一代 Kaldi **微信交流群** and **QQ 交流群**.
  182 +
  183 +[sherpa-rs]: https://github.com/thewh1teagle/sherpa-rs
  184 +[silero-vad]: https://github.com/snakers4/silero-vad
  185 +[Raspberry Pi]: https://www.raspberrypi.com/
  186 +[RV1126]: https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf
  187 +[LicheePi4A]: https://sipeed.com/licheepi4a
  188 +[VisionFive 2]: https://www.starfivetech.com/en/site/boards
  189 +[旭日X3派]: https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html
  190 +[爱芯派]: https://wiki.sipeed.com/hardware/zh/maixIII/ax-pi/axpi.html
  191 +[hf-space-asr]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
  192 +[Whisper]: https://github.com/openai/whisper
  193 +[hf-space-asr-whisper]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper
  194 +[hf-space-tts]: https://huggingface.co/spaces/k2-fsa/text-to-speech
  195 +[hf-space-subtitle]: https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos
  196 +[hf-space-audio-tagging]: https://huggingface.co/spaces/k2-fsa/audio-tagging
  197 +[hf-space-slid-whisper]: https://huggingface.co/spaces/k2-fsa/spoken-language-identification
  198 +[wasm-hf-vad]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx
  199 +[wasm-ms-vad]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx
  200 +[wasm-hf-streaming-asr-zh-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en
  201 +[wasm-ms-streaming-asr-zh-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en
  202 +[wasm-hf-streaming-asr-zh-en-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer
  203 +[wasm-ms-streaming-asr-zh-en-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer
  204 +[Paraformer-large]: https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary
  205 +[wasm-hf-streaming-asr-zh-en-yue-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer
  206 +[wasm-ms-streaming-asr-zh-en-yue-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer
  207 +[wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en
  208 +[wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en
  209 +[SenseVoice]: https://github.com/FunAudioLLM/SenseVoice
  210 +[wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice
  211 +[wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice
  212 +[wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny
  213 +[wasm-ms-vad-asr-en-whisper-tiny-en]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny
  214 +[wasm-hf-vad-asr-en-zipformer-gigaspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech
  215 +[wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech
  216 +[wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
  217 +[wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
  218 +[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
  219 +[wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
  220 +[wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
  221 +[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2
  222 +[wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer
  223 +[wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer
  224 +[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR
  225 +[wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
  226 +[wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
  227 +[wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
  228 +[wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
  229 +[wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
  230 +[wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
  231 +[wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en
  232 +[wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en
  233 +[wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de
  234 +[wasm-ms-tts-piper-de]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de
  235 +[apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html
  236 +[apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html
  237 +[apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html
  238 +[apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html
  239 +[apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html
  240 +[apk-vad-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html
  241 +[apk-vad-asr]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html
  242 +[apk-vad-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html
  243 +[apk-2pass]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html
  244 +[apk-2pass-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html
  245 +[apk-at]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html
  246 +[apk-at-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html
  247 +[apk-at-wearos]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html
  248 +[apk-at-wearos-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html
  249 +[apk-sid]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html
  250 +[apk-sid-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html
  251 +[apk-slid]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html
  252 +[apk-slid-cn]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html
  253 +[apk-kws]: https://k2-fsa.github.io/sherpa/onnx/kws/apk.html
  254 +[apk-kws-cn]: https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html
  255 +[apk-flutter-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html
  256 +[apk-flutter-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html
  257 +[flutter-tts-android]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html
  258 +[flutter-tts-android-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html
  259 +[flutter-tts-linux]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html
  260 +[flutter-tts-linux-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html
  261 +[flutter-tts-macos-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html
[flutter-tts-macos-arm64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html
  263 +[flutter-tts-macos-arm64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html
[flutter-tts-macos-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html
  265 +[flutter-tts-win-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html
  266 +[flutter-tts-win-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html
  267 +[lazarus-subtitle]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html
  268 +[lazarus-subtitle-cn]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html
  269 +[asr-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  270 +[tts-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  271 +[vad-models]: https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  272 +[kws-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models
  273 +[at-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
  274 +[sid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  275 +[slid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  276 +[punct-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models
  277 +[GigaSpeech]: https://github.com/SpeechColab/GigaSpeech
  278 +[WenetSpeech]: https://github.com/wenet-e2e/WenetSpeech
#!/usr/bin/env bash
# Copyright (c) 2024 Xiaomi Corporation
#
# This script is to build sherpa-onnx for WebAssembly (VAD+ASR)
# Note: ASR here means non-streaming ASR
#
# It locates an emscripten installation (either from the EMSCRIPTEN
# environment variable or by resolving the emcc binary on PATH), configures
# CMake with the emscripten toolchain file, and installs the artifacts into
# build-wasm-simd-vad-asr/install/bin/wasm/vad-asr.

set -ex

if [ -z "$EMSCRIPTEN" ]; then
  if ! command -v emcc &> /dev/null; then
    echo "Please install emscripten first"
    echo ""
    echo "You can use the following commands to install it:"
    echo ""
    echo "git clone https://github.com/emscripten-core/emsdk.git"
    echo "cd emsdk"
    echo "git pull"
    echo "./emsdk install latest"
    echo "./emsdk activate latest"
    echo "source ./emsdk_env.sh"
    exit 1
  else
    # Derive the emscripten root directory from the location of emcc.
    EMSCRIPTEN=$(dirname "$(realpath "$(which emcc)")")
  fi
fi

export EMSCRIPTEN=$EMSCRIPTEN
echo "EMSCRIPTEN: $EMSCRIPTEN"
# Sanity check: the CMake toolchain file is required for cross-compiling
# to WebAssembly.
if [ ! -f "$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" ]; then
  echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake"
  echo "Please make sure you have installed emsdk correctly"
  exit 1
fi

mkdir -p build-wasm-simd-vad-asr
pushd build-wasm-simd-vad-asr

# Checked by wasm/vad-asr/CMakeLists.txt to ensure this script is used.
export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON

cmake \
  -DCMAKE_INSTALL_PREFIX=./install \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_TOOLCHAIN_FILE="$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" \
  \
  -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  -DBUILD_SHARED_LIBS=OFF \
  -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  -DSHERPA_ONNX_ENABLE_JNI=OFF \
  -DSHERPA_ONNX_ENABLE_TTS=OFF \
  -DSHERPA_ONNX_ENABLE_C_API=ON \
  -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
  -DSHERPA_ONNX_ENABLE_GPU=OFF \
  -DSHERPA_ONNX_ENABLE_WASM=ON \
  -DSHERPA_ONNX_ENABLE_WASM_VAD_ASR=ON \
  -DSHERPA_ONNX_ENABLE_BINARY=OFF \
  -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \
  ..
make -j2
make install

echo "pwd: $PWD"

# Ship the JS wrappers for VAD and non-streaming ASR alongside the wasm app.
cp -fv ../wasm/vad/sherpa-onnx-vad.js ./install/bin/wasm/vad-asr/
cp -fv ../wasm/asr/sherpa-onnx-asr.js ./install/bin/wasm/vad-asr/

ls -lh install/bin/wasm/vad-asr
  1 +#!/usr/bin/env python3
  2 +
  3 +import argparse
  4 +from dataclasses import dataclass
  5 +from typing import List, Optional
  6 +
  7 +import jinja2
  8 +
  9 +
def get_args():
    """Parse command-line options for sharding work across CI runners.

    Returns:
        argparse.Namespace with two integer fields:
          total: number of runners participating (default 1)
          index: 0-based index of the current runner (default 0)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--total", type=int, default=1, help="Number of runners")
    parser.add_argument(
        "--index", type=int, default=0, help="Index of the current runner"
    )
    return parser.parse_args()
  25 +
  26 +
@dataclass
class Model:
    # Name of the pre-trained model directory; also the stem of the
    # ${model_name}.tar.bz2 archive downloaded from the release page.
    model_name: str
    hf: str  # huggingface space name
    ms: str  # modelscope space name
    # Short identifier embedded in the generated artifact file names.
    short_name: str
    # Shell commands (expanded into the generated run-vad-asr.sh) that
    # rearrange the extracted model files into the layout expected by the
    # wasm app, then patch index.html with the model description.
    cmd: str = ""
  34 +
  35 +
def get_models():
    """Return the list of non-streaming ASR models to build wasm apps for.

    Each entry's ``cmd`` is a shell snippet that renames the downloaded model
    files to the generic names (whisper-*.onnx, sense-voice.onnx,
    transducer-*.onnx, paraformer.onnx, telespeech.onnx, tokens.txt) that
    app-vad-asr.js probes for, and rewrites the page title in index.html.
    """
    models = [
        Model(
            model_name="sherpa-onnx-whisper-tiny.en",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny",
            short_name="vad-asr-en-whisper_tiny",
            cmd="""
        pushd $model_name
        mv -v tiny.en-encoder.int8.onnx ../whisper-encoder.onnx
        mv -v tiny.en-decoder.int8.onnx ../whisper-decoder.onnx
        mv -v tiny.en-tokens.txt ../tokens.txt
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/Whisper tiny.en supporting English 英文/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice",
            short_name="vad-asr-zh_en_ja_ko_cantonese-sense_voice_small",
            cmd="""
        pushd $model_name
        mv -v model.int8.onnx ../sense-voice.onnx
        mv -v tokens.txt ../
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/SenseVoice Small supporting English, Chinese, Japanese, Korean, Cantonese 中英日韩粤/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="sherpa-onnx-paraformer-zh-2023-09-14",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer",
            short_name="vad-asr-zh_en-paraformer_large",
            cmd="""
        pushd $model_name
        mv -v model.int8.onnx ../paraformer.onnx
        mv -v tokens.txt ../
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/Paraformer supporting Chinese, English 中英/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="sherpa-onnx-paraformer-zh-small-2024-03-09",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small",
            short_name="vad-asr-zh_en-paraformer_small",
            cmd="""
        pushd $model_name
        mv -v model.int8.onnx ../paraformer.onnx
        mv -v tokens.txt ../
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/Paraformer-small supporting Chinese, English 中英文/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="sherpa-onnx-zipformer-gigaspeech-2023-12-12",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech",
            short_name="vad-asr-en-zipformer_gigaspeech",
            cmd="""
        pushd $model_name
        mv encoder-epoch-30-avg-1.int8.onnx ../transducer-encoder.onnx
        mv decoder-epoch-30-avg-1.onnx ../transducer-decoder.onnx
        mv joiner-epoch-30-avg-1.int8.onnx ../transducer-joiner.onnx
        mv tokens.txt ../
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/Zipformer supporting English 英语/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="icefall-asr-zipformer-wenetspeech-20230615",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech",
            short_name="vad-asr-zh-zipformer_wenetspeech",
            cmd="""
        pushd $model_name
        mv -v data/lang_char/tokens.txt ../
        mv -v exp/encoder-epoch-12-avg-4.int8.onnx ../transducer-encoder.onnx
        mv -v exp/decoder-epoch-12-avg-4.onnx ../transducer-decoder.onnx
        mv -v exp/joiner-epoch-12-avg-4.int8.onnx ../transducer-joiner.onnx
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/Zipformer supporting Chinese 中文/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer",
            short_name="vad-asr-ja-zipformer_reazonspeech",
            cmd="""
        pushd $model_name
        mv encoder-epoch-99-avg-1.int8.onnx ../transducer-encoder.onnx
        mv decoder-epoch-99-avg-1.onnx ../transducer-decoder.onnx
        mv joiner-epoch-99-avg-1.int8.onnx ../transducer-joiner.onnx
        mv tokens.txt ../
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/Zipformer supporting Japanese 日语/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="sherpa-onnx-zipformer-thai-2024-06-20",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer",
            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer",
            short_name="vad-asr-th-zipformer_gigaspeech2",
            cmd="""
        pushd $model_name
        mv encoder-epoch-12-avg-5.int8.onnx ../transducer-encoder.onnx
        mv decoder-epoch-12-avg-5.onnx ../transducer-decoder.onnx
        mv joiner-epoch-12-avg-5.int8.onnx ../transducer-joiner.onnx
        mv tokens.txt ../
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/Zipformer supporting Thai 泰语/g' ../index.html
        git diff
        """,
        ),
        Model(
            model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04",
            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech",
            ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech",
            short_name="vad-asr-zh-telespeech",
            cmd="""
        pushd $model_name
        mv model.int8.onnx ../telespeech.onnx
        mv tokens.txt ../
        popd
        rm -rf $model_name
        sed -i.bak 's/Zipformer/TeleSpeech-ASR supporting Chinese 多种中文方言/g' ../index.html
        git diff
        """,
        ),
    ]
    return models
  184 +
  185 +
def main():
    """Shard the model list across CI runners and render the shell templates.

    Runner ``index`` (0-based) out of ``total`` receives a contiguous slice
    of the model list.  When the list does not divide evenly, the leftover
    models are distributed one per runner to the first ``remaining`` runners.
    For each output file name in ``filename_list``, the template
    ``<name>.in`` is rendered with jinja2 and written to ``<name>``.

    Raises:
        ValueError: if there are more runners than models.
    """
    args = get_args()
    index = args.index
    total = args.total
    assert 0 <= index < total, (index, total)

    all_model_list = get_models()

    num_models = len(all_model_list)

    num_per_runner = num_models // total
    if num_per_runner <= 0:
        raise ValueError(f"num_models: {num_models}, num_runners: {total}")

    start = index * num_per_runner
    end = start + num_per_runner

    # Models left over after the even split; runners 0..remaining-1 each
    # take one extra model from the tail of the list.
    remaining = num_models - args.total * num_per_runner

    print(f"{index}/{total}: {start}-{end}/{num_models}")

    d = dict()
    d["model_list"] = all_model_list[start:end]
    if index < remaining:
        s = args.total * num_per_runner + index
        d["model_list"].append(all_model_list[s])
        print(f"{s}/{num_models}")

    filename_list = [
        "./run-vad-asr.sh",
    ]
    # One jinja2 environment is enough for all templates (hoisted out of
    # the loop; it is loop-invariant).
    environment = jinja2.Environment()
    for filename in filename_list:
        # Fix: the template input must be derived from the output file name,
        # i.e. "<filename>.in" -> "<filename>".
        with open(f"{filename}.in") as f:
            s = f.read()
        template = environment.from_string(s)

        s = template.render(**d)
        with open(filename, "w") as f:
            print(s, file=f)


if __name__ == "__main__":
    main()
#!/usr/bin/env bash
#
# Build WebAssembly APPs for huggingface spaces and modelscope spaces
#
# NOTE(review): this file is a jinja2 template (rendered by the Python
# generator script); the {% for %} block below is expanded once per model
# assigned to this runner.  It presumably runs from the sherpa-onnx repo
# root with MS_TOKEN and HF_TOKEN set in the environment — confirm in CI.

set -ex

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Extract the version string from the SHERPA_ONNX_VERSION line of CMakeLists.txt.
SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)


{% for model in model_list %}
model_name={{ model.model_name }}
short_name={{ model.short_name }}
hf_name={{ model.hf }}
ms_name={{ model.ms }}

# Download the VAD model and this model's archive into wasm/vad-asr/assets.
pushd wasm/vad-asr
git checkout .
rm -rf assets
mkdir assets
cd assets
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${model_name}.tar.bz2
tar xvf ${model_name}.tar.bz2
rm ${model_name}.tar.bz2

# Model-specific renaming commands (Model.cmd from the generator script).
{{ model.cmd }}

popd

ls -lh wasm/vad-asr/assets

# Remove previous build outputs so each model gets a clean install tree.
rm -rf build-wasm-simd-vad-asr/install
rm -rf build-wasm-simd-vad-asr/wasm

./build-wasm-simd-vad-asr.sh

# Package the installed app as sherpa-onnx-wasm-simd-<version>-<short_name>.
dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-${short_name}
mv build-wasm-simd-vad-asr/install/bin/wasm/vad-asr $dst
ls -lh $dst
tar cjfv $dst.tar.bz2 ./$dst
ls -lh *.tar.bz2

git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"

export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false

# Publish the built app to the modelscope space.
rm -rf ms
git clone https://www.modelscope.cn/studios/$ms_name.git ms

cd ms
cp -v ../$dst/* .

git status
# Large binary artifacts go through git-lfs.
git lfs track "*.data"
git lfs track "*.wasm"
ls -lh

git add .
git commit -m "update model"
git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/$ms_name.git
cd ..
rm -rf ms

# Publish the built app to the huggingface space.
rm -rf huggingface

git clone https://huggingface.co/spaces/$hf_name huggingface
cd huggingface
cp -v ../$dst/* .

git status
git lfs track "*.data"
git lfs track "*.wasm"
ls -lh

git add .
git commit -m "update model"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/$hf_name main
cd ..
rm -rf huggingface
rm -rf $dst

ls -lh *.tar.bz2

{% endfor %}
@@ -13,6 +13,7 @@ @@ -13,6 +13,7 @@
13 #include "sherpa-onnx/csrc/audio-tagging.h" 13 #include "sherpa-onnx/csrc/audio-tagging.h"
14 #include "sherpa-onnx/csrc/circular-buffer.h" 14 #include "sherpa-onnx/csrc/circular-buffer.h"
15 #include "sherpa-onnx/csrc/display.h" 15 #include "sherpa-onnx/csrc/display.h"
  16 +#include "sherpa-onnx/csrc/file-utils.h"
16 #include "sherpa-onnx/csrc/keyword-spotter.h" 17 #include "sherpa-onnx/csrc/keyword-spotter.h"
17 #include "sherpa-onnx/csrc/macros.h" 18 #include "sherpa-onnx/csrc/macros.h"
18 #include "sherpa-onnx/csrc/offline-punctuation.h" 19 #include "sherpa-onnx/csrc/offline-punctuation.h"
@@ -1638,3 +1639,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( @@ -1638,3 +1639,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
1638 void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { 1639 void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
1639 p->impl->Reset(); 1640 p->impl->Reset();
1640 } 1641 }

// C-API bridge to sherpa_onnx::FileExists().
// Returns 1 if `filename` exists and 0 otherwise (the bool result converts
// implicitly to int32_t).
int32_t SherpaOnnxFileExists(const char *filename) {
  return sherpa_onnx::FileExists(filename);
}
@@ -1361,6 +1361,9 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( @@ -1361,6 +1361,9 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
1361 SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( 1361 SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
1362 const SherpaOnnxLinearResampler *p); 1362 const SherpaOnnxLinearResampler *p);
1363 1363
  1364 +// Return 1 if the file exists; return 0 if the file does not exist.
  1365 +SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
  1366 +
1364 #if defined(__GNUC__) 1367 #if defined(__GNUC__)
1365 #pragma GCC diagnostic pop 1368 #pragma GCC diagnostic pop
1366 #endif 1369 #endif
@@ -14,6 +14,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD) @@ -14,6 +14,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD)
14 add_subdirectory(vad) 14 add_subdirectory(vad)
15 endif() 15 endif()
16 16
# Build the combined VAD + non-streaming ASR wasm app when requested.
if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR)
  add_subdirectory(vad-asr)
endif()

17 if(SHERPA_ONNX_ENABLE_WASM_NODEJS) 21 if(SHERPA_ONNX_ENABLE_WASM_NODEJS)
18 add_subdirectory(nodejs) 22 add_subdirectory(nodejs)
19 endif() 23 endif()
@@ -80,3 +80,10 @@ assets fangjun$ tree -L 1 @@ -80,3 +80,10 @@ assets fangjun$ tree -L 1
80 80
81 0 directories, 4 files 81 0 directories, 4 files
82 ``` 82 ```
  83 +
  84 +You can find example build scripts at:
  85 +
 - Streaming Zipformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml
  87 + - Streaming Zipformer (English): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml
  88 + - Streaming Paraformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml
  89 + - Streaming Paraformer (English + Chinese + Cantonese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 <head> 3 <head>
4 <meta charset="utf-8"> 4 <meta charset="utf-8">
5 <meta name="viewport" content="width=device-width" /> 5 <meta name="viewport" content="width=device-width" />
6 - <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech</title> 6 + <title>Next-gen Kaldi WebAssembly with sherpa-onnx for ASR</title>
7 <style> 7 <style>
8 h1,div { 8 h1,div {
9 text-align: center; 9 text-align: center;
@@ -30,3 +30,8 @@ assets fangjun$ tree -L 1 @@ -30,3 +30,8 @@ assets fangjun$ tree -L 1
30 30
31 1 directory, 3 files 31 1 directory, 3 files
32 ``` 32 ```
  33 +
  34 +You can find example build scripts at:
  35 +
  36 + - English TTS: https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-en-tts.yaml
  37 + - German TTS: https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-de-tts.yaml
# This directory must be built via ./build-wasm-simd-vad-asr.sh, which sets
# SHERPA_ONNX_IS_USING_BUILD_WASM_SH and configures the emscripten toolchain.
# Fix: the previous message referred to the VAD-only script
# ./build-wasm-simd-vad.sh, which does not build this target.
if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH})
  message(FATAL_ERROR "Please use ./build-wasm-simd-vad-asr.sh to build for wasm VAD + ASR")
endif()

# The VAD and ASR models must be downloaded into assets/ first;
# see assets/README.md for instructions.
if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/silero_vad.onnx" OR NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/tokens.txt")
  message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue")
endif()

# C API symbols that must remain callable from JavaScript.
set(exported_functions
  # VAD
  SherpaOnnxCreateCircularBuffer
  SherpaOnnxDestroyCircularBuffer
  SherpaOnnxCircularBufferPush
  SherpaOnnxCircularBufferGet
  SherpaOnnxCircularBufferFree
  SherpaOnnxCircularBufferPop
  SherpaOnnxCircularBufferSize
  SherpaOnnxCircularBufferHead
  SherpaOnnxCircularBufferReset
  SherpaOnnxCreateVoiceActivityDetector
  SherpaOnnxDestroyVoiceActivityDetector
  SherpaOnnxVoiceActivityDetectorAcceptWaveform
  SherpaOnnxVoiceActivityDetectorEmpty
  SherpaOnnxVoiceActivityDetectorDetected
  SherpaOnnxVoiceActivityDetectorPop
  SherpaOnnxVoiceActivityDetectorClear
  SherpaOnnxVoiceActivityDetectorFront
  SherpaOnnxDestroySpeechSegment
  SherpaOnnxVoiceActivityDetectorReset
  SherpaOnnxVoiceActivityDetectorFlush
  # non-streaming ASR
  SherpaOnnxAcceptWaveformOffline
  SherpaOnnxCreateOfflineRecognizer
  SherpaOnnxCreateOfflineStream
  SherpaOnnxDecodeMultipleOfflineStreams
  SherpaOnnxDecodeOfflineStream
  SherpaOnnxDestroyOfflineRecognizer
  SherpaOnnxDestroyOfflineRecognizerResult
  SherpaOnnxDestroyOfflineStream
  SherpaOnnxDestroyOfflineStreamResultJson
  SherpaOnnxGetOfflineStreamResult
  SherpaOnnxGetOfflineStreamResultAsJson
  #
  SherpaOnnxFileExists
)
# emscripten prefixes exported C symbols with an underscore.
set(mangled_exported_functions)
foreach(x IN LISTS exported_functions)
  list(APPEND mangled_exported_functions "_${x}")
endforeach()
list(JOIN mangled_exported_functions "," all_exported_functions)

include_directories(${CMAKE_SOURCE_DIR})
# Emscripten settings: virtual FS for the preloaded models, generous initial
# memory and growable heap, enlarged stack, and the exported symbol list.
set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=512MB -s ALLOW_MEMORY_GROWTH=1")
string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB
string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ")
string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ")
string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ")

message(STATUS "MY_FLAGS: ${MY_FLAGS}")

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}")
# Fix: was CMAKE_EXECUTBLE_LINKER_FLAGS (a typo CMake ignores). The correct
# variable is CMAKE_EXE_LINKER_FLAGS, so the emscripten link options
# (EXPORTED_FUNCTIONS, --preload-file, ...) are actually passed at link time.
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MY_FLAGS}")

if (NOT CMAKE_EXECUTABLE_SUFFIX STREQUAL ".js")
  message(FATAL_ERROR "The default suffix for building executables should be .js!")
endif()
# set(CMAKE_EXECUTABLE_SUFFIX ".html")

add_executable(sherpa-onnx-wasm-main-vad-asr sherpa-onnx-wasm-main-vad-asr.cc)
target_link_libraries(sherpa-onnx-wasm-main-vad-asr sherpa-onnx-c-api)
install(TARGETS sherpa-onnx-wasm-main-vad-asr DESTINATION bin/wasm/vad-asr)

install(
  FILES
  "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad-asr>/sherpa-onnx-wasm-main-vad-asr.js"
  "index.html"
  "app-vad-asr.js"
  "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad-asr>/sherpa-onnx-wasm-main-vad-asr.wasm"
  "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad-asr>/sherpa-onnx-wasm-main-vad-asr.data"
  DESTINATION
  bin/wasm/vad-asr
)
// This file copies and modifies code
// from https://mdn.github.io/web-dictaphone/scripts/app.js
// and https://gist.github.com/meziantou/edb7217fddfbb70e899e

// UI elements looked up by id from index.html.
const startBtn = document.getElementById('startBtn');
const stopBtn = document.getElementById('stopBtn');
const clearBtn = document.getElementById('clearBtn');
const hint = document.getElementById('hint');
const soundClips = document.getElementById('sound-clips');

let textArea = document.getElementById('results');

// lastResult holds the in-progress recognition text; resultList accumulates
// finished events ('Speech detected' markers and recognized sentences).
let lastResult = '';
let resultList = [];

// Clear all accumulated results and refresh the text area.
clearBtn.onclick = function() {
  resultList = [];
  textArea.value = getDisplayResult();
  textArea.scrollTop = textArea.scrollHeight;  // auto scroll
};
  21 +
// Render the accumulated VAD/ASR events as display text.
// A 'Speech detected' entry starts a new numbered line; any other non-empty
// entry (the recognized text) is appended to the current line.  A pending
// lastResult, if non-empty, is shown as the next numbered line.
function getDisplayResult() {
  let segmentIndex = 0;
  let text = '';
  for (const entry of resultList) {
    if (entry == '') {
      continue;
    }
    if (entry == 'Speech detected') {
      text += segmentIndex + ': ' + entry;
      segmentIndex += 1;
    } else {
      text += ', ' + entry + '\n';
    }
  }

  if (lastResult.length > 0) {
    text += segmentIndex + ': ' + lastResult + '\n';
  }
  return text;
}
  43 +


// Emscripten module object; populated by the sherpa-onnx wasm glue code.
Module = {};

let audioCtx;     // WebAudio context for microphone capture
let mediaStream;  // stream obtained from getUserMedia

// Sample rate the models expect; microphone audio is resampled to this.
let expectedSampleRate = 16000;
let recordSampleRate;  // the sampleRate of the microphone
let recorder = null;   // the microphone
let leftchannel = [];  // TODO: Use a single channel

let recordingLength = 0;  // number of samples so far

// Wasm-side handles, created once the module is initialized.
let vad = null;
let buffer = null;
let recognizer = null;
let printed = false;
  62 +
// Returns the result of the C API's SherpaOnnxFileExists (0/1 int) for a
// file inside the wasm filesystem. The filename is marshalled into the wasm
// heap as a NUL-terminated UTF-8 string and freed afterwards.
function fileExists(filename) {
  const numBytes = Module.lengthBytesUTF8(filename) + 1;
  const ptr = Module._malloc(numBytes);
  Module.stringToUTF8(filename, ptr, numBytes);

  const exists = Module._SherpaOnnxFileExists(ptr);
  Module._free(ptr);

  return exists;
}
  74 +
  75 +function createOfflineRecognizerSenseVoice() {}
  76 +
// Builds the offline (non-streaming) recognizer from whichever ASR model file
// was bundled into the wasm filesystem; the first match below wins. Sets the
// file-level `recognizer` global. Shows an alert and leaves `recognizer`
// null if no known model file is present.
function initOfflineRecognizer() {
  let config = {
    modelConfig: {
      debug: 1,
      tokens: './tokens.txt',
    },
  };

  // Consistency fix: fileExists() returns the C API's 0/1 int; treat it as a
  // boolean in every branch (the original compared `== 1` only for the first).
  if (fileExists('sense-voice.onnx')) {
    config.modelConfig.senseVoice = {
      model: './sense-voice.onnx',
      useInverseTextNormalization: 1,
    };
  } else if (fileExists('whisper-encoder.onnx')) {
    // assumes whisper-decoder.onnx is bundled alongside the encoder
    config.modelConfig.whisper = {
      encoder: './whisper-encoder.onnx',
      decoder: './whisper-decoder.onnx',
    };
  } else if (fileExists('transducer-encoder.onnx')) {
    config.modelConfig.transducer = {
      encoder: './transducer-encoder.onnx',
      decoder: './transducer-decoder.onnx',
      joiner: './transducer-joiner.onnx',
    };
    config.modelConfig.modelType = 'transducer';
  } else if (fileExists('nemo-transducer-encoder.onnx')) {
    config.modelConfig.transducer = {
      encoder: './nemo-transducer-encoder.onnx',
      decoder: './nemo-transducer-decoder.onnx',
      joiner: './nemo-transducer-joiner.onnx',
    };
    config.modelConfig.modelType = 'nemo_transducer';
  } else if (fileExists('paraformer.onnx')) {
    config.modelConfig.paraformer = {
      model: './paraformer.onnx',
    };
  } else if (fileExists('telespeech.onnx')) {
    config.modelConfig.telespeechCtc = './telespeech.onnx';
  } else {
    console.log('Please specify a model.');
    alert('Please specify a model.');
    // Bug fix: bail out instead of constructing a recognizer from a config
    // that names no model at all.
    return;
  }

  recognizer = new OfflineRecognizer(config, Module);
}
  121 +
// Called by emscripten once the wasm module (and its preloaded files) is
// ready; only then is it safe to call into the sherpa-onnx C API.
Module.onRuntimeInitialized = function() {
  console.log('inited!');
  hint.innerText = 'Model loaded! Please click start';

  startBtn.disabled = false;

  vad = createVad(Module);  // provided by sherpa-onnx-vad.js
  console.log('vad is created!', vad);

  // 30 * 16000: room for 30 seconds of audio at the 16 kHz model rate.
  buffer = new CircularBuffer(30 * 16000, Module);
  console.log('CircularBuffer is created!', buffer);

  initOfflineRecognizer();
};
  136 +
  137 +
  138 +
// Microphone capture + VAD + non-streaming ASR pipeline. On success, audio
// flows: microphone -> ScriptProcessorNode -> downsample to 16 kHz ->
// CircularBuffer -> VAD windows -> per-segment offline recognition.
if (navigator.mediaDevices.getUserMedia) {
  console.log('getUserMedia supported.');

  // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
  const constraints = {audio: true};

  let onSuccess = function(stream) {
    if (!audioCtx) {
      // Request the model rate directly; if the hardware cannot honor it,
      // audioCtx.sampleRate below reports the real rate and we resample.
      audioCtx = new AudioContext({sampleRate: expectedSampleRate});
    }
    console.log(audioCtx);
    recordSampleRate = audioCtx.sampleRate;
    console.log('sample rate ' + recordSampleRate);

    // creates an audio node from the microphone incoming stream
    mediaStream = audioCtx.createMediaStreamSource(stream);
    console.log('media stream', mediaStream);

    // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
    // bufferSize: the onaudioprocess event is called when the buffer is full
    var bufferSize = 4096;
    var numberOfInputChannels = 1;
    var numberOfOutputChannels = 2;
    if (audioCtx.createScriptProcessor) {
      recorder = audioCtx.createScriptProcessor(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    } else {
      // legacy fallback name for very old WebKit builds
      recorder = audioCtx.createJavaScriptNode(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    }
    console.log('recorder', recorder);

    // Runs once per 4096-frame chunk. Note the entire VAD + recognition +
    // clip-creation work below happens synchronously inside the audio
    // callback.
    recorder.onaudioprocess = function(e) {
      let samples = new Float32Array(e.inputBuffer.getChannelData(0))
      samples = downsampleBuffer(samples, expectedSampleRate);
      buffer.push(samples);
      // Feed the VAD in fixed-size windows as long as enough samples queued.
      while (buffer.size() > vad.config.sileroVad.windowSize) {
        const s = buffer.get(buffer.head(), vad.config.sileroVad.windowSize);
        vad.acceptWaveform(s);
        buffer.pop(vad.config.sileroVad.windowSize);

        // Rising edge: announce the in-progress segment exactly once.
        if (vad.isDetected() && !printed) {
          printed = true;
          lastResult = 'Speech detected';
        }

        // Falling edge: commit the in-progress entry to the finished list.
        if (!vad.isDetected()) {
          printed = false;
          if (lastResult != '') {
            resultList.push(lastResult);
          }
          lastResult = '';
        }

        // Drain every speech segment the VAD has finalized.
        while (!vad.isEmpty()) {
          const segment = vad.front();
          const duration = segment.samples.length / expectedSampleRate;
          let durationStr = `Duration: ${duration.toFixed(3)} seconds`;
          vad.pop();

          // non-streaming asr
          const stream = recognizer.createStream();
          stream.acceptWaveform(expectedSampleRate, segment.samples);
          recognizer.decode(stream);
          let recognitionResult = recognizer.getResult(stream);
          console.log(recognitionResult);
          let text = recognitionResult.text;
          stream.free();
          console.log(text);

          if (text != '') {
            durationStr += `. Result: ${text}`;
          }

          resultList.push(durationStr);


          // now save the segment to a wav file
          // Convert float samples to 16-bit PCM, clamping to [-1, 1] first.
          let buf = new Int16Array(segment.samples.length);
          for (var i = 0; i < segment.samples.length; ++i) {
            let s = segment.samples[i];
            if (s >= 1)
              s = 1;
            else if (s <= -1)
              s = -1;

            buf[i] = s * 32767;
          }

          let clipName = new Date().toISOString() + '--' + durationStr;

          // Build a playable clip entry (audio element + label + delete
          // button) and append it to the sound-clips section.
          const clipContainer = document.createElement('article');
          const clipLabel = document.createElement('p');
          const audio = document.createElement('audio');
          const deleteButton = document.createElement('button');

          clipContainer.classList.add('clip');
          audio.setAttribute('controls', '');
          deleteButton.textContent = 'Delete';
          deleteButton.className = 'delete';

          clipLabel.textContent = clipName;

          clipContainer.appendChild(audio);

          clipContainer.appendChild(clipLabel);
          clipContainer.appendChild(deleteButton);
          soundClips.appendChild(clipContainer);

          audio.controls = true;
          const blob = toWav(buf);

          // NOTE(review): leftchannel is never filled in this file --
          // presumably vestigial from the dictaphone original; confirm.
          leftchannel = [];
          const audioURL = window.URL.createObjectURL(blob);
          audio.src = audioURL;

          deleteButton.onclick = function(e) {
            let evtTgt = e.target;
            evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);
          };

          // Clicking the label lets the user rename the clip in place.
          clipLabel.onclick = function() {
            const existingName = clipLabel.textContent;
            const newClipName = prompt('Enter a new name for your sound clip?');
            if (newClipName === null) {
              clipLabel.textContent = existingName;
            } else {
              clipLabel.textContent = newClipName;
            }
          };
        }
      }

      textArea.value = getDisplayResult();
      textArea.scrollTop = textArea.scrollHeight;  // auto scroll
    };

    // Start: wire microphone -> recorder -> destination so the script
    // processor receives callbacks.
    startBtn.onclick = function() {
      mediaStream.connect(recorder);
      recorder.connect(audioCtx.destination);

      console.log('recorder started');

      stopBtn.disabled = false;
      startBtn.disabled = true;
    };

    // Stop: reset VAD/buffer state and tear down the audio graph.
    stopBtn.onclick = function() {
      vad.reset();
      buffer.reset();
      console.log('recorder stopped');

      // stopBtn recording
      recorder.disconnect(audioCtx.destination);
      mediaStream.disconnect(recorder);

      startBtn.style.background = '';
      startBtn.style.color = '';
      // mediaRecorder.requestData();

      stopBtn.disabled = true;
      startBtn.disabled = false;
    };
  };

  // NOTE(review): "occured" in the message below is a typo ("occurred");
  // it is a runtime string, so it is left unchanged here.
  let onError = function(err) {
    console.log('The following error occured: ' + err);
  };

  navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
} else {
  console.log('getUserMedia not supported on your browser!');
  alert('getUserMedia not supported on your browser!');
}
  313 +
  314 +
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
//
// Concatenates a list of Int16Array chunks into one contiguous Int16Array.
function flatten(listOfSamples) {
  const total = listOfSamples.reduce((acc, chunk) => acc + chunk.length, 0);
  const merged = new Int16Array(total);

  let writePos = 0;
  for (const chunk of listOfSamples) {
    merged.set(chunk, writePos);
    writePos += chunk.length;
  }
  return merged;
}
  331 +
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
//
// Wraps 16-bit PCM samples in a 44-byte canonical WAV header and returns an
// 'audio/wav' Blob (mono, 16 bits per sample).
//
// samples:    Int16Array of PCM samples.
// sampleRate: rate recorded in the header; defaults to the file-level
//             expectedSampleRate so existing one-argument callers are
//             unchanged (generalized from the previous hard-coded global).
function toWav(samples, sampleRate = expectedSampleRate) {
  let buf = new ArrayBuffer(44 + samples.length * 2);
  var view = new DataView(buf);

  // http://soundfile.sapp.org/doc/WaveFormat/
  // Magic strings are written as little-endian u32s, so the constants are
  // spelled in byte-reversed order: 0x46464952 -> "RIFF", etc.
  view.setUint32(0, 0x46464952, true);               // chunkID: "RIFF"
  view.setUint32(4, 36 + samples.length * 2, true);  // chunkSize
  view.setUint32(8, 0x45564157, true);               // format: "WAVE"
  view.setUint32(12, 0x20746d66, true);              // subchunk1ID: "fmt "
  view.setUint32(16, 16, true);  // subchunk1Size, 16 for PCM
  // Fix: audioFormat is a 2-byte field at offset 20. The original wrote it
  // with setUint32, clobbering bytes 22-23, and only produced a valid header
  // because the numChannels write below happened to repair them.
  view.setUint16(20, 1, true);   // audioFormat, 1 for PCM
  view.setUint16(22, 1, true);   // numChannels: 1 channel
  view.setUint32(24, sampleRate, true);      // sampleRate
  view.setUint32(28, sampleRate * 2, true);  // byteRate = rate * channels * 2
  view.setUint16(32, 2, true);   // blockAlign = channels * bytesPerSample
  view.setUint16(34, 16, true);  // bitsPerSample
  view.setUint32(36, 0x61746164, true);          // subchunk2ID: "data"
  view.setUint32(40, samples.length * 2, true);  // subchunk2Size

  let offset = 44;
  for (let i = 0; i < samples.length; ++i) {
    view.setInt16(offset, samples[i], true);
    offset += 2;
  }

  return new Blob([view], {type: 'audio/wav'});
}
  365 +
// this function is copied from
// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46
//
// Resamples a Float32Array by averaging the source samples that map onto
// each output frame. Returns the input unchanged when rates already match.
//
// buffer:           Float32Array of input samples.
// exportSampleRate: target rate.
// inputSampleRate:  source rate; defaults to the file-level recordSampleRate
//                   (the microphone rate) so existing two-argument callers
//                   are unchanged (generalized from the hard-coded global).
function downsampleBuffer(buffer, exportSampleRate, inputSampleRate = recordSampleRate) {
  if (exportSampleRate === inputSampleRate) {
    return buffer;
  }
  const sampleRateRatio = inputSampleRate / exportSampleRate;
  const newLength = Math.round(buffer.length / sampleRateRatio);
  const result = new Float32Array(newLength);
  let offsetResult = 0;
  let offsetBuffer = 0;
  while (offsetResult < result.length) {
    const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
    let accum = 0;
    let count = 0;
    for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
      accum += buffer[i];
      count++;
    }
    // Robustness fix: an empty averaging window (possible when the ratio is
    // < 1, i.e. upsampling) previously produced NaN; fall back to the
    // nearest source sample instead.
    result[offsetResult] =
        count > 0 ? accum / count
                  : buffer[Math.min(offsetBuffer, buffer.length - 1)];
    offsetResult++;
    offsetBuffer = nextOffsetBuffer;
  }
  return result;
}
  1 +# Introduction
  2 +
  3 +## Download VAD models
  4 +
  5 +Please download
  6 +https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  7 +and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`.
  8 +
  9 +## Download non-streaming ASR models
  10 +
  11 +Please refer to
  12 +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
  13 +to download a non-streaming ASR model, i.e., an offline ASR model.
  14 +
  15 +After downloading, you should rename the model files.
  16 +
  17 +Please refer to
  18 +https://k2-fsa.github.io/sherpa/onnx/lazarus/generate-subtitles.html#download-a-speech-recognition-model
  19 +for how to rename.
  20 +
  21 +You can find example build scripts at the following address:
  22 +
  23 + https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-vad-asr.yaml
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width" />
  <title>Next-gen Kaldi WebAssembly with sherpa-onnx for VAD + ASR</title>
  <style>
    h1,div {
      text-align: center;
    }
    textarea {
      width:100%;
    }
  </style>
</head>

<body>
  <h1>
    Next-gen Kaldi + WebAssembly<br/>
    VAD+ASR Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a><br/>
    <!-- NOTE(review): the model name below looks like a per-build placeholder
         (this page supports several model families) - confirm whether the CI
         build script rewrites it. -->
    (with Zipformer)
  </h1>

  <div>
    <span id="hint">Loading model ... ...</span>
    <br/>
    <br/>
    <button id="startBtn" disabled>Start</button>
    <button id="stopBtn" disabled>Stop</button>
    <button id="clearBtn">Clear</button>
    <br/>
    <br/>
    <textarea id="results" rows="10" readonly></textarea>
  </div>

  <section flex="1" overflow="auto" id="sound-clips">
  </section>

  <!-- Load order matters: the wrapper APIs (asr/vad) and the app must be
       defined before the emscripten glue fires Module.onRuntimeInitialized. -->
  <script src="sherpa-onnx-asr.js"></script>
  <script src="sherpa-onnx-vad.js"></script>
  <script src="app-vad-asr.js"></script>
  <script src="sherpa-onnx-wasm-main-vad-asr.js"></script>
</body>

</html>
  1 +../asr/sherpa-onnx-asr.js
  1 +../vad/sherpa-onnx-vad.js
  1 +// wasm/sherpa-onnx-wasm-main-vad-asr.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +#include <stdio.h>
  5 +
  6 +#include <algorithm>
  7 +#include <memory>
  8 +
  9 +#include "sherpa-onnx/c-api/c-api.h"
  10 +
  11 +// see also
  12 +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html
  13 +
extern "C" {

// Copies num_bytes bytes from src into dst. Exported with C linkage so the
// JavaScript side can call it to move data through the wasm heap.
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
  std::copy_n(src, num_bytes, dst);
}

}  // extern "C"
@@ -3,3 +3,6 @@ @@ -3,3 +3,6 @@
3 Please download 3 Please download
4 https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx 4 https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
5 and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`. 5 and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`.
  6 +
  7 +You can find an example build script at
  8 +https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-silero-vad.yaml
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 <head> 3 <head>
4 <meta charset="utf-8"> 4 <meta charset="utf-8">
5 <meta name="viewport" content="width=device-width" /> 5 <meta name="viewport" content="width=device-width" />
6 - <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech</title> 6 + <title>Next-gen Kaldi WebAssembly with sherpa-onnx for VAD</title>
7 <style> 7 <style>
8 h1,div { 8 h1,div {
9 text-align: center; 9 text-align: center;
@@ -172,7 +172,6 @@ class Vad { @@ -172,7 +172,6 @@ class Vad {
172 constructor(configObj, Module) { 172 constructor(configObj, Module) {
173 this.config = configObj; 173 this.config = configObj;
174 const config = initSherpaOnnxVadModelConfig(configObj, Module); 174 const config = initSherpaOnnxVadModelConfig(configObj, Module);
175 - Module._MyPrint(config.ptr);  
176 const handle = Module._SherpaOnnxCreateVoiceActivityDetector( 175 const handle = Module._SherpaOnnxCreateVoiceActivityDetector(
177 config.ptr, configObj.bufferSizeInSeconds || 30); 176 config.ptr, configObj.bufferSizeInSeconds || 30);
178 freeConfig(config, Module); 177 freeConfig(config, Module);