Fangjun Kuang
Committed by GitHub

Add WebAssembly for VAD (#1281)

1 name: wasm-simd-hf-space-de-tts 1 name: wasm-simd-hf-space-de-tts
2 2
3 on: 3 on:
4 - release:  
5 - types:  
6 - - published 4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
7 9
8 workflow_dispatch: 10 workflow_dispatch:
9 11
@@ -71,6 +73,14 @@ jobs: @@ -71,6 +73,14 @@ jobs:
71 name: sherpa-onnx-wasm-simd-de-tts 73 name: sherpa-onnx-wasm-simd-de-tts
72 path: ./sherpa-onnx-wasm-simd-*.tar.bz2 74 path: ./sherpa-onnx-wasm-simd-*.tar.bz2
73 75
  76 + - name: Release
  77 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  78 + uses: svenstaro/upload-release-action@v2
  79 + with:
  80 + file_glob: true
  81 + overwrite: true
  82 + file: ./*.tar.bz2
  83 +
74 - name: Publish to ModelScope 84 - name: Publish to ModelScope
75 # if: false 85 # if: false
76 env: 86 env:
1 name: wasm-simd-hf-space-en-asr-zipformer 1 name: wasm-simd-hf-space-en-asr-zipformer
2 2
3 on: 3 on:
4 - release:  
5 - types:  
6 - - published 4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
7 9
8 workflow_dispatch: 10 workflow_dispatch:
9 11
@@ -73,6 +75,14 @@ jobs: @@ -73,6 +75,14 @@ jobs:
73 name: sherpa-onnx-wasm-simd-en-asr-zipformer 75 name: sherpa-onnx-wasm-simd-en-asr-zipformer
74 path: ./sherpa-onnx-wasm-simd-*.tar.bz2 76 path: ./sherpa-onnx-wasm-simd-*.tar.bz2
75 77
  78 + - name: Release
  79 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  80 + uses: svenstaro/upload-release-action@v2
  81 + with:
  82 + file_glob: true
  83 + overwrite: true
  84 + file: ./*.tar.bz2
  85 +
76 - name: Publish to ModelScope 86 - name: Publish to ModelScope
77 # if: false 87 # if: false
78 env: 88 env:
1 name: wasm-simd-hf-space-en-tts 1 name: wasm-simd-hf-space-en-tts
2 2
3 on: 3 on:
4 - release:  
5 - types:  
6 - - published 4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
7 9
8 workflow_dispatch: 10 workflow_dispatch:
9 11
@@ -69,6 +71,14 @@ jobs: @@ -69,6 +71,14 @@ jobs:
69 name: sherpa-onnx-wasm-simd-en-tts 71 name: sherpa-onnx-wasm-simd-en-tts
70 path: ./sherpa-onnx-wasm-simd-*.tar.bz2 72 path: ./sherpa-onnx-wasm-simd-*.tar.bz2
71 73
  74 + - name: Release
  75 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  76 + uses: svenstaro/upload-release-action@v2
  77 + with:
  78 + file_glob: true
  79 + overwrite: true
  80 + file: ./*.tar.bz2
  81 +
72 - name: Publish to ModelScope 82 - name: Publish to ModelScope
73 # if: false 83 # if: false
74 env: 84 env:
  1 +name: wasm-simd-hf-space-silero-vad
  2 +
  3 +on:
  4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
  9 +
  10 + workflow_dispatch:
  11 +
  12 +concurrency:
  13 + group: wasm-simd-hf-space-silero-vad-${{ github.ref }}
  14 + cancel-in-progress: true
  15 +
  16 +jobs:
  17 + wasm-simd-hf-space-silero-vad:
  18 + runs-on: ${{ matrix.os }}
  19 + strategy:
  20 + fail-fast: false
  21 + matrix:
  22 + os: [ubuntu-latest]
  23 +
  24 + steps:
  25 + - uses: actions/checkout@v4
  26 + with:
  27 + fetch-depth: 0
  28 + - name: Install emsdk
  29 + uses: mymindstorm/setup-emsdk@v14
  30 + with:
  31 + version: 3.1.51
  32 + actions-cache-folder: 'emsdk-cache'
  33 +
  34 + - name: View emsdk version
  35 + shell: bash
  36 + run: |
  37 + emcc -v
  38 + echo "--------------------"
  39 + emcc --check
  40 +
  41 + - name: Download model files
  42 + shell: bash
  43 + run: |
  44 + cd wasm/vad/assets
  45 + ls -lh
  46 + echo "----------"
  47 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  48 + ls -lh
  49 +
  50 + - name: Build sherpa-onnx for WebAssembly
  51 + shell: bash
  52 + run: |
  53 + ./build-wasm-simd-vad.sh
  54 +
  55 + - name: collect files
  56 + shell: bash
  57 + run: |
  58 + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
  59 +
  60 + dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-vad
  61 + mv build-wasm-simd-vad/install/bin/wasm/vad $dst
  62 + ls -lh $dst
  63 + tar cjfv $dst.tar.bz2 ./$dst
  64 +
  65 + - name: Upload wasm files
  66 + uses: actions/upload-artifact@v4
  67 + with:
  68 + name: sherpa-onnx-wasm-simd-vad
  69 + path: ./sherpa-onnx-wasm-simd-*.tar.bz2
  70 +
  71 + - name: Release
  72 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  73 + uses: svenstaro/upload-release-action@v2
  74 + with:
  75 + file_glob: true
  76 + overwrite: true
  77 + file: ./*.tar.bz2
  78 +
  79 + - name: Publish to ModelScope
  80 + # if: false
  81 + env:
  82 + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }}
  83 + uses: nick-fields/retry@v2
  84 + with:
  85 + max_attempts: 20
  86 + timeout_seconds: 200
  87 + shell: bash
  88 + command: |
  89 + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
  90 +
  91 + git config --global user.email "csukuangfj@gmail.com"
  92 + git config --global user.name "Fangjun Kuang"
  93 +
  94 + rm -rf ms
  95 + export GIT_LFS_SKIP_SMUDGE=1
  96 + export GIT_CLONE_PROTECTION_ACTIVE=false
  97 +
  98 + git clone https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx.git ms
  99 + cd ms
  100 + rm -fv *.js
  101 + rm -fv *.data
  102 + git fetch
  103 + git pull
  104 + git merge -m "merge remote" --ff origin main
  105 +
  106 + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-vad/* .
  107 +
  108 + git status
  109 + git lfs track "*.data"
  110 + git lfs track "*.wasm"
  111 + ls -lh
  112 +
  113 + git add .
  114 + git commit -m "update model"
  115 + git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx.git
  116 +
  117 + - name: Publish to huggingface
  118 + env:
  119 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  120 + uses: nick-fields/retry@v2
  121 + with:
  122 + max_attempts: 20
  123 + timeout_seconds: 200
  124 + shell: bash
  125 + command: |
  126 + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
  127 +
  128 + git config --global user.email "csukuangfj@gmail.com"
  129 + git config --global user.name "Fangjun Kuang"
  130 +
  131 + rm -rf huggingface
  132 + export GIT_LFS_SKIP_SMUDGE=1
  133 + export GIT_CLONE_PROTECTION_ACTIVE=false
  134 +
  135 + git clone https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx huggingface
  136 + cd huggingface
  137 + rm -fv *.js
  138 + rm -fv *.data
  139 + git fetch
  140 + git pull
  141 + git merge -m "merge remote" --ff origin main
  142 +
  143 + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-vad/* .
  144 +
  145 + git status
  146 + git lfs track "*.data"
  147 + git lfs track "*.wasm"
  148 + ls -lh
  149 +
  150 + git add .
  151 + git commit -m "update model"
  152 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx main
1 name: wasm-simd-hf-space-zh-cantonese-en-asr-paraformer 1 name: wasm-simd-hf-space-zh-cantonese-en-asr-paraformer
2 2
3 on: 3 on:
4 - release:  
5 - types:  
6 - - published 4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
7 9
8 workflow_dispatch: 10 workflow_dispatch:
9 11
@@ -80,6 +82,14 @@ jobs: @@ -80,6 +82,14 @@ jobs:
80 name: sherpa-onnx-wasm-simd-zh-cantonese-en-asr-paraformer 82 name: sherpa-onnx-wasm-simd-zh-cantonese-en-asr-paraformer
81 path: ./sherpa-onnx-wasm-simd-*.tar.bz2 83 path: ./sherpa-onnx-wasm-simd-*.tar.bz2
82 84
  85 + - name: Release
  86 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  87 + uses: svenstaro/upload-release-action@v2
  88 + with:
  89 + file_glob: true
  90 + overwrite: true
  91 + file: ./*.tar.bz2
  92 +
83 - name: Publish to huggingface 93 - name: Publish to huggingface
84 env: 94 env:
85 HF_TOKEN: ${{ secrets.HF_TOKEN }} 95 HF_TOKEN: ${{ secrets.HF_TOKEN }}
1 name: wasm-simd-hf-space-zh-en-asr-paraformer 1 name: wasm-simd-hf-space-zh-en-asr-paraformer
2 2
3 on: 3 on:
4 - release:  
5 - types:  
6 - - published 4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
7 9
8 workflow_dispatch: 10 workflow_dispatch:
9 11
@@ -80,6 +82,14 @@ jobs: @@ -80,6 +82,14 @@ jobs:
80 name: sherpa-onnx-wasm-simd-zh-en-asr-paraformer 82 name: sherpa-onnx-wasm-simd-zh-en-asr-paraformer
81 path: ./sherpa-onnx-wasm-simd-*.tar.bz2 83 path: ./sherpa-onnx-wasm-simd-*.tar.bz2
82 84
  85 + - name: Release
  86 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  87 + uses: svenstaro/upload-release-action@v2
  88 + with:
  89 + file_glob: true
  90 + overwrite: true
  91 + file: ./*.tar.bz2
  92 +
83 - name: Publish to ModelScope 93 - name: Publish to ModelScope
84 # if: false 94 # if: false
85 env: 95 env:
1 name: wasm-simd-hf-space-zh-en-asr-zipformer 1 name: wasm-simd-hf-space-zh-en-asr-zipformer
2 2
3 on: 3 on:
4 - release:  
5 - types:  
6 - - published 4 + push:
  5 + branches:
  6 + - wasm
  7 + tags:
  8 + - 'v[0-9]+.[0-9]+.[0-9]+*'
7 9
8 workflow_dispatch: 10 workflow_dispatch:
9 11
@@ -71,6 +73,14 @@ jobs: @@ -71,6 +73,14 @@ jobs:
71 name: sherpa-onnx-wasm-simd-zh-en-asr-zipformer 73 name: sherpa-onnx-wasm-simd-zh-en-asr-zipformer
72 path: ./sherpa-onnx-wasm-simd-*.tar.bz2 74 path: ./sherpa-onnx-wasm-simd-*.tar.bz2
73 75
  76 + - name: Release
  77 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  78 + uses: svenstaro/upload-release-action@v2
  79 + with:
  80 + file_glob: true
  81 + overwrite: true
  82 + file: ./*.tar.bz2
  83 +
74 - name: Publish to ModelScope 84 - name: Publish to ModelScope
75 # if: false 85 # if: false
76 env: 86 env:
@@ -35,6 +35,7 @@ option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF) @@ -35,6 +35,7 @@ option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF)
35 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) 35 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF)
36 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) 36 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF)
37 option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) 37 option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF)
  38 +option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF)
38 option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) 39 option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF)
39 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) 40 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON)
40 option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON) 41 option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON)
@@ -135,6 +136,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}") @@ -135,6 +136,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}")
135 message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") 136 message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}")
136 message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") 137 message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}")
137 message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") 138 message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}")
  139 +message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}")
138 message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") 140 message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}")
139 message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}") 141 message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}")
140 message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}") 142 message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}")
@@ -212,6 +214,10 @@ if(SHERPA_ONNX_ENABLE_WASM_KWS) @@ -212,6 +214,10 @@ if(SHERPA_ONNX_ENABLE_WASM_KWS)
212 add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) 214 add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1)
213 endif() 215 endif()
214 216
  217 +if(SHERPA_ONNX_ENABLE_WASM_VAD)
  218 + add_definitions(-DSHERPA_ONNX_ENABLE_WASM_VAD=1)
  219 +endif()
  220 +
215 if(NOT CMAKE_CXX_STANDARD) 221 if(NOT CMAKE_CXX_STANDARD)
216 set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ version to be used.") 222 set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ version to be used.")
217 endif() 223 endif()
@@ -76,6 +76,32 @@ with the following APIs @@ -76,6 +76,32 @@ with the following APIs
76 - Swift, Rust 76 - Swift, Rust
77 - Dart, Object Pascal 77 - Dart, Object Pascal
78 78
  79 +### Links for Huggingface Spaces
  80 +
  81 +You can visit the following Huggingface spaces to try `sherpa-onnx` without
  82 +installing anything. All you need is a browser.
  83 +
  84 +| Description | URL |
  85 +|---|---|
  86 +| Speech recognition | [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition)|
  87 +| Speech recognition with [Whisper](https://github.com/openai/whisper)| [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper)|
  88 +| Speech synthesis | [Click me](https://huggingface.co/spaces/k2-fsa/text-to-speech)|
  89 +| Generate subtitles| [Click me](https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos)|
  90 +|Audio tagging| [Click me](https://huggingface.co/spaces/k2-fsa/audio-tagging)|
  91 +|Spoken language identification with [Whisper](https://github.com/openai/whisper)|[Click me](https://huggingface.co/spaces/k2-fsa/spoken-language-identification)|
  92 +
  93 +We also have spaces built using WebAssembly. They are listed below:
  94 +
  95 +| Description | URL| Chinese users|
  96 +|---|---|---|
  97 +|Voice activity detection with [silero-vad](https://github.com/snakers4/silero-vad)| [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx)|[地址](https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx)|
  98 +|Real-time speech recognition (Chinese + English) with Zipformer | [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|
  99 +|Real-time speech recognition (Chinese + English) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)|
  100 +|Real-time speech recognition (Chinese + English + Cantonese) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)|
  101 +|Real-time speech recognition (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en)|
  102 +|Speech synthesis (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en)|
  103 +|Speech synthesis (German)|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de)|
  104 +
79 ### Links for pre-built Android APKs 105 ### Links for pre-built Android APKs
80 106
81 | Description | URL | 中国用户 | 107 | Description | URL | 中国用户 |
@@ -130,7 +156,7 @@ with the following APIs @@ -130,7 +156,7 @@ with the following APIs
130 | Keyword spotting |[Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models)| 156 | Keyword spotting |[Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models)|
131 | Audio tagging | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models)| 157 | Audio tagging | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models)|
132 | Speaker identification (Speaker ID) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models)| 158 | Speaker identification (Speaker ID) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models)|
133 -| Spoken language identification (Language ID) | See multi-lingual Whisper ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | 159 +| Spoken language identification (Language ID) | See multi-lingual [Whisper](https://github.com/openai/whisper) ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) |
134 | Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)| 160 | Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)|
135 161
136 ### Useful links 162 ### Useful links
@@ -48,6 +48,7 @@ cmake \ @@ -48,6 +48,7 @@ cmake \
48 -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ 48 -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
49 -DSHERPA_ONNX_ENABLE_JNI=OFF \ 49 -DSHERPA_ONNX_ENABLE_JNI=OFF \
50 -DSHERPA_ONNX_ENABLE_C_API=ON \ 50 -DSHERPA_ONNX_ENABLE_C_API=ON \
  51 + -DSHERPA_ONNX_ENABLE_TTS=OFF \
51 -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ 52 -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
52 -DSHERPA_ONNX_ENABLE_GPU=OFF \ 53 -DSHERPA_ONNX_ENABLE_GPU=OFF \
53 -DSHERPA_ONNX_ENABLE_WASM=ON \ 54 -DSHERPA_ONNX_ENABLE_WASM=ON \
@@ -43,6 +43,7 @@ cmake \ @@ -43,6 +43,7 @@ cmake \
43 -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ 43 -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
44 -DSHERPA_ONNX_ENABLE_JNI=OFF \ 44 -DSHERPA_ONNX_ENABLE_JNI=OFF \
45 -DSHERPA_ONNX_ENABLE_C_API=ON \ 45 -DSHERPA_ONNX_ENABLE_C_API=ON \
  46 + -DSHERPA_ONNX_ENABLE_TTS=OFF \
46 -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ 47 -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
47 -DSHERPA_ONNX_ENABLE_GPU=OFF \ 48 -DSHERPA_ONNX_ENABLE_GPU=OFF \
48 -DSHERPA_ONNX_ENABLE_WASM=ON \ 49 -DSHERPA_ONNX_ENABLE_WASM=ON \
  1 +#!/usr/bin/env bash
  2 +# Copyright (c) 2024 Xiaomi Corporation
  3 +#
  4 +# This script is to build sherpa-onnx for WebAssembly (VAD)
  5 +
  6 +set -ex
  7 +
  8 +if [ x"$EMSCRIPTEN" == x"" ]; then
  9 + if ! command -v emcc &> /dev/null; then
  10 + echo "Please install emscripten first"
  11 + echo ""
  12 + echo "You can use the following commands to install it:"
  13 + echo ""
  14 + echo "git clone https://github.com/emscripten-core/emsdk.git"
  15 + echo "cd emsdk"
  16 + echo "git pull"
  17 + echo "./emsdk install latest"
  18 + echo "./emsdk activate latest"
  19 + echo "source ./emsdk_env.sh"
  20 + exit 1
  21 + else
  22 + EMSCRIPTEN=$(dirname $(realpath $(which emcc)))
  23 + fi
  24 +fi
  25 +
  26 +export EMSCRIPTEN=$EMSCRIPTEN
  27 +echo "EMSCRIPTEN: $EMSCRIPTEN"
  28 +if [ ! -f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then
  29 + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake"
  30 + echo "Please make sure you have installed emsdk correctly"
  31 + exit 1
  32 +fi
  33 +
  34 +mkdir -p build-wasm-simd-vad
  35 +pushd build-wasm-simd-vad
  36 +
  37 +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON
  38 +
  39 +cmake \
  40 + -DCMAKE_INSTALL_PREFIX=./install \
  41 + -DCMAKE_BUILD_TYPE=Release \
  42 + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \
  43 + \
  44 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  45 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  46 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  47 + -DBUILD_SHARED_LIBS=OFF \
  48 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  49 + -DSHERPA_ONNX_ENABLE_JNI=OFF \
  50 + -DSHERPA_ONNX_ENABLE_TTS=OFF \
  51 + -DSHERPA_ONNX_ENABLE_C_API=ON \
  52 + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
  53 + -DSHERPA_ONNX_ENABLE_GPU=OFF \
  54 + -DSHERPA_ONNX_ENABLE_WASM=ON \
  55 + -DSHERPA_ONNX_ENABLE_WASM_VAD=ON \
  56 + -DSHERPA_ONNX_ENABLE_BINARY=OFF \
  57 + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \
  58 + ..
  59 +make -j2
  60 +make install
  61 +
  62 +ls -lh install/bin/wasm/vad
@@ -71,7 +71,7 @@ config = { @@ -71,7 +71,7 @@ config = {
71 /* 71 /*
72 { 72 {
73 samples: a 1-d float32 array, 73 samples: a 1-d float32 array,
74 - start: a int32 74 + start: an int32
75 } 75 }
76 */ 76 */
77 front(enableExternalBuffer = true) { 77 front(enableExternalBuffer = true) {
@@ -10,6 +10,10 @@ if(SHERPA_ONNX_ENABLE_WASM_KWS) @@ -10,6 +10,10 @@ if(SHERPA_ONNX_ENABLE_WASM_KWS)
10 add_subdirectory(kws) 10 add_subdirectory(kws)
11 endif() 11 endif()
12 12
  13 +if(SHERPA_ONNX_ENABLE_WASM_VAD)
  14 + add_subdirectory(vad)
  15 +endif()
  16 +
13 if(SHERPA_ONNX_ENABLE_WASM_NODEJS) 17 if(SHERPA_ONNX_ENABLE_WASM_NODEJS)
14 add_subdirectory(nodejs) 18 add_subdirectory(nodejs)
15 endif() 19 endif()
  1 +if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH})
  2 + message(FATAL_ERROR "Please use ./build-wasm-simd-vad.sh to build for wasm VAD")
  3 +endif()
  4 +
  5 +if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/silero_vad.onnx")
  6 + message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue")
  7 +endif()
  8 +
  9 +set(exported_functions
  10 + MyPrint
  11 + # VAD
  12 + SherpaOnnxCreateCircularBuffer
  13 + SherpaOnnxDestroyCircularBuffer
  14 + SherpaOnnxCircularBufferPush
  15 + SherpaOnnxCircularBufferGet
  16 + SherpaOnnxCircularBufferFree
  17 + SherpaOnnxCircularBufferPop
  18 + SherpaOnnxCircularBufferSize
  19 + SherpaOnnxCircularBufferHead
  20 + SherpaOnnxCircularBufferReset
  21 + SherpaOnnxCreateVoiceActivityDetector
  22 + SherpaOnnxDestroyVoiceActivityDetector
  23 + SherpaOnnxVoiceActivityDetectorAcceptWaveform
  24 + SherpaOnnxVoiceActivityDetectorEmpty
  25 + SherpaOnnxVoiceActivityDetectorDetected
  26 + SherpaOnnxVoiceActivityDetectorPop
  27 + SherpaOnnxVoiceActivityDetectorClear
  28 + SherpaOnnxVoiceActivityDetectorFront
  29 + SherpaOnnxDestroySpeechSegment
  30 + SherpaOnnxVoiceActivityDetectorReset
  31 + SherpaOnnxVoiceActivityDetectorFlush
  32 + #
  33 +)
  34 +set(mangled_exported_functions)
  35 +foreach(x IN LISTS exported_functions)
  36 + list(APPEND mangled_exported_functions "_${x}")
  37 +endforeach()
  38 +list(JOIN mangled_exported_functions "," all_exported_functions)
  39 +
  40 +include_directories(${CMAKE_SOURCE_DIR})
  41 +set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=64MB -s ALLOW_MEMORY_GROWTH=1")
  42 +string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB
  43 +string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ")
  44 +string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ")
  45 +string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ")
  46 +
  47 +message(STATUS "MY_FLAGS: ${MY_FLAGS}")
  48 +
  49 +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}")
  50 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}")
  51 +set(CMAKE_EXECUTBLE_LINKER_FLAGS "${CMAKE_EXECUTBLE_LINKER_FLAGS} ${MY_FLAGS}")
  52 +
  53 +if (NOT CMAKE_EXECUTABLE_SUFFIX STREQUAL ".js")
  54 + message(FATAL_ERROR "The default suffix for building executables should be .js!")
  55 +endif()
  56 +# set(CMAKE_EXECUTABLE_SUFFIX ".html")
  57 +
  58 +add_executable(sherpa-onnx-wasm-main-vad sherpa-onnx-wasm-main-vad.cc)
  59 +target_link_libraries(sherpa-onnx-wasm-main-vad sherpa-onnx-c-api)
  60 +install(TARGETS sherpa-onnx-wasm-main-vad DESTINATION bin/wasm/vad)
  61 +
  62 +install(
  63 + FILES
  64 + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad>/sherpa-onnx-wasm-main-vad.js"
  65 + "index.html"
  66 + "sherpa-onnx-vad.js"
  67 + "app-vad.js"
  68 + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad>/sherpa-onnx-wasm-main-vad.wasm"
  69 + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-vad>/sherpa-onnx-wasm-main-vad.data"
  70 + DESTINATION
  71 + bin/wasm/vad
  72 +)
  1 +// This file copies and modifies code
  2 +// from https://mdn.github.io/web-dictaphone/scripts/app.js
  3 +// and https://gist.github.com/meziantou/edb7217fddfbb70e899e
  4 +
  5 +const startBtn = document.getElementById('startBtn');
  6 +const stopBtn = document.getElementById('stopBtn');
  7 +const clearBtn = document.getElementById('clearBtn');
  8 +const hint = document.getElementById('hint');
  9 +const soundClips = document.getElementById('sound-clips');
  10 +
  11 +let textArea = document.getElementById('results');
  12 +
  13 +let lastResult = '';
  14 +let resultList = [];
  15 +
  16 +clearBtn.onclick = function() {
  17 + resultList = [];
  18 + textArea.value = getDisplayResult();
  19 + textArea.scrollTop = textArea.scrollHeight; // auto scroll
  20 +};
  21 +
  22 +function getDisplayResult() {
  23 + let i = 0;
  24 + let ans = '';
  25 + for (let s in resultList) {
  26 + if (resultList[s] == '') {
  27 + continue;
  28 + }
  29 +
  30 + if (resultList[s] == 'Speech detected') {
  31 + ans += '' + i + ': ' + resultList[s];
  32 + i += 1;
  33 + } else {
  34 + ans += ', ' + resultList[s] + '\n';
  35 + }
  36 + }
  37 +
  38 + if (lastResult.length > 0) {
  39 + ans += '' + i + ': ' + lastResult + '\n';
  40 + }
  41 + return ans;
  42 +}
  43 +
  44 +
  45 +Module = {};
  46 +Module.onRuntimeInitialized = function() {
  47 + console.log('inited!');
  48 + hint.innerText = 'Model loaded! Please click start';
  49 +
  50 + startBtn.disabled = false;
  51 +
  52 + vad = createVad(Module);
  53 + console.log('vad is created!', vad);
  54 +
  55 + buffer = new CircularBuffer(30 * 16000, Module);
  56 + console.log('CircularBuffer is created!', buffer);
  57 +};
  58 +
  59 +let audioCtx;
  60 +let mediaStream;
  61 +
  62 +let expectedSampleRate = 16000;
  63 +let recordSampleRate; // the sampleRate of the microphone
  64 +let recorder = null; // the microphone
  65 +let leftchannel = []; // TODO: Use a single channel
  66 +
  67 +let recordingLength = 0; // number of samples so far
  68 +
  69 +let vad = null;
  70 +let buffer = null;
  71 +let printed = false;
  72 +
  73 +if (navigator.mediaDevices.getUserMedia) {
  74 + console.log('getUserMedia supported.');
  75 +
  76 + // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
  77 + const constraints = {audio: true};
  78 +
  79 + let onSuccess = function(stream) {
  80 + if (!audioCtx) {
  81 + audioCtx = new AudioContext({sampleRate: expectedSampleRate});
  82 + }
  83 + console.log(audioCtx);
  84 + recordSampleRate = audioCtx.sampleRate;
  85 + console.log('sample rate ' + recordSampleRate);
  86 +
  87 + // creates an audio node from the microphone incoming stream
  88 + mediaStream = audioCtx.createMediaStreamSource(stream);
  89 + console.log('media stream', mediaStream);
  90 +
  91 + // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
  92 + // bufferSize: the onaudioprocess event is called when the buffer is full
  93 + var bufferSize = 4096;
  94 + var numberOfInputChannels = 1;
  95 + var numberOfOutputChannels = 2;
  96 + if (audioCtx.createScriptProcessor) {
  97 + recorder = audioCtx.createScriptProcessor(
  98 + bufferSize, numberOfInputChannels, numberOfOutputChannels);
  99 + } else {
  100 + recorder = audioCtx.createJavaScriptNode(
  101 + bufferSize, numberOfInputChannels, numberOfOutputChannels);
  102 + }
  103 + console.log('recorder', recorder);
  104 +
  105 + recorder.onaudioprocess = function(e) {
  106 + let samples = new Float32Array(e.inputBuffer.getChannelData(0))
  107 + samples = downsampleBuffer(samples, expectedSampleRate);
  108 + buffer.push(samples);
  109 + while (buffer.size() > vad.config.sileroVad.windowSize) {
  110 + const s = buffer.get(buffer.head(), vad.config.sileroVad.windowSize);
  111 + vad.acceptWaveform(s);
  112 + buffer.pop(vad.config.sileroVad.windowSize);
  113 +
  114 + if (vad.isDetected() && !printed) {
  115 + printed = true;
  116 + lastResult = 'Speech detected';
  117 + }
  118 +
  119 + if (!vad.isDetected()) {
  120 + printed = false;
  121 + if (lastResult != '') {
  122 + resultList.push(lastResult);
  123 + }
  124 + lastResult = '';
  125 + }
  126 +
  127 + while (!vad.isEmpty()) {
  128 + const segment = vad.front();
  129 + const duration = segment.samples.length / expectedSampleRate;
  130 + const durationStr = `Duration: ${duration.toFixed(3)} seconds`;
  131 + resultList.push(durationStr);
  132 + vad.pop();
  133 +
  134 + // now save the segment to a wav file
  135 + let buf = new Int16Array(segment.samples.length);
  136 + for (var i = 0; i < segment.samples.length; ++i) {
  137 + let s = segment.samples[i];
  138 + if (s >= 1)
  139 + s = 1;
  140 + else if (s <= -1)
  141 + s = -1;
  142 +
  143 + buf[i] = s * 32767;
  144 + }
  145 +
  146 + let clipName = new Date().toISOString() + '--' + durationStr;
  147 +
  148 + const clipContainer = document.createElement('article');
  149 + const clipLabel = document.createElement('p');
  150 + const audio = document.createElement('audio');
  151 + const deleteButton = document.createElement('button');
  152 +
  153 + clipContainer.classList.add('clip');
  154 + audio.setAttribute('controls', '');
  155 + deleteButton.textContent = 'Delete';
  156 + deleteButton.className = 'delete';
  157 +
  158 + clipLabel.textContent = clipName;
  159 +
  160 + clipContainer.appendChild(audio);
  161 +
  162 + clipContainer.appendChild(clipLabel);
  163 + clipContainer.appendChild(deleteButton);
  164 + soundClips.appendChild(clipContainer);
  165 +
  166 + audio.controls = true;
  167 + const blob = toWav(buf);
  168 +
  169 + leftchannel = [];
  170 + const audioURL = window.URL.createObjectURL(blob);
  171 + audio.src = audioURL;
  172 +
  173 + deleteButton.onclick = function(e) {
  174 + let evtTgt = e.target;
  175 + evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);
  176 + };
  177 +
  178 + clipLabel.onclick = function() {
  179 + const existingName = clipLabel.textContent;
  180 + const newClipName = prompt('Enter a new name for your sound clip?');
  181 + if (newClipName === null) {
  182 + clipLabel.textContent = existingName;
  183 + } else {
  184 + clipLabel.textContent = newClipName;
  185 + }
  186 + };
  187 + }
  188 + }
  189 +
  190 + textArea.value = getDisplayResult();
  191 + textArea.scrollTop = textArea.scrollHeight; // auto scroll
  192 + };
  193 +
  194 + startBtn.onclick = function() {
  195 + mediaStream.connect(recorder);
  196 + recorder.connect(audioCtx.destination);
  197 +
  198 + console.log('recorder started');
  199 +
  200 + stopBtn.disabled = false;
  201 + startBtn.disabled = true;
  202 + };
  203 +
  204 + stopBtn.onclick = function() {
  205 + vad.reset();
  206 + buffer.reset();
  207 + console.log('recorder stopped');
  208 +
  209 + // stopBtn recording
  210 + recorder.disconnect(audioCtx.destination);
  211 + mediaStream.disconnect(recorder);
  212 +
  213 + startBtn.style.background = '';
  214 + startBtn.style.color = '';
  215 + // mediaRecorder.requestData();
  216 +
  217 + stopBtn.disabled = true;
  218 + startBtn.disabled = false;
  219 + };
  220 + };
  221 +
  222 + let onError = function(err) {
  223 + console.log('The following error occured: ' + err);
  224 + };
  225 +
  226 + navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
  227 +} else {
  228 + console.log('getUserMedia not supported on your browser!');
  229 + alert('getUserMedia not supported on your browser!');
  230 +}
  231 +
  232 +
// Adapted from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
/**
 * Concatenate a list of sample chunks into a single Int16Array.
 *
 * @param listOfSamples List of chunks; each chunk must expose `length` and
 *     be accepted by `Int16Array.prototype.set`.
 * @returns {Int16Array} All chunks joined in their original order.
 */
function flatten(listOfSamples) {
  // First pass: total number of samples, so the output is allocated once.
  const total = listOfSamples.reduce((sum, chunk) => sum + chunk.length, 0);
  const merged = new Int16Array(total);

  // Second pass: copy each chunk right after the previous one.
  let writePos = 0;
  for (const chunk of listOfSamples) {
    merged.set(chunk, writePos);
    writePos += chunk.length;
  }
  return merged;
}
  249 +
// Adapted from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
/**
 * Wrap 16-bit mono PCM samples in a WAV (RIFF) container.
 *
 * The sample rate written to the header is the file-level
 * `expectedSampleRate`.
 *
 * @param samples Indexable sequence of 16-bit integer samples (each value is
 *     stored with `DataView.setInt16`).
 * @returns {Blob} A blob of type 'audio/wav': 44-byte header + sample data.
 */
function toWav(samples) {
  const bytesPerSample = 2;
  const dataSize = samples.length * bytesPerSample;

  const buf = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buf);

  // Header layout: http://soundfile.sapp.org/doc/WaveFormat/
  // Four-character tags are written as little-endian 32-bit integers, so the
  // hex constants spell the tag backwards (0x46464952 -> "RIFF").
  view.setUint32(0, 0x46464952, true);     // chunkID "RIFF"
  view.setUint32(4, 36 + dataSize, true);  // chunkSize
  view.setUint32(8, 0x45564157, true);     // format "WAVE"
  view.setUint32(12, 0x20746d66, true);    // subchunk1ID "fmt "
  view.setUint32(16, 16, true);            // subchunk1Size, 16 for PCM
  // audioFormat is a 16-bit field. A 32-bit write here would also overwrite
  // numChannels at offset 22.
  view.setUint16(20, 1, true);                                    // audioFormat, 1 for PCM
  view.setUint16(22, 1, true);                                    // numChannels: 1 channel
  view.setUint32(24, expectedSampleRate, true);                   // sampleRate
  view.setUint32(28, expectedSampleRate * bytesPerSample, true);  // byteRate
  view.setUint16(32, bytesPerSample, true);                       // blockAlign
  view.setUint16(34, 16, true);                                   // bitsPerSample
  view.setUint32(36, 0x61746164, true);                           // subchunk2ID "data"
  view.setUint32(40, dataSize, true);                             // subchunk2Size

  // Sample data follows the 44-byte header, little-endian.
  let offset = 44;
  for (let i = 0; i < samples.length; ++i) {
    view.setInt16(offset, samples[i], true);
    offset += bytesPerSample;
  }

  return new Blob([view], {type: 'audio/wav'});
}
  283 +
// Adapted from
// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46
/**
 * Resample `buffer` from the file-level `recordSampleRate` to
 * `exportSampleRate` by averaging each window of input samples that maps to
 * one output sample.
 *
 * @param buffer Indexable sequence of numeric samples (e.g. a Float32Array).
 * @param exportSampleRate Target sample rate, in the same unit as
 *     `recordSampleRate`.
 * @returns The input `buffer` itself when the two rates are equal; otherwise
 *     a new Float32Array of `Math.round(buffer.length / ratio)` samples.
 */
function downsampleBuffer(buffer, exportSampleRate) {
  if (exportSampleRate === recordSampleRate) {
    return buffer;
  }

  const sampleRateRatio = recordSampleRate / exportSampleRate;
  const newLength = Math.round(buffer.length / sampleRateRatio);
  const result = new Float32Array(newLength);

  let offsetBuffer = 0;
  for (let offsetResult = 0; offsetResult < newLength; ++offsetResult) {
    const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
    const end = Math.min(nextOffsetBuffer, buffer.length);

    let accum = 0;
    let count = 0;
    for (let i = offsetBuffer; i < end; ++i) {
      accum += buffer[i];
      ++count;
    }

    if (count > 0) {
      result[offsetResult] = accum / count;
    } else {
      // The window is empty when the ratio is below 1 (export rate higher
      // than record rate). Dividing 0 by 0 would store NaN, so reuse the
      // nearest available input sample instead.
      result[offsetResult] = buffer[Math.min(offsetBuffer, buffer.length - 1)];
    }

    offsetBuffer = nextOffsetBuffer;
  }
  return result;
}
  1 +# Introduction
  2 +
  3 +Please download
  4 +https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  5 +and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`.
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width" />
  <title>Next-gen Kaldi WebAssembly with sherpa-onnx for VAD</title>
  <style>
    h1,div {
      text-align: center;
    }
    textarea {
      width:100%;
    }
  </style>
</head>

<body>
  <h1>
    Next-gen Kaldi + WebAssembly<br/>
    VAD Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a><br/>
    (with <a href="https://github.com/snakers4/silero-vad">silero-vad</a>)
  </h1>

  <div>
    <!-- Status line; initially shows the loading message. -->
    <span id="hint">Loading model ... ...</span>
    <br/>
    <br/>
    <!-- Start/Stop are disabled in the markup; the page scripts toggle them. -->
    <button id="startBtn" disabled>Start</button>
    <button id="stopBtn" disabled>Stop</button>
    <button id="clearBtn">Clear</button>
    <br/>
    <br/>
    <textarea id="results" rows="10" readonly></textarea>
  </div>

  <!-- Container for the recorded sound clips. -->
  <section flex="1" overflow="auto" id="sound-clips">
  </section>

  <!-- NOTE(review): the wasm loader is deliberately listed last, presumably so
       the two scripts above are evaluated before it runs; confirm before
       reordering. -->
  <script src="sherpa-onnx-vad.js"></script>
  <script src="app-vad.js"></script>
  <script src="sherpa-onnx-wasm-main-vad.js"></script>
</body>

</html>
/**
 * Release the heap allocations referenced by a config descriptor.
 *
 * Frees, in this order: the descriptor's own `buffer` (if it has one), the
 * nested `sileroVad` descriptor (if it has one, recursively), and finally
 * the descriptor's `ptr`.
 *
 * @param config Descriptor with `ptr` and optionally `buffer` / `sileroVad`.
 * @param Module Object providing `_free`.
 */
function freeConfig(config, Module) {
  const ownsBuffer = 'buffer' in config;
  if (ownsBuffer) {
    Module._free(config.buffer);
  }

  const hasNestedConfig = 'sileroVad' in config;
  if (hasNestedConfig) {
    freeConfig(config.sileroVad, Module);
  }

  Module._free(config.ptr);
}
  13 +
/**
 * Serialize a silero-vad model config into the heap accessed via `Module`.
 *
 * Writes five consecutive 4-byte fields at `ptr`:
 * model (pointer to a UTF-8 string), threshold, minSilenceDuration,
 * minSpeechDuration (floats) and windowSize (i32). Falsy or missing fields
 * fall back to '', 0.5, 0.5, 0.25 and 512 respectively.
 *
 * The user should free the returned pointers (`buffer` and `ptr`).
 *
 * @param config Object with optional `model`, `threshold`,
 *     `minSilenceDuration`, `minSpeechDuration`, `windowSize`.
 * @param Module Object providing `lengthBytesUTF8`, `stringToUTF8`,
 *     `_malloc` and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number}} `buffer` holds the
 *     model string, `ptr` the serialized fields, `len` their byte size.
 */
function initSherpaOnnxSileroVadModelConfig(config, Module) {
  const model = config.model || '';

  // +1 for the trailing NUL byte written by stringToUTF8.
  const modelLen = Module.lengthBytesUTF8(model) + 1;
  const buffer = Module._malloc(modelLen);

  const len = 5 * 4;
  const ptr = Module._malloc(len);

  Module.stringToUTF8(model, buffer, modelLen);

  // Declared locally: an undeclared `offset` leaks a global variable and
  // throws a ReferenceError in strict mode / ES modules.
  let offset = 0;
  Module.setValue(ptr, buffer, 'i8*');
  offset += 4;

  Module.setValue(ptr + offset, config.threshold || 0.5, 'float');
  offset += 4;

  Module.setValue(ptr + offset, config.minSilenceDuration || 0.5, 'float');
  offset += 4;

  Module.setValue(ptr + offset, config.minSpeechDuration || 0.25, 'float');
  offset += 4;

  Module.setValue(ptr + offset, config.windowSize || 512, 'i32');
  offset += 4;

  return {
    buffer: buffer, ptr: ptr, len: len,
  };
}
  47 +
/**
 * Serialize a VAD model config into the heap accessed via `Module`.
 *
 * Layout written at `ptr`: the serialized silero-vad config (copied with
 * `Module._CopyHeap`), followed by four 4-byte fields: sampleRate (i32),
 * numThreads (i32), provider (pointer to a UTF-8 string) and debug (i32).
 * Falsy or missing fields fall back to 16000, 1, 'cpu' and 0.
 *
 * If `config` has no `sileroVad` key, a default one is stored on `config`.
 *
 * @param config Object with optional `sileroVad`, `sampleRate`,
 *     `numThreads`, `provider`, `debug`.
 * @param Module Object providing `_malloc`, `_CopyHeap`, `setValue`,
 *     `lengthBytesUTF8` and `stringToUTF8`.
 * @returns {{buffer, ptr, len, sileroVad}} `buffer` holds the provider
 *     string, `ptr` the serialized struct of `len` bytes, and `sileroVad`
 *     the descriptor of the nested config.
 */
function initSherpaOnnxVadModelConfig(config, Module) {
  if (!('sileroVad' in config)) {
    config.sileroVad = {
      model: '',
      threshold: 0.50,
      minSilenceDuration: 0.50,
      minSpeechDuration: 0.25,
      windowSize: 512,
    };
  }

  const sileroVad =
      initSherpaOnnxSileroVadModelConfig(config.sileroVad, Module);

  const len = sileroVad.len + 4 * 4;
  const ptr = Module._malloc(len);

  const provider = config.provider || 'cpu';
  const providerLen = Module.lengthBytesUTF8(provider) + 1;  // +1 for NUL
  const buffer = Module._malloc(providerLen);
  Module.stringToUTF8(provider, buffer, providerLen);

  // The nested config occupies the first `sileroVad.len` bytes.
  Module._CopyHeap(sileroVad.ptr, sileroVad.len, ptr);

  // The remaining fields follow, 4 bytes each, in declaration order.
  const fields = [
    [config.sampleRate || 16000, 'i32'],
    [config.numThreads || 1, 'i32'],
    [buffer, 'i8*'],  // provider
    [config.debug || 0, 'i32'],
  ];
  let cursor = ptr + sileroVad.len;
  for (const [value, type] of fields) {
    Module.setValue(cursor, value, type);
    cursor += 4;
  }

  return {buffer: buffer, ptr: ptr, len: len, sileroVad: sileroVad};
}
  89 +
/**
 * Create a Vad instance.
 *
 * @param Module Object passed through to the Vad constructor.
 * @param myConfig Optional config. When truthy it is used as-is (it is NOT
 *     merged with the defaults); otherwise the built-in defaults are used.
 * @returns {Vad}
 */
function createVad(Module, myConfig) {
  const defaultConfig = {
    sileroVad: {
      model: './silero_vad.onnx',
      threshold: 0.50,
      minSilenceDuration: 0.50,
      minSpeechDuration: 0.25,
      windowSize: 512,
    },
    sampleRate: 16000,
    numThreads: 1,
    provider: 'cpu',
    debug: 1,
    bufferSizeInSeconds: 30,
  };

  return new Vad(myConfig ? myConfig : defaultConfig, Module);
}
  114 +
  115 +
/**
 * Thin JavaScript wrapper around the `_SherpaOnnx*CircularBuffer*` functions
 * exposed on `Module`. All state lives behind `this.handle`.
 */
class CircularBuffer {
  /**
   * @param capacity Forwarded to `_SherpaOnnxCreateCircularBuffer`.
   * @param Module Object exposing the exported functions and `HEAPF32`.
   */
  constructor(capacity, Module) {
    this.handle = Module._SherpaOnnxCreateCircularBuffer(capacity);
    this.Module = Module;
  }

  /** Destroy the underlying buffer and zero the handle. */
  free() {
    const {Module, handle} = this;
    Module._SherpaOnnxDestroyCircularBuffer(handle);
    this.handle = 0;
  }

  /**
   * Append samples to the buffer.
   *
   * @param samples {Float32Array}
   */
  push(samples) {
    const M = this.Module;
    const bytesPerSample = samples.BYTES_PER_ELEMENT;

    // Stage the samples in a temporary heap allocation for the call.
    const staging = M._malloc(samples.length * bytesPerSample);
    M.HEAPF32.set(samples, staging / bytesPerSample);
    M._SherpaOnnxCircularBufferPush(this.handle, staging, samples.length);
    M._free(staging);
  }

  /**
   * Copy `n` samples starting at `startIndex` out of the buffer.
   *
   * @returns {Float32Array} A copy that stays valid after the heap block
   *     returned by the C side has been freed.
   */
  get(startIndex, n) {
    const M = this.Module;
    const p = M._SherpaOnnxCircularBufferGet(this.handle, startIndex, n);

    // `p` is a byte address; HEAPF32 is indexed in 4-byte floats.
    const base = p / 4;
    const samples = Float32Array.from({length: n}, (_, i) => M.HEAPF32[base + i]);

    M._SherpaOnnxCircularBufferFree(p);

    return samples;
  }

  /** Forward `n` to `_SherpaOnnxCircularBufferPop`. */
  pop(n) {
    this.Module._SherpaOnnxCircularBufferPop(this.handle, n);
  }

  /** @returns the value of `_SherpaOnnxCircularBufferSize`. */
  size() {
    return this.Module._SherpaOnnxCircularBufferSize(this.handle);
  }

  /** @returns the value of `_SherpaOnnxCircularBufferHead`. */
  head() {
    return this.Module._SherpaOnnxCircularBufferHead(this.handle);
  }

  /** Forward to `_SherpaOnnxCircularBufferReset`. */
  reset() {
    this.Module._SherpaOnnxCircularBufferReset(this.handle);
  }
}
  170 +
/**
 * Thin JavaScript wrapper around the `_SherpaOnnx*VoiceActivityDetector*`
 * functions exposed on `Module`. All state lives behind `this.handle`.
 */
class Vad {
  /**
   * @param configObj JavaScript config object; kept as `this.config`.
   *     `bufferSizeInSeconds` defaults to 30 when falsy or missing.
   * @param Module Object exposing the exported functions and heap views.
   */
  constructor(configObj, Module) {
    this.config = configObj;

    // The serialized config is only needed while the detector is created;
    // it is released right after.
    const serialized = initSherpaOnnxVadModelConfig(configObj, Module);
    Module._MyPrint(serialized.ptr);
    const detector = Module._SherpaOnnxCreateVoiceActivityDetector(
        serialized.ptr, configObj.bufferSizeInSeconds || 30);
    freeConfig(serialized, Module);

    this.handle = detector;
    this.Module = Module;
  }

  /** Destroy the underlying detector and zero the handle. */
  free() {
    const {Module, handle} = this;
    Module._SherpaOnnxDestroyVoiceActivityDetector(handle);
    this.handle = 0;
  }

  /**
   * Feed samples to the detector.
   *
   * @param samples {Float32Array}
   */
  acceptWaveform(samples) {
    const M = this.Module;
    const bytesPerSample = samples.BYTES_PER_ELEMENT;

    // Stage the samples in a temporary heap allocation for the call.
    const staging = M._malloc(samples.length * bytesPerSample);
    M.HEAPF32.set(samples, staging / bytesPerSample);
    M._SherpaOnnxVoiceActivityDetectorAcceptWaveform(
        this.handle, staging, samples.length);
    M._free(staging);
  }

  /** @returns {boolean} true when `...DetectorEmpty` returns 1. */
  isEmpty() {
    const flag = this.Module._SherpaOnnxVoiceActivityDetectorEmpty(this.handle);
    return flag == 1;
  }

  /** @returns {boolean} true when `...DetectorDetected` returns 1. */
  isDetected() {
    const flag =
        this.Module._SherpaOnnxVoiceActivityDetectorDetected(this.handle);
    return flag == 1;
  }

  /** Forward to `_SherpaOnnxVoiceActivityDetectorPop`. */
  pop() {
    this.Module._SherpaOnnxVoiceActivityDetectorPop(this.handle);
  }

  /** Forward to `_SherpaOnnxVoiceActivityDetectorClear`. */
  clear() {
    this.Module._SherpaOnnxVoiceActivityDetectorClear(this.handle);
  }

  /**
   * Copy the front speech segment out of the heap.
   *
   * @returns {{samples: Float32Array, start: number}} `samples` is a 1-d
   *     float32 array, `start` an int32.
   */
  front() {
    const M = this.Module;
    const h = M._SherpaOnnxVoiceActivityDetectorFront(this.handle);

    // The segment is read as three consecutive 32-bit words at `h`:
    // start, pointer to the samples, number of samples.
    const word = h / 4;
    const start = M.HEAP32[word];
    const base = M.HEAP32[word + 1] / 4;  // byte address -> float index
    const numSamples = M.HEAP32[word + 2];

    const samples =
        Float32Array.from({length: numSamples}, (_, i) => M.HEAPF32[base + i]);

    M._SherpaOnnxDestroySpeechSegment(h);
    return {samples: samples, start: start};
  }

  /** Forward to `_SherpaOnnxVoiceActivityDetectorReset`. */
  reset() {
    this.Module._SherpaOnnxVoiceActivityDetectorReset(this.handle);
  }

  /** Forward to `_SherpaOnnxVoiceActivityDetectorFlush`. */
  flush() {
    this.Module._SherpaOnnxVoiceActivityDetectorFlush(this.handle);
  }
}
  246 +
  247 +if (typeof process == 'object' && typeof process.versions == 'object' &&
  248 + typeof process.versions.node == 'string') {
  249 + module.exports = {
  250 + createVad,
  251 + CircularBuffer,
  252 + };
  253 +}
  1 +// wasm/sherpa-onnx-wasm-main-vad.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +#include <stdio.h>
  5 +
  6 +#include <algorithm>
  7 +#include <memory>
  8 +
  9 +#include "sherpa-onnx/c-api/c-api.h"
  10 +
  11 +// see also
  12 +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html
  13 +
  14 +extern "C" {
  15 +
  16 +static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, "");
  17 +
  18 +static_assert(sizeof(SherpaOnnxVadModelConfig) ==
  19 + sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4,
  20 + "");
  21 +void MyPrint(SherpaOnnxVadModelConfig *config) {
  22 + auto silero_vad = &config->silero_vad;
  23 +
  24 + fprintf(stdout, "----------silero_vad config----------\n");
  25 + fprintf(stdout, "model: %s\n", silero_vad->model);
  26 + fprintf(stdout, "threshold: %.3f\n", silero_vad->threshold);
  27 + fprintf(stdout, "min_silence_duration: %.3f\n",
  28 + silero_vad->min_silence_duration);
  29 + fprintf(stdout, "min_speech_duration: %.3f\n",
  30 + silero_vad->min_speech_duration);
  31 + fprintf(stdout, "window_size: %d\n", silero_vad->window_size);
  32 +
  33 + fprintf(stdout, "----------config----------\n");
  34 +
  35 + fprintf(stdout, "sample_rate: %d\n", config->sample_rate);
  36 + fprintf(stdout, "num_threads: %d\n", config->num_threads);
  37 +
  38 + fprintf(stdout, "provider: %s\n", config->provider);
  39 + fprintf(stdout, "debug: %d\n", config->debug);
  40 +}
  41 +
// Copy `num_bytes` bytes from `src` to `dst`.
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
  std::copy_n(src, num_bytes, dst);
}
  45 +}