Fangjun Kuang
Committed by GitHub

Add JavaScript (WebAssembly) API for ten-vad (#2382)

Add support for the ten-vad model alongside silero-vad in the WebAssembly VAD API, update the UI and documentation, and extend examples and CI workflows to handle the new model.

- Extend C++ bindings and printing logic to include ten-vad configuration.
- Implement JavaScript init/free routines and runtime detection for ten-vad.
- Update UI layout, README assets, example scripts, and CI workflow to support ten-vad.
# Workflow that builds the WebAssembly VAD demo with the ten-vad model.
name: wasm-simd-hf-space-ten-vad
# Triggers: pushes to the `wasm` / `wasm-ten-vad` branches, pushes of tags
# matching the version pattern below, and manual dispatch.
on:
push:
branches:
- wasm
- wasm-ten-vad
tags:
- 'v[0-9]+.[0-9]+.[0-9]+*'
workflow_dispatch:
# One concurrency group per git ref; cancel-in-progress is enabled for it.
concurrency:
group: wasm-simd-hf-space-ten-vad-${{ github.ref }}
cancel-in-progress: true
jobs:
wasm-simd-hf-space-ten-vad:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
# The matrix currently lists a single runner image.
matrix:
os: [ubuntu-latest]
steps:
# Check out the repository with fetch-depth 0.
- uses: actions/checkout@v4
with:
fetch-depth: 0
# Run the repository's new-release.sh script and print the resulting diff.
- name: Update version
shell: bash
run: |
./new-release.sh
git diff .
# Install the Emscripten SDK, pinned to version 3.1.53, using the
# 'emsdk-cache' folder as the action cache.
- name: Install emsdk
uses: mymindstorm/setup-emsdk@v14
with:
version: 3.1.53
actions-cache-folder: 'emsdk-cache'
# Print the emcc version and run its self-check so the log records the toolchain.
- name: View emsdk version
shell: bash
run: |
emcc -v
echo "--------------------"
emcc --check
# Download ten-vad.onnx into wasm/vad/assets, then rewrite the "(with <a ...)"
# line of index.html one directory up so that it links to ten-vad.
# The double quotes inside the sed expression are backslash-escaped: left
# unescaped they would close and reopen the shell string, and the href value
# would be written into index.html without its surrounding quotes.
- name: Download model files
shell: bash
run: |
cd wasm/vad/assets
ls -lh
echo "----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
ls -lh
cd ..
sed -i.bak "s|.*(with <a .*| (with <a href=\"https://github.com/TEN-framework/ten-vad\">ten-vad</a>)|" ./index.html
git diff .
# Run the repository's wasm VAD build script.
- name: Build sherpa-onnx for WebAssembly
shell: bash
run: |
./build-wasm-simd-vad.sh
# Read the version string from CMakeLists.txt, move the installed wasm/vad
# directory to a versioned "...-ten-vad" directory, and pack it as .tar.bz2.
- name: collect files
shell: bash
run: |
SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-ten-vad
mv build-wasm-simd-vad/install/bin/wasm/vad $dst
ls -lh $dst
tar cjfv $dst.tar.bz2 ./$dst
# Upload the tarball(s) as a workflow artifact.
- name: Upload wasm files
uses: actions/upload-artifact@v4
with:
name: sherpa-onnx-wasm-simd-ten-vad
path: ./sherpa-onnx-wasm-simd-*.tar.bz2
# Attach the tarball(s) to a release. Runs only for tag pushes in
# repositories owned by csukuangfj or k2-fsa.
- name: Release
if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
overwrite: true
file: ./*.tar.bz2
# Push the built demo to the ModelScope studio repository. The command is run
# through the retry action: up to 20 attempts, 200 seconds each.
# NOTE(review): this step has no active `if:` guard (only the commented-out
# one below), so it is not restricted to tag pushes like the Release step --
# confirm that is intended.
- name: Publish to ModelScope
# if: false
env:
MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }}
uses: nick-fields/retry@v2
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf ms
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://www.modelscope.cn/studios/csukuangfj/web-assembly-ten-vad-sherpa-onnx.git ms
cd ms
rm -fv *.js
rm -fv *.data
git fetch
git pull
git merge -m "merge remote" --ff origin main
cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-ten-vad/* .
git status
git lfs track "*.data"
git lfs track "*.wasm"
ls -lh
git add .
git commit -m "update model"
git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/csukuangfj/web-assembly-ten-vad-sherpa-onnx.git
# Push the built demo to the Hugging Face space. Same sequence as the
# ModelScope step: clone, delete the old *.js / *.data files, copy the new
# build, track *.data and *.wasm with git-lfs, commit, and push with HF_TOKEN.
- name: Publish to huggingface
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v2
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://huggingface.co/spaces/k2-fsa/web-assembly-ten-vad-sherpa-onnx huggingface
cd huggingface
rm -fv *.js
rm -fv *.data
git fetch
git pull
git merge -m "merge remote" --ff origin main
cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-ten-vad/* .
git status
git lfs track "*.data"
git lfs track "*.wasm"
ls -lh
git add .
git commit -m "update model"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-ten-vad-sherpa-onnx main
... ...
... ... @@ -26,6 +26,15 @@ function createRecognizer() {
function createVad() {
// please download silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// please download ten-vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
//
// You only need one vad
//
// To use ten-vad.onnx, please set sileroVad.model to ''
// and set tenVad.model to 'ten-vad.onnx'
//
const config = {
sileroVad: {
model: './silero_vad.onnx',
... ... @@ -35,12 +44,22 @@ function createVad() {
maxSpeechDuration: 5,
windowSize: 512,
},
tenVad: {
// model: './ten-vad.onnx',
model: '',
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
maxSpeechDuration: 5,
windowSize: 256,
},
sampleRate: 16000,
debug: true,
numThreads: 1,
bufferSizeInSeconds: 60,
};
return sherpa_onnx.createVad(config);
}
... ... @@ -60,7 +79,11 @@ if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
console.log('Started')
let start = Date.now();
const windowSize = vad.config.sileroVad.windowSize;
let windowSize = vad.config.sileroVad.windowSize;
if (vad.config.tenVad.model != '') {
windowSize = vad.config.tenVad.windowSize;
}
for (let i = 0; i < wave.samples.length; i += windowSize) {
const thisWindow = wave.samples.subarray(i, i + windowSize);
vad.acceptWaveform(thisWindow);
... ...
... ... @@ -59,7 +59,7 @@ rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
cd ../
sed -i.bak s/"type = 0"/"type = 1"/g ./sherpa-onnx.js
sed -i.bak s/"type = 0"/"type = 1"/g ./sherpa-onnx-asr.js
sed -i.bak s/Zipformer/Paraformer/g ./index.html
cd ../..
... ...
... ... @@ -69,8 +69,6 @@ function fileExists(filename) {
return exists;
}
// Empty stub: declared with no parameters and an empty body.
function createOfflineRecognizerSenseVoice() {}
function initOfflineRecognizer() {
let config = {
modelConfig: {
... ...
... ... @@ -2,7 +2,7 @@ if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH})
message(FATAL_ERROR "Please use ./build-wasm-simd-vad.sh to build for wasm VAD")
endif()
if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/silero_vad.onnx")
if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/silero_vad.onnx" AND NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/ten-vad.onnx" )
message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue")
endif()
... ... @@ -30,6 +30,7 @@ set(exported_functions
SherpaOnnxVoiceActivityDetectorReset
SherpaOnnxVoiceActivityDetectorFlush
#
SherpaOnnxFileExists
)
set(mangled_exported_functions)
foreach(x IN LISTS exported_functions)
... ...
... ... @@ -5,7 +5,6 @@
const startBtn = document.getElementById('startBtn');
const stopBtn = document.getElementById('stopBtn');
const clearBtn = document.getElementById('clearBtn');
const hint = document.getElementById('hint');
const soundClips = document.getElementById('sound-clips');
let textArea = document.getElementById('results');
... ... @@ -43,19 +42,98 @@ function getDisplayResult() {
Module = {};
// Module.locateFile hook: log the request, then return the script directory
// joined with the requested path.
// See https://emscripten.org/docs/api_reference/module.html#Module.locateFile
Module.locateFile = function(path, scriptDirectory = '') {
  const message = `path: ${path}, scriptDirectory: ${scriptDirectory}`;
  console.log(message);

  const resolved = scriptDirectory + path;
  return resolved;
};
// Module.setStatus hook: mirror the status text into the #status element and
// hide or show the `.tab-content` elements accordingly.
//  - 'Running...' is displayed as 'Model downloaded. Initializing vad...'.
//  - An empty status hides #status and removes the `loading` class from every
//    `.tab-content` element; any other status shows #status and adds the class.
Module.setStatus = function(status) {
  console.log(`status ${status}`);

  const statusBox = document.getElementById('status');
  const shownText =
      (status == 'Running...') ? 'Model downloaded. Initializing vad...' : status;
  statusBox.textContent = shownText;

  const isDone = shownText === '';
  statusBox.style.display = isDone ? 'none' : 'block';

  document.querySelectorAll('.tab-content').forEach((pane) => {
    if (isDone) {
      pane.classList.remove('loading');
    } else {
      pane.classList.add('loading');
    }
  });
};
// Module.onRuntimeInitialized hook: updates the hint text, enables the Start
// button, creates the VAD and allocates the circular sample buffer.
Module.onRuntimeInitialized = function() {
console.log('inited!');
hint.innerText = 'Model loaded! Please click start';
startBtn.disabled = false;
// NOTE(review): `vad` is assigned here and then assigned again inside
// initVad() on the next line -- confirm both calls are intended.
vad = createVad(Module);
initVad();
console.log('vad is created!', vad);
// 30 * 16000: presumably 30 seconds of samples at 16 kHz -- verify against
// CircularBuffer's constructor.
buffer = new CircularBuffer(30 * 16000, Module);
console.log('CircularBuffer is created!', buffer);
};
// Ask the wasm module whether `filename` exists.
//
// The name is copied into a temporary heap allocation as UTF-8, passed to
// Module._SherpaOnnxFileExists, and the allocation is released before
// returning. Returns whatever the wasm function returned; callers in this
// file compare the result with 1.
function fileExists(filename) {
  // +1 so the byte budget handed to stringToUTF8 has room for the terminator.
  const byteCount = Module.lengthBytesUTF8(filename) + 1;
  const namePtr = Module._malloc(byteCount);

  Module.stringToUTF8(filename, namePtr, byteCount);
  const result = Module._SherpaOnnxFileExists(namePtr);

  Module._free(namePtr);
  return result;
}
// Build the VAD configuration and create the detector, storing it in `vad`.
//
// Both model entries start with an empty model path. 'silero_vad.onnx' is
// probed first and 'ten-vad.onnx' only if the first probe does not return 1;
// the first file found is written into the matching entry. If neither file
// is found, both model paths stay empty.
function initVad() {
  // The two model configs differ only in their window size.
  function makeModelConfig(windowSize) {
    return {
      model: '',
      threshold: 0.50,
      minSilenceDuration: 0.50,
      minSpeechDuration: 0.25,
      maxSpeechDuration: 20,
      windowSize: windowSize,
    };
  }

  const config = {
    sileroVad: makeModelConfig(512),
    tenVad: makeModelConfig(256),
    sampleRate: 16000,
    numThreads: 1,
    provider: 'cpu',
    debug: 1,
    bufferSizeInSeconds: 30,
  };

  // Probe order matters: silero-vad wins when both files are present.
  const candidates = [
    ['sileroVad', 'silero_vad.onnx'],
    ['tenVad', 'ten-vad.onnx'],
  ];
  for (const [key, filename] of candidates) {
    if (fileExists(filename) == 1) {
      config[key].model = filename;
      break;
    }
  }

  vad = createVad(Module, config);
}
let audioCtx;
let mediaStream;
... ...
# Introduction
## Use silero-vad
Please download
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`.
You can find an example build script at
https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-silero-vad.yaml
```
cd /path/to/sherpa-onnx/wasm/vad/assets
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
```
## Use ten-vad
Please download
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
and put `ten-vad.onnx` into the current directory, i.e., `wasm/vad/assets`.
You can find an example build script at
https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-ten-vad.yaml
```
cd /path/to/sherpa-onnx/wasm/vad/assets
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
cd ..
sed -i.bak "s|.*(with <a .*| (with <a href="https://github.com/TEN-framework/ten-vad">ten-vad</a>)|" ./index.html
```
... ...
... ... @@ -11,30 +11,67 @@
textarea {
width:100%;
}
.loading {
display: none !important;
}
</style>
</head>
<body>
<body style="font-family: 'Source Sans Pro', sans-serif; background-color: #f9fafb; color: #333; display: flex; flex-direction: column; align-items: center; height: 100vh; margin: 0;">
<h1>
Next-gen Kaldi + WebAssembly<br/>
VAD Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a><br/>
VAD Demo using <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a><br/>
(with <a href="https://github.com/snakers4/silero-vad">silero-vad</a>)
</h1>
<div>
<span id="hint">Loading model ... ...</span>
<br/>
<br/>
<button id="startBtn" disabled>Start</button>
<button id="stopBtn" disabled>Stop</button>
<button id="clearBtn">Clear</button>
<br/>
<br/>
<textarea id="results" rows="10" readonly></textarea>
<div style="width: 100%; max-width: 900px; background: #fff; padding: 1.5rem; border-radius: 8px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); flex: 1;">
<div id="status">Loading...</div>
<div id="singleAudioContent" class="tab-content loading">
<div style="display: flex; gap: 1.5rem;">
<div style="flex: 1; display: flex; flex-direction: row; align-items: center; gap: 1rem;">
<button id="startBtn" disabled>Start</button>
<button id="stopBtn" disabled>Stop</button>
<button id="clearBtn">Clear</button>
</div>
</div>
<div style="flex: 1; display: flex; flex-direction: column; gap: 1rem;">
<textarea id="results" rows="10" placeholder="Please click start and speak. Output will appear here..." readonly style="flex: 1; padding: 0.75rem; font-size: 1rem; border: 1px solid #ced4da; border-radius: 8px; resize: none; background-color: #f8f9fa;"></textarea>
</div>
<section flex="1" overflow="auto" id="sound-clips">
</section>
</div>
<section flex="1" overflow="auto" id="sound-clips">
</section>
<!-- Footer Section -->
<div style="width: 100%; max-width: 900px; margin-top: 1.5rem; background: #fff; padding: 1.5rem; border-radius: 8px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); text-align: left; font-size: 0.9rem; color: #6c757d;">
<h3>Description</h3>
<ul>
<li>Everything is <strong>open-sourced.</strong> <a href="https://github.com/k2-fsa/sherpa-onnx">code</a></li>
<li>If you have any issues, please either <a href="https://github.com/k2-fsa/sherpa-onnx/issues">file a ticket</a> or contact us via</li>
<ul>
<li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#wechat">WeChat group</a></li>
<li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#qq">QQ group</a></li>
<li><a href="https://k2-fsa.github.io/sherpa/social-groups.html#bilibili-b">Bilibili</a></li>
</ul>
</ul>
<h3>About This Demo</h3>
<ul>
<li><strong>Private and Secure:</strong> All processing is done locally on your device (CPU) within your browser with a single thread. No server is involved, ensuring privacy and security. You can disconnect from the Internet once this page is loaded.</li>
<li><strong>Efficient Resource Usage:</strong> No GPU is required, leaving system resources available for webLLM analysis.</li>
</ul>
<h3>Latest Update</h3>
<ul>
<li>Update UI.</li>
<li>First working version.</li>
</ul>
<h3>Acknowledgement</h3>
<ul>
<li>We refer to <a href="https://huggingface.co/spaces/Banafo/Kroko-Streaming-ASR-Wasm">https://huggingface.co/spaces/Banafo/Kroko-Streaming-ASR-Wasm</a> for the UI part.</li>
</ul>
</div>
<script src="sherpa-onnx-vad.js"></script>
<script src="app-vad.js"></script>
... ...
... ... @@ -7,6 +7,10 @@ function freeConfig(config, Module) {
freeConfig(config.sileroVad, Module)
}
if ('tenVad' in config) {
freeConfig(config.tenVad, Module)
}
Module._free(config.ptr);
}
... ... @@ -48,6 +52,42 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) {
}
}
// Serialize a ten-vad model config into wasm heap memory.
//
// Two allocations are made with Module._malloc:
//  - `buffer`: the UTF-8 model path, including its terminator byte;
//  - `ptr`: a 6 * 4 byte record written as six consecutive 4-byte slots:
//    pointer to the model path, threshold, minSilenceDuration,
//    minSpeechDuration, windowSize (i32) and maxSpeechDuration.
//
// Missing or falsy fields (including 0 and '') fall back to the defaults
// '', 0.5, 0.5, 0.25, 256 and 20 respectively.
//
// @param config  Object with optional fields model, threshold,
//                minSilenceDuration, minSpeechDuration, windowSize,
//                maxSpeechDuration.
// @param Module  The Emscripten module providing _malloc, lengthBytesUTF8,
//                stringToUTF8 and setValue.
// @returns {buffer, ptr, len}; nothing is freed here, so the caller is
//          responsible for releasing both allocations.
function initSherpaOnnxTenVadModelConfig(config, Module) {
  const model = config.model || '';

  // +1 so the byte budget handed to stringToUTF8 has room for the terminator.
  const modelLen = Module.lengthBytesUTF8(model) + 1;
  const buffer = Module._malloc(modelLen);

  const len = 6 * 4;
  const ptr = Module._malloc(len);

  Module.stringToUTF8(model, buffer, modelLen);

  // Declared locally: a bare `offset = 0` would create an implicit global and
  // throws a ReferenceError in strict mode / ES modules.
  let offset = 0;
  Module.setValue(ptr + offset, buffer, 'i8*');
  offset += 4;

  Module.setValue(ptr + offset, config.threshold || 0.5, 'float');
  offset += 4;

  Module.setValue(ptr + offset, config.minSilenceDuration || 0.5, 'float');
  offset += 4;

  Module.setValue(ptr + offset, config.minSpeechDuration || 0.25, 'float');
  offset += 4;

  Module.setValue(ptr + offset, config.windowSize || 256, 'i32');
  offset += 4;

  Module.setValue(ptr + offset, config.maxSpeechDuration || 20, 'float');
  offset += 4;

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
function initSherpaOnnxVadModelConfig(config, Module) {
if (!('sileroVad' in config)) {
config.sileroVad = {
... ... @@ -60,10 +100,23 @@ function initSherpaOnnxVadModelConfig(config, Module) {
};
}
if (!('tenVad' in config)) {
config.tenVad = {
model: '',
threshold: 0.50,
minSilenceDuration: 0.50,
minSpeechDuration: 0.25,
windowSize: 256,
maxSpeechDuration: 20,
};
}
const sileroVad =
initSherpaOnnxSileroVadModelConfig(config.sileroVad, Module);
const len = sileroVad.len + 4 * 4;
const tenVad = initSherpaOnnxTenVadModelConfig(config.tenVad, Module);
const len = sileroVad.len + 4 * 4 + tenVad.len;
const ptr = Module._malloc(len);
const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
... ... @@ -86,8 +139,11 @@ function initSherpaOnnxVadModelConfig(config, Module) {
Module.setValue(ptr + offset, config.debug || 0, 'i32');
offset += 4;
Module._CopyHeap(tenVad.ptr, tenVad.len, ptr + offset);
offset += tenVad.len;
return {
buffer: buffer, ptr: ptr, len: len, sileroVad: sileroVad,
buffer: buffer, ptr: ptr, len: len, sileroVad: sileroVad, tenVad: tenVad
}
}
... ... @@ -101,8 +157,18 @@ function createVad(Module, myConfig) {
windowSize: 512,
};
const tenVad = {
model: '',
threshold: 0.50,
minSilenceDuration: 0.50,
minSpeechDuration: 0.25,
maxSpeechDuration: 20,
windowSize: 256,
};
let config = {
sileroVad: sileroVad,
tenVad: tenVad,
sampleRate: 16000,
numThreads: 1,
provider: 'cpu',
... ...
... ... @@ -14,12 +14,15 @@
extern "C" {
static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 6 * 4, "");
static_assert(sizeof(SherpaOnnxTenVadModelConfig) == 6 * 4, "");
static_assert(sizeof(SherpaOnnxVadModelConfig) ==
sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4,
sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4 +
sizeof(SherpaOnnxTenVadModelConfig),
"");
void MyPrint(SherpaOnnxVadModelConfig *config) {
auto silero_vad = &config->silero_vad;
auto ten_vad = &config->ten_vad;
fprintf(stdout, "----------silero_vad config----------\n");
fprintf(stdout, "model: %s\n", silero_vad->model);
... ... @@ -32,6 +35,15 @@ void MyPrint(SherpaOnnxVadModelConfig *config) {
fprintf(stdout, "max_speech_duration: %.3f\n",
silero_vad->max_speech_duration);
fprintf(stdout, "----------ten_vad config----------\n");
fprintf(stdout, "model: %s\n", ten_vad->model);
fprintf(stdout, "threshold: %.3f\n", ten_vad->threshold);
fprintf(stdout, "min_silence_duration: %.3f\n",
ten_vad->min_silence_duration);
fprintf(stdout, "min_speech_duration: %.3f\n", ten_vad->min_speech_duration);
fprintf(stdout, "window_size: %d\n", ten_vad->window_size);
fprintf(stdout, "max_speech_duration: %.3f\n", ten_vad->max_speech_duration);
fprintf(stdout, "----------config----------\n");
fprintf(stdout, "sample_rate: %d\n", config->sample_rate);
... ...