Fangjun Kuang
Committed by GitHub

Add JavaScript (node-addon) API for ten-vad (#2383)

This PR adds support for the new ten-vad model in both the Node.js addon examples and the HarmonyOS wrapper.

- Introduce TenVadConfig alongside existing SileroVadConfig and extend the VadConfig API.
- Update C++ addon to parse ten-vad parameters and pass them through to the detector.
- Modify the Node.js example scripts so that users can switch between silero-vad and ten-vad, and normalize the generated .wav filenames by replacing ':' with '-'.
export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so";
export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad';
export { CircularBuffer, SileroVadConfig, TenVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad';
export { Samples,
... ...
... ... @@ -294,6 +294,25 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
return c;
}
// Builds a SherpaOnnxTenVadModelConfig from the optional "tenVad" property of
// `obj`.
//
// The struct is zero-filled first, so when `obj` has no "tenVad" property, or
// that property is not an object, the all-zero config is returned as is.
//
// NOTE(review): the SHERPA_ONNX_ASSIGN_ATTR_* macros are defined outside this
// block; they appear to depend on the locals being named `c` and `o` --
// confirm before renaming either one.
// NOTE(review): `model` is presumably a newly allocated C string; the visible
// caller releases it with SHERPA_ONNX_DELETE_C_STR(c.ten_vad.model).
static SherpaOnnxTenVadModelConfig GetTenVadConfig(const Napi::Object &obj) {
SherpaOnnxTenVadModelConfig c;
// Start from an all-zero struct so fields that are not assigned below stay 0.
memset(&c, 0, sizeof(c));
// "tenVad" is optional: bail out with the zeroed config when it is absent or
// not an object.
if (!obj.Has("tenVad") || !obj.Get("tenVad").IsObject()) {
return c;
}
Napi::Object o = obj.Get("tenVad").As<Napi::Object>();
// Each macro takes (C struct field, JS property name); presumably it copies
// o.<property> into c.<field> when the property is present -- TODO confirm
// against the macro definitions.
SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);
return c;
}
static Napi::External<SherpaOnnxVoiceActivityDetector>
CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
... ... @@ -339,6 +358,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) {
SherpaOnnxVadModelConfig c;
memset(&c, 0, sizeof(c));
c.silero_vad = GetSileroVadConfig(o);
c.ten_vad = GetTenVadConfig(o);
SHERPA_ONNX_ASSIGN_ATTR_INT32(sample_rate, sampleRate);
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
... ... @@ -369,6 +389,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) {
SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds);
#endif
SHERPA_ONNX_DELETE_C_STR(c.silero_vad.model);
SHERPA_ONNX_DELETE_C_STR(c.ten_vad.model);
SHERPA_ONNX_DELETE_C_STR(c.provider);
return Napi::External<SherpaOnnxVoiceActivityDetector>::New(
... ...
... ... @@ -23,25 +23,48 @@ export class SileroVadConfig {
public minSpeechDuration: number;
public minSilenceDuration: number;
public windowSize: number;
public maxSpeechDuration: number;
public constructor(model: string, threshold: number, minSpeechDuration: number, minSilenceDuration: number,
windowSize: number) {
windowSize: number, maxSpeechDuration: number = 20) {
this.model = model;
this.threshold = threshold;
this.minSpeechDuration = minSpeechDuration;
this.minSilenceDuration = minSilenceDuration;
this.windowSize = windowSize;
this.maxSpeechDuration = maxSpeechDuration
}
}
/**
 * Plain settings holder for the ten-vad voice-activity-detection model.
 *
 * Every constructor argument is stored unchanged in the public field of the
 * same name; no validation or conversion is performed here.
 */
export class TenVadConfig {
  /** Model file to load, stored exactly as given. */
  public model: string;
  /** Detection threshold, stored exactly as given. */
  public threshold: number;
  /** Minimum speech duration (presumably in seconds -- confirm with the native API). */
  public minSpeechDuration: number;
  /** Minimum silence duration (presumably in seconds -- confirm with the native API). */
  public minSilenceDuration: number;
  /** Number of samples callers feed to the detector per chunk. */
  public windowSize: number;
  /** Maximum speech duration; the constructor defaults it to 20. */
  public maxSpeechDuration: number;

  /**
   * @param model Value for {@link TenVadConfig.model}.
   * @param threshold Value for {@link TenVadConfig.threshold}.
   * @param minSpeechDuration Value for {@link TenVadConfig.minSpeechDuration}.
   * @param minSilenceDuration Value for {@link TenVadConfig.minSilenceDuration}.
   * @param windowSize Value for {@link TenVadConfig.windowSize}.
   * @param maxSpeechDuration Value for {@link TenVadConfig.maxSpeechDuration};
   *   20 when omitted.
   */
  public constructor(
    model: string,
    threshold: number,
    minSpeechDuration: number,
    minSilenceDuration: number,
    windowSize: number,
    maxSpeechDuration: number = 20,
  ) {
    // Assignment order mirrors the field declarations above.
    this.model = model;
    this.threshold = threshold;
    this.minSpeechDuration = minSpeechDuration;
    this.minSilenceDuration = minSilenceDuration;
    this.windowSize = windowSize;
    this.maxSpeechDuration = maxSpeechDuration;
  }
}
export class VadConfig {
public sileroVad: SileroVadConfig;
public tenVad: TenVadConfig;
public sampleRate: number;
public debug: boolean;
public numThreads: number;
public constructor(sileroVad: SileroVadConfig, sampleRate: number, debug: boolean, numThreads: number) {
public constructor(sileroVad: SileroVadConfig, tenVad: TenVadConfig, sampleRate: number, debug: boolean, numThreads: number) {
this.sileroVad = sileroVad;
this.tenVad = tenVad;
this.sampleRate = sampleRate;
this.debug = debug;
this.numThreads = numThreads;
... ... @@ -127,4 +150,4 @@ export class Vad {
flush(): void {
voiceActivityDetectorFlush(this.handle);
}
}
\ No newline at end of file
}
... ...
... ... @@ -6,6 +6,7 @@ import {
OnlineRecognizerResult,
readWaveFromBinary,
SileroVadConfig,
TenVadConfig,
SpeechSegment,
Vad,
VadConfig,
... ... @@ -31,6 +32,13 @@ function initVad(context: Context): Vad {
0.5,
512,
),
new TenVadConfig(
'', // set it to ten-vad.onnx to use ten-vad
0.5,
0.25,
0.5,
256,
),
16000,
true,
1,
... ... @@ -93,7 +101,12 @@ function decodeFile(filename: string): string {
console.log(`samples length ${wave.samples.length}`);
const resultList: string[] = [];
const windowSize: number = vad.config.sileroVad.windowSize;
let windowSize: number = vad.config.sileroVad.windowSize;
if (vad.config.tenVad.model != '') {
windowSize = vad.config.tenVad.windowSize;
}
for (let i = 0; i < wave.samples.length; i += windowSize) {
const thisWindow: Float32Array = wave.samples.subarray(i, i + windowSize)
vad.acceptWaveform(thisWindow);
... ... @@ -138,7 +151,12 @@ function decodeFile(filename: string): string {
function decodeMic(samples: Float32Array) {
const resultList: string[] = [];
const windowSize: number = vad.config.sileroVad.windowSize;
let windowSize: number = vad.config.sileroVad.windowSize;
if (vad.config.tenVad.model != '') {
windowSize = vad.config.tenVad.windowSize;
}
for (let i = 0; i < samples.length; i += windowSize) {
const thisWindow: Float32Array = samples.subarray(i, i + windowSize)
vad.acceptWaveform(thisWindow);
... ...
... ... @@ -97,9 +97,11 @@ ai.on('data', data => {
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -94,9 +94,11 @@ ai.on('data', data => {
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -92,9 +92,11 @@ ai.on('data', data => {
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -95,9 +95,11 @@ ai.on('data', data => {
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -97,9 +97,11 @@ ai.on('data', data => {
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -93,9 +93,11 @@ ai.on('data', data => {
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -93,9 +93,11 @@ ai.on('data', data => {
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -8,14 +8,28 @@ const sherpa_onnx = require('sherpa-onnx-node');
function createVad() {
// please download silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// OR
//
// please download ten-vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
const config = {
sileroVad: {
// model: '',
model: './silero_vad.onnx',
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
windowSize: 512,
},
tenVad: {
model: '',
// model: './ten-vad.onnx',
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
windowSize: 256,
},
sampleRate: 16000,
debug: true,
numThreads: 1,
... ... @@ -47,7 +61,10 @@ const ai = new portAudio.AudioIO({
let printed = false;
let index = 0;
ai.on('data', data => {
const windowSize = vad.config.sileroVad.windowSize;
const windowSize = vad.config.sileroVad.model != '' ?
vad.config.sileroVad.windowSize :
vad.config.tenVad.windowSize;
buffer.push(new Float32Array(data.buffer));
while (buffer.size() > windowSize) {
const samples = buffer.get(buffer.head(), windowSize);
... ... @@ -66,9 +83,10 @@ ai.on('data', data => {
const segment = vad.front();
vad.pop();
const filename = `${index}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...
... ... @@ -91,9 +91,11 @@ ai.on('data', data => {
const fullLang = display.of(lang);
const filename = `${index}-${fullLang}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`
.replace(/:/g, '-');
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
... ...