Fangjun Kuang
Committed by GitHub

Add JavaScript (node-addon) API for ten-vad (#2383)

This PR adds support for the new ten-vad model to the Node.js addon (the node-addon-api wrapper and its example scripts) and to the HarmonyOS (ETS) wrapper.

- Introduce TenVadConfig alongside the existing SileroVadConfig and extend the VadConfig API to accept it.
- Update the C++ addon to parse ten-vad parameters and pass them through to the voice activity detector.
- Modify the Node.js example scripts so users can switch between silero-vad and ten-vad, and normalize the generated .wav filenames (see the sketch right after this list).
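For context, here is a minimal sketch of how a Node.js example can be pointed at ten-vad after this change. The sileroVad/tenVad config keys, the window sizes (512 for silero-vad, 256 for ten-vad), and the empty-model-string convention mirror the diff below; the useTenVad flag, the createVad helper shape, and the 60-second buffer size are illustrative assumptions, and the Vad constructor call follows the pattern of the existing sherpa-onnx-node examples rather than code added in this PR.

```js
// Minimal sketch, not part of this PR. Assumes sherpa-onnx-node is installed
// and that ./silero_vad.onnx or ./ten-vad.onnx has been downloaded.
const sherpa_onnx = require('sherpa-onnx-node');

function createVad(useTenVad) {
  const config = {
    sileroVad: {
      // An empty model path means "not selected".
      model: useTenVad ? '' : './silero_vad.onnx',
      threshold: 0.5,
      minSpeechDuration: 0.25,
      minSilenceDuration: 0.5,
      windowSize: 512,  // silero-vad uses 512-sample windows at 16 kHz
    },
    tenVad: {
      model: useTenVad ? './ten-vad.onnx' : '',
      threshold: 0.5,
      minSpeechDuration: 0.25,
      minSilenceDuration: 0.5,
      windowSize: 256,  // ten-vad uses 256-sample windows (per the diff below)
    },
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  const bufferSizeInSeconds = 60;  // illustrative value
  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
}

const vad = createVad(true);

// Feed audio in chunks of the selected model's window size, exactly as the
// updated examples do: the model with a non-empty path decides the size.
const windowSize = vad.config.sileroVad.model != '' ?
    vad.config.sileroVad.windowSize :
    vad.config.tenVad.windowSize;
console.log(`window size: ${windowSize}`);
```

Only one of the two model fields should be non-empty at a time; the examples treat an empty model string as "this VAD is not selected".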
@@ -1,6 +1,6 @@
 export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so";
 
-export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad';
+export { CircularBuffer, SileroVadConfig, TenVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad';
 
 
 export { Samples,
@@ -294,6 +294,25 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
   return c;
 }
 
+static SherpaOnnxTenVadModelConfig GetTenVadConfig(const Napi::Object &obj) {
+  SherpaOnnxTenVadModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("tenVad") || !obj.Get("tenVad").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("tenVad").As<Napi::Object>();
+  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
+  SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);
+
+  return c;
+}
+
 static Napi::External<SherpaOnnxVoiceActivityDetector>
 CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
@@ -339,6 +358,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) {
   SherpaOnnxVadModelConfig c;
   memset(&c, 0, sizeof(c));
   c.silero_vad = GetSileroVadConfig(o);
+  c.ten_vad = GetTenVadConfig(o);
 
   SHERPA_ONNX_ASSIGN_ATTR_INT32(sample_rate, sampleRate);
   SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -369,6 +389,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) {
       SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds);
 #endif
   SHERPA_ONNX_DELETE_C_STR(c.silero_vad.model);
+  SHERPA_ONNX_DELETE_C_STR(c.ten_vad.model);
   SHERPA_ONNX_DELETE_C_STR(c.provider);
 
   return Napi::External<SherpaOnnxVoiceActivityDetector>::New(
@@ -23,25 +23,48 @@ export class SileroVadConfig {
   public minSpeechDuration: number;
   public minSilenceDuration: number;
   public windowSize: number;
+  public maxSpeechDuration: number;
 
   public constructor(model: string, threshold: number, minSpeechDuration: number, minSilenceDuration: number,
-    windowSize: number) {
+    windowSize: number, maxSpeechDuration: number = 20) {
     this.model = model;
     this.threshold = threshold;
     this.minSpeechDuration = minSpeechDuration;
     this.minSilenceDuration = minSilenceDuration;
     this.windowSize = windowSize;
+    this.maxSpeechDuration = maxSpeechDuration
+  }
+}
+
+export class TenVadConfig {
+  public model: string;
+  public threshold: number;
+  public minSpeechDuration: number;
+  public minSilenceDuration: number;
+  public windowSize: number;
+  public maxSpeechDuration: number;
+
+  public constructor(model: string, threshold: number, minSpeechDuration: number, minSilenceDuration: number,
+    windowSize: number, maxSpeechDuration: number = 20) {
+    this.model = model;
+    this.threshold = threshold;
+    this.minSpeechDuration = minSpeechDuration;
+    this.minSilenceDuration = minSilenceDuration;
+    this.windowSize = windowSize;
+    this.maxSpeechDuration = maxSpeechDuration
   }
 }
 
 export class VadConfig {
   public sileroVad: SileroVadConfig;
+  public tenVad: TenVadConfig;
   public sampleRate: number;
   public debug: boolean;
   public numThreads: number;
 
-  public constructor(sileroVad: SileroVadConfig, sampleRate: number, debug: boolean, numThreads: number) {
+  public constructor(sileroVad: SileroVadConfig, tenVad: TenVadConfig, sampleRate: number, debug: boolean, numThreads: number) {
     this.sileroVad = sileroVad;
+    this.tenVad = tenVad;
     this.sampleRate = sampleRate;
     this.debug = debug;
     this.numThreads = numThreads;
@@ -6,6 +6,7 @@ import {
   OnlineRecognizerResult,
   readWaveFromBinary,
   SileroVadConfig,
+  TenVadConfig,
   SpeechSegment,
   Vad,
   VadConfig,
@@ -31,6 +32,13 @@ function initVad(context: Context): Vad {
       0.5,
       512,
     ),
+    new TenVadConfig(
+      '', // set it to ten-vad.onnx to use ten-vad
+      0.5,
+      0.25,
+      0.5,
+      256,
+    ),
     16000,
     true,
     1,
@@ -93,7 +101,12 @@ function decodeFile(filename: string): string {
   console.log(`samples length ${wave.samples.length}`);
   const resultList: string[] = [];
 
-  const windowSize: number = vad.config.sileroVad.windowSize;
+  let windowSize: number = vad.config.sileroVad.windowSize;
+
+  if (vad.config.tenVad.model != '') {
+    windowSize = vad.config.tenVad.windowSize;
+  }
+
   for (let i = 0; i < wave.samples.length; i += windowSize) {
     const thisWindow: Float32Array = wave.samples.subarray(i, i + windowSize)
     vad.acceptWaveform(thisWindow);
@@ -138,7 +151,12 @@ function decodeFile(filename: string): string {
 function decodeMic(samples: Float32Array) {
   const resultList: string[] = [];
 
-  const windowSize: number = vad.config.sileroVad.windowSize;
+  let windowSize: number = vad.config.sileroVad.windowSize;
+
+  if (vad.config.tenVad.model != '') {
+    windowSize = vad.config.tenVad.windowSize;
+  }
+
   for (let i = 0; i < samples.length; i += windowSize) {
     const thisWindow: Float32Array = samples.subarray(i, i + windowSize)
     vad.acceptWaveform(thisWindow);
@@ -99,7 +99,9 @@ ai.on('data', data => {
       const filename = `${index}-${text}-${
           new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
       sherpa_onnx.writeWave(
           filename,
           {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -96,7 +96,9 @@ ai.on('data', data => {
       const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
       sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -94,7 +94,9 @@ ai.on('data', data => {
      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -97,7 +97,9 @@ ai.on('data', data => {
      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -99,7 +99,9 @@ ai.on('data', data => {
      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -95,7 +95,9 @@ ai.on('data', data => {
      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -95,7 +95,9 @@ ai.on('data', data => {
      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -8,14 +8,28 @@ const sherpa_onnx = require('sherpa-onnx-node');
 function createVad() {
   // please download silero_vad.onnx from
   // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  //
+  // OR
+  //
+  // please download ten-vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
   const config = {
     sileroVad: {
+      // model: '',
       model: './silero_vad.onnx',
       threshold: 0.5,
       minSpeechDuration: 0.25,
       minSilenceDuration: 0.5,
       windowSize: 512,
     },
+    tenVad: {
+      model: '',
+      // model: './ten-vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 256,
+    },
     sampleRate: 16000,
     debug: true,
     numThreads: 1,
@@ -47,7 +61,10 @@ const ai = new portAudio.AudioIO({
 let printed = false;
 let index = 0;
 ai.on('data', data => {
-  const windowSize = vad.config.sileroVad.windowSize;
+  const windowSize = vad.config.sileroVad.model != '' ?
+      vad.config.sileroVad.windowSize :
+      vad.config.tenVad.windowSize;
+
   buffer.push(new Float32Array(data.buffer));
   while (buffer.size() > windowSize) {
     const samples = buffer.get(buffer.head(), windowSize);
@@ -68,7 +85,8 @@ ai.on('data', data => {
       const filename = `${index}-${
          new Date()
             .toLocaleTimeString('en-US', {hour12: false})
-            .split(' ')[0]}.wav`;
+            .split(' ')[0]}.wav`
+        .replace(/:/g, '-');
       sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
@@ -93,7 +93,9 @@ ai.on('data', data => {
      const filename = `${index}-${fullLang}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
-             .split(' ')[0]}.wav`;
+             .split(' ')[0]}.wav`
+         .replace(/:/g, '-');
+
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});
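A note on the repeated filename change in the examples: toLocaleTimeString('en-US', {hour12: false}) produces times such as 13:05:07, and a colon is not a legal character in Windows file names, which is presumably why every example now strips colons from the generated .wav name. A tiny illustrative snippet (index and text are placeholders for the values the examples actually compute):

```js
// Illustrative only; `index` and `text` are placeholders.
const index = 3;
const text = 'hello';

const filename = `${index}-${text}-${
    new Date()
        .toLocaleTimeString('en-US', {hour12: false})
        .split(' ')[0]}.wav`
    .replace(/:/g, '-');

// Before this PR the name looked like 3-hello-13:05:07.wav;
// it now comes out as 3-hello-13-05-07.wav.
console.log(filename);
```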