Committed by
GitHub
Add JavaScript (node-addon) API for ten-vad (#2383)
This PR adds support for the new ten-vad model in both the Node.js addon examples and the HarmonyOS wrapper.
- Introduce TenVadConfig alongside the existing SileroVadConfig and extend the VadConfig API.
- Update the C++ addon to parse ten-vad parameters and pass them through to the detector.
- Modify the Node.js example scripts to let users switch between silero and ten-vad and to normalize generated filenames.
正在显示
13 个修改的文件
包含
111 行增加
和
15 行删除
| 1 | export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so"; | 1 | export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so"; |
| 2 | 2 | ||
| 3 | -export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad'; | 3 | +export { CircularBuffer, SileroVadConfig, TenVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad'; |
| 4 | 4 | ||
| 5 | 5 | ||
| 6 | export { Samples, | 6 | export { Samples, |
| @@ -294,6 +294,25 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig( | @@ -294,6 +294,25 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig( | ||
| 294 | return c; | 294 | return c; |
| 295 | } | 295 | } |
| 296 | 296 | ||
| 297 | +static SherpaOnnxTenVadModelConfig GetTenVadConfig(const Napi::Object &obj) { | ||
| 298 | + SherpaOnnxTenVadModelConfig c; | ||
| 299 | + memset(&c, 0, sizeof(c)); | ||
| 300 | + | ||
| 301 | + if (!obj.Has("tenVad") || !obj.Get("tenVad").IsObject()) { | ||
| 302 | + return c; | ||
| 303 | + } | ||
| 304 | + | ||
| 305 | + Napi::Object o = obj.Get("tenVad").As<Napi::Object>(); | ||
| 306 | + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model); | ||
| 307 | + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold); | ||
| 308 | + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration); | ||
| 309 | + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration); | ||
| 310 | + SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize); | ||
| 311 | + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration); | ||
| 312 | + | ||
| 313 | + return c; | ||
| 314 | +} | ||
| 315 | + | ||
| 297 | static Napi::External<SherpaOnnxVoiceActivityDetector> | 316 | static Napi::External<SherpaOnnxVoiceActivityDetector> |
| 298 | CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) { | 317 | CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) { |
| 299 | Napi::Env env = info.Env(); | 318 | Napi::Env env = info.Env(); |
| @@ -339,6 +358,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) { | @@ -339,6 +358,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) { | ||
| 339 | SherpaOnnxVadModelConfig c; | 358 | SherpaOnnxVadModelConfig c; |
| 340 | memset(&c, 0, sizeof(c)); | 359 | memset(&c, 0, sizeof(c)); |
| 341 | c.silero_vad = GetSileroVadConfig(o); | 360 | c.silero_vad = GetSileroVadConfig(o); |
| 361 | + c.ten_vad = GetTenVadConfig(o); | ||
| 342 | 362 | ||
| 343 | SHERPA_ONNX_ASSIGN_ATTR_INT32(sample_rate, sampleRate); | 363 | SHERPA_ONNX_ASSIGN_ATTR_INT32(sample_rate, sampleRate); |
| 344 | SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); | 364 | SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); |
| @@ -369,6 +389,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) { | @@ -369,6 +389,7 @@ CreateVoiceActivityDetectorWrapper(const Napi::CallbackInfo &info) { | ||
| 369 | SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds); | 389 | SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds); |
| 370 | #endif | 390 | #endif |
| 371 | SHERPA_ONNX_DELETE_C_STR(c.silero_vad.model); | 391 | SHERPA_ONNX_DELETE_C_STR(c.silero_vad.model); |
| 392 | + SHERPA_ONNX_DELETE_C_STR(c.ten_vad.model); | ||
| 372 | SHERPA_ONNX_DELETE_C_STR(c.provider); | 393 | SHERPA_ONNX_DELETE_C_STR(c.provider); |
| 373 | 394 | ||
| 374 | return Napi::External<SherpaOnnxVoiceActivityDetector>::New( | 395 | return Napi::External<SherpaOnnxVoiceActivityDetector>::New( |
| @@ -23,25 +23,48 @@ export class SileroVadConfig { | @@ -23,25 +23,48 @@ export class SileroVadConfig { | ||
| 23 | public minSpeechDuration: number; | 23 | public minSpeechDuration: number; |
| 24 | public minSilenceDuration: number; | 24 | public minSilenceDuration: number; |
| 25 | public windowSize: number; | 25 | public windowSize: number; |
| 26 | + public maxSpeechDuration: number; | ||
| 26 | 27 | ||
| 27 | public constructor(model: string, threshold: number, minSpeechDuration: number, minSilenceDuration: number, | 28 | public constructor(model: string, threshold: number, minSpeechDuration: number, minSilenceDuration: number, |
| 28 | - windowSize: number) { | 29 | + windowSize: number, maxSpeechDuration: number = 20) { |
| 29 | this.model = model; | 30 | this.model = model; |
| 30 | this.threshold = threshold; | 31 | this.threshold = threshold; |
| 31 | this.minSpeechDuration = minSpeechDuration; | 32 | this.minSpeechDuration = minSpeechDuration; |
| 32 | this.minSilenceDuration = minSilenceDuration; | 33 | this.minSilenceDuration = minSilenceDuration; |
| 33 | this.windowSize = windowSize; | 34 | this.windowSize = windowSize; |
| 35 | + this.maxSpeechDuration = maxSpeechDuration | ||
| 36 | + } | ||
| 37 | +} | ||
| 38 | + | ||
| 39 | +export class TenVadConfig { | ||
| 40 | + public model: string; | ||
| 41 | + public threshold: number; | ||
| 42 | + public minSpeechDuration: number; | ||
| 43 | + public minSilenceDuration: number; | ||
| 44 | + public windowSize: number; | ||
| 45 | + public maxSpeechDuration: number; | ||
| 46 | + | ||
| 47 | + public constructor(model: string, threshold: number, minSpeechDuration: number, minSilenceDuration: number, | ||
| 48 | + windowSize: number, maxSpeechDuration: number = 20) { | ||
| 49 | + this.model = model; | ||
| 50 | + this.threshold = threshold; | ||
| 51 | + this.minSpeechDuration = minSpeechDuration; | ||
| 52 | + this.minSilenceDuration = minSilenceDuration; | ||
| 53 | + this.windowSize = windowSize; | ||
| 54 | + this.maxSpeechDuration = maxSpeechDuration | ||
| 34 | } | 55 | } |
| 35 | } | 56 | } |
| 36 | 57 | ||
| 37 | export class VadConfig { | 58 | export class VadConfig { |
| 38 | public sileroVad: SileroVadConfig; | 59 | public sileroVad: SileroVadConfig; |
| 60 | + public tenVad: TenVadConfig; | ||
| 39 | public sampleRate: number; | 61 | public sampleRate: number; |
| 40 | public debug: boolean; | 62 | public debug: boolean; |
| 41 | public numThreads: number; | 63 | public numThreads: number; |
| 42 | 64 | ||
| 43 | - public constructor(sileroVad: SileroVadConfig, sampleRate: number, debug: boolean, numThreads: number) { | 65 | + public constructor(sileroVad: SileroVadConfig, tenVad: TenVadConfig, sampleRate: number, debug: boolean, numThreads: number) { |
| 44 | this.sileroVad = sileroVad; | 66 | this.sileroVad = sileroVad; |
| 67 | + this.tenVad = tenVad; | ||
| 45 | this.sampleRate = sampleRate; | 68 | this.sampleRate = sampleRate; |
| 46 | this.debug = debug; | 69 | this.debug = debug; |
| 47 | this.numThreads = numThreads; | 70 | this.numThreads = numThreads; |
| @@ -6,6 +6,7 @@ import { | @@ -6,6 +6,7 @@ import { | ||
| 6 | OnlineRecognizerResult, | 6 | OnlineRecognizerResult, |
| 7 | readWaveFromBinary, | 7 | readWaveFromBinary, |
| 8 | SileroVadConfig, | 8 | SileroVadConfig, |
| 9 | + TenVadConfig, | ||
| 9 | SpeechSegment, | 10 | SpeechSegment, |
| 10 | Vad, | 11 | Vad, |
| 11 | VadConfig, | 12 | VadConfig, |
| @@ -31,6 +32,13 @@ function initVad(context: Context): Vad { | @@ -31,6 +32,13 @@ function initVad(context: Context): Vad { | ||
| 31 | 0.5, | 32 | 0.5, |
| 32 | 512, | 33 | 512, |
| 33 | ), | 34 | ), |
| 35 | + new TenVadConfig( | ||
| 36 | + '', // set it to ten-vad.onnx to use ten-vad | ||
| 37 | + 0.5, | ||
| 38 | + 0.25, | ||
| 39 | + 0.5, | ||
| 40 | + 256, | ||
| 41 | + ), | ||
| 34 | 16000, | 42 | 16000, |
| 35 | true, | 43 | true, |
| 36 | 1, | 44 | 1, |
| @@ -93,7 +101,12 @@ function decodeFile(filename: string): string { | @@ -93,7 +101,12 @@ function decodeFile(filename: string): string { | ||
| 93 | console.log(`samples length ${wave.samples.length}`); | 101 | console.log(`samples length ${wave.samples.length}`); |
| 94 | const resultList: string[] = []; | 102 | const resultList: string[] = []; |
| 95 | 103 | ||
| 96 | - const windowSize: number = vad.config.sileroVad.windowSize; | 104 | + let windowSize: number = vad.config.sileroVad.windowSize; |
| 105 | + | ||
| 106 | + if (vad.config.tenVad.model != '') { | ||
| 107 | + windowSize = vad.config.tenVad.windowSize; | ||
| 108 | + } | ||
| 109 | + | ||
| 97 | for (let i = 0; i < wave.samples.length; i += windowSize) { | 110 | for (let i = 0; i < wave.samples.length; i += windowSize) { |
| 98 | const thisWindow: Float32Array = wave.samples.subarray(i, i + windowSize) | 111 | const thisWindow: Float32Array = wave.samples.subarray(i, i + windowSize) |
| 99 | vad.acceptWaveform(thisWindow); | 112 | vad.acceptWaveform(thisWindow); |
| @@ -138,7 +151,12 @@ function decodeFile(filename: string): string { | @@ -138,7 +151,12 @@ function decodeFile(filename: string): string { | ||
| 138 | function decodeMic(samples: Float32Array) { | 151 | function decodeMic(samples: Float32Array) { |
| 139 | const resultList: string[] = []; | 152 | const resultList: string[] = []; |
| 140 | 153 | ||
| 141 | - const windowSize: number = vad.config.sileroVad.windowSize; | 154 | + let windowSize: number = vad.config.sileroVad.windowSize; |
| 155 | + | ||
| 156 | + if (vad.config.tenVad.model != '') { | ||
| 157 | + windowSize = vad.config.tenVad.windowSize; | ||
| 158 | + } | ||
| 159 | + | ||
| 142 | for (let i = 0; i < samples.length; i += windowSize) { | 160 | for (let i = 0; i < samples.length; i += windowSize) { |
| 143 | const thisWindow: Float32Array = samples.subarray(i, i + windowSize) | 161 | const thisWindow: Float32Array = samples.subarray(i, i + windowSize) |
| 144 | vad.acceptWaveform(thisWindow); | 162 | vad.acceptWaveform(thisWindow); |
| @@ -99,7 +99,9 @@ ai.on('data', data => { | @@ -99,7 +99,9 @@ ai.on('data', data => { | ||
| 99 | const filename = `${index}-${text}-${ | 99 | const filename = `${index}-${text}-${ |
| 100 | new Date() | 100 | new Date() |
| 101 | .toLocaleTimeString('en-US', {hour12: false}) | 101 | .toLocaleTimeString('en-US', {hour12: false}) |
| 102 | - .split(' ')[0]}.wav`; | 102 | + .split(' ')[0]}.wav` |
| 103 | + .replace(/:/g, '-'); | ||
| 104 | + | ||
| 103 | sherpa_onnx.writeWave( | 105 | sherpa_onnx.writeWave( |
| 104 | filename, | 106 | filename, |
| 105 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 107 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -96,7 +96,9 @@ ai.on('data', data => { | @@ -96,7 +96,9 @@ ai.on('data', data => { | ||
| 96 | const filename = `${index}-${text}-${ | 96 | const filename = `${index}-${text}-${ |
| 97 | new Date() | 97 | new Date() |
| 98 | .toLocaleTimeString('en-US', {hour12: false}) | 98 | .toLocaleTimeString('en-US', {hour12: false}) |
| 99 | - .split(' ')[0]}.wav`; | 99 | + .split(' ')[0]}.wav` |
| 100 | + .replace(/:/g, '-'); | ||
| 101 | + | ||
| 100 | sherpa_onnx.writeWave( | 102 | sherpa_onnx.writeWave( |
| 101 | filename, | 103 | filename, |
| 102 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 104 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -94,7 +94,9 @@ ai.on('data', data => { | @@ -94,7 +94,9 @@ ai.on('data', data => { | ||
| 94 | const filename = `${index}-${text}-${ | 94 | const filename = `${index}-${text}-${ |
| 95 | new Date() | 95 | new Date() |
| 96 | .toLocaleTimeString('en-US', {hour12: false}) | 96 | .toLocaleTimeString('en-US', {hour12: false}) |
| 97 | - .split(' ')[0]}.wav`; | 97 | + .split(' ')[0]}.wav` |
| 98 | + .replace(/:/g, '-'); | ||
| 99 | + | ||
| 98 | sherpa_onnx.writeWave( | 100 | sherpa_onnx.writeWave( |
| 99 | filename, | 101 | filename, |
| 100 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 102 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -97,7 +97,9 @@ ai.on('data', data => { | @@ -97,7 +97,9 @@ ai.on('data', data => { | ||
| 97 | const filename = `${index}-${text}-${ | 97 | const filename = `${index}-${text}-${ |
| 98 | new Date() | 98 | new Date() |
| 99 | .toLocaleTimeString('en-US', {hour12: false}) | 99 | .toLocaleTimeString('en-US', {hour12: false}) |
| 100 | - .split(' ')[0]}.wav`; | 100 | + .split(' ')[0]}.wav` |
| 101 | + .replace(/:/g, '-'); | ||
| 102 | + | ||
| 101 | sherpa_onnx.writeWave( | 103 | sherpa_onnx.writeWave( |
| 102 | filename, | 104 | filename, |
| 103 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 105 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -99,7 +99,9 @@ ai.on('data', data => { | @@ -99,7 +99,9 @@ ai.on('data', data => { | ||
| 99 | const filename = `${index}-${text}-${ | 99 | const filename = `${index}-${text}-${ |
| 100 | new Date() | 100 | new Date() |
| 101 | .toLocaleTimeString('en-US', {hour12: false}) | 101 | .toLocaleTimeString('en-US', {hour12: false}) |
| 102 | - .split(' ')[0]}.wav`; | 102 | + .split(' ')[0]}.wav` |
| 103 | + .replace(/:/g, '-'); | ||
| 104 | + | ||
| 103 | sherpa_onnx.writeWave( | 105 | sherpa_onnx.writeWave( |
| 104 | filename, | 106 | filename, |
| 105 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 107 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -95,7 +95,9 @@ ai.on('data', data => { | @@ -95,7 +95,9 @@ ai.on('data', data => { | ||
| 95 | const filename = `${index}-${text}-${ | 95 | const filename = `${index}-${text}-${ |
| 96 | new Date() | 96 | new Date() |
| 97 | .toLocaleTimeString('en-US', {hour12: false}) | 97 | .toLocaleTimeString('en-US', {hour12: false}) |
| 98 | - .split(' ')[0]}.wav`; | 98 | + .split(' ')[0]}.wav` |
| 99 | + .replace(/:/g, '-'); | ||
| 100 | + | ||
| 99 | sherpa_onnx.writeWave( | 101 | sherpa_onnx.writeWave( |
| 100 | filename, | 102 | filename, |
| 101 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 103 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -95,7 +95,9 @@ ai.on('data', data => { | @@ -95,7 +95,9 @@ ai.on('data', data => { | ||
| 95 | const filename = `${index}-${text}-${ | 95 | const filename = `${index}-${text}-${ |
| 96 | new Date() | 96 | new Date() |
| 97 | .toLocaleTimeString('en-US', {hour12: false}) | 97 | .toLocaleTimeString('en-US', {hour12: false}) |
| 98 | - .split(' ')[0]}.wav`; | 98 | + .split(' ')[0]}.wav` |
| 99 | + .replace(/:/g, '-'); | ||
| 100 | + | ||
| 99 | sherpa_onnx.writeWave( | 101 | sherpa_onnx.writeWave( |
| 100 | filename, | 102 | filename, |
| 101 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 103 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -8,14 +8,28 @@ const sherpa_onnx = require('sherpa-onnx-node'); | @@ -8,14 +8,28 @@ const sherpa_onnx = require('sherpa-onnx-node'); | ||
| 8 | function createVad() { | 8 | function createVad() { |
| 9 | // please download silero_vad.onnx from | 9 | // please download silero_vad.onnx from |
| 10 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | 10 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx |
| 11 | + // | ||
| 12 | + // OR | ||
| 13 | + // | ||
| 14 | + // please download ten-vad.onnx from | ||
| 15 | + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 11 | const config = { | 16 | const config = { |
| 12 | sileroVad: { | 17 | sileroVad: { |
| 18 | + // model: '', | ||
| 13 | model: './silero_vad.onnx', | 19 | model: './silero_vad.onnx', |
| 14 | threshold: 0.5, | 20 | threshold: 0.5, |
| 15 | minSpeechDuration: 0.25, | 21 | minSpeechDuration: 0.25, |
| 16 | minSilenceDuration: 0.5, | 22 | minSilenceDuration: 0.5, |
| 17 | windowSize: 512, | 23 | windowSize: 512, |
| 18 | }, | 24 | }, |
| 25 | + tenVad: { | ||
| 26 | + model: '', | ||
| 27 | + // model: './ten-vad.onnx', | ||
| 28 | + threshold: 0.5, | ||
| 29 | + minSpeechDuration: 0.25, | ||
| 30 | + minSilenceDuration: 0.5, | ||
| 31 | + windowSize: 256, | ||
| 32 | + }, | ||
| 19 | sampleRate: 16000, | 33 | sampleRate: 16000, |
| 20 | debug: true, | 34 | debug: true, |
| 21 | numThreads: 1, | 35 | numThreads: 1, |
| @@ -47,7 +61,10 @@ const ai = new portAudio.AudioIO({ | @@ -47,7 +61,10 @@ const ai = new portAudio.AudioIO({ | ||
| 47 | let printed = false; | 61 | let printed = false; |
| 48 | let index = 0; | 62 | let index = 0; |
| 49 | ai.on('data', data => { | 63 | ai.on('data', data => { |
| 50 | - const windowSize = vad.config.sileroVad.windowSize; | 64 | + const windowSize = vad.config.sileroVad.model != '' ? |
| 65 | + vad.config.sileroVad.windowSize : | ||
| 66 | + vad.config.tenVad.windowSize; | ||
| 67 | + | ||
| 51 | buffer.push(new Float32Array(data.buffer)); | 68 | buffer.push(new Float32Array(data.buffer)); |
| 52 | while (buffer.size() > windowSize) { | 69 | while (buffer.size() > windowSize) { |
| 53 | const samples = buffer.get(buffer.head(), windowSize); | 70 | const samples = buffer.get(buffer.head(), windowSize); |
| @@ -68,7 +85,8 @@ ai.on('data', data => { | @@ -68,7 +85,8 @@ ai.on('data', data => { | ||
| 68 | const filename = `${index}-${ | 85 | const filename = `${index}-${ |
| 69 | new Date() | 86 | new Date() |
| 70 | .toLocaleTimeString('en-US', {hour12: false}) | 87 | .toLocaleTimeString('en-US', {hour12: false}) |
| 71 | - .split(' ')[0]}.wav`; | 88 | + .split(' ')[0]}.wav` |
| 89 | + .replace(/:/g, '-'); | ||
| 72 | sherpa_onnx.writeWave( | 90 | sherpa_onnx.writeWave( |
| 73 | filename, | 91 | filename, |
| 74 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 92 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
| @@ -93,7 +93,9 @@ ai.on('data', data => { | @@ -93,7 +93,9 @@ ai.on('data', data => { | ||
| 93 | const filename = `${index}-${fullLang}-${ | 93 | const filename = `${index}-${fullLang}-${ |
| 94 | new Date() | 94 | new Date() |
| 95 | .toLocaleTimeString('en-US', {hour12: false}) | 95 | .toLocaleTimeString('en-US', {hour12: false}) |
| 96 | - .split(' ')[0]}.wav`; | 96 | + .split(' ')[0]}.wav` |
| 97 | + .replace(/:/g, '-'); | ||
| 98 | + | ||
| 97 | sherpa_onnx.writeWave( | 99 | sherpa_onnx.writeWave( |
| 98 | filename, | 100 | filename, |
| 99 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); | 101 | {samples: segment.samples, sampleRate: vad.config.sampleRate}); |
-
请 注册 或 登录 后发表评论