Fangjun Kuang
Committed by GitHub

Add JavaScript API for SenseVoice (#1157)
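
A minimal end-to-end sketch of the new JavaScript surface, distilled from the test script added in this PR; the paths assume the sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 release archive has been unpacked next to the script, and the wave file is just illustrative:

```js
const sherpa_onnx = require('sherpa-onnx-node');

// The new `senseVoice` key in modelConfig selects the SenseVoice model.
const recognizer = new sherpa_onnx.OfflineRecognizer({
  featConfig: {sampleRate: 16000, featureDim: 80},
  modelConfig: {
    senseVoice: {
      model:
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
      useInverseTextNormalization: 1,
    },
    tokens: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    numThreads: 2,
    provider: 'cpu',
  },
});

// Decode one 16 kHz wave file and print the recognized text.
const wave = sherpa_onnx.readWave(
    './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav');
const stream = recognizer.createStream();
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
recognizer.decode(stream);
console.log(recognizer.getResult(stream).text);
```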

... ... @@ -20,6 +20,13 @@ if [[ $arch != "ia32" && $platform != "win32" ]]; then
node ./test_asr_non_streaming_nemo_ctc.js
rm -rf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
node ./test_asr_non_streaming_sense_voice.js
rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
... ...
... ... @@ -95,6 +95,7 @@ The following tables list the examples in this folder.
|[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
## Non-Streaming speech-to-text from a microphone with VAD
... ... @@ -104,6 +105,7 @@ The following tables list the examples in this folder.
|[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
|[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
## Text-to-speech
... ... @@ -252,6 +254,20 @@ npm install naudiodon2
node ./test_vad_asr_non_streaming_paraformer_microphone.js
```
### Non-streaming speech recognition with SenseVoice
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
node ./test_asr_non_streaming_sense_voice.js
# To run VAD + non-streaming ASR with SenseVoice using a microphone
npm install naudiodon2
node ./test_vad_asr_non_streaming_sense_voice_microphone.js
```
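Judging from the N-API binding added later in this diff, the `senseVoice` config block also accepts a `language` string alongside `model` and `useInverseTextNormalization`. A hedged sketch; the tests in this PR do not exercise the field, and treating the empty string as auto-detect is an assumption:

```js
// Assumption based on the binding below, not exercised by this PR's tests.
const senseVoice = {
  model: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
  language: '',  // e.g. 'zh', 'en', 'ja', 'ko', 'yue'; '' presumably auto-detects
  useInverseTextNormalization: 1,
};
```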
### Text-to-speech with piper VITS models (TTS)
```bash
... ...
{
"dependencies": {
"sherpa-onnx-node": "^1.0.30"
"sherpa-onnx-node": "^1.10.17"
}
}
... ...
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const config = {
'featConfig': {
'sampleRate': 16000,
'featureDim': 80,
},
'modelConfig': {
'senseVoice': {
'model':
'./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
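// 1: apply inverse text normalization (e.g., convert spoken numbers to digits)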
'useInverseTextNormalization': 1,
},
'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
'numThreads': 2,
'provider': 'cpu',
'debug': 1,
}
};
const waveFilename =
'./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';
const recognizer = new sherpa_onnx.OfflineRecognizer(config);
console.log('Started');
let start = Date.now();
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
recognizer.decode(stream);
const result = recognizer.getResult(stream);
let stop = Date.now();
console.log('Done');
const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));
console.log(waveFilename);
console.log('result\n', result);
... ...
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
//
const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());
const sherpa_onnx = require('sherpa-onnx-node');
function createRecognizer() {
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const config = {
'featConfig': {
'sampleRate': 16000,
'featureDim': 80,
},
'modelConfig': {
'senseVoice': {
'model':
'./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
'useInverseTextNormalization': 1,
},
'tokens':
'./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
'numThreads': 2,
'provider': 'cpu',
'debug': 1,
}
};
return new sherpa_onnx.OfflineRecognizer(config);
}
function createVad() {
// please download silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
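// threshold, minSpeechDuration and minSilenceDuration control when a speech
// segment starts and ends; a windowSize of 512 samples is what silero_vad
// expects for 16 kHz audio.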
const config = {
sileroVad: {
model: './silero_vad.onnx',
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
windowSize: 512,
},
sampleRate: 16000,
debug: true,
numThreads: 1,
};
const bufferSizeInSeconds = 60;
return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
}
const recognizer = createRecognizer();
const vad = createVad();
const bufferSizeInSeconds = 30;
const buffer =
new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
const ai = new portAudio.AudioIO({
inOptions: {
channelCount: 1,
closeOnError: true,  // Close the stream if an audio error is detected;
                     // if set to false, just log the error
deviceId: -1, // Use -1 or omit the deviceId to select the default device
sampleFormat: portAudio.SampleFormatFloat32,
sampleRate: vad.config.sampleRate
}
});
let printed = false;
let index = 0;
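// Microphone callback: buffer the incoming samples, feed the VAD in
// fixed-size windows, and run the offline recognizer on every speech
// segment the VAD emits.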
ai.on('data', data => {
const windowSize = vad.config.sileroVad.windowSize;
buffer.push(new Float32Array(data.buffer));
while (buffer.size() > windowSize) {
const samples = buffer.get(buffer.head(), windowSize);
buffer.pop(windowSize);
vad.acceptWaveform(samples);
}
while (!vad.isEmpty()) {
const segment = vad.front();
vad.pop();
const stream = recognizer.createStream();
stream.acceptWaveform({
samples: segment.samples,
sampleRate: recognizer.config.featConfig.sampleRate
});
recognizer.decode(stream);
const r = recognizer.getResult(stream);
if (r.text.length > 0) {
const text = r.text.toLowerCase().trim();
console.log(`${index}: ${text}`);
const filename = `${index}-${text}-${
new Date()
.toLocaleTimeString('en-US', {hour12: false})
.split(' ')[0]}.wav`;
sherpa_onnx.writeWave(
filename,
{samples: segment.samples, sampleRate: vad.config.sampleRate});
index += 1;
}
}
});
ai.start();
console.log('Started! Please speak');
... ...
... ... @@ -96,6 +96,24 @@ static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
return c;
}
static SherpaOnnxOfflineSenseVoiceModelConfig GetOfflineSenseVoiceModelConfig(
Napi::Object obj) {
SherpaOnnxOfflineSenseVoiceModelConfig c;
memset(&c, 0, sizeof(c));
if (!obj.Has("senseVoice") || !obj.Get("senseVoice").IsObject()) {
return c;
}
Napi::Object o = obj.Get("senseVoice").As<Napi::Object>();
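// The SHERPA_ONNX_ASSIGN_ATTR_* macros copy the named JS attribute into the
// matching C struct field; string values are copied into new[]-allocated
// buffers, which is why the cleanup code below frees them with delete[].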
SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
SHERPA_ONNX_ASSIGN_ATTR_STR(language, language);
SHERPA_ONNX_ASSIGN_ATTR_INT32(use_itn, useInverseTextNormalization);
return c;
}
static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
SherpaOnnxOfflineModelConfig c;
memset(&c, 0, sizeof(c));
... ... @@ -111,6 +129,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
c.nemo_ctc = GetOfflineNeMoCtcModelConfig(o);
c.whisper = GetOfflineWhisperModelConfig(o);
c.tdnn = GetOfflineTdnnModelConfig(o);
c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
... ... @@ -225,6 +244,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
delete[] c.model_config.tdnn.model;
}
if (c.model_config.sense_voice.model) {
delete[] c.model_config.sense_voice.model;
}
if (c.model_config.sense_voice.language) {
delete[] c.model_config.sense_voice.language;
}
if (c.model_config.tokens) {
delete[] c.model_config.tokens;
}
... ...