Add VAD + Non-streaming ASR example for JavaScript API. (#1170)

Fangjun Kuang · GitHub
Commit 994c3e7c96a21f790cc9e672b81791583e215bb2 994c3e7c 1 parent 299f1a85
.github/scripts/test-nodejs-addon-npm.sh
.gitignore
CHANGELOG.md
CMakeLists.txt
dart-api-examples/keyword-spotter/pubspec.yaml
dart-api-examples/non-streaming-asr/pubspec.yaml
dart-api-examples/streaming-asr/pubspec.yaml
dart-api-examples/tts/pubspec.yaml
dart-api-examples/vad/pubspec.yaml
flutter-examples/streaming_asr/pubspec.yaml
flutter-examples/tts/pubspec.yaml
flutter/sherpa_onnx/pubspec.yaml
flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
nodejs-addon-examples/README.md
nodejs-addon-examples/package.json
nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
scripts/dart/kws-pubspec.yaml
scripts/dart/sherpa-onnx-pubspec.yaml
scripts/node-addon-api/lib/vad.js
--- a/.github/scripts/test-nodejs-addon-npm.sh
查看文件 @994c3e7
+++ b/.github/scripts/test-nodejs-addon-npm.sh
查看文件 @994c3e7
@@ -10,6 +10,19 @@ arch=$(node -p "require('os').arch()")
 platform=$(node -p "require('os').platform()")
 node_version=$(node -p "process.versions.node.split('.')[0]")
 
+ echo "----------non-streaming asr + vad----------"
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+ tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+ rm sherpa-onnx-whisper-tiny.en.tar.bz2
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+ 
+ node ./test_vad_with_non_streaming_asr_whisper.js
+ rm -rf sherpa-onnx-whisper*
+ rm *.wav
+ rm *.onnx
+ 
 echo "----------asr----------"
 
 if [[ $arch != "ia32" && $platform != "win32" ]]; then
--- a/.gitignore
查看文件 @994c3e7
+++ b/.gitignore
查看文件 @994c3e7
@@ -112,3 +112,4 @@ sherpa-onnx-telespeech-ctc-*
 .ccache
 lib*.a
 sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+ *.bak
--- a/CHANGELOG.md
查看文件 @994c3e7
+++ b/CHANGELOG.md
查看文件 @994c3e7
+ ## 1.10.18
+ 
+ * Fix the case when recognition results contain the symbol `"`. It caused
+   issues when converting results to a JSON string.
+ 
 ## 1.10.17
 
 * Support SenseVoice CTC models.
--- a/CMakeLists.txt
查看文件 @994c3e7
+++ b/CMakeLists.txt
查看文件 @994c3e7
@@ -11,7 +11,7 @@ project(sherpa-onnx)
 # ./nodejs-addon-examples
 # ./dart-api-examples/
 # ./CHANGELOG.md
- set(SHERPA_ONNX_VERSION "1.10.17")
+ set(SHERPA_ONNX_VERSION "1.10.18")
 
 # Disable warning about
 #
--- a/dart-api-examples/keyword-spotter/pubspec.yaml
查看文件 @994c3e7
+++ b/dart-api-examples/keyword-spotter/pubspec.yaml
查看文件 @994c3e7
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-   sherpa_onnx: ^1.10.17
+   sherpa_onnx: ^1.10.18
   # sherpa_onnx:
   #   path: ../../flutter/sherpa_onnx
   path: ^1.9.0
--- a/dart-api-examples/non-streaming-asr/pubspec.yaml
查看文件 @994c3e7
+++ b/dart-api-examples/non-streaming-asr/pubspec.yaml
查看文件 @994c3e7
@@ -10,7 +10,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-   sherpa_onnx: ^1.10.17
+   sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 
--- a/dart-api-examples/streaming-asr/pubspec.yaml
查看文件 @994c3e7
+++ b/dart-api-examples/streaming-asr/pubspec.yaml
查看文件 @994c3e7
@@ -11,7 +11,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-   sherpa_onnx: ^1.10.17
+   sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 
--- a/dart-api-examples/tts/pubspec.yaml
查看文件 @994c3e7
+++ b/dart-api-examples/tts/pubspec.yaml
查看文件 @994c3e7
@@ -8,7 +8,7 @@ environment:
 
 # Add regular dependencies here.
 dependencies:
-   sherpa_onnx: ^1.10.17
+   sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 
--- a/dart-api-examples/vad/pubspec.yaml
查看文件 @994c3e7
+++ b/dart-api-examples/vad/pubspec.yaml
查看文件 @994c3e7
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-   sherpa_onnx: ^1.10.17
+   sherpa_onnx: ^1.10.18
   path: ^1.9.0
   args: ^2.5.0
 
--- a/flutter-examples/streaming_asr/pubspec.yaml
查看文件 @994c3e7
+++ b/flutter-examples/streaming_asr/pubspec.yaml
查看文件 @994c3e7
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none'
 
- version: 1.10.17
+ version: 1.10.18
 
 topics:
   - speech-recognition
@@ -30,7 +30,7 @@ dependencies:
   record: ^5.1.0
   url_launcher: ^6.2.6
 
-   sherpa_onnx: ^1.10.17
+   sherpa_onnx: ^1.10.18
   # sherpa_onnx:
     # path: ../../flutter/sherpa_onnx
 
--- a/flutter-examples/tts/pubspec.yaml
查看文件 @994c3e7
+++ b/flutter-examples/tts/pubspec.yaml
查看文件 @994c3e7
@@ -5,7 +5,7 @@ description: >
 
 publish_to: 'none' # Remove this line if you wish to publish to pub.dev
 
- version: 1.10.17
+ version: 1.10.18
 
 environment:
   sdk: '>=3.4.0 <4.0.0'
@@ -17,7 +17,7 @@ dependencies:
   cupertino_icons: ^1.0.6
   path_provider: ^2.1.3
   path: ^1.9.0
-   sherpa_onnx: ^1.10.17
+   sherpa_onnx: ^1.10.18
   url_launcher: ^6.2.6
   audioplayers: ^5.0.0
 
--- a/flutter/sherpa_onnx/pubspec.yaml
查看文件 @994c3e7
+++ b/flutter/sherpa_onnx/pubspec.yaml
查看文件 @994c3e7
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
- version: 1.10.17
+ version: 1.10.18
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 
@@ -30,23 +30,23 @@ dependencies:
   flutter:
     sdk: flutter
 
-   sherpa_onnx_android: ^1.10.17
+   sherpa_onnx_android: ^1.10.18
   # sherpa_onnx_android:
   #   path: ../sherpa_onnx_android
 
-   sherpa_onnx_macos: ^1.10.17
+   sherpa_onnx_macos: ^1.10.18
   # sherpa_onnx_macos:
   #   path: ../sherpa_onnx_macos
 
-   sherpa_onnx_linux: ^1.10.17
+   sherpa_onnx_linux: ^1.10.18
   # sherpa_onnx_linux:
   #   path: ../sherpa_onnx_linux
     #
-   sherpa_onnx_windows: ^1.10.17
+   sherpa_onnx_windows: ^1.10.18
   # sherpa_onnx_windows:
   #   path: ../sherpa_onnx_windows
 
-   sherpa_onnx_ios: ^1.10.17
+   sherpa_onnx_ios: ^1.10.18
   # sherpa_onnx_ios:
   #   path: ../sherpa_onnx_ios
 
--- a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
查看文件 @994c3e7
+++ b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
查看文件 @994c3e7
@@ -7,7 +7,7 @@
 # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_ios'
-   s.version          = '1.10.17'
+   s.version          = '1.10.18'
   s.summary          = 'A new Flutter FFI plugin project.'
   s.description      = <<-DESC
 A new Flutter FFI plugin project.
--- a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
查看文件 @994c3e7
+++ b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
查看文件 @994c3e7
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
   s.name             = 'sherpa_onnx_macos'
-   s.version          = '1.10.17'
+   s.version          = '1.10.18'
   s.summary          = 'sherpa-onnx Flutter FFI plugin project.'
   s.description      = <<-DESC
 sherpa-onnx Flutter FFI plugin project.
--- a/nodejs-addon-examples/README.md
查看文件 @994c3e7
+++ b/nodejs-addon-examples/README.md
查看文件 @994c3e7
@@ -93,6 +93,7 @@ The following tables list the examples in this folder.
 |---|---|
 |[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model|
 |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
+ |[./test_vad_with_non_streaming_asr_whisper.js](./test_vad_with_non_streaming_asr_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper) + [Silero VAD](https://github.com/snakers4/silero-vad)|
 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
 |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
@@ -221,11 +222,24 @@ rm sherpa-onnx-whisper-tiny.en.tar.bz2
 
 node ./test_asr_non_streaming_whisper.js
 
- # To run VAD + non-streaming ASR with Paraformer using a microphone
+ # To run VAD + non-streaming ASR with Whisper using a microphone
 npm install naudiodon2
 node ./test_vad_asr_non_streaming_whisper_microphone.js
 ```
 
+ ### Non-streaming speech recognition with Whisper + VAD
+ 
+ ```bash
+ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+ tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+ rm sherpa-onnx-whisper-tiny.en.tar.bz2
+ 
+ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+ 
+ node ./test_vad_with_non_streaming_asr_whisper.js
+ ```
+ 
 ### Non-streaming speech recognition with NeMo CTC models
 
 ```bash
--- a/nodejs-addon-examples/package.json
查看文件 @994c3e7
+++ b/nodejs-addon-examples/package.json
查看文件 @994c3e7
 {
   "dependencies": {
-     "sherpa-onnx-node": "^1.10.17"
+     "sherpa-onnx-node": "^1.10.18"
   }
 }
--- a/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js 0 → 100644
查看文件 @994c3e7
+++ b/nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js 0 → 100644
查看文件 @994c3e7
+ // Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+ 
+ const sherpa_onnx = require('sherpa-onnx-node');
+ 
+ function createRecognizer() {
+   // Please download test files from
+   // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+   const config = {
+     'featConfig': {
+       'sampleRate': 16000,
+       'featureDim': 80,
+     },
+     'modelConfig': {
+       'whisper': {
+         'encoder': './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
+         'decoder': './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
+       },
+       'tokens': './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
+       'numThreads': 2,
+       'provider': 'cpu',
+       'debug': 1,
+     }
+   };
+ 
+   return new sherpa_onnx.OfflineRecognizer(config);
+ }
+ 
+ function createVad() {
+   // please download silero_vad.onnx from
+   // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+   const config = {
+     sileroVad: {
+       model: './silero_vad.onnx',
+       threshold: 0.5,
+       minSpeechDuration: 0.25,
+       minSilenceDuration: 0.5,
+       windowSize: 512,
+     },
+     sampleRate: 16000,
+     debug: true,
+     numThreads: 1,
+   };
+ 
+   const bufferSizeInSeconds = 60;
+ 
+   return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+ }
+ 
+ const recognizer = createRecognizer();
+ const vad = createVad();
+ 
+ // please download ./Obama.wav from
+ // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+ const waveFilename = './Obama.wav';
+ const wave = sherpa_onnx.readWave(waveFilename);
+ 
+ if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
+   throw new Error(
+       'Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}');
+ }
+ 
+ console.log('Started')
+ let start = Date.now();
+ 
+ const windowSize = vad.config.sileroVad.windowSize;
+ for (let i = 0; i < wave.samples.length; i += windowSize) {
+   const thisWindow = wave.samples.subarray(i, i + windowSize);
+   vad.acceptWaveform(thisWindow);
+ 
+   while (!vad.isEmpty()) {
+     const segment = vad.front();
+     vad.pop();
+ 
+     let start_time = segment.start / wave.sampleRate;
+     let end_time = start_time + segment.samples.length / wave.sampleRate;
+ 
+     start_time = start_time.toFixed(2);
+     end_time = end_time.toFixed(2);
+ 
+     const stream = recognizer.createStream();
+     stream.acceptWaveform(
+         {samples: segment.samples, sampleRate: wave.sampleRate});
+ 
+     recognizer.decode(stream);
+     const r = recognizer.getResult(stream);
+     if (r.text.length > 0) {
+       const text = r.text.toLowerCase().trim();
+       console.log(`${start_time} -- ${end_time}: ${text}`);
+     }
+   }
+ }
+ 
+ vad.flush();
+ 
+ while (!vad.isEmpty()) {
+   const segment = vad.front();
+   vad.pop();
+ 
+   let start_time = segment.start / wave.sampleRate;
+   let end_time = start_time + segment.samples.length / wave.sampleRate;
+ 
+   start_time = start_time.toFixed(2);
+   end_time = end_time.toFixed(2);
+ 
+   const stream = recognizer.createStream();
+   stream.acceptWaveform(
+       {samples: segment.samples, sampleRate: wave.sampleRate});
+ 
+   recognizer.decode(stream);
+   const r = recognizer.getResult(stream);
+   if (r.text.length > 0) {
+     const text = r.text.toLowerCase().trim();
+     console.log(`${start_time} -- ${end_time}: ${text}`);
+   }
+ }
+ 
+ let stop = Date.now();
+ console.log('Done')
+ 
+ const elapsed_seconds = (stop - start) / 1000;
+ const duration = wave.samples.length / wave.sampleRate;
+ const real_time_factor = elapsed_seconds / duration;
+ console.log('Wave duration', duration.toFixed(3), 'secodns')
+ console.log('Elapsed', elapsed_seconds.toFixed(3), 'secodns')
+ console.log(
+     `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+     real_time_factor.toFixed(3))
--- a/scripts/dart/kws-pubspec.yaml
查看文件 @994c3e7
+++ b/scripts/dart/kws-pubspec.yaml
查看文件 @994c3e7
@@ -9,7 +9,7 @@ environment:
   sdk: ^3.4.0
 
 dependencies:
-   # sherpa_onnx: ^1.10.17
+   # sherpa_onnx: ^1.10.18
   sherpa_onnx:
     path: ../../flutter/sherpa_onnx
   path: ^1.9.0
--- a/scripts/dart/sherpa-onnx-pubspec.yaml
查看文件 @994c3e7
+++ b/scripts/dart/sherpa-onnx-pubspec.yaml
查看文件 @994c3e7
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection
 
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
- version: 1.10.17
+ version: 1.10.18
 
 homepage: https://github.com/k2-fsa/sherpa-onnx
 
--- a/scripts/node-addon-api/lib/vad.js
查看文件 @994c3e7
+++ b/scripts/node-addon-api/lib/vad.js
查看文件 @994c3e7
@@ -65,7 +65,7 @@ config = {
   }
 
   clear() {
-     addon.VoiceActivityDetectorClearWrapper(this.handle);
+     addon.voiceActivityDetectorClear(this.handle);
   }
 
   /*
@@ -79,11 +79,11 @@ config = {
   }
 
   reset() {
-     addon.VoiceActivityDetectorResetWrapper(this.handle);
+     addon.voiceActivityDetectorReset(this.handle);
   }
 
   flush() {
-     addon.VoiceActivityDetectorFlushWrapper(this.handle);
+     addon.voiceActivityDetectorFlush(this.handle);
   }
 }
 
--- a/sherpa-onnx/csrc/offline-stream.cc
查看文件 @994c3e7
+++ b/sherpa-onnx/csrc/offline-stream.cc
查看文件 @994c3e7
@@ -306,8 +306,7 @@ std::string OfflineRecognitionResult::AsJsonString() const {
   os << "{";
   os << "\"text\""
      << ": ";
-   os << "\"" << text << "\""
-      << ", ";
+   os << std::quoted(text) << ", ";
 
   os << "\""
      << "timestamps"
@@ -339,7 +338,7 @@ std::string OfflineRecognitionResult::AsJsonString() const {
          << "\"";
       os.flags(oldFlags);
     } else {
-       os << sep << "\"" << t << "\"";
+       os << sep << std::quoted(t);
     }
     sep = ", ";
   }
--- a/sherpa-onnx/csrc/online-recognizer.cc
查看文件 @994c3e7
+++ b/sherpa-onnx/csrc/online-recognizer.cc
查看文件 @994c3e7
@@ -44,7 +44,7 @@ std::string VecToString<std::string>(const std::vector<std::string> &vec,
   oss << "[";
   std::string sep = "";
   for (const auto &item : vec) {
-     oss << sep << "\"" << item << "\"";
+     oss << sep << std::quoted(item);
     sep = ", ";
   }
   oss << "]";
@@ -54,9 +54,7 @@ std::string VecToString<std::string>(const std::vector<std::string> &vec,
 std::string OnlineRecognizerResult::AsJsonString() const {
   std::ostringstream os;
   os << "{ ";
-   os << "\"text\": "
-      << "\"" << text << "\""
-      << ", ";
+   os << "\"text\": " << std::quoted(text) << ", ";
   os << "\"tokens\": " << VecToString(tokens) << ", ";
   os << "\"timestamps\": " << VecToString(timestamps, 2) << ", ";
   os << "\"ys_probs\": " << VecToString(ys_probs, 6) << ", ";