JavaScript API with WebAssembly for speaker diarization (#1414)

#1408 uses [node-addon-api](https://github.com/nodejs/node-addon-api) to call C API from JavaScript, whereas this pull request uses WebAssembly to call C API from JavaScript.

JavaScript API with WebAssembly for speaker diarization (#1414)
#1408 uses [node-addon-api](https://github.com/nodejs/node-addon-api) to call C API from JavaScript, whereas this pull request uses WebAssembly to call C API from JavaScript.
Fangjun Kuang · GitHub
Commit eefc17209589fe3b950f561bc9c8b8d1c9b8a742 eefc1720 1 parent f1b311ee
.github/scripts/test-nodejs-npm.sh
.github/workflows/test-build-wheel.yaml
.github/workflows/test-pip-install.yaml
nodejs-examples/README.md
nodejs-examples/test-offline-speaker-diarization.js
scripts/nodejs/index.js
wasm/nodejs/CMakeLists.txt
wasm/speaker-diarization/assets/README.md
wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
--- a/.github/scripts/test-nodejs-npm.sh
查看文件 @eefc172
+++ b/.github/scripts/test-nodejs-npm.sh
查看文件 @eefc172
@@ -9,6 +9,18 @@ git status
 ls -lh
 ls -lh node_modules
 
+ echo '-----speaker diarization----------'
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+ 
+ node ./test-offline-speaker-diarization.js
+ rm -rfv *.wav *.onnx sherpa-onnx-pyannote-*
+ 
 echo '-----vad+whisper----------'
 
 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
--- a/.github/workflows/test-build-wheel.yaml
查看文件 @eefc172
+++ b/.github/workflows/test-build-wheel.yaml
查看文件 @eefc172
@@ -139,7 +139,7 @@ jobs:
           export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-           export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
+           export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH
 
           which sherpa-onnx
           sherpa-onnx --help
--- a/.github/workflows/test-pip-install.yaml
查看文件 @eefc172
+++ b/.github/workflows/test-pip-install.yaml
查看文件 @eefc172
@@ -104,7 +104,7 @@ jobs:
           export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
           export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-           export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
+           export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH
 
           sherpa-onnx --help
           sherpa-onnx-keyword-spotter --help
--- a/nodejs-examples/README.md
查看文件 @eefc172
+++ b/nodejs-examples/README.md
查看文件 @eefc172
@@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa
 for text-to-speech and speech-to-text.
 
 
+ # Speaker diarization
+ 
+ In the following, we demonstrate how to run speaker diarization.
+ 
+ ```bash
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+ 
+ node ./test-offline-speaker-diarization.js
+ ```
+ 
 # Text-to-speech
 
 In the following, we demonstrate how to run text-to-speech.
--- a/nodejs-examples/test-offline-speaker-diarization.js 0 → 100644
查看文件 @eefc172
+++ b/nodejs-examples/test-offline-speaker-diarization.js 0 → 100644
查看文件 @eefc172
+ // Copyright (c)  2024  Xiaomi Corporation
+ const sherpa_onnx = require('sherpa-onnx');
+ 
+ // clang-format off
+ /* Please use the following commands to download files
+    used in this script
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+ 
+ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+ 
+  */
+ // clang-format on
+ 
+ // Configuration passed to createOfflineSpeakerDiarization().
+ const config = {
+   segmentation: {
+     pyannote: {
+       model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
+     },
+     // debug is a field of the segmentation config itself (a sibling of
+     // pyannote), so it must be set here rather than inside pyannote
+     debug: 1,
+   },
+   embedding: {
+     model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
+     debug: 1,
+   },
+   clustering: {
+     // since we know that the test wave file
+     // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
+     // here. if you don't have such information, please set numClusters to -1
+     numClusters: 4,
+ 
+     // If numClusters is not -1, then threshold is ignored.
+     //
+     // A larger threshold leads to fewer clusters, i.e., fewer speakers
+     // A smaller threshold leads to more clusters, i.e., more speakers
+     // You need to tune it by yourself.
+     threshold: 0.5,
+   },
+ 
+   // If a segment is shorter than minDurationOn, we discard it
+   minDurationOn: 0.2,  // in seconds
+ 
+   // If the gap between two segments is less than minDurationOff, then we
+   // merge these two segments into a single one
+   minDurationOff: 0.5,  // in seconds
+ };
+ 
+ // Input recording to diarize; see the download commands at the top of
+ // this file.
+ const waveFilename = './0-four-speakers-zh.wav';
+ 
+ const sd = sherpa_onnx.createOfflineSpeakerDiarization(config);
+ console.log('Started');
+ 
+ const wave = sherpa_onnx.readWave(waveFilename);
+ // Refuse to continue if the wave file's sample rate differs from the one
+ // reported by the diarization object.
+ if (sd.sampleRate != wave.sampleRate) {
+   throw new Error(
+       `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
+ }
+ 
+ // Run diarization on the samples and print the returned segments.
+ const segments = sd.process(wave.samples);
+ console.log(segments);
--- a/scripts/nodejs/index.js
查看文件 @eefc172
+++ b/scripts/nodejs/index.js
查看文件 @eefc172
@@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js');
 const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
 const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
 const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
+ const sherpa_onnx_speaker_diarization =
+     require('./sherpa-onnx-speaker-diarization.js');
 
 function createOnlineRecognizer(config) {
   return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
@@ -32,6 +34,11 @@ function createVad(config) {
   return sherpa_onnx_vad.createVad(wasmModule, config);
 }
 
+ function createOfflineSpeakerDiarization(config) {
+   return sherpa_onnx_speaker_diarization.createOfflineSpeakerDiarization(
+       wasmModule, config);
+ }
+ 
 function readWave(filename) {
   return sherpa_onnx_wave.readWave(filename, wasmModule);
 }
@@ -51,4 +58,5 @@ module.exports = {
   writeWave,
   createCircularBuffer,
   createVad,
+   createOfflineSpeakerDiarization,
 };
--- a/wasm/nodejs/CMakeLists.txt
查看文件 @eefc172
+++ b/wasm/nodejs/CMakeLists.txt
查看文件 @eefc172
@@ -70,6 +70,17 @@ set(exported_functions
   SherpaOnnxDestroySpeechSegment
   SherpaOnnxVoiceActivityDetectorReset
   SherpaOnnxVoiceActivityDetectorFlush
+   # Speaker diarization
+   SherpaOnnxCreateOfflineSpeakerDiarization
+   SherpaOnnxDestroyOfflineSpeakerDiarization
+   SherpaOnnxOfflineSpeakerDiarizationDestroyResult
+   SherpaOnnxOfflineSpeakerDiarizationDestroySegment
+   SherpaOnnxOfflineSpeakerDiarizationGetSampleRate
+   SherpaOnnxOfflineSpeakerDiarizationProcess
+   SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback
+   SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments
+   SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime
+   SherpaOnnxOfflineSpeakerDiarizationSetConfig
   #
   SherpaOnnxFileExists
   SherpaOnnxReadWave
@@ -109,6 +120,7 @@ install(
   ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
   ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
   ${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js
+   ${CMAKE_SOURCE_DIR}/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
   ${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js
     "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
     "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
--- a/wasm/speaker-diarization/assets/README.md
查看文件 @eefc172
+++ b/wasm/speaker-diarization/assets/README.md
查看文件 @eefc172
@@ -12,7 +12,6 @@ Remember to rename the downloaded files.
 
 The following is an example.
 
- 
 ```bash
 cd wasm/speaker-diarization/assets/
 
@@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
 cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
 rm -rf sherpa-onnx-pyannote-segmentation-3-0
 
- 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
 mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx
- 
- 
 ```
--- a/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
查看文件 @eefc172
+++ b/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
查看文件 @eefc172
@@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) {
   Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
   offset += 4;
 
-   Module.setValue(ptr + offset, config.debug || 1, 'i32');
+   Module.setValue(ptr + offset, config.debug || 0, 'i32');
   offset += 4;
 
   const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
@@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) {
   Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
   offset += 4;
 
-   Module.setValue(ptr + offset, config.debug || 1, 'i32');
+   Module.setValue(ptr + offset, config.debug || 0, 'i32');
   offset += 4;
 
   Module.setValue(ptr + offset, buffer + modelLen, 'i8*');
@@ -270,11 +270,15 @@ class OfflineSpeakerDiarization {
 }
 
 function createOfflineSpeakerDiarization(Module, myConfig) {
-   const config = {
+   let config = {
     segmentation: {
       pyannote: {model: './segmentation.onnx'},
+       debug: 1,
+     },
+     embedding: {
+       model: './embedding.onnx',
+       debug: 1,
     },
-     embedding: {model: './embedding.onnx'},
     clustering: {numClusters: -1, threshold: 0.5},
     minDurationOn: 0.3,
     minDurationOff: 0.5,