Fangjun Kuang
Committed by GitHub

JavaScript API with WebAssembly for speaker diarization (#1414)

#1408 uses [node-addon-api](https://github.com/nodejs/node-addon-api) to call the C API from JavaScript, whereas this pull request uses WebAssembly to do so.
@@ -9,6 +9,18 @@ git status @@ -9,6 +9,18 @@ git status
9 ls -lh 9 ls -lh
10 ls -lh node_modules 10 ls -lh node_modules
11 11
  12 +echo '-----speaker diarization----------'
  13 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  14 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  15 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  16 +
  17 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  18 +
  19 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  20 +
  21 +node ./test-offline-speaker-diarization.js
  22 +rm -rfv *.wav *.onnx sherpa-onnx-pyannote-*
  23 +
12 echo '-----vad+whisper----------' 24 echo '-----vad+whisper----------'
13 25
14 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 26 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
@@ -139,7 +139,7 @@ jobs: @@ -139,7 +139,7 @@ jobs:
139 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH 139 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
140 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH 140 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
141 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH 141 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
142 - export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH 142 + export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH
143 143
144 which sherpa-onnx 144 which sherpa-onnx
145 sherpa-onnx --help 145 sherpa-onnx --help
@@ -104,7 +104,7 @@ jobs: @@ -104,7 +104,7 @@ jobs:
104 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH 104 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
105 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH 105 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
106 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH 106 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
107 - export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH 107 + export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH
108 108
109 sherpa-onnx --help 109 sherpa-onnx --help
110 sherpa-onnx-keyword-spotter --help 110 sherpa-onnx-keyword-spotter --help
@@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa @@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa
22 for text-to-speech and speech-to-text. 22 for text-to-speech and speech-to-text.
23 23
24 24
  25 +# Speaker diarization
  26 +
  27 +In the following, we demonstrate how to run speaker diarization.
  28 +
  29 +```bash
  30 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  31 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  32 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  33 +
  34 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  35 +
  36 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  37 +
  38 +node ./test-offline-speaker-diarization.js
  39 +```
  40 +
25 # Text-to-speech 41 # Text-to-speech
26 42
27 In the following, we demonstrate how to run text-to-speech. 43 In the following, we demonstrate how to run text-to-speech.
  1 +// Copyright (c) 2024 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx');
  3 +
  4 +// clang-format off
  5 +/* Please use the following commands to download files
  6 + used in this script
  7 +
  8 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  9 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  10 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  11 +
  12 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  13 +
  14 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  15 +
  16 + */
  17 +// clang-format on
  18 +
  19 +const config = {
  20 + segmentation: {
  21 + pyannote: {
  22 + model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
  23 + debug: 1,
  24 + },
  25 + },
  26 + embedding: {
  27 + model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  28 + debug: 1,
  29 + },
  30 + clustering: {
  31 + // Since we know that the test wave file
  32 + // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
  33 + // here. If you don't have such information, please set numClusters to -1.
  34 + numClusters: 4,
  35 +
  36 + // If numClusters is not -1, then threshold is ignored.
  37 + //
  38 + // A larger threshold leads to fewer clusters, i.e., fewer speakers
  39 + // A smaller threshold leads to more clusters, i.e., more speakers
  40 + // You need to tune it yourself.
  41 + threshold: 0.5,
  42 + },
  43 +
  44 + // If a segment is shorter than minDurationOn, we discard it
  45 + minDurationOn: 0.2, // in seconds
  46 +
  47 + // If the gap between two segments is less than minDurationOff, then we
  48 + // merge these two segments into a single one
  49 + minDurationOff: 0.5, // in seconds
  50 +};
  51 +
  52 +const waveFilename = './0-four-speakers-zh.wav';
  53 +
  54 +const sd = sherpa_onnx.createOfflineSpeakerDiarization(config);
  55 +console.log('Started')
  56 +
  57 +const wave = sherpa_onnx.readWave(waveFilename);
  58 +if (sd.sampleRate != wave.sampleRate) {
  59 + throw new Error(
  60 + `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
  61 +}
  62 +
  63 +const segments = sd.process(wave.samples);
  64 +console.log(segments);
@@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js'); @@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js');
7 const sherpa_onnx_kws = require('./sherpa-onnx-kws.js'); 7 const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
8 const sherpa_onnx_wave = require('./sherpa-onnx-wave.js'); 8 const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
9 const sherpa_onnx_vad = require('./sherpa-onnx-vad.js'); 9 const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
  10 +const sherpa_onnx_speaker_diarization =
  11 + require('./sherpa-onnx-speaker-diarization.js');
10 12
11 function createOnlineRecognizer(config) { 13 function createOnlineRecognizer(config) {
12 return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config); 14 return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
@@ -32,6 +34,11 @@ function createVad(config) { @@ -32,6 +34,11 @@ function createVad(config) {
32 return sherpa_onnx_vad.createVad(wasmModule, config); 34 return sherpa_onnx_vad.createVad(wasmModule, config);
33 } 35 }
34 36
  37 +function createOfflineSpeakerDiarization(config) {
  38 + return sherpa_onnx_speaker_diarization.createOfflineSpeakerDiarization(
  39 + wasmModule, config);
  40 +}
  41 +
35 function readWave(filename) { 42 function readWave(filename) {
36 return sherpa_onnx_wave.readWave(filename, wasmModule); 43 return sherpa_onnx_wave.readWave(filename, wasmModule);
37 } 44 }
@@ -51,4 +58,5 @@ module.exports = { @@ -51,4 +58,5 @@ module.exports = {
51 writeWave, 58 writeWave,
52 createCircularBuffer, 59 createCircularBuffer,
53 createVad, 60 createVad,
  61 + createOfflineSpeakerDiarization,
54 }; 62 };
@@ -70,6 +70,17 @@ set(exported_functions @@ -70,6 +70,17 @@ set(exported_functions
70 SherpaOnnxDestroySpeechSegment 70 SherpaOnnxDestroySpeechSegment
71 SherpaOnnxVoiceActivityDetectorReset 71 SherpaOnnxVoiceActivityDetectorReset
72 SherpaOnnxVoiceActivityDetectorFlush 72 SherpaOnnxVoiceActivityDetectorFlush
  73 + # Speaker diarization
  74 + SherpaOnnxCreateOfflineSpeakerDiarization
  75 + SherpaOnnxDestroyOfflineSpeakerDiarization
  76 + SherpaOnnxOfflineSpeakerDiarizationDestroyResult
  77 + SherpaOnnxOfflineSpeakerDiarizationDestroySegment
  78 + SherpaOnnxOfflineSpeakerDiarizationGetSampleRate
  79 + SherpaOnnxOfflineSpeakerDiarizationProcess
  80 + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback
  81 + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments
  82 + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime
  83 + SherpaOnnxOfflineSpeakerDiarizationSetConfig
73 # 84 #
74 SherpaOnnxFileExists 85 SherpaOnnxFileExists
75 SherpaOnnxReadWave 86 SherpaOnnxReadWave
@@ -109,6 +120,7 @@ install( @@ -109,6 +120,7 @@ install(
109 ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js 120 ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
110 ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js 121 ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
111 ${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js 122 ${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js
  123 + ${CMAKE_SOURCE_DIR}/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
112 ${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js 124 ${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js
113 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js" 125 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
114 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm" 126 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
@@ -12,7 +12,6 @@ Remember to rename the downloaded files. @@ -12,7 +12,6 @@ Remember to rename the downloaded files.
12 12
13 The following is an example. 13 The following is an example.
14 14
15 -  
16 ```bash 15 ```bash
17 cd wasm/speaker-diarization/assets/ 16 cd wasm/speaker-diarization/assets/
18 17
@@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 @@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
22 cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx 21 cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
23 rm -rf sherpa-onnx-pyannote-segmentation-3-0 22 rm -rf sherpa-onnx-pyannote-segmentation-3-0
24 23
25 -  
26 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx 24 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
27 mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx 25 mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx
28 -  
29 -  
30 ``` 26 ```
@@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) { @@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) {
64 Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); 64 Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
65 offset += 4; 65 offset += 4;
66 66
67 - Module.setValue(ptr + offset, config.debug || 1, 'i32'); 67 + Module.setValue(ptr + offset, config.debug || 0, 'i32');
68 offset += 4; 68 offset += 4;
69 69
70 const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; 70 const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
@@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) { @@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) {
103 Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); 103 Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
104 offset += 4; 104 offset += 4;
105 105
106 - Module.setValue(ptr + offset, config.debug || 1, 'i32'); 106 + Module.setValue(ptr + offset, config.debug || 0, 'i32');
107 offset += 4; 107 offset += 4;
108 108
109 Module.setValue(ptr + offset, buffer + modelLen, 'i8*'); 109 Module.setValue(ptr + offset, buffer + modelLen, 'i8*');
@@ -270,11 +270,15 @@ class OfflineSpeakerDiarization { @@ -270,11 +270,15 @@ class OfflineSpeakerDiarization {
270 } 270 }
271 271
272 function createOfflineSpeakerDiarization(Module, myConfig) { 272 function createOfflineSpeakerDiarization(Module, myConfig) {
273 - const config = { 273 + let config = {
274 segmentation: { 274 segmentation: {
275 pyannote: {model: './segmentation.onnx'}, 275 pyannote: {model: './segmentation.onnx'},
  276 + debug: 1,
  277 + },
  278 + embedding: {
  279 + model: './embedding.onnx',
  280 + debug: 1,
276 }, 281 },
277 - embedding: {model: './embedding.onnx'},  
278 clustering: {numClusters: -1, threshold: 0.5}, 282 clustering: {numClusters: -1, threshold: 0.5},
279 minDurationOn: 0.3, 283 minDurationOn: 0.3,
280 minDurationOff: 0.5, 284 minDurationOff: 0.5,