JavaScript API with WebAssembly for speaker diarization (#1414)
#1408 uses [node-addon-api](https://github.com/nodejs/node-addon-api) to call the C API from JavaScript, whereas this pull request uses WebAssembly to call the C API from JavaScript.
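For context, the high-level usage pattern this PR introduces is: build a plain config object, create an offline speaker diarization object from the WebAssembly module, read a wave file, and process its samples. The snippet below is a minimal sketch distilled from the test script added in this PR (`test-offline-speaker-diarization.js`); the model paths, wave file, and speaker count are taken from that example and are not requirements of the API.

```js
// Minimal sketch of the WebAssembly-backed JavaScript API added in this PR.
// Paths follow the PR's test script; adjust them to your own files.
const sherpa_onnx = require('sherpa-onnx');

const config = {
  segmentation: {
    pyannote: {model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx'},
  },
  embedding: {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  },
  // Use the known number of speakers if you have it, or -1 to let the
  // clustering threshold decide.
  clustering: {numClusters: 4, threshold: 0.5},
};

const sd = sherpa_onnx.createOfflineSpeakerDiarization(config);

// The wave file must match the diarizer's expected sample rate.
const wave = sherpa_onnx.readWave('./0-four-speakers-zh.wav');
if (sd.sampleRate != wave.sampleRate) {
  throw new Error(`Expected ${sd.sampleRate} Hz, got ${wave.sampleRate} Hz`);
}

// Each returned segment describes who spoke during which time range.
const segments = sd.process(wave.samples);
console.log(segments);
```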
Showing 9 changed files with 122 additions and 10 deletions.
@@ -9,6 +9,18 @@ git status
 ls -lh
 ls -lh node_modules

+echo '-----speaker diarization----------'
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test-offline-speaker-diarization.js
+rm -rfv *.wav *.onnx sherpa-onnx-pyannote-*
+
 echo '-----vad+whisper----------'

 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
@@ -139,7 +139,7 @@ jobs:
 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
+export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH

 which sherpa-onnx
 sherpa-onnx --help
@@ -104,7 +104,7 @@ jobs:
 export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
 export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
 export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
+export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH

 sherpa-onnx --help
 sherpa-onnx-keyword-spotter --help
@@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa
 for text-to-speech and speech-to-text.


+# Speaker diarization
+
+In the following, we demonstrate how to run speaker diarization.
+
+```bash
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test-offline-speaker-diarization.js
+```
+
 # Text-to-speech

 In the following, we demonstrate how to run text-to-speech.
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx');
+
+// clang-format off
+/* Please use the following commands to download files
+   used in this script
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+ */
+// clang-format on
+
+const config = {
+  segmentation: {
+    pyannote: {
+      model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
+      debug: 1,
+    },
+  },
+  embedding: {
+    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
+    debug: 1,
+  },
+  clustering: {
+    // since we know that the test wave file
+    // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
+    // here. if you don't have such information, please set numClusters to -1
+    numClusters: 4,
+
+    // If numClusters is not -1, then threshold is ignored.
+    //
+    // A larger threshold leads to fewer clusters, i.e., fewer speakers
+    // A smaller threshold leads to more clusters, i.e., more speakers
+    // You need to tune it by yourself.
+    threshold: 0.5,
+  },
+
+  // If a segment is shorter than minDurationOn, we discard it
+  minDurationOn: 0.2,  // in seconds
+
+  // If the gap between two segments is less than minDurationOff, then we
+  // merge these two segments into a single one
+  minDurationOff: 0.5,  // in seconds
+};
+
+const waveFilename = './0-four-speakers-zh.wav';
+
+const sd = sherpa_onnx.createOfflineSpeakerDiarization(config);
+console.log('Started')
+
+const wave = sherpa_onnx.readWave(waveFilename);
+if (sd.sampleRate != wave.sampleRate) {
+  throw new Error(
+      `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
+}
+
+const segments = sd.process(wave.samples);
+console.log(segments);
@@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js');
 const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
 const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
 const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
+const sherpa_onnx_speaker_diarization =
+    require('./sherpa-onnx-speaker-diarization.js');

 function createOnlineRecognizer(config) {
   return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
@@ -32,6 +34,11 @@ function createVad(config) {
   return sherpa_onnx_vad.createVad(wasmModule, config);
 }

+function createOfflineSpeakerDiarization(config) {
+  return sherpa_onnx_speaker_diarization.createOfflineSpeakerDiarization(
+      wasmModule, config);
+}
+
 function readWave(filename) {
   return sherpa_onnx_wave.readWave(filename, wasmModule);
 }
@@ -51,4 +58,5 @@ module.exports = {
   writeWave,
   createCircularBuffer,
   createVad,
+  createOfflineSpeakerDiarization,
 };
@@ -70,6 +70,17 @@ set(exported_functions
   SherpaOnnxDestroySpeechSegment
   SherpaOnnxVoiceActivityDetectorReset
   SherpaOnnxVoiceActivityDetectorFlush
+  # Speaker diarization
+  SherpaOnnxCreateOfflineSpeakerDiarization
+  SherpaOnnxDestroyOfflineSpeakerDiarization
+  SherpaOnnxOfflineSpeakerDiarizationDestroyResult
+  SherpaOnnxOfflineSpeakerDiarizationDestroySegment
+  SherpaOnnxOfflineSpeakerDiarizationGetSampleRate
+  SherpaOnnxOfflineSpeakerDiarizationProcess
+  SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback
+  SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments
+  SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime
+  SherpaOnnxOfflineSpeakerDiarizationSetConfig
   #
   SherpaOnnxFileExists
   SherpaOnnxReadWave
@@ -109,6 +120,7 @@ install(
   ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
   ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
   ${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js
+  ${CMAKE_SOURCE_DIR}/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js
   ${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js
   "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
   "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
@@ -12,7 +12,6 @@ Remember to rename the downloaded files.

 The following is an example.

-
 ```bash
 cd wasm/speaker-diarization/assets/

@@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
 cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
 rm -rf sherpa-onnx-pyannote-segmentation-3-0

-
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
 mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx
-
-
 ```
@@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) {
   Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
   offset += 4;

-  Module.setValue(ptr + offset, config.debug || 1, 'i32');
+  Module.setValue(ptr + offset, config.debug || 0, 'i32');
   offset += 4;

   const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
@@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) {
   Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
   offset += 4;

-  Module.setValue(ptr + offset, config.debug || 1, 'i32');
+  Module.setValue(ptr + offset, config.debug || 0, 'i32');
   offset += 4;

   Module.setValue(ptr + offset, buffer + modelLen, 'i8*');
@@ -270,11 +270,15 @@ class OfflineSpeakerDiarization {
 }

 function createOfflineSpeakerDiarization(Module, myConfig) {
-  const config = {
+  let config = {
     segmentation: {
       pyannote: {model: './segmentation.onnx'},
+      debug: 1,
+    },
+    embedding: {
+      model: './embedding.onnx',
+      debug: 1,
     },
-    embedding: {model: './embedding.onnx'},
     clustering: {numClusters: -1, threshold: 0.5},
     minDurationOn: 0.3,
     minDurationOff: 0.5,