JavaScript API (node-addon) for speaker diarization (#1408)

Committed by GitHub

Showing 11 changed files with 443 additions and 13 deletions.
**`package.json` template (`sherpa-onnx-PLATFORM2-ARCH` platform packages)**

```diff
@@ -1,7 +1,7 @@
 {
   "name": "sherpa-onnx-PLATFORM2-ARCH",
   "version": "SHERPA_ONNX_VERSION",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "main": "index.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
@@ -16,8 +16,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
@@ -30,13 +40,13 @@
     "offline",
     "privacy",
     "open source",
-    "vad",
-    "speaker id",
-    "language id",
-    "node-addon-api",
     "streaming speech recognition",
     "speech",
-    "recognition"
+    "recognition",
+    "vad",
+    "node-addon-api",
+    "speaker id",
+    "language id"
   ],
   "author": "The next-gen Kaldi team",
   "license": "Apache-2.0",
```
**`package.json` (`sherpa-onnx-node`)**

```diff
@@ -1,7 +1,7 @@
 {
   "name": "sherpa-onnx-node",
   "version": "SHERPA_ONNX_VERSION",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "main": "sherpa-onnx.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
@@ -16,8 +16,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
@@ -30,13 +40,13 @@
     "offline",
     "privacy",
     "open source",
-    "vad",
-    "speaker id",
-    "language id",
-    "node-addon-api",
     "streaming speech recognition",
     "speech",
-    "recognition"
+    "recognition",
+    "vad",
+    "node-addon-api",
+    "speaker id",
+    "language id"
   ],
   "author": "The next-gen Kaldi team",
   "license": "Apache-2.0",
```
**Shell test script**

```diff
@@ -10,6 +10,20 @@ arch=$(node -p "require('os').arch()")
 platform=$(node -p "require('os').platform()")
 node_version=$(node -p "process.versions.node.split('.')[0]")
 
+echo "----------non-streaming speaker diarization----------"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test_offline_speaker_diarization.js
+
+rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
+
 echo "----------non-streaming asr + vad----------"
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
 tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
```
**`README.md` (Node.js addon examples)**

````diff
@@ -43,6 +43,12 @@ export LD_LIBRARY_PATH=$PWD/node_modules/.pnpm/sherpa-onnx-node@<REPLACE-THIS-WI
 
 The following tables list the examples in this folder.
 
+## Speaker diarization
+
+|File| Description|
+|---|---|
+|[./test_offline_speaker_diarization.js](./test_offline_speaker_diarization.js)| It demonstrates how to use the sherpa-onnx JavaScript API for speaker diarization. It supports speaker segmentation models from [pyannote-audio](https://github.com/pyannote/pyannote-audio)|
+
 ## Add punctuations to text
 
 |File| Description|
@@ -130,6 +136,21 @@ The following tables list the examples in this folder.
 |[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|
 
 
+### Speaker diarization
+
+```bash
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test_offline_speaker_diarization.js
+```
+
 ### Voice Activity detection (VAD)
 
 ```bash
````
**New file: `test_offline_speaker_diarization.js`**

```js
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');

// clang-format off
/* Please use the following commands to download the files
   used in this script:

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

 */
// clang-format on

const config = {
  segmentation: {
    pyannote: {
      model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
    },
  },
  embedding: {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  },
  clustering: {
    // Since we know that the test wave file
    // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
    // here. If you don't have such information, please set numClusters to -1.
    numClusters: 4,

    // If numClusters is not -1, then threshold is ignored.
    //
    // A larger threshold leads to fewer clusters, i.e., fewer speakers.
    // A smaller threshold leads to more clusters, i.e., more speakers.
    // You need to tune it yourself.
    threshold: 0.5,
  },

  // If a segment is shorter than minDurationOn, we discard it
  minDurationOn: 0.2,  // in seconds

  // If the gap between two segments is less than minDurationOff, then we
  // merge these two segments into a single one
  minDurationOff: 0.5,  // in seconds
};

const waveFilename = './0-four-speakers-zh.wav';

const sd = new sherpa_onnx.OfflineSpeakerDiarization(config);
console.log('Started');

const wave = sherpa_onnx.readWave(waveFilename);
if (sd.sampleRate != wave.sampleRate) {
  throw new Error(
      `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
}

const segments = sd.process(wave.samples);
console.log(segments);
```
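The script ends by dumping the raw segments array. As an illustrative follow-up (not part of this PR), the result could be rendered one speaker turn per line; the field names follow the wrapper's documented result shape:

```js
// Sketch: pretty-print the diarization segments returned by sd.process().
// Each segment is a plain object with numeric start, end, and speaker fields.
for (const s of segments) {
  console.log(
      `${s.start.toFixed(3)} -- ${s.end.toFixed(3)} speaker_${s.speaker}`);
}
```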
**`CMakeLists.txt`**

```diff
@@ -21,6 +21,7 @@ set(srcs
   src/audio-tagging.cc
   src/keyword-spotting.cc
   src/non-streaming-asr.cc
+  src/non-streaming-speaker-diarization.cc
   src/non-streaming-tts.cc
   src/punctuation.cc
   src/sherpa-onnx-node-addon-api.cc
```
**New file: `lib/non-streaming-speaker-diarization.js`**

```js
const addon = require('./addon.js');

class OfflineSpeakerDiarization {
  constructor(config) {
    this.handle = addon.createOfflineSpeakerDiarization(config);
    this.config = config;

    this.sampleRate = addon.getOfflineSpeakerDiarizationSampleRate(this.handle);
  }

  /**
   * samples is a 1-D float32 array. Each element of the array should be
   * in the range [-1, 1].
   *
   * We assume its sample rate equals this.sampleRate.
   *
   * Returns an array of objects, where each object is
   *
   *  {
   *    "start": start_time_in_seconds,
   *    "end": end_time_in_seconds,
   *    "speaker": an_integer,
   *  }
   */
  process(samples) {
    return addon.offlineSpeakerDiarizationProcess(this.handle, samples);
  }
}

module.exports = {
  OfflineSpeakerDiarization,
};
```
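A note on configuration: the example script earlier sets `numClusters` to 4 because the speaker count of the test file is known. When it is not, the script's own comments say to use -1 so that the threshold drives clustering. A hedged sketch of that variant:

```js
// Sketch: clustering config when the number of speakers is unknown.
// With numClusters set to -1, threshold takes effect; per the comments in
// the example script, a larger threshold yields fewer speakers.
const clusteringUnknownSpeakers = {
  numClusters: -1,
  threshold: 0.5,  // tune per your data
};
```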
**`lib/sherpa-onnx.js`**

```diff
@@ -8,6 +8,7 @@ const sid = require('./speaker-identification.js');
 const at = require('./audio-tagging.js');
 const punct = require('./punctuation.js');
 const kws = require('./keyword-spotter.js');
+const sd = require('./non-streaming-speaker-diarization.js');
 
 module.exports = {
   OnlineRecognizer: streaming_asr.OnlineRecognizer,
@@ -24,4 +25,5 @@ module.exports = {
   AudioTagging: at.AudioTagging,
   Punctuation: punct.Punctuation,
   KeywordSpotter: kws.KeywordSpotter,
+  OfflineSpeakerDiarization: sd.OfflineSpeakerDiarization,
 }
```
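With the export added above, the new class is reachable from the package root like the existing APIs; a one-line sketch (the package name comes from the `sherpa-onnx-node` package.json shown earlier):

```js
// Sketch: import the new class the same way as the other exports.
const {OfflineSpeakerDiarization} = require('sherpa-onnx-node');
```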
**`package.json` (addon package, built with `cmake-js`)**

```diff
@@ -1,7 +1,7 @@
 {
   "main": "lib/sherpa-onnx.js",
   "version": "1.0.0",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "dependencies": {
     "cmake-js": "^6.0.0",
     "node-addon-api": "^1.1.0",
@@ -21,8 +21,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
```
**New file: `scripts/node-addon-api/src/non-streaming-speaker-diarization.cc`**

```cpp
// scripts/node-addon-api/src/non-streaming-speaker-diarization.cc
//
// Copyright (c) 2024 Xiaomi Corporation

#include <algorithm>
#include <sstream>

#include "macros.h"  // NOLINT
#include "napi.h"    // NOLINT
#include "sherpa-onnx/c-api/c-api.h"

static SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
GetOfflineSpeakerSegmentationPyannoteModelConfig(Napi::Object obj) {
  SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("pyannote") || !obj.Get("pyannote").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("pyannote").As<Napi::Object>();
  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);

  return c;
}

static SherpaOnnxOfflineSpeakerSegmentationModelConfig
GetOfflineSpeakerSegmentationModelConfig(Napi::Object obj) {
  SherpaOnnxOfflineSpeakerSegmentationModelConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("segmentation") || !obj.Get("segmentation").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("segmentation").As<Napi::Object>();

  c.pyannote = GetOfflineSpeakerSegmentationPyannoteModelConfig(o);

  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

  if (o.Has("debug") &&
      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
    if (o.Get("debug").IsBoolean()) {
      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
    } else {
      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
    }
  }

  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);

  return c;
}

static SherpaOnnxSpeakerEmbeddingExtractorConfig
GetSpeakerEmbeddingExtractorConfig(Napi::Object obj) {
  SherpaOnnxSpeakerEmbeddingExtractorConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("embedding") || !obj.Get("embedding").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("embedding").As<Napi::Object>();

  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

  if (o.Has("debug") &&
      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
    if (o.Get("debug").IsBoolean()) {
      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
    } else {
      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
    }
  }

  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);

  return c;
}

static SherpaOnnxFastClusteringConfig GetFastClusteringConfig(
    Napi::Object obj) {
  SherpaOnnxFastClusteringConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("clustering") || !obj.Get("clustering").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("clustering").As<Napi::Object>();

  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_clusters, numClusters);
  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold);

  return c;
}

static Napi::External<SherpaOnnxOfflineSpeakerDiarization>
CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
  if (info.Length() != 1) {
    std::ostringstream os;
    os << "Expect only 1 argument. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsObject()) {
    Napi::TypeError::New(env, "Expect an object as the argument")
        .ThrowAsJavaScriptException();

    return {};
  }

  Napi::Object o = info[0].As<Napi::Object>();

  SherpaOnnxOfflineSpeakerDiarizationConfig c;
  memset(&c, 0, sizeof(c));

  c.segmentation = GetOfflineSpeakerSegmentationModelConfig(o);
  c.embedding = GetSpeakerEmbeddingExtractorConfig(o);
  c.clustering = GetFastClusteringConfig(o);

  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_on, minDurationOn);
  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_off, minDurationOff);

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      SherpaOnnxCreateOfflineSpeakerDiarization(&c);

  if (c.segmentation.pyannote.model) {
    delete[] c.segmentation.pyannote.model;
  }

  if (c.segmentation.provider) {
    delete[] c.segmentation.provider;
  }

  if (c.embedding.model) {
    delete[] c.embedding.model;
  }

  if (c.embedding.provider) {
    delete[] c.embedding.provider;
  }

  if (!sd) {
    Napi::TypeError::New(env, "Please check your config!")
        .ThrowAsJavaScriptException();

    return {};
  }

  return Napi::External<SherpaOnnxOfflineSpeakerDiarization>::New(
      env, const_cast<SherpaOnnxOfflineSpeakerDiarization *>(sd),
      [](Napi::Env env, SherpaOnnxOfflineSpeakerDiarization *sd) {
        SherpaOnnxDestroyOfflineSpeakerDiarization(sd);
      });
}

static Napi::Number OfflineSpeakerDiarizationGetSampleRateWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 1) {
    std::ostringstream os;
    os << "Expect only 1 argument. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "Argument 0 should be an offline speaker diarization pointer.")
        .ThrowAsJavaScriptException();

    return {};
  }

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();

  int32_t sample_rate = SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd);

  return Napi::Number::New(env, sample_rate);
}

static Napi::Array OfflineSpeakerDiarizationProcessWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 2) {
    std::ostringstream os;
    os << "Expect only 2 arguments. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "Argument 0 should be an offline speaker diarization pointer.")
        .ThrowAsJavaScriptException();

    return {};
  }

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();

  if (!info[1].IsTypedArray()) {
    Napi::TypeError::New(env, "Argument 1 should be a typed array")
        .ThrowAsJavaScriptException();

    return {};
  }

  Napi::Float32Array samples = info[1].As<Napi::Float32Array>();

  const SherpaOnnxOfflineSpeakerDiarizationResult *r =
      SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.Data(),
                                                 samples.ElementLength());

  int32_t num_segments =
      SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r);

  const SherpaOnnxOfflineSpeakerDiarizationSegment *segments =
      SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(r);

  Napi::Array ans = Napi::Array::New(env, num_segments);

  for (int32_t i = 0; i != num_segments; ++i) {
    Napi::Object obj = Napi::Object::New(env);
    obj.Set(Napi::String::New(env, "start"), segments[i].start);
    obj.Set(Napi::String::New(env, "end"), segments[i].end);
    obj.Set(Napi::String::New(env, "speaker"), segments[i].speaker);

    ans[i] = obj;
  }

  SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments);
  SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r);

  return ans;
}

void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) {
  exports.Set(Napi::String::New(env, "createOfflineSpeakerDiarization"),
              Napi::Function::New(env, CreateOfflineSpeakerDiarizationWrapper));

  exports.Set(
      Napi::String::New(env, "getOfflineSpeakerDiarizationSampleRate"),
      Napi::Function::New(env, OfflineSpeakerDiarizationGetSampleRateWrapper));

  exports.Set(
      Napi::String::New(env, "offlineSpeakerDiarizationProcess"),
      Napi::Function::New(env, OfflineSpeakerDiarizationProcessWrapper));
}
```
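Two design points worth noting: the strings copied out of the JS object by `SHERPA_ONNX_ASSIGN_ATTR_STR` are freed with `delete[]` right after the C object is created, and the `Napi::External` finalizer destroys the underlying diarizer once the JS wrapper is garbage collected. For reference, here is a sketch of every configuration key the parsing code above reads; the `numThreads`, `debug`, and `provider` values are illustrative, not defaults taken from the source:

```js
// Sketch: all keys parsed by the C++ above. Only the model paths matter in
// the example script; the rest are optional and shown with example values.
const fullConfig = {
  segmentation: {
    pyannote: {model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx'},
    numThreads: 1,    // -> c.segmentation.num_threads
    debug: true,      // boolean or number are both accepted
    provider: 'cpu',  // -> c.segmentation.provider
  },
  embedding: {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
    numThreads: 1,
    debug: true,
    provider: 'cpu',
  },
  clustering: {numClusters: 4, threshold: 0.5},
  minDurationOn: 0.2,   // seconds
  minDurationOff: 0.5,  // seconds
};
```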
**`src/sherpa-onnx-node-addon-api.cc`**

```diff
@@ -25,6 +25,8 @@ void InitPunctuation(Napi::Env env, Napi::Object exports);
 
 void InitKeywordSpotting(Napi::Env env, Napi::Object exports);
 
+void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports);
+
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitStreamingAsr(env, exports);
   InitNonStreamingAsr(env, exports);
@@ -37,6 +39,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitAudioTagging(env, exports);
   InitPunctuation(env, exports);
   InitKeywordSpotting(env, exports);
+  InitNonStreamingSpeakerDiarization(env, exports);
 
   return exports;
 }
```