Fangjun Kuang

JavaScript API (node-addon) for speaker diarization (#1408)

package.json ("sherpa-onnx-PLATFORM2-ARCH" per-platform package template):

@@ -1,7 +1,7 @@
 {
   "name": "sherpa-onnx-PLATFORM2-ARCH",
   "version": "SHERPA_ONNX_VERSION",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "main": "index.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
@@ -16,8 +16,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
@@ -30,13 +40,13 @@
     "offline",
     "privacy",
     "open source",
-    "vad",
-    "speaker id",
-    "language id",
-    "node-addon-api",
     "streaming speech recognition",
     "speech",
-    "recognition"
+    "recognition",
+    "vad",
+    "node-addon-api",
+    "speaker id",
+    "language id"
   ],
   "author": "The next-gen Kaldi team",
   "license": "Apache-2.0",
package.json ("sherpa-onnx-node" main package):

@@ -1,7 +1,7 @@
 {
   "name": "sherpa-onnx-node",
   "version": "SHERPA_ONNX_VERSION",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "main": "sherpa-onnx.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
@@ -16,8 +16,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
@@ -30,13 +40,13 @@
     "offline",
     "privacy",
     "open source",
-    "vad",
-    "speaker id",
-    "language id",
-    "node-addon-api",
     "streaming speech recognition",
     "speech",
-    "recognition"
+    "recognition",
+    "vad",
+    "node-addon-api",
+    "speaker id",
+    "language id"
   ],
   "author": "The next-gen Kaldi team",
   "license": "Apache-2.0",
CI test script (shell):

@@ -10,6 +10,20 @@ arch=$(node -p "require('os').arch()")
 platform=$(node -p "require('os').platform()")
 node_version=$(node -p "process.versions.node.split('.')[0]")

+echo "----------non-streaming speaker diarization----------"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test_offline_speaker_diarization.js
+
+rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
+
 echo "----------non-streaming asr + vad----------"
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
 tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
Examples README:

@@ -43,6 +43,12 @@ export LD_LIBRARY_PATH=$PWD/node_modules/.pnpm/sherpa-onnx-node@<REPLACE-THIS-WI

 The following tables list the examples in this folder.

+## Speaker diarization
+
+|File| Description|
+|---|---|
+|[./test_offline_speaker_diarization.js](./test_offline_speaker_diarization.js)| It demonstrates how to use the sherpa-onnx JavaScript API for speaker diarization. It supports speaker segmentation models from [pyannote-audio](https://github.com/pyannote/pyannote-audio)|
+
 ## Add punctuations to text

 |File| Description|
@@ -130,6 +136,21 @@ The following tables list the examples in this folder.
 |[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|


+### Speaker diarization
+
+```bash
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+node ./test_offline_speaker_diarization.js
+```
+
 ### Voice Activity detection (VAD)

 ```bash
test_offline_speaker_diarization.js (new file):

// Copyright (c)  2024  Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');

// clang-format off
/* Please use the following commands to download the files
   used in this script

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

 */
// clang-format on

const config = {
  segmentation: {
    pyannote: {
      model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
    },
  },
  embedding: {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  },
  clustering: {
    // Since we know that the test wave file
    // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
    // here. If you don't have such information, please set numClusters to -1.
    numClusters: 4,

    // If numClusters is not -1, then threshold is ignored.
    //
    // A larger threshold leads to fewer clusters, i.e., fewer speakers.
    // A smaller threshold leads to more clusters, i.e., more speakers.
    // You need to tune it by yourself.
    threshold: 0.5,
  },

  // If a segment is shorter than minDurationOn, we discard it.
  minDurationOn: 0.2,  // in seconds

  // If the gap between two segments is less than minDurationOff, then we
  // merge these two segments into a single one.
  minDurationOff: 0.5,  // in seconds
};

const waveFilename = './0-four-speakers-zh.wav';

const sd = new sherpa_onnx.OfflineSpeakerDiarization(config);
console.log('Started');

const wave = sherpa_onnx.readWave(waveFilename);
if (sd.sampleRate != wave.sampleRate) {
  throw new Error(
      `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
}

const segments = sd.process(wave.samples);
console.log(segments);
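The `segments` printed above are plain objects with `start`, `end`, and `speaker` fields (see the JSDoc in the wrapper class later in this diff). A minimal sketch of a more readable printout, assuming only those three documented fields:

```js
// Sketch: pretty-print diarization segments. Times are in seconds;
// `speaker` is the integer cluster index assigned by the clustering step.
for (const s of segments) {
  console.log(
      `${s.start.toFixed(3)} -- ${s.end.toFixed(3)} speaker_${s.speaker}`);
}
```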
scripts/node-addon-api/CMakeLists.txt:

@@ -21,6 +21,7 @@ set(srcs
   src/audio-tagging.cc
   src/keyword-spotting.cc
   src/non-streaming-asr.cc
+  src/non-streaming-speaker-diarization.cc
   src/non-streaming-tts.cc
   src/punctuation.cc
   src/sherpa-onnx-node-addon-api.cc
scripts/node-addon-api/lib/non-streaming-speaker-diarization.js (new file):

const addon = require('./addon.js');

class OfflineSpeakerDiarization {
  constructor(config) {
    this.handle = addon.createOfflineSpeakerDiarization(config);
    this.config = config;

    this.sampleRate = addon.getOfflineSpeakerDiarizationSampleRate(this.handle);
  }

  /**
   * samples is a 1-D float32 array. Each element of the array should be
   * in the range [-1, 1].
   *
   * We assume its sample rate equals this.sampleRate.
   *
   * Returns an array of objects, where each object is
   *
   *  {
   *    "start": start_time_in_seconds,
   *    "end": end_time_in_seconds,
   *    "speaker": an_integer,
   *  }
   */
  process(samples) {
    return addon.offlineSpeakerDiarizationProcess(this.handle, samples);
  }
}

module.exports = {
  OfflineSpeakerDiarization,
};
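A minimal usage sketch for this class, assuming the model files downloaded by the test script above (the paths are those from that script, not defaults of this module):

```js
const sherpa_onnx = require('sherpa-onnx-node');

const sd = new sherpa_onnx.OfflineSpeakerDiarization({
  segmentation: {
    pyannote: {model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx'},
  },
  embedding: {
    model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
  },
  clustering: {numClusters: 4},  // or numClusters: -1 plus a threshold
});

// readWave is exported by sherpa-onnx-node; the wave file must match
// sd.sampleRate, as the test script above checks.
const wave = sherpa_onnx.readWave('./0-four-speakers-zh.wav');
console.log(sd.process(wave.samples));  // [{start, end, speaker}, ...]
```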
scripts/node-addon-api/lib/sherpa-onnx.js:

@@ -8,6 +8,7 @@ const sid = require('./speaker-identification.js');
 const at = require('./audio-tagging.js');
 const punct = require('./punctuation.js');
 const kws = require('./keyword-spotter.js');
+const sd = require('./non-streaming-speaker-diarization.js');

 module.exports = {
   OnlineRecognizer: streaming_asr.OnlineRecognizer,
@@ -24,4 +25,5 @@ module.exports = {
   AudioTagging: at.AudioTagging,
   Punctuation: punct.Punctuation,
   KeywordSpotter: kws.KeywordSpotter,
+  OfflineSpeakerDiarization: sd.OfflineSpeakerDiarization,
 }
package.json (the node-addon-api build package):

@@ -1,7 +1,7 @@
 {
   "main": "lib/sherpa-onnx.js",
   "version": "1.0.0",
-  "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
+  "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "dependencies": {
     "cmake-js": "^6.0.0",
     "node-addon-api": "^1.1.0",
@@ -21,8 +21,18 @@
     "transcription",
     "real-time speech recognition",
     "without internet connection",
+    "locally",
+    "local",
     "embedded systems",
     "open source",
+    "diarization",
+    "speaker diarization",
+    "speaker recognition",
+    "speaker",
+    "speaker segmentation",
+    "speaker verification",
+    "spoken language identification",
+    "sherpa",
     "zipformer",
     "asr",
     "tts",
scripts/node-addon-api/src/non-streaming-speaker-diarization.cc (new file):

// scripts/node-addon-api/src/non-streaming-speaker-diarization.cc
//
// Copyright (c)  2024  Xiaomi Corporation

#include <algorithm>
#include <cstring>  // for memset
#include <sstream>

#include "macros.h"  // NOLINT
#include "napi.h"    // NOLINT
#include "sherpa-onnx/c-api/c-api.h"

static SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
GetOfflineSpeakerSegmentationPyannoteModelConfig(Napi::Object obj) {
  SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("pyannote") || !obj.Get("pyannote").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("pyannote").As<Napi::Object>();
  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);

  return c;
}

static SherpaOnnxOfflineSpeakerSegmentationModelConfig
GetOfflineSpeakerSegmentationModelConfig(Napi::Object obj) {
  SherpaOnnxOfflineSpeakerSegmentationModelConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("segmentation") || !obj.Get("segmentation").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("segmentation").As<Napi::Object>();

  c.pyannote = GetOfflineSpeakerSegmentationPyannoteModelConfig(o);

  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

  if (o.Has("debug") &&
      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
    if (o.Get("debug").IsBoolean()) {
      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
    } else {
      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
    }
  }

  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);

  return c;
}

static SherpaOnnxSpeakerEmbeddingExtractorConfig
GetSpeakerEmbeddingExtractorConfig(Napi::Object obj) {
  SherpaOnnxSpeakerEmbeddingExtractorConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("embedding") || !obj.Get("embedding").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("embedding").As<Napi::Object>();

  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);

  if (o.Has("debug") &&
      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
    if (o.Get("debug").IsBoolean()) {
      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
    } else {
      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
    }
  }

  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);

  return c;
}

static SherpaOnnxFastClusteringConfig GetFastClusteringConfig(
    Napi::Object obj) {
  SherpaOnnxFastClusteringConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("clustering") || !obj.Get("clustering").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("clustering").As<Napi::Object>();

  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_clusters, numClusters);
  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold);

  return c;
}

static Napi::External<SherpaOnnxOfflineSpeakerDiarization>
CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
  if (info.Length() != 1) {
    std::ostringstream os;
    os << "Expect only 1 argument. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsObject()) {
    Napi::TypeError::New(env, "Expect an object as the argument")
        .ThrowAsJavaScriptException();

    return {};
  }

  Napi::Object o = info[0].As<Napi::Object>();

  SherpaOnnxOfflineSpeakerDiarizationConfig c;
  memset(&c, 0, sizeof(c));

  c.segmentation = GetOfflineSpeakerSegmentationModelConfig(o);
  c.embedding = GetSpeakerEmbeddingExtractorConfig(o);
  c.clustering = GetFastClusteringConfig(o);

  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_on, minDurationOn);
  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_off, minDurationOff);

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      SherpaOnnxCreateOfflineSpeakerDiarization(&c);

  if (c.segmentation.pyannote.model) {
    delete[] c.segmentation.pyannote.model;
  }

  if (c.segmentation.provider) {
    delete[] c.segmentation.provider;
  }

  if (c.embedding.model) {
    delete[] c.embedding.model;
  }

  if (c.embedding.provider) {
    delete[] c.embedding.provider;
  }

  if (!sd) {
    Napi::TypeError::New(env, "Please check your config!")
        .ThrowAsJavaScriptException();

    return {};
  }

  return Napi::External<SherpaOnnxOfflineSpeakerDiarization>::New(
      env, const_cast<SherpaOnnxOfflineSpeakerDiarization *>(sd),
      [](Napi::Env env, SherpaOnnxOfflineSpeakerDiarization *sd) {
        SherpaOnnxDestroyOfflineSpeakerDiarization(sd);
      });
}

static Napi::Number OfflineSpeakerDiarizationGetSampleRateWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 1) {
    std::ostringstream os;
    os << "Expect only 1 argument. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "Argument 0 should be an offline speaker diarization pointer.")
        .ThrowAsJavaScriptException();

    return {};
  }

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();

  int32_t sample_rate = SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd);

  return Napi::Number::New(env, sample_rate);
}

static Napi::Array OfflineSpeakerDiarizationProcessWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 2) {
    std::ostringstream os;
    os << "Expect only 2 arguments. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(
        env, "Argument 0 should be an offline speaker diarization pointer.")
        .ThrowAsJavaScriptException();

    return {};
  }

  const SherpaOnnxOfflineSpeakerDiarization *sd =
      info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();

  if (!info[1].IsTypedArray()) {
    Napi::TypeError::New(env, "Argument 1 should be a typed array")
        .ThrowAsJavaScriptException();

    return {};
  }

  Napi::Float32Array samples = info[1].As<Napi::Float32Array>();

  const SherpaOnnxOfflineSpeakerDiarizationResult *r =
      SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.Data(),
                                                 samples.ElementLength());

  int32_t num_segments =
      SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r);

  const SherpaOnnxOfflineSpeakerDiarizationSegment *segments =
      SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(r);

  Napi::Array ans = Napi::Array::New(env, num_segments);

  for (int32_t i = 0; i != num_segments; ++i) {
    Napi::Object obj = Napi::Object::New(env);
    obj.Set(Napi::String::New(env, "start"), segments[i].start);
    obj.Set(Napi::String::New(env, "end"), segments[i].end);
    obj.Set(Napi::String::New(env, "speaker"), segments[i].speaker);

    ans[i] = obj;
  }

  SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments);
  SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r);

  return ans;
}

void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) {
  exports.Set(Napi::String::New(env, "createOfflineSpeakerDiarization"),
              Napi::Function::New(env, CreateOfflineSpeakerDiarizationWrapper));

  exports.Set(
      Napi::String::New(env, "getOfflineSpeakerDiarizationSampleRate"),
      Napi::Function::New(env, OfflineSpeakerDiarizationGetSampleRateWrapper));

  exports.Set(
      Napi::String::New(env, "offlineSpeakerDiarizationProcess"),
      Napi::Function::New(env, OfflineSpeakerDiarizationProcessWrapper));
}
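For orientation, the three exports registered in `InitNonStreamingSpeakerDiarization` are exactly what the JavaScript wrapper class earlier in this diff calls. A sketch of driving them directly through `./addon.js` (the same loader the wrapper uses):

```js
// Sketch: using the raw N-API exports without the wrapper class.
const addon = require('./addon.js');

function diarize(config, samples /* Float32Array with values in [-1, 1] */) {
  // Wraps SherpaOnnxCreateOfflineSpeakerDiarization; the pointer is released
  // by the finalizer passed to Napi::External::New above.
  const handle = addon.createOfflineSpeakerDiarization(config);

  // Maps to SherpaOnnxOfflineSpeakerDiarizationGetSampleRate.
  console.log('expected sample rate:',
              addon.getOfflineSpeakerDiarizationSampleRate(handle));

  // Returns [{start, end, speaker}, ...] sorted by start time.
  return addon.offlineSpeakerDiarizationProcess(handle, samples);
}
```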
scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc:

@@ -25,6 +25,8 @@ void InitPunctuation(Napi::Env env, Napi::Object exports);

 void InitKeywordSpotting(Napi::Env env, Napi::Object exports);

+void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports);
+
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitStreamingAsr(env, exports);
   InitNonStreamingAsr(env, exports);
@@ -37,6 +39,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitAudioTagging(env, exports);
   InitPunctuation(env, exports);
   InitKeywordSpotting(env, exports);
+  InitNonStreamingSpeakerDiarization(env, exports);

   return exports;
 }