Fangjun Kuang
Committed by GitHub

JavaScript API (node-addon) for speaker diarization (#1408)

{
"name": "sherpa-onnx-PLATFORM2-ARCH",
"version": "SHERPA_ONNX_VERSION",
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
... ... @@ -16,8 +16,18 @@
"transcription",
"real-time speech recognition",
"without internet connection",
"locally",
"local",
"embedded systems",
"open source",
"diarization",
"speaker diarization",
"speaker recognition",
"speaker",
"speaker segmentation",
"speaker verification",
"spoken language identification",
"sherpa",
"zipformer",
"asr",
"tts",
... ... @@ -30,13 +40,13 @@
"offline",
"privacy",
"open source",
"vad",
"speaker id",
"language id",
"node-addon-api",
"streaming speech recognition",
"speech",
"recognition"
"recognition",
"vad",
"node-addon-api",
"speaker id",
"language id"
],
"author": "The next-gen Kaldi team",
"license": "Apache-2.0",
... ...
{
"name": "sherpa-onnx-node",
"version": "SHERPA_ONNX_VERSION",
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
"main": "sherpa-onnx.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
... ... @@ -16,8 +16,18 @@
"transcription",
"real-time speech recognition",
"without internet connection",
"locally",
"local",
"embedded systems",
"open source",
"diarization",
"speaker diarization",
"speaker recognition",
"speaker",
"speaker segmentation",
"speaker verification",
"spoken language identification",
"sherpa",
"zipformer",
"asr",
"tts",
... ... @@ -30,13 +40,13 @@
"offline",
"privacy",
"open source",
"vad",
"speaker id",
"language id",
"node-addon-api",
"streaming speech recognition",
"speech",
"recognition"
"recognition",
"vad",
"node-addon-api",
"speaker id",
"language id"
],
"author": "The next-gen Kaldi team",
"license": "Apache-2.0",
... ...
... ... @@ -10,6 +10,20 @@ arch=$(node -p "require('os').arch()")
platform=$(node -p "require('os').platform()")
node_version=$(node -p "process.versions.node.split('.')[0]")
echo "----------non-streaming speaker diarization----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
node ./test_offline_speaker_diarization.js
rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
echo "----------non-streaming asr + vad----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
... ...
... ... @@ -43,6 +43,12 @@ export LD_LIBRARY_PATH=$PWD/node_modules/.pnpm/sherpa-onnx-node@<REPLACE-THIS-WI
The following tables list the examples in this folder.
## Speaker diarization
|File| Description|
|---|---|
|[./test_offline_speaker_diarization.js](./test_offline_speaker_diarization.js)| It demonstrates how to use the sherpa-onnx JavaScript API for speaker diarization. It supports speaker segmentation models from [pyannote-audio](https://github.com/pyannote/pyannote-audio)|
## Add punctuations to text
|File| Description|
... ... @@ -130,6 +136,21 @@ The following tables list the examples in this folder.
|[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|
### Speaker diarization
```bash
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
node ./test_offline_speaker_diarization.js
```
### Voice activity detection (VAD)
```bash
... ...
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');
// clang-format off
/* Please use the following commands to download files
used in this script
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
*/
// clang-format on
const config = {
segmentation: {
pyannote: {
model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx',
},
},
embedding: {
model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx',
},
clustering: {
// Since we know that the test wave file
// ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters
// here. If you don't have such information, set numClusters to -1.
numClusters: 4,
// If numClusters is not -1, then threshold is ignored.
//
// A larger threshold leads to fewer clusters, i.e., fewer speakers.
// A smaller threshold leads to more clusters, i.e., more speakers.
// You need to tune it yourself.
threshold: 0.5,
},
// If a segment is shorter than minDurationOn, we discard it
minDurationOn: 0.2, // in seconds
// If the gap between two segments is less than minDurationOff, then we
// merge these two segments into a single one
minDurationOff: 0.5, // in seconds
};
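// A hedged sketch (not in the original script): if the number of speakers is
// unknown, the clustering block above could instead rely on the threshold,
// e.g.
//
//   clustering: {
//     numClusters: -1, // unknown speaker count
//     threshold: 0.5,  // tune on your own data
//   },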
const waveFilename = './0-four-speakers-zh.wav';
const sd = new sherpa_onnx.OfflineSpeakerDiarization(config);
console.log('Started');
const wave = sherpa_onnx.readWave(waveFilename);
if (sd.sampleRate != wave.sampleRate) {
throw new Error(
`Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`);
}
const segments = sd.process(wave.samples);
console.log(segments);
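// A hedged follow-up sketch (not in the original script): each segment is an
// object of the form {start, end, speaker}, so the raw array can be
// pretty-printed like this:
for (const s of segments) {
  console.log(`${s.start.toFixed(2)} -- ${s.end.toFixed(2)} speaker_${s.speaker}`);
}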
... ...
... ... @@ -21,6 +21,7 @@ set(srcs
src/audio-tagging.cc
src/keyword-spotting.cc
src/non-streaming-asr.cc
src/non-streaming-speaker-diarization.cc
src/non-streaming-tts.cc
src/punctuation.cc
src/sherpa-onnx-node-addon-api.cc
... ...
const addon = require('./addon.js');
class OfflineSpeakerDiarization {
constructor(config) {
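// The handle owns the native diarization object; the addon's finalizer
// frees it when this wrapper is garbage collected.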
this.handle = addon.createOfflineSpeakerDiarization(config);
this.config = config;
this.sampleRate = addon.getOfflineSpeakerDiarizationSampleRate(this.handle);
}
/**
* samples is a 1-d float32 array. Each element of the array should be
* in the range [-1, 1].
*
* We assume its sample rate equals this.sampleRate.
*
* Returns an array of objects, where each object is
*
* {
* "start": start_time_in_seconds,
* "end": end_time_in_seconds,
* "speaker": an_integer,
* }
*/
process(samples) {
return addon.offlineSpeakerDiarizationProcess(this.handle, samples);
}
}
module.exports = {
OfflineSpeakerDiarization,
}
... ...
... ... @@ -8,6 +8,7 @@ const sid = require('./speaker-identification.js');
const at = require('./audio-tagging.js');
const punct = require('./punctuation.js');
const kws = require('./keyword-spotter.js');
const sd = require('./non-streaming-speaker-diarization.js');
module.exports = {
OnlineRecognizer: streaming_asr.OnlineRecognizer,
... ... @@ -24,4 +25,5 @@ module.exports = {
AudioTagging: at.AudioTagging,
Punctuation: punct.Punctuation,
KeywordSpotter: kws.KeywordSpotter,
OfflineSpeakerDiarization: sd.OfflineSpeakerDiarization,
}
... ...
{
"main": "lib/sherpa-onnx.js",
"version": "1.0.0",
"description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection",
"description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
"dependencies": {
"cmake-js": "^6.0.0",
"node-addon-api": "^1.1.0",
... ... @@ -21,8 +21,18 @@
"transcription",
"real-time speech recognition",
"without internet connection",
"locally",
"local",
"embedded systems",
"open source",
"diarization",
"speaker diarization",
"speaker recognition",
"speaker",
"speaker segmentation",
"speaker verification",
"spoken language identification",
"sherpa",
"zipformer",
"asr",
"tts",
... ...
// scripts/node-addon-api/src/non-streaming-speaker-diarization.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include <algorithm>
#include <sstream>
#include "macros.h" // NOLINT
#include "napi.h" // NOLINT
#include "sherpa-onnx/c-api/c-api.h"
static SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
GetOfflineSpeakerSegmentationPyannoteModelConfig(Napi::Object obj) {
SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig c;
memset(&c, 0, sizeof(c));
if (!obj.Has("pyannote") || !obj.Get("pyannote").IsObject()) {
return c;
}
Napi::Object o = obj.Get("pyannote").As<Napi::Object>();
SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
return c;
}
static SherpaOnnxOfflineSpeakerSegmentationModelConfig
GetOfflineSpeakerSegmentationModelConfig(Napi::Object obj) {
SherpaOnnxOfflineSpeakerSegmentationModelConfig c;
memset(&c, 0, sizeof(c));
if (!obj.Has("segmentation") || !obj.Get("segmentation").IsObject()) {
return c;
}
Napi::Object o = obj.Get("segmentation").As<Napi::Object>();
c.pyannote = GetOfflineSpeakerSegmentationPyannoteModelConfig(o);
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
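// JS callers may pass "debug" as either a boolean or a number; accept both.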
if (o.Has("debug") &&
(o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
if (o.Get("debug").IsBoolean()) {
c.debug = o.Get("debug").As<Napi::Boolean>().Value();
} else {
c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
}
}
SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);
return c;
}
static SherpaOnnxSpeakerEmbeddingExtractorConfig
GetSpeakerEmbeddingExtractorConfig(Napi::Object obj) {
SherpaOnnxSpeakerEmbeddingExtractorConfig c;
memset(&c, 0, sizeof(c));
if (!obj.Has("embedding") || !obj.Get("embedding").IsObject()) {
return c;
}
Napi::Object o = obj.Get("embedding").As<Napi::Object>();
SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
if (o.Has("debug") &&
(o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
if (o.Get("debug").IsBoolean()) {
c.debug = o.Get("debug").As<Napi::Boolean>().Value();
} else {
c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
}
}
SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);
return c;
}
static SherpaOnnxFastClusteringConfig GetFastClusteringConfig(
Napi::Object obj) {
SherpaOnnxFastClusteringConfig c;
memset(&c, 0, sizeof(c));
if (!obj.Has("clustering") || !obj.Get("clustering").IsObject()) {
return c;
}
Napi::Object o = obj.Get("clustering").As<Napi::Object>();
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_clusters, numClusters);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold);
return c;
}
static Napi::External<SherpaOnnxOfflineSpeakerDiarization>
CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
if (info.Length() != 1) {
std::ostringstream os;
os << "Expect only 1 argument. Given: " << info.Length();
Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
return {};
}
if (!info[0].IsObject()) {
Napi::TypeError::New(env, "Expect an object as the argument")
.ThrowAsJavaScriptException();
return {};
}
Napi::Object o = info[0].As<Napi::Object>();
SherpaOnnxOfflineSpeakerDiarizationConfig c;
memset(&c, 0, sizeof(c));
c.segmentation = GetOfflineSpeakerSegmentationModelConfig(o);
c.embedding = GetSpeakerEmbeddingExtractorConfig(o);
c.clustering = GetFastClusteringConfig(o);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_on, minDurationOn);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_off, minDurationOff);
const SherpaOnnxOfflineSpeakerDiarization *sd =
SherpaOnnxCreateOfflineSpeakerDiarization(&c);
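// SHERPA_ONNX_ASSIGN_ATTR_STR copies the JS strings with new[]; release
// those copies now that the native object has been created from the config.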
if (c.segmentation.pyannote.model) {
delete[] c.segmentation.pyannote.model;
}
if (c.segmentation.provider) {
delete[] c.segmentation.provider;
}
if (c.embedding.model) {
delete[] c.embedding.model;
}
if (c.embedding.provider) {
delete[] c.embedding.provider;
}
if (!sd) {
Napi::TypeError::New(env, "Please check your config!")
.ThrowAsJavaScriptException();
return {};
}
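// Hand the pointer to JS as an External whose finalizer destroys the
// native object when the JS-side wrapper is garbage collected.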
return Napi::External<SherpaOnnxOfflineSpeakerDiarization>::New(
env, const_cast<SherpaOnnxOfflineSpeakerDiarization *>(sd),
[](Napi::Env env, SherpaOnnxOfflineSpeakerDiarization *sd) {
SherpaOnnxDestroyOfflineSpeakerDiarization(sd);
});
}
static Napi::Number OfflineSpeakerDiarizationGetSampleRateWrapper(
const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
if (info.Length() != 1) {
std::ostringstream os;
os << "Expect only 1 argument. Given: " << info.Length();
Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
return {};
}
if (!info[0].IsExternal()) {
Napi::TypeError::New(
env, "Argument 0 should be an offline speaker diarization pointer.")
.ThrowAsJavaScriptException();
return {};
}
const SherpaOnnxOfflineSpeakerDiarization *sd =
info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();
int32_t sample_rate = SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd);
return Napi::Number::New(env, sample_rate);
}
static Napi::Array OfflineSpeakerDiarizationProcessWrapper(
const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
if (info.Length() != 2) {
std::ostringstream os;
os << "Expect only 2 arguments. Given: " << info.Length();
Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
return {};
}
if (!info[0].IsExternal()) {
Napi::TypeError::New(
env, "Argument 0 should be an offline speaker diarization pointer.")
.ThrowAsJavaScriptException();
return {};
}
const SherpaOnnxOfflineSpeakerDiarization *sd =
info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();
if (!info[1].IsTypedArray()) {
Napi::TypeError::New(env, "Argument 1 should be a typed array")
.ThrowAsJavaScriptException();
return {};
}
Napi::Float32Array samples = info[1].As<Napi::Float32Array>();
const SherpaOnnxOfflineSpeakerDiarizationResult *r =
SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.Data(),
samples.ElementLength());
int32_t num_segments =
SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r);
const SherpaOnnxOfflineSpeakerDiarizationSegment *segments =
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(r);
Napi::Array ans = Napi::Array::New(env, num_segments);
for (int32_t i = 0; i != num_segments; ++i) {
Napi::Object obj = Napi::Object::New(env);
obj.Set(Napi::String::New(env, "start"), segments[i].start);
obj.Set(Napi::String::New(env, "end"), segments[i].end);
obj.Set(Napi::String::New(env, "speaker"), segments[i].speaker);
ans[i] = obj;
}
SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments);
SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r);
return ans;
}
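// The names registered below are exactly what
// lib/non-streaming-speaker-diarization.js resolves via require('./addon.js').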
void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) {
exports.Set(Napi::String::New(env, "createOfflineSpeakerDiarization"),
Napi::Function::New(env, CreateOfflineSpeakerDiarizationWrapper));
exports.Set(
Napi::String::New(env, "getOfflineSpeakerDiarizationSampleRate"),
Napi::Function::New(env, OfflineSpeakerDiarizationGetSampleRateWrapper));
exports.Set(
Napi::String::New(env, "offlineSpeakerDiarizationProcess"),
Napi::Function::New(env, OfflineSpeakerDiarizationProcessWrapper));
}
... ...
... ... @@ -25,6 +25,8 @@ void InitPunctuation(Napi::Env env, Napi::Object exports);
void InitKeywordSpotting(Napi::Env env, Napi::Object exports);
void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports);
Napi::Object Init(Napi::Env env, Napi::Object exports) {
InitStreamingAsr(env, exports);
InitNonStreamingAsr(env, exports);
... ... @@ -37,6 +39,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
InitAudioTagging(env, exports);
InitPunctuation(env, exports);
InitKeywordSpotting(env, exports);
InitNonStreamingSpeakerDiarization(env, exports);
return exports;
}
... ...