Fangjun Kuang
Committed by GitHub

Java API for speaker diarization (#1416)

... ... @@ -107,6 +107,13 @@ jobs:
make -j4
ls -lh lib
# Runs the Java speaker diarization example script from ./java-api-examples and
# then removes the *.onnx / *.wav files and sherpa-onnx-pyannote-* entries left there.
- name: Run java test (speaker diarization)
shell: bash
run: |
cd ./java-api-examples
./run-offline-speaker-diarization.sh
rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
- name: Run java test (kws)
shell: bash
run: |
... ...
// Copyright 2024 Xiaomi Corporation
// This file shows how to use sherpa-onnx Java API for speaker diarization.
import com.k2fsa.sherpa.onnx.*;
/**
 * Example program: runs offline speaker diarization on a wave file and prints one line per
 * detected segment in the form {@code start -- end speaker_NN}.
 *
 * <p>The model and wave file paths are hard-coded relative to the current working directory; see
 * the download instructions inside {@link #main(String[])}.
 */
public class OfflineSpeakerDiarizationDemo {
  /**
   * Builds the diarization configuration, processes the test wave file while reporting progress,
   * and prints the resulting segments.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    /* Please use the following commands to download files used in this file

    Step 1. Download a speaker segmentation model

    Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
    for a list of available models. The following is an example

      wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
      tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
      rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

    Step 2. Download a speaker embedding extractor model

    Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
    for a list of available models. The following is an example

      wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

    NOTE: the spelling "recongition" is the one used in the download URL of
    run-offline-speaker-diarization.sh as well; do not "correct" it here.

    Step 3. Download test wave files

    Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
    for a list of available test wave files. The following is an example

      wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

    Step 4. Run it

      ./run-offline-speaker-diarization.sh
    */

    String segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
    String embeddingModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
    String waveFilename = "./0-four-speakers-zh.wav";

    WaveReader reader = new WaveReader(waveFilename);

    OfflineSpeakerSegmentationPyannoteModelConfig pyannote =
        OfflineSpeakerSegmentationPyannoteModelConfig.builder().setModel(segmentationModel).build();

    OfflineSpeakerSegmentationModelConfig segmentation =
        OfflineSpeakerSegmentationModelConfig.builder()
            .setPyannote(pyannote)
            .setDebug(true)
            .build();

    SpeakerEmbeddingExtractorConfig embedding =
        SpeakerEmbeddingExtractorConfig.builder().setModel(embeddingModel).setDebug(true).build();

    // The test wave file ./0-four-speakers-zh.wav contains four speakers, so
    // we use numClusters=4 here. If you don't know the number of speakers
    // in the test wave file, please set the numClusters to -1 and provide
    // threshold for clustering
    FastClusteringConfig clustering =
        FastClusteringConfig.builder()
            .setNumClusters(4) // set it to -1 if you don't know the actual number
            .setThreshold(0.5f)
            .build();

    OfflineSpeakerDiarizationConfig config =
        OfflineSpeakerDiarizationConfig.builder()
            .setSegmentation(segmentation)
            .setEmbedding(embedding)
            .setClustering(clustering)
            .setMinDurationOn(0.2f)
            .setMinDurationOff(0.5f)
            .build();

    OfflineSpeakerDiarization sd = new OfflineSpeakerDiarization(config);
    // Release the native object on every exit path: the early return below and any
    // exception thrown while processing would otherwise skip the explicit release().
    try {
      if (sd.getSampleRate() != reader.getSampleRate()) {
        System.out.printf(
            "Expected sample rate: %d, given: %d\n", sd.getSampleRate(), reader.getSampleRate());
        return;
      }

      // OfflineSpeakerDiarizationSegment[] segments = sd.process(reader.getSamples());
      // without callback is also ok

      // or you can use a callback to show the progress
      OfflineSpeakerDiarizationSegment[] segments =
          sd.processWithCallback(
              reader.getSamples(),
              (int numProcessedChunks, int numTotalChunks, long arg) -> {
                float progress = 100.0f * numProcessedChunks / numTotalChunks;
                System.out.printf("Progress: %.2f%%\n", progress);
                return 0;
              });

      for (OfflineSpeakerDiarizationSegment s : segments) {
        System.out.printf("%.3f -- %.3f speaker_%02d\n", s.getStart(), s.getEnd(), s.getSpeaker());
      }
    } finally {
      sd.release();
    }
  }
}
... ...
... ... @@ -4,6 +4,12 @@ This directory contains examples for the JAVA API of sherpa-onnx.
# Usage
## Non-streaming speaker diarization
```bash
./run-offline-speaker-diarization.sh
```
## Streaming Speech recognition
```
... ...
#!/usr/bin/env bash
#
# Builds whatever is missing (JNI shared library, sherpa-onnx.jar), downloads the
# models and the test wave file if they are not present, and then runs
# OfflineSpeakerDiarizationDemo.java.
#
# All paths are relative, so run this script from the directory that contains it.

set -ex

# Build the JNI library only if neither the .dylib nor the .so variant exists yet.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it has not been built yet.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Speaker segmentation model.
if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

# Speaker embedding extractor model.
if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

# Test wave file.
if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

# The library path is quoted so that a checkout directory containing spaces does
# not get split into several arguments.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  ./OfflineSpeakerDiarizationDemo.java
... ...
... ... @@ -68,6 +68,15 @@ java_files += KeywordSpotterConfig.java
java_files += KeywordSpotterResult.java
java_files += KeywordSpotter.java
# Offline speaker diarization
java_files += OfflineSpeakerSegmentationPyannoteModelConfig.java
java_files += OfflineSpeakerSegmentationModelConfig.java
java_files += FastClusteringConfig.java
java_files += OfflineSpeakerDiarizationConfig.java
java_files += OfflineSpeakerDiarizationSegment.java
java_files += OfflineSpeakerDiarizationCallback.java
java_files += OfflineSpeakerDiarization.java
class_files := $(java_files:%.java=%.class)
java_files := $(addprefix src/$(package_dir)/,$(java_files))
... ...
// Copyright 2024 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Immutable clustering settings, created through {@link #builder()}.
 *
 * <p>Holds a cluster count (default {@code -1}) and a threshold (default {@code 0.5f}).
 */
public class FastClusteringConfig {
  // NOTE(review): field names are kept as-is; presumably native code looks them up by
  // name -- confirm before renaming.
  private final int numClusters;
  private final float threshold;

  /** Copies the values currently held by {@code source}. */
  private FastClusteringConfig(Builder source) {
    numClusters = source.numClusters;
    threshold = source.threshold;
  }

  /** @return the configured number of clusters */
  public int getNumClusters() {
    return numClusters;
  }

  /** @return the configured clustering threshold */
  public float getThreshold() {
    return threshold;
  }

  /** @return a new builder pre-populated with the default values */
  public static Builder builder() {
    return new Builder();
  }

  /** Fluent builder for {@link FastClusteringConfig}. */
  public static class Builder {
    private int numClusters = -1;
    private float threshold = 0.5f;

    /** Sets the number of clusters and returns this builder for chaining. */
    public Builder setNumClusters(int numClusters) {
      this.numClusters = numClusters;
      return this;
    }

    /** Sets the clustering threshold and returns this builder for chaining. */
    public Builder setThreshold(float threshold) {
      this.threshold = threshold;
      return this;
    }

    /** @return a config carrying the values set on this builder so far */
    public FastClusteringConfig build() {
      return new FastClusteringConfig(this);
    }
  }
}
... ...
// Copyright 2024 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Java handle for a native offline speaker diarization object provided by the
 * {@code sherpa-onnx-jni} library.
 *
 * <p>The native object is identified by an opaque {@code long} handle. Call {@link #release()}
 * when the instance is no longer needed; {@link #finalize()} releases it only as a fallback.
 * After {@link #release()} every other method throws {@link IllegalStateException} instead of
 * handing a zero handle to native code.
 */
public class OfflineSpeakerDiarization {
  static {
    System.loadLibrary("sherpa-onnx-jni");
  }

  // Opaque native handle; 0 means "no native object".
  private long ptr = 0;

  /**
   * Creates the native object from {@code config}.
   *
   * @param config the diarization configuration passed to the native constructor
   */
  public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) {
    ptr = newFromFile(config);
  }

  /**
   * @return the sample rate reported by the native object
   * @throws IllegalStateException if there is no native object
   */
  public int getSampleRate() {
    return getSampleRate(handle());
  }

  /**
   * Updates the configuration of the native object.
   *
   * <p>Only config.clustering is used. All other fields are ignored.
   *
   * @param config the configuration whose clustering part is applied
   * @throws IllegalStateException if there is no native object
   */
  public void setConfig(OfflineSpeakerDiarizationConfig config) {
    setConfig(handle(), config);
  }

  /**
   * Runs diarization on {@code samples}.
   *
   * @param samples the audio samples to process
   * @return the segments produced by the native implementation
   * @throws IllegalStateException if there is no native object
   */
  public OfflineSpeakerDiarizationSegment[] process(float[] samples) {
    return process(handle(), samples);
  }

  /**
   * Same as {@link #processWithCallback(float[], OfflineSpeakerDiarizationCallback, long)} with
   * {@code arg} set to {@code 0}.
   *
   * @param samples the audio samples to process
   * @param callback the callback handed to the native implementation
   * @return the segments produced by the native implementation
   * @throws IllegalStateException if there is no native object
   */
  public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback) {
    return processWithCallback(handle(), samples, callback, 0);
  }

  /**
   * Runs diarization on {@code samples}, handing {@code callback} and {@code arg} to the native
   * implementation.
   *
   * @param samples the audio samples to process
   * @param callback the callback handed to the native implementation
   * @param arg an opaque value forwarded to the native implementation
   * @return the segments produced by the native implementation
   * @throws IllegalStateException if there is no native object
   */
  public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback, long arg) {
    return processWithCallback(handle(), samples, callback, arg);
  }

  /** Fallback cleanup: releases the native object if {@link #release()} was never called. */
  @Override
  protected void finalize() throws Throwable {
    try {
      release();
    } finally {
      super.finalize();
    }
  }

  /**
   * Frees the native object. Calling it more than once is harmless.
   *
   * <p>You'd better call it manually once the instance is not used anymore.
   */
  public void release() {
    if (this.ptr == 0) {
      return;
    }
    delete(this.ptr);
    this.ptr = 0;
  }

  // Returns the native handle, refusing to pass a zero handle on to native code.
  private long handle() {
    if (ptr == 0) {
      throw new IllegalStateException(
          "OfflineSpeakerDiarization has no native object (already released or not created)");
    }
    return ptr;
  }

  private native int getSampleRate(long ptr);

  private native void delete(long ptr);

  private native long newFromFile(OfflineSpeakerDiarizationConfig config);

  private native void setConfig(long ptr, OfflineSpeakerDiarizationConfig config);

  private native OfflineSpeakerDiarizationSegment[] process(long ptr, float[] samples);

  private native OfflineSpeakerDiarizationSegment[] processWithCallback(long ptr, float[] samples, OfflineSpeakerDiarizationCallback callback, long arg);
}
\ No newline at end of file
... ...
// Copyright 2024 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Callback accepted by {@code OfflineSpeakerDiarization.processWithCallback}; it can be written
 * as a lambda.
 */
@FunctionalInterface
public interface OfflineSpeakerDiarizationCallback {
  /**
   * Receives a progress update.
   *
   * @param numProcessedChunks number of chunks processed so far
   * @param numTotalChunks total number of chunks
   * @param arg presumably the {@code arg} value given to {@code processWithCallback} -- confirm
   *     against the native implementation
   * @return a status value handed back to the caller of this callback; how it is interpreted is
   *     not visible from this interface
   */
  Integer invoke(int numProcessedChunks, int numTotalChunks, long arg);
}
... ...
package com.k2fsa.sherpa.onnx;
/**
 * Immutable top-level configuration for offline speaker diarization, created through
 * {@link #builder()}.
 *
 * <p>It groups the segmentation model config, the embedding extractor config, the clustering
 * config, and the two duration settings {@code minDurationOn} / {@code minDurationOff}.
 */
public class OfflineSpeakerDiarizationConfig {
  // NOTE(review): field names are kept as-is; presumably native code looks them up by
  // name -- confirm before renaming.
  private final OfflineSpeakerSegmentationModelConfig segmentation;
  private final SpeakerEmbeddingExtractorConfig embedding;
  private final FastClusteringConfig clustering;
  private final float minDurationOn;
  private final float minDurationOff;

  /** Copies the values currently held by {@code source}. */
  private OfflineSpeakerDiarizationConfig(Builder source) {
    segmentation = source.segmentation;
    embedding = source.embedding;
    clustering = source.clustering;
    minDurationOn = source.minDurationOn;
    minDurationOff = source.minDurationOff;
  }

  /** @return the segmentation model configuration */
  public OfflineSpeakerSegmentationModelConfig getSegmentation() {
    return segmentation;
  }

  /** @return the speaker embedding extractor configuration */
  public SpeakerEmbeddingExtractorConfig getEmbedding() {
    return embedding;
  }

  /** @return the clustering configuration */
  public FastClusteringConfig getClustering() {
    return clustering;
  }

  /** @return the configured {@code minDurationOn} value */
  public float getMinDurationOn() {
    return minDurationOn;
  }

  /** @return the configured {@code minDurationOff} value */
  public float getMinDurationOff() {
    return minDurationOff;
  }

  /** @return a new builder pre-populated with the default values */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Fluent builder for {@link OfflineSpeakerDiarizationConfig}.
   *
   * <p>Defaults: each nested config is the result of its own builder with nothing set,
   * {@code minDurationOn = 0.2f} and {@code minDurationOff = 0.5f}.
   */
  public static class Builder {
    private OfflineSpeakerSegmentationModelConfig segmentation =
        OfflineSpeakerSegmentationModelConfig.builder().build();
    private SpeakerEmbeddingExtractorConfig embedding =
        SpeakerEmbeddingExtractorConfig.builder().build();
    private FastClusteringConfig clustering = FastClusteringConfig.builder().build();
    private float minDurationOn = 0.2f;
    private float minDurationOff = 0.5f;

    /** Sets the segmentation model configuration and returns this builder for chaining. */
    public Builder setSegmentation(OfflineSpeakerSegmentationModelConfig segmentation) {
      this.segmentation = segmentation;
      return this;
    }

    /** Sets the embedding extractor configuration and returns this builder for chaining. */
    public Builder setEmbedding(SpeakerEmbeddingExtractorConfig embedding) {
      this.embedding = embedding;
      return this;
    }

    /** Sets the clustering configuration and returns this builder for chaining. */
    public Builder setClustering(FastClusteringConfig clustering) {
      this.clustering = clustering;
      return this;
    }

    /** Sets {@code minDurationOn} and returns this builder for chaining. */
    public Builder setMinDurationOn(float minDurationOn) {
      this.minDurationOn = minDurationOn;
      return this;
    }

    /** Sets {@code minDurationOff} and returns this builder for chaining. */
    public Builder setMinDurationOff(float minDurationOff) {
      this.minDurationOff = minDurationOff;
      return this;
    }

    /** @return a config carrying the values set on this builder so far */
    public OfflineSpeakerDiarizationConfig build() {
      return new OfflineSpeakerDiarizationConfig(this);
    }
  }
}
... ...
// Copyright 2024 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Immutable value describing one diarization result: a start position, an end position, and a
 * speaker index.
 *
 * <p>NOTE(review): start and end are presumably times in seconds -- confirm against the native
 * implementation.
 */
public class OfflineSpeakerDiarizationSegment {
  private final float start;
  private final float end;
  private final int speaker;

  /**
   * @param start where the segment begins
   * @param end where the segment ends
   * @param speaker index of the speaker assigned to the segment
   */
  public OfflineSpeakerDiarizationSegment(float start, float end, int speaker) {
    this.speaker = speaker;
    this.end = end;
    this.start = start;
  }

  /** @return where the segment begins */
  public float getStart() {
    return this.start;
  }

  /** @return where the segment ends */
  public float getEnd() {
    return this.end;
  }

  /** @return index of the speaker assigned to the segment */
  public int getSpeaker() {
    return this.speaker;
  }
}
... ...
// Copyright 2024 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Immutable configuration of the speaker segmentation model, created through
 * {@link #builder()}.
 *
 * <p>It holds the pyannote model config plus the thread count, debug flag and provider name.
 */
public class OfflineSpeakerSegmentationModelConfig {
  // NOTE(review): field names are kept as-is; presumably native code looks them up by
  // name -- confirm before renaming.
  private final OfflineSpeakerSegmentationPyannoteModelConfig pyannote;
  private final int numThreads;
  private final boolean debug;
  private final String provider;

  /** Copies the values currently held by {@code builder}. */
  private OfflineSpeakerSegmentationModelConfig(Builder builder) {
    this.pyannote = builder.pyannote;
    this.numThreads = builder.numThreads;
    this.debug = builder.debug;
    this.provider = builder.provider;
  }

  /** @return a new builder pre-populated with the default values */
  public static Builder builder() {
    return new Builder();
  }

  /** @return the pyannote segmentation model configuration */
  public OfflineSpeakerSegmentationPyannoteModelConfig getPyannote() {
    return pyannote;
  }

  /** @return the configured number of threads */
  public int getNumThreads() {
    return numThreads;
  }

  /** @return the configured debug flag */
  public boolean getDebug() {
    return debug;
  }

  /** @return the configured provider name */
  public String getProvider() {
    return provider;
  }

  /**
   * Fluent builder for {@link OfflineSpeakerSegmentationModelConfig}.
   *
   * <p>Defaults: a pyannote config with nothing set, {@code numThreads = 1},
   * {@code debug = true} and {@code provider = "cpu"}.
   */
  public static class Builder {
    private OfflineSpeakerSegmentationPyannoteModelConfig pyannote = OfflineSpeakerSegmentationPyannoteModelConfig.builder().build();
    private int numThreads = 1;
    // NOTE(review): debug is enabled by default here -- confirm this is intended.
    private boolean debug = true;
    private String provider = "cpu";

    /** @return a config carrying the values set on this builder so far */
    public OfflineSpeakerSegmentationModelConfig build() {
      return new OfflineSpeakerSegmentationModelConfig(this);
    }

    /** Sets the pyannote model configuration and returns this builder for chaining. */
    public Builder setPyannote(OfflineSpeakerSegmentationPyannoteModelConfig pyannote) {
      this.pyannote = pyannote;
      return this;
    }

    /** Sets the number of threads and returns this builder for chaining. */
    public Builder setNumThreads(int numThreads) {
      this.numThreads = numThreads;
      return this;
    }

    /** Sets the debug flag and returns this builder for chaining. */
    public Builder setDebug(boolean debug) {
      this.debug = debug;
      return this;
    }

    /** Sets the provider name and returns this builder for chaining. */
    public Builder setProvider(String provider) {
      this.provider = provider;
      return this;
    }
  }
}
\ No newline at end of file
... ...
// Copyright 2024 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Immutable configuration holding the model string of a pyannote speaker segmentation model,
 * created through {@link #builder()}. The default model string is empty.
 */
public class OfflineSpeakerSegmentationPyannoteModelConfig {
  // NOTE(review): the field name is kept as-is; presumably native code looks it up by
  // name -- confirm before renaming.
  private final String model;

  /** Copies the value currently held by {@code source}. */
  private OfflineSpeakerSegmentationPyannoteModelConfig(Builder source) {
    model = source.model;
  }

  /** @return the configured model string */
  public String getModel() {
    return model;
  }

  /** @return a new builder whose model string is empty */
  public static Builder builder() {
    return new Builder();
  }

  /** Fluent builder for {@link OfflineSpeakerSegmentationPyannoteModelConfig}. */
  public static class Builder {
    private String model = "";

    /** Sets the model string and returns this builder for chaining. */
    public Builder setModel(String model) {
      this.model = model;
      return this;
    }

    /** @return a config carrying the value set on this builder so far */
    public OfflineSpeakerSegmentationPyannoteModelConfig build() {
      return new OfflineSpeakerSegmentationPyannoteModelConfig(this);
    }
  }
}
... ...
// Copyright 2024 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
@FunctionalInterface
... ...
... ... @@ -50,5 +50,4 @@ public class SpeakerEmbeddingExtractorConfig {
return this;
}
}
}
... ...