Fangjun Kuang
Committed by GitHub

Java API for speaker diarization (#1416)

@@ -107,6 +107,13 @@ jobs: @@ -107,6 +107,13 @@ jobs:
107 make -j4 107 make -j4
108 ls -lh lib 108 ls -lh lib
109 109
  110 + - name: Run java test (speaker diarization)
  111 + shell: bash
  112 + run: |
  113 + cd ./java-api-examples
  114 + ./run-offline-speaker-diarization.sh
  115 + rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
  116 +
110 - name: Run java test (kws) 117 - name: Run java test (kws)
111 shell: bash 118 shell: bash
112 run: | 119 run: |
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +// This file shows how to use the sherpa-onnx Java API for speaker diarization.
  4 +import com.k2fsa.sherpa.onnx.*;
  5 +
  6 +public class OfflineSpeakerDiarizationDemo {
  7 + public static void main(String[] args) {
  8 + /* Please use the following commands to download files used in this file
  9 + Step 1: Download a speaker segmentation model
  10 +
  11 + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  12 + for a list of available models. The following is an example
  13 +
  14 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  15 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  16 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  17 +
  18 + Step 2: Download a speaker embedding extractor model
  19 +
  20 + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  21 + for a list of available models. The following is an example
  22 +
  23 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  24 +
  25 + Step 3. Download test wave files
  26 +
  27 + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  28 + for a list of available test wave files. The following is an example
  29 +
  30 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  31 +
  32 + Step 4. Run it
  33 + */
  34 +
  35 + String segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
  36 + String embeddingModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
  37 + String waveFilename = "./0-four-speakers-zh.wav";
  38 +
  39 + WaveReader reader = new WaveReader(waveFilename);
  40 +
  41 + OfflineSpeakerSegmentationPyannoteModelConfig pyannote =
  42 + OfflineSpeakerSegmentationPyannoteModelConfig.builder().setModel(segmentationModel).build();
  43 +
  44 + OfflineSpeakerSegmentationModelConfig segmentation =
  45 + OfflineSpeakerSegmentationModelConfig.builder()
  46 + .setPyannote(pyannote)
  47 + .setDebug(true)
  48 + .build();
  49 +
  50 + SpeakerEmbeddingExtractorConfig embedding =
  51 + SpeakerEmbeddingExtractorConfig.builder().setModel(embeddingModel).setDebug(true).build();
  52 +
  53 + // The test wave file ./0-four-speakers-zh.wav contains four speakers, so
  54 + // we use numClusters=4 here. If you don't know the number of speakers
  55 + // in the test wave file, please set the numClusters to -1 and provide
  56 + // threshold for clustering
  57 + FastClusteringConfig clustering =
  58 + FastClusteringConfig.builder()
  59 + .setNumClusters(4) // set it to -1 if you don't know the actual number
  60 + .setThreshold(0.5f)
  61 + .build();
  62 +
  63 + OfflineSpeakerDiarizationConfig config =
  64 + OfflineSpeakerDiarizationConfig.builder()
  65 + .setSegmentation(segmentation)
  66 + .setEmbedding(embedding)
  67 + .setClustering(clustering)
  68 + .setMinDurationOn(0.2f)
  69 + .setMinDurationOff(0.5f)
  70 + .build();
  71 +
  72 + OfflineSpeakerDiarization sd = new OfflineSpeakerDiarization(config);
  73 + if (sd.getSampleRate() != reader.getSampleRate()) {
  74 + System.out.printf(
  75 + "Expected sample rate: %d, given: %d\n", sd.getSampleRate(), reader.getSampleRate());
  76 + return;
  77 + }
  78 +
  79 + // OfflineSpeakerDiarizationSegment[] segments = sd.process(reader.getSamples());
  80 + // without callback is also ok
  81 +
  82 + // or you can use a callback to show the progress
  83 + OfflineSpeakerDiarizationSegment[] segments =
  84 + sd.processWithCallback(
  85 + reader.getSamples(),
  86 + (int numProcessedChunks, int numTotalChunks, long arg) -> {
  87 + float progress = 100.0f * numProcessedChunks / numTotalChunks;
  88 + System.out.printf("Progress: %.2f%%\n", progress);
  89 +
  90 + return 0;
  91 + });
  92 +
  93 + for (OfflineSpeakerDiarizationSegment s : segments) {
  94 + System.out.printf("%.3f -- %.3f speaker_%02d\n", s.getStart(), s.getEnd(), s.getSpeaker());
  95 + }
  96 +
  97 + sd.release();
  98 + }
  99 +}
@@ -4,6 +4,12 @@ This directory contains examples for the JAVA API of sherpa-onnx. @@ -4,6 +4,12 @@ This directory contains examples for the JAVA API of sherpa-onnx.
4 4
5 # Usage 5 # Usage
6 6
  7 +## Non-streaming speaker diarization
  8 +
  9 +```bash
  10 +./run-offline-speaker-diarization.sh
  11 +```
  12 +
7 ## Streaming Speech recognition 13 ## Streaming Speech recognition
8 14
9 ``` 15 ```
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  6 + mkdir -p ../build
  7 + pushd ../build
  8 + cmake \
  9 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  10 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  11 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  12 + -DBUILD_SHARED_LIBS=ON \
  13 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  14 + -DSHERPA_ONNX_ENABLE_JNI=ON \
  15 + ..
  16 +
  17 + make -j4
  18 + ls -lh lib
  19 + popd
  20 +fi
  21 +
  22 +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  23 + pushd ../sherpa-onnx/java-api
  24 + make
  25 + popd
  26 +fi
  27 +
  28 +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  29 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  30 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  31 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  32 +fi
  33 +
  34 +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  35 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  36 +fi
  37 +
  38 +if [ ! -f ./0-four-speakers-zh.wav ]; then
  39 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  40 +fi
  41 +
  42 +java \
  43 + -Djava.library.path=$PWD/../build/lib \
  44 + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  45 + ./OfflineSpeakerDiarizationDemo.java
@@ -68,6 +68,15 @@ java_files += KeywordSpotterConfig.java @@ -68,6 +68,15 @@ java_files += KeywordSpotterConfig.java
68 java_files += KeywordSpotterResult.java 68 java_files += KeywordSpotterResult.java
69 java_files += KeywordSpotter.java 69 java_files += KeywordSpotter.java
70 70
  71 +java_files += OfflineSpeakerSegmentationPyannoteModelConfig.java
  72 +java_files += OfflineSpeakerSegmentationModelConfig.java
  73 +java_files += FastClusteringConfig.java
  74 +java_files += OfflineSpeakerDiarizationConfig.java
  75 +java_files += OfflineSpeakerDiarizationSegment.java
  76 +java_files += OfflineSpeakerDiarizationCallback.java
  77 +java_files += OfflineSpeakerDiarization.java
  78 +
  79 +
71 class_files := $(java_files:%.java=%.class) 80 class_files := $(java_files:%.java=%.class)
72 81
73 java_files := $(addprefix src/$(package_dir)/,$(java_files)) 82 java_files := $(addprefix src/$(package_dir)/,$(java_files))
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +public class FastClusteringConfig {
  6 + private final int numClusters;
  7 + private final float threshold;
  8 +
  9 + private FastClusteringConfig(Builder builder) {
  10 + this.numClusters = builder.numClusters;
  11 + this.threshold = builder.threshold;
  12 + }
  13 +
  14 + public static Builder builder() {
  15 + return new Builder();
  16 + }
  17 +
  18 + public int getNumClusters() {
  19 + return numClusters;
  20 + }
  21 +
  22 + public float getThreshold() {
  23 + return threshold;
  24 + }
  25 +
  26 + public static class Builder {
  27 + private int numClusters = -1;
  28 + private float threshold = 0.5f;
  29 +
  30 + public FastClusteringConfig build() {
  31 + return new FastClusteringConfig(this);
  32 + }
  33 +
  34 + public Builder setNumClusters(int numClusters) {
  35 + this.numClusters = numClusters;
  36 + return this;
  37 + }
  38 +
  39 + public Builder setThreshold(float threshold) {
  40 + this.threshold = threshold;
  41 + return this;
  42 + }
  43 + }
  44 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +public class OfflineSpeakerDiarization {
  6 + static {
  7 + System.loadLibrary("sherpa-onnx-jni");
  8 + }
  9 +
  10 + private long ptr = 0;
  11 +
  12 + public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) {
  13 + ptr = newFromFile(config);
  14 + }
  15 +
  16 + public int getSampleRate() {
  17 + return getSampleRate(ptr);
  18 + }
  19 +
  20 + // Only config.clustering is used. All other fields are ignored
  21 + public void setConfig(OfflineSpeakerDiarizationConfig config) {
  22 + setConfig(ptr, config);
  23 + }
  24 +
  25 + public OfflineSpeakerDiarizationSegment[] process(float[] samples) {
  26 + return process(ptr, samples);
  27 + }
  28 +
  29 + public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback) {
  30 + return processWithCallback(ptr, samples, callback, 0);
  31 + }
  32 +
  33 + public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback, long arg) {
  34 + return processWithCallback(ptr, samples, callback, arg);
  35 + }
  36 +
  37 + protected void finalize() throws Throwable {
  38 + release();
  39 + }
  40 +
  41 + // You'd better call it manually if it is not used anymore
  42 + public void release() {
  43 + if (this.ptr == 0) {
  44 + return;
  45 + }
  46 + delete(this.ptr);
  47 + this.ptr = 0;
  48 + }
  49 +
  50 + private native int getSampleRate(long ptr);
  51 +
  52 + private native void delete(long ptr);
  53 +
  54 + private native long newFromFile(OfflineSpeakerDiarizationConfig config);
  55 +
  56 + private native void setConfig(long ptr, OfflineSpeakerDiarizationConfig config);
  57 +
  58 + private native OfflineSpeakerDiarizationSegment[] process(long ptr, float[] samples);
  59 +
  60 + private native OfflineSpeakerDiarizationSegment[] processWithCallback(long ptr, float[] samples, OfflineSpeakerDiarizationCallback callback, long arg);
  61 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +@FunctionalInterface
  6 +public interface OfflineSpeakerDiarizationCallback {
  7 + Integer invoke(int numProcessedChunks, int numTotalCunks, long arg);
  8 +}
  1 +package com.k2fsa.sherpa.onnx;
  2 +
  3 +public class OfflineSpeakerDiarizationConfig {
  4 + private final OfflineSpeakerSegmentationModelConfig segmentation;
  5 + private final SpeakerEmbeddingExtractorConfig embedding;
  6 + private final FastClusteringConfig clustering;
  7 + private final float minDurationOn;
  8 + private final float minDurationOff;
  9 +
  10 + private OfflineSpeakerDiarizationConfig(Builder builder) {
  11 + this.segmentation = builder.segmentation;
  12 + this.embedding = builder.embedding;
  13 + this.clustering = builder.clustering;
  14 + this.minDurationOff = builder.minDurationOff;
  15 + this.minDurationOn = builder.minDurationOn;
  16 + }
  17 +
  18 + public static Builder builder() {
  19 + return new Builder();
  20 + }
  21 +
  22 + public OfflineSpeakerSegmentationModelConfig getSegmentation() {
  23 + return segmentation;
  24 + }
  25 +
  26 + public SpeakerEmbeddingExtractorConfig getEmbedding() {
  27 + return embedding;
  28 + }
  29 +
  30 + public FastClusteringConfig getClustering() {
  31 + return clustering;
  32 + }
  33 +
  34 + public float getMinDurationOff() {
  35 + return minDurationOff;
  36 + }
  37 +
  38 + public float getMinDurationOn() {
  39 + return minDurationOn;
  40 + }
  41 +
  42 + public static class Builder {
  43 + private OfflineSpeakerSegmentationModelConfig segmentation = OfflineSpeakerSegmentationModelConfig.builder().build();
  44 + private SpeakerEmbeddingExtractorConfig embedding = SpeakerEmbeddingExtractorConfig.builder().build();
  45 + private FastClusteringConfig clustering = FastClusteringConfig.builder().build();
  46 + private float minDurationOn = 0.2f;
  47 + private float minDurationOff = 0.5f;
  48 +
  49 + public OfflineSpeakerDiarizationConfig build() {
  50 + return new OfflineSpeakerDiarizationConfig(this);
  51 + }
  52 +
  53 + public Builder setSegmentation(OfflineSpeakerSegmentationModelConfig segmentation) {
  54 + this.segmentation = segmentation;
  55 + return this;
  56 + }
  57 +
  58 + public Builder setEmbedding(SpeakerEmbeddingExtractorConfig embedding) {
  59 + this.embedding = embedding;
  60 + return this;
  61 + }
  62 +
  63 + public Builder setClustering(FastClusteringConfig clustering) {
  64 + this.clustering = clustering;
  65 + return this;
  66 + }
  67 +
  68 + public Builder setMinDurationOff(float minDurationOff) {
  69 + this.minDurationOff = minDurationOff;
  70 + return this;
  71 + }
  72 +
  73 + public Builder setMinDurationOn(float minDurationOn) {
  74 + this.minDurationOn = minDurationOn;
  75 + return this;
  76 + }
  77 + }
  78 +
  79 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +public class OfflineSpeakerDiarizationSegment {
  6 + private final float start;
  7 + private final float end;
  8 + private final int speaker;
  9 +
  10 + public OfflineSpeakerDiarizationSegment(float start, float end, int speaker) {
  11 + this.start = start;
  12 + this.end = end;
  13 + this.speaker = speaker;
  14 + }
  15 +
  16 + public float getStart() {
  17 + return start;
  18 + }
  19 +
  20 + public float getEnd() {
  21 + return end;
  22 + }
  23 +
  24 + public int getSpeaker() {
  25 + return speaker;
  26 + }
  27 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +public class OfflineSpeakerSegmentationModelConfig {
  6 + private final OfflineSpeakerSegmentationPyannoteModelConfig pyannote;
  7 + private final int numThreads;
  8 + private final boolean debug;
  9 + private final String provider;
  10 +
  11 + private OfflineSpeakerSegmentationModelConfig(Builder builder) {
  12 + this.pyannote = builder.pyannote;
  13 + this.numThreads = builder.numThreads;
  14 + this.debug = builder.debug;
  15 + this.provider = builder.provider;
  16 + }
  17 +
  18 + public static Builder builder() {
  19 + return new Builder();
  20 + }
  21 +
  22 + public static class Builder {
  23 + private OfflineSpeakerSegmentationPyannoteModelConfig pyannote = OfflineSpeakerSegmentationPyannoteModelConfig.builder().build();
  24 + private int numThreads = 1;
  25 + private boolean debug = true;
  26 + private String provider = "cpu";
  27 +
  28 + public OfflineSpeakerSegmentationModelConfig build() {
  29 + return new OfflineSpeakerSegmentationModelConfig(this);
  30 + }
  31 +
  32 + public Builder setPyannote(OfflineSpeakerSegmentationPyannoteModelConfig pyannote) {
  33 + this.pyannote = pyannote;
  34 + return this;
  35 + }
  36 +
  37 + public Builder setNumThreads(int numThreads) {
  38 + this.numThreads = numThreads;
  39 + return this;
  40 + }
  41 +
  42 + public Builder setDebug(boolean debug) {
  43 + this.debug = debug;
  44 + return this;
  45 + }
  46 +
  47 + public Builder setProvider(String provider) {
  48 + this.provider = provider;
  49 + return this;
  50 + }
  51 + }
  52 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +public class OfflineSpeakerSegmentationPyannoteModelConfig {
  6 + private final String model;
  7 +
  8 + private OfflineSpeakerSegmentationPyannoteModelConfig(Builder builder) {
  9 + this.model = builder.model;
  10 + }
  11 +
  12 + public static Builder builder() {
  13 + return new Builder();
  14 + }
  15 +
  16 + public String getModel() {
  17 + return model;
  18 + }
  19 +
  20 + public static class Builder {
  21 + private String model = "";
  22 +
  23 + public OfflineSpeakerSegmentationPyannoteModelConfig build() {
  24 + return new OfflineSpeakerSegmentationPyannoteModelConfig(this);
  25 + }
  26 +
  27 + public Builder setModel(String model) {
  28 + this.model = model;
  29 + return this;
  30 + }
  31 + }
  32 +}
  1 +// Copyright 2024 Xiaomi Corporation
  2 +
1 package com.k2fsa.sherpa.onnx; 3 package com.k2fsa.sherpa.onnx;
2 4
3 @FunctionalInterface 5 @FunctionalInterface
@@ -50,5 +50,4 @@ public class SpeakerEmbeddingExtractorConfig { @@ -50,5 +50,4 @@ public class SpeakerEmbeddingExtractorConfig {
50 return this; 50 return this;
51 } 51 }
52 } 52 }
53 -  
54 } 53 }