OfflineSpeakerDiarizationDemo.java
4.0 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// Copyright 2024 Xiaomi Corporation
// This file shows how to use the sherpa-onnx Java API for speaker diarization.
import com.k2fsa.sherpa.onnx.*;
/**
 * Demo of offline speaker diarization with the sherpa-onnx Java API.
 *
 * <p>The program reads a wave file, configures a pyannote segmentation model, a speaker embedding
 * extractor and fast clustering, runs diarization while printing progress, and finally prints one
 * line per detected segment in the form {@code start -- end speaker_NN}.
 */
public class OfflineSpeakerDiarizationDemo {
  /**
   * Runs the diarization demo on hard-coded model and wave file paths.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    /* Please use the following commands to download files used in this file

      Step 1: Download a speaker segmentation model

      Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
      for a list of available models. The following is an example

        wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
        tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
        rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

      Step 2: Download a speaker embedding extractor model

      Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
      for a list of available models. The following is an example

        wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

      Step 3. Download test wave files

      Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
      for a list of available test wave files. The following is an example

        wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

      Step 4. Run it
    */

    String segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
    String embeddingModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
    String waveFilename = "./0-four-speakers-zh.wav";

    WaveReader reader = new WaveReader(waveFilename);

    OfflineSpeakerSegmentationPyannoteModelConfig pyannote =
        OfflineSpeakerSegmentationPyannoteModelConfig.builder().setModel(segmentationModel).build();

    OfflineSpeakerSegmentationModelConfig segmentation =
        OfflineSpeakerSegmentationModelConfig.builder()
            .setPyannote(pyannote)
            .setDebug(true)
            .build();

    SpeakerEmbeddingExtractorConfig embedding =
        SpeakerEmbeddingExtractorConfig.builder().setModel(embeddingModel).setDebug(true).build();

    // The test wave file ./0-four-speakers-zh.wav contains four speakers, so
    // we use numClusters=4 here. If you don't know the number of speakers
    // in the test wave file, please set numClusters to -1 and provide a
    // threshold for clustering.
    FastClusteringConfig clustering =
        FastClusteringConfig.builder()
            .setNumClusters(4) // set it to -1 if you don't know the actual number
            .setThreshold(0.5f)
            .build();

    OfflineSpeakerDiarizationConfig config =
        OfflineSpeakerDiarizationConfig.builder()
            .setSegmentation(segmentation)
            .setEmbedding(embedding)
            .setClustering(clustering)
            .setMinDurationOn(0.2f)
            .setMinDurationOff(0.5f)
            .build();

    OfflineSpeakerDiarization sd = new OfflineSpeakerDiarization(config);
    try {
      // The wave file must have the sample rate the diarization object reports;
      // otherwise report the mismatch and stop.
      if (sd.getSampleRate() != reader.getSampleRate()) {
        System.out.printf(
            "Expected sample rate: %d, given: %d\n", sd.getSampleRate(), reader.getSampleRate());
        return;
      }

      // OfflineSpeakerDiarizationSegment[] segments = sd.process(reader.getSamples());
      // without callback is also ok

      // or you can use a callback to show the progress
      OfflineSpeakerDiarizationSegment[] segments =
          sd.processWithCallback(
              reader.getSamples(),
              (int numProcessedChunks, int numTotalChunks, long arg) -> {
                // 100.0f forces float arithmetic, so this is a percentage, not integer division.
                float progress = 100.0f * numProcessedChunks / numTotalChunks;
                System.out.printf("Progress: %.2f%%\n", progress);

                return 0;
              });

      for (OfflineSpeakerDiarizationSegment s : segments) {
        System.out.printf("%.3f -- %.3f speaker_%02d\n", s.getStart(), s.getEnd(), s.getSpeaker());
      }
    } finally {
      // Release on every exit path: normal completion, the sample-rate
      // mismatch return above, and any exception thrown while processing.
      sd.release();
    }
  }
}