main.pas
3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use the Pascal API from sherpa-onnx
for speaker diarization.
Usage:
Step 1: Download a speaker segmentation model
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available models. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
Step 2: Download a speaker embedding extractor model
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
for a list of available models. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
Step 3: Download test wave files
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available test wave files. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
Step 4: Run it
}
program main;
{$mode delphi}
uses
sherpa_onnx,
ctypes,
SysUtils;
{
  Progress callback passed to Sd.Process in the main program.

  Prints the percentage of processed chunks to standard output in the
  form "Progress: 12.345%".

  Parameters:
    NumProcessedChunks - number of chunks processed so far.
    NumTotalChunks     - total number of chunks to process.

  Returns: always 0.
  NOTE(review): how the caller interprets the return value is not visible
  in this file - confirm against the sherpa_onnx unit.
}
function ProgressCallback(
  NumProcessedChunks: cint32;
  NumTotalChunks: cint32): cint32; cdecl;
var
  Progress: Single;
begin
  { Guard against a non-positive total: the division below would otherwise
    be a floating-point division by zero, and this routine is invoked
    through a cdecl function pointer, so an exception must not escape it. }
  if NumTotalChunks > 0 then
    Progress := 100.0 * NumProcessedChunks / NumTotalChunks
  else
    Progress := 0.0;

  WriteLn(Format('Progress: %.3f%%', [Progress]));
  Result := 0;
end;
var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  Sd: TSherpaOnnxOfflineSpeakerDiarization;
  Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  I: Integer;
begin
  { Load the test recording and point the configuration at the
    segmentation and speaker-embedding models downloaded in steps 1-3. }
  Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');

  Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
  Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';

  {
    Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
    set NumClusters to 4 here.
    If you don't have such information, please set NumClusters to -1.
    In that case, you have to set Config.Clustering.Threshold.
    A larger threshold leads to fewer clusters, i.e., fewer speakers.
  }
  Config.Clustering.NumClusters := 4;
  Config.Segmentation.Debug := True;
  Config.Embedding.Debug := True;

  Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);

  { A nil handle means the diarizer could not be created from Config.
    NOTE(review): Sd is not freed on this path, as in the original code;
    whether the destructor tolerates a nil handle is not visible in this
    file - confirm against the sherpa_onnx unit before adding a Free here. }
  if Sd.GetHandle = nil then
  begin
    WriteLn('Please check you config');
    Exit;
  end;

  { From here on Sd owns a valid handle. The try/finally guarantees it is
    released on every way out: normal completion, the early Exit below,
    or an exception raised while processing or printing. }
  try
    if Sd.GetSampleRate <> Wave.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
      Exit;
    end;

    {
      // If you don't want to use a callback
      Segments := Sd.Process(Wave.Samples);
    }
    Segments := Sd.Process(Wave.Samples, @ProgressCallback);

    { Print one line per segment: start, stop, and the speaker index. }
    for I := Low(Segments) to High(Segments) do
    begin
      WriteLn(Format('%.3f -- %.3f speaker_%d',
        [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
    end;
  finally
    FreeAndNil(Sd);
  end;
end.