Fangjun Kuang
Committed by GitHub

Pascal API for speaker diarization (#1420)

@@ -127,6 +127,21 @@ jobs: @@ -127,6 +127,21 @@ jobs:
127 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts 127 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
128 fi 128 fi
129 129
  130 + - name: Run Pascal test (Speaker diarization)
  131 + shell: bash
  132 + run: |
  133 + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
  134 +
  135 + cd ./pascal-api-examples
  136 + pushd speaker-diarization
  137 +
  138 + ./run.sh
  139 + rm -rfv *.onnx *.wav sherpa-onnx-*
  140 + ls -lh
  141 + echo "---"
  142 +
  143 + popd
  144 +
130 - name: Run Pascal test (TTS) 145 - name: Run Pascal test (TTS)
131 shell: bash 146 shell: bash
132 run: | 147 run: |
@@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html @@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
9 |Directory| Description| 9 |Directory| Description|
10 |---------|------------| 10 |---------|------------|
11 |[read-wav](./read-wav)|It shows how to read a wave file.| 11 |[read-wav](./read-wav)|It shows how to read a wave file.|
  12 +|[speaker-diarization](./speaker-diarization)|It shows how to use the Pascal API for speaker diarization.|
12 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.| 13 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
13 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| 14 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
14 |[vad](./vad)| It shows how to use the voice activity detection API.| 15 |[vad](./vad)| It shows how to use the voice activity detection API.|
  1 +{ Copyright (c) 2024 Xiaomi Corporation }
  2 +{
  3 +This file shows how to use the Pascal API from sherpa-onnx
  4 +for speaker diarization.
  5 +
  6 +Usage:
  7 +
  8 +Step 1: Download a speaker segmentation model
  9 +
  10 +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  11 +for a list of available models. The following is an example
  12 +
  13 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  14 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  15 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  16 +
  17 +Step 2: Download a speaker embedding extractor model
  18 +
  19 +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  20 +for a list of available models. The following is an example
  21 +
  22 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  23 +
  24 +Step 3: Download test wave files
  25 +
  26 +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  27 +for a list of available test wave files. The following is an example
  28 +
  29 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  30 +
  31 +Step 4: Run it
  32 +}
  33 +
  34 +program main;
  35 +
  36 +{$mode delphi}
  37 +
  38 +uses
  39 + sherpa_onnx,
  40 + ctypes,
  41 + SysUtils;
  42 +
  43 +function ProgressCallback(
  44 + NumProcessedChunks: cint32;
  45 + NumTotalChunks: cint32): cint32; cdecl;
  46 +var
  47 + Progress: Single;
  48 +begin
  49 + Progress := 100.0 * NumProcessedChunks / NumTotalChunks;
  50 + WriteLn(Format('Progress: %.3f%%', [Progress]));
  51 +
  52 + Result := 0;
  53 +end;
  54 +
  55 +var
  56 + Wave: TSherpaOnnxWave;
  57 + Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  58 + Sd: TSherpaOnnxOfflineSpeakerDiarization;
  59 + Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  60 + I: Integer;
  61 +begin
  62 + Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');
  63 +
  64 + Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
  65 + Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';
  66 +
  67 + {
  68 + Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
  69 + set NumClusters to 4 here.
  70 + If you don't have such information, please set NumClusters to -1.
  71 + In that case, you have to set Config.Clustering.Threshold.
  72 + A larger threshold leads to fewer clusters, i.e., fewer speakers.
  73 + }
  74 + Config.Clustering.NumClusters := 4;
  75 + Config.Segmentation.Debug := True;
  76 + Config.Embedding.Debug := True;
  77 +
  78 + Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);
  79 + if Sd.GetHandle = nil then
  80 + begin
  81 + WriteLn('Please check you config');
  82 + Exit;
  83 + end;
  84 +
  85 + if Sd.GetSampleRate <> Wave.SampleRate then
  86 + begin
  87 + WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
  88 + Exit;
  89 + end;
  90 +
  91 + {
  92 + // If you don't want to use a callback
  93 + Segments := Sd.Process(Wave.Samples);
  94 + }
  95 + Segments := Sd.Process(Wave.Samples, @ProgressCallback);
  96 +
  97 + for I := Low(Segments) to High(Segments) do
  98 + begin
  99 + WriteLn(Format('%.3f -- %.3f speaker_%d',
  100 + [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
  101 + end;
  102 +
  103 + FreeAndNil(Sd);
  104 +end.
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +fpc \
  27 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  28 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  29 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  30 + ./main.pas
  31 +
  32 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  33 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  34 +
  35 +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  36 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  37 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  38 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  39 +fi
  40 +
  41 +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  42 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  43 +fi
  44 +
  45 +if [ ! -f ./0-four-speakers-zh.wav ]; then
  46 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  47 +fi
  48 +
  49 +./main
@@ -102,7 +102,7 @@ type @@ -102,7 +102,7 @@ type
102 102
103 function Generate(Text: AnsiString; SpeakerId: Integer; 103 function Generate(Text: AnsiString; SpeakerId: Integer;
104 Speed: Single; 104 Speed: Single;
105 - Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; 105 + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
106 Arg: Pointer 106 Arg: Pointer
107 ): TSherpaOnnxGeneratedAudio; overload; 107 ): TSherpaOnnxGeneratedAudio; overload;
108 108
@@ -398,6 +398,78 @@ type @@ -398,6 +398,78 @@ type
398 property GetHandle: Pointer Read Handle; 398 property GetHandle: Pointer Read Handle;
399 end; 399 end;
400 400
  401 +
  402 + TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record
  403 + Model: AnsiString;
  404 + function ToString: AnsiString;
  405 + end;
  406 +
  407 + TSherpaOnnxOfflineSpeakerSegmentationModelConfig = record
  408 + Pyannote: TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
  409 + NumThreads: Integer;
  410 + Debug: Boolean;
  411 + Provider: AnsiString;
  412 + function ToString: AnsiString;
  413 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig);
  414 + end;
  415 +
  416 + TSherpaOnnxFastClusteringConfig = record
  417 + NumClusters: Integer;
  418 + Threshold: Single;
  419 + function ToString: AnsiString;
  420 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig);
  421 + end;
  422 +
  423 + TSherpaOnnxSpeakerEmbeddingExtractorConfig = record
  424 + Model: AnsiString;
  425 + NumThreads: Integer;
  426 + Debug: Boolean;
  427 + Provider: AnsiString;
  428 + function ToString: AnsiString;
  429 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig);
  430 + end;
  431 +
  432 + TSherpaOnnxOfflineSpeakerDiarizationConfig = record
  433 + Segmentation: TSherpaOnnxOfflineSpeakerSegmentationModelConfig;
  434 + Embedding: TSherpaOnnxSpeakerEmbeddingExtractorConfig;
  435 + Clustering: TSherpaOnnxFastClusteringConfig;
  436 + MinDurationOn: Single;
  437 + MinDurationOff: Single;
  438 + function ToString: AnsiString;
  439 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig);
  440 + end;
  441 +
  442 + TSherpaOnnxOfflineSpeakerDiarizationSegment = record
  443 + Start: Single;
  444 + Stop: Single;
  445 + Speaker: Integer;
  446 + function ToString: AnsiString;
  447 + end;
  448 +
  449 + TSherpaOnnxOfflineSpeakerDiarizationSegmentArray = array of TSherpaOnnxOfflineSpeakerDiarizationSegment;
  450 +
  451 + PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = ^TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg;
  452 +
  453 + TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = function(
  454 + NumProcessChunks: cint32;
  455 + NumTotalChunks: cint32): cint32; cdecl;
  456 +
  457 + TSherpaOnnxOfflineSpeakerDiarization = class
  458 + private
  459 + Handle: Pointer;
  460 + SampleRate: Integer;
  461 + _Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  462 + public
  463 + constructor Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
  464 + destructor Destroy; override;
  465 + procedure SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
  466 + function Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload;
  467 + function Process(Samples: array of Single; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload;
  468 + property GetHandle: Pointer Read Handle;
  469 + property GetSampleRate: Integer Read SampleRate;
  470 + end;
  471 +
  472 +
401 { It supports reading a single channel wave with 16-bit encoded samples. 473 { It supports reading a single channel wave with 16-bit encoded samples.
402 Samples are normalized to the range [-1, 1]. 474 Samples are normalized to the range [-1, 1].
403 } 475 }
@@ -656,6 +728,47 @@ type @@ -656,6 +728,47 @@ type
656 728
657 PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut; 729 PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
658 730
  731 + SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record
  732 + Model: PAnsiChar;
  733 + end;
  734 +
  735 + SherpaOnnxOfflineSpeakerSegmentationModelConfig = record
  736 + Pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
  737 + NumThreads: cint32;
  738 + Debug: cint32;
  739 + Provider: PAnsiChar;
  740 + end;
  741 +
  742 + SherpaOnnxFastClusteringConfig = record
  743 + NumClusters: cint32;
  744 + Threshold: cfloat;
  745 + end;
  746 +
  747 + SherpaOnnxSpeakerEmbeddingExtractorConfig = record
  748 + Model: PAnsiChar;
  749 + NumThreads: cint32;
  750 + Debug: cint32;
  751 + Provider: PAnsiChar;
  752 + end;
  753 +
  754 + SherpaOnnxOfflineSpeakerDiarizationConfig = record
  755 + Segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig;
  756 + Embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig;
  757 + Clustering: SherpaOnnxFastClusteringConfig;
  758 + MinDurationOn: cfloat;
  759 + MinDurationOff: cfloat;
  760 + end;
  761 +
  762 + SherpaOnnxOfflineSpeakerDiarizationSegment = record
  763 + Start: cfloat;
  764 + Stop: cfloat;
  765 + Speaker: cint32;
  766 + end;
  767 +
  768 + PSherpaOnnxOfflineSpeakerDiarizationSegment = ^SherpaOnnxOfflineSpeakerDiarizationSegment;
  769 +
  770 + PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig;
  771 +
659 function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32; 772 function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
660 SampleRateOutHz: cint32; 773 SampleRateOutHz: cint32;
661 FilterCutoffHz: cfloat; 774 FilterCutoffHz: cfloat;
@@ -677,6 +790,37 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec @@ -677,6 +790,37 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec
677 procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl; 790 procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
678 external SherpaOnnxLibName; 791 external SherpaOnnxLibName;
679 792
  793 +function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl;
  794 + external SherpaOnnxLibName;
  795 +
  796 +procedure SherpaOnnxDestroyOfflineSpeakerDiarization(P: Pointer); cdecl;
  797 + external SherpaOnnxLibName;
  798 +
  799 +function SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(P: Pointer): cint32; cdecl;
  800 + external SherpaOnnxLibName;
  801 +
  802 +procedure SherpaOnnxOfflineSpeakerDiarizationSetConfig(P: Pointer; Config: PSherpaOnnxOfflineSpeakerDiarizationConfig); cdecl;
  803 + external SherpaOnnxLibName;
  804 +
  805 +function SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(P: Pointer): cint32; cdecl;
  806 + external SherpaOnnxLibName;
  807 +
  808 +function SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(P: Pointer): PSherpaOnnxOfflineSpeakerDiarizationSegment; cdecl;
  809 + external SherpaOnnxLibName;
  810 +
  811 +procedure SherpaOnnxOfflineSpeakerDiarizationDestroySegment(P: Pointer); cdecl;
  812 + external SherpaOnnxLibName;
  813 +
  814 +function SherpaOnnxOfflineSpeakerDiarizationProcess(P: Pointer; Samples: pcfloat; N: cint32): Pointer; cdecl;
  815 + external SherpaOnnxLibName;
  816 +
  817 +function SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(P: Pointer;
  818 + Samples: pcfloat; N: cint32; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): Pointer; cdecl;
  819 + external SherpaOnnxLibName;
  820 +
  821 +procedure SherpaOnnxOfflineSpeakerDiarizationDestroyResult(P: Pointer); cdecl;
  822 + external SherpaOnnxLibName;
  823 +
680 function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl; 824 function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl;
681 external SherpaOnnxLibName; 825 external SherpaOnnxLibName;
682 826
@@ -1773,7 +1917,7 @@ end; @@ -1773,7 +1917,7 @@ end;
1773 1917
1774 function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer; 1918 function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
1775 Speed: Single; 1919 Speed: Single;
1776 - Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; 1920 + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
1777 Arg: Pointer 1921 Arg: Pointer
1778 ): TSherpaOnnxGeneratedAudio; 1922 ): TSherpaOnnxGeneratedAudio;
1779 var 1923 var
@@ -1847,4 +1991,195 @@ begin @@ -1847,4 +1991,195 @@ begin
1847 SherpaOnnxLinearResamplerReset(Self.Handle); 1991 SherpaOnnxLinearResamplerReset(Self.Handle);
1848 end; 1992 end;
1849 1993
  1994 +function TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig.ToString: AnsiString;
  1995 +begin
  1996 + Result := Format('TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(' +
  1997 + 'Model := %s)',[Self.Model]);
  1998 +end;
  1999 +
  2000 +function TSherpaOnnxOfflineSpeakerSegmentationModelConfig.ToString: AnsiString;
  2001 +begin
  2002 + Result := Format('TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(' +
  2003 + 'Pyannote := %s, ' +
  2004 + 'NumThreads := %d, ' +
  2005 + 'Debug := %s, ' +
  2006 + 'Provider := %s)',
  2007 + [Self.Pyannote.ToString, Self.NumThreads,
  2008 + Self.Debug.ToString, Self.Provider]);
  2009 +end;
  2010 +
  2011 +class operator TSherpaOnnxOfflineSpeakerSegmentationModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig);
  2012 +begin
  2013 + Dest.NumThreads := 1;
  2014 + Dest.Debug := False;
  2015 + Dest.Provider := 'cpu';
  2016 +end;
  2017 +
  2018 +function TSherpaOnnxFastClusteringConfig.ToString: AnsiString;
  2019 +begin
  2020 + Result := Format('TSherpaOnnxFastClusteringConfig(' +
  2021 + 'NumClusters := %d, Threshold := %.3f)',
  2022 + [Self.NumClusters, Self.Threshold]);
  2023 +end;
  2024 +
  2025 +class operator TSherpaOnnxFastClusteringConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig);
  2026 +begin
  2027 + Dest.NumClusters := -1;
  2028 + Dest.Threshold := 0.5;
  2029 +end;
  2030 +
  2031 +function TSherpaOnnxSpeakerEmbeddingExtractorConfig.ToString: AnsiString;
  2032 +begin
  2033 + Result := Format('TSherpaOnnxSpeakerEmbeddingExtractorConfig(' +
  2034 + 'Model := %s, '+
  2035 + 'NumThreads := %d, '+
  2036 + 'Debug := %s, '+
  2037 + 'Provider := %s)',
  2038 + [Self.Model, Self.NumThreads, Self.Debug.ToString, Self.Provider]);
  2039 +end;
  2040 +
  2041 +class operator TSherpaOnnxSpeakerEmbeddingExtractorConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig);
  2042 +begin
  2043 + Dest.NumThreads := 1;
  2044 + Dest.Debug := False;
  2045 + Dest.Provider := 'cpu';
  2046 +end;
  2047 +
  2048 +function TSherpaOnnxOfflineSpeakerDiarizationConfig.ToString: AnsiString;
  2049 +begin
  2050 + Result := Format('TSherpaOnnxOfflineSpeakerDiarizationConfig(' +
  2051 + 'Segmentation := %s, '+
  2052 + 'Embedding := %s, '+
  2053 + 'Clustering := %s, '+
  2054 + 'MinDurationOn := %.3f, '+
  2055 + 'MinDurationOff := %.3f)',
  2056 + [Self.Segmentation.ToString, Self.Embedding.ToString,
  2057 + Self.Clustering.ToString, Self.MinDurationOn, Self.MinDurationOff]);
  2058 +end;
  2059 +
  2060 +class operator TSherpaOnnxOfflineSpeakerDiarizationConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig);
  2061 +begin
  2062 + Dest.MinDurationOn := 0.2;
  2063 + Dest.MinDurationOff := 0.5;
  2064 +end;
  2065 +
  2066 +function TSherpaOnnxOfflineSpeakerDiarizationSegment.ToString: AnsiString;
  2067 +begin
  2068 + Result := Format('TSherpaOnnxOfflineSpeakerDiarizationSegment(' +
  2069 + 'Start := %.3f, '+
  2070 + 'Stop := %.3f, '+
  2071 + 'Speaker := %d)',
  2072 + [Self.Start, Self.Stop, Self.Speaker]);
  2073 +end;
  2074 +
  2075 +constructor TSherpaOnnxOfflineSpeakerDiarization.Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
  2076 +var
  2077 + C: SherpaOnnxOfflineSpeakerDiarizationConfig;
  2078 +begin
  2079 + C := Default(SherpaOnnxOfflineSpeakerDiarizationConfig);
  2080 + C.Segmentation.Pyannote.Model := PAnsiChar(Config.Segmentation.Pyannote.Model);
  2081 + C.Segmentation.NumThreads := Config.Segmentation.NumThreads;
  2082 + C.Segmentation.Debug := Ord(Config.Segmentation.Debug);
  2083 + C.Segmentation.Provider := PAnsiChar(Config.Segmentation.Provider);
  2084 +
  2085 + C.Embedding.Model := PAnsiChar(Config.Embedding.Model);
  2086 + C.Embedding.NumThreads := Config.Embedding.NumThreads;
  2087 + C.Embedding.Debug := Ord(Config.Embedding.Debug);
  2088 + C.Embedding.Provider := PAnsiChar(Config.Embedding.Provider);
  2089 +
  2090 + C.Clustering.NumClusters := Config.Clustering.NumClusters;
  2091 + C.Clustering.Threshold := Config.Clustering.Threshold;
  2092 +
  2093 + C.MinDurationOn := Config.MinDurationOn;
  2094 + C.MinDurationOff := Config.MinDurationOff;
  2095 +
  2096 + Self.Handle := SherpaOnnxCreateOfflineSpeakerDiarization(@C);
  2097 + Self._Config := Config;
  2098 + Self.SampleRate := 0;
  2099 +
  2100 + if Self.Handle <> nil then
  2101 + begin
  2102 + Self.SampleRate := SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(Self.Handle);
  2103 + end;
  2104 +end;
  2105 +
  2106 +destructor TSherpaOnnxOfflineSpeakerDiarization.Destroy;
  2107 +begin
  2108 + SherpaOnnxDestroyOfflineSpeakerDiarization(Self.Handle);
  2109 + Self.Handle := nil;
  2110 +end;
  2111 +
  2112 +procedure TSherpaOnnxOfflineSpeakerDiarization.SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
  2113 +var
  2114 + C: SherpaOnnxOfflineSpeakerDiarizationConfig;
  2115 +begin
  2116 + C := Default(SherpaOnnxOfflineSpeakerDiarizationConfig);
  2117 +
  2118 + C.Clustering.NumClusters := Config.Clustering.NumClusters;
  2119 + C.Clustering.Threshold := Config.Clustering.Threshold;
  2120 +
  2121 + SherpaOnnxOfflineSpeakerDiarizationSetConfig(Self.Handle, @C);
  2122 +end;
  2123 +
  2124 +function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  2125 +var
  2126 + R: Pointer;
  2127 + NumSegments: Integer;
  2128 + I: Integer;
  2129 + Segments: PSherpaOnnxOfflineSpeakerDiarizationSegment;
  2130 +begin
  2131 + Result := nil;
  2132 +
  2133 + R := SherpaOnnxOfflineSpeakerDiarizationProcess(Self.Handle, pcfloat(Samples), Length(Samples));
  2134 + if R = nil then
  2135 + begin
  2136 + Exit
  2137 + end;
  2138 + NumSegments := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(R);
  2139 +
  2140 + Segments := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(R);
  2141 +
  2142 + SetLength(Result, NumSegments);
  2143 + for I := Low(Result) to High(Result) do
  2144 + begin
  2145 + Result[I].Start := Segments[I].Start;
  2146 + Result[I].Stop := Segments[I].Stop;
  2147 + Result[I].Speaker := Segments[I].Speaker;
  2148 + end;
  2149 +
  2150 + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Segments);
  2151 + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R);
  2152 +end;
  2153 +
  2154 +function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single;
  2155 + callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  2156 +var
  2157 + R: Pointer;
  2158 + NumSegments: Integer;
  2159 + I: Integer;
  2160 + Segments: PSherpaOnnxOfflineSpeakerDiarizationSegment;
  2161 +begin
  2162 + Result := nil;
  2163 +
  2164 + R := SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(Self.Handle, pcfloat(Samples), Length(Samples), callback);
  2165 + if R = nil then
  2166 + begin
  2167 + Exit
  2168 + end;
  2169 + NumSegments := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(R);
  2170 +
  2171 + Segments := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(R);
  2172 +
  2173 + SetLength(Result, NumSegments);
  2174 + for I := Low(Result) to High(Result) do
  2175 + begin
  2176 + Result[I].Start := Segments[I].Start;
  2177 + Result[I].Stop := Segments[I].Stop;
  2178 + Result[I].Speaker := Segments[I].Speaker;
  2179 + end;
  2180 +
  2181 + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Segments);
  2182 + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R);
  2183 +end;
  2184 +
1850 end. 2185 end.