Committed by
GitHub
Pascal API for speaker diarization (#1420)
正在显示
5 个修改的文件
包含
506 行增加
和
2 行删除
| @@ -127,6 +127,21 @@ jobs: | @@ -127,6 +127,21 @@ jobs: | ||
| 127 | cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts | 127 | cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts |
| 128 | fi | 128 | fi |
| 129 | 129 | ||
| 130 | + - name: Run Pascal test (Speaker diarization) | ||
| 131 | + shell: bash | ||
| 132 | + run: | | ||
| 133 | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH | ||
| 134 | + | ||
| 135 | + cd ./pascal-api-examples | ||
| 136 | + pushd speaker-diarization | ||
| 137 | + | ||
| 138 | + ./run.sh | ||
| 139 | + rm -rfv *.onnx *.wav sherpa-onnx-* | ||
| 140 | + ls -lh | ||
| 141 | + echo "---" | ||
| 142 | + | ||
| 143 | + popd | ||
| 144 | + | ||
| 130 | - name: Run Pascal test (TTS) | 145 | - name: Run Pascal test (TTS) |
| 131 | shell: bash | 146 | shell: bash |
| 132 | run: | | 147 | run: | |
| @@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html | @@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html | ||
| 9 | |Directory| Description| | 9 | |Directory| Description| |
| 10 | |---------|------------| | 10 | |---------|------------| |
| 11 | |[read-wav](./read-wav)|It shows how to read a wave file.| | 11 | |[read-wav](./read-wav)|It shows how to read a wave file.| |
| 12 | +|[speaker-diarization](./speaker-diarization)|It shows how to use Pascal API for speaker diarization.| | ||
| 12 | |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.| | 13 | |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.| |
| 13 | |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| | 14 | |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| |
| 14 | |[vad](./vad)| It shows how to use the voice activity detection API.| | 15 | |[vad](./vad)| It shows how to use the voice activity detection API.| |
| 1 | +{ Copyright (c) 2024 Xiaomi Corporation } | ||
| 2 | +{ | ||
| 3 | +This file shows how to use the Pascal API from sherpa-onnx | ||
| 4 | +for speaker diarization. | ||
| 5 | + | ||
| 6 | +Usage: | ||
| 7 | + | ||
| 8 | +Step 1: Download a speaker segmentation model | ||
| 9 | + | ||
| 10 | +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models | ||
| 11 | +for a list of available models. The following is an example | ||
| 12 | + | ||
| 13 | + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 14 | + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 15 | + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 16 | + | ||
| 17 | +Step 2: Download a speaker embedding extractor model | ||
| 18 | + | ||
| 19 | +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 20 | +for a list of available models. The following is an example | ||
| 21 | + | ||
| 22 | + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx | ||
| 23 | + | ||
| 24 | +Step 3: Download test wave files | ||
| 25 | + | ||
| 26 | +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models | ||
| 27 | +for a list of available test wave files. The following is an example | ||
| 28 | + | ||
| 29 | + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav | ||
| 30 | + | ||
| 31 | +Step 4: Run it | ||
| 32 | +} | ||
| 33 | + | ||
| 34 | +program main; | ||
| 35 | + | ||
| 36 | +{$mode delphi} | ||
| 37 | + | ||
| 38 | +uses | ||
| 39 | + sherpa_onnx, | ||
| 40 | + ctypes, | ||
| 41 | + SysUtils; | ||
| 42 | + | ||
| 43 | +function ProgressCallback( | ||
| 44 | + NumProcessedChunks: cint32; | ||
| 45 | + NumTotalChunks: cint32): cint32; cdecl; | ||
| 46 | +var | ||
| 47 | + Progress: Single; | ||
| 48 | +begin | ||
| 49 | + { Prints the percentage of chunks processed so far; a non-positive total is reported as 0% instead of dividing by zero. } | ||
| 50 | + if NumTotalChunks > 0 then Progress := 100.0 * NumProcessedChunks / NumTotalChunks else Progress := 0.0; | ||
| 51 | + WriteLn(Format('Progress: %.3f%%', [Progress])); | ||
| 52 | + Result := 0; | ||
| 53 | +end; | ||
| 54 | + | ||
| 55 | +var | ||
| 56 | + Wave: TSherpaOnnxWave; | ||
| 57 | + Config: TSherpaOnnxOfflineSpeakerDiarizationConfig; | ||
| 58 | + Sd: TSherpaOnnxOfflineSpeakerDiarization; | ||
| 59 | + Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; | ||
| 60 | + I: Integer; | ||
| 61 | +begin | ||
| 62 | + Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav'); | ||
| 63 | + | ||
| 64 | + Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx'; | ||
| 65 | + Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx'; | ||
| 66 | + | ||
| 67 | + { | ||
| 68 | + Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we | ||
| 69 | + set NumClusters to 4 here. | ||
| 70 | + If you don't have such information, please set NumClusters to -1. | ||
| 71 | + In that case, you have to set Config.Clustering.Threshold. | ||
| 72 | + A larger threshold leads to fewer clusters, i.e., fewer speakers. | ||
| 73 | + } | ||
| 74 | + Config.Clustering.NumClusters := 4; | ||
| 75 | + Config.Segmentation.Debug := True; | ||
| 76 | + Config.Embedding.Debug := True; | ||
| 77 | + { Create the diarizer; every exit path below releases it again. } | ||
| 78 | + Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config); | ||
| 79 | + if Sd.GetHandle = nil then | ||
| 80 | + begin | ||
| 81 | + WriteLn('Please check your config'); | ||
| 82 | + FreeAndNil(Sd); | ||
| 83 | + Exit; | ||
| 84 | + end; | ||
| 85 | + if Sd.GetSampleRate <> Wave.SampleRate then | ||
| 86 | + begin | ||
| 87 | + WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate])); | ||
| 88 | + FreeAndNil(Sd); | ||
| 89 | + Exit; | ||
| 90 | + end; | ||
| 91 | + { | ||
| 92 | + // If you don't want to use a callback | ||
| 93 | + Segments := Sd.Process(Wave.Samples); | ||
| 94 | + } | ||
| 95 | + Segments := Sd.Process(Wave.Samples, @ProgressCallback); | ||
| 96 | + | ||
| 97 | + for I := Low(Segments) to High(Segments) do | ||
| 98 | + begin | ||
| 99 | + WriteLn(Format('%.3f -- %.3f speaker_%d', | ||
| 100 | + [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker])); | ||
| 101 | + end; | ||
| 102 | + | ||
| 103 | + FreeAndNil(Sd); | ||
| 104 | +end. |
| 1 | +#!/usr/bin/env bash | ||
| 2 | +# Builds the sherpa-onnx shared C API library if needed, compiles main.pas, downloads the models and test wave, then runs the example. | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) | ||
| 6 | +SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd) | ||
| 7 | +cd "$SCRIPT_DIR" # every relative path below assumes the script's own directory, so do not depend on the caller's cwd | ||
| 8 | +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" | ||
| 9 | +# Configure and install the shared library only when none of the platform variants (.dylib/.so/.dll) is present yet. | ||
| 10 | +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then | ||
| 11 | + mkdir -p ../../build | ||
| 12 | + pushd ../../build | ||
| 13 | + cmake \ | ||
| 14 | + -DCMAKE_INSTALL_PREFIX=./install \ | ||
| 15 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 16 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 17 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 18 | + -DBUILD_SHARED_LIBS=ON \ | ||
| 19 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 20 | + .. | ||
| 21 | + | ||
| 22 | + cmake --build . --target install --config Release | ||
| 23 | + popd | ||
| 24 | +fi | ||
| 25 | + | ||
| 26 | +fpc \ | ||
| 27 | + -dSHERPA_ONNX_USE_SHARED_LIBS \ | ||
| 28 | + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ | ||
| 29 | + -Fl$SHERPA_ONNX_DIR/build/install/lib \ | ||
| 30 | + ./main.pas | ||
| 31 | + | ||
| 32 | +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH | ||
| 33 | +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 34 | + | ||
| 35 | +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then | ||
| 36 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 37 | + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 38 | + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 39 | +fi | ||
| 40 | + | ||
| 41 | +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then | ||
| 42 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx | ||
| 43 | +fi | ||
| 44 | + | ||
| 45 | +if [ ! -f ./0-four-speakers-zh.wav ]; then | ||
| 46 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav | ||
| 47 | +fi | ||
| 48 | + | ||
| 49 | +./main |
| @@ -102,7 +102,7 @@ type | @@ -102,7 +102,7 @@ type | ||
| 102 | 102 | ||
| 103 | function Generate(Text: AnsiString; SpeakerId: Integer; | 103 | function Generate(Text: AnsiString; SpeakerId: Integer; |
| 104 | Speed: Single; | 104 | Speed: Single; |
| 105 | - Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; | 105 | + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg; |
| 106 | Arg: Pointer | 106 | Arg: Pointer |
| 107 | ): TSherpaOnnxGeneratedAudio; overload; | 107 | ): TSherpaOnnxGeneratedAudio; overload; |
| 108 | 108 | ||
| @@ -398,6 +398,78 @@ type | @@ -398,6 +398,78 @@ type | ||
| 398 | property GetHandle: Pointer Read Handle; | 398 | property GetHandle: Pointer Read Handle; |
| 399 | end; | 399 | end; |
| 400 | 400 | ||
| 401 | + | ||
| 402 | + TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record | ||
| 403 | + Model: AnsiString; | ||
| 404 | + function ToString: AnsiString; | ||
| 405 | + end; | ||
| 406 | + | ||
| 407 | + TSherpaOnnxOfflineSpeakerSegmentationModelConfig = record | ||
| 408 | + Pyannote: TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig; | ||
| 409 | + NumThreads: Integer; | ||
| 410 | + Debug: Boolean; | ||
| 411 | + Provider: AnsiString; | ||
| 412 | + function ToString: AnsiString; | ||
| 413 | + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig); | ||
| 414 | + end; | ||
| 415 | + | ||
| 416 | + TSherpaOnnxFastClusteringConfig = record | ||
| 417 | + NumClusters: Integer; | ||
| 418 | + Threshold: Single; | ||
| 419 | + function ToString: AnsiString; | ||
| 420 | + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig); | ||
| 421 | + end; | ||
| 422 | + | ||
| 423 | + TSherpaOnnxSpeakerEmbeddingExtractorConfig = record | ||
| 424 | + Model: AnsiString; | ||
| 425 | + NumThreads: Integer; | ||
| 426 | + Debug: Boolean; | ||
| 427 | + Provider: AnsiString; | ||
| 428 | + function ToString: AnsiString; | ||
| 429 | + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig); | ||
| 430 | + end; | ||
| 431 | + | ||
| 432 | + TSherpaOnnxOfflineSpeakerDiarizationConfig = record | ||
| 433 | + Segmentation: TSherpaOnnxOfflineSpeakerSegmentationModelConfig; | ||
| 434 | + Embedding: TSherpaOnnxSpeakerEmbeddingExtractorConfig; | ||
| 435 | + Clustering: TSherpaOnnxFastClusteringConfig; | ||
| 436 | + MinDurationOn: Single; | ||
| 437 | + MinDurationOff: Single; | ||
| 438 | + function ToString: AnsiString; | ||
| 439 | + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 440 | + end; | ||
| 441 | + | ||
| 442 | + TSherpaOnnxOfflineSpeakerDiarizationSegment = record | ||
| 443 | + Start: Single; | ||
| 444 | + Stop: Single; | ||
| 445 | + Speaker: Integer; | ||
| 446 | + function ToString: AnsiString; | ||
| 447 | + end; | ||
| 448 | + | ||
| 449 | + TSherpaOnnxOfflineSpeakerDiarizationSegmentArray = array of TSherpaOnnxOfflineSpeakerDiarizationSegment; | ||
| 450 | + | ||
| 451 | + PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = ^TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg; | ||
| 452 | + | ||
| 453 | + TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = function( | ||
| 454 | + NumProcessChunks: cint32; | ||
| 455 | + NumTotalChunks: cint32): cint32; cdecl; | ||
| 456 | + | ||
| 457 | + TSherpaOnnxOfflineSpeakerDiarization = class | ||
| 458 | + private | ||
| 459 | + Handle: Pointer; | ||
| 460 | + SampleRate: Integer; | ||
| 461 | + _Config: TSherpaOnnxOfflineSpeakerDiarizationConfig; | ||
| 462 | + public | ||
| 463 | + constructor Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 464 | + destructor Destroy; override; | ||
| 465 | + procedure SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 466 | + function Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload; | ||
| 467 | + function Process(Samples: array of Single; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload; | ||
| 468 | + property GetHandle: Pointer Read Handle; | ||
| 469 | + property GetSampleRate: Integer Read SampleRate; | ||
| 470 | + end; | ||
| 471 | + | ||
| 472 | + | ||
| 401 | { It supports reading a single channel wave with 16-bit encoded samples. | 473 | { It supports reading a single channel wave with 16-bit encoded samples. |
| 402 | Samples are normalized to the range [-1, 1]. | 474 | Samples are normalized to the range [-1, 1]. |
| 403 | } | 475 | } |
| @@ -656,6 +728,47 @@ type | @@ -656,6 +728,47 @@ type | ||
| 656 | 728 | ||
| 657 | PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut; | 729 | PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut; |
| 658 | 730 | ||
| 731 | + SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record | ||
| 732 | + Model: PAnsiChar; | ||
| 733 | + end; | ||
| 734 | + | ||
| 735 | + SherpaOnnxOfflineSpeakerSegmentationModelConfig = record | ||
| 736 | + Pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig; | ||
| 737 | + NumThreads: cint32; | ||
| 738 | + Debug: cint32; | ||
| 739 | + Provider: PAnsiChar; | ||
| 740 | + end; | ||
| 741 | + | ||
| 742 | + SherpaOnnxFastClusteringConfig = record | ||
| 743 | + NumClusters: cint32; | ||
| 744 | + Threshold: cfloat; | ||
| 745 | + end; | ||
| 746 | + | ||
| 747 | + SherpaOnnxSpeakerEmbeddingExtractorConfig = record | ||
| 748 | + Model: PAnsiChar; | ||
| 749 | + NumThreads: cint32; | ||
| 750 | + Debug: cint32; | ||
| 751 | + Provider: PAnsiChar; | ||
| 752 | + end; | ||
| 753 | + | ||
| 754 | + SherpaOnnxOfflineSpeakerDiarizationConfig = record | ||
| 755 | + Segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig; | ||
| 756 | + Embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig; | ||
| 757 | + Clustering: SherpaOnnxFastClusteringConfig; | ||
| 758 | + MinDurationOn: cfloat; | ||
| 759 | + MinDurationOff: cfloat; | ||
| 760 | + end; | ||
| 761 | + | ||
| 762 | + SherpaOnnxOfflineSpeakerDiarizationSegment = record | ||
| 763 | + Start: cfloat; | ||
| 764 | + Stop: cfloat; | ||
| 765 | + Speaker: cint32; | ||
| 766 | + end; | ||
| 767 | + | ||
| 768 | + PSherpaOnnxOfflineSpeakerDiarizationSegment = ^SherpaOnnxOfflineSpeakerDiarizationSegment; | ||
| 769 | + | ||
| 770 | + PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig; | ||
| 771 | + | ||
| 659 | function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32; | 772 | function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32; |
| 660 | SampleRateOutHz: cint32; | 773 | SampleRateOutHz: cint32; |
| 661 | FilterCutoffHz: cfloat; | 774 | FilterCutoffHz: cfloat; |
| @@ -677,6 +790,37 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec | @@ -677,6 +790,37 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec | ||
| 677 | procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl; | 790 | procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl; |
| 678 | external SherpaOnnxLibName; | 791 | external SherpaOnnxLibName; |
| 679 | 792 | ||
| 793 | +function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl; | ||
| 794 | + external SherpaOnnxLibName; | ||
| 795 | + | ||
| 796 | +procedure SherpaOnnxDestroyOfflineSpeakerDiarization(P: Pointer); cdecl; | ||
| 797 | + external SherpaOnnxLibName; | ||
| 798 | + | ||
| 799 | +function SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(P: Pointer): cint32; cdecl; | ||
| 800 | + external SherpaOnnxLibName; | ||
| 801 | + | ||
| 802 | +procedure SherpaOnnxOfflineSpeakerDiarizationSetConfig(P: Pointer; Config: PSherpaOnnxOfflineSpeakerDiarizationConfig); cdecl; | ||
| 803 | + external SherpaOnnxLibName; | ||
| 804 | + | ||
| 805 | +function SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(P: Pointer): cint32; cdecl; | ||
| 806 | + external SherpaOnnxLibName; | ||
| 807 | + | ||
| 808 | +function SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(P: Pointer): PSherpaOnnxOfflineSpeakerDiarizationSegment; cdecl; | ||
| 809 | + external SherpaOnnxLibName; | ||
| 810 | + | ||
| 811 | +procedure SherpaOnnxOfflineSpeakerDiarizationDestroySegment(P: Pointer); cdecl; | ||
| 812 | + external SherpaOnnxLibName; | ||
| 813 | + | ||
| 814 | +function SherpaOnnxOfflineSpeakerDiarizationProcess(P: Pointer; Samples: pcfloat; N: cint32): Pointer; cdecl; | ||
| 815 | + external SherpaOnnxLibName; | ||
| 816 | + | ||
| 817 | +function SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(P: Pointer; | ||
| 818 | + Samples: pcfloat; N: cint32; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): Pointer; cdecl; | ||
| 819 | + external SherpaOnnxLibName; | ||
| 820 | + | ||
| 821 | +procedure SherpaOnnxOfflineSpeakerDiarizationDestroyResult(P: Pointer); cdecl; | ||
| 822 | + external SherpaOnnxLibName; | ||
| 823 | + | ||
| 680 | function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl; | 824 | function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl; |
| 681 | external SherpaOnnxLibName; | 825 | external SherpaOnnxLibName; |
| 682 | 826 | ||
| @@ -1773,7 +1917,7 @@ end; | @@ -1773,7 +1917,7 @@ end; | ||
| 1773 | 1917 | ||
| 1774 | function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer; | 1918 | function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer; |
| 1775 | Speed: Single; | 1919 | Speed: Single; |
| 1776 | - Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; | 1920 | + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg; |
| 1777 | Arg: Pointer | 1921 | Arg: Pointer |
| 1778 | ): TSherpaOnnxGeneratedAudio; | 1922 | ): TSherpaOnnxGeneratedAudio; |
| 1779 | var | 1923 | var |
| @@ -1847,4 +1991,195 @@ begin | @@ -1847,4 +1991,195 @@ begin | ||
| 1847 | SherpaOnnxLinearResamplerReset(Self.Handle); | 1991 | SherpaOnnxLinearResamplerReset(Self.Handle); |
| 1848 | end; | 1992 | end; |
| 1849 | 1993 | ||
| 1994 | +function TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig.ToString: AnsiString; | ||
| 1995 | +begin | ||
| 1996 | + Result := Format('TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(' + | ||
| 1997 | + 'Model := %s)',[Self.Model]); | ||
| 1998 | +end; | ||
| 1999 | + | ||
| 2000 | +function TSherpaOnnxOfflineSpeakerSegmentationModelConfig.ToString: AnsiString; | ||
| 2001 | +begin { Renders every field as 'Name := value' wrapped in this record's own type name (the nested Pyannote config prints itself). } | ||
| 2002 | + Result := Format('TSherpaOnnxOfflineSpeakerSegmentationModelConfig(' + | ||
| 2003 | + 'Pyannote := %s, ' + | ||
| 2004 | + 'NumThreads := %d, ' + | ||
| 2005 | + 'Debug := %s, ' + | ||
| 2006 | + 'Provider := %s)', | ||
| 2007 | + [Self.Pyannote.ToString, Self.NumThreads, | ||
| 2008 | + Self.Debug.ToString, Self.Provider]); | ||
| 2009 | +end; | ||
| 2010 | + | ||
| 2011 | +class operator TSherpaOnnxOfflineSpeakerSegmentationModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig); | ||
| 2012 | +begin | ||
| 2013 | + Dest.NumThreads := 1; | ||
| 2014 | + Dest.Debug := False; | ||
| 2015 | + Dest.Provider := 'cpu'; | ||
| 2016 | +end; | ||
| 2017 | + | ||
| 2018 | +function TSherpaOnnxFastClusteringConfig.ToString: AnsiString; | ||
| 2019 | +begin | ||
| 2020 | + Result := Format('TSherpaOnnxFastClusteringConfig(' + | ||
| 2021 | + 'NumClusters := %d, Threshold := %.3f)', | ||
| 2022 | + [Self.NumClusters, Self.Threshold]); | ||
| 2023 | +end; | ||
| 2024 | + | ||
| 2025 | +class operator TSherpaOnnxFastClusteringConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig); | ||
| 2026 | +begin | ||
| 2027 | + Dest.NumClusters := -1; | ||
| 2028 | + Dest.Threshold := 0.5; | ||
| 2029 | +end; | ||
| 2030 | + | ||
| 2031 | +function TSherpaOnnxSpeakerEmbeddingExtractorConfig.ToString: AnsiString; | ||
| 2032 | +begin | ||
| 2033 | + Result := Format('TSherpaOnnxSpeakerEmbeddingExtractorConfig(' + | ||
| 2034 | + 'Model := %s, '+ | ||
| 2035 | + 'NumThreads := %d, '+ | ||
| 2036 | + 'Debug := %s, '+ | ||
| 2037 | + 'Provider := %s)', | ||
| 2038 | + [Self.Model, Self.NumThreads, Self.Debug.ToString, Self.Provider]); | ||
| 2039 | +end; | ||
| 2040 | + | ||
| 2041 | +class operator TSherpaOnnxSpeakerEmbeddingExtractorConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig); | ||
| 2042 | +begin | ||
| 2043 | + Dest.NumThreads := 1; | ||
| 2044 | + Dest.Debug := False; | ||
| 2045 | + Dest.Provider := 'cpu'; | ||
| 2046 | +end; | ||
| 2047 | + | ||
| 2048 | +function TSherpaOnnxOfflineSpeakerDiarizationConfig.ToString: AnsiString; | ||
| 2049 | +begin | ||
| 2050 | + Result := Format('TSherpaOnnxOfflineSpeakerDiarizationConfig(' + | ||
| 2051 | + 'Segmentation := %s, '+ | ||
| 2052 | + 'Embedding := %s, '+ | ||
| 2053 | + 'Clustering := %s, '+ | ||
| 2054 | + 'MinDurationOn := %.3f, '+ | ||
| 2055 | + 'MinDurationOff := %.3f)', | ||
| 2056 | + [Self.Segmentation.ToString, Self.Embedding.ToString, | ||
| 2057 | + Self.Clustering.ToString, Self.MinDurationOn, Self.MinDurationOff]); | ||
| 2058 | +end; | ||
| 2059 | + | ||
| 2060 | +class operator TSherpaOnnxOfflineSpeakerDiarizationConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 2061 | +begin | ||
| 2062 | + Dest.MinDurationOn := 0.2; | ||
| 2063 | + Dest.MinDurationOff := 0.5; | ||
| 2064 | +end; | ||
| 2065 | + | ||
| 2066 | +function TSherpaOnnxOfflineSpeakerDiarizationSegment.ToString: AnsiString; | ||
| 2067 | +begin | ||
| 2068 | + Result := Format('TSherpaOnnxOfflineSpeakerDiarizationSegment(' + | ||
| 2069 | + 'Start := %.3f, '+ | ||
| 2070 | + 'Stop := %.3f, '+ | ||
| 2071 | + 'Speaker := %d)', | ||
| 2072 | + [Self.Start, Self.Stop, Self.Speaker]); | ||
| 2073 | +end; | ||
| 2074 | + | ||
| 2075 | +constructor TSherpaOnnxOfflineSpeakerDiarization.Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 2076 | +var | ||
| 2077 | + C: SherpaOnnxOfflineSpeakerDiarizationConfig; | ||
| 2078 | +begin { Copies Config field by field into the C-compatible record C and creates the native diarization object from it. } | ||
| 2079 | + C := Default(SherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 2080 | + C.Segmentation.Pyannote.Model := PAnsiChar(Config.Segmentation.Pyannote.Model); | ||
| 2081 | + C.Segmentation.NumThreads := Config.Segmentation.NumThreads; | ||
| 2082 | + C.Segmentation.Debug := Ord(Config.Segmentation.Debug); | ||
| 2083 | + C.Segmentation.Provider := PAnsiChar(Config.Segmentation.Provider); | ||
| 2084 | + { Speaker embedding extractor settings; Boolean flags are passed as integers via Ord. } | ||
| 2085 | + C.Embedding.Model := PAnsiChar(Config.Embedding.Model); | ||
| 2086 | + C.Embedding.NumThreads := Config.Embedding.NumThreads; | ||
| 2087 | + C.Embedding.Debug := Ord(Config.Embedding.Debug); | ||
| 2088 | + C.Embedding.Provider := PAnsiChar(Config.Embedding.Provider); | ||
| 2089 | + { Clustering settings. } | ||
| 2090 | + C.Clustering.NumClusters := Config.Clustering.NumClusters; | ||
| 2091 | + C.Clustering.Threshold := Config.Clustering.Threshold; | ||
| 2092 | + { Minimum-duration settings. } | ||
| 2093 | + C.MinDurationOn := Config.MinDurationOn; | ||
| 2094 | + C.MinDurationOff := Config.MinDurationOff; | ||
| 2095 | + { The Pascal-side Config is kept in _Config regardless of whether the native call returns a handle. } | ||
| 2096 | + Self.Handle := SherpaOnnxCreateOfflineSpeakerDiarization(@C); | ||
| 2097 | + Self._Config := Config; | ||
| 2098 | + Self.SampleRate := 0; | ||
| 2099 | + { Query the sample rate only when a native handle exists; otherwise SampleRate stays 0. } | ||
| 2100 | + if Self.Handle <> nil then | ||
| 2101 | + begin | ||
| 2102 | + Self.SampleRate := SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(Self.Handle); | ||
| 2103 | + end; | ||
| 2104 | +end; | ||
| 2105 | + | ||
| 2106 | +destructor TSherpaOnnxOfflineSpeakerDiarization.Destroy; | ||
| 2107 | +begin { Passes Handle to the native destroy routine and then clears it. } | ||
| 2108 | + SherpaOnnxDestroyOfflineSpeakerDiarization(Self.Handle); | ||
| 2109 | + Self.Handle := nil; | ||
| 2110 | +end; | ||
| 2111 | + | ||
| 2112 | +procedure TSherpaOnnxOfflineSpeakerDiarization.SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 2113 | +var | ||
| 2114 | + C: SherpaOnnxOfflineSpeakerDiarizationConfig; | ||
| 2115 | +begin | ||
| 2116 | + { Forwards only the clustering settings (NumClusters, Threshold) of Config to the native object; all other fields of C keep their default values. } | ||
| 2117 | + if Self.Handle = nil then Exit; { nothing to update when Create could not obtain a native handle } | ||
| 2118 | + C := Default(SherpaOnnxOfflineSpeakerDiarizationConfig); | ||
| 2119 | + C.Clustering.NumClusters := Config.Clustering.NumClusters; | ||
| 2120 | + C.Clustering.Threshold := Config.Clustering.Threshold; | ||
| 2121 | + SherpaOnnxOfflineSpeakerDiarizationSetConfig(Self.Handle, @C); | ||
| 2122 | +end; | ||
| 2123 | + | ||
| 2124 | +function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; | ||
| 2125 | +var | ||
| 2126 | + R: Pointer; | ||
| 2127 | + NumSegments: Integer; | ||
| 2128 | + I: Integer; | ||
| 2129 | + Segments: PSherpaOnnxOfflineSpeakerDiarizationSegment; | ||
| 2130 | +begin | ||
| 2131 | + { Runs diarization on Samples and copies the segments returned by ...ResultSortByStartTime into a Pascal array; returns nil when there is no native handle or no result. } | ||
| 2132 | + Result := nil; | ||
| 2133 | + if Self.Handle = nil then Exit; { Create leaves Handle nil when the native object could not be created } | ||
| 2134 | + | ||
| 2135 | + R := SherpaOnnxOfflineSpeakerDiarizationProcess(Self.Handle, pcfloat(Samples), Length(Samples)); | ||
| 2136 | + if R = nil then Exit; | ||
| 2137 | + | ||
| 2138 | + NumSegments := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(R); | ||
| 2139 | + | ||
| 2140 | + Segments := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(R); | ||
| 2141 | + | ||
| 2142 | + SetLength(Result, NumSegments); | ||
| 2143 | + for I := Low(Result) to High(Result) do | ||
| 2144 | + begin | ||
| 2145 | + Result[I].Start := Segments[I].Start; | ||
| 2146 | + Result[I].Stop := Segments[I].Stop; | ||
| 2147 | + Result[I].Speaker := Segments[I].Speaker; | ||
| 2148 | + end; | ||
| 2149 | + { The values were copied above, so both native allocations can be released here. } | ||
| 2150 | + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Segments); | ||
| 2151 | + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R); | ||
| 2152 | +end; | ||
| 2153 | + | ||
| 2154 | +function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single; | ||
| 2155 | + Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; | ||
| 2156 | +var | ||
| 2157 | + R: Pointer; | ||
| 2158 | + NumSegments: Integer; | ||
| 2159 | + I: Integer; | ||
| 2160 | + Segments: PSherpaOnnxOfflineSpeakerDiarizationSegment; | ||
| 2161 | +begin | ||
| 2162 | + { Same as the overload without a callback, but hands Callback to the native call; returns nil when there is no native handle or no result. } | ||
| 2163 | + Result := nil; | ||
| 2164 | + if Self.Handle = nil then Exit; { Create leaves Handle nil when the native object could not be created } | ||
| 2165 | + | ||
| 2166 | + R := SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(Self.Handle, pcfloat(Samples), Length(Samples), Callback); | ||
| 2167 | + if R = nil then Exit; | ||
| 2168 | + | ||
| 2169 | + NumSegments := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(R); | ||
| 2170 | + | ||
| 2171 | + Segments := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(R); | ||
| 2172 | + | ||
| 2173 | + SetLength(Result, NumSegments); | ||
| 2174 | + for I := Low(Result) to High(Result) do | ||
| 2175 | + begin | ||
| 2176 | + Result[I].Start := Segments[I].Start; | ||
| 2177 | + Result[I].Stop := Segments[I].Stop; | ||
| 2178 | + Result[I].Speaker := Segments[I].Speaker; | ||
| 2179 | + end; | ||
| 2180 | + { The values were copied above, so both native allocations can be released here. } | ||
| 2181 | + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Segments); | ||
| 2182 | + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R); | ||
| 2183 | +end; | ||
| 2184 | + | ||
| 1850 | end. | 2185 | end. |
-
请 注册 或 登录 后发表评论