C# API for speaker diarization (#1407)

Fangjun Kuang · GitHub
Commit a45e5dba9986f07f13d4f64b13ff77589a9909e3 a45e5dba 1 parent bd50e795
.github/scripts/test-dot-net.sh
.github/workflows/test-dot-net.yaml
dotnet-examples/offline-speaker-diarization/Program.cs
dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj
dotnet-examples/offline-speaker-diarization/run.sh
dotnet-examples/sherpa-onnx.sln
scripts/dotnet/FastClusteringConfig.cs
scripts/dotnet/OfflineSpeakerDiarization.cs
scripts/dotnet/OfflineSpeakerDiarizationConfig.cs
scripts/dotnet/OfflineSpeakerDiarizationSegment.cs
scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs
scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs
--- a/.github/scripts/test-dot-net.sh
查看文件 @a45e5db
+++ b/.github/scripts/test-dot-net.sh
查看文件 @a45e5db
@@ -2,7 +2,13 @@
 
 cd dotnet-examples/
 
- cd ./offline-decode-files
+ cd ./offline-speaker-diarization
+ ./run.sh
+ rm -rfv *.onnx
+ rm -fv *.wav
+ rm -rfv sherpa-onnx-pyannote-*
+ 
+ cd ../offline-decode-files
 ./run-sense-voice-ctc.sh
 rm -rf sherpa-onnx-*
 
--- a/.github/workflows/test-dot-net.yaml
查看文件 @a45e5db
+++ b/.github/workflows/test-dot-net.yaml
查看文件 @a45e5db
@@ -47,53 +47,10 @@ jobs:
         with:
           fetch-depth: 0
 
-       - name: Free space
-         if: matrix.os == 'ubuntu-latest'
-         shell: bash
-         run: |
-           df -h
-           rm -rf /opt/hostedtoolcache
-           df -h
- 
-       - name: Free more space
-         if: matrix.os == 'ubuntu-latest'
-         shell: bash
-         run: |
-           # https://github.com/orgs/community/discussions/25678
-           cd /opt
-           find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'
- 
-           sudo rm -rf /usr/share/dotnet
-           sudo rm -rf "/usr/local/share/boost"
-           sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- 
-       - name: Free Disk Space (Ubuntu)
-         if: matrix.os == 'ubuntu-latest'
-         uses: jlumbroso/free-disk-space@main
-         with:
-           # this might remove tools that are actually needed,
-           # if set to "true" but frees about 6 GB
-           tool-cache: false
- 
-           # all of these default to true, but feel free to set to
-           # "false" if necessary for your workflow
-           android: true
-           dotnet: false
-           haskell: true
-           large-packages: true
-           docker-images: false
-           swap-storage: true
- 
-       - name: Check space
-         if: matrix.os == 'ubuntu-latest'
-         shell: bash
-         run: |
-           df -h
- 
       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2
         with:
-           key: ${{ matrix.os }}-release-shared
+           key: ${{ matrix.os }}-dotnet-release-shared
 
       - name: Build sherpa-onnx
         shell: bash
@@ -110,11 +67,16 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
             -DBUILD_ESPEAK_NG_EXE=OFF \
-             -DSHERPA_ONNX_ENABLE_BINARY=ON \
+             -DSHERPA_ONNX_ENABLE_BINARY=OFF \
             ..
 
           cmake --build . --target install --config Release
 
+           rm -rf install/share
+           rm -rf install/lib/pkg*
+ 
+           ls -lh ./install/lib
+ 
       - uses: actions/upload-artifact@v4
         with:
           name: ${{ matrix.os }}
@@ -148,7 +110,7 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: ubuntu-latest
-           path: /tmp/linux
+           path: /tmp/linux-x64
 
       - name: Setup .NET
         uses: actions/setup-dotnet@v4
@@ -162,17 +124,21 @@ jobs:
       - name: Display files
         shell: bash
         run: |
-           echo "----------/tmp/----------"
-           ls -lh /tmp/
+           echo "----------/tmp----------"
+           ls -lh /tmp
 
-           echo "----------/tmp/linux----------"
-           ls -lh /tmp/linux
+           echo "----------/tmp/linux-x64----------"
+           ls -lh /tmp/linux-x64
+           df -h
 
       - name: Build
         shell: bash
         run: |
           cd scripts/dotnet
           ./run.sh
+           df -h
+ 
+           ls -lh /tmp/packages
 
       - name: Copy files
         shell: bash
@@ -181,9 +147,14 @@ jobs:
 
           ls -lh /tmp
 
+           df -h
+ 
       - name: Run tests
         shell: bash
         run: |
+           dotnet nuget locals all --clear
+           df -h
+ 
           .github/scripts/test-dot-net.sh
 
       - uses: actions/upload-artifact@v4
--- a/dotnet-examples/offline-speaker-diarization/Program.cs 0 → 100644
查看文件 @a45e5db
+++ b/dotnet-examples/offline-speaker-diarization/Program.cs 0 → 100644
查看文件 @a45e5db
+ // Copyright (c)  2024  Xiaomi Corporation
+ //
+ 
+ // This file shows how to use sherpa-onnx C# API for speaker diarization
+ /*
+ Usage:
+ 
+ Step 1: Download a speaker segmentation model
+ 
+ Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
+ for a list of available models. The following is an example
+ 
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+   tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+   rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ 
+ Step 2: Download a speaker embedding extractor model
+ 
+ Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
+ for a list of available models. The following is an example
+ 
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+ 
+ Step 3. Download test wave files
+ 
+ Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
+ for a list of available test wave files. The following is an example
+ 
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+ 
+ Step 4. Run it
+ 
+   dotnet run
+ */
+ 
+ using SherpaOnnx;
+ using System;
+ 
+ /// <summary>
+ /// Console demo: runs offline speaker diarization on a test wave file and
+ /// prints one line per detected segment.
+ /// </summary>
+ class OfflineSpeakerDiarizationDemo
+ {
+   /// <summary>
+   /// Progress callback handed to the diarizer; prints the percentage of
+   /// processed chunks with two decimals and returns 0.
+   /// </summary>
+   static int OnProgress(int numProcessedChunks, int numTotalChunks, IntPtr arg)
+   {
+     float progress = 100.0F * numProcessedChunks / numTotalChunks;
+     Console.WriteLine("Progress {0:0.00}%", progress);
+     return 0;
+   }
+ 
+   static void Main(string[] args)
+   {
+     var config = new OfflineSpeakerDiarizationConfig();
+     config.Segmentation.Pyannote.Model = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
+     config.Embedding.Model = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
+ 
+     // The test wave ./0-four-speakers-zh.wav has 4 speakers, so
+     // we set NumClusters to 4.
+     config.Clustering.NumClusters = 4;
+ 
+     // If you don't know the number of speakers in the test wave file, please
+     // use
+     // config.Clustering.Threshold = 0.5F; // You need to tune this threshold
+ 
+     // OfflineSpeakerDiarization is IDisposable: the using declaration
+     // disposes it on every exit path, including the early return below.
+     using var sd = new OfflineSpeakerDiarization(config);
+ 
+     var testWaveFile = "./0-four-speakers-zh.wav";
+     WaveReader waveReader = new WaveReader(testWaveFile);
+     if (sd.SampleRate != waveReader.SampleRate)
+     {
+       Console.WriteLine($"Expected sample rate: {sd.SampleRate}. Given: {waveReader.SampleRate}");
+       return;
+     }
+ 
+     Console.WriteLine("Started");
+ 
+     // var segments = sd.Process(waveReader.Samples); // this one is also ok
+ 
+     var callback = new OfflineSpeakerDiarizationProgressCallback(OnProgress);
+     var segments = sd.ProcessWithCallback(waveReader.Samples, callback, IntPtr.Zero);
+ 
+     foreach (var s in segments)
+     {
+       Console.WriteLine("{0:0.00} -- {1:0.00} speaker_{2}", s.Start, s.End, s.Speaker);
+     }
+   }
+ }
--- a/dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj 0 → 100644
查看文件 @a45e5db
+++ b/dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj 0 → 100644
查看文件 @a45e5db
+ <Project Sdk="Microsoft.NET.Sdk">
+ 
+   <!-- Console executable for the offline speaker diarization example. -->
+   <PropertyGroup>
+     <OutputType>Exe</OutputType>
+     <TargetFramework>net6.0</TargetFramework>
+     <RootNamespace>offline_speaker_diarization</RootNamespace>
+     <ImplicitUsings>enable</ImplicitUsings>
+     <Nullable>enable</Nullable>
+   </PropertyGroup>
+ 
+   <!-- The example builds against the sibling Common project. -->
+   <ItemGroup>
+     <ProjectReference Include="..\Common\Common.csproj" />
+   </ItemGroup>
+ 
+ </Project>
--- a/dotnet-examples/offline-speaker-diarization/run.sh 0 → 100755
查看文件 @a45e5db
+++ b/dotnet-examples/offline-speaker-diarization/run.sh 0 → 100755
查看文件 @a45e5db
+ #!/usr/bin/env bash
+ #
+ # Downloads the segmentation model, the speaker embedding model and the test
+ # wave file if they are not already present, then runs the example.
+ 
+ # Stop at the first failing command so that a failed download or extraction
+ # is reported directly instead of surfacing later as a confusing runtime error.
+ set -e
+ 
+ # curl -f makes HTTP errors (e.g. 404) fail the command instead of saving the
+ # error page under the model's file name.
+ if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
+   curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+   tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+   rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+ fi
+ 
+ if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
+   curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+ fi
+ 
+ if [ ! -f ./0-four-speakers-zh.wav ]; then
+   curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+ fi
+ 
+ dotnet run
--- a/dotnet-examples/sherpa-onnx.sln
查看文件 @a45e5db
+++ b/dotnet-examples/sherpa-onnx.sln
查看文件 @a45e5db
@@ -31,6 +31,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TTS", "TTS\TTS.csproj", "{DACE4A18-4FC8-4437-92BF-5A90BA81286C}"
 EndProject
+ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
+ EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -93,6 +95,10 @@ Global
 		{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.Build.0 = Release|Any CPU
+ 		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ 		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ 		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ 		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/scripts/dotnet/FastClusteringConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/FastClusteringConfig.cs 0 → 100644
查看文件 @a45e5db
+ // Copyright (c)  2024  Xiaomi Corporation
+ 
+ using System.Runtime.InteropServices;
+ 
+ namespace SherpaOnnx
+ {
+ 
+     /// <summary>
+     /// Clustering options used by speaker diarization. The layout is
+     /// sequential, so the declaration order of the fields must not change.
+     /// </summary>
+     [StructLayout(LayoutKind.Sequential)]
+     public struct FastClusteringConfig
+     {
+         /// <summary>
+         /// Number of clusters; defaults to -1.
+         /// NOTE(review): presumably -1 means "unknown" and Threshold is used
+         /// instead - confirm against the native library.
+         /// </summary>
+         public int NumClusters;
+ 
+         /// <summary>Clustering threshold; defaults to 0.5.</summary>
+         public float Threshold;
+ 
+         /// <summary>Creates a config holding the default values.</summary>
+         public FastClusteringConfig()
+         {
+             Threshold = 0.5f;
+             NumClusters = -1;
+         }
+     }
+ }
--- a/scripts/dotnet/OfflineSpeakerDiarization.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerDiarization.cs 0 → 100644
查看文件 @a45e5db
+ // Copyright (c)  2024  Xiaomi Corporation
+ using System;
+ using System.Runtime.InteropServices;
+ using System.Text;
+ 
+ namespace SherpaOnnx
+ {
+     /// <summary>
+     /// Progress callback invoked from native code while
+     /// <see cref="OfflineSpeakerDiarization.ProcessWithCallback"/> is running.
+     /// </summary>
+     /// <param name="numProcessedChunks">Number of chunks processed so far.</param>
+     /// <param name="numTotalChunks">Total number of chunks to process.</param>
+     /// <param name="arg">Opaque pointer; presumably the same value that was
+     /// given to ProcessWithCallback (confirm against the native library).</param>
+     /// <returns>An integer handed back to the native caller.</returns>
+     // Cdecl matches the calling convention declared on the import that
+     // receives this delegate; without it the default convention is used.
+     [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
+     public delegate int OfflineSpeakerDiarizationProgressCallback(int numProcessedChunks, int numTotalChunks, IntPtr arg);
+ 
+     /// <summary>
+     /// Managed wrapper around the native offline speaker diarization object.
+     /// Dispose it to release the native object deterministically.
+     /// </summary>
+     public class OfflineSpeakerDiarization : IDisposable
+     {
+         /// <summary>Creates the native object from <paramref name="config"/>.</summary>
+         /// <exception cref="InvalidOperationException">
+         /// The native library returned a null handle.
+         /// </exception>
+         public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config)
+         {
+             IntPtr h = SherpaOnnxCreateOfflineSpeakerDiarization(ref config);
+             if (h == IntPtr.Zero)
+             {
+                 // There is nothing to release, so the finalizer need not run.
+                 System.GC.SuppressFinalize(this);
+                 throw new InvalidOperationException(
+                     "Failed to create OfflineSpeakerDiarization. Please check your config.");
+             }
+ 
+             _handle = new HandleRef(this, h);
+         }
+ 
+         /// <summary>Runs diarization on <paramref name="samples"/>.</summary>
+         /// <returns>The segments found; empty when native code returns no result.</returns>
+         /// <exception cref="ArgumentNullException"><paramref name="samples"/> is null.</exception>
+         /// <exception cref="ObjectDisposedException">The object has been disposed.</exception>
+         public OfflineSpeakerDiarizationSegment[] Process(float[] samples)
+         {
+             if (samples == null)
+             {
+                 throw new ArgumentNullException(nameof(samples));
+             }
+ 
+             IntPtr h = GetHandleOrThrow();
+             IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcess(h, samples, samples.Length);
+ 
+             // Only the raw IntPtr is passed to native code, so keep this
+             // wrapper (and its finalizer) from running until the call returns.
+             System.GC.KeepAlive(this);
+             return ProcessImpl(result);
+         }
+ 
+         /// <summary>
+         /// Same as <see cref="Process"/>, but passes <paramref name="callback"/>
+         /// and <paramref name="arg"/> to the native library.
+         /// </summary>
+         /// <exception cref="ArgumentNullException"><paramref name="samples"/> is null.</exception>
+         /// <exception cref="ObjectDisposedException">The object has been disposed.</exception>
+         public OfflineSpeakerDiarizationSegment[] ProcessWithCallback(float[] samples, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg)
+         {
+             if (samples == null)
+             {
+                 throw new ArgumentNullException(nameof(samples));
+             }
+ 
+             IntPtr h = GetHandleOrThrow();
+             IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(h, samples, samples.Length, callback, arg);
+ 
+             System.GC.KeepAlive(this);
+             return ProcessImpl(result);
+         }
+ 
+         // Copies the native segments into managed objects and always releases
+         // the native result, even when marshalling throws.
+         private OfflineSpeakerDiarizationSegment[] ProcessImpl(IntPtr result)
+         {
+             if (result == IntPtr.Zero)
+             {
+                 return new OfflineSpeakerDiarizationSegment[] {};
+             }
+ 
+             IntPtr p = IntPtr.Zero;
+             try
+             {
+                 int numSegments = SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result);
+                 if (numSegments <= 0)
+                 {
+                     return new OfflineSpeakerDiarizationSegment[] {};
+                 }
+ 
+                 p = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result);
+                 if (p == IntPtr.Zero)
+                 {
+                     return new OfflineSpeakerDiarizationSegment[] {};
+                 }
+ 
+                 OfflineSpeakerDiarizationSegment[] ans = new OfflineSpeakerDiarizationSegment[numSegments];
+                 unsafe
+                 {
+                     // Size in bytes of one native segment: two floats and one int.
+                     int size = sizeof(float) * 2 + sizeof(int);
+                     for (int i = 0; i != numSegments; ++i)
+                     {
+                         // IntPtr.Add() is avoided because it does not support net20.
+                         IntPtr t = new IntPtr((byte*)p + i * size);
+                         ans[i] = new OfflineSpeakerDiarizationSegment(t);
+                     }
+                 }
+ 
+                 return ans;
+             }
+             finally
+             {
+                 if (p != IntPtr.Zero)
+                 {
+                     SherpaOnnxOfflineSpeakerDiarizationDestroySegment(p);
+                 }
+ 
+                 SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result);
+             }
+         }
+ 
+         /// <summary>Releases the native object. Calling it more than once is safe.</summary>
+         public void Dispose()
+         {
+             Cleanup();
+             // Prevent the object from being placed on the
+             // finalization queue
+             System.GC.SuppressFinalize(this);
+         }
+ 
+         ~OfflineSpeakerDiarization()
+         {
+             Cleanup();
+         }
+ 
+         private void Cleanup()
+         {
+             IntPtr h = _handle.Handle;
+             if (h == IntPtr.Zero)
+             {
+                 // Already released (or never created): nothing to destroy.
+                 return;
+             }
+ 
+             // Don't permit the handle to be used again.
+             _handle = new HandleRef(this, IntPtr.Zero);
+ 
+             SherpaOnnxDestroyOfflineSpeakerDiarization(h);
+         }
+ 
+         // Returns the native handle, or throws when it has been released.
+         private IntPtr GetHandleOrThrow()
+         {
+             IntPtr h = _handle.Handle;
+             if (h == IntPtr.Zero)
+             {
+                 throw new ObjectDisposedException(nameof(OfflineSpeakerDiarization));
+             }
+ 
+             return h;
+         }
+ 
+         private HandleRef _handle;
+ 
+         /// <summary>Sample rate reported by the native object.</summary>
+         /// <exception cref="ObjectDisposedException">The object has been disposed.</exception>
+         public int SampleRate
+         {
+             get
+             {
+                 IntPtr h = GetHandleOrThrow();
+                 int sampleRate = SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(h);
+                 System.GC.KeepAlive(this);
+                 return sampleRate;
+             }
+         }
+ 
+         [DllImport(Dll.Filename)]
+         private static extern IntPtr SherpaOnnxCreateOfflineSpeakerDiarization(ref OfflineSpeakerDiarizationConfig config);
+ 
+         [DllImport(Dll.Filename)]
+         private static extern void SherpaOnnxDestroyOfflineSpeakerDiarization(IntPtr handle);
+ 
+         [DllImport(Dll.Filename)]
+         private static extern int SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(IntPtr handle);
+ 
+         [DllImport(Dll.Filename)]
+         private static extern int SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(IntPtr handle);
+ 
+         [DllImport(Dll.Filename)]
+         private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationProcess(IntPtr handle, float[] samples, int n);
+ 
+         [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
+         private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(IntPtr handle, float[] samples, int n, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg);
+ 
+         [DllImport(Dll.Filename)]
+         private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(IntPtr handle);
+ 
+         [DllImport(Dll.Filename)]
+         private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(IntPtr handle);
+ 
+         [DllImport(Dll.Filename)]
+         private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(IntPtr handle);
+     }
+ }
+ 
--- a/scripts/dotnet/OfflineSpeakerDiarizationConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerDiarizationConfig.cs 0 → 100644
查看文件 @a45e5db
+ // Copyright (c)  2024  Xiaomi Corporation
+ 
+ using System.Runtime.InteropServices;
+ 
+ namespace SherpaOnnx
+ {
+ 
+     /// <summary>
+     /// Top-level configuration for offline speaker diarization. The layout is
+     /// sequential, so the declaration order of the fields must not change.
+     /// </summary>
+     [StructLayout(LayoutKind.Sequential)]
+     public struct OfflineSpeakerDiarizationConfig
+     {
+         /// <summary>Speaker segmentation model settings.</summary>
+         public OfflineSpeakerSegmentationModelConfig Segmentation;
+ 
+         /// <summary>Speaker embedding extractor settings.</summary>
+         public SpeakerEmbeddingExtractorConfig Embedding;
+ 
+         /// <summary>Clustering settings.</summary>
+         public FastClusteringConfig Clustering;
+ 
+         /// <summary>
+         /// Defaults to 0.3.
+         /// NOTE(review): presumably the minimum duration of a speech segment
+         /// in seconds - confirm against the native library.
+         /// </summary>
+         public float MinDurationOn;
+ 
+         /// <summary>
+         /// Defaults to 0.5.
+         /// NOTE(review): presumably the minimum duration of a gap between
+         /// segments in seconds - confirm against the native library.
+         /// </summary>
+         public float MinDurationOff;
+ 
+         /// <summary>Creates a config with default sub-configs and durations.</summary>
+         public OfflineSpeakerDiarizationConfig()
+         {
+             MinDurationOn = 0.3f;
+             MinDurationOff = 0.5f;
+ 
+             Clustering = new FastClusteringConfig();
+             Embedding = new SpeakerEmbeddingExtractorConfig();
+             Segmentation = new OfflineSpeakerSegmentationModelConfig();
+         }
+     }
+ }
+ 
+ 
+ 
--- a/scripts/dotnet/OfflineSpeakerDiarizationSegment.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerDiarizationSegment.cs 0 → 100644
查看文件 @a45e5db
+ // Copyright (c)  2024  Xiaomi Corporation
+ using System;
+ using System.Runtime.InteropServices;
+ using System.Text;
+ 
+ namespace SherpaOnnx
+ {
+ 
+     /// <summary>
+     /// One diarization segment, copied out of native memory at construction.
+     /// </summary>
+     public class OfflineSpeakerDiarizationSegment
+     {
+         // Mirror of the native segment record; the field order defines the
+         // sequential layout and must not change.
+         [StructLayout(LayoutKind.Sequential)]
+         struct Impl
+         {
+             public float Start;
+             public float End;
+             public int Speaker;
+         }
+ 
+         /// <summary>Segment start (presumably in seconds - confirm).</summary>
+         public float Start;
+ 
+         /// <summary>Segment end (presumably in seconds - confirm).</summary>
+         public float End;
+ 
+         /// <summary>Integer speaker label of this segment.</summary>
+         public int Speaker;
+ 
+         /// <summary>
+         /// Reads one native segment record located at <paramref name="handle"/>.
+         /// The memory is only read here; it is not kept or freed.
+         /// </summary>
+         public OfflineSpeakerDiarizationSegment(IntPtr handle)
+         {
+             object boxed = Marshal.PtrToStructure(handle, typeof(Impl));
+             Impl native = (Impl)boxed;
+ 
+             Speaker = native.Speaker;
+             End = native.End;
+             Start = native.Start;
+         }
+     }
+ }
+ 
--- a/scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs 0 → 100644
查看文件 @a45e5db
+ // Copyright (c)  2024  Xiaomi Corporation
+ 
+ using System.Runtime.InteropServices;
+ 
+ namespace SherpaOnnx
+ {
+ 
+     /// <summary>
+     /// Settings of the speaker segmentation model. The layout is sequential,
+     /// so the declaration order of the fields must not change.
+     /// </summary>
+     [StructLayout(LayoutKind.Sequential)]
+     public struct OfflineSpeakerSegmentationModelConfig
+     {
+         /// <summary>Settings of the pyannote segmentation model.</summary>
+         public OfflineSpeakerSegmentationPyannoteModelConfig Pyannote;
+ 
+         /// <summary>
+         /// Number of threads used to run the neural network model; defaults to 1.
+         /// </summary>
+         public int NumThreads;
+ 
+         /// <summary>
+         /// Integer flag for printing debug information of the model; defaults
+         /// to 0. NOTE(review): presumably any non-zero value enables it - confirm.
+         /// </summary>
+         public int Debug;
+ 
+         /// <summary>Execution provider name; defaults to "cpu".</summary>
+         [MarshalAs(UnmanagedType.LPStr)]
+         public string Provider;
+ 
+         /// <summary>Creates a config holding the default values.</summary>
+         public OfflineSpeakerSegmentationModelConfig()
+         {
+             Provider = "cpu";
+             Debug = 0;
+             NumThreads = 1;
+             Pyannote = new OfflineSpeakerSegmentationPyannoteModelConfig();
+         }
+     }
+ }
+ 
+ 
--- a/scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs 0 → 100644
查看文件 @a45e5db
+ // Copyright (c)  2024  Xiaomi Corporation
+ 
+ using System.Runtime.InteropServices;
+ 
+ namespace SherpaOnnx
+ {
+ 
+     /// <summary>
+     /// Settings of the pyannote speaker segmentation model, laid out
+     /// sequentially.
+     /// </summary>
+     [StructLayout(LayoutKind.Sequential)]
+     public struct OfflineSpeakerSegmentationPyannoteModelConfig
+     {
+         /// <summary>
+         /// Path to the model file; defaults to the empty string.
+         /// </summary>
+         [MarshalAs(UnmanagedType.LPStr)]
+         public string Model;
+ 
+         /// <summary>Creates a config with an empty model path.</summary>
+         public OfflineSpeakerSegmentationPyannoteModelConfig()
+         {
+             Model = "";
+         }
+     }
+ }
+