C# API for speaker diarization (#1407)

Fangjun Kuang · GitHub
Commit a45e5dba9986f07f13d4f64b13ff77589a9909e3 a45e5dba 1 parent bd50e795
.github/scripts/test-dot-net.sh
.github/workflows/test-dot-net.yaml
dotnet-examples/offline-speaker-diarization/Program.cs
dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj
dotnet-examples/offline-speaker-diarization/run.sh
dotnet-examples/sherpa-onnx.sln
scripts/dotnet/FastClusteringConfig.cs
scripts/dotnet/OfflineSpeakerDiarization.cs
scripts/dotnet/OfflineSpeakerDiarizationConfig.cs
scripts/dotnet/OfflineSpeakerDiarizationSegment.cs
scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs
scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs
--- a/.github/scripts/test-dot-net.sh
查看文件 @a45e5db
+++ b/.github/scripts/test-dot-net.sh
查看文件 @a45e5db
@@ -2,7 +2,13 @@
 cd dotnet-examples/
-cd ./offline-decode-files
+cd ./offline-speaker-diarization
+./run.sh
+rm -rfv *.onnx
+rm -fv *.wav
+rm -rfv sherpa-onnx-pyannote-*
+
+cd ../offline-decode-files
 ./run-sense-voice-ctc.sh
 rm -rf sherpa-onnx-*
--- a/.github/workflows/test-dot-net.yaml
查看文件 @a45e5db
+++ b/.github/workflows/test-dot-net.yaml
查看文件 @a45e5db
@@ -47,53 +47,10 @@ jobs:
         with:
           fetch-depth: 0
-      - name: Free space
-        if: matrix.os == 'ubuntu-latest'
-        shell: bash
-        run: |
-          df -h
-          rm -rf /opt/hostedtoolcache
-          df -h
-
-      - name: Free more space
-        if: matrix.os == 'ubuntu-latest'
-        shell: bash
-        run: |
-          # https://github.com/orgs/community/discussions/25678
-          cd /opt
-          find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'
-
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-
-      - name: Free Disk Space (Ubuntu)
-        if: matrix.os == 'ubuntu-latest'
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: false
-          haskell: true
-          large-packages: true
-          docker-images: false
-          swap-storage: true
-
-      - name: Check space
-        if: matrix.os == 'ubuntu-latest'
-        shell: bash
-        run: |
-          df -h
-
       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2
         with:
-          key: ${{ matrix.os }}-release-shared
+          key: ${{ matrix.os }}-dotnet-release-shared
       - name: Build sherpa-onnx
         shell: bash
@@ -110,11 +67,16 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release \
             -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
             -DBUILD_ESPEAK_NG_EXE=OFF \
-            -DSHERPA_ONNX_ENABLE_BINARY=ON \
+            -DSHERPA_ONNX_ENABLE_BINARY=OFF \
             ..
           cmake --build . --target install --config Release
+          rm -rf install/share
+          rm -rf install/lib/pkg*
+
+          ls -lh ./install/lib
+
       - uses: actions/upload-artifact@v4
         with:
           name: ${{ matrix.os }}
@@ -148,7 +110,7 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           name: ubuntu-latest
-          path: /tmp/linux
+          path: /tmp/linux-x64
       - name: Setup .NET
         uses: actions/setup-dotnet@v4
@@ -162,17 +124,21 @@ jobs:
       - name: Display files
         shell: bash
         run: |
-          echo "----------/tmp/----------"
-          ls -lh /tmp/
+          echo "----------/tmp----------"
+          ls -lh /tmp
-          echo "----------/tmp/linux----------"
-          ls -lh /tmp/linux
+          echo "----------/tmp/linux-x64----------"
+          ls -lh /tmp/linux-x64
+          df -h
       - name: Build
         shell: bash
         run: |
           cd scripts/dotnet
           ./run.sh
+          df -h
+
+          ls -lh /tmp/packages
       - name: Copy files
         shell: bash
@@ -181,9 +147,14 @@ jobs:
           ls -lh /tmp
+          df -h
+
       - name: Run tests
         shell: bash
         run: |
+          dotnet nuget locals all --clear
+          df -h
+
           .github/scripts/test-dot-net.sh
       - uses: actions/upload-artifact@v4
--- a/dotnet-examples/offline-speaker-diarization/Program.cs 0 → 100644
查看文件 @a45e5db
+++ b/dotnet-examples/offline-speaker-diarization/Program.cs 0 → 100644
查看文件 @a45e5db
+// Copyright (c)  2024  Xiaomi Corporation
+//
+
+// This file shows how to use sherpa-onnx C# API for speaker diarization
+/*
+Usage:
+
+Step 1: Download a speaker segmentation model
+
+Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
+for a list of available models. The following is an example
+
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+Step 2: Download a speaker embedding extractor model
+
+Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
+for a list of available models. The following is an example
+
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+Step 3: Download test wave files
+
+Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
+for a list of available test wave files. The following is an example
+
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+Step 4: Run it
+
+  dotnet run
+*/
+
+using SherpaOnnx;
+using System;
+
+/// <summary>
+/// Console demo: runs offline speaker diarization on ./0-four-speakers-zh.wav
+/// and prints one line per detected segment.
+/// </summary>
+class OfflineSpeakerDiarizationDemo
+{
+  static void Main(string[] args)
+  {
+    var config = new OfflineSpeakerDiarizationConfig();
+    config.Segmentation.Pyannote.Model = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
+    config.Embedding.Model = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
+
+    // The test wave ./0-four-speakers-zh.wav has 4 speakers, so
+    // NumClusters is set to 4.
+    config.Clustering.NumClusters = 4;
+    // If you don't know the number of speakers in the test wave file, please
+    // use
+    // config.Clustering.Threshold = 0.5F; // You need to tune this threshold
+
+    // `using` releases the native diarizer deterministically on every exit
+    // path and keeps the object referenced until the end of Main, so it
+    // cannot be finalized while native code is still processing.
+    using var sd = new OfflineSpeakerDiarization(config);
+
+    var testWaveFile = "./0-four-speakers-zh.wav";
+    WaveReader waveReader = new WaveReader(testWaveFile);
+    if (sd.SampleRate != waveReader.SampleRate)
+    {
+      Console.WriteLine($"Expected sample rate: {sd.SampleRate}. Given: {waveReader.SampleRate}");
+      return;
+    }
+
+    Console.WriteLine("Started");
+
+    // var segments = sd.Process(waveReader.Samples); // this one is also ok
+
+    // Declared directly as the delegate type expected by ProcessWithCallback,
+    // so no intermediate Func<> needs to be wrapped.
+    OfflineSpeakerDiarizationProgressCallback callback = (int numProcessedChunks, int numTotalChunks, IntPtr arg) =>
+    {
+      float progress = 100.0F * numProcessedChunks / numTotalChunks;
+      Console.WriteLine("Progress {0}%", String.Format("{0:0.00}", progress));
+      return 0;
+    };
+
+    // No user data is needed by the callback, hence IntPtr.Zero.
+    var segments = sd.ProcessWithCallback(waveReader.Samples, callback, IntPtr.Zero);
+
+    foreach (var s in segments)
+    {
+      Console.WriteLine("{0} -- {1} speaker_{2}", String.Format("{0:0.00}", s.Start), String.Format("{0:0.00}", s.End), s.Speaker);
+    }
+  }
+}
--- a/dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj 0 → 100644
查看文件 @a45e5db
+++ b/dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj 0 → 100644
查看文件 @a45e5db
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <RootNamespace>offline_speaker_diarization</RootNamespace>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Common\Common.csproj" />
+  </ItemGroup>
+
+</Project>
--- a/dotnet-examples/offline-speaker-diarization/run.sh 0 → 100755
查看文件 @a45e5db
+++ b/dotnet-examples/offline-speaker-diarization/run.sh 0 → 100755
查看文件 @a45e5db
+#!/usr/bin/env bash
+
+
+if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+fi
+
+if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+fi
+
+if [ ! -f ./0-four-speakers-zh.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+fi
+
+dotnet run
--- a/dotnet-examples/sherpa-onnx.sln
查看文件 @a45e5db
+++ b/dotnet-examples/sherpa-onnx.sln
查看文件 @a45e5db
@@ -31,6 +31,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TTS", "TTS\TTS.csproj", "{DACE4A18-4FC8-4437-92BF-5A90BA81286C}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -93,6 +95,10 @@ Global
 		{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.Build.0 = Release|Any CPU
+		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/scripts/dotnet/FastClusteringConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/FastClusteringConfig.cs 0 → 100644
查看文件 @a45e5db
+/// Copyright (c)  2024  Xiaomi Corporation
+
+using System.Runtime.InteropServices;
+
+namespace SherpaOnnx
+{
+
+    /// <summary>
+    /// Clustering options used by speaker diarization.
+    /// </summary>
+    /// <remarks>
+    /// The struct uses sequential layout for interop, so the order of its
+    /// fields must not be changed.
+    /// NOTE(review): the matching native struct is not visible here; keep the
+    /// two in sync.
+    /// </remarks>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct FastClusteringConfig
+    {
+        /// <summary>
+        /// Initializes the config with NumClusters = -1 and Threshold = 0.5.
+        /// </summary>
+        public FastClusteringConfig()
+        {
+            NumClusters = -1;
+            Threshold = 0.5F;
+        }
+
+        /// Number of clusters (speakers). Defaults to -1.
+        /// NOTE(review): -1 presumably means "unknown, use Threshold instead" - confirm.
+        public int NumClusters;
+
+        /// Clustering threshold. Defaults to 0.5.
+        /// NOTE(review): presumably only used when NumClusters is not set - confirm.
+        public float Threshold;
+    }
+}
--- a/scripts/dotnet/OfflineSpeakerDiarization.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerDiarization.cs 0 → 100644
查看文件 @a45e5db
+/// Copyright (c)  2024  Xiaomi Corporation
+using System;
+using System.Runtime.InteropServices;
+using System.Text;
+
+namespace SherpaOnnx
+{
+    /// <summary>
+    /// Progress callback that can be passed to
+    /// OfflineSpeakerDiarization.ProcessWithCallback().
+    /// </summary>
+    /// <param name="numProcessedChunks">Number of chunks processed so far.</param>
+    /// <param name="numTotalChunks">Total number of chunks to process.</param>
+    /// <param name="arg">
+    /// The pointer given as `arg` to ProcessWithCallback().
+    /// NOTE(review): presumably an opaque `void*` user-data pointer on the
+    /// native side - confirm against the C header.
+    /// </param>
+    /// <returns>
+    /// An integer status. NOTE(review): how the native side interprets the
+    /// value is not visible here; returning 0 is known to work.
+    /// </returns>
+    // The P/Invoke that receives this delegate is declared with the Cdecl
+    // calling convention, so the delegate is marked Cdecl as well. Without
+    // this attribute the platform default is used for the reverse call, which
+    // differs from Cdecl on 32-bit Windows.
+    [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
+    public delegate int OfflineSpeakerDiarizationProgressCallback(int numProcessedChunks, int numTotalChunks, IntPtr arg);
+
+    /// <summary>
+    /// Managed wrapper around the native offline speaker diarization object.
+    /// </summary>
+    /// <remarks>
+    /// The wrapper owns a native handle. It is released by Dispose() or, as a
+    /// fallback, by the finalizer.
+    /// </remarks>
+    public class OfflineSpeakerDiarization : IDisposable
+    {
+        /// <summary>
+        /// Creates the native diarization object from <paramref name="config"/>.
+        /// </summary>
+        /// <param name="config">Models and clustering options to use.</param>
+        public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config)
+        {
+            IntPtr h = SherpaOnnxCreateOfflineSpeakerDiarization(ref config);
+            _handle = new HandleRef(this, h);
+        }
+
+        /// <summary>
+        /// Runs diarization on <paramref name="samples"/>.
+        /// </summary>
+        /// <param name="samples">Audio samples of the whole recording.</param>
+        /// <returns>
+        /// The detected segments in the order returned by the native
+        /// sort-by-start-time call; an empty array if the native call
+        /// returns a null result.
+        /// </returns>
+        /// <exception cref="ArgumentNullException">
+        /// <paramref name="samples"/> is null.
+        /// </exception>
+        public OfflineSpeakerDiarizationSegment[] Process(float[] samples)
+        {
+            if (samples == null)
+            {
+                throw new ArgumentNullException(nameof(samples));
+            }
+
+            IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcess(_handle.Handle, samples, samples.Length);
+
+            // Only the raw IntPtr is handed to native code, so without this
+            // the GC may finalize `this` (destroying the native object) while
+            // the native call is still running.
+            GC.KeepAlive(this);
+
+            return ProcessImpl(result);
+        }
+
+        /// <summary>
+        /// Same as Process(), but passes <paramref name="callback"/> and
+        /// <paramref name="arg"/> to the native call as well.
+        /// </summary>
+        /// <param name="samples">Audio samples of the whole recording.</param>
+        /// <param name="callback">Progress callback handed to native code.</param>
+        /// <param name="arg">Pointer handed to native code together with the callback.</param>
+        /// <returns>See Process().</returns>
+        /// <exception cref="ArgumentNullException">
+        /// <paramref name="samples"/> is null.
+        /// </exception>
+        public OfflineSpeakerDiarizationSegment[] ProcessWithCallback(float[] samples, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg)
+        {
+            if (samples == null)
+            {
+                throw new ArgumentNullException(nameof(samples));
+            }
+
+            IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(_handle.Handle, samples, samples.Length, callback, arg);
+
+            // See Process() for why this is required.
+            GC.KeepAlive(this);
+
+            return ProcessImpl(result);
+        }
+
+        // Copies the segments of a native result into managed objects and
+        // frees the native result.
+        private OfflineSpeakerDiarizationSegment[] ProcessImpl(IntPtr result)
+        {
+            if (result == IntPtr.Zero)
+            {
+                return new OfflineSpeakerDiarizationSegment[] {};
+            }
+
+            IntPtr p = IntPtr.Zero;
+            try
+            {
+                int numSegments = SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result);
+                p = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result);
+
+                OfflineSpeakerDiarizationSegment[] ans = new OfflineSpeakerDiarizationSegment[numSegments];
+                unsafe
+                {
+                    // Size in bytes of one native segment: two floats
+                    // (start, end) followed by one int (speaker).
+                    int size = sizeof(float) * 2 + sizeof(int);
+                    for (int i = 0; i != numSegments; ++i)
+                    {
+                        // Manual pointer arithmetic because IntPtr.Add()
+                        // is not available on net20.
+                        IntPtr t = new IntPtr((byte*)p + i * size);
+                        ans[i] = new OfflineSpeakerDiarizationSegment(t);
+                    }
+                }
+
+                return ans;
+            }
+            finally
+            {
+                // Free the native buffers even if marshalling throws.
+                if (p != IntPtr.Zero)
+                {
+                    SherpaOnnxOfflineSpeakerDiarizationDestroySegment(p);
+                }
+
+                SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result);
+            }
+        }
+
+        /// <summary>
+        /// Releases the native object. Calling it more than once is harmless.
+        /// </summary>
+        public void Dispose()
+        {
+            Cleanup();
+            // Prevent the object from being placed on the
+            // finalization queue
+            System.GC.SuppressFinalize(this);
+        }
+
+        ~OfflineSpeakerDiarization()
+        {
+            Cleanup();
+        }
+
+        private void Cleanup()
+        {
+            IntPtr h = _handle.Handle;
+            if (h == IntPtr.Zero)
+            {
+                // Already released (or never created); nothing to destroy.
+                return;
+            }
+
+            SherpaOnnxDestroyOfflineSpeakerDiarization(h);
+
+            // Don't permit the handle to be used again.
+            _handle = new HandleRef(this, IntPtr.Zero);
+        }
+
+        private HandleRef _handle;
+
+        /// <summary>
+        /// Sample rate reported by the native object. The bundled example
+        /// compares it against the sample rate of the input wave file.
+        /// </summary>
+        public int SampleRate
+        {
+            get
+            {
+                int sampleRate = SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(_handle.Handle);
+                GC.KeepAlive(this);
+                return sampleRate;
+            }
+        }
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxCreateOfflineSpeakerDiarization(ref OfflineSpeakerDiarizationConfig config);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxDestroyOfflineSpeakerDiarization(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern int SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationProcess(IntPtr handle, float[] samples, int n);
+
+        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
+        private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(IntPtr handle, float[] samples, int n, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(IntPtr handle);
+
+        [DllImport(Dll.Filename)]
+        private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(IntPtr handle);
+    }
+}
+
--- a/scripts/dotnet/OfflineSpeakerDiarizationConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerDiarizationConfig.cs 0 → 100644
查看文件 @a45e5db
+/// Copyright (c)  2024  Xiaomi Corporation
+
+using System.Runtime.InteropServices;
+
+namespace SherpaOnnx
+{
+
+    /// <summary>
+    /// Top-level configuration passed by reference to the native call that
+    /// creates an offline speaker diarization object.
+    /// </summary>
+    /// <remarks>
+    /// The struct uses sequential layout for interop, so the order of its
+    /// fields must not be changed.
+    /// NOTE(review): the matching native struct is not visible here; keep the
+    /// two in sync.
+    /// </remarks>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct OfflineSpeakerDiarizationConfig
+    {
+        /// <summary>
+        /// Initializes every nested config with its own defaults and sets
+        /// MinDurationOn = 0.3 and MinDurationOff = 0.5.
+        /// </summary>
+        public OfflineSpeakerDiarizationConfig()
+        {
+            Segmentation = new OfflineSpeakerSegmentationModelConfig();
+            Embedding = new SpeakerEmbeddingExtractorConfig();
+            Clustering = new FastClusteringConfig();
+
+            MinDurationOn = 0.3F;
+            MinDurationOff = 0.5F;
+        }
+
+        /// Speaker segmentation model settings.
+        public OfflineSpeakerSegmentationModelConfig Segmentation;
+
+        /// Speaker embedding extractor settings.
+        public SpeakerEmbeddingExtractorConfig Embedding;
+
+        /// Clustering settings.
+        public FastClusteringConfig Clustering;
+
+        /// Defaults to 0.3.
+        /// NOTE(review): presumably a minimum duration in seconds - confirm
+        /// against the native documentation.
+        public float MinDurationOn;
+
+        /// Defaults to 0.5.
+        /// NOTE(review): presumably a minimum duration in seconds - confirm
+        /// against the native documentation.
+        public float MinDurationOff;
+    }
+}
+
+
+
--- a/scripts/dotnet/OfflineSpeakerDiarizationSegment.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerDiarizationSegment.cs 0 → 100644
查看文件 @a45e5db
+/// Copyright (c)  2024  Xiaomi Corporation
+using System;
+using System.Runtime.InteropServices;
+using System.Text;
+
+namespace SherpaOnnx
+{
+
+    /// <summary>
+    /// One diarization segment: a start value, an end value and a speaker
+    /// index, copied out of native memory.
+    /// </summary>
+    public class OfflineSpeakerDiarizationSegment
+    {
+        /// Start of the segment. NOTE(review): presumably in seconds - confirm.
+        public float Start;
+
+        /// End of the segment. NOTE(review): presumably in seconds - confirm.
+        public float End;
+
+        /// Speaker index of the segment.
+        public int Speaker;
+
+        // Mirror of the native segment layout; only used for marshalling.
+        [StructLayout(LayoutKind.Sequential)]
+        struct Impl
+        {
+            public float Start;
+            public float End;
+            public int Speaker;
+        }
+
+        /// <summary>
+        /// Copies one segment from the native memory that
+        /// <paramref name="handle"/> points to.
+        /// </summary>
+        /// <param name="handle">Address of a native segment.</param>
+        public OfflineSpeakerDiarizationSegment(IntPtr handle)
+        {
+            object boxed = Marshal.PtrToStructure(handle, typeof(Impl));
+            Impl native = (Impl)boxed;
+
+            Speaker = native.Speaker;
+            End = native.End;
+            Start = native.Start;
+        }
+    }
+}
+
--- a/scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs 0 → 100644
查看文件 @a45e5db
+/// Copyright (c)  2024  Xiaomi Corporation
+
+using System.Runtime.InteropServices;
+
+namespace SherpaOnnx
+{
+
+    /// <summary>
+    /// Settings for the speaker segmentation model.
+    /// </summary>
+    /// <remarks>
+    /// The struct uses sequential layout for interop, so the order of its
+    /// fields must not be changed.
+    /// NOTE(review): the matching native struct is not visible here; keep the
+    /// two in sync.
+    /// </remarks>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct OfflineSpeakerSegmentationModelConfig
+    {
+        /// <summary>
+        /// Initializes the config with a default Pyannote config,
+        /// NumThreads = 1, Debug = 0 and Provider = "cpu".
+        /// </summary>
+        public OfflineSpeakerSegmentationModelConfig()
+        {
+            Pyannote = new OfflineSpeakerSegmentationPyannoteModelConfig();
+            NumThreads = 1;
+            Debug = 0;
+            Provider = "cpu";
+        }
+
+        /// Settings of the pyannote segmentation model.
+        public OfflineSpeakerSegmentationPyannoteModelConfig Pyannote;
+
+        /// Number of threads used to run the neural network model
+        public int NumThreads;
+
+        /// Integer flag controlling whether debug information of the model
+        /// is printed. Defaults to 0.
+        /// NOTE(review): presumably any non-zero value enables it - confirm.
+        public int Debug;
+
+        /// Execution provider name, marshalled as an ANSI string.
+        /// Defaults to "cpu".
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string Provider;
+    }
+}
+
+
--- a/scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs 0 → 100644
查看文件 @a45e5db
+++ b/scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs 0 → 100644
查看文件 @a45e5db
+/// Copyright (c)  2024  Xiaomi Corporation
+
+using System.Runtime.InteropServices;
+
+namespace SherpaOnnx
+{
+
+    /// <summary>
+    /// Settings for a pyannote speaker segmentation model.
+    /// </summary>
+    /// <remarks>
+    /// The struct uses sequential layout for interop.
+    /// NOTE(review): the matching native struct is not visible here; keep the
+    /// two in sync.
+    /// </remarks>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct OfflineSpeakerSegmentationPyannoteModelConfig
+    {
+        /// <summary>
+        /// Initializes the config with an empty model path.
+        /// </summary>
+        public OfflineSpeakerSegmentationPyannoteModelConfig()
+        {
+            Model = "";
+        }
+
+        /// Path to the model file, marshalled as an ANSI string. The bundled
+        /// example sets it to the model.onnx file of a downloaded
+        /// segmentation model. Defaults to "".
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string Model;
+    }
+}
+