Fangjun Kuang
Committed by GitHub

C# API for speaker diarization (#1407)

... ... @@ -2,7 +2,13 @@
cd dotnet-examples/
cd ./offline-decode-files
cd ./offline-speaker-diarization
./run.sh
rm -rfv *.onnx
rm -fv *.wav
rm -rfv sherpa-onnx-pyannote-*
cd ../offline-decode-files
./run-sense-voice-ctc.sh
rm -rf sherpa-onnx-*
... ...
... ... @@ -47,53 +47,10 @@ jobs:
with:
fetch-depth: 0
- name: Free space
if: matrix.os == 'ubuntu-latest'
shell: bash
run: |
df -h
rm -rf /opt/hostedtoolcache
df -h
- name: Free more space
if: matrix.os == 'ubuntu-latest'
shell: bash
run: |
# https://github.com/orgs/community/discussions/25678
cd /opt
find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'
sudo rm -rf /usr/share/dotnet
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Free Disk Space (Ubuntu)
if: matrix.os == 'ubuntu-latest'
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: false
haskell: true
large-packages: true
docker-images: false
swap-storage: true
- name: Check space
if: matrix.os == 'ubuntu-latest'
shell: bash
run: |
df -h
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ${{ matrix.os }}-release-shared
key: ${{ matrix.os }}-dotnet-release-shared
- name: Build sherpa-onnx
shell: bash
... ... @@ -110,11 +67,16 @@ jobs:
-DCMAKE_BUILD_TYPE=Release \
-DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
-DBUILD_ESPEAK_NG_EXE=OFF \
-DSHERPA_ONNX_ENABLE_BINARY=ON \
-DSHERPA_ONNX_ENABLE_BINARY=OFF \
..
cmake --build . --target install --config Release
rm -rf install/share
rm -rf install/lib/pkg*
ls -lh ./install/lib
- uses: actions/upload-artifact@v4
with:
name: ${{ matrix.os }}
... ... @@ -148,7 +110,7 @@ jobs:
uses: actions/download-artifact@v4
with:
name: ubuntu-latest
path: /tmp/linux
path: /tmp/linux-x64
- name: Setup .NET
uses: actions/setup-dotnet@v4
... ... @@ -162,17 +124,21 @@ jobs:
- name: Display files
shell: bash
run: |
echo "----------/tmp/----------"
ls -lh /tmp/
echo "----------/tmp----------"
ls -lh /tmp
echo "----------/tmp/linux----------"
ls -lh /tmp/linux
echo "----------/tmp/linux-x64----------"
ls -lh /tmp/linux-x64
df -h
- name: Build
shell: bash
run: |
cd scripts/dotnet
./run.sh
df -h
ls -lh /tmp/packages
- name: Copy files
shell: bash
... ... @@ -181,9 +147,14 @@ jobs:
ls -lh /tmp
df -h
- name: Run tests
shell: bash
run: |
dotnet nuget locals all --clear
df -h
.github/scripts/test-dot-net.sh
- uses: actions/upload-artifact@v4
... ...
// Copyright (c) 2024 Xiaomi Corporation
//
// This file shows how to use sherpa-onnx C# API for speaker diarization
/*
Usage:
Step 1: Download a speaker segmentation model
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available models. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
Step 2: Download a speaker embedding extractor model
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
for a list of available models. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
Step 3. Download test wave files
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available test wave files. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
Step 4. Run it
dotnet run
*/
using SherpaOnnx;
using System;
/// <summary>
/// Console demo: runs offline speaker diarization on a test wave file and
/// prints one line per returned segment (start, end, speaker index).
/// </summary>
class OfflineSpeakerDiarizationDemo
{
    /// <summary>
    /// Entry point. Expects the model files and the test wave file named below
    /// to exist in the current working directory.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var config = new OfflineSpeakerDiarizationConfig();
        config.Segmentation.Pyannote.Model = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
        config.Embedding.Model = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";

        // The test wave ./0-four-speakers-zh.wav has 4 speakers, so
        // we set the number of clusters to 4.
        config.Clustering.NumClusters = 4;

        // If you don't know the number of speakers in the test wave file, please
        // use
        // config.Clustering.Threshold = 0.5; // You need to tune this threshold

        // OfflineSpeakerDiarization implements IDisposable (it owns a native
        // handle), so release it deterministically on every exit path instead
        // of waiting for the finalizer.
        using var sd = new OfflineSpeakerDiarization(config);

        var testWaveFile = "./0-four-speakers-zh.wav";
        WaveReader waveReader = new WaveReader(testWaveFile);

        // The wave file must match the sample rate reported by the diarizer.
        if (sd.SampleRate != waveReader.SampleRate)
        {
            Console.WriteLine($"Expected sample rate: {sd.SampleRate}. Given: {waveReader.SampleRate}");
            return;
        }

        Console.WriteLine("Started");

        // var segments = sd.Process(waveReader.Samples); // this one is also ok

        // Reports progress as a percentage each time the callback is invoked.
        OfflineSpeakerDiarizationProgressCallback callback = (numProcessedChunks, numTotalChunks, arg) =>
        {
            float progress = 100.0F * numProcessedChunks / numTotalChunks;
            Console.WriteLine($"Progress {progress:0.00}%");
            return 0;
        };

        var segments = sd.ProcessWithCallback(waveReader.Samples, callback, IntPtr.Zero);

        foreach (var s in segments)
        {
            Console.WriteLine($"{s.Start:0.00} -- {s.End:0.00} speaker_{s.Speaker}");
        }
    }
}
... ...
<!-- Project file for the offline speaker diarization console example. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<!-- Build a console executable targeting .NET 6. -->
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<RootNamespace>offline_speaker_diarization</RootNamespace>
<!-- Implicit global usings and nullable reference type analysis are both enabled. -->
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<!-- Shared project in the sibling Common directory; NOTE(review): presumably supplies the SherpaOnnx bindings used by the example - confirm. -->
<ProjectReference Include="..\Common\Common.csproj" />
</ItemGroup>
</Project>
... ...
#!/usr/bin/env bash
#
# Downloads the segmentation model, the speaker embedding model and the test
# wave file (each only if it is not already present in the current directory)
# and then runs the example with `dotnet run`.

# Stop at the first failing command so a failed download or extraction does
# not fall through to `dotnet run` with missing files.
set -e

if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  # -f makes curl exit non-zero on an HTTP error instead of saving the
  # server's error page under the archive's name.
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

dotnet run
... ...
... ... @@ -31,6 +31,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TTS", "TTS\TTS.csproj", "{DACE4A18-4FC8-4437-92BF-5A90BA81286C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
... ... @@ -93,6 +95,10 @@ Global
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.Build.0 = Release|Any CPU
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
... ...
/// Copyright (c) 2024 Xiaomi Corporation
using System.Runtime.InteropServices;
namespace SherpaOnnx
{
    /// <summary>
    /// Clustering settings used by the speaker diarization configuration.
    /// The layout is sequential and the field order is significant:
    /// <see cref="NumClusters"/> first, then <see cref="Threshold"/>.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct FastClusteringConfig
    {
        /// <summary>
        /// Number of clusters. Initialized to -1 by the constructor.
        /// NOTE(review): -1 presumably means "not specified, use
        /// <see cref="Threshold"/> instead" - confirm against the native API.
        /// </summary>
        public int NumClusters;

        /// <summary>
        /// Clustering threshold. Initialized to 0.5 by the constructor.
        /// </summary>
        public float Threshold;

        /// <summary>
        /// Creates a configuration populated with the default values.
        /// </summary>
        public FastClusteringConfig()
        {
            Threshold = 0.5F;
            NumClusters = -1;
        }
    }
}
... ...
/// Copyright (c) 2024 Xiaomi Corporation
using System;
using System.Runtime.InteropServices;
using System.Text;
namespace SherpaOnnx
{
    /// <summary>
    /// Progress callback handed to the native library by
    /// <see cref="OfflineSpeakerDiarization.ProcessWithCallback"/>.
    /// </summary>
    /// <param name="numProcessedChunks">Number of chunks processed so far.</param>
    /// <param name="numTotalChunks">Total number of chunks to process.</param>
    /// <param name="arg">
    /// Opaque pointer; presumably the same value that was passed as <c>arg</c>
    /// to <c>ProcessWithCallback</c> - confirm against the native API.
    /// </param>
    /// <returns>
    /// An integer handed back to the native caller.
    /// NOTE(review): its meaning is defined by the native library and is not
    /// visible from this file.
    /// </returns>
    // The native side invokes this through a C function pointer, so the
    // calling convention is pinned to Cdecl to match the P/Invoke declaration.
    [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
    public delegate int OfflineSpeakerDiarizationProgressCallback(int numProcessedChunks, int numTotalChunks, IntPtr arg);

    /// <summary>
    /// Managed wrapper around a native offline speaker diarization object.
    /// The native object is created in the constructor and destroyed by
    /// <see cref="Dispose"/> or, failing that, by the finalizer.
    /// </summary>
    public class OfflineSpeakerDiarization : IDisposable
    {
        /// <summary>
        /// Creates the native diarization object from <paramref name="config"/>.
        /// </summary>
        /// <param name="config">Configuration passed by reference to the native constructor.</param>
        public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config)
        {
            IntPtr h = SherpaOnnxCreateOfflineSpeakerDiarization(ref config);
            _handle = new HandleRef(this, h);
        }

        /// <summary>
        /// Runs diarization over <paramref name="samples"/>.
        /// </summary>
        /// <param name="samples">Audio samples to process.</param>
        /// <returns>
        /// The segments sorted by start time, or an empty array when the native
        /// call returns a null result.
        /// </returns>
        public OfflineSpeakerDiarizationSegment[] Process(float[] samples)
        {
            IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcess(_handle.Handle, samples, samples.Length);

            // Only the raw IntPtr is handed to native code, so keep this
            // wrapper (and therefore its finalizer) from running mid-call.
            GC.KeepAlive(this);

            return ProcessImpl(result);
        }

        /// <summary>
        /// Runs diarization over <paramref name="samples"/>, passing
        /// <paramref name="callback"/> and <paramref name="arg"/> to the
        /// native library.
        /// </summary>
        /// <param name="samples">Audio samples to process.</param>
        /// <param name="callback">Progress callback given to the native library.</param>
        /// <param name="arg">Opaque pointer given to the native library alongside the callback.</param>
        /// <returns>
        /// The segments sorted by start time, or an empty array when the native
        /// call returns a null result.
        /// </returns>
        public OfflineSpeakerDiarizationSegment[] ProcessWithCallback(float[] samples, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg)
        {
            IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(_handle.Handle, samples, samples.Length, callback, arg);

            // See Process(): keep the wrapper alive for the whole native call.
            GC.KeepAlive(this);

            return ProcessImpl(result);
        }

        /// <summary>
        /// Copies the segments of a native result into managed objects and
        /// releases the native result.
        /// </summary>
        /// <param name="result">Native result pointer; may be <see cref="IntPtr.Zero"/>.</param>
        /// <returns>The managed segments, sorted by start time.</returns>
        private static OfflineSpeakerDiarizationSegment[] ProcessImpl(IntPtr result)
        {
            if (result == IntPtr.Zero)
            {
                return new OfflineSpeakerDiarizationSegment[] {};
            }

            // try/finally so the native allocations are released even if
            // marshalling a segment throws.
            try
            {
                int numSegments = SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result);
                IntPtr p = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result);
                try
                {
                    OfflineSpeakerDiarizationSegment[] ans = new OfflineSpeakerDiarizationSegment[numSegments];

                    // Size in bytes of one native segment: two floats followed by one int.
                    int size = sizeof(float) * 2 + sizeof(int);

                    unsafe
                    {
                        for (int i = 0; i != numSegments; ++i)
                        {
                            // Raw pointer arithmetic is used because IntPtr.Add()
                            // is not available on net20. The offset is computed
                            // in 64 bits so it cannot overflow int.
                            IntPtr t = new IntPtr((byte*)p + (long)i * size);
                            ans[i] = new OfflineSpeakerDiarizationSegment(t);
                        }
                    }

                    return ans;
                }
                finally
                {
                    SherpaOnnxOfflineSpeakerDiarizationDestroySegment(p);
                }
            }
            finally
            {
                SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result);
            }
        }

        /// <summary>
        /// Destroys the native object. Calling it more than once is a no-op.
        /// </summary>
        public void Dispose()
        {
            Cleanup();

            // Prevent the object from being placed on the
            // finalization queue
            System.GC.SuppressFinalize(this);
        }

        ~OfflineSpeakerDiarization()
        {
            Cleanup();
        }

        /// <summary>
        /// Destroys the native object if one is still held.
        /// </summary>
        private void Cleanup()
        {
            // Nothing to release: either already cleaned up, or the constructor
            // never obtained a handle. Skipping the native call here also keeps
            // the finalizer from P/Invoking again after a failed construction.
            if (_handle.Handle == IntPtr.Zero)
            {
                return;
            }

            SherpaOnnxDestroyOfflineSpeakerDiarization(_handle.Handle);

            // Don't permit the handle to be used again.
            _handle = new HandleRef(this, IntPtr.Zero);
        }

        private HandleRef _handle;

        /// <summary>
        /// Sample rate reported by the native diarization object.
        /// </summary>
        public int SampleRate
        {
            get
            {
                int sampleRate = SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(_handle.Handle);
                GC.KeepAlive(this);
                return sampleRate;
            }
        }

        // All entry points are plain C functions, hence Cdecl throughout.
        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern IntPtr SherpaOnnxCreateOfflineSpeakerDiarization(ref OfflineSpeakerDiarizationConfig config);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern void SherpaOnnxDestroyOfflineSpeakerDiarization(IntPtr handle);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern int SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(IntPtr handle);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern int SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(IntPtr handle);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationProcess(IntPtr handle, float[] samples, int n);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(IntPtr handle, float[] samples, int n, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(IntPtr handle);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(IntPtr handle);

        [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
        private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(IntPtr handle);
    }
}
... ...
/// Copyright (c) 2024 Xiaomi Corporation
using System.Runtime.InteropServices;
namespace SherpaOnnx
{
    /// <summary>
    /// Top-level configuration for offline speaker diarization. The layout is
    /// sequential, so the declaration order of the fields below is significant.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct OfflineSpeakerDiarizationConfig
    {
        /// <summary>Speaker segmentation model settings.</summary>
        public OfflineSpeakerSegmentationModelConfig Segmentation;

        /// <summary>Speaker embedding extractor settings.</summary>
        public SpeakerEmbeddingExtractorConfig Embedding;

        /// <summary>Clustering settings.</summary>
        public FastClusteringConfig Clustering;

        /// <summary>
        /// Initialized to 0.3 by the constructor.
        /// NOTE(review): presumably a duration in seconds - confirm against the native API.
        /// </summary>
        public float MinDurationOn;

        /// <summary>
        /// Initialized to 0.5 by the constructor.
        /// NOTE(review): presumably a duration in seconds - confirm against the native API.
        /// </summary>
        public float MinDurationOff;

        /// <summary>
        /// Creates a configuration in which every nested config is
        /// default-constructed and the duration fields hold their defaults.
        /// </summary>
        public OfflineSpeakerDiarizationConfig()
        {
            MinDurationOn = 0.3F;
            MinDurationOff = 0.5F;

            Segmentation = new OfflineSpeakerSegmentationModelConfig();
            Embedding = new SpeakerEmbeddingExtractorConfig();
            Clustering = new FastClusteringConfig();
        }
    }
}
... ...
/// Copyright (c) 2024 Xiaomi Corporation
using System;
using System.Runtime.InteropServices;
using System.Text;
namespace SherpaOnnx
{
    /// <summary>
    /// One diarization segment: a start value, an end value and a speaker
    /// index, copied out of native memory at construction time.
    /// </summary>
    public class OfflineSpeakerDiarizationSegment
    {
        /// <summary>Start of the segment, as read from the native struct.</summary>
        public float Start;

        /// <summary>End of the segment, as read from the native struct.</summary>
        public float End;

        /// <summary>Speaker index of the segment, as read from the native struct.</summary>
        public int Speaker;

        /// <summary>
        /// Reads one native segment located at <paramref name="handle"/> and
        /// copies its fields into this object.
        /// </summary>
        /// <param name="handle">Address of a native segment (two floats followed by an int).</param>
        public OfflineSpeakerDiarizationSegment(IntPtr handle)
        {
            object boxed = Marshal.PtrToStructure(handle, typeof(NativeSegment));
            NativeSegment native = (NativeSegment)boxed;

            Start = native.Start;
            End = native.End;
            Speaker = native.Speaker;
        }

        /// <summary>
        /// Mirror of the native segment layout used only for marshalling.
        /// </summary>
        [StructLayout(LayoutKind.Sequential)]
        private struct NativeSegment
        {
            public float Start;
            public float End;
            public int Speaker;
        }
    }
}
... ...
/// Copyright (c) 2024 Xiaomi Corporation
using System.Runtime.InteropServices;
namespace SherpaOnnx
{
    /// <summary>
    /// Settings for the speaker segmentation model. The layout is sequential,
    /// so the declaration order of the fields below is significant.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct OfflineSpeakerSegmentationModelConfig
    {
        /// <summary>Settings for the Pyannote segmentation model.</summary>
        public OfflineSpeakerSegmentationPyannoteModelConfig Pyannote;

        /// <summary>
        /// Number of threads used to run the neural network model. Defaults to 1.
        /// </summary>
        public int NumThreads;

        /// <summary>
        /// Integer flag controlling whether debug information of the model is
        /// printed. Defaults to 0.
        /// </summary>
        public int Debug;

        /// <summary>
        /// Provider name, marshalled as an ANSI C string. Defaults to "cpu".
        /// </summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string Provider;

        /// <summary>
        /// Creates a configuration populated with the default values.
        /// </summary>
        public OfflineSpeakerSegmentationModelConfig()
        {
            Provider = "cpu";
            Debug = 0;
            NumThreads = 1;
            Pyannote = new OfflineSpeakerSegmentationPyannoteModelConfig();
        }
    }
}
... ...
/// Copyright (c) 2024 Xiaomi Corporation
using System.Runtime.InteropServices;
namespace SherpaOnnx
{
    /// <summary>
    /// Settings for the Pyannote speaker segmentation model.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct OfflineSpeakerSegmentationPyannoteModelConfig
    {
        /// <summary>
        /// Model path, marshalled as an ANSI C string. Defaults to the empty
        /// string.
        /// </summary>
        [MarshalAs(UnmanagedType.LPStr)]
        public string Model;

        /// <summary>
        /// Creates a configuration with an empty model path.
        /// </summary>
        public OfflineSpeakerSegmentationPyannoteModelConfig()
        {
            Model = "";
        }
    }
}
... ...