Fangjun Kuang
Committed by GitHub

Add Speaker ID demo for C# (#862)

... ... @@ -2,7 +2,10 @@
cd dotnet-examples/
cd streaming-hlg-decoding/
cd speaker-identification
./run.sh
cd ../streaming-hlg-decoding/
./run.sh
cd ../spoken-language-identification
... ...
... ... @@ -179,6 +179,7 @@ jobs:
cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/
cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/
cp -v scripts/dotnet/examples/streaming-hlg-decoding.csproj dotnet-examples/streaming-hlg-decoding
cp -v scripts/dotnet/examples/speaker-identification.csproj dotnet-examples/speaker-identification
ls -lh /tmp
... ...
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)
set(SHERPA_ONNX_VERSION "1.9.23")
set(SHERPA_ONNX_VERSION "1.9.24")
# Disable warning about
#
... ...
... ... @@ -17,6 +17,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identificat
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "streaming-hlg-decoding", "streaming-hlg-decoding\streaming-hlg-decoding.csproj", "{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speaker-identification", "speaker-identification\speaker-identification.csproj", "{2B1B140E-A92F-426B-B0DF-5D916B67304F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
... ... @@ -54,5 +56,9 @@ Global
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C4A368A5-FCA0-419D-97C9-C8CE0B08EB99}.Release|Any CPU.Build.0 = Release|Any CPU
{2B1B140E-A92F-426B-B0DF-5D916B67304F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2B1B140E-A92F-426B-B0DF-5D916B67304F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2B1B140E-A92F-426B-B0DF-5D916B67304F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2B1B140E-A92F-426B-B0DF-5D916B67304F}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
EndGlobal
... ...
// Copyright (c) 2024 Xiaomi Corporation
//
// This file shows how to do speaker identification with sherpa-onnx.
//
// 1. Download a model from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
//
// 2. Download test data from
//
// git clone https://github.com/csukuangfj/sr-data
//
// 3. Now run it
//
// dotnet run
using SherpaOnnx;
using System.Collections.Generic;
using System;
/// <summary>
/// Demo that enrolls two speakers, lists them, searches and verifies test
/// recordings against them, and finally removes one speaker again.
/// </summary>
/// <remarks>
/// Each check that fails prints a message and sets a non-zero process exit
/// code, so a wrapper script running with <c>set -e</c> notices the failure.
/// </remarks>
class SpeakerIdentificationDemo
{
    /// <summary>
    /// Computes the speaker embedding of a single wave file.
    /// </summary>
    /// <param name="extractor">Extractor used to create the stream and compute the embedding.</param>
    /// <param name="filename">Path of the wave file to read.</param>
    /// <returns>The embedding returned by <c>extractor.Compute</c>.</returns>
    public static float[] ComputeEmbedding(SpeakerEmbeddingExtractor extractor, String filename)
    {
        WaveReader reader = new WaveReader(filename);

        // Feed the whole file at once, then signal that no more audio follows.
        OnlineStream stream = extractor.CreateStream();
        stream.AcceptWaveform(reader.SampleRate, reader.Samples);
        stream.InputFinished();

        float[] embedding = extractor.Compute(stream);
        return embedding;
    }

    // Prints a failed-check message and marks the process as failed.
    // Main returns void, so the exit code is set through Environment.ExitCode.
    private static void Fail(string message)
    {
        Console.WriteLine(message);
        Environment.ExitCode = 1;
    }

    static void Main(string[] args)
    {
        var config = new SpeakerEmbeddingExtractorConfig();
        config.Model = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
        config.Debug = 1;

        // Both wrappers implement IDisposable; release their native handles
        // deterministically on every exit path, including the early returns.
        using var extractor = new SpeakerEmbeddingExtractor(config);
        using var manager = new SpeakerEmbeddingManager(extractor.Dim);

        string[] spk1Files =
            new string[] {
                "./sr-data/enroll/fangjun-sr-1.wav",
                "./sr-data/enroll/fangjun-sr-2.wav",
                "./sr-data/enroll/fangjun-sr-3.wav",
            };
        float[][] spk1Vec = new float[spk1Files.Length][];
        for (int i = 0; i < spk1Files.Length; ++i)
        {
            spk1Vec[i] = ComputeEmbedding(extractor, spk1Files[i]);
        }

        string[] spk2Files =
            new string[] {
                "./sr-data/enroll/leijun-sr-1.wav", "./sr-data/enroll/leijun-sr-2.wav",
            };
        float[][] spk2Vec = new float[spk2Files.Length][];
        for (int i = 0; i < spk2Files.Length; ++i)
        {
            spk2Vec[i] = ComputeEmbedding(extractor, spk2Files[i]);
        }

        // Enroll both speakers, each with several embeddings.
        if (!manager.Add("fangjun", spk1Vec))
        {
            Fail("Failed to register fangjun");
            return;
        }

        if (!manager.Add("leijun", spk2Vec))
        {
            Fail("Failed to register leijun");
            return;
        }

        if (manager.NumSpeakers != 2)
        {
            Fail("There should be two speakers");
            return;
        }

        if (!manager.Contains("fangjun"))
        {
            Fail("It should contain the speaker fangjun");
            return;
        }

        if (!manager.Contains("leijun"))
        {
            Fail("It should contain the speaker leijun");
            return;
        }

        Console.WriteLine("---All speakers---");
        string[] allSpeakers = manager.GetAllSpeakers();
        foreach (var s in allSpeakers)
        {
            Console.WriteLine(s);
        }
        Console.WriteLine("------------");

        string[] testFiles =
            new string[] {
                "./sr-data/test/fangjun-test-sr-1.wav",
                "./sr-data/test/leijun-test-sr-1.wav",
                "./sr-data/test/liudehua-test-sr-1.wav"
            };

        float threshold = 0.6f;
        foreach (var file in testFiles)
        {
            float[] embedding = ComputeEmbedding(extractor, file);
            String name = manager.Search(embedding, threshold);
            // Search returns an empty string when no enrolled speaker matches.
            if (name == "")
            {
                name = "<Unknown>";
            }
            Console.WriteLine("{0}: {1}", file, name);
        }

        // test verify
        // The same recording is verified before and after the removal, so its
        // embedding is computed only once.
        float[] probe = ComputeEmbedding(extractor, testFiles[0]);

        if (!manager.Verify("fangjun", probe, threshold))
        {
            Fail("testFiles[0] should match fangjun!");
            return;
        }

        if (!manager.Remove("fangjun"))
        {
            Fail("Failed to remove fangjun");
            return;
        }

        if (manager.Verify("fangjun", probe, threshold))
        {
            Fail(string.Format("{0} should match no one!", testFiles[0]));
            return;
        }

        if (manager.NumSpeakers != 1)
        {
            Fail("There should only 1 speaker left.");
            return;
        }
    }
}
... ...
../offline-decode-files/WaveReader.cs
\ No newline at end of file
... ...
#!/usr/bin/env bash
#
# Fetches the speaker embedding model and the test recordings if they are not
# already present in the current directory, then runs the demo.

set -ex

if [ ! -e ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  # -f makes curl exit non-zero on an HTTP error instead of saving the server's
  # error page under the model's filename; such a file would also satisfy the
  # existence check above and suppress the download on every later run.
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -d ./sr-data ]; then
  git clone https://github.com/csukuangfj/sr-data
fi

dotnet run
... ...
<Project Sdk="Microsoft.NET.Sdk">
<!-- Project file for the speaker-identification example: a net6.0 executable. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<RootNamespace>speaker_identification</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<!-- Version="*" is a floating version: restore picks the highest stable package version available. -->
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
</ItemGroup>
</Project>
... ...
<Project Sdk="Microsoft.NET.Sdk">
<!-- Variant of the speaker-identification project file that also restores from a local package folder. -->
<!-- NOTE(review): presumably the copy used by CI to test a freshly built package; confirm against the workflow. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<RootNamespace>speaker_identification</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<PropertyGroup>
<!-- Adds /tmp/packages as a package source next to any inherited sources and nuget.org. -->
<RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
</PropertyGroup>
<ItemGroup>
<!-- Version="*" is a floating version: restore picks the highest stable package version available. -->
<PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
</ItemGroup>
</Project>
... ...
... ... @@ -222,6 +222,14 @@ namespace SherpaOnnx
}
}
/// <summary>
/// Speaker count reported by the native <c>SherpaOnnxOfflineTtsNumSpeakers</c>
/// call for this object's handle.
/// </summary>
public int NumSpeakers => SherpaOnnxOfflineTtsNumSpeakers(_handle.Handle);
[DllImport(Dll.Filename)]
private static extern IntPtr SherpaOnnxCreateOfflineTts(ref OfflineTtsConfig config);
... ... @@ -232,6 +240,9 @@ namespace SherpaOnnx
private static extern int SherpaOnnxOfflineTtsSampleRate(IntPtr handle);
[DllImport(Dll.Filename)]
private static extern int SherpaOnnxOfflineTtsNumSpeakers(IntPtr handle);
[DllImport(Dll.Filename)]
private static extern IntPtr SherpaOnnxOfflineTtsGenerate(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string text, int sid, float speed);
[DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
... ... @@ -557,6 +568,112 @@ namespace SherpaOnnx
}
/// <summary>
/// Settings handed to the native library when a speaker embedding extractor
/// is created.
/// </summary>
/// <remarks>
/// The struct uses sequential layout, so the fields are marshalled in
/// declaration order. Do not reorder, insert or remove fields without
/// updating the native side.
/// NOTE(review): presumably mirrors a config struct in the C API; confirm
/// against the C header.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct SpeakerEmbeddingExtractorConfig
{
/// <summary>Creates a config with an empty model path, 1 thread, debug off (0) and the "cpu" provider.</summary>
public SpeakerEmbeddingExtractorConfig()
{
Model = "";
NumThreads = 1;
Debug = 0;
Provider = "cpu";
}
/// <summary>Path of the model file; marshalled to native code as a string pointer.</summary>
[MarshalAs(UnmanagedType.LPStr)]
public string Model;
/// <summary>Thread count passed to the native library (default 1).</summary>
public int NumThreads;
/// <summary>Debug flag passed to the native library (default 0). NOTE(review): presumably non-zero enables debug output; confirm.</summary>
public int Debug;
/// <summary>Execution provider name (default "cpu"); marshalled to native code as a string pointer.</summary>
[MarshalAs(UnmanagedType.LPStr)]
public string Provider;
}
/// <summary>
/// Managed wrapper around a native speaker embedding extractor handle.
/// </summary>
/// <remarks>
/// The native handle is released by <see cref="Dispose"/> or, failing that, by
/// the finalizer. Native calls receive the raw <c>IntPtr</c>, which does not keep
/// this object reachable, so every method calls <c>GC.KeepAlive(this)</c> after
/// its last native call to stop the finalizer from destroying the handle while
/// the call is still running.
/// </remarks>
public class SpeakerEmbeddingExtractor : IDisposable
{
    /// <summary>Creates the native extractor from <paramref name="config"/>.</summary>
    /// <param name="config">Extractor settings, passed to native code by reference.</param>
    public SpeakerEmbeddingExtractor(SpeakerEmbeddingExtractorConfig config)
    {
        IntPtr h = SherpaOnnxCreateSpeakerEmbeddingExtractor(ref config);
        _handle = new HandleRef(this, h);
    }

    /// <summary>Creates a new stream for feeding audio to this extractor.</summary>
    /// <returns>An <c>OnlineStream</c> wrapping the native stream pointer.</returns>
    public OnlineStream CreateStream()
    {
        IntPtr p = SherpaOnnxSpeakerEmbeddingExtractorCreateStream(_handle.Handle);
        GC.KeepAlive(this);
        return new OnlineStream(p);
    }

    /// <summary>Asks the native library whether <paramref name="stream"/> is ready.</summary>
    /// <returns><c>true</c> when the native call returns a non-zero value.</returns>
    public bool IsReady(OnlineStream stream)
    {
        bool ready = SherpaOnnxSpeakerEmbeddingExtractorIsReady(_handle.Handle, stream.Handle) != 0;
        GC.KeepAlive(stream);
        GC.KeepAlive(this);
        return ready;
    }

    /// <summary>Computes the embedding for the audio fed to <paramref name="stream"/>.</summary>
    /// <returns>A managed copy of the native embedding with <see cref="Dim"/> elements.</returns>
    public float[] Compute(OnlineStream stream)
    {
        IntPtr p = SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(_handle.Handle, stream.Handle);
        try
        {
            int dim = Dim;
            float[] ans = new float[dim];
            // Throws ArgumentNullException when the native call returned a null pointer.
            Marshal.Copy(p, ans, 0, dim);
            return ans;
        }
        finally
        {
            // Free the native buffer even when the copy above throws.
            if (p != IntPtr.Zero)
            {
                SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(p);
            }
            GC.KeepAlive(stream);
            GC.KeepAlive(this);
        }
    }

    /// <summary>Embedding dimension reported by the native extractor.</summary>
    public int Dim
    {
        get
        {
            int dim = SherpaOnnxSpeakerEmbeddingExtractorDim(_handle.Handle);
            GC.KeepAlive(this);
            return dim;
        }
    }

    /// <summary>Releases the native extractor. Calling it more than once is harmless.</summary>
    public void Dispose()
    {
        Cleanup();
        // Prevent the object from being placed on the
        // finalization queue
        System.GC.SuppressFinalize(this);
    }

    ~SpeakerEmbeddingExtractor()
    {
        Cleanup();
    }

    // Destroys the native extractor at most once.
    private void Cleanup()
    {
        IntPtr h = _handle.Handle;
        if (h == IntPtr.Zero)
        {
            // Already released (or never created): nothing to destroy.
            return;
        }

        // Don't permit the handle to be used again.
        _handle = new HandleRef(this, IntPtr.Zero);
        SherpaOnnxDestroySpeakerEmbeddingExtractor(h);
    }

    private HandleRef _handle;

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxCreateSpeakerEmbeddingExtractor(ref SpeakerEmbeddingExtractorConfig config);

    [DllImport(Dll.Filename)]
    private static extern void SherpaOnnxDestroySpeakerEmbeddingExtractor(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingExtractorDim(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxSpeakerEmbeddingExtractorCreateStream(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingExtractorIsReady(IntPtr handle, IntPtr stream);

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(IntPtr handle, IntPtr stream);

    [DllImport(Dll.Filename)]
    private static extern void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(IntPtr p);
}
[StructLayout(LayoutKind.Sequential)]
public struct SpokenLanguageIdentificationWhisperConfig
{
public SpokenLanguageIdentificationWhisperConfig()
... ... @@ -593,6 +710,185 @@ namespace SherpaOnnx
public string Provider;
}
/// <summary>
/// Managed wrapper around a native speaker embedding manager, which stores
/// named speaker embeddings of a fixed dimension.
/// </summary>
/// <remarks>
/// Embeddings are passed to native code as bare <c>float[]</c> buffers without a
/// length, so every embedding is checked against the dimension given to the
/// constructor before the native call. Native calls receive the raw
/// <c>IntPtr</c> handle, which does not keep this object reachable; each method
/// therefore calls <c>GC.KeepAlive(this)</c> after its last native call so the
/// finalizer cannot destroy the handle while the call is still running.
/// </remarks>
public class SpeakerEmbeddingManager : IDisposable
{
    /// <summary>Creates a native manager for embeddings with <paramref name="dim"/> elements.</summary>
    public SpeakerEmbeddingManager(int dim)
    {
        IntPtr h = SherpaOnnxCreateSpeakerEmbeddingManager(dim);
        _handle = new HandleRef(this, h);
        this._dim = dim;
    }

    /// <summary>Adds a speaker with a single embedding.</summary>
    /// <returns><c>true</c> when the native call returns 1.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="v"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="v"/> does not have the manager's dimension.</exception>
    public bool Add(string name, float[] v)
    {
        CheckEmbedding(v, nameof(v));
        bool ok = SherpaOnnxSpeakerEmbeddingManagerAdd(_handle.Handle, name, v) == 1;
        GC.KeepAlive(this);
        return ok;
    }

    /// <summary>Adds a speaker with several embeddings.</summary>
    /// <returns><c>true</c> when the native call returns 1.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="v_list"/> or one of its items is null.</exception>
    /// <exception cref="ArgumentException">An item does not have the manager's dimension.</exception>
    public bool Add(string name, ICollection<float[]> v_list)
    {
        if (v_list == null)
        {
            throw new ArgumentNullException(nameof(v_list));
        }

        // The native call expects all embeddings concatenated into one buffer.
        int n = v_list.Count;
        float[] v = new float[n * _dim];
        int offset = 0;
        foreach (var item in v_list)
        {
            // A wrong-sized item would otherwise shift or overwrite its neighbours.
            CheckEmbedding(item, nameof(v_list));
            item.CopyTo(v, offset);
            offset += _dim;
        }

        bool ok = SherpaOnnxSpeakerEmbeddingManagerAddListFlattened(_handle.Handle, name, v, n) == 1;
        GC.KeepAlive(this);
        return ok;
    }

    /// <summary>Removes the speaker called <paramref name="name"/>.</summary>
    /// <returns><c>true</c> when the native call returns 1.</returns>
    public bool Remove(string name)
    {
        bool ok = SherpaOnnxSpeakerEmbeddingManagerRemove(_handle.Handle, name) == 1;
        GC.KeepAlive(this);
        return ok;
    }

    /// <summary>Searches for a speaker matching embedding <paramref name="v"/>.</summary>
    /// <returns>
    /// The name returned by the native search, decoded as UTF-8, or an empty
    /// string when the native call returns a null or empty string.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="v"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="v"/> does not have the manager's dimension.</exception>
    public string Search(float[] v, float threshold)
    {
        CheckEmbedding(v, nameof(v));
        IntPtr p = SherpaOnnxSpeakerEmbeddingManagerSearch(_handle.Handle, v, threshold);
        try
        {
            return PtrToUtf8String(p);
        }
        finally
        {
            // Release the native string even when decoding throws.
            SherpaOnnxSpeakerEmbeddingManagerFreeSearch(p);
            GC.KeepAlive(this);
        }
    }

    /// <summary>Checks embedding <paramref name="v"/> against the speaker called <paramref name="name"/>.</summary>
    /// <returns><c>true</c> when the native call returns 1.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="v"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="v"/> does not have the manager's dimension.</exception>
    public bool Verify(string name, float[] v, float threshold)
    {
        CheckEmbedding(v, nameof(v));
        bool ok = SherpaOnnxSpeakerEmbeddingManagerVerify(_handle.Handle, name, v, threshold) == 1;
        GC.KeepAlive(this);
        return ok;
    }

    /// <summary>Reports whether a speaker called <paramref name="name"/> is registered.</summary>
    /// <returns><c>true</c> when the native call returns 1.</returns>
    public bool Contains(string name)
    {
        bool ok = SherpaOnnxSpeakerEmbeddingManagerContains(_handle.Handle, name) == 1;
        GC.KeepAlive(this);
        return ok;
    }

    /// <summary>Returns the names of all registered speakers.</summary>
    /// <returns>One UTF-8 decoded name per speaker; an empty array when there are none.</returns>
    public string[] GetAllSpeakers()
    {
        // Query the count once so the array size and the loop bound agree.
        int n = NumSpeakers;
        if (n == 0)
        {
            return new string[] { };
        }

        IntPtr names = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(_handle.Handle);
        if (names == IntPtr.Zero)
        {
            GC.KeepAlive(this);
            return new string[] { };
        }

        try
        {
            string[] ans = new string[n];
            for (int i = 0; i < n; i++)
            {
                // names points to an array of C-string pointers, one per speaker.
                IntPtr item = Marshal.ReadIntPtr(names, i * IntPtr.Size);
                ans[i] = PtrToUtf8String(item);
            }
            return ans;
        }
        finally
        {
            // Release the native array even when decoding throws.
            SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(names);
            GC.KeepAlive(this);
        }
    }

    /// <summary>Releases the native manager. Calling it more than once is harmless.</summary>
    public void Dispose()
    {
        Cleanup();
        // Prevent the object from being placed on the
        // finalization queue
        System.GC.SuppressFinalize(this);
    }

    ~SpeakerEmbeddingManager()
    {
        Cleanup();
    }

    // Destroys the native manager at most once.
    private void Cleanup()
    {
        IntPtr h = _handle.Handle;
        if (h == IntPtr.Zero)
        {
            // Already released (or never created): nothing to destroy.
            return;
        }

        // Don't permit the handle to be used again.
        _handle = new HandleRef(this, IntPtr.Zero);
        SherpaOnnxDestroySpeakerEmbeddingManager(h);
    }

    /// <summary>Number of registered speakers as reported by the native manager.</summary>
    public int NumSpeakers
    {
        get
        {
            int n = SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(_handle.Handle);
            GC.KeepAlive(this);
            return n;
        }
    }

    // Rejects embeddings whose length differs from the manager's dimension.
    // Native code is given only the buffer, so a short array would be read
    // past its end.
    private void CheckEmbedding(float[] v, string paramName)
    {
        if (v == null)
        {
            throw new ArgumentNullException(paramName);
        }

        if (v.Length != _dim)
        {
            throw new ArgumentException(
                "Expected an embedding with " + _dim + " elements, but got " + v.Length + ".",
                paramName);
        }
    }

    // Decodes a NUL-terminated UTF-8 C string; a null pointer or an empty
    // string yields "".
    private static string PtrToUtf8String(IntPtr p)
    {
        if (p == IntPtr.Zero)
        {
            return "";
        }

        int length = 0;
        while (Marshal.ReadByte(p, length) != 0)
        {
            ++length;
        }

        if (length == 0)
        {
            return "";
        }

        byte[] stringBuffer = new byte[length];
        Marshal.Copy(p, stringBuffer, 0, length);
        return Encoding.UTF8.GetString(stringBuffer);
    }

    private HandleRef _handle;
    private readonly int _dim;

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxCreateSpeakerEmbeddingManager(int dim);

    [DllImport(Dll.Filename)]
    private static extern void SherpaOnnxDestroySpeakerEmbeddingManager(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingManagerAdd(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name, float[] v);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingManagerAddListFlattened(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name, float[] v, int n);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingManagerRemove(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name);

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxSpeakerEmbeddingManagerSearch(IntPtr handle, float[] v, float threshold);

    [DllImport(Dll.Filename)]
    private static extern void SherpaOnnxSpeakerEmbeddingManagerFreeSearch(IntPtr p);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingManagerVerify(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name, float[] v, float threshold);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingManagerContains(IntPtr handle, [MarshalAs(UnmanagedType.LPStr)] string name);

    [DllImport(Dll.Filename)]
    private static extern int SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(IntPtr names);
}
public class SpokenLanguageIdentificationResult
{
public SpokenLanguageIdentificationResult(IntPtr handle)
... ...