Fangjun Kuang
Committed by GitHub

Add C# API for Kokoro TTS models (#1720)

@@ -2,7 +2,11 @@ @@ -2,7 +2,11 @@
2 2
3 cd dotnet-examples/ 3 cd dotnet-examples/
4 4
5 -cd ./offline-tts 5 +cd ./kokoro-tts
  6 +./run-kokoro-en.sh
  7 +ls -lh
  8 +
  9 +cd ../offline-tts
6 ./run-matcha-zh.sh 10 ./run-matcha-zh.sh
7 ls -lh *.wav 11 ls -lh *.wav
8 ./run-matcha-en.sh 12 ./run-matcha-en.sh
@@ -19,7 +23,8 @@ pushd ../.. @@ -19,7 +23,8 @@ pushd ../..
19 23
20 mkdir tts 24 mkdir tts
21 25
22 -cp dotnet-examples/offline-tts/*.wav ./tts 26 +cp -v dotnet-examples/kokoro-tts/*.wav ./tts
  27 +cp -v dotnet-examples/offline-tts/*.wav ./tts
23 popd 28 popd
24 29
25 cd ../offline-speaker-diarization 30 cd ../offline-speaker-diarization
  1 +// Copyright (c) 2025 Xiaomi Corporation
  2 +//
  3 +// This file shows how to use a non-streaming Kokoro TTS model
  4 +// for text-to-speech
  5 +// Please refer to
  6 +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
  7 +// and
  8 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  9 +// to download pre-trained models
  10 +using PortAudioSharp;
  11 +using SherpaOnnx;
  12 +using System.Collections.Concurrent;
  13 +using System.Runtime.InteropServices;
  14 +
  15 +class OfflineTtsDemo
  16 +{
  17 + static void Main(string[] args)
  18 + {
  19 + var config = new OfflineTtsConfig();
  20 + config.Model.Kokoro.Model = "./kokoro-en-v0_19/model.onnx";
  21 + config.Model.Kokoro.Voices = "./kokoro-en-v0_19/voices.bin";
  22 + config.Model.Kokoro.Tokens = "./kokoro-en-v0_19/tokens.txt";
  23 + config.Model.Kokoro.DataDir = "./kokoro-en-v0_19/espeak-ng-data";
  24 +
  25 + config.Model.NumThreads = 2;
  26 + config.Model.Debug = 1;
  27 + config.Model.Provider = "cpu";
  28 +
  29 + var tts = new OfflineTts(config);
  30 + var speed = 1.0f;
  31 + var text = "Today as always, men fall into two groups: slaves and free men. Whoever " +
  32 + "does not have two-thirds of his day for himself, is a slave, whatever " +
  33 + "he may be: a statesman, a businessman, an official, or a scholar. " +
  34 + "Friends fell out often because life was changing so fast. The easiest " +
  35 + "thing in the world was to lose touch with someone.";
  36 +
  37 + // mapping of sid to voice name
  38 + // 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
  39 + // 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis
  40 + var sid = 0;
  41 +
  42 +
  43 + Console.WriteLine(PortAudio.VersionInfo.versionText);
  44 + PortAudio.Initialize();
  45 + Console.WriteLine($"Number of devices: {PortAudio.DeviceCount}");
  46 +
  47 + for (int i = 0; i != PortAudio.DeviceCount; ++i)
  48 + {
  49 + Console.WriteLine($" Device {i}");
  50 + DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
  51 + Console.WriteLine($" Name: {deviceInfo.name}");
  52 + Console.WriteLine($" Max output channels: {deviceInfo.maxOutputChannels}");
  53 + Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}");
  54 + }
  55 + int deviceIndex = PortAudio.DefaultOutputDevice;
  56 + if (deviceIndex == PortAudio.NoDevice)
  57 + {
  58 + Console.WriteLine("No default output device found. Please use ../offline-tts instead");
  59 + Environment.Exit(1);
  60 + }
  61 +
  62 + var info = PortAudio.GetDeviceInfo(deviceIndex);
  63 + Console.WriteLine();
  64 + Console.WriteLine($"Use output default device {deviceIndex} ({info.name})");
  65 +
  66 + var param = new StreamParameters();
  67 + param.device = deviceIndex;
  68 + param.channelCount = 1;
  69 + param.sampleFormat = SampleFormat.Float32;
  70 + param.suggestedLatency = info.defaultLowOutputLatency;
  71 + param.hostApiSpecificStreamInfo = IntPtr.Zero;
  72 +
  73 + // https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview
  74 + var dataItems = new BlockingCollection<float[]>();
  75 +
  76 + var MyCallback = (IntPtr samples, int n, float progress) =>
  77 + {
  78 + Console.WriteLine($"Progress {progress*100}%");
  79 +
  80 + float[] data = new float[n];
  81 +
  82 + Marshal.Copy(samples, data, 0, n);
  83 +
  84 + dataItems.Add(data);
  85 +
  86 + // 1 means to keep generating
  87 + // 0 means to stop generating
  88 + return 1;
  89 + };
  90 +
  91 + var playFinished = false;
  92 +
  93 + float[]? lastSampleArray = null;
  94 + int lastIndex = 0; // not played
  95 +
  96 + PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output,
  97 + UInt32 frameCount,
  98 + ref StreamCallbackTimeInfo timeInfo,
  99 + StreamCallbackFlags statusFlags,
  100 + IntPtr userData
  101 + ) =>
  102 + {
  103 + if (dataItems.IsCompleted && lastSampleArray == null && lastIndex == 0)
  104 + {
  105 + Console.WriteLine($"Finished playing");
  106 + playFinished = true;
  107 + return StreamCallbackResult.Complete;
  108 + }
  109 +
  110 + int expected = Convert.ToInt32(frameCount);
  111 + int i = 0;
  112 +
  113 + while ((lastSampleArray != null || dataItems.Count != 0) && (i < expected))
  114 + {
  115 + int needed = expected - i;
  116 +
  117 + if (lastSampleArray != null)
  118 + {
  119 + int remaining = lastSampleArray.Length - lastIndex;
  120 + if (remaining >= needed)
  121 + {
  122 + float[] this_block = lastSampleArray.Skip(lastIndex).Take(needed).ToArray();
  123 + lastIndex += needed;
  124 + if (lastIndex == lastSampleArray.Length)
  125 + {
  126 + lastSampleArray = null;
  127 + lastIndex = 0;
  128 + }
  129 +
  130 + Marshal.Copy(this_block, 0, IntPtr.Add(output, i * sizeof(float)), needed);
  131 + return StreamCallbackResult.Continue;
  132 + }
  133 +
  134 + float[] this_block2 = lastSampleArray.Skip(lastIndex).Take(remaining).ToArray();
  135 + lastIndex = 0;
  136 + lastSampleArray = null;
  137 +
  138 + Marshal.Copy(this_block2, 0, IntPtr.Add(output, i * sizeof(float)), remaining);
  139 + i += remaining;
  140 + continue;
  141 + }
  142 +
  143 + if (dataItems.Count != 0)
  144 + {
  145 + lastSampleArray = dataItems.Take();
  146 + lastIndex = 0;
  147 + }
  148 + }
  149 +
  150 + if (i < expected)
  151 + {
  152 + int sizeInBytes = (expected - i) * 4;
  153 + Marshal.Copy(new byte[sizeInBytes], 0, IntPtr.Add(output, i * sizeof(float)), sizeInBytes);
  154 + }
  155 +
  156 + return StreamCallbackResult.Continue;
  157 + };
  158 +
  159 + PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: null, outParams: param, sampleRate: tts.SampleRate,
  160 + framesPerBuffer: 0,
  161 + streamFlags: StreamFlags.ClipOff,
  162 + callback: playCallback,
  163 + userData: IntPtr.Zero
  164 + );
  165 +
  166 + stream.Start();
  167 +
  168 + var callback = new OfflineTtsCallbackProgress(MyCallback);
  169 +
  170 + var audio = tts.GenerateWithCallbackProgress(text, speed, sid, callback);
  171 + var outputFilename = "./generated-kokoro-0.wav";
  172 + var ok = audio.SaveToWaveFile(outputFilename);
  173 +
  174 + if (ok)
  175 + {
  176 + Console.WriteLine($"Wrote to {outputFilename} succeeded!");
  177 + }
  178 + else
  179 + {
  180 + Console.WriteLine($"Failed to write {outputFilename}");
  181 + }
  182 + dataItems.CompleteAdding();
  183 +
  184 + while (!playFinished)
  185 + {
  186 + Thread.Sleep(100); // 100ms
  187 + }
  188 + }
  189 +}
  1 +<Project Sdk="Microsoft.NET.Sdk">
  2 +
  3 + <PropertyGroup>
  4 + <OutputType>Exe</OutputType>
  5 + <TargetFramework>net8.0</TargetFramework>
  6 + <RootNamespace>kokoro_tts_play</RootNamespace>
  7 + <ImplicitUsings>enable</ImplicitUsings>
  8 + <Nullable>enable</Nullable>
  9 + </PropertyGroup>
  10 +
  11 + <ItemGroup>
  12 + <PackageReference Include="PortAudioSharp2" Version="*" />
  13 + </ItemGroup>
  14 +
  15 + <ItemGroup>
  16 + <ProjectReference Include="..\Common\Common.csproj" />
  17 + </ItemGroup>
  18 +
  19 +</Project>
  1 +#!/usr/bin/env bash
  2 +set -ex
  3 +
  4 +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
  5 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  6 + tar xf kokoro-en-v0_19.tar.bz2
  7 + rm kokoro-en-v0_19.tar.bz2
  8 +fi
  9 +
  10 +dotnet run
  1 +// Copyright (c) 2025 Xiaomi Corporation
  2 +//
  3 +// This file shows how to use a non-streaming Kokoro TTS model
  4 +// for text-to-speech
  5 +// Please refer to
  6 +// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
  7 +// and
  8 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  9 +// to download pre-trained models
  10 +using SherpaOnnx;
  11 +using System.Runtime.InteropServices;
  12 +
  13 +class OfflineTtsDemo
  14 +{
  15 + static void Main(string[] args)
  16 + {
  17 + var config = new OfflineTtsConfig();
  18 + config.Model.Kokoro.Model = "./kokoro-en-v0_19/model.onnx";
  19 + config.Model.Kokoro.Voices = "./kokoro-en-v0_19/voices.bin";
  20 + config.Model.Kokoro.Tokens = "./kokoro-en-v0_19/tokens.txt";
  21 + config.Model.Kokoro.DataDir = "./kokoro-en-v0_19/espeak-ng-data";
  22 +
  23 + config.Model.NumThreads = 2;
  24 + config.Model.Debug = 1;
  25 + config.Model.Provider = "cpu";
  26 +
  27 + var tts = new OfflineTts(config);
  28 + var speed = 1.0f;
  29 + var text = "Today as always, men fall into two groups: slaves and free men. Whoever " +
  30 + "does not have two-thirds of his day for himself, is a slave, whatever " +
  31 + "he may be: a statesman, a businessman, an official, or a scholar. " +
  32 + "Friends fell out often because life was changing so fast. The easiest " +
  33 + "thing in the world was to lose touch with someone.";
  34 +
  35 + // mapping of sid to voice name
  36 + // 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
  37 + // 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis
  38 + var sid = 0;
  39 +
  40 + var MyCallback = (IntPtr samples, int n, float progress) =>
  41 + {
  42 + float[] data = new float[n];
  43 + Marshal.Copy(samples, data, 0, n);
  44 + // You can process samples here, e.g., play them.
  45 + // See ../kokoro-tts-playback for how to play them
  46 + Console.WriteLine($"Progress {progress*100}%");
  47 +
  48 + // 1 means to keep generating
  49 + // 0 means to stop generating
  50 + return 1;
  51 + };
  52 +
  53 + var callback = new OfflineTtsCallbackProgress(MyCallback);
  54 +
  55 + var audio = tts.GenerateWithCallbackProgress(text, speed, sid, callback);
  56 +
  57 + var outputFilename = "./generated-kokoro-0.wav";
  58 + var ok = audio.SaveToWaveFile(outputFilename);
  59 +
  60 + if (ok)
  61 + {
  62 + Console.WriteLine($"Wrote to {outputFilename} succeeded!");
  63 + }
  64 + else
  65 + {
  66 + Console.WriteLine($"Failed to write {outputFilename}");
  67 + }
  68 + }
  69 +}
  70 +
  1 +<Project Sdk="Microsoft.NET.Sdk">
  2 +
  3 + <PropertyGroup>
  4 + <OutputType>Exe</OutputType>
  5 + <TargetFramework>net8.0</TargetFramework>
  6 + <RootNamespace>kokoro_tts</RootNamespace>
  7 + <ImplicitUsings>enable</ImplicitUsings>
  8 + <Nullable>enable</Nullable>
  9 + </PropertyGroup>
  10 +
  11 + <ItemGroup>
  12 + <ProjectReference Include="..\Common\Common.csproj" />
  13 + </ItemGroup>
  14 +
  15 +</Project>
  1 +#!/usr/bin/env bash
  2 +set -ex
  3 +
  4 +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
  5 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  6 + tar xf kokoro-en-v0_19.tar.bz2
  7 + rm kokoro-en-v0_19.tar.bz2
  8 +fi
  9 +
  10 +dotnet run
@@ -31,6 +31,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro @@ -31,6 +31,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro
31 EndProject 31 EndProject
32 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}" 32 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
33 EndProject 33 EndProject
  34 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts", "kokoro-tts\kokoro-tts.csproj", "{9C0ABE6C-1F54-42B5-804E-C3FED6668F52}"
  35 +EndProject
  36 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts-play", "kokoro-tts-play\kokoro-tts-play.csproj", "{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}"
  37 +EndProject
34 Global 38 Global
35 GlobalSection(SolutionConfigurationPlatforms) = preSolution 39 GlobalSection(SolutionConfigurationPlatforms) = preSolution
36 Debug|Any CPU = Debug|Any CPU 40 Debug|Any CPU = Debug|Any CPU
@@ -93,6 +97,14 @@ Global @@ -93,6 +97,14 @@ Global
93 {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU 97 {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU
94 {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU 98 {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
95 {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU 99 {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU
  100 + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
  101 + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Debug|Any CPU.Build.0 = Debug|Any CPU
  102 + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Release|Any CPU.ActiveCfg = Release|Any CPU
  103 + {9C0ABE6C-1F54-42B5-804E-C3FED6668F52}.Release|Any CPU.Build.0 = Release|Any CPU
  104 + {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
  105 + {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Debug|Any CPU.Build.0 = Debug|Any CPU
  106 + {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.ActiveCfg = Release|Any CPU
  107 + {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.Build.0 = Release|Any CPU
96 EndGlobalSection 108 EndGlobalSection
97 GlobalSection(SolutionProperties) = preSolution 109 GlobalSection(SolutionProperties) = preSolution
98 HideSolutionNode = FALSE 110 HideSolutionNode = FALSE
@@ -7,6 +7,7 @@ namespace SherpaOnnx @@ -7,6 +7,7 @@ namespace SherpaOnnx
7 { 7 {
8 // IntPtr is actually a `const float*` from C++ 8 // IntPtr is actually a `const float*` from C++
9 public delegate int OfflineTtsCallback(IntPtr samples, int n); 9 public delegate int OfflineTtsCallback(IntPtr samples, int n);
  10 + public delegate int OfflineTtsCallbackProgress(IntPtr samples, int n, float progress);
10 11
11 public class OfflineTts : IDisposable 12 public class OfflineTts : IDisposable
12 { 13 {
@@ -36,6 +37,16 @@ namespace SherpaOnnx @@ -36,6 +37,16 @@ namespace SherpaOnnx
36 return new OfflineTtsGeneratedAudio(p); 37 return new OfflineTtsGeneratedAudio(p);
37 } 38 }
38 39
  40 + public OfflineTtsGeneratedAudio GenerateWithCallbackProgress(String text, float speed, int speakerId, OfflineTtsCallbackProgress callback)
  41 + {
  42 + byte[] utf8Bytes = Encoding.UTF8.GetBytes(text);
  43 + byte[] utf8BytesWithNull = new byte[utf8Bytes.Length + 1]; // +1 for null terminator
  44 + Array.Copy(utf8Bytes, utf8BytesWithNull, utf8Bytes.Length);
  45 + utf8BytesWithNull[utf8Bytes.Length] = 0; // Null terminator
  46 + IntPtr p = SherpaOnnxOfflineTtsGenerateWithProgressCallback(_handle.Handle, utf8BytesWithNull, speakerId, speed, callback);
  47 + return new OfflineTtsGeneratedAudio(p);
  48 + }
  49 +
39 public void Dispose() 50 public void Dispose()
40 { 51 {
41 Cleanup(); 52 Cleanup();
@@ -92,5 +103,8 @@ namespace SherpaOnnx @@ -92,5 +103,8 @@ namespace SherpaOnnx
92 103
93 [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)] 104 [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
94 private static extern IntPtr SherpaOnnxOfflineTtsGenerateWithCallback(IntPtr handle, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Text, int sid, float speed, OfflineTtsCallback callback); 105 private static extern IntPtr SherpaOnnxOfflineTtsGenerateWithCallback(IntPtr handle, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Text, int sid, float speed, OfflineTtsCallback callback);
  106 +
  107 + [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)]
  108 + private static extern IntPtr SherpaOnnxOfflineTtsGenerateWithProgressCallback(IntPtr handle, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Text, int sid, float speed, OfflineTtsCallbackProgress callback);
95 } 109 }
96 } 110 }
  1 +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +using System.Runtime.InteropServices;
  4 +
  5 +namespace SherpaOnnx
  6 +{
  7 + [StructLayout(LayoutKind.Sequential)]
  8 + public struct OfflineTtsKokoroModelConfig
  9 + {
  10 + public OfflineTtsKokoroModelConfig()
  11 + {
  12 + Model = "";
  13 + Voices = "";
  14 + Tokens = "";
  15 + DataDir = "";
  16 +
  17 + LengthScale = 1.0F;
  18 + }
  19 + [MarshalAs(UnmanagedType.LPStr)]
  20 + public string Model;
  21 +
  22 + [MarshalAs(UnmanagedType.LPStr)]
  23 + public string Voices;
  24 +
  25 + [MarshalAs(UnmanagedType.LPStr)]
  26 + public string Tokens;
  27 +
  28 + [MarshalAs(UnmanagedType.LPStr)]
  29 + public string DataDir;
  30 +
  31 + public float LengthScale;
  32 + }
  33 +}
@@ -12,6 +12,7 @@ namespace SherpaOnnx @@ -12,6 +12,7 @@ namespace SherpaOnnx
12 { 12 {
13 Vits = new OfflineTtsVitsModelConfig(); 13 Vits = new OfflineTtsVitsModelConfig();
14 Matcha = new OfflineTtsMatchaModelConfig(); 14 Matcha = new OfflineTtsMatchaModelConfig();
  15 + Kokoro = new OfflineTtsKokoroModelConfig();
15 NumThreads = 1; 16 NumThreads = 1;
16 Debug = 0; 17 Debug = 0;
17 Provider = "cpu"; 18 Provider = "cpu";
@@ -24,5 +25,6 @@ namespace SherpaOnnx @@ -24,5 +25,6 @@ namespace SherpaOnnx
24 public string Provider; 25 public string Provider;
25 26
26 public OfflineTtsMatchaModelConfig Matcha; 27 public OfflineTtsMatchaModelConfig Matcha;
  28 + public OfflineTtsKokoroModelConfig Kokoro;
27 } 29 }
28 } 30 }