Add Pascal API for Kokoro TTS models (#1724)

Fangjun Kuang · GitHub
Commit 46f2e32e8a2d34bdb42e0e02f7b0e1d0209ec97b 46f2e32e 1 parent 4335e2ac
.github/workflows/pascal.yaml
pascal-api-examples/tts/.gitignore
pascal-api-examples/tts/kokoro-en-playback.pas
pascal-api-examples/tts/kokoro-en.pas
pascal-api-examples/tts/matcha-en-playback.pas
pascal-api-examples/tts/matcha-en.pas
pascal-api-examples/tts/matcha-zh-playback.pas
pascal-api-examples/tts/run-kokoro-en-playback.sh
pascal-api-examples/tts/run-kokoro-en.sh
sherpa-onnx/pascal-api/sherpa_onnx.pas
--- a/.github/workflows/pascal.yaml
查看文件 @46f2e32
+++ b/.github/workflows/pascal.yaml
查看文件 @46f2e32
@@ -154,6 +154,12 @@ jobs:
           ls -lh
           echo "---"
+          ./run-kokoro-en.sh
+          rm -rf kokoro-en-*
+          rm kokoro-en
+          ls -lh
+          echo "---"
+
           ./run-matcha-zh.sh
           rm -rf matcha-icefall-*
           rm matcha-zh
--- a/pascal-api-examples/tts/.gitignore
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/.gitignore
查看文件 @46f2e32
@@ -6,3 +6,5 @@ matcha-zh
 matcha-en
 matcha-zh-playback
 matcha-en-playback
+kokoro-en
+kokoro-en-playback
--- a/pascal-api-examples/tts/kokoro-en-playback.pas 0 → 100644
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/kokoro-en-playback.pas 0 → 100644
查看文件 @46f2e32
+{ Copyright (c)  2025  Xiaomi Corporation }
+program kokoro_en_playback;
+{
+This file shows how to use the text to speech API of sherpa-onnx
+with Kokoro models.
+
+It generates speech from text and saves it to a wave file.
+
+Note that it plays the audio back as it is still generating.
+}
+
+{$mode objfpc}
+
+uses
+  {$ifdef unix}
+  cthreads,
+  {$endif}
+  SysUtils,
+  dos,
+  ctypes,
+  portaudio,
+  sherpa_onnx;
+
+var
+  CriticalSection: TRTLCriticalSection;
+
+  Tts: TSherpaOnnxOfflineTts;
+  Audio: TSherpaOnnxGeneratedAudio;
+  Resampler: TSherpaOnnxLinearResampler;
+
+  Text: AnsiString;
+  Speed: Single = 1.0;  {Use a larger value to speak faster}
+  SpeakerId: Integer = 7;
+  Buffer: TSherpaOnnxCircularBuffer;
+  FinishedGeneration: Boolean = False;
+  FinishedPlaying: Boolean = False;
+
+  Version: String;
+  EnvStr: String;
+  Status: Integer;
+  NumDevices: Integer;
+  DeviceIndex: Integer;
+  DeviceInfo: PPaDeviceInfo;
+
+  { If you get EDivByZero: Division by zero error, please change the sample rate
+    to the one supported by your output device (speaker).
+  }
+  DeviceSampleRate: Integer = 48000;
+  I: Integer;
+  Param: TPaStreamParameters;
+  Stream: PPaStream;
+  Wave: TSherpaOnnxWave;
+
+{ Callback handed to Tts.Generate. Appends the N samples it receives to the
+  shared playback buffer, resampling them first when a resampler was created.
+  The buffer is only touched while CriticalSection is held. }
+function GenerateCallback(
+      Samples: pcfloat; N: cint32;
+      Arg: Pointer): cint; cdecl;
+begin
+  { 1 means to continue generating; 0 means to stop generating. }
+  Result := 1;
+
+  EnterCriticalSection(CriticalSection);
+  try
+    if Resampler = nil then
+      { No resampler was created, so push the samples unchanged. }
+      Buffer.Push(Samples, N)
+    else
+      Buffer.Push(Resampler.Resample(Samples, N, False));
+  finally
+    LeaveCriticalSection(CriticalSection);
+  end;
+end;
+
+{ PortAudio stream callback. Copies frameCount float samples into `output`,
+  taking as many as possible from the shared buffer; SetLength pads the rest
+  of the chunk when the buffer holds fewer samples than requested.
+  Returns paComplete once the buffer is empty and generation has finished,
+  paContinue otherwise. }
+function PlayCallback(
+      input: Pointer; output: Pointer;
+      frameCount: culong;
+      timeInfo: PPaStreamCallbackTimeInfo;
+      statusFlags: TPaStreamCallbackFlags;
+      userData: Pointer ): cint; cdecl;
+var
+  Chunk: TSherpaOnnxSamplesArray;
+  Dst: pcfloat;
+  K: Integer;
+begin
+  Dst := pcfloat(output);
+
+  EnterCriticalSection(CriticalSection);
+  try
+    if Buffer.Size >= frameCount then
+      begin
+        { Enough samples are buffered for the whole request. }
+        Chunk := Buffer.Get(Buffer.Head, FrameCount);
+        Buffer.Pop(FrameCount);
+      end
+    else
+      begin
+        { Drain whatever is left (possibly nothing), then grow to frameCount. }
+        if Buffer.Size > 0 then
+          begin
+            Chunk := Buffer.Get(Buffer.Head, Buffer.Size);
+            Buffer.Pop(Buffer.Size);
+          end;
+        SetLength(Chunk, frameCount);
+      end;
+
+    for K := 0 to frameCount - 1 do
+      Dst[K] := Chunk[K];
+
+    if (Buffer.Size <= 0) and FinishedGeneration then
+      begin
+        { Nothing left to play and no more samples will arrive. }
+        Result := paComplete;
+        FinishedPlaying := True;
+      end
+    else
+      Result := paContinue;
+  finally
+    LeaveCriticalSection(CriticalSection);
+  end;
+end;
+
+{ Builds the offline TTS object for the Kokoro English model that the
+  accompanying run script extracts into ./kokoro-en-v0_19. }
+function GetOfflineTts: TSherpaOnnxOfflineTts;
+var
+  Config: TSherpaOnnxOfflineTtsConfig;
+begin
+  { General settings }
+  Config.MaxNumSentences := 1;
+  Config.Model.NumThreads := 2;
+  Config.Model.Debug := False;
+
+  { Kokoro model files }
+  Config.Model.Kokoro.Model := './kokoro-en-v0_19/model.onnx';
+  Config.Model.Kokoro.Voices := './kokoro-en-v0_19/voices.bin';
+  Config.Model.Kokoro.Tokens := './kokoro-en-v0_19/tokens.txt';
+  Config.Model.Kokoro.DataDir := './kokoro-en-v0_19/espeak-ng-data';
+
+  Result := TSherpaOnnxOfflineTts.Create(Config);
+end;
+
+{ Main program: creates the TTS engine, opens a PortAudio output stream,
+  generates speech while PlayCallback plays it, saves the result to a wave
+  file and then releases everything.
+
+  Ordering notes:
+    - CriticalSection is initialized before the stream is opened, so the
+      callbacks never see an uninitialized lock.
+    - CriticalSection is destroyed only after Pa_CloseStream. PlayCallback
+      sets FinishedPlaying while it still holds the lock, so destroying the
+      lock as soon as FinishedPlaying becomes True could race with the
+      callback's LeaveCriticalSection. }
+begin
+  Tts := GetOfflineTts;
+  if Tts.GetSampleRate <> DeviceSampleRate then
+    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
+
+  Version := String(Pa_GetVersionText);
+  WriteLn('Version is ', Version);
+  Status := Pa_Initialize;
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
+      Exit;
+    end;
+
+  NumDevices := Pa_GetDeviceCount;
+  WriteLn('Num devices: ', NumDevices);
+
+  DeviceIndex := Pa_GetDefaultOutputDevice;
+
+  if DeviceIndex = paNoDevice then
+    begin
+      WriteLn('No default output device found');
+      Pa_Terminate;
+      Exit;
+    end;
+
+  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
+  if EnvStr <> '' then
+    begin
+      DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
+      WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
+    end;
+
+  { The index may come from the environment; reject values for which
+    Pa_GetDeviceInfo would not return a valid record, since the code below
+    dereferences its result. }
+  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
+    begin
+      WriteLn('Invalid output device index: ', DeviceIndex);
+      Pa_Terminate;
+      Exit;
+    end;
+
+  for I := 0 to (NumDevices - 1) do
+    begin
+      DeviceInfo := Pa_GetDeviceInfo(I);
+      if I = DeviceIndex then
+        { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
+        WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
+      else
+        WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
+    end;
+
+  WriteLn('Use device ', DeviceIndex);
+  WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
+  WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
+
+  Initialize(Param);
+  Param.Device := DeviceIndex;
+  Param.ChannelCount := 1;
+  Param.SampleFormat := paFloat32;
+  param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
+  param.HostApiSpecificStreamInfo := nil;
+
+  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
+
+  { The lock must exist before any callback can possibly run. }
+  InitCriticalSection(CriticalSection);
+
+  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
+  Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
+    PPaStreamCallback(@PlayCallback), nil);
+
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
+      DoneCriticalSection(CriticalSection);
+      FreeAndNil(Buffer);
+      Pa_Terminate;
+      Exit;
+    end;
+
+  Status := Pa_StartStream(stream);
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
+      Pa_CloseStream(stream);
+      DoneCriticalSection(CriticalSection);
+      FreeAndNil(Buffer);
+      Pa_Terminate;
+      Exit;
+    end;
+
+  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+
+  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
+
+  Audio :=  Tts.Generate(Text, SpeakerId, Speed,
+    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
+  FinishedGeneration := True;
+  SherpaOnnxWriteWave('./kokoro-en-playback-7.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./kokoro-en-playback-7.wav');
+
+  while not FinishedPlaying do
+    Pa_Sleep(100);  {sleep for 0.1 second }
+    {TODO(fangjun): Use an event to indicate the play is finished}
+
+  { Close the stream first so the callback can no longer be running when
+    the lock and the buffer are released. }
+  Status := Pa_CloseStream(stream);
+  if Status <> paNoError then
+    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
+
+  DoneCriticalSection(CriticalSection);
+
+  FreeAndNil(Buffer);
+  FreeAndNil(Tts);
+  FreeAndNil(Resampler);
+
+  { Always shut PortAudio down, even if closing the stream reported an error. }
+  Status := Pa_Terminate;
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
+      Exit;
+    end;
+end.
+
--- a/pascal-api-examples/tts/kokoro-en.pas 0 → 100644
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/kokoro-en.pas 0 → 100644
查看文件 @46f2e32
+{ Copyright (c)  2025  Xiaomi Corporation }
+program kokoro_en;
+{
+This file shows how to use the text to speech API of sherpa-onnx
+with Kokoro TTS models.
+
+It generates speech from text and saves it to a wave file.
+
+If you want to play it while it is generating, please see
+./kokoro-en-playback.pas
+}
+
+{$mode objfpc}
+
+uses
+  SysUtils,
+  sherpa_onnx;
+
+{ Builds the offline TTS object for the Kokoro English model that the
+  accompanying run script extracts into ./kokoro-en-v0_19. }
+function GetOfflineTts: TSherpaOnnxOfflineTts;
+var
+  Config: TSherpaOnnxOfflineTtsConfig;
+begin
+  { General settings }
+  Config.MaxNumSentences := 1;
+  Config.Model.NumThreads := 2;
+  Config.Model.Debug := False;
+
+  { Kokoro model files }
+  Config.Model.Kokoro.Model := './kokoro-en-v0_19/model.onnx';
+  Config.Model.Kokoro.Voices := './kokoro-en-v0_19/voices.bin';
+  Config.Model.Kokoro.Tokens := './kokoro-en-v0_19/tokens.txt';
+  Config.Model.Kokoro.DataDir := './kokoro-en-v0_19/espeak-ng-data';
+
+  Result := TSherpaOnnxOfflineTts.Create(Config);
+end;
+
+var
+  Tts: TSherpaOnnxOfflineTts;
+  Audio: TSherpaOnnxGeneratedAudio;
+
+  Text: AnsiString;
+  Speed: Single = 1.0;  {Use a larger value to speak faster}
+  SpeakerId: Integer = 8;
+
+{ Main program: synthesizes one fixed sentence with speaker SpeakerId and
+  writes the result to ./kokoro-en-8.wav. }
+begin
+  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
+
+  Tts := GetOfflineTts;
+  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+
+  { Generate all samples in one call, then save them to disk. }
+  Audio := Tts.Generate(Text, SpeakerId, Speed);
+  SherpaOnnxWriteWave('./kokoro-en-8.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./kokoro-en-8.wav');
+
+  FreeAndNil(Tts);
+end.
+
--- a/pascal-api-examples/tts/matcha-en-playback.pas
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/matcha-en-playback.pas
查看文件 @46f2e32
@@ -2,7 +2,7 @@
 program matcha_en_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
-with Piper models.
+with MatchaTTS models.
 It generates speech from text and saves it to a wave file.
@@ -210,8 +210,8 @@ begin
   Audio :=  Tts.Generate(Text, SpeakerId, Speed,
     PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
   FinishedGeneration := True;
-  SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
-  WriteLn('Saved to ./matcha-zh-playback.wav');
+  SherpaOnnxWriteWave('./matcha-en-playback.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./matcha-en-playback.wav');
   while not FinishedPlaying do
     Pa_Sleep(100);  {sleep for 0.1 second }
--- a/pascal-api-examples/tts/matcha-en.pas
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/matcha-en.pas
查看文件 @46f2e32
@@ -7,7 +7,7 @@ with MatchaTTS models.
 It generates speech from text and saves it to a wave file.
 If you want to play it while it is generating, please see
-./matcha-zh-playback.pas
+./matcha-en-playback.pas
 }
 {$mode objfpc}
--- a/pascal-api-examples/tts/matcha-zh-playback.pas
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/matcha-zh-playback.pas
查看文件 @46f2e32
@@ -2,7 +2,7 @@
 program matcha_zh_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
-with Piper models.
+with MatchaTTS models.
 It generates speech from text and saves it to a wave file.
--- a/pascal-api-examples/tts/run-kokoro-en-playback.sh 0 → 100755
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/run-kokoro-en-playback.sh 0 → 100755
查看文件 @46f2e32
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
+if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
+  tar xf kokoro-en-v0_19.tar.bz2
+  rm kokoro-en-v0_19.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
+  ./kokoro-en-playback.pas
+
+# Please see ../portaudio-test/README.md
+# for how to install portaudio on macOS
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./kokoro-en-playback
--- a/pascal-api-examples/tts/run-kokoro-en.sh 0 → 100755
查看文件 @46f2e32
+++ b/pascal-api-examples/tts/run-kokoro-en.sh 0 → 100755
查看文件 @46f2e32
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
+if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
+  tar xf kokoro-en-v0_19.tar.bz2
+  rm kokoro-en-v0_19.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./kokoro-en.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./kokoro-en
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
查看文件 @46f2e32
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
查看文件 @46f2e32
@@ -76,12 +76,24 @@ type
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
   end;
+  TSherpaOnnxOfflineTtsKokoroModelConfig = record  { Pascal-side settings for a Kokoro TTS model }
+    Model: AnsiString;  { model file, e.g. './kokoro-en-v0_19/model.onnx' in the examples }
+    Voices: AnsiString;  { voices file, e.g. './kokoro-en-v0_19/voices.bin' in the examples }
+    Tokens: AnsiString;  { tokens file, e.g. './kokoro-en-v0_19/tokens.txt' in the examples }
+    DataDir: AnsiString;  { data directory, e.g. './kokoro-en-v0_19/espeak-ng-data' in the examples }
+    LengthScale: Single;  { set to 1.0 by the Initialize operator }
+
+    function ToString: AnsiString;  { human-readable dump of all fields }
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
+  end;
+
   TSherpaOnnxOfflineTtsModelConfig = record
     Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
     NumThreads: Integer;
     Debug: Boolean;
     Provider: AnsiString;
     Matcha: TSherpaOnnxOfflineTtsMatchaModelConfig;
+    Kokoro: TSherpaOnnxOfflineTtsKokoroModelConfig;
     function ToString: AnsiString;
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
@@ -739,12 +751,21 @@ type
     DictDir: PAnsiChar;
   end;
+  SherpaOnnxOfflineTtsKokoroModelConfig = record  { NOTE(review): presumably mirrors the C API struct; field order and types likely must match it - confirm against the C header }
+    Model: PAnsiChar;  { filled from the Pascal config's Kokoro.Model }
+    Voices: PAnsiChar;  { filled from the Pascal config's Kokoro.Voices }
+    Tokens: PAnsiChar;  { filled from the Pascal config's Kokoro.Tokens }
+    DataDir: PAnsiChar;  { filled from the Pascal config's Kokoro.DataDir }
+    LengthScale: cfloat;  { filled from the Pascal config's Kokoro.LengthScale }
+  end;
+
   SherpaOnnxOfflineTtsModelConfig = record
     Vits: SherpaOnnxOfflineTtsVitsModelConfig;
     NumThreads: cint32;
     Debug: cint32;
     Provider: PAnsiChar;
     Matcha: SherpaOnnxOfflineTtsMatchaModelConfig;
+    Kokoro: SherpaOnnxOfflineTtsKokoroModelConfig;
   end;
   SherpaOnnxOfflineTtsConfig = record
@@ -1903,6 +1924,23 @@ begin
   Dest.LengthScale := 1.0;
 end;
+{ Returns a one-line, human-readable dump of every field of the record;
+  LengthScale is printed with two decimals. }
+function TSherpaOnnxOfflineTtsKokoroModelConfig.ToString: AnsiString;
+const
+  Layout = 'TSherpaOnnxOfflineTtsKokoroModelConfig(' +
+    'Model := %s, Voices := %s, Tokens := %s, DataDir := %s, ' +
+    'LengthScale := %.2f)';
+begin
+  Result := Format(Layout,
+    [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale]);
+end;
+
+class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
+begin
+  Dest.LengthScale := 1.0;  { default length scale for a freshly initialized record }
+end;
+
 function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
 begin
   Result := Format('TSherpaOnnxOfflineTtsModelConfig(' +
@@ -1910,10 +1948,11 @@ begin
     'NumThreads := %d, ' +
     'Debug := %s, ' +
     'Provider := %s, ' +
-    'Matcha := %s' +
+    'Matcha := %s, ' +
+    'Kokoro := %s' +
     ')',
     [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider,
-     Self.Matcha.ToString
+     Self.Matcha.ToString, Self.Kokoro.ToString
     ]);
 end;
@@ -1966,6 +2005,12 @@ begin
   C.Model.Matcha.LengthScale := Config.Model.Matcha.LengthScale;
   C.Model.Matcha.DictDir := PAnsiChar(Config.Model.Matcha.DictDir);
+  C.Model.Kokoro.Model := PAnsiChar(Config.Model.Kokoro.Model);
+  C.Model.Kokoro.Voices := PAnsiChar(Config.Model.Kokoro.Voices);
+  C.Model.Kokoro.Tokens := PAnsiChar(Config.Model.Kokoro.Tokens);
+  C.Model.Kokoro.DataDir := PAnsiChar(Config.Model.Kokoro.DataDir);
+  C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
+
   C.Model.NumThreads := Config.Model.NumThreads;
   C.Model.Provider := PAnsiChar(Config.Model.Provider);
   C.Model.Debug := Ord(Config.Model.Debug);