Add Pascal API for MatchaTTS models. (#1686)

Fangjun Kuang · GitHub
Commit c6fcd32552d754a21721045b0937696fb9c38da1 c6fcd325 1 parent 46330b25
.github/workflows/pascal.yaml
pascal-api-examples/tts/.gitignore
pascal-api-examples/tts/matcha-en-playback.pas
pascal-api-examples/tts/matcha-en.pas
pascal-api-examples/tts/matcha-zh-playback.pas
pascal-api-examples/tts/matcha-zh.pas
pascal-api-examples/tts/piper-playback.pas
pascal-api-examples/tts/run-matcha-en-playback.sh
pascal-api-examples/tts/run-matcha-en.sh
pascal-api-examples/tts/run-matcha-zh-playback.sh
pascal-api-examples/tts/run-matcha-zh.sh
sherpa-onnx/pascal-api/sherpa_onnx.pas
--- a/.github/workflows/pascal.yaml
查看文件 @c6fcd32
+++ b/.github/workflows/pascal.yaml
查看文件 @c6fcd32
@@ -152,6 +152,19 @@ jobs:
 
           ./run-piper.sh
           rm -rf vits-piper-*
+           rm piper
+           ls -lh
+           echo "---"
+ 
+           ./run-matcha-zh.sh
+           rm -rf matcha-icefall-*
+           rm matcha-zh
+           ls -lh
+           echo "---"
+ 
+           ./run-matcha-en.sh
+           rm -rf matcha-icefall-*
+           rm matcha-en
           ls -lh
           echo "---"
 
--- a/pascal-api-examples/tts/.gitignore
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/.gitignore
查看文件 @c6fcd32
@@ -2,3 +2,7 @@
 piper
 piper-playback
 link*.res
+ matcha-zh
+ matcha-en
+ matcha-zh-playback
+ matcha-en-playback
--- a/pascal-api-examples/tts/matcha-en-playback.pas 0 → 100644
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/matcha-en-playback.pas 0 → 100644
查看文件 @c6fcd32
+ { Copyright (c)  2025  Xiaomi Corporation }
+ program matcha_en_playback;
+ {
+ This file shows how to use the text to speech API of sherpa-onnx
+ with MatchaTTS models.
+ 
+ It generates speech from text and saves it to a wave file.
+ 
+ Note that it plays the audio back as it is still generating.
+ }
+ 
+ {$mode objfpc}
+ 
+ uses
+   {$ifdef unix}
+   cthreads,
+   {$endif}
+   SysUtils,
+   dos,
+   ctypes,
+   portaudio,
+   sherpa_onnx;
+ 
+ var
+   CriticalSection: TRTLCriticalSection;
+ 
+   Tts: TSherpaOnnxOfflineTts;
+   Audio: TSherpaOnnxGeneratedAudio;
+   Resampler: TSherpaOnnxLinearResampler;
+ 
+   Text: AnsiString;
+   Speed: Single = 1.0;  {Use a larger value to speak faster}
+   SpeakerId: Integer = 0;
+   Buffer: TSherpaOnnxCircularBuffer;
+   FinishedGeneration: Boolean = False;
+   FinishedPlaying: Boolean = False;
+ 
+   Version: String;
+   EnvStr: String;
+   Status: Integer;
+   NumDevices: Integer;
+   DeviceIndex: Integer;
+   DeviceInfo: PPaDeviceInfo;
+ 
+   { If you get EDivByZero: Division by zero error, please change the sample rate
+     to the one supported by your audio output device.
+   }
+   DeviceSampleRate: Integer = 48000;
+   I: Integer;
+   Param: TPaStreamParameters;
+   Stream: PPaStream;
+   Wave: TSherpaOnnxWave;
+ 
+ { Callback passed to Tts.Generate in the main program.
+ 
+   Samples/N describe a chunk of generated audio. The chunk is appended to the
+   shared circular Buffer; when a Resampler has been created (model sample rate
+   differs from DeviceSampleRate) the chunk is resampled first.
+ 
+   Arg is not used. Always returns 1, i.e. generation is never cancelled. }
+ function GenerateCallback(
+       Samples: pcfloat; N: cint32;
+       Arg: Pointer): cint; cdecl;
+ begin
+   { Buffer is also read by PlayCallback, so every access is guarded by the
+     same critical section. }
+   EnterCriticalSection(CriticalSection);
+   try
+     if Resampler <> nil then
+       Buffer.Push(Resampler.Resample(Samples, N, False))
+     else
+       Buffer.Push(Samples, N);
+   finally
+     LeaveCriticalSection(CriticalSection);
+   end;
+ 
+   { 1 means to continue generating; 0 means to stop generating. }
+   Result := 1;
+ end;
+ 
+ { Stream callback registered with Pa_OpenStream in the main program.
+ 
+   Moves up to frameCount samples from the shared circular Buffer into output.
+   When fewer than frameCount samples are buffered, the local array is grown
+   to frameCount with SetLength (presumably zero-filled, i.e. silence -
+   TODO confirm).
+ 
+   Returns paContinue while samples remain in Buffer or generation has not
+   finished; otherwise returns paComplete and sets FinishedPlaying.
+ 
+   input, timeInfo, statusFlags and userData are not used. }
+ function PlayCallback(
+       input: Pointer; output: Pointer;
+       frameCount: culong;
+       timeInfo: PPaStreamCallbackTimeInfo;
+       statusFlags: TPaStreamCallbackFlags;
+       userData: Pointer ): cint; cdecl;
+ var
+   Samples: TSherpaOnnxSamplesArray;
+   I: Integer;
+ begin
+   { Buffer and FinishedGeneration are shared with the generating side. }
+   EnterCriticalSection(CriticalSection);
+   try
+     if Buffer.Size >= frameCount then
+       begin
+         { Enough data: take exactly one block. }
+         Samples := Buffer.Get(Buffer.Head, FrameCount);
+         Buffer.Pop(FrameCount);
+       end
+     else if Buffer.Size > 0 then
+       begin
+         { Partial data: drain what is there, then extend to a full block. }
+         Samples := Buffer.Get(Buffer.Head, Buffer.Size);
+         Buffer.Pop(Buffer.Size);
+         SetLength(Samples, frameCount);
+       end
+     else
+       { Nothing buffered yet: output a block of freshly allocated samples. }
+       SetLength(Samples, frameCount);
+ 
+     { output is treated as an array of frameCount single-precision samples
+       (the stream is opened as mono paFloat32 in the main program). }
+     for I := 0 to frameCount - 1 do
+       pcfloat(output)[I] := Samples[I];
+ 
+     if (Buffer.Size > 0) or (not FinishedGeneration) then
+       Result := paContinue
+     else
+       begin
+         Result := paComplete;
+         { Polled by the main program to know when it may shut down. }
+         FinishedPlaying := True;
+       end;
+   finally
+     LeaveCriticalSection(CriticalSection);
+   end;
+ end;
+ 
+ { Builds the TTS engine for the English MatchaTTS model
+   (matcha-icefall-en_US-ljspeech) with the hifigan_v2 vocoder.
+   All paths are relative to the current working directory.
+   The caller owns the returned object and must free it. }
+ function GetOfflineTts: TSherpaOnnxOfflineTts;
+ var
+   TtsConfig: TSherpaOnnxOfflineTtsConfig;
+ begin
+   TtsConfig.MaxNumSentences := 1;
+ 
+   { Settings shared by all model types }
+   with TtsConfig.Model do
+     begin
+       NumThreads := 1;
+       Debug := False;
+     end;
+ 
+   { MatchaTTS specific files }
+   with TtsConfig.Model.Matcha do
+     begin
+       AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx';
+       Vocoder := './hifigan_v2.onnx';
+       Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt';
+       DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data';
+     end;
+ 
+   Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
+ end;
+ 
+ { Main program.
+ 
+   1. Create the TTS engine and, if needed, a resampler to DeviceSampleRate.
+   2. Initialize PortAudio, pick the output device and open/start a mono
+      float32 output stream driven by PlayCallback.
+   3. Generate speech; GenerateCallback feeds the shared Buffer while
+      PlayCallback drains it.
+   4. Save the complete audio to ./matcha-en-playback.wav, wait until playback
+      has finished, then release everything. }
+ begin
+   Tts := GetOfflineTts;
+   { Resample only when the model's rate differs from the device rate. }
+   if Tts.GetSampleRate <> DeviceSampleRate then
+     Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
+ 
+   Version := String(Pa_GetVersionText);
+   WriteLn('Version is ', Version);
+   Status := Pa_Initialize;
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
+       Exit;
+     end;
+ 
+   NumDevices := Pa_GetDeviceCount;
+   WriteLn('Num devices: ', NumDevices);
+ 
+   DeviceIndex := Pa_GetDefaultOutputDevice;
+ 
+   if DeviceIndex = paNoDevice then
+     begin
+       WriteLn('No default output device found');
+       Pa_Terminate;
+       Exit;
+     end;
+ 
+   { Optional override of the output device index. If the value is not a valid
+     integer, StrToIntDef keeps the default output device. }
+   EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
+   if EnvStr <> '' then
+     begin
+       DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
+       WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
+     end;
+ 
+   { List all devices; the selected one is marked with '*'. }
+   for I := 0 to (NumDevices - 1) do
+     begin
+       DeviceInfo := Pa_GetDeviceInfo(I);
+       if I = DeviceIndex then
+         { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
+         WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
+       else
+         WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
+     end;
+ 
+   WriteLn('Use device ', DeviceIndex);
+   WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
+   WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
+ 
+   { Mono, 32-bit float output; PlayCallback relies on this layout. }
+   Initialize(Param);
+   Param.Device := DeviceIndex;
+   Param.ChannelCount := 1;
+   Param.SampleFormat := paFloat32;
+   param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
+   param.HostApiSpecificStreamInfo := nil;
+ 
+   { NOTE(review): capacity is presumably in samples, i.e. about 30 seconds
+     at the device rate - confirm against TSherpaOnnxCircularBuffer. }
+   Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
+ 
+ 
+   { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
+   Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
+     PPaStreamCallback(@PlayCallback), nil);
+ 
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
+       Pa_Terminate;
+       Exit;
+     end;
+ 
+   { Must be ready before the stream starts, because both callbacks use it. }
+   InitCriticalSection(CriticalSection);
+ 
+   Status := Pa_StartStream(stream);
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
+       Pa_Terminate;
+       Exit;
+     end;
+ 
+   WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+ 
+   Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
+ 
+   Audio :=  Tts.Generate(Text, SpeakerId, Speed,
+     PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
+   { Tells PlayCallback that no more samples will arrive. }
+   FinishedGeneration := True;
+   { This is the English example: write to its own file instead of the
+     file name used by the Chinese playback example. }
+   SherpaOnnxWriteWave('./matcha-en-playback.wav', Audio.Samples, Audio.SampleRate);
+   WriteLn('Saved to ./matcha-en-playback.wav');
+ 
+   { Poll until PlayCallback has drained the buffer. }
+   while not FinishedPlaying do
+     Pa_Sleep(100);  {sleep for 0.1 second }
+     {TODO(fangjun): Use an event to indicate the play is finished}
+ 
+   DoneCriticalSection(CriticalSection);
+ 
+   FreeAndNil(Tts);
+   FreeAndNil(Resampler);
+ 
+   Status := Pa_CloseStream(stream);
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
+       Exit;
+     end;
+ 
+   Status := Pa_Terminate;
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
+       Exit;
+     end;
+ end.
+ 
--- a/pascal-api-examples/tts/matcha-en.pas 0 → 100644
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/matcha-en.pas 0 → 100644
查看文件 @c6fcd32
+ { Copyright (c)  2025  Xiaomi Corporation }
+ program matcha_en;
+ {
+ This file shows how to use the text to speech API of sherpa-onnx
+ with MatchaTTS models.
+ 
+ It generates speech from text and saves it to a wave file.
+ 
+ If you want to play it while it is generating, please see
+ ./matcha-en-playback.pas
+ }
+ 
+ {$mode objfpc}
+ 
+ uses
+   SysUtils,
+   sherpa_onnx;
+ 
+ { Builds the TTS engine for the English MatchaTTS model
+   (matcha-icefall-en_US-ljspeech) with the hifigan_v2 vocoder.
+   All paths are relative to the current working directory.
+   The caller owns the returned object and must free it. }
+ function GetOfflineTts: TSherpaOnnxOfflineTts;
+ var
+   TtsConfig: TSherpaOnnxOfflineTtsConfig;
+ begin
+   TtsConfig.MaxNumSentences := 1;
+ 
+   { Settings shared by all model types }
+   with TtsConfig.Model do
+     begin
+       NumThreads := 1;
+       Debug := False;
+     end;
+ 
+   { MatchaTTS specific files }
+   with TtsConfig.Model.Matcha do
+     begin
+       AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx';
+       Vocoder := './hifigan_v2.onnx';
+       Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt';
+       DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data';
+     end;
+ 
+   Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
+ end;
+ 
+ var
+   Tts: TSherpaOnnxOfflineTts;
+   Audio: TSherpaOnnxGeneratedAudio;
+ 
+   Text: AnsiString;
+   Speed: Single = 1.0;  {Use a larger value to speak faster}
+   SpeakerId: Integer = 0;
+ 
+ { Main program: create the engine, synthesize one English sentence with the
+   default speaker and speed, and write the result to ./matcha-en.wav. }
+ begin
+   Tts := GetOfflineTts;
+ 
+   WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+ 
+   Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
+ 
+   { No callback is passed here, so the audio is only available after
+     Generate returns. }
+   Audio :=  Tts.Generate(Text, SpeakerId, Speed);
+   SherpaOnnxWriteWave('./matcha-en.wav', Audio.Samples, Audio.SampleRate);
+   WriteLn('Saved to ./matcha-en.wav');
+ 
+   FreeAndNil(Tts);
+ end.
+ 
--- a/pascal-api-examples/tts/matcha-zh-playback.pas 0 → 100644
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/matcha-zh-playback.pas 0 → 100644
查看文件 @c6fcd32
+ { Copyright (c)  2025  Xiaomi Corporation }
+ program matcha_zh_playback;
+ {
+ This file shows how to use the text to speech API of sherpa-onnx
+ with MatchaTTS models.
+ 
+ It generates speech from text and saves it to a wave file.
+ 
+ Note that it plays the audio back as it is still generating.
+ }
+ 
+ {$mode objfpc}
+ 
+ uses
+   {$ifdef unix}
+   cthreads,
+   {$endif}
+   SysUtils,
+   dos,
+   ctypes,
+   portaudio,
+   sherpa_onnx;
+ 
+ var
+   CriticalSection: TRTLCriticalSection;
+ 
+   Tts: TSherpaOnnxOfflineTts;
+   Audio: TSherpaOnnxGeneratedAudio;
+   Resampler: TSherpaOnnxLinearResampler;
+ 
+   Text: AnsiString;
+   Speed: Single = 1.0;  {Use a larger value to speak faster}
+   SpeakerId: Integer = 0;
+   Buffer: TSherpaOnnxCircularBuffer;
+   FinishedGeneration: Boolean = False;
+   FinishedPlaying: Boolean = False;
+ 
+   Version: String;
+   EnvStr: String;
+   Status: Integer;
+   NumDevices: Integer;
+   DeviceIndex: Integer;
+   DeviceInfo: PPaDeviceInfo;
+ 
+   { If you get EDivByZero: Division by zero error, please change the sample rate
+     to the one supported by your audio output device.
+   }
+   DeviceSampleRate: Integer = 48000;
+   I: Integer;
+   Param: TPaStreamParameters;
+   Stream: PPaStream;
+   Wave: TSherpaOnnxWave;
+ 
+ { Callback passed to Tts.Generate in the main program.
+ 
+   Samples/N describe a chunk of generated audio. The chunk is appended to the
+   shared circular Buffer; when a Resampler has been created (model sample rate
+   differs from DeviceSampleRate) the chunk is resampled first.
+ 
+   Arg is not used. Always returns 1, i.e. generation is never cancelled. }
+ function GenerateCallback(
+       Samples: pcfloat; N: cint32;
+       Arg: Pointer): cint; cdecl;
+ begin
+   { Buffer is also read by PlayCallback, so every access is guarded by the
+     same critical section. }
+   EnterCriticalSection(CriticalSection);
+   try
+     if Resampler <> nil then
+       Buffer.Push(Resampler.Resample(Samples, N, False))
+     else
+       Buffer.Push(Samples, N);
+   finally
+     LeaveCriticalSection(CriticalSection);
+   end;
+ 
+   { 1 means to continue generating; 0 means to stop generating. }
+   Result := 1;
+ end;
+ 
+ { Stream callback registered with Pa_OpenStream in the main program.
+ 
+   Moves up to frameCount samples from the shared circular Buffer into output.
+   When fewer than frameCount samples are buffered, the local array is grown
+   to frameCount with SetLength (presumably zero-filled, i.e. silence -
+   TODO confirm).
+ 
+   Returns paContinue while samples remain in Buffer or generation has not
+   finished; otherwise returns paComplete and sets FinishedPlaying.
+ 
+   input, timeInfo, statusFlags and userData are not used. }
+ function PlayCallback(
+       input: Pointer; output: Pointer;
+       frameCount: culong;
+       timeInfo: PPaStreamCallbackTimeInfo;
+       statusFlags: TPaStreamCallbackFlags;
+       userData: Pointer ): cint; cdecl;
+ var
+   Samples: TSherpaOnnxSamplesArray;
+   I: Integer;
+ begin
+   { Buffer and FinishedGeneration are shared with the generating side. }
+   EnterCriticalSection(CriticalSection);
+   try
+     if Buffer.Size >= frameCount then
+       begin
+         { Enough data: take exactly one block. }
+         Samples := Buffer.Get(Buffer.Head, FrameCount);
+         Buffer.Pop(FrameCount);
+       end
+     else if Buffer.Size > 0 then
+       begin
+         { Partial data: drain what is there, then extend to a full block. }
+         Samples := Buffer.Get(Buffer.Head, Buffer.Size);
+         Buffer.Pop(Buffer.Size);
+         SetLength(Samples, frameCount);
+       end
+     else
+       { Nothing buffered yet: output a block of freshly allocated samples. }
+       SetLength(Samples, frameCount);
+ 
+     { output is treated as an array of frameCount single-precision samples
+       (the stream is opened as mono paFloat32 in the main program). }
+     for I := 0 to frameCount - 1 do
+       pcfloat(output)[I] := Samples[I];
+ 
+     if (Buffer.Size > 0) or (not FinishedGeneration) then
+       Result := paContinue
+     else
+       begin
+         Result := paComplete;
+         { Polled by the main program to know when it may shut down. }
+         FinishedPlaying := True;
+       end;
+   finally
+     LeaveCriticalSection(CriticalSection);
+   end;
+ end;
+ 
+ { Builds the TTS engine for the Chinese MatchaTTS model
+   (matcha-icefall-zh-baker) with the hifigan_v2 vocoder and the
+   phone/date/number rule FSTs.
+   All paths are relative to the current working directory.
+   The caller owns the returned object and must free it. }
+ function GetOfflineTts: TSherpaOnnxOfflineTts;
+ var
+   TtsConfig: TSherpaOnnxOfflineTtsConfig;
+ begin
+   TtsConfig.MaxNumSentences := 1;
+   TtsConfig.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';
+ 
+   { Settings shared by all model types }
+   with TtsConfig.Model do
+     begin
+       NumThreads := 1;
+       Debug := False;
+     end;
+ 
+   { MatchaTTS specific files }
+   with TtsConfig.Model.Matcha do
+     begin
+       AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx';
+       Vocoder := './hifigan_v2.onnx';
+       Lexicon := './matcha-icefall-zh-baker/lexicon.txt';
+       Tokens := './matcha-icefall-zh-baker/tokens.txt';
+       DictDir := './matcha-icefall-zh-baker/dict';
+     end;
+ 
+   Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
+ end;
+ 
+ { Main program.
+ 
+   1. Create the TTS engine and, if needed, a resampler to DeviceSampleRate.
+   2. Initialize PortAudio, pick the output device and open/start a mono
+      float32 output stream driven by PlayCallback.
+   3. Generate speech; GenerateCallback feeds the shared Buffer while
+      PlayCallback drains it.
+   4. Save the complete audio to ./matcha-zh-playback.wav, wait until playback
+      has finished, then release everything. }
+ begin
+   Tts := GetOfflineTts;
+   { Resample only when the model's rate differs from the device rate. }
+   if Tts.GetSampleRate <> DeviceSampleRate then
+     Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
+ 
+   Version := String(Pa_GetVersionText);
+   WriteLn('Version is ', Version);
+   Status := Pa_Initialize;
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
+       Exit;
+     end;
+ 
+   NumDevices := Pa_GetDeviceCount;
+   WriteLn('Num devices: ', NumDevices);
+ 
+   DeviceIndex := Pa_GetDefaultOutputDevice;
+ 
+   if DeviceIndex = paNoDevice then
+     begin
+       WriteLn('No default output device found');
+       Pa_Terminate;
+       Exit;
+     end;
+ 
+   { Optional override of the output device index. If the value is not a valid
+     integer, StrToIntDef keeps the default output device. }
+   EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
+   if EnvStr <> '' then
+     begin
+       DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
+       WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
+     end;
+ 
+   { List all devices; the selected one is marked with '*'. }
+   for I := 0 to (NumDevices - 1) do
+     begin
+       DeviceInfo := Pa_GetDeviceInfo(I);
+       if I = DeviceIndex then
+         { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
+         WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
+       else
+         WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
+     end;
+ 
+   WriteLn('Use device ', DeviceIndex);
+   WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
+   WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
+ 
+   { Mono, 32-bit float output; PlayCallback relies on this layout. }
+   Initialize(Param);
+   Param.Device := DeviceIndex;
+   Param.ChannelCount := 1;
+   Param.SampleFormat := paFloat32;
+   param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
+   param.HostApiSpecificStreamInfo := nil;
+ 
+   { NOTE(review): capacity is presumably in samples, i.e. about 30 seconds
+     at the device rate - confirm against TSherpaOnnxCircularBuffer. }
+   Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
+ 
+ 
+   { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
+   Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
+     PPaStreamCallback(@PlayCallback), nil);
+ 
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
+       Pa_Terminate;
+       Exit;
+     end;
+ 
+   { Must be ready before the stream starts, because both callbacks use it. }
+   InitCriticalSection(CriticalSection);
+ 
+   Status := Pa_StartStream(stream);
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
+       Pa_Terminate;
+       Exit;
+     end;
+ 
+   WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+ 
+   Text := '某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。';
+ 
+   Audio :=  Tts.Generate(Text, SpeakerId, Speed,
+     PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
+   { Tells PlayCallback that no more samples will arrive. }
+   FinishedGeneration := True;
+   SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
+   WriteLn('Saved to ./matcha-zh-playback.wav');
+ 
+   { Poll until PlayCallback has drained the buffer. }
+   while not FinishedPlaying do
+     Pa_Sleep(100);  {sleep for 0.1 second }
+     {TODO(fangjun): Use an event to indicate the play is finished}
+ 
+   DoneCriticalSection(CriticalSection);
+ 
+   FreeAndNil(Tts);
+   FreeAndNil(Resampler);
+ 
+   Status := Pa_CloseStream(stream);
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
+       Exit;
+     end;
+ 
+   Status := Pa_Terminate;
+   if Status <> paNoError then
+     begin
+       WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
+       Exit;
+     end;
+ end.
+ 
--- a/pascal-api-examples/tts/matcha-zh.pas 0 → 100644
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/matcha-zh.pas 0 → 100644
查看文件 @c6fcd32
+ { Copyright (c)  2025  Xiaomi Corporation }
+ program matcha_zh;
+ {
+ This file shows how to use the text to speech API of sherpa-onnx
+ with MatchaTTS models.
+ 
+ It generates speech from text and saves it to a wave file.
+ 
+ If you want to play it while it is generating, please see
+ ./matcha-zh-playback.pas
+ }
+ 
+ {$mode objfpc}
+ 
+ uses
+   SysUtils,
+   sherpa_onnx;
+ 
+ { Builds the TTS engine for the Chinese MatchaTTS model
+   (matcha-icefall-zh-baker) with the hifigan_v2 vocoder and the
+   phone/date/number rule FSTs.
+   All paths are relative to the current working directory.
+   The caller owns the returned object and must free it. }
+ function GetOfflineTts: TSherpaOnnxOfflineTts;
+ var
+   TtsConfig: TSherpaOnnxOfflineTtsConfig;
+ begin
+   TtsConfig.MaxNumSentences := 1;
+   TtsConfig.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';
+ 
+   { Settings shared by all model types }
+   with TtsConfig.Model do
+     begin
+       NumThreads := 1;
+       Debug := False;
+     end;
+ 
+   { MatchaTTS specific files }
+   with TtsConfig.Model.Matcha do
+     begin
+       AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx';
+       Vocoder := './hifigan_v2.onnx';
+       Lexicon := './matcha-icefall-zh-baker/lexicon.txt';
+       Tokens := './matcha-icefall-zh-baker/tokens.txt';
+       DictDir := './matcha-icefall-zh-baker/dict';
+     end;
+ 
+   Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
+ end;
+ 
+ var
+   Tts: TSherpaOnnxOfflineTts;
+   Audio: TSherpaOnnxGeneratedAudio;
+ 
+   Text: AnsiString;
+   Speed: Single = 1.0;  {Use a larger value to speak faster}
+   SpeakerId: Integer = 0;
+ 
+ { Main program: create the engine, synthesize one Chinese sentence with the
+   default speaker and speed, and write the result to ./matcha-zh.wav. }
+ begin
+   Tts := GetOfflineTts;
+ 
+   WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+ 
+   Text := '某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。';
+ 
+   { No callback is passed here, so the audio is only available after
+     Generate returns. }
+   Audio :=  Tts.Generate(Text, SpeakerId, Speed);
+   SherpaOnnxWriteWave('./matcha-zh.wav', Audio.Samples, Audio.SampleRate);
+   WriteLn('Saved to ./matcha-zh.wav');
+ 
+   FreeAndNil(Tts);
+ end.
+ 
--- a/pascal-api-examples/tts/piper-playback.pas
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/piper-playback.pas
查看文件 @c6fcd32
 { Copyright (c)  2024  Xiaomi Corporation }
- program piper;
+ program piper_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with Piper models.
--- a/pascal-api-examples/tts/run-matcha-en-playback.sh 0 → 100755
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/run-matcha-en-playback.sh 0 → 100755
查看文件 @c6fcd32
+ #!/usr/bin/env bash
+ 
+ set -ex
+ 
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+ 
+ echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+ 
+ if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+   mkdir -p ../../build
+   pushd ../../build
+   cmake \
+     -DCMAKE_INSTALL_PREFIX=./install \
+     -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+     -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+     -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+     -DBUILD_SHARED_LIBS=ON \
+     -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+     ..
+ 
+   cmake --build . --target install --config Release
+   popd
+ fi
+ 
+ # Please visit
+ # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+ # to download more models
+ if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+   tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+   rm matcha-icefall-en_US-ljspeech.tar.bz2
+ fi
+ 
+ if [ ! -f ./hifigan_v2.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+ fi
+ 
+ fpc \
+   -dSHERPA_ONNX_USE_SHARED_LIBS \
+   -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+   -Fl$SHERPA_ONNX_DIR/build/install/lib \
+   -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
+   ./matcha-en-playback.pas
+ 
+ # Please see ../portaudio-test/README.md
+ # for how to install portaudio on macOS
+ 
+ export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+ export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+ 
+ ./matcha-en-playback
--- a/pascal-api-examples/tts/run-matcha-en.sh 0 → 100755
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/run-matcha-en.sh 0 → 100755
查看文件 @c6fcd32
+ #!/usr/bin/env bash
+ 
+ set -ex
+ 
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+ 
+ echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+ 
+ if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+   mkdir -p ../../build
+   pushd ../../build
+   cmake \
+     -DCMAKE_INSTALL_PREFIX=./install \
+     -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+     -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+     -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+     -DBUILD_SHARED_LIBS=ON \
+     -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+     ..
+ 
+   cmake --build . --target install --config Release
+   popd
+ fi
+ 
+ # Please visit
+ # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+ # to download more models
+ if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+   tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+   rm matcha-icefall-en_US-ljspeech.tar.bz2
+ fi
+ 
+ if [ ! -f ./hifigan_v2.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+ fi
+ 
+ fpc \
+   -dSHERPA_ONNX_USE_SHARED_LIBS \
+   -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+   -Fl$SHERPA_ONNX_DIR/build/install/lib \
+   ./matcha-en.pas
+ 
+ export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+ export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+ 
+ ./matcha-en
--- a/pascal-api-examples/tts/run-matcha-zh-playback.sh 0 → 100755
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/run-matcha-zh-playback.sh 0 → 100755
查看文件 @c6fcd32
+ #!/usr/bin/env bash
+ 
+ set -ex
+ 
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+ 
+ echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+ 
+ if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+   mkdir -p ../../build
+   pushd ../../build
+   cmake \
+     -DCMAKE_INSTALL_PREFIX=./install \
+     -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+     -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+     -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+     -DBUILD_SHARED_LIBS=ON \
+     -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+     ..
+ 
+   cmake --build . --target install --config Release
+   popd
+ fi
+ 
+ # please visit
+ # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+ # to download more models
+ if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+   tar xvf matcha-icefall-zh-baker.tar.bz2
+   rm matcha-icefall-zh-baker.tar.bz2
+ fi
+ 
+ if [ ! -f ./hifigan_v2.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+ fi
+ 
+ fpc \
+   -dSHERPA_ONNX_USE_SHARED_LIBS \
+   -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+   -Fl$SHERPA_ONNX_DIR/build/install/lib \
+   -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
+   ./matcha-zh-playback.pas
+ 
+ # Please see ../portaudio-test/README.md
+ # for how to install portaudio on macOS
+ 
+ export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+ export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+ 
+ ./matcha-zh-playback
--- a/pascal-api-examples/tts/run-matcha-zh.sh 0 → 100755
查看文件 @c6fcd32
+++ b/pascal-api-examples/tts/run-matcha-zh.sh 0 → 100755
查看文件 @c6fcd32
+ #!/usr/bin/env bash
+ 
+ set -ex
+ 
+ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+ 
+ echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+ 
+ if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+   mkdir -p ../../build
+   pushd ../../build
+   cmake \
+     -DCMAKE_INSTALL_PREFIX=./install \
+     -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+     -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+     -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+     -DBUILD_SHARED_LIBS=ON \
+     -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+     ..
+ 
+   cmake --build . --target install --config Release
+   popd
+ fi
+ 
+ # please visit
+ # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+ # to download more models
+ if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+   tar xvf matcha-icefall-zh-baker.tar.bz2
+   rm matcha-icefall-zh-baker.tar.bz2
+ fi
+ 
+ if [ ! -f ./hifigan_v2.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+ fi
+ 
+ fpc \
+   -dSHERPA_ONNX_USE_SHARED_LIBS \
+   -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+   -Fl$SHERPA_ONNX_DIR/build/install/lib \
+   ./matcha-zh.pas
+ 
+ export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+ export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+ 
+ ./matcha-zh
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
查看文件 @c6fcd32
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
查看文件 @c6fcd32
@@ -62,11 +62,26 @@ type
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
   end;
 
+   TSherpaOnnxOfflineTtsMatchaModelConfig = record  { Pascal-side settings for a MatchaTTS model }
+     AcousticModel: AnsiString;  { Path to the acoustic model (.onnx) }
+     Vocoder: AnsiString;        { Path to the vocoder model (.onnx) }
+     Lexicon: AnsiString;        { Path to lexicon.txt; may be left empty }
+     Tokens: AnsiString;         { Path to tokens.txt }
+     DataDir: AnsiString;        { Path to an espeak-ng-data directory; may be left empty }
+     NoiseScale: Single;         { Set to 0.667 by the Initialize operator }
+     LengthScale: Single;        { Set to 1.0 by the Initialize operator }
+     DictDir: AnsiString;        { Path to a dict directory; may be left empty }
+ 
+     function ToString: AnsiString;  { Human-readable dump of all fields }
+     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
+   end;
+ 
   TSherpaOnnxOfflineTtsModelConfig = record
     Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
     NumThreads: Integer;
     Debug: Boolean;
     Provider: AnsiString;
+     Matcha: TSherpaOnnxOfflineTtsMatchaModelConfig;
 
     function ToString: AnsiString;
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
@@ -713,11 +728,23 @@ type
     DictDir: PAnsiChar;
   end;
 
+   SherpaOnnxOfflineTtsMatchaModelConfig = record  { C-typed counterpart of TSherpaOnnxOfflineTtsMatchaModelConfig }
+     AcousticModel: PAnsiChar;  { NOTE(review): field order/types presumably mirror the C API struct - do not reorder }
+     Vocoder: PAnsiChar;
+     Lexicon: PAnsiChar;
+     Tokens: PAnsiChar;
+     DataDir: PAnsiChar;
+     NoiseScale: cfloat;
+     LengthScale: cfloat;
+     DictDir: PAnsiChar;
+   end;
+ 
   SherpaOnnxOfflineTtsModelConfig = record
     Vits: SherpaOnnxOfflineTtsVitsModelConfig;
     NumThreads: cint32;
     Debug: cint32;
     Provider: PAnsiChar;
+     Matcha: SherpaOnnxOfflineTtsMatchaModelConfig;
   end;
 
   SherpaOnnxOfflineTtsConfig = record
@@ -1853,15 +1880,40 @@ begin
   Dest.LengthScale := 1.0;
 end;
 
+ function TSherpaOnnxOfflineTtsMatchaModelConfig.ToString: AnsiString;
+ begin  { Returns all fields as 'Name := value' pairs; the two scales are printed with two decimals }
+   Result := Format('TSherpaOnnxOfflineTtsMatchaModelConfig(' +
+     'AcousticModel := %s, ' +
+     'Vocoder := %s, ' +
+     'Lexicon := %s, ' +
+     'Tokens := %s, ' +
+     'DataDir := %s, ' +
+     'NoiseScale := %.2f, ' +
+     'LengthScale := %.2f, ' +
+     'DictDir := %s' +
+     ')',
+     [Self.AcousticModel, Self.Vocoder, Self.Lexicon, Self.Tokens,
+      Self.DataDir, Self.NoiseScale, Self.LengthScale, Self.DictDir
+     ]);
+ end;
+ 
+ class operator TSherpaOnnxOfflineTtsMatchaModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
+ begin  { Management operator: assigns the defaults below; the string fields are not touched here }
+   Dest.NoiseScale := 0.667;  { default noise scale }
+   Dest.LengthScale := 1.0;   { default length scale }
+ end;
+ 
 function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
 begin
   Result := Format('TSherpaOnnxOfflineTtsModelConfig(' +
     'Vits := %s, ' +
     'NumThreads := %d, ' +
     'Debug := %s, ' +
-     'Provider := %s' +
+     'Provider := %s, ' +
+     'Matcha := %s' +
     ')',
-     [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider
+     [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider,
+      Self.Matcha.ToString
     ]);
 end;
 
@@ -1905,6 +1957,15 @@ begin
   C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;
   C.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir);
 
+   C.Model.Matcha.AcousticModel := PAnsiChar(Config.Model.Matcha.AcousticModel);
+   C.Model.Matcha.Vocoder := PAnsiChar(Config.Model.Matcha.Vocoder);
+   C.Model.Matcha.Lexicon := PAnsiChar(Config.Model.Matcha.Lexicon);
+   C.Model.Matcha.Tokens := PAnsiChar(Config.Model.Matcha.Tokens);
+   C.Model.Matcha.DataDir := PAnsiChar(Config.Model.Matcha.DataDir);
+   C.Model.Matcha.NoiseScale := Config.Model.Matcha.NoiseScale;
+   C.Model.Matcha.LengthScale := Config.Model.Matcha.LengthScale;
+   C.Model.Matcha.DictDir := PAnsiChar(Config.Model.Matcha.DictDir);
+ 
   C.Model.NumThreads := Config.Model.NumThreads;
   C.Model.Provider := PAnsiChar(Config.Model.Provider);
   C.Model.Debug := Ord(Config.Model.Debug);