Fangjun Kuang
Committed by GitHub

Add Pascal API for MatchaTTS models. (#1686)

@@ -152,6 +152,19 @@ jobs: @@ -152,6 +152,19 @@ jobs:
152 152
153 ./run-piper.sh 153 ./run-piper.sh
154 rm -rf vits-piper-* 154 rm -rf vits-piper-*
  155 + rm piper
  156 + ls -lh
  157 + echo "---"
  158 +
  159 + ./run-matcha-zh.sh
  160 + rm -rf matcha-icefall-*
  161 + rm matcha-zh
  162 + ls -lh
  163 + echo "---"
  164 +
  165 + ./run-matcha-en.sh
  166 + rm -rf matcha-icefall-*
  167 + rm matcha-en
155 ls -lh 168 ls -lh
156 echo "---" 169 echo "---"
157 170
@@ -2,3 +2,7 @@ @@ -2,3 +2,7 @@
2 piper 2 piper
3 piper-playback 3 piper-playback
4 link*.res 4 link*.res
  5 +matcha-zh
  6 +matcha-en
  7 +matcha-zh-playback
  8 +matcha-en-playback
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +program matcha_en_playback;
  3 +{
  4 +This file shows how to use the text to speech API of sherpa-onnx
  5 +with MatchaTTS models.
  6 +
  7 +It generates speech from text and saves it to a wave file.
  8 +
  9 +Note that it plays the audio back as it is still generating.
  10 +}
  11 +
  12 +{$mode objfpc}
  13 +
  14 +uses
  15 + {$ifdef unix}
  16 + cthreads,
  17 + {$endif}
  18 + SysUtils,
  19 + dos,
  20 + ctypes,
  21 + portaudio,
  22 + sherpa_onnx;
  23 +
  24 +var
  25 + CriticalSection: TRTLCriticalSection;
  26 +
  27 + Tts: TSherpaOnnxOfflineTts;
  28 + Audio: TSherpaOnnxGeneratedAudio;
  29 + Resampler: TSherpaOnnxLinearResampler;
  30 +
  31 + Text: AnsiString;
  32 + Speed: Single = 1.0; {Use a larger value to speak faster}
  33 + SpeakerId: Integer = 0;
  34 + Buffer: TSherpaOnnxCircularBuffer;
  35 + FinishedGeneration: Boolean = False;
  36 + FinishedPlaying: Boolean = False;
  37 +
  38 + Version: String;
  39 + EnvStr: String;
  40 + Status: Integer;
  41 + NumDevices: Integer;
  42 + DeviceIndex: Integer;
  43 + DeviceInfo: PPaDeviceInfo;
  44 +
  45 + { If you get EDivByZero: Division by zero error, please change the sample rate
  46 + to the one supported by your output device (speaker).
  47 + }
  48 + DeviceSampleRate: Integer = 48000;
  49 + I: Integer;
  50 + Param: TPaStreamParameters;
  51 + Stream: PPaStream;
  52 + Wave: TSherpaOnnxWave;
  53 +
  54 +function GenerateCallback(
  55 + Samples: pcfloat; N: cint32;
  56 + Arg: Pointer): cint; cdecl;
  57 +begin
  58 + EnterCriticalSection(CriticalSection);
  59 + try
  60 + if Resampler <> nil then
  61 + Buffer.Push(Resampler.Resample(Samples, N, False))
  62 + else
  63 + Buffer.Push(Samples, N);
  64 + finally
  65 + LeaveCriticalSection(CriticalSection);
  66 + end;
  67 +
  68 + { 1 means to continue generating; 0 means to stop generating. }
  69 + Result := 1;
  70 +end;
  71 +
  72 +function PlayCallback(
  73 + input: Pointer; output: Pointer;
  74 + frameCount: culong;
  75 + timeInfo: PPaStreamCallbackTimeInfo;
  76 + statusFlags: TPaStreamCallbackFlags;
  77 + userData: Pointer ): cint; cdecl;
  78 +var
  79 + Samples: TSherpaOnnxSamplesArray;
  80 + I: Integer;
  81 +begin
  82 + EnterCriticalSection(CriticalSection);
  83 + try
  84 + if Buffer.Size >= frameCount then
  85 + begin
  86 + Samples := Buffer.Get(Buffer.Head, FrameCount);
  87 + Buffer.Pop(FrameCount);
  88 + end
  89 + else if Buffer.Size > 0 then
  90 + begin
  91 + Samples := Buffer.Get(Buffer.Head, Buffer.Size);
  92 + Buffer.Pop(Buffer.Size);
  93 + SetLength(Samples, frameCount);
  94 + end
  95 + else
  96 + SetLength(Samples, frameCount);
  97 +
  98 + for I := 0 to frameCount - 1 do
  99 + pcfloat(output)[I] := Samples[I];
  100 +
  101 + if (Buffer.Size > 0) or (not FinishedGeneration) then
  102 + Result := paContinue
  103 + else
  104 + begin
  105 + Result := paComplete;
  106 + FinishedPlaying := True;
  107 + end;
  108 + finally
  109 + LeaveCriticalSection(CriticalSection);
  110 + end;
  111 +end;
  112 +
  113 +function GetOfflineTts: TSherpaOnnxOfflineTts;
  114 +var
  115 + Config: TSherpaOnnxOfflineTtsConfig;
  116 +begin
  117 + Config.Model.Matcha.AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx';
  118 + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  119 + Config.Model.Matcha.Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt';
  120 + Config.Model.Matcha.DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data';
  121 + Config.Model.NumThreads := 1;
  122 + Config.Model.Debug := False;
  123 + Config.MaxNumSentences := 1;
  124 +
  125 + Result := TSherpaOnnxOfflineTts.Create(Config);
  126 +end;
  127 +
  128 +begin
  129 + Tts := GetOfflineTts;
  130 + if Tts.GetSampleRate <> DeviceSampleRate then
  131 + Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
  132 +
  133 + Version := String(Pa_GetVersionText);
  134 + WriteLn('Version is ', Version);
  135 + Status := Pa_Initialize;
  136 + if Status <> paNoError then
  137 + begin
  138 + WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
  139 + Exit;
  140 + end;
  141 +
  142 + NumDevices := Pa_GetDeviceCount;
  143 + WriteLn('Num devices: ', NumDevices);
  144 +
  145 + DeviceIndex := Pa_GetDefaultOutputDevice;
  146 +
  147 + if DeviceIndex = paNoDevice then
  148 + begin
  149 + WriteLn('No default output device found');
  150 + Pa_Terminate;
  151 + Exit;
  152 + end;
  153 +
  154 + EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  155 + if EnvStr <> '' then
  156 + begin
  157 + DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
  158 + WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  159 + end;
  160 +
  161 + for I := 0 to (NumDevices - 1) do
  162 + begin
  163 + DeviceInfo := Pa_GetDeviceInfo(I);
  164 + if I = DeviceIndex then
  165 + { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
  166 + WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
  167 + else
  168 + WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  169 + end;
  170 +
  171 + WriteLn('Use device ', DeviceIndex);
  172 + WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  173 + WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
  174 +
  175 + Initialize(Param);
  176 + Param.Device := DeviceIndex;
  177 + Param.ChannelCount := 1;
  178 + Param.SampleFormat := paFloat32;
  179 + param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  180 + param.HostApiSpecificStreamInfo := nil;
  181 +
  182 + Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
  183 +
  184 +
  185 + { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  186 + Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
  187 + PPaStreamCallback(@PlayCallback), nil);
  188 +
  189 + if Status <> paNoError then
  190 + begin
  191 + WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
  192 + Pa_Terminate;
  193 + Exit;
  194 + end;
  195 +
  196 + InitCriticalSection(CriticalSection);
  197 +
  198 + Status := Pa_StartStream(stream);
  199 + if Status <> paNoError then
  200 + begin
  201 + WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
  202 + Pa_Terminate;
  203 + Exit;
  204 + end;
  205 +
  206 + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  207 +
  208 + Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
  209 +
  210 + Audio := Tts.Generate(Text, SpeakerId, Speed,
  211 + PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  212 + FinishedGeneration := True;
  213 + SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
  214 + WriteLn('Saved to ./matcha-zh-playback.wav');
  215 +
  216 + while not FinishedPlaying do
  217 + Pa_Sleep(100); {sleep for 0.1 second }
  218 + {TODO(fangjun): Use an event to indicate the play is finished}
  219 +
  220 + DoneCriticalSection(CriticalSection);
  221 +
  222 + FreeAndNil(Tts);
  223 + FreeAndNil(Resampler);
  224 +
  225 + Status := Pa_CloseStream(stream);
  226 + if Status <> paNoError then
  227 + begin
  228 + WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
  229 + Exit;
  230 + end;
  231 +
  232 + Status := Pa_Terminate;
  233 + if Status <> paNoError then
  234 + begin
  235 + WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
  236 + Exit;
  237 + end;
  238 +end.
  239 +
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +program matcha_en;
  3 +{
  4 +This file shows how to use the text to speech API of sherpa-onnx
  5 +with MatchaTTS models.
  6 +
  7 +It generates speech from text and saves it to a wave file.
  8 +
  9 +If you want to play it while it is generating, please see
  10 +./matcha-en-playback.pas
  11 +}
  12 +
  13 +{$mode objfpc}
  14 +
  15 +uses
  16 + SysUtils,
  17 + sherpa_onnx;
  18 +
  19 +function GetOfflineTts: TSherpaOnnxOfflineTts;
  20 +var
  21 + Config: TSherpaOnnxOfflineTtsConfig;
  22 +begin
  23 + Config.Model.Matcha.AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx';
  24 + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  25 + Config.Model.Matcha.Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt';
  26 + Config.Model.Matcha.DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data';
  27 + Config.Model.NumThreads := 1;
  28 + Config.Model.Debug := False;
  29 + Config.MaxNumSentences := 1;
  30 +
  31 + Result := TSherpaOnnxOfflineTts.Create(Config);
  32 +end;
  33 +
  34 +var
  35 + Tts: TSherpaOnnxOfflineTts;
  36 + Audio: TSherpaOnnxGeneratedAudio;
  37 +
  38 + Text: AnsiString;
  39 + Speed: Single = 1.0; {Use a larger value to speak faster}
  40 + SpeakerId: Integer = 0;
  41 +
  42 +begin
  43 + Tts := GetOfflineTts;
  44 +
  45 + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  46 +
  47 + Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
  48 +
  49 + Audio := Tts.Generate(Text, SpeakerId, Speed);
  50 + SherpaOnnxWriteWave('./matcha-en.wav', Audio.Samples, Audio.SampleRate);
  51 + WriteLn('Saved to ./matcha-en.wav');
  52 +
  53 + FreeAndNil(Tts);
  54 +end.
  55 +
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +program matcha_zh_playback;
  3 +{
  4 +This file shows how to use the text to speech API of sherpa-onnx
  5 +with MatchaTTS models.
  6 +
  7 +It generates speech from text and saves it to a wave file.
  8 +
  9 +Note that it plays the audio back as it is still generating.
  10 +}
  11 +
  12 +{$mode objfpc}
  13 +
  14 +uses
  15 + {$ifdef unix}
  16 + cthreads,
  17 + {$endif}
  18 + SysUtils,
  19 + dos,
  20 + ctypes,
  21 + portaudio,
  22 + sherpa_onnx;
  23 +
  24 +var
  25 + CriticalSection: TRTLCriticalSection;
  26 +
  27 + Tts: TSherpaOnnxOfflineTts;
  28 + Audio: TSherpaOnnxGeneratedAudio;
  29 + Resampler: TSherpaOnnxLinearResampler;
  30 +
  31 + Text: AnsiString;
  32 + Speed: Single = 1.0; {Use a larger value to speak faster}
  33 + SpeakerId: Integer = 0;
  34 + Buffer: TSherpaOnnxCircularBuffer;
  35 + FinishedGeneration: Boolean = False;
  36 + FinishedPlaying: Boolean = False;
  37 +
  38 + Version: String;
  39 + EnvStr: String;
  40 + Status: Integer;
  41 + NumDevices: Integer;
  42 + DeviceIndex: Integer;
  43 + DeviceInfo: PPaDeviceInfo;
  44 +
  45 + { If you get EDivByZero: Division by zero error, please change the sample rate
  46 + to the one supported by your output device (speaker).
  47 + }
  48 + DeviceSampleRate: Integer = 48000;
  49 + I: Integer;
  50 + Param: TPaStreamParameters;
  51 + Stream: PPaStream;
  52 + Wave: TSherpaOnnxWave;
  53 +
  54 +function GenerateCallback(
  55 + Samples: pcfloat; N: cint32;
  56 + Arg: Pointer): cint; cdecl;
  57 +begin
  58 + EnterCriticalSection(CriticalSection);
  59 + try
  60 + if Resampler <> nil then
  61 + Buffer.Push(Resampler.Resample(Samples, N, False))
  62 + else
  63 + Buffer.Push(Samples, N);
  64 + finally
  65 + LeaveCriticalSection(CriticalSection);
  66 + end;
  67 +
  68 + { 1 means to continue generating; 0 means to stop generating. }
  69 + Result := 1;
  70 +end;
  71 +
  72 +function PlayCallback(
  73 + input: Pointer; output: Pointer;
  74 + frameCount: culong;
  75 + timeInfo: PPaStreamCallbackTimeInfo;
  76 + statusFlags: TPaStreamCallbackFlags;
  77 + userData: Pointer ): cint; cdecl;
  78 +var
  79 + Samples: TSherpaOnnxSamplesArray;
  80 + I: Integer;
  81 +begin
  82 + EnterCriticalSection(CriticalSection);
  83 + try
  84 + if Buffer.Size >= frameCount then
  85 + begin
  86 + Samples := Buffer.Get(Buffer.Head, FrameCount);
  87 + Buffer.Pop(FrameCount);
  88 + end
  89 + else if Buffer.Size > 0 then
  90 + begin
  91 + Samples := Buffer.Get(Buffer.Head, Buffer.Size);
  92 + Buffer.Pop(Buffer.Size);
  93 + SetLength(Samples, frameCount);
  94 + end
  95 + else
  96 + SetLength(Samples, frameCount);
  97 +
  98 + for I := 0 to frameCount - 1 do
  99 + pcfloat(output)[I] := Samples[I];
  100 +
  101 + if (Buffer.Size > 0) or (not FinishedGeneration) then
  102 + Result := paContinue
  103 + else
  104 + begin
  105 + Result := paComplete;
  106 + FinishedPlaying := True;
  107 + end;
  108 + finally
  109 + LeaveCriticalSection(CriticalSection);
  110 + end;
  111 +end;
  112 +
  113 +function GetOfflineTts: TSherpaOnnxOfflineTts;
  114 +var
  115 + Config: TSherpaOnnxOfflineTtsConfig;
  116 +begin
  117 + Config.Model.Matcha.AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx';
  118 + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  119 + Config.Model.Matcha.Lexicon := './matcha-icefall-zh-baker/lexicon.txt';
  120 + Config.Model.Matcha.Tokens := './matcha-icefall-zh-baker/tokens.txt';
  121 + Config.Model.Matcha.DictDir := './matcha-icefall-zh-baker/dict';
  122 + Config.Model.NumThreads := 1;
  123 + Config.Model.Debug := False;
  124 + Config.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';
  125 + Config.MaxNumSentences := 1;
  126 +
  127 + Result := TSherpaOnnxOfflineTts.Create(Config);
  128 +end;
  129 +
  130 +begin
  131 + Tts := GetOfflineTts;
  132 + if Tts.GetSampleRate <> DeviceSampleRate then
  133 + Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
  134 +
  135 + Version := String(Pa_GetVersionText);
  136 + WriteLn('Version is ', Version);
  137 + Status := Pa_Initialize;
  138 + if Status <> paNoError then
  139 + begin
  140 + WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
  141 + Exit;
  142 + end;
  143 +
  144 + NumDevices := Pa_GetDeviceCount;
  145 + WriteLn('Num devices: ', NumDevices);
  146 +
  147 + DeviceIndex := Pa_GetDefaultOutputDevice;
  148 +
  149 + if DeviceIndex = paNoDevice then
  150 + begin
  151 + WriteLn('No default output device found');
  152 + Pa_Terminate;
  153 + Exit;
  154 + end;
  155 +
  156 + EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  157 + if EnvStr <> '' then
  158 + begin
  159 + DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
  160 + WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  161 + end;
  162 +
  163 + for I := 0 to (NumDevices - 1) do
  164 + begin
  165 + DeviceInfo := Pa_GetDeviceInfo(I);
  166 + if I = DeviceIndex then
  167 + { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
  168 + WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
  169 + else
  170 + WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  171 + end;
  172 +
  173 + WriteLn('Use device ', DeviceIndex);
  174 + WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  175 + WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
  176 +
  177 + Initialize(Param);
  178 + Param.Device := DeviceIndex;
  179 + Param.ChannelCount := 1;
  180 + Param.SampleFormat := paFloat32;
  181 + param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  182 + param.HostApiSpecificStreamInfo := nil;
  183 +
  184 + Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
  185 +
  186 +
  187 + { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  188 + Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
  189 + PPaStreamCallback(@PlayCallback), nil);
  190 +
  191 + if Status <> paNoError then
  192 + begin
  193 + WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
  194 + Pa_Terminate;
  195 + Exit;
  196 + end;
  197 +
  198 + InitCriticalSection(CriticalSection);
  199 +
  200 + Status := Pa_StartStream(stream);
  201 + if Status <> paNoError then
  202 + begin
  203 + WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
  204 + Pa_Terminate;
  205 + Exit;
  206 + end;
  207 +
  208 + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  209 +
  210 + Text := '某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。20241231号,拨打110或者18920240511123456块钱。';
  211 +
  212 + Audio := Tts.Generate(Text, SpeakerId, Speed,
  213 + PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  214 + FinishedGeneration := True;
  215 + SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
  216 + WriteLn('Saved to ./matcha-zh-playback.wav');
  217 +
  218 + while not FinishedPlaying do
  219 + Pa_Sleep(100); {sleep for 0.1 second }
  220 + {TODO(fangjun): Use an event to indicate the play is finished}
  221 +
  222 + DoneCriticalSection(CriticalSection);
  223 +
  224 + FreeAndNil(Tts);
  225 + FreeAndNil(Resampler);
  226 +
  227 + Status := Pa_CloseStream(stream);
  228 + if Status <> paNoError then
  229 + begin
  230 + WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
  231 + Exit;
  232 + end;
  233 +
  234 + Status := Pa_Terminate;
  235 + if Status <> paNoError then
  236 + begin
  237 + WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
  238 + Exit;
  239 + end;
  240 +end.
  241 +
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +program matcha_zh;
  3 +{
  4 +This file shows how to use the text to speech API of sherpa-onnx
  5 +with MatchaTTS models.
  6 +
  7 +It generates speech from text and saves it to a wave file.
  8 +
  9 +If you want to play it while it is generating, please see
  10 +./matcha-zh-playback.pas
  11 +}
  12 +
  13 +{$mode objfpc}
  14 +
  15 +uses
  16 + SysUtils,
  17 + sherpa_onnx;
  18 +
  19 +function GetOfflineTts: TSherpaOnnxOfflineTts;
  20 +var
  21 + Config: TSherpaOnnxOfflineTtsConfig;
  22 +begin
  23 + Config.Model.Matcha.AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx';
  24 + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  25 + Config.Model.Matcha.Lexicon := './matcha-icefall-zh-baker/lexicon.txt';
  26 + Config.Model.Matcha.Tokens := './matcha-icefall-zh-baker/tokens.txt';
  27 + Config.Model.Matcha.DictDir := './matcha-icefall-zh-baker/dict';
  28 + Config.Model.NumThreads := 1;
  29 + Config.Model.Debug := False;
  30 + Config.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';
  31 + Config.MaxNumSentences := 1;
  32 +
  33 + Result := TSherpaOnnxOfflineTts.Create(Config);
  34 +end;
  35 +
  36 +var
  37 + Tts: TSherpaOnnxOfflineTts;
  38 + Audio: TSherpaOnnxGeneratedAudio;
  39 +
  40 + Text: AnsiString;
  41 + Speed: Single = 1.0; {Use a larger value to speak faster}
  42 + SpeakerId: Integer = 0;
  43 +
  44 +begin
  45 + Tts := GetOfflineTts;
  46 +
  47 + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  48 +
  49 + Text := '某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。20241231号,拨打110或者18920240511123456块钱。';
  50 +
  51 + Audio := Tts.Generate(Text, SpeakerId, Speed);
  52 + SherpaOnnxWriteWave('./matcha-zh.wav', Audio.Samples, Audio.SampleRate);
  53 + WriteLn('Saved to ./matcha-zh.wav');
  54 +
  55 + FreeAndNil(Tts);
  56 +end.
  57 +
1 { Copyright (c) 2024 Xiaomi Corporation } 1 { Copyright (c) 2024 Xiaomi Corporation }
2 -program piper; 2 +program piper_playback;
3 { 3 {
4 This file shows how to use the text to speech API of sherpa-onnx 4 This file shows how to use the text to speech API of sherpa-onnx
5 with Piper models. 5 with Piper models.
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +# please visit
  27 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  28 +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  29 +# to download more models
  30 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  32 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  33 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  34 +fi
  35 +
  36 +if [ ! -f ./hifigan_v2.onnx ]; then
  37 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  38 +fi
  39 +
  40 +fpc \
  41 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  42 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  43 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  44 + -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
  45 + ./matcha-en-playback.pas
  46 +
  47 +# Please see ../portaudio-test/README.md
  48 +# for how to install portaudio on macOS
  49 +
  50 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  51 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  52 +
  53 +./matcha-en-playback
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +# please visit
  27 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  28 +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  29 +# to download more models
  30 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  32 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  33 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  34 +fi
  35 +
  36 +if [ ! -f ./hifigan_v2.onnx ]; then
  37 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  38 +fi
  39 +
  40 +fpc \
  41 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  42 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  43 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  44 + ./matcha-en.pas
  45 +
  46 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  47 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  48 +
  49 +./matcha-en
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +# please visit
  27 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  28 +# to download more models
  29 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  30 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  31 + tar xvf matcha-icefall-zh-baker.tar.bz2
  32 + rm matcha-icefall-zh-baker.tar.bz2
  33 +fi
  34 +
  35 +if [ ! -f ./hifigan_v2.onnx ]; then
  36 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  37 +fi
  38 +
  39 +fpc \
  40 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  41 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  42 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  43 + -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
  44 + ./matcha-zh-playback.pas
  45 +
  46 +# Please see ../portaudio-test/README.md
  47 +# for how to install portaudio on macOS
  48 +
  49 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  50 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  51 +
  52 +./matcha-zh-playback
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +# please visit
  27 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  28 +# to download more models
  29 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  30 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  31 + tar xvf matcha-icefall-zh-baker.tar.bz2
  32 + rm matcha-icefall-zh-baker.tar.bz2
  33 +fi
  34 +
  35 +if [ ! -f ./hifigan_v2.onnx ]; then
  36 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  37 +fi
  38 +
  39 +fpc \
  40 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  41 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  42 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  43 + ./matcha-zh.pas
  44 +
  45 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  46 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  47 +
  48 +./matcha-zh
@@ -62,11 +62,26 @@ type @@ -62,11 +62,26 @@ type
62 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig); 62 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
63 end; 63 end;
64 64
  65 + TSherpaOnnxOfflineTtsMatchaModelConfig = record { Pascal-side settings for a Matcha TTS model }
  66 + AcousticModel: AnsiString; { path to the acoustic model file }
  67 + Vocoder: AnsiString; { path to the vocoder model file }
  68 + Lexicon: AnsiString; { path to a lexicon file }
  69 + Tokens: AnsiString; { path to the tokens file }
  70 + DataDir: AnsiString; { data directory; NOTE(review): presumably espeak-ng data - confirm }
  71 + NoiseScale: Single; { set to 0.667 by the Initialize operator }
  72 + LengthScale: Single; { set to 1.0 by the Initialize operator }
  73 + DictDir: AnsiString; { dictionary directory }
  74 +
  75 + function ToString: AnsiString; { formats every field into a single string }
  76 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig); { assigns the NoiseScale/LengthScale defaults }
  77 + end;
  78 +
65 TSherpaOnnxOfflineTtsModelConfig = record 79 TSherpaOnnxOfflineTtsModelConfig = record
66 Vits: TSherpaOnnxOfflineTtsVitsModelConfig; 80 Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
67 NumThreads: Integer; 81 NumThreads: Integer;
68 Debug: Boolean; 82 Debug: Boolean;
69 Provider: AnsiString; 83 Provider: AnsiString;
  84 + Matcha: TSherpaOnnxOfflineTtsMatchaModelConfig;
70 85
71 function ToString: AnsiString; 86 function ToString: AnsiString;
72 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig); 87 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
@@ -713,11 +728,23 @@ type @@ -713,11 +728,23 @@ type
713 DictDir: PAnsiChar; 728 DictDir: PAnsiChar;
714 end; 729 end;
715 730
  731 + SherpaOnnxOfflineTtsMatchaModelConfig = record { C-typed counterpart of TSherpaOnnxOfflineTtsMatchaModelConfig }
  732 + AcousticModel: PAnsiChar; { NOTE(review): field order presumably must match the C API struct - confirm against the C header }
  733 + Vocoder: PAnsiChar;
  734 + Lexicon: PAnsiChar;
  735 + Tokens: PAnsiChar;
  736 + DataDir: PAnsiChar;
  737 + NoiseScale: cfloat;
  738 + LengthScale: cfloat;
  739 + DictDir: PAnsiChar;
  740 + end;
  741 +
716 SherpaOnnxOfflineTtsModelConfig = record 742 SherpaOnnxOfflineTtsModelConfig = record
717 Vits: SherpaOnnxOfflineTtsVitsModelConfig; 743 Vits: SherpaOnnxOfflineTtsVitsModelConfig;
718 NumThreads: cint32; 744 NumThreads: cint32;
719 Debug: cint32; 745 Debug: cint32;
720 Provider: PAnsiChar; 746 Provider: PAnsiChar;
  747 + Matcha: SherpaOnnxOfflineTtsMatchaModelConfig;
721 end; 748 end;
722 749
723 SherpaOnnxOfflineTtsConfig = record 750 SherpaOnnxOfflineTtsConfig = record
@@ -1853,15 +1880,40 @@ begin @@ -1853,15 +1880,40 @@ begin
1853 Dest.LengthScale := 1.0; 1880 Dest.LengthScale := 1.0;
1854 end; 1881 end;
1855 1882
  1883 +function TSherpaOnnxOfflineTtsMatchaModelConfig.ToString: AnsiString;
  1884 +begin
  1885 + Result := Format('TSherpaOnnxOfflineTtsMatchaModelConfig(' +
  1886 + 'AcousticModel := %s, ' +
  1887 + 'Vocoder := %s, ' +
  1888 + 'Lexicon := %s, ' +
  1889 + 'Tokens := %s, ' +
  1890 + 'DataDir := %s, ' +
  1891 + 'NoiseScale := %.2f, ' +
  1892 + 'LengthScale := %.2f, ' +
  1893 + 'DictDir := %s' +
  1894 + ')',
  1895 + [Self.AcousticModel, Self.Vocoder, Self.Lexicon, Self.Tokens,
  1896 + Self.DataDir, Self.NoiseScale, Self.LengthScale, Self.DictDir
  1897 + ]);
  1898 +end;
  1899 +
  1900 +class operator TSherpaOnnxOfflineTtsMatchaModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
  1901 +begin
  1902 + Dest.NoiseScale := 0.667;
  1903 + Dest.LengthScale := 1.0;
  1904 +end;
  1905 +
1856 function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString; 1906 function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
1857 begin 1907 begin
1858 Result := Format('TSherpaOnnxOfflineTtsModelConfig(' + 1908 Result := Format('TSherpaOnnxOfflineTtsModelConfig(' +
1859 'Vits := %s, ' + 1909 'Vits := %s, ' +
1860 'NumThreads := %d, ' + 1910 'NumThreads := %d, ' +
1861 'Debug := %s, ' + 1911 'Debug := %s, ' +
1862 - 'Provider := %s' + 1912 + 'Provider := %s, ' +
  1913 + 'Matcha := %s' +
1863 ')', 1914 ')',
1864 - [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider 1915 + [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider,
  1916 + Self.Matcha.ToString
1865 ]); 1917 ]);
1866 end; 1918 end;
1867 1919
@@ -1905,6 +1957,15 @@ begin @@ -1905,6 +1957,15 @@ begin
1905 C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale; 1957 C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;
1906 C.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir); 1958 C.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir);
1907 1959
  1960 + C.Model.Matcha.AcousticModel := PAnsiChar(Config.Model.Matcha.AcousticModel);
  1961 + C.Model.Matcha.Vocoder := PAnsiChar(Config.Model.Matcha.Vocoder);
  1962 + C.Model.Matcha.Lexicon := PAnsiChar(Config.Model.Matcha.Lexicon);
  1963 + C.Model.Matcha.Tokens := PAnsiChar(Config.Model.Matcha.Tokens);
  1964 + C.Model.Matcha.DataDir := PAnsiChar(Config.Model.Matcha.DataDir);
  1965 + C.Model.Matcha.NoiseScale := Config.Model.Matcha.NoiseScale;
  1966 + C.Model.Matcha.LengthScale := Config.Model.Matcha.LengthScale;
  1967 + C.Model.Matcha.DictDir := PAnsiChar(Config.Model.Matcha.DictDir);
  1968 +
1908 C.Model.NumThreads := Config.Model.NumThreads; 1969 C.Model.NumThreads := Config.Model.NumThreads;
1909 C.Model.Provider := PAnsiChar(Config.Model.Provider); 1970 C.Model.Provider := PAnsiChar(Config.Model.Provider);
1910 C.Model.Debug := Ord(Config.Model.Debug); 1971 C.Model.Debug := Ord(Config.Model.Debug);