Fangjun Kuang
Committed by GitHub

Text to speech API for Object Pascal. (#1273)

@@ -119,13 +119,29 @@ jobs: @@ -119,13 +119,29 @@ jobs:
119 cp -v install/lib/*.dll ../pascal-api-examples/vad 119 cp -v install/lib/*.dll ../pascal-api-examples/vad
120 cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr 120 cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
121 121
122 - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav  
123 - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr  
124 - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr  
125 - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad  
126 - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr 122 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav
  123 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr
  124 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr
  125 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad
  126 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
  127 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
127 fi 128 fi
128 129
  130 + - name: Run Pascal test (TTS)
  131 + shell: bash
  132 + run: |
  133 + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
  134 +
  135 + cd ./pascal-api-examples
  136 + pushd tts
  137 +
  138 + ./run-piper.sh
  139 + rm -rf vits-piper-*
  140 + ls -lh
  141 + echo "---"
  142 +
  143 + popd
  144 +
129 - name: Run Pascal test (VAD + non-streaming ASR) 145 - name: Run Pascal test (VAD + non-streaming ASR)
130 shell: bash 146 shell: bash
131 run: | 147 run: |
@@ -13,3 +13,5 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html @@ -13,3 +13,5 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
13 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| 13 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
14 |[vad](./vad)| It shows how to use the voice activity detection API.| 14 |[vad](./vad)| It shows how to use the voice activity detection API.|
15 |[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.| 15 |[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
  16 +|[portaudio-test](./portaudio-test)| It shows how to use PortAudio for recording and playing.|
  17 +|[tts](./tts)| It shows how to use the text-to-speech API.|
  1 +!run-*.sh
  2 +piper
  3 +piper-playback
  4 +link*.res
  1 +# Introduction
  2 +
  3 +This directory contains examples of how to use the TTS (text-to-speech) APIs.
  4 +
  5 +|File| Description|
  6 +|---------|------------|
  7 +|[run-piper.sh](./run-piper.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech.|
  8 +|[run-piper-playback.sh](./run-piper-playback.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech. It plays the generated audio while generation is still in progress.|
  9 +
  1 +{ Copyright (c) 2024 Xiaomi Corporation }
  2 +program piper;
  3 +{
  4 +This file shows how to use the text to speech API of sherpa-onnx
  5 +with Piper models.
  6 +
  7 +It generates speech from text and saves it to a wave file.
  8 +
  9 +Note that it plays the audio back while it is still being generated.
  10 +}
  11 +
  12 +{$mode objfpc}
  13 +
  14 +uses
  15 + {$ifdef unix}
  16 + cthreads,
  17 + {$endif}
  18 + SysUtils,
  19 + dos,
  20 + ctypes,
  21 + portaudio,
  22 + sherpa_onnx;
  23 +
  24 +var
  25 + CriticalSection: TRTLCriticalSection;
  26 +
  27 + Tts: TSherpaOnnxOfflineTts;
  28 + Audio: TSherpaOnnxGeneratedAudio;
  29 + Resampler: TSherpaOnnxLinearResampler;
  30 +
  31 + Text: AnsiString;
  32 + Speed: Single = 1.0; {Use a larger value to speak faster}
  33 + SpeakerId: Integer = 0;
  34 + Buffer: TSherpaOnnxCircularBuffer;
  35 + FinishedGeneration: Boolean = False;
  36 + FinishedPlaying: Boolean = False;
  37 +
  38 + Version: String;
  39 + EnvStr: String;
  40 + Status: Integer;
  41 + NumDevices: Integer;
  42 + DeviceIndex: Integer;
  43 + DeviceInfo: PPaDeviceInfo;
  44 +
  45 + { If you get an EDivByZero (division by zero) error, please change the sample rate
  46 + to one supported by your output device (speaker).
  47 + }
  48 + DeviceSampleRate: Integer = 48000;
  49 + I: Integer;
  50 + Param: TPaStreamParameters;
  51 + Stream: PPaStream;
  52 + Wave: TSherpaOnnxWave;
  53 +
  54 +function GenerateCallback(
  55 + Samples: pcfloat; N: cint32;
  56 + Arg: Pointer): cint; cdecl;
  57 +begin
  58 + EnterCriticalSection(CriticalSection);
  59 + try
  60 + if Resampler <> nil then
  61 + Buffer.Push(Resampler.Resample(Samples, N, False))
  62 + else
  63 + Buffer.Push(Samples, N);
  64 + finally
  65 + LeaveCriticalSection(CriticalSection);
  66 + end;
  67 +
  68 + { 1 means to continue generating; 0 means to stop generating. }
  69 + Result := 1;
  70 +end;
  71 +
  72 +function PlayCallback(
  73 + input: Pointer; output: Pointer;
  74 + frameCount: culong;
  75 + timeInfo: PPaStreamCallbackTimeInfo;
  76 + statusFlags: TPaStreamCallbackFlags;
  77 + userData: Pointer ): cint; cdecl;
  78 +var
  79 + Samples: TSherpaOnnxSamplesArray;
  80 + I: Integer;
  81 +begin
  82 + EnterCriticalSection(CriticalSection);
  83 + try
  84 + if Buffer.Size >= frameCount then
  85 + begin
  86 + Samples := Buffer.Get(Buffer.Head, FrameCount);
  87 + Buffer.Pop(FrameCount);
  88 + end
  89 + else if Buffer.Size > 0 then
  90 + begin
  91 + Samples := Buffer.Get(Buffer.Head, Buffer.Size);
  92 + Buffer.Pop(Buffer.Size);
  93 + SetLength(Samples, frameCount);
  94 + end
  95 + else
  96 + SetLength(Samples, frameCount);
  97 +
  98 + for I := 0 to frameCount - 1 do
  99 + pcfloat(output)[I] := Samples[I];
  100 +
  101 + if (Buffer.Size > 0) or (not FinishedGeneration) then
  102 + Result := paContinue
  103 + else
  104 + begin
  105 + Result := paComplete;
  106 + FinishedPlaying := True;
  107 + end;
  108 + finally
  109 + LeaveCriticalSection(CriticalSection);
  110 + end;
  111 +end;
  112 +
  113 +function GetOfflineTts: TSherpaOnnxOfflineTts;
  114 +var
  115 + Config: TSherpaOnnxOfflineTtsConfig;
  116 +begin
  117 + Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';
  118 + Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  119 + Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
  120 + Config.Model.NumThreads := 1;
  121 + Config.Model.Debug := False;
  122 + Config.MaxNumSentences := 1;
  123 +
  124 + Result := TSherpaOnnxOfflineTts.Create(Config);
  125 +end;
  126 +
  127 +begin
  128 + Tts := GetOfflineTts;
  129 + if Tts.GetSampleRate <> DeviceSampleRate then
  130 + Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
  131 +
  132 + Version := String(Pa_GetVersionText);
  133 + WriteLn('Version is ', Version);
  134 + Status := Pa_Initialize;
  135 + if Status <> paNoError then
  136 + begin
  137 + WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
  138 + Exit;
  139 + end;
  140 +
  141 + NumDevices := Pa_GetDeviceCount;
  142 + WriteLn('Num devices: ', NumDevices);
  143 +
  144 + DeviceIndex := Pa_GetDefaultOutputDevice;
  145 +
  146 + if DeviceIndex = paNoDevice then
  147 + begin
  148 + WriteLn('No default output device found');
  149 + Pa_Terminate;
  150 + Exit;
  151 + end;
  152 +
  153 + EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  154 + if EnvStr <> '' then
  155 + begin
  156 + DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
  157 + WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  158 + end;
  159 +
  160 + for I := 0 to (NumDevices - 1) do
  161 + begin
  162 + DeviceInfo := Pa_GetDeviceInfo(I);
  163 + if I = DeviceIndex then
  164 + { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
  165 + WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
  166 + else
  167 + WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  168 + end;
  169 +
  170 + WriteLn('Use device ', DeviceIndex);
  171 + WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  172 + WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
  173 +
  174 + Initialize(Param);
  175 + Param.Device := DeviceIndex;
  176 + Param.ChannelCount := 1;
  177 + Param.SampleFormat := paFloat32;
  178 + param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  179 + param.HostApiSpecificStreamInfo := nil;
  180 +
  181 + Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
  182 +
  183 +
  184 + { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  185 + Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
  186 + PPaStreamCallback(@PlayCallback), nil);
  187 +
  188 + if Status <> paNoError then
  189 + begin
  190 + WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
  191 + Pa_Terminate;
  192 + Exit;
  193 + end;
  194 +
  195 + InitCriticalSection(CriticalSection);
  196 +
  197 + Status := Pa_StartStream(stream);
  198 + if Status <> paNoError then
  199 + begin
  200 + WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
  201 + Pa_Terminate;
  202 + Exit;
  203 + end;
  204 +
  205 + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  206 +
  207 + Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';
  208 +
  209 + Audio := Tts.Generate(Text, SpeakerId, Speed,
  210 + PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  211 + FinishedGeneration := True;
  212 + SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
  213 + WriteLn('Saved to ./libritts_r-generated.wav');
  214 +
  215 + while not FinishedPlaying do
  216 + Pa_Sleep(100); {sleep for 0.1 second }
  217 + {TODO(fangjun): Use an event to indicate the play is finished}
  218 +
  219 + DoneCriticalSection(CriticalSection);
  220 +
  221 + FreeAndNil(Tts);
  222 + FreeAndNil(Resampler);
  223 +
  224 + Status := Pa_CloseStream(stream);
  225 + if Status <> paNoError then
  226 + begin
  227 + WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
  228 + Exit;
  229 + end;
  230 +
  231 + Status := Pa_Terminate;
  232 + if Status <> paNoError then
  233 + begin
  234 + WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
  235 + Exit;
  236 + end;
  237 +end.
  238 +
  1 +{ Copyright (c) 2024 Xiaomi Corporation }
  2 +program piper;
  3 +{
  4 +This file shows how to use the text to speech API of sherpa-onnx
  5 +with Piper models.
  6 +
  7 +It generates speech from text and saves it to a wave file.
  8 +
  9 +If you want to play it while it is generating, please see
  10 +./piper-playback.pas
  11 +}
  12 +
  13 +{$mode objfpc}
  14 +
  15 +uses
  16 + SysUtils,
  17 + sherpa_onnx;
  18 +
  19 +function GetOfflineTts: TSherpaOnnxOfflineTts;
  20 +var
  21 + Config: TSherpaOnnxOfflineTtsConfig;
  22 +begin
  23 + Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';
  24 + Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  25 + Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
  26 + Config.Model.NumThreads := 1;
  27 + Config.Model.Debug := False;
  28 + Config.MaxNumSentences := 1;
  29 +
  30 + Result := TSherpaOnnxOfflineTts.Create(Config);
  31 +end;
  32 +
  33 +var
  34 + Tts: TSherpaOnnxOfflineTts;
  35 + Audio: TSherpaOnnxGeneratedAudio;
  36 +
  37 + Text: AnsiString;
  38 + Speed: Single = 1.0; {Use a larger value to speak faster}
  39 + SpeakerId: Integer = 0;
  40 +
  41 +begin
  42 + Tts := GetOfflineTts;
  43 +
  44 + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  45 +
  46 + Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';
  47 +
  48 + Audio := Tts.Generate(Text, SpeakerId, Speed);
  49 + SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
  50 + WriteLn('Saved to ./libritts_r-generated.wav');
  51 +
  52 + FreeAndNil(Tts);
  53 +end.
  54 +
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  28 + tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  29 + rm vits-piper-en_US-libritts_r-medium.tar.bz2
  30 +fi
  31 +
  32 +fpc \
  33 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  34 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  35 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  36 + -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
  37 + ./piper-playback.pas
  38 +
  39 +# Please see ../portaudio-test/README.md
  40 +# for how to install portaudio on macOS
  41 +
  42 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  43 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  44 +
  45 +./piper-playback
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  28 + tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  29 + rm vits-piper-en_US-libritts_r-medium.tar.bz2
  30 +fi
  31 +
  32 +fpc \
  33 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  34 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  35 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  36 + ./piper.pas
  37 +
  38 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  39 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  40 +
  41 +./piper
@@ -190,9 +190,9 @@ def get_piper_models() -> List[TtsModel]: @@ -190,9 +190,9 @@ def get_piper_models() -> List[TtsModel]:
190 TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"), 190 TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
191 TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"), 191 TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
192 TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"), 192 TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
193 - TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),  
194 - TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),  
195 - TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), 193 + # TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
  194 + # TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
  195 + # TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
196 TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"), 196 TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
197 TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"), 197 TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
198 TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"), 198 TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
@@ -180,9 +180,9 @@ def get_piper_models() -> List[TtsModel]: @@ -180,9 +180,9 @@ def get_piper_models() -> List[TtsModel]:
180 TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"), 180 TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
181 TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"), 181 TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
182 TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"), 182 TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
183 - TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),  
184 - TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),  
185 - TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), 183 + # TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
  184 + # TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
  185 + # TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
186 TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"), 186 TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
187 TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"), 187 TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
188 TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"), 188 TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
@@ -18,6 +18,7 @@ @@ -18,6 +18,7 @@
18 #include "sherpa-onnx/csrc/offline-punctuation.h" 18 #include "sherpa-onnx/csrc/offline-punctuation.h"
19 #include "sherpa-onnx/csrc/offline-recognizer.h" 19 #include "sherpa-onnx/csrc/offline-recognizer.h"
20 #include "sherpa-onnx/csrc/online-recognizer.h" 20 #include "sherpa-onnx/csrc/online-recognizer.h"
  21 +#include "sherpa-onnx/csrc/resample.h"
21 #include "sherpa-onnx/csrc/speaker-embedding-extractor.h" 22 #include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
22 #include "sherpa-onnx/csrc/speaker-embedding-manager.h" 23 #include "sherpa-onnx/csrc/speaker-embedding-manager.h"
23 #include "sherpa-onnx/csrc/spoken-language-identification.h" 24 #include "sherpa-onnx/csrc/spoken-language-identification.h"
@@ -1584,3 +1585,56 @@ const char *SherpaOfflinePunctuationAddPunct( @@ -1584,3 +1585,56 @@ const char *SherpaOfflinePunctuationAddPunct(
1584 } 1585 }
1585 1586
1586 void SherpaOfflinePunctuationFreeText(const char *text) { delete[] text; } 1587 void SherpaOfflinePunctuationFreeText(const char *text) { delete[] text; }
  1588 +
  1589 +struct SherpaOnnxLinearResampler {
  1590 + std::unique_ptr<sherpa_onnx::LinearResample> impl;
  1591 +};
  1592 +
  1593 +SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
  1594 + int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
  1595 + int32_t num_zeros) {
  1596 + SherpaOnnxLinearResampler *p = new SherpaOnnxLinearResampler;
  1597 + p->impl = std::make_unique<sherpa_onnx::LinearResample>(
  1598 + samp_rate_in_hz, samp_rate_out_hz, filter_cutoff_hz, num_zeros);
  1599 +
  1600 + return p;
  1601 +}
  1602 +
  1603 +void SherpaOnnxDestroyLinearResampler(SherpaOnnxLinearResampler *p) {
  1604 + delete p;
  1605 +}
  1606 +
  1607 +const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
  1608 + SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
  1609 + int32_t flush) {
  1610 + std::vector<float> o;
  1611 + p->impl->Resample(input, input_dim, flush, &o);
  1612 +
  1613 + float *s = new float[o.size()];
  1614 + std::copy(o.begin(), o.end(), s);
  1615 +
  1616 + SherpaOnnxResampleOut *ans = new SherpaOnnxResampleOut;
  1617 + ans->samples = s;
  1618 + ans->n = static_cast<int32_t>(o.size());
  1619 +
  1620 + return ans;
  1621 +}
  1622 +
  1623 +void SherpaOnnxLinearResamplerResampleFree(const SherpaOnnxResampleOut *p) {
  1624 + delete[] p->samples;
  1625 + delete p;
  1626 +}
  1627 +
  1628 +int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
  1629 + const SherpaOnnxLinearResampler *p) {
  1630 + return p->impl->GetInputSamplingRate();
  1631 +}
  1632 +
  1633 +int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
  1634 + const SherpaOnnxLinearResampler *p) {
  1635 + return p->impl->GetOutputSamplingRate();
  1636 +}
  1637 +
  1638 +void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
  1639 + p->impl->Reset();
  1640 +}
@@ -1315,6 +1315,52 @@ SHERPA_ONNX_API const char *SherpaOfflinePunctuationAddPunct( @@ -1315,6 +1315,52 @@ SHERPA_ONNX_API const char *SherpaOfflinePunctuationAddPunct(
1315 1315
1316 SHERPA_ONNX_API void SherpaOfflinePunctuationFreeText(const char *text); 1316 SHERPA_ONNX_API void SherpaOfflinePunctuationFreeText(const char *text);
1317 1317
  1318 +// for resampling
  1319 +SHERPA_ONNX_API typedef struct SherpaOnnxLinearResampler
  1320 + SherpaOnnxLinearResampler;
  1321 +
  1322 +/*
  1323 + float min_freq = min(samp_rate_in_hz, samp_rate_out_hz);
  1324 + float lowpass_cutoff = 0.99 * 0.5 * min_freq;
  1325 + int32_t lowpass_filter_width = 6;
  1326 +
  1327 + You can set filter_cutoff_hz to lowpass_cutoff
  1328 + and set num_zeros to lowpass_filter_width
  1329 +*/
  1330 +// The user has to invoke SherpaOnnxDestroyLinearResampler()
  1331 +// to free the returned pointer to avoid memory leak
  1332 +SHERPA_ONNX_API SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
  1333 + int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
  1334 + int32_t num_zeros);
  1335 +
  1336 +SHERPA_ONNX_API void SherpaOnnxDestroyLinearResampler(
  1337 + SherpaOnnxLinearResampler *p);
  1338 +
  1339 +SHERPA_ONNX_API void SherpaOnnxLinearResamplerReset(
  1340 + SherpaOnnxLinearResampler *p);
  1341 +
  1342 +typedef struct SherpaOnnxResampleOut {
  1343 + const float *samples;
  1344 + int32_t n;
  1345 +} SherpaOnnxResampleOut;
  1346 +// The user has to invoke SherpaOnnxLinearResamplerResampleFree()
  1347 +// to free the returned pointer to avoid memory leak.
  1348 +//
  1349 +// If this is the last segment, you can set flush to 1; otherwise, please
  1350 +// set flush to 0
  1351 +SHERPA_ONNX_API const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
  1352 + SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
  1353 + int32_t flush);
  1354 +
  1355 +SHERPA_ONNX_API void SherpaOnnxLinearResamplerResampleFree(
  1356 + const SherpaOnnxResampleOut *p);
  1357 +
  1358 +SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
  1359 + const SherpaOnnxLinearResampler *p);
  1360 +
  1361 +SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
  1362 + const SherpaOnnxLinearResampler *p);
  1363 +
1318 #if defined(__GNUC__) 1364 #if defined(__GNUC__)
1319 #pragma GCC diagnostic pop 1365 #pragma GCC diagnostic pop
1320 #endif 1366 #endif
1 -{ Copyright (c) 2024 Xiaomi Corporation } 1 +{ Copyright (c) 2024 Xiaomi Corporation
  2 +
  3 +Please see
  4 +https://github.com/k2-fsa/sherpa-onnx/tree/master/pascal-api-examples
  5 +for how to use APIs in this file.
  6 +}
2 7
3 unit sherpa_onnx; 8 unit sherpa_onnx;
4 9
@@ -7,13 +12,105 @@ unit sherpa_onnx; @@ -7,13 +12,105 @@ unit sherpa_onnx;
7 {$modeSwitch advancedRecords} { to support records with methods } 12 {$modeSwitch advancedRecords} { to support records with methods }
8 {$ENDIF} 13 {$ENDIF}
9 14
10 -(* {$LongStrings ON} *) 15 +{$LongStrings ON}
11 16
12 interface 17 interface
13 uses 18 uses
14 ctypes; 19 ctypes;
15 20
16 type 21 type
  22 + TSherpaOnnxSamplesArray = array of Single;
  23 +
  24 + TSherpaOnnxLinearResampler = class
  25 + private
  26 + Handle: Pointer;
  27 + InputSampleRate: Integer;
  28 + OutputSampleRate: Integer;
  29 + public
  30 + constructor Create(SampleRateIn: Integer; SampleRateOut: Integer);
  31 + destructor Destroy; override;
  32 +
  33 + function Resample(Samples: pcfloat;
  34 + N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray; overload;
  35 +
  36 + function Resample(Samples: array of Single;
  37 + Flush: Boolean): TSherpaOnnxSamplesArray; overload;
  38 +
  39 + procedure Reset;
  40 +
  41 + property GetInputSampleRate: Integer Read InputSampleRate;
  42 + property GetOutputSampleRate: Integer Read OutputSampleRate;
  43 + end;
  44 +
  45 + PSherpaOnnxGeneratedAudioCallbackWithArg = ^TSherpaOnnxGeneratedAudioCallbackWithArg;
  46 +
  47 + TSherpaOnnxGeneratedAudioCallbackWithArg = function(
  48 + Samples: pcfloat; N: cint32;
  49 + Arg: Pointer): cint; cdecl;
  50 +
  51 + TSherpaOnnxOfflineTtsVitsModelConfig = record
  52 + Model: AnsiString;
  53 + Lexicon: AnsiString;
  54 + Tokens: AnsiString;
  55 + DataDir: AnsiString;
  56 + NoiseScale: Single;
  57 + NoiseScaleW: Single;
  58 + LengthScale: Single;
  59 + DictDir: AnsiString;
  60 +
  61 + function ToString: AnsiString;
  62 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
  63 + end;
  64 +
  65 + TSherpaOnnxOfflineTtsModelConfig = record
  66 + Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
  67 + NumThreads: Integer;
  68 + Debug: Boolean;
  69 + Provider: AnsiString;
  70 +
  71 + function ToString: AnsiString;
  72 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
  73 + end;
  74 +
  75 + TSherpaOnnxOfflineTtsConfig = record
  76 + Model: TSherpaOnnxOfflineTtsModelConfig;
  77 + RuleFsts: AnsiString;
  78 + MaxNumSentences: Integer;
  79 + RuleFars: AnsiString;
  80 +
  81 + function ToString: AnsiString;
  82 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
  83 + end;
  84 +
  85 + TSherpaOnnxGeneratedAudio = record
  86 + Samples: array of Single;
  87 + SampleRate: Integer;
  88 + end;
  89 +
  90 + TSherpaOnnxOfflineTts = class
  91 + private
  92 + Handle: Pointer;
  93 + SampleRate: Integer;
  94 + NumSpeakers: Integer;
  95 + _Config: TSherpaOnnxOfflineTtsConfig;
  96 + public
  97 + constructor Create(Config: TSherpaOnnxOfflineTtsConfig);
  98 + destructor Destroy; override;
  99 +
  100 + function Generate(Text: AnsiString; SpeakerId: Integer;
  101 + Speed: Single): TSherpaOnnxGeneratedAudio; overload;
  102 +
  103 + function Generate(Text: AnsiString; SpeakerId: Integer;
  104 + Speed: Single;
  105 + Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
  106 + Arg: Pointer
  107 + ): TSherpaOnnxGeneratedAudio; overload;
  108 +
  109 + property GetHandle: Pointer Read Handle;
  110 + property GetSampleRate: Integer Read SampleRate;
  111 + property GetNumSpeakers: Integer Read NumSpeakers;
  112 + end;
  113 +
17 TSherpaOnnxWave = record 114 TSherpaOnnxWave = record
18 Samples: array of Single; { normalized to the range [-1, 1] } 115 Samples: array of Single; { normalized to the range [-1, 1] }
19 SampleRate: Integer; 116 SampleRate: Integer;
@@ -254,7 +351,6 @@ type @@ -254,7 +351,6 @@ type
254 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig); 351 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
255 end; 352 end;
256 353
257 - TSherpaOnnxSamplesArray = array of Single;  
258 354
259 TSherpaOnnxCircularBuffer = class 355 TSherpaOnnxCircularBuffer = class
260 private 356 private
@@ -508,6 +604,94 @@ type @@ -508,6 +604,94 @@ type
508 604
509 PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment; 605 PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
510 606
  607 + SherpaOnnxOfflineTtsVitsModelConfig = record
  608 + Model: PAnsiChar;
  609 + Lexicon: PAnsiChar;
  610 + Tokens: PAnsiChar;
  611 + DataDir: PAnsiChar;
  612 + NoiseScale: cfloat;
  613 + NoiseScaleW: cfloat;
  614 + LengthScale: cfloat;
  615 + DictDir: PAnsiChar;
  616 + end;
  617 +
  618 + SherpaOnnxOfflineTtsModelConfig = record
  619 + Vits: SherpaOnnxOfflineTtsVitsModelConfig;
  620 + NumThreads: cint32;
  621 + Debug: cint32;
  622 + Provider: PAnsiChar;
  623 + end;
  624 +
  625 + SherpaOnnxOfflineTtsConfig = record
  626 + Model: SherpaOnnxOfflineTtsModelConfig;
  627 + RuleFsts: PAnsiChar;
  628 + MaxNumSentences: cint32;
  629 + RuleFars: PAnsiChar;
  630 + end;
  631 +
  632 + PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
  633 +
  634 + SherpaOnnxGeneratedAudio = record
  635 + Samples: pcfloat;
  636 + N: cint32;
  637 + SampleRate: cint32;
  638 + end;
  639 +
  640 + PSherpaOnnxGeneratedAudio = ^SherpaOnnxGeneratedAudio;
  641 +
  642 + SherpaOnnxResampleOut = record
  643 + Samples: pcfloat;
  644 + N: cint32;
  645 + end;
  646 +
  647 + PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
  648 +
  649 +function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
  650 + SampleRateOutHz: cint32;
  651 + FilterCutoffHz: cfloat;
  652 + NumZeros: cint32): Pointer; cdecl;
  653 + external SherpaOnnxLibName;
  654 +
  655 +procedure SherpaOnnxDestroyLinearResampler(P: Pointer); cdecl;
  656 + external SherpaOnnxLibName;
  657 +
  658 +function SherpaOnnxLinearResamplerResample(P: Pointer;
  659 + Samples: pcfloat;
  660 + N: Integer;
  661 + Flush: Integer): PSherpaOnnxResampleOut; cdecl;
  662 + external SherpaOnnxLibName;
  663 +
  664 +procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdecl;
  665 + external SherpaOnnxLibName;
  666 +
  667 +procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
  668 + external SherpaOnnxLibName;
  669 +
  670 +function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl;
  671 + external SherpaOnnxLibName;
  672 +
  673 +procedure SherpaOnnxDestroyOfflineTts(Tts: Pointer); cdecl;
  674 + external SherpaOnnxLibName;
  675 +
  676 +function SherpaOnnxOfflineTtsSampleRate(Tts: Pointer): cint32; cdecl;
  677 + external SherpaOnnxLibName;
  678 +
  679 +function SherpaOnnxOfflineTtsNumSpeakers(Tts: Pointer): cint32; cdecl;
  680 + external SherpaOnnxLibName;
  681 +
  682 +function SherpaOnnxOfflineTtsGenerate(Tts: Pointer;
  683 + Text: PAnsiChar; Sid: cint32; Speed: cfloat): PSherpaOnnxGeneratedAudio; cdecl;
  684 + external SherpaOnnxLibName;
  685 +
  686 +function SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Tts: Pointer;
  687 + Text: PAnsiChar; Sid: cint32; Speed: cfloat;
  688 + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
  689 + Arg: Pointer): PSherpaOnnxGeneratedAudio; cdecl;
  690 + external SherpaOnnxLibName;
  691 +
  692 +procedure SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio: Pointer); cdecl;
  693 + external SherpaOnnxLibName;
  694 +
511 function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig; 695 function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
512 BufferSizeInSeconds: cfloat): Pointer; cdecl; 696 BufferSizeInSeconds: cfloat): Pointer; cdecl;
513 external SherpaOnnxLibName; 697 external SherpaOnnxLibName;
@@ -793,8 +977,7 @@ constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecogniz @@ -793,8 +977,7 @@ constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecogniz
793 var 977 var
794 C: SherpaOnnxOnlineRecognizerConfig; 978 C: SherpaOnnxOnlineRecognizerConfig;
795 begin 979 begin
796 - Initialize(C);  
797 - 980 + C := Default(SherpaOnnxOnlineRecognizerConfig);
798 C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate; 981 C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
799 C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim; 982 C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
800 983
@@ -1051,8 +1234,7 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn @@ -1051,8 +1234,7 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn
1051 var 1234 var
1052 C: SherpaOnnxOfflineRecognizerConfig; 1235 C: SherpaOnnxOfflineRecognizerConfig;
1053 begin 1236 begin
1054 - Initialize(C);  
1055 - 1237 + C := Default(SherpaOnnxOfflineRecognizerConfig);
1056 C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate; 1238 C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
1057 C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim; 1239 C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
1058 1240
@@ -1369,12 +1551,11 @@ end; @@ -1369,12 +1551,11 @@ end;
1369 1551
1370 constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single); 1552 constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
1371 var 1553 var
1372 - C: SherpaOnnxVadModelConfig; 1554 + C: SherpaOnnxVadModelConfig ;
1373 begin 1555 begin
  1556 + C := Default(SherpaOnnxVadModelConfig);
1374 Self._Config := Config; 1557 Self._Config := Config;
1375 1558
1376 - Initialize(C);  
1377 -  
1378 C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model); 1559 C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
1379 C.SileroVad.Threshold := Config.SileroVad.Threshold; 1560 C.SileroVad.Threshold := Config.SileroVad.Threshold;
1380 C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration; 1561 C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
@@ -1460,5 +1641,197 @@ begin @@ -1460,5 +1641,197 @@ begin
1460 SherpaOnnxVoiceActivityDetectorFlush(Self.Handle); 1641 SherpaOnnxVoiceActivityDetectorFlush(Self.Handle);
1461 end; 1642 end;
1462 1643
{ Renders every field of the VITS model config as a single-line,
  human-readable string. Floating point fields are printed with two
  decimals. }
function TSherpaOnnxOfflineTtsVitsModelConfig.ToString: AnsiString;
const
  Template =
    'TSherpaOnnxOfflineTtsVitsModelConfig(' +
    'Model := %s, ' +
    'Lexicon := %s, ' +
    'Tokens := %s, ' +
    'DataDir := %s, ' +
    'NoiseScale := %.2f, ' +
    'NoiseScaleW := %.2f, ' +
    'LengthScale := %.2f, ' +
    'DictDir := %s' +
    ')';
begin
  { The argument order must follow the placeholders in Template. }
  Result := Format(Template, [
    Model,
    Lexicon,
    Tokens,
    DataDir,
    NoiseScale,
    NoiseScaleW,
    LengthScale,
    DictDir
  ]);
end;
  1660 +
{ Management operator: fills in the default synthesis parameters of a
  freshly created VITS model config. The other fields are not touched here. }
class operator TSherpaOnnxOfflineTtsVitsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
begin
  with Dest do
  begin
    LengthScale := 1.0;
    NoiseScaleW := 0.8;
    NoiseScale := 0.667;
  end;
end;
  1667 +
{ Renders the TTS model config, including the nested VITS config, as a
  single-line, human-readable string. }
function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
const
  Template =
    'TSherpaOnnxOfflineTtsModelConfig(' +
    'Vits := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
    'Provider := %s' +
    ')';
var
  VitsText: AnsiString;
  DebugText: AnsiString;
begin
  { Format the nested values first so the final call stays readable. }
  VitsText := Vits.ToString;
  DebugText := Debug.ToString;

  Result := Format(Template, [VitsText, NumThreads, DebugText, Provider]);
end;
  1679 +
{ Management operator: a freshly created TTS model config uses one thread,
  the 'cpu' provider and has debugging switched off. }
class operator TSherpaOnnxOfflineTtsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
begin
  with Dest do
  begin
    Provider := 'cpu';
    Debug := False;
    NumThreads := 1;
  end;
end;
  1686 +
{ Renders the top-level TTS config, including the nested model config, as
  a single-line, human-readable string. }
function TSherpaOnnxOfflineTtsConfig.ToString: AnsiString;
const
  Template =
    'TSherpaOnnxOfflineTtsConfig(' +
    'Model := %s, ' +
    'RuleFsts := %s, ' +
    'MaxNumSentences := %d, ' +
    'RuleFars := %s' +
    ')';
var
  ModelText: AnsiString;
begin
  ModelText := Model.ToString;
  Result := Format(Template, [ModelText, RuleFsts, MaxNumSentences, RuleFars]);
end;
  1698 +
{ Management operator: a freshly created TTS config defaults
  MaxNumSentences to 1. No other field is set here. }
class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
begin
  with Dest do
    MaxNumSentences := 1;
end;
  1703 +
{ Creates an offline TTS engine.

  Config is stored in _Config and translated field by field into the C
  struct expected by the library. After the engine has been created, its
  sample rate and speaker count are cached in SampleRate and NumSpeakers.

  Raises Exception when the library does not return a handle, instead of
  passing a nil handle on to the query functions. }
constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
var
  C: SherpaOnnxOfflineTtsConfig;
begin
  C := Default(SherpaOnnxOfflineTtsConfig);
  Self._Config := Config;

  { The PAnsiChar casts do not copy: the C struct points into the strings
    held by Config, which stay alive for the duration of the create call. }
  C.Model.Vits.Model := PAnsiChar(Config.Model.Vits.Model);
  C.Model.Vits.Lexicon := PAnsiChar(Config.Model.Vits.Lexicon);
  C.Model.Vits.Tokens := PAnsiChar(Config.Model.Vits.Tokens);
  C.Model.Vits.DataDir := PAnsiChar(Config.Model.Vits.DataDir);
  C.Model.Vits.NoiseScale := Config.Model.Vits.NoiseScale;
  C.Model.Vits.NoiseScaleW := Config.Model.Vits.NoiseScaleW;
  C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;
  C.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir);

  C.Model.NumThreads := Config.Model.NumThreads;
  C.Model.Provider := PAnsiChar(Config.Model.Provider);
  { The C struct stores the debug flag as an integer (0 or 1). }
  C.Model.Debug := Ord(Config.Model.Debug);

  C.RuleFsts := PAnsiChar(Config.RuleFsts);
  C.MaxNumSentences := Config.MaxNumSentences;
  C.RuleFars := PAnsiChar(Config.RuleFars);

  Self.Handle := SherpaOnnxCreateOfflineTts(@C);

  { NOTE(review): a nil handle is presumably what the library returns when
    it rejects the config - confirm against the C API. Either way the
    queries below must not be called with nil. }
  if Self.Handle = nil then
    raise Exception.Create(
      'Failed to create the offline TTS engine. Please check your config: ' +
      Config.ToString);

  Self.SampleRate := SherpaOnnxOfflineTtsSampleRate(Self.Handle);
  Self.NumSpeakers := SherpaOnnxOfflineTtsNumSpeakers(Self.Handle);
end;
  1733 +
{ Hands the engine handle back to the library and clears the field so the
  released handle cannot be reused through this object. }
destructor TSherpaOnnxOfflineTts.Destroy;
begin
  SherpaOnnxDestroyOfflineTts(Handle);
  Handle := nil;
end;
  1739 +
{ Synthesizes Text with the given speaker and speed.

  The samples produced by the library are copied into a Pascal-managed
  array and the library-owned buffer is released before returning, so the
  caller has nothing to free.

  When the library returns no audio (nil), the default-initialized result
  with an empty Samples array is returned instead of dereferencing nil. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);

  Audio := SherpaOnnxOfflineTtsGenerate(Self.Handle, PAnsiChar(Text), SpeakerId, Speed);
  if Audio = nil then
    Exit;

  { try..finally guarantees the native buffer is released even if the
    allocation or the copy below raises. }
  try
    SetLength(Result.Samples, Audio^.N);
    Result.SampleRate := Audio^.SampleRate;

    for I := Low(Result.Samples) to High(Result.Samples) do
      Result.Samples[I] := Audio^.Samples[I];
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
  1760 +
{ Synthesizes Text with the given speaker and speed, forwarding Callback
  and the caller-supplied Arg to the library.

  The samples produced by the library are copied into a Pascal-managed
  array and the library-owned buffer is released before returning, so the
  caller has nothing to free.

  When the library returns no audio (nil), the default-initialized result
  with an empty Samples array is returned instead of dereferencing nil. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single;
  Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
  Arg: Pointer
  ): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);

  Audio := SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Self.Handle, PAnsiChar(Text),
    SpeakerId, Speed, Callback, Arg);
  if Audio = nil then
    Exit;

  { try..finally guarantees the native buffer is released even if the
    allocation or the copy below raises. }
  try
    SetLength(Result.Samples, Audio^.N);
    Result.SampleRate := Audio^.SampleRate;

    for I := Low(Result.Samples) to High(Result.Samples) do
      Result.Samples[I] := Audio^.Samples[I];
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
1464 1785
{ Creates a resampler converting from SampleRateIn to SampleRateOut.

  The low-pass cutoff handed to the library is 99% of half the lower of
  the two rates; the filter width is fixed at 6. Both rates are also
  stored in InputSampleRate and OutputSampleRate. }
constructor TSherpaOnnxLinearResampler.Create(SampleRateIn: Integer; SampleRateOut: Integer);
const
  FilterWidth = 6;
var
  LowerRate: Single;
  CutoffHz: Single;
begin
  { Start from the input rate and fall back to the output rate when that
    one is lower. }
  LowerRate := SampleRateIn;
  if SampleRateIn > SampleRateOut then
    LowerRate := SampleRateOut;

  CutoffHz := 0.99 * 0.5 * LowerRate;

  Handle := SherpaOnnxCreateLinearResampler(SampleRateIn, SampleRateOut,
    CutoffHz, FilterWidth);
  InputSampleRate := SampleRateIn;
  OutputSampleRate := SampleRateOut;
end;
  1804 +
{ Hands the resampler handle back to the library and clears the field so
  the released handle cannot be reused through this object. }
destructor TSherpaOnnxLinearResampler.Destroy;
begin
  SherpaOnnxDestroyLinearResampler(Handle);
  Handle := nil;
end;
  1810 +
{ Resamples N samples starting at Samples and returns the output as a
  Pascal-managed array.

  Flush is forwarded to the library as 0 or 1. The library-owned result is
  copied and then released before returning, so the caller has nothing to
  free. }
function TSherpaOnnxLinearResampler.Resample(Samples: pcfloat;
  N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray;
var
  P: PSherpaOnnxResampleOut;
  I: Integer;
begin
  Result := Default(TSherpaOnnxSamplesArray);
  P := SherpaOnnxLinearResamplerResample(Self.Handle, Samples, N, Ord(Flush));

  { try..finally guarantees the native result is released even if the
    allocation or the copy below raises. }
  try
    SetLength(Result, P^.N);

    for I := Low(Result) to High(Result) do
      Result[I] := P^.Samples[I];
  finally
    SherpaOnnxLinearResamplerResampleFree(P);
  end;
end;
  1826 +
{ Convenience overload: resamples the contents of an open array by
  delegating to the pointer-based overload with the array's length. }
function TSherpaOnnxLinearResampler.Resample(Samples: array of Single; Flush: Boolean): TSherpaOnnxSamplesArray;
var
  Count: Integer;
begin
  Count := Length(Samples);
  Result := Resample(pcfloat(Samples), Count, Flush);
end;
  1831 +
{ Forwards to the library's reset function for this resampler's handle. }
procedure TSherpaOnnxLinearResampler.Reset;
begin
  SherpaOnnxLinearResamplerReset(Handle);
end;
  1836 +
  1837 +end.