Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2025-08-08 20:10:36 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2025-08-08 20:10:36 +0800
Commit
3c25a1914beef939f38b314392c6748bb2bca0d5
3c25a191
1 parent
26b0e816
Add Pascal API for KittenTTS (#2474)
隐藏空白字符变更
内嵌
并排对比
正在显示
6 个修改的文件
包含
434 行增加
和
3 行删除
pascal-api-examples/tts/.gitignore
pascal-api-examples/tts/kitten-en-playback.pas
pascal-api-examples/tts/kitten-en.pas
pascal-api-examples/tts/run-kitten-en-playback.sh
pascal-api-examples/tts/run-kitten-en.sh
sherpa-onnx/pascal-api/sherpa_onnx.pas
pascal-api-examples/tts/.gitignore
查看文件 @
3c25a19
...
...
@@ -7,6 +7,8 @@ matcha-en
matcha-zh-playback
matcha-en-playback
kokoro-en
kitten-en
kokoro-en-playback
kitten-en-playback
kokoro-zh-en
kokoro-zh-en-playback
...
...
pascal-api-examples/tts/kitten-en-playback.pas
0 → 100644
查看文件 @
3c25a19
{
Copyright
(c)
2025
Xiaomi
Corporation
}
program
kitten_en_playback;
{
This
file
shows
how
to
use
the
text
to
speech
API
of
sherpa-onnx
with
kitten
models.
It
generates
speech
from
text
and
saves
it
to
a
wave
file.
Note
that
it
plays
the
audio
back
as
it
is
still
generating.
}
{
$mode
objfpc
}
uses
{
$ifdef
unix
}
cthreads,
{
$endif
}
SysUtils,
dos,
ctypes,
portaudio,
sherpa_onnx;
var
CriticalSection:
TRTLCriticalSection;
Tts:
TSherpaOnnxOfflineTts;
Audio:
TSherpaOnnxGeneratedAudio;
Resampler:
TSherpaOnnxLinearResampler;
Text:
AnsiString;
Speed:
Single
=
1.0
;
{
Use
a
larger
value
to
speak
faster
}
SpeakerId:
Integer
=
0
;
Buffer:
TSherpaOnnxCircularBuffer;
FinishedGeneration:
Boolean
=
False;
FinishedPlaying:
Boolean
=
False;
Version:
String;
EnvStr:
String;
Status:
Integer;
NumDevices:
Integer;
DeviceIndex:
Integer;
DeviceInfo:
PPaDeviceInfo;
  {
    If you get an "EDivByZero: Division by zero" error, please change the
    sample rate to one that is supported by your output device (speaker).
  }
DeviceSampleRate:
Integer
=
48000
;
I:
Integer;
Param:
TPaStreamParameters;
Stream:
PPaStream;
Wave:
TSherpaOnnxWave;
{ Callback handed to Tts.Generate (see the main program body).

  Pushes the N freshly generated samples into the shared circular Buffer,
  resampling them first when a Resampler has been created. The buffer is
  also accessed by PlayCallback, so every access is guarded by
  CriticalSection.

  Returns 1 to continue generating; returning 0 would stop generation. }
function GenerateCallback(Samples: pcfloat; N: cint32; Arg: Pointer): cint; cdecl;
begin
  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      { No resampler was created: push the samples unchanged. }
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;

  Result := 1;
end;
{ PortAudio stream callback: fills `output` with frameCount float samples
  taken from the shared circular Buffer.

  When fewer than frameCount samples are buffered, whatever is available is
  used and the array is extended to frameCount with SetLength (the added
  elements are expected to be zero, i.e. silence).

  Returns paContinue while samples remain or generation is still running;
  otherwise returns paComplete and sets FinishedPlaying. }
function PlayCallback(
  input: Pointer;
  output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer): cint; cdecl;
var
  Samples: TSherpaOnnxSamplesArray;
  Dst: pcfloat;
  I: Integer;
begin
  EnterCriticalSection(CriticalSection);
  try
    if Buffer.Size >= frameCount then
    begin
      { Enough data buffered: take exactly one full block. }
      Samples := Buffer.Get(Buffer.Head, frameCount);
      Buffer.Pop(frameCount);
    end
    else
    begin
      if Buffer.Size > 0 then
      begin
        { Partial block: drain everything that is left. }
        Samples := Buffer.Get(Buffer.Head, Buffer.Size);
        Buffer.Pop(Buffer.Size);
      end;

      { Extend (or allocate) up to the requested number of frames. }
      SetLength(Samples, frameCount);
    end;

    Dst := pcfloat(output);
    for I := 0 to frameCount - 1 do
      Dst[I] := Samples[I];

    if FinishedGeneration and not (Buffer.Size > 0) then
    begin
      { Nothing left to play and nothing more will be produced. }
      Result := paComplete;
      FinishedPlaying := True;
    end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
{ Builds the TTS configuration for the kitten-nano-en-v0_1-fp16 model
  (expected in the current working directory) and returns a newly created
  TSherpaOnnxOfflineTts. The caller owns the returned object. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { Model files }
  with Config.Model.Kitten do
  begin
    Model := './kitten-nano-en-v0_1-fp16/model.fp16.onnx';
    Voices := './kitten-nano-en-v0_1-fp16/voices.bin';
    Tokens := './kitten-nano-en-v0_1-fp16/tokens.txt';
    DataDir := './kitten-nano-en-v0_1-fp16/espeak-ng-data';
  end;

  { Runtime options }
  Config.Model.NumThreads := 2;
  Config.Model.Debug := False;
  Config.MaxNumSentences := 1;

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;
{ Main program.

  1. Creates the TTS engine and, if the model sample rate differs from
     DeviceSampleRate, a resampler.
  2. Initializes PortAudio, selects the output device (default device, or the
     index given in the SHERPA_ONNX_MIC_DEVICE environment variable) and
     opens/starts an output stream driven by PlayCallback.
  3. Generates speech; GenerateCallback feeds samples into Buffer while
     PlayCallback plays them.
  4. Saves the generated audio to ./kitten-en-playback-0.wav, waits until
     playback has finished, and releases all resources. }
begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);

  Status := Pa_Initialize;
  if Status <> paNoError then
  begin
    WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;
  if DeviceIndex = paNoDevice then
  begin
    WriteLn('No default output device found');
    Pa_Terminate;
    Exit;
  end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
  begin
    DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
    WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  end;

  { The index may come from the environment, so validate it before it is
    used to dereference the result of Pa_GetDeviceInfo below. }
  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
  begin
    WriteLn('Invalid output device index ', DeviceIndex,
      '. Number of devices: ', NumDevices);
    Pa_Terminate;
    Exit;
  end;

  for I := 0 to (NumDevices - 1) do
  begin
    DeviceInfo := Pa_GetDeviceInfo(I);
    if I = DeviceIndex then
      WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
    else
      WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn('  Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn('  Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  { Room for 30 seconds of audio at the device sample rate. }
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(Stream, nil, @Param, DeviceSampleRate,
    paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);

  if Status <> paNoError then
  begin
    WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
    Pa_Terminate;
    Exit;
  end;

  { Must be initialized before the stream is started, because PlayCallback
    enters it. }
  InitCriticalSection(CriticalSection);

  Status := Pa_StartStream(Stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
    { Release what has been acquired so far. }
    Pa_CloseStream(Stream);
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';

  Audio := Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;

  SherpaOnnxWriteWave('./kitten-en-playback-0.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./kitten-en-playback-0.wav');

  while not FinishedPlaying do
    Pa_Sleep(100); { sleep for 0.1 second }
  { TODO(fangjun): Use an event to indicate the play is finished }

  { Close the stream BEFORE destroying the critical section: PlayCallback
    sets FinishedPlaying while it still holds the lock, so the audio thread
    may not have left the critical section yet when the loop above exits. }
  Status := Pa_CloseStream(Stream);

  DoneCriticalSection(CriticalSection);
  FreeAndNil(Tts);
  FreeAndNil(Resampler);

  if Status <> paNoError then
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));

  { Always attempt to shut PortAudio down, even if closing the stream failed. }
  Status := Pa_Terminate;
  if Status <> paNoError then
  begin
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;
end.
...
...
pascal-api-examples/tts/kitten-en.pas
0 → 100644
查看文件 @
3c25a19
{
Copyright
(c)
2025
Xiaomi
Corporation
}
program
kitten_en;
{
This
file
shows
how
to
use
the
text
to
speech
API
of
sherpa-onnx
with
Kitten
TTS
models.
It
generates
speech
from
text
and
saves
it
to
a
wave
file.
If
you
want
to
play
it
while
it
is
generating,
please
see
./kitten-en-playback.pas
}
{
$mode
objfpc
}
uses
SysUtils,
sherpa_onnx;
{ Builds the TTS configuration for the kitten-nano-en-v0_1-fp16 model
  (expected in the current working directory) and returns a newly created
  TSherpaOnnxOfflineTts. The caller owns the returned object. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { Model files }
  with Config.Model.Kitten do
  begin
    Model := './kitten-nano-en-v0_1-fp16/model.fp16.onnx';
    Voices := './kitten-nano-en-v0_1-fp16/voices.bin';
    Tokens := './kitten-nano-en-v0_1-fp16/tokens.txt';
    DataDir := './kitten-nano-en-v0_1-fp16/espeak-ng-data';
  end;

  { Runtime options }
  Config.Model.NumThreads := 2;
  Config.Model.Debug := False;
  Config.MaxNumSentences := 1;

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;
const
  { Where the generated speech is written. }
  OutputFilename = './kitten-en-0.wav';

var
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;

  Text: AnsiString;
  Speed: Single = 1.0;  { Use a larger value to speak faster }
  SpeakerId: Integer = 0;

{ Main program: create the TTS engine, synthesize one fixed piece of text,
  write the result to a wave file and release the engine. }
begin
  Tts := GetOfflineTts;
  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';

  Audio := Tts.Generate(Text, SpeakerId, Speed);

  SherpaOnnxWriteWave(OutputFilename, Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ', OutputFilename);

  FreeAndNil(Tts);
end.
...
...
pascal-api-examples/tts/run-kitten-en-playback.sh
0 → 100755
查看文件 @
3c25a19
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# downloads the KittenTTS model if it is missing, compiles
# ./kitten-en-playback.pas with fpc and runs the resulting program.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Every relative path below (../../build, the model directory, the .pas file)
# is relative to this script, so make the script independent of the directory
# it is invoked from.
cd "$SCRIPT_DIR"

if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  rm kitten-nano-en-v0_1-fp16.tar.bz2
fi

# Please see ../portaudio-test/README.md
# for how to install portaudio on macOS.
# NOTE: the Cellar path below is specific to a Homebrew install of
# portaudio 19.7.0; adjust it if your portaudio lives elsewhere.
fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
  ./kitten-en-playback.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add the current directory to the library search path.
export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./kitten-en-playback
...
...
pascal-api-examples/tts/run-kitten-en.sh
0 → 100755
查看文件 @
3c25a19
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# downloads the KittenTTS model if it is missing, compiles ./kitten-en.pas
# with fpc and runs the resulting program.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Every relative path below (../../build, the model directory, the .pas file)
# is relative to this script, so make the script independent of the directory
# it is invoked from.
cd "$SCRIPT_DIR"

if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  rm kitten-nano-en-v0_1-fp16.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  ./kitten-en.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add the current directory to the library search path.
export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./kitten-en
...
...
sherpa-onnx/pascal-api/sherpa_onnx.pas
查看文件 @
3c25a19
...
...
@@ -90,6 +90,17 @@ type
class
operator
Initialize(
{
$IFDEF
FPC
}
var
{
$ELSE
}
out
{
$ENDIF
}
Dest:
TSherpaOnnxOfflineTtsKokoroModelConfig);
end;
TSherpaOnnxOfflineTtsKittenModelConfig = record
    { Paths to the model file, the voices file, the tokens file and the
      data directory (the examples point DataDir at an espeak-ng-data
      directory). }
    Model, Voices, Tokens, DataDir: AnsiString;

    { Set to 1.0 by the Initialize operator. }
    LengthScale: Single;

    { Returns a human-readable dump of all fields. }
    function ToString: AnsiString;

    class operator Initialize({$IFDEF FPC} var {$ELSE} out {$ENDIF} Dest: TSherpaOnnxOfflineTtsKittenModelConfig);
  end;
TSherpaOnnxOfflineTtsModelConfig
=
record
Vits:
TSherpaOnnxOfflineTtsVitsModelConfig;
NumThreads:
Integer;
...
...
@@ -97,6 +108,7 @@ type
Provider:
AnsiString;
Matcha:
TSherpaOnnxOfflineTtsMatchaModelConfig;
Kokoro:
TSherpaOnnxOfflineTtsKokoroModelConfig;
Kitten:
TSherpaOnnxOfflineTtsKittenModelConfig;
function
ToString:
AnsiString;
class
operator
Initialize(
{
$IFDEF
FPC
}
var
{
$ELSE
}
out
{
$ENDIF
}
Dest:
TSherpaOnnxOfflineTtsModelConfig);
...
...
@@ -913,6 +925,14 @@ type
Lang:
PAnsiChar;
end;
{ Low-level counterpart of TSherpaOnnxOfflineTtsKittenModelConfig using
    PAnsiChar/cfloat fields.
    NOTE(review): presumably mirrors the C API struct of the same name, in
    which case the field order and types must not change - confirm against
    the C header. }
  SherpaOnnxOfflineTtsKittenModelConfig = record
    Model, Voices, Tokens, DataDir: PAnsiChar;
    LengthScale: cfloat;
  end;
SherpaOnnxOfflineTtsModelConfig
=
record
Vits:
SherpaOnnxOfflineTtsVitsModelConfig;
NumThreads:
cint
32
;
...
...
@@ -920,6 +940,7 @@ type
Provider:
PAnsiChar;
Matcha:
SherpaOnnxOfflineTtsMatchaModelConfig;
Kokoro:
SherpaOnnxOfflineTtsKokoroModelConfig;
Kitten:
SherpaOnnxOfflineTtsKittenModelConfig;
end;
SherpaOnnxOfflineTtsConfig
=
record
...
...
@@ -1340,7 +1361,7 @@ begin
'ModelType
:=
%s,
'
+
'ModelingUnit
:=
%s,
'
+
'BpeVocab
:=
%s,
'
+
'NemoCtc
:=
%s',
'NemoCtc
:=
%s
)
',
[
Self.Transducer.ToString
,
Self.Paraformer.ToString
,
Self.Zipformer
2
Ctc.ToString
,
Self.Tokens
,
Self.NumThreads
,
Self.Provider
,
Self.Debug.ToString
,
...
...
@@ -2298,6 +2319,23 @@ begin
Dest.LengthScale
:=
1.0
;
end;
{ Returns a one-line, human-readable representation of the config, listing
  every field; LengthScale is printed with two decimals. }
function TSherpaOnnxOfflineTtsKittenModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxOfflineTtsKittenModelConfig(' +
    'Model := %s, ' +
    'Voices := %s, ' +
    'Tokens := %s, ' +
    'DataDir := %s, ' +
    'LengthScale := %.2f' +
    ')';
begin
  Result := Format(Layout,
    [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale]);
end;
{ Management operator: gives a new config record its default LengthScale.
  No other field is assigned here. }
class operator TSherpaOnnxOfflineTtsKittenModelConfig.Initialize({$IFDEF FPC} var {$ELSE} out {$ENDIF} Dest: TSherpaOnnxOfflineTtsKittenModelConfig);
const
  DefaultLengthScale = 1.0;
begin
  Dest.LengthScale := DefaultLengthScale;
end;
function
TSherpaOnnxOfflineTtsModelConfig.ToString:
AnsiString;
begin
Result
:=
Format('TSherpaOnnxOfflineTtsModelConfig('
+
...
...
@@ -2306,10 +2344,11 @@ begin
'Debug
:=
%s,
'
+
'Provider
:=
%s,
'
+
'Matcha
:=
%s,
'
+
'Kokoro
:=
%s'
+
'Kokoro
:=
%s,
'
+
'Kitten
:=
%s'
+
')',
[
Self.Vits.ToString
,
Self.NumThreads
,
Self.Debug.ToString
,
Self.Provider
,
Self.Matcha.ToString
,
Self.Kokoro.ToString
Self.Matcha.ToString
,
Self.Kokoro.ToString
,
Self.Kitten.ToString
]
);
end;
...
...
@@ -2373,6 +2412,12 @@ begin
C.Model.Kokoro.Lexicon
:=
PAnsiChar(Config.Model.Kokoro.Lexicon);
C.Model.Kokoro.Lang
:=
PAnsiChar(Config.Model.Kokoro.Lang);
C.Model.Kitten.Model
:=
PAnsiChar(Config.Model.Kitten.Model);
C.Model.Kitten.Voices
:=
PAnsiChar(Config.Model.Kitten.Voices);
C.Model.Kitten.Tokens
:=
PAnsiChar(Config.Model.Kitten.Tokens);
C.Model.Kitten.DataDir
:=
PAnsiChar(Config.Model.Kitten.DataDir);
C.Model.Kitten.LengthScale
:=
Config.Model.Kitten.LengthScale;
C.Model.NumThreads
:=
Config.Model.NumThreads;
C.Model.Provider
:=
PAnsiChar(Config.Model.Provider);
C.Model.Debug
:=
Ord(Config.Model.Debug);
...
...
请
注册
或
登录
后发表评论