Fangjun Kuang
Committed by GitHub

Add Kotlin and Java API for KittenTTS (#2461)

... ... @@ -309,12 +309,14 @@ jobs:
run: |
cd ./java-api-examples
./run-non-streaming-tts-kitten-en.sh
./run-non-streaming-tts-kokoro-zh-en.sh
./run-non-streaming-tts-kokoro-en.sh
./run-non-streaming-tts-matcha-zh.sh
./run-non-streaming-tts-matcha-en.sh
ls -lh
rm -rf kitten-nano-en-*
rm -rf kokoro-multi-*
rm -rf kokoro-en-*
... ...
... ... @@ -144,3 +144,4 @@ sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
dict
*.npz
voices.bin
kitten-nano-en-v0_1-fp16
... ...
... ... @@ -40,7 +40,7 @@ public class NonStreamingTtsCoquiDe {
String waveFilename = "tts-coqui-de.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
// Copyright 2025 Xiaomi Corporation
// This file shows how to use a KittenTTS English model
// to convert text to speech
import com.k2fsa.sherpa.onnx.*;
/**
 * Example program: converts a fixed English sentence to speech with a KittenTTS model, saves
 * the result to {@code tts-kitten-en.wav} and prints timing statistics.
 */
public class NonStreamingTtsKittenEn {
  /**
   * Builds the TTS configuration, generates audio for a fixed text, writes the wave file and
   * prints the elapsed time, audio duration and real-time factor.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    // please visit
    // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
    // to download model files
    String model = "./kitten-nano-en-v0_1-fp16/model.fp16.onnx";
    String voices = "./kitten-nano-en-v0_1-fp16/voices.bin";
    String tokens = "./kitten-nano-en-v0_1-fp16/tokens.txt";
    String dataDir = "./kitten-nano-en-v0_1-fp16/espeak-ng-data";
    String text =
        "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
            + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
            + " businessman, an official, or a scholar.";

    OfflineTtsKittenModelConfig kittenModelConfig =
        OfflineTtsKittenModelConfig.builder()
            .setModel(model)
            .setVoices(voices)
            .setTokens(tokens)
            .setDataDir(dataDir)
            .build();

    OfflineTtsModelConfig modelConfig =
        OfflineTtsModelConfig.builder()
            .setKitten(kittenModelConfig)
            .setNumThreads(2)
            .setDebug(true)
            .build();

    OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
    OfflineTts tts = new OfflineTts(config);

    // Everything that uses tts lives inside try/finally so that release() is
    // still called when generation or saving throws.
    try {
      int sid = 7;
      float speed = 1.0f;

      long start = System.currentTimeMillis();
      GeneratedAudio audio = tts.generate(text, sid, speed);
      long stop = System.currentTimeMillis();

      float timeElapsedSeconds = (stop - start) / 1000.0f;
      // Number of samples divided by samples-per-second gives the duration in seconds.
      float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
      // Processing time relative to the length of the generated audio.
      float realTimeFactor = timeElapsedSeconds / audioDuration;

      String waveFilename = "tts-kitten-en.wav";
      audio.save(waveFilename);

      System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
      System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
      System.out.printf("-- real-time factor (RTF): %.3f\n", realTimeFactor);
      System.out.printf("-- text: %s\n", text);
      System.out.printf("-- Saved to %s\n", waveFilename);
    } finally {
      tts.release();
    }
  }
}
... ...
... ... @@ -50,7 +50,7 @@ public class NonStreamingTtsKokoroEn {
String waveFilename = "tts-kokoro-en.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
... ... @@ -54,7 +54,7 @@ public class NonStreamingTtsKokoroZhEn {
String waveFilename = "tts-kokoro-zh-en.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
... ... @@ -50,7 +50,7 @@ public class NonStreamingTtsMatchaEn {
String waveFilename = "tts-matcha-en.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
... ... @@ -56,7 +56,7 @@ public class NonStreamingTtsMatchaZh {
String waveFilename = "tts-matcha-zh.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
... ... @@ -48,7 +48,7 @@ public class NonStreamingTtsPiperEn {
String waveFilename = "tts-piper-en.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
... ... @@ -176,7 +176,7 @@ public class NonStreamingTtsPiperEn {
String waveFilename = "tts-piper-en.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
... ... @@ -54,7 +54,7 @@ public class NonStreamingTtsPiperEn {
String waveFilename = "tts-vits-zh.wav";
audio.save(waveFilename);
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
System.out.printf("-- text: %s\n", text);
System.out.printf("-- Saved to %s\n", waveFilename);
... ...
#!/usr/bin/env bash
# Runs the NonStreamingTtsKittenEn Java example.
#
# Before running it, this script builds the JNI library and the Java API jar
# and downloads the KittenTTS model, each only if it is not already present.
# It uses paths relative to the current directory (../build, ../sherpa-onnx).

set -ex

# Build the JNI shared library unless a .dylib or .so build already exists.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
# to download more models
if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  rm kitten-nano-en-v0_1-fp16.tar.bz2
fi

# $PWD is quoted so that a working directory containing spaces is passed to
# java as a single argument instead of being word-split.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsKittenEn.java
... ...
... ... @@ -140,6 +140,12 @@ function testTts() {
rm kokoro-en-v0_19.tar.bz2
fi
if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
tar xf kitten-nano-en-v0_1-fp16.tar.bz2
rm kitten-nano-en-v0_1-fp16.tar.bz2
fi
out_filename=test_tts.jar
kotlinc-jvm -include-runtime -d $out_filename \
test_tts.kt \
... ... @@ -477,7 +483,7 @@ function testOfflineNeMoCanary() {
java -Djava.library.path=../build/lib -jar $out_filename
}
# testVersion
testVersion
testOfflineNeMoCanary
testOfflineSenseVoiceWithHr
... ...
... ... @@ -5,6 +5,7 @@ fun main() {
testMatcha()
testKokoroEn()
testKokoroZhEn()
testKittenEn()
}
fun testKokoroZhEn() {
... ... @@ -96,6 +97,27 @@ fun testVits() {
println("Saved to test-en.wav")
}
/**
 * Generates speech for a short English sentence with a KittenTTS model and
 * saves it to `test-kitten-en.wav`.
 *
 * The model files are read from `./kitten-nano-en-v0_1-fp16` relative to the
 * working directory.
 */
fun testKittenEn() {
  // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  val config = OfflineTtsConfig(
    model=OfflineTtsModelConfig(
      kitten=OfflineTtsKittenModelConfig(
        model="./kitten-nano-en-v0_1-fp16/model.fp16.onnx",
        voices="./kitten-nano-en-v0_1-fp16/voices.bin",
        tokens="./kitten-nano-en-v0_1-fp16/tokens.txt",
        dataDir="./kitten-nano-en-v0_1-fp16/espeak-ng-data",
      ),
      numThreads=2,
      debug=true,
    ),
  )
  val tts = OfflineTts(config=config)
  // release() runs in finally so it is not skipped when generation or saving throws.
  try {
    val audio = tts.generateWithCallback(text="How are you doing today?", sid=7, callback=::callback)
    audio.save(filename="test-kitten-en.wav")
  } finally {
    tts.release()
  }
  println("Saved to test-kitten-en.wav")
}
/*
1. Unzip test_tts.jar
2.
... ...
... ... @@ -43,6 +43,7 @@ java_files += OfflineRecognizerResult.java
java_files += OfflineStream.java
java_files += OfflineRecognizer.java
java_files += OfflineTtsKittenModelConfig.java
java_files += OfflineTtsKokoroModelConfig.java
java_files += OfflineTtsMatchaModelConfig.java
java_files += OfflineTtsVitsModelConfig.java
... ...
// Copyright 2025 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Immutable configuration for a KittenTTS model, created through {@link Builder}.
 *
 * <p>The private field names of this class are looked up by name from native code, so they must
 * stay exactly as they are.
 */
public class OfflineTtsKittenModelConfig {
  private final String model;
  private final String voices;
  private final String tokens;
  private final String dataDir;
  private final float lengthScale;

  /** Collects the settings for an {@link OfflineTtsKittenModelConfig}. */
  public static class Builder {
    // Strings default to empty and the length scale defaults to 1.0.
    private String model = "";
    private String voices = "";
    private String tokens = "";
    private String dataDir = "";
    private float lengthScale = 1.0f;

    /** Sets the model path and returns this builder for chaining. */
    public Builder setModel(String model) {
      this.model = model;
      return this;
    }

    /** Sets the voices file path and returns this builder for chaining. */
    public Builder setVoices(String voices) {
      this.voices = voices;
      return this;
    }

    /** Sets the tokens file path and returns this builder for chaining. */
    public Builder setTokens(String tokens) {
      this.tokens = tokens;
      return this;
    }

    /** Sets the data directory and returns this builder for chaining. */
    public Builder setDataDir(String dataDir) {
      this.dataDir = dataDir;
      return this;
    }

    /** Sets the length scale and returns this builder for chaining. */
    public Builder setLengthScale(float lengthScale) {
      this.lengthScale = lengthScale;
      return this;
    }

    /** Creates a config holding a snapshot of the values currently set on this builder. */
    public OfflineTtsKittenModelConfig build() {
      return new OfflineTtsKittenModelConfig(this);
    }
  }

  // Copies every value out of the builder; instances are created only via Builder.build().
  private OfflineTtsKittenModelConfig(Builder source) {
    model = source.model;
    voices = source.voices;
    tokens = source.tokens;
    dataDir = source.dataDir;
    lengthScale = source.lengthScale;
  }

  /** Returns a new builder with default values. */
  public static Builder builder() {
    return new Builder();
  }

  /** Returns the configured model path. */
  public String getModel() {
    return this.model;
  }

  /** Returns the configured voices file path. */
  public String getVoices() {
    return this.voices;
  }

  /** Returns the configured tokens file path. */
  public String getTokens() {
    return this.tokens;
  }

  /** Returns the configured data directory. */
  public String getDataDir() {
    return this.dataDir;
  }

  /** Returns the configured length scale. */
  public float getLengthScale() {
    return this.lengthScale;
  }
}
\ No newline at end of file
... ...
... ... @@ -6,6 +6,7 @@ public class OfflineTtsModelConfig {
private final OfflineTtsVitsModelConfig vits;
private final OfflineTtsMatchaModelConfig matcha;
private final OfflineTtsKokoroModelConfig kokoro;
private final OfflineTtsKittenModelConfig kitten;
private final int numThreads;
private final boolean debug;
private final String provider;
... ... @@ -14,6 +15,7 @@ public class OfflineTtsModelConfig {
this.vits = builder.vits;
this.matcha = builder.matcha;
this.kokoro = builder.kokoro;
this.kitten = builder.kitten;
this.numThreads = builder.numThreads;
this.debug = builder.debug;
this.provider = builder.provider;
... ... @@ -35,10 +37,15 @@ public class OfflineTtsModelConfig {
return kokoro;
}
public OfflineTtsKittenModelConfig getKitten() {
return kitten;
}
public static class Builder {
private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build();
private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build();
private OfflineTtsKokoroModelConfig kokoro = OfflineTtsKokoroModelConfig.builder().build();
private OfflineTtsKittenModelConfig kitten = OfflineTtsKittenModelConfig.builder().build();
private int numThreads = 1;
private boolean debug = true;
private String provider = "cpu";
... ... @@ -62,6 +69,11 @@ public class OfflineTtsModelConfig {
return this;
}
public Builder setKitten(OfflineTtsKittenModelConfig kitten) {
this.kitten = kitten;
return this;
}
public Builder setNumThreads(int numThreads) {
this.numThreads = numThreads;
return this;
... ... @@ -77,4 +89,4 @@ public class OfflineTtsModelConfig {
return this;
}
}
}
}
\ No newline at end of file
... ...
... ... @@ -166,6 +166,39 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
fid = env->GetFieldID(kokoro_cls, "lengthScale", "F");
ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid);
// kitten
fid = env->GetFieldID(model_config_cls, "kitten",
"Lcom/k2fsa/sherpa/onnx/OfflineTtsKittenModelConfig;");
jobject kitten = env->GetObjectField(model, fid);
jclass kitten_cls = env->GetObjectClass(kitten);
fid = env->GetFieldID(kitten_cls, "model", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(kitten, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.kitten.model = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(kitten_cls, "voices", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(kitten, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.kitten.voices = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(kitten_cls, "tokens", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(kitten, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.kitten.tokens = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(kitten_cls, "dataDir", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(kitten, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.kitten.data_dir = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(kitten_cls, "lengthScale", "F");
ans.model.kitten.length_scale = env->GetFloatField(kitten, fid);
fid = env->GetFieldID(model_config_cls, "numThreads", "I");
ans.model.num_threads = env->GetIntField(model, fid);
... ...
... ... @@ -36,10 +36,19 @@ data class OfflineTtsKokoroModelConfig(
var lengthScale: Float = 1.0f,
)
/**
 * Configuration for a KittenTTS model.
 *
 * Constructing it with no arguments yields the all-defaults value that
 * callers pass when no KittenTTS model is selected.
 *
 * NOTE(review): native code looks up fields named `model`, `voices`,
 * `tokens`, `dataDir` and `lengthScale` on a class with this name; presumably
 * that applies to this Kotlin class too, so confirm before renaming any
 * property.
 *
 * @property model Path to the model file, e.g. `model.fp16.onnx`.
 * @property voices Path to the voices file, e.g. `voices.bin`.
 * @property tokens Path to the tokens file, e.g. `tokens.txt`.
 * @property dataDir Path to the data directory, e.g. `espeak-ng-data`.
 * @property lengthScale Defaults to 1.0; presumably scales the duration of
 *   the generated speech (TODO confirm).
 */
data class OfflineTtsKittenModelConfig(
var model: String = "",
var voices: String = "",
var tokens: String = "",
var dataDir: String = "",
var lengthScale: Float = 1.0f,
)
data class OfflineTtsModelConfig(
var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(),
var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(),
var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(),
var kitten: OfflineTtsKittenModelConfig = OfflineTtsKittenModelConfig(),
var numThreads: Int = 1,
var debug: Boolean = false,
var provider: String = "cpu",
... ... @@ -189,13 +198,14 @@ fun getOfflineTtsConfig(
modelName: String, // for VITS
acousticModelName: String, // for Matcha
vocoder: String, // for Matcha
voices: String, // for Kokoro
voices: String, // for Kokoro or kitten
lexicon: String,
dataDir: String,
dictDir: String,
ruleFsts: String,
ruleFars: String,
numThreads: Int? = null
numThreads: Int? = null,
isKitten: Boolean = false
): OfflineTtsConfig {
// For Matcha TTS, please set
// acousticModelName, vocoder
... ... @@ -203,13 +213,16 @@ fun getOfflineTtsConfig(
// For Kokoro TTS, please set
// modelName, voices
// For Kitten TTS, please set
// modelName, voices, isKitten
// For VITS, please set
// modelName
val numberOfThreads = if (numThreads != null) {
numThreads
} else if (voices.isNotEmpty()) {
// for Kokoro TTS models, we use more threads
// for Kokoro and Kitten TTS models, we use more threads
4
} else {
2
... ... @@ -252,7 +265,7 @@ fun getOfflineTtsConfig(
OfflineTtsMatchaModelConfig()
}
val kokoro = if (voices.isNotEmpty()) {
val kokoro = if (voices.isNotEmpty() && !isKitten) {
OfflineTtsKokoroModelConfig(
model = "$modelDir/$modelName",
voices = "$modelDir/$voices",
... ... @@ -269,11 +282,23 @@ fun getOfflineTtsConfig(
OfflineTtsKokoroModelConfig()
}
val kitten = if (isKitten) {
OfflineTtsKittenModelConfig(
model = "$modelDir/$modelName",
voices = "$modelDir/$voices",
tokens = "$modelDir/tokens.txt",
dataDir = dataDir,
)
} else {
OfflineTtsKittenModelConfig()
}
return OfflineTtsConfig(
model = OfflineTtsModelConfig(
vits = vits,
matcha = matcha,
kokoro = kokoro,
kitten = kitten,
numThreads = numberOfThreads,
debug = true,
provider = "cpu",
... ...