Fangjun Kuang
Committed by GitHub

Add Kotlin and Java API for KittenTTS (#2461)

@@ -309,12 +309,14 @@ jobs: @@ -309,12 +309,14 @@ jobs:
309 run: | 309 run: |
310 cd ./java-api-examples 310 cd ./java-api-examples
311 311
  312 + ./run-non-streaming-tts-kitten-en.sh
312 ./run-non-streaming-tts-kokoro-zh-en.sh 313 ./run-non-streaming-tts-kokoro-zh-en.sh
313 ./run-non-streaming-tts-kokoro-en.sh 314 ./run-non-streaming-tts-kokoro-en.sh
314 ./run-non-streaming-tts-matcha-zh.sh 315 ./run-non-streaming-tts-matcha-zh.sh
315 ./run-non-streaming-tts-matcha-en.sh 316 ./run-non-streaming-tts-matcha-en.sh
316 ls -lh 317 ls -lh
317 318
  319 + rm -rf kitten-nano-en-*
318 rm -rf kokoro-multi-* 320 rm -rf kokoro-multi-*
319 rm -rf kokoro-en-* 321 rm -rf kokoro-en-*
320 322
@@ -144,3 +144,4 @@ sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 @@ -144,3 +144,4 @@ sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
144 dict 144 dict
145 *.npz 145 *.npz
146 voices.bin 146 voices.bin
  147 +kitten-nano-en-v0_1-fp16
@@ -40,7 +40,7 @@ public class NonStreamingTtsCoquiDe { @@ -40,7 +40,7 @@ public class NonStreamingTtsCoquiDe {
40 String waveFilename = "tts-coqui-de.wav"; 40 String waveFilename = "tts-coqui-de.wav";
41 audio.save(waveFilename); 41 audio.save(waveFilename);
42 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 42 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
43 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 43 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
44 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 44 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
45 System.out.printf("-- text: %s\n", text); 45 System.out.printf("-- text: %s\n", text);
46 System.out.printf("-- Saved to %s\n", waveFilename); 46 System.out.printf("-- Saved to %s\n", waveFilename);
  1 +// Copyright 2025 Xiaomi Corporation
  2 +
  3 +// This file shows how to use a KittenTTS English model
  4 +// to convert text to speech
  5 +import com.k2fsa.sherpa.onnx.*;
  6 +
  7 +public class NonStreamingTtsKittenEn {
  8 + public static void main(String[] args) {
  9 + // please visit
  10 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
  11 + // to download model files
  12 + String model = "./kitten-nano-en-v0_1-fp16/model.fp16.onnx";
  13 + String voices = "./kitten-nano-en-v0_1-fp16/voices.bin";
  14 + String tokens = "./kitten-nano-en-v0_1-fp16/tokens.txt";
  15 + String dataDir = "./kitten-nano-en-v0_1-fp16/espeak-ng-data";
  16 + String text =
  17 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
  18 + + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
  19 + + " businessman, an official, or a scholar.";
  20 +
  21 + OfflineTtsKittenModelConfig kittenModelConfig =
  22 + OfflineTtsKittenModelConfig.builder()
  23 + .setModel(model)
  24 + .setVoices(voices)
  25 + .setTokens(tokens)
  26 + .setDataDir(dataDir)
  27 + .build();
  28 +
  29 + OfflineTtsModelConfig modelConfig =
  30 + OfflineTtsModelConfig.builder()
  31 + .setKitten(kittenModelConfig)
  32 + .setNumThreads(2)
  33 + .setDebug(true)
  34 + .build();
  35 +
  36 + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
  37 + OfflineTts tts = new OfflineTts(config);
  38 +
  39 + int sid = 7;
  40 + float speed = 1.0f;
  41 + long start = System.currentTimeMillis();
  42 + GeneratedAudio audio = tts.generate(text, sid, speed);
  43 + long stop = System.currentTimeMillis();
  44 +
  45 + float timeElapsedSeconds = (stop - start) / 1000.0f;
  46 +
  47 + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
  48 + float real_time_factor = timeElapsedSeconds / audioDuration;
  49 +
  50 + String waveFilename = "tts-kitten-en.wav";
  51 + audio.save(waveFilename);
  52 + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
  53 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
  54 + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
  55 + System.out.printf("-- text: %s\n", text);
  56 + System.out.printf("-- Saved to %s\n", waveFilename);
  57 +
  58 + tts.release();
  59 + }
  60 +}
@@ -50,7 +50,7 @@ public class NonStreamingTtsKokoroEn { @@ -50,7 +50,7 @@ public class NonStreamingTtsKokoroEn {
50 String waveFilename = "tts-kokoro-en.wav"; 50 String waveFilename = "tts-kokoro-en.wav";
51 audio.save(waveFilename); 51 audio.save(waveFilename);
52 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 52 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
53 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 53 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
54 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 54 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
55 System.out.printf("-- text: %s\n", text); 55 System.out.printf("-- text: %s\n", text);
56 System.out.printf("-- Saved to %s\n", waveFilename); 56 System.out.printf("-- Saved to %s\n", waveFilename);
@@ -54,7 +54,7 @@ public class NonStreamingTtsKokoroZhEn { @@ -54,7 +54,7 @@ public class NonStreamingTtsKokoroZhEn {
54 String waveFilename = "tts-kokoro-zh-en.wav"; 54 String waveFilename = "tts-kokoro-zh-en.wav";
55 audio.save(waveFilename); 55 audio.save(waveFilename);
56 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 56 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
57 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 57 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
58 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 58 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
59 System.out.printf("-- text: %s\n", text); 59 System.out.printf("-- text: %s\n", text);
60 System.out.printf("-- Saved to %s\n", waveFilename); 60 System.out.printf("-- Saved to %s\n", waveFilename);
@@ -50,7 +50,7 @@ public class NonStreamingTtsMatchaEn { @@ -50,7 +50,7 @@ public class NonStreamingTtsMatchaEn {
50 String waveFilename = "tts-matcha-en.wav"; 50 String waveFilename = "tts-matcha-en.wav";
51 audio.save(waveFilename); 51 audio.save(waveFilename);
52 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 52 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
53 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 53 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
54 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 54 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
55 System.out.printf("-- text: %s\n", text); 55 System.out.printf("-- text: %s\n", text);
56 System.out.printf("-- Saved to %s\n", waveFilename); 56 System.out.printf("-- Saved to %s\n", waveFilename);
@@ -56,7 +56,7 @@ public class NonStreamingTtsMatchaZh { @@ -56,7 +56,7 @@ public class NonStreamingTtsMatchaZh {
56 String waveFilename = "tts-matcha-zh.wav"; 56 String waveFilename = "tts-matcha-zh.wav";
57 audio.save(waveFilename); 57 audio.save(waveFilename);
58 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 58 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
59 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 59 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
60 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 60 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
61 System.out.printf("-- text: %s\n", text); 61 System.out.printf("-- text: %s\n", text);
62 System.out.printf("-- Saved to %s\n", waveFilename); 62 System.out.printf("-- Saved to %s\n", waveFilename);
@@ -48,7 +48,7 @@ public class NonStreamingTtsPiperEn { @@ -48,7 +48,7 @@ public class NonStreamingTtsPiperEn {
48 String waveFilename = "tts-piper-en.wav"; 48 String waveFilename = "tts-piper-en.wav";
49 audio.save(waveFilename); 49 audio.save(waveFilename);
50 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 50 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
51 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 51 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
52 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 52 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
53 System.out.printf("-- text: %s\n", text); 53 System.out.printf("-- text: %s\n", text);
54 System.out.printf("-- Saved to %s\n", waveFilename); 54 System.out.printf("-- Saved to %s\n", waveFilename);
@@ -176,7 +176,7 @@ public class NonStreamingTtsPiperEn { @@ -176,7 +176,7 @@ public class NonStreamingTtsPiperEn {
176 String waveFilename = "tts-piper-en.wav"; 176 String waveFilename = "tts-piper-en.wav";
177 audio.save(waveFilename); 177 audio.save(waveFilename);
178 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 178 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
179 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 179 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
180 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 180 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
181 System.out.printf("-- text: %s\n", text); 181 System.out.printf("-- text: %s\n", text);
182 System.out.printf("-- Saved to %s\n", waveFilename); 182 System.out.printf("-- Saved to %s\n", waveFilename);
@@ -54,7 +54,7 @@ public class NonStreamingTtsPiperEn { @@ -54,7 +54,7 @@ public class NonStreamingTtsPiperEn {
54 String waveFilename = "tts-vits-zh.wav"; 54 String waveFilename = "tts-vits-zh.wav";
55 audio.save(waveFilename); 55 audio.save(waveFilename);
56 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); 56 System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
57 - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); 57 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
58 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); 58 System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
59 System.out.printf("-- text: %s\n", text); 59 System.out.printf("-- text: %s\n", text);
60 System.out.printf("-- Saved to %s\n", waveFilename); 60 System.out.printf("-- Saved to %s\n", waveFilename);
#!/usr/bin/env bash
#
# Runs the KittenTTS English Java example (NonStreamingTtsKittenEn.java).
#
# Steps:
#   1. Build the JNI shared library under ../build if it is not there yet.
#   2. Build the Java API jar if it is not there yet.
#   3. Download and unpack the kitten-nano-en-v0_1-fp16 model if it is missing.
#   4. Launch the example with the JNI library directory on java.library.path.

# -e: stop at the first failing command; -x: echo each command before running it.
set -ex

# Skip the native build when either the macOS (.dylib) or the Linux (.so) library already exists.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
# to download more models

if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  rm kitten-nano-en-v0_1-fp16.tar.bz2
fi

# Quote the library path so a working directory containing spaces is passed as one argument.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsKittenEn.java
@@ -140,6 +140,12 @@ function testTts() { @@ -140,6 +140,12 @@ function testTts() {
140 rm kokoro-en-v0_19.tar.bz2 140 rm kokoro-en-v0_19.tar.bz2
141 fi 141 fi
142 142
  143 + if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then
  144 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  145 + tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  146 + rm kitten-nano-en-v0_1-fp16.tar.bz2
  147 + fi
  148 +
143 out_filename=test_tts.jar 149 out_filename=test_tts.jar
144 kotlinc-jvm -include-runtime -d $out_filename \ 150 kotlinc-jvm -include-runtime -d $out_filename \
145 test_tts.kt \ 151 test_tts.kt \
@@ -477,7 +483,7 @@ function testOfflineNeMoCanary() { @@ -477,7 +483,7 @@ function testOfflineNeMoCanary() {
477 java -Djava.library.path=../build/lib -jar $out_filename 483 java -Djava.library.path=../build/lib -jar $out_filename
478 } 484 }
479 485
480 -# testVersion 486 +testVersion
481 487
482 testOfflineNeMoCanary 488 testOfflineNeMoCanary
483 testOfflineSenseVoiceWithHr 489 testOfflineSenseVoiceWithHr
@@ -5,6 +5,7 @@ fun main() { @@ -5,6 +5,7 @@ fun main() {
5 testMatcha() 5 testMatcha()
6 testKokoroEn() 6 testKokoroEn()
7 testKokoroZhEn() 7 testKokoroZhEn()
  8 + testKittenEn()
8 } 9 }
9 10
10 fun testKokoroZhEn() { 11 fun testKokoroZhEn() {
@@ -96,6 +97,27 @@ fun testVits() { @@ -96,6 +97,27 @@ fun testVits() {
96 println("Saved to test-en.wav") 97 println("Saved to test-en.wav")
97 } 98 }
98 99
// Generates speech for a short English sentence with a KittenTTS model
// and saves it to test-kitten-en.wav.
fun testKittenEn() {
  // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  val config = OfflineTtsConfig(
    model=OfflineTtsModelConfig(
      kitten=OfflineTtsKittenModelConfig(
        model="./kitten-nano-en-v0_1-fp16/model.fp16.onnx",
        voices="./kitten-nano-en-v0_1-fp16/voices.bin",
        tokens="./kitten-nano-en-v0_1-fp16/tokens.txt",
        dataDir="./kitten-nano-en-v0_1-fp16/espeak-ng-data",
      ),
      numThreads=2,
      debug=true,
    ),
  )
  val tts = OfflineTts(config=config)
  try {
    val audio = tts.generateWithCallback(text="How are you doing today?", sid=7, callback=::callback)
    audio.save(filename="test-kitten-en.wav")
  } finally {
    // Release the TTS object even if generation or saving throws.
    tts.release()
  }
  println("Saved to test-kitten-en.wav")
}
  120 +
99 /* 121 /*
100 1. Unzip test_tts.jar 122 1. Unzip test_tts.jar
101 2. 123 2.
@@ -43,6 +43,7 @@ java_files += OfflineRecognizerResult.java @@ -43,6 +43,7 @@ java_files += OfflineRecognizerResult.java
43 java_files += OfflineStream.java 43 java_files += OfflineStream.java
44 java_files += OfflineRecognizer.java 44 java_files += OfflineRecognizer.java
45 45
  46 +java_files += OfflineTtsKittenModelConfig.java
46 java_files += OfflineTtsKokoroModelConfig.java 47 java_files += OfflineTtsKokoroModelConfig.java
47 java_files += OfflineTtsMatchaModelConfig.java 48 java_files += OfflineTtsMatchaModelConfig.java
48 java_files += OfflineTtsVitsModelConfig.java 49 java_files += OfflineTtsVitsModelConfig.java
  1 +// Copyright 2025 Xiaomi Corporation
  2 +package com.k2fsa.sherpa.onnx;
  3 +
/**
 * Immutable configuration for a KittenTTS model, created through {@link #builder()}.
 *
 * <p>The native JNI layer reads the private fields of this class by name ({@code model},
 * {@code voices}, {@code tokens}, {@code dataDir}, {@code lengthScale}), so the fields must not be
 * renamed or have their types changed without updating the native code as well.
 *
 * <p>The string fields are never {@code null}: each one defaults to the empty string and the
 * builder rejects {@code null}. The native layer converts every string field without a null
 * check, so a {@code null} value must not reach it.
 */
public class OfflineTtsKittenModelConfig {
    private final String model;
    private final String voices;
    private final String tokens;
    private final String dataDir;
    private final float lengthScale;

    private OfflineTtsKittenModelConfig(Builder builder) {
        this.model = builder.model;
        this.voices = builder.voices;
        this.tokens = builder.tokens;
        this.dataDir = builder.dataDir;
        this.lengthScale = builder.lengthScale;
    }

    /**
     * Creates a builder whose string fields are empty and whose length scale is {@code 1.0f}.
     *
     * @return a new builder
     */
    public static Builder builder() {
        return new Builder();
    }

    /** @return the value given to {@link Builder#setModel(String)}, or {@code ""} if unset */
    public String getModel() {
        return model;
    }

    /** @return the value given to {@link Builder#setVoices(String)}, or {@code ""} if unset */
    public String getVoices() {
        return voices;
    }

    /** @return the value given to {@link Builder#setTokens(String)}, or {@code ""} if unset */
    public String getTokens() {
        return tokens;
    }

    /** @return the value given to {@link Builder#setDataDir(String)}, or {@code ""} if unset */
    public String getDataDir() {
        return dataDir;
    }

    /** @return the length scale; {@code 1.0f} unless changed through the builder */
    public float getLengthScale() {
        return lengthScale;
    }

    /** Fluent builder for {@link OfflineTtsKittenModelConfig}. */
    public static class Builder {
        private String model = "";
        private String voices = "";
        private String tokens = "";
        private String dataDir = "";
        private float lengthScale = 1.0f;

        /**
         * Builds a configuration from the values set so far.
         *
         * @return a new immutable configuration
         */
        public OfflineTtsKittenModelConfig build() {
            return new OfflineTtsKittenModelConfig(this);
        }

        /**
         * @param model path to the model file
         * @return this builder
         * @throws NullPointerException if {@code model} is {@code null}
         */
        public Builder setModel(String model) {
            this.model = java.util.Objects.requireNonNull(model, "model");
            return this;
        }

        /**
         * @param voices path to the voices file
         * @return this builder
         * @throws NullPointerException if {@code voices} is {@code null}
         */
        public Builder setVoices(String voices) {
            this.voices = java.util.Objects.requireNonNull(voices, "voices");
            return this;
        }

        /**
         * @param tokens path to the tokens file
         * @return this builder
         * @throws NullPointerException if {@code tokens} is {@code null}
         */
        public Builder setTokens(String tokens) {
            this.tokens = java.util.Objects.requireNonNull(tokens, "tokens");
            return this;
        }

        /**
         * @param dataDir path to the data directory
         * @return this builder
         * @throws NullPointerException if {@code dataDir} is {@code null}
         */
        public Builder setDataDir(String dataDir) {
            this.dataDir = java.util.Objects.requireNonNull(dataDir, "dataDir");
            return this;
        }

        /**
         * @param lengthScale value copied into the native configuration's length scale
         * @return this builder
         */
        public Builder setLengthScale(float lengthScale) {
            this.lengthScale = lengthScale;
            return this;
        }
    }
}
@@ -6,6 +6,7 @@ public class OfflineTtsModelConfig { @@ -6,6 +6,7 @@ public class OfflineTtsModelConfig {
6 private final OfflineTtsVitsModelConfig vits; 6 private final OfflineTtsVitsModelConfig vits;
7 private final OfflineTtsMatchaModelConfig matcha; 7 private final OfflineTtsMatchaModelConfig matcha;
8 private final OfflineTtsKokoroModelConfig kokoro; 8 private final OfflineTtsKokoroModelConfig kokoro;
  9 + private final OfflineTtsKittenModelConfig kitten;
9 private final int numThreads; 10 private final int numThreads;
10 private final boolean debug; 11 private final boolean debug;
11 private final String provider; 12 private final String provider;
@@ -14,6 +15,7 @@ public class OfflineTtsModelConfig { @@ -14,6 +15,7 @@ public class OfflineTtsModelConfig {
14 this.vits = builder.vits; 15 this.vits = builder.vits;
15 this.matcha = builder.matcha; 16 this.matcha = builder.matcha;
16 this.kokoro = builder.kokoro; 17 this.kokoro = builder.kokoro;
  18 + this.kitten = builder.kitten;
17 this.numThreads = builder.numThreads; 19 this.numThreads = builder.numThreads;
18 this.debug = builder.debug; 20 this.debug = builder.debug;
19 this.provider = builder.provider; 21 this.provider = builder.provider;
@@ -35,10 +37,15 @@ public class OfflineTtsModelConfig { @@ -35,10 +37,15 @@ public class OfflineTtsModelConfig {
35 return kokoro; 37 return kokoro;
36 } 38 }
37 39
  40 + public OfflineTtsKittenModelConfig getKitten() {
  41 + return kitten;
  42 + }
  43 +
38 public static class Builder { 44 public static class Builder {
39 private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build(); 45 private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build();
40 private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build(); 46 private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build();
41 private OfflineTtsKokoroModelConfig kokoro = OfflineTtsKokoroModelConfig.builder().build(); 47 private OfflineTtsKokoroModelConfig kokoro = OfflineTtsKokoroModelConfig.builder().build();
  48 + private OfflineTtsKittenModelConfig kitten = OfflineTtsKittenModelConfig.builder().build();
42 private int numThreads = 1; 49 private int numThreads = 1;
43 private boolean debug = true; 50 private boolean debug = true;
44 private String provider = "cpu"; 51 private String provider = "cpu";
@@ -62,6 +69,11 @@ public class OfflineTtsModelConfig { @@ -62,6 +69,11 @@ public class OfflineTtsModelConfig {
62 return this; 69 return this;
63 } 70 }
64 71
  72 + public Builder setKitten(OfflineTtsKittenModelConfig kitten) {
  73 + this.kitten = kitten;
  74 + return this;
  75 + }
  76 +
65 public Builder setNumThreads(int numThreads) { 77 public Builder setNumThreads(int numThreads) {
66 this.numThreads = numThreads; 78 this.numThreads = numThreads;
67 return this; 79 return this;
@@ -77,4 +89,4 @@ public class OfflineTtsModelConfig { @@ -77,4 +89,4 @@ public class OfflineTtsModelConfig {
77 return this; 89 return this;
78 } 90 }
79 } 91 }
80 -} 92 +}
@@ -166,6 +166,39 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { @@ -166,6 +166,39 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
166 fid = env->GetFieldID(kokoro_cls, "lengthScale", "F"); 166 fid = env->GetFieldID(kokoro_cls, "lengthScale", "F");
167 ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid); 167 ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid);
168 168
  169 + // kitten
  170 + fid = env->GetFieldID(model_config_cls, "kitten",
  171 + "Lcom/k2fsa/sherpa/onnx/OfflineTtsKittenModelConfig;");
  172 + jobject kitten = env->GetObjectField(model, fid);
  173 + jclass kitten_cls = env->GetObjectClass(kitten);
  174 +
  175 + fid = env->GetFieldID(kitten_cls, "model", "Ljava/lang/String;");
  176 + s = (jstring)env->GetObjectField(kitten, fid);
  177 + p = env->GetStringUTFChars(s, nullptr);
  178 + ans.model.kitten.model = p;
  179 + env->ReleaseStringUTFChars(s, p);
  180 +
  181 + fid = env->GetFieldID(kitten_cls, "voices", "Ljava/lang/String;");
  182 + s = (jstring)env->GetObjectField(kitten, fid);
  183 + p = env->GetStringUTFChars(s, nullptr);
  184 + ans.model.kitten.voices = p;
  185 + env->ReleaseStringUTFChars(s, p);
  186 +
  187 + fid = env->GetFieldID(kitten_cls, "tokens", "Ljava/lang/String;");
  188 + s = (jstring)env->GetObjectField(kitten, fid);
  189 + p = env->GetStringUTFChars(s, nullptr);
  190 + ans.model.kitten.tokens = p;
  191 + env->ReleaseStringUTFChars(s, p);
  192 +
  193 + fid = env->GetFieldID(kitten_cls, "dataDir", "Ljava/lang/String;");
  194 + s = (jstring)env->GetObjectField(kitten, fid);
  195 + p = env->GetStringUTFChars(s, nullptr);
  196 + ans.model.kitten.data_dir = p;
  197 + env->ReleaseStringUTFChars(s, p);
  198 +
  199 + fid = env->GetFieldID(kitten_cls, "lengthScale", "F");
  200 + ans.model.kitten.length_scale = env->GetFloatField(kitten, fid);
  201 +
169 fid = env->GetFieldID(model_config_cls, "numThreads", "I"); 202 fid = env->GetFieldID(model_config_cls, "numThreads", "I");
170 ans.model.num_threads = env->GetIntField(model, fid); 203 ans.model.num_threads = env->GetIntField(model, fid);
171 204
@@ -36,10 +36,19 @@ data class OfflineTtsKokoroModelConfig( @@ -36,10 +36,19 @@ data class OfflineTtsKokoroModelConfig(
36 var lengthScale: Float = 1.0f, 36 var lengthScale: Float = 1.0f,
37 ) 37 )
38 38
/**
 * Settings for a KittenTTS model.
 *
 * Every property has a default, so an instance created with no arguments
 * has empty paths and a length scale of 1.0.
 *
 * NOTE(review): the property names presumably have to match what the native
 * layer looks up by name - confirm before renaming any of them.
 */
data class OfflineTtsKittenModelConfig(
    var model: String = "",
    var voices: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var lengthScale: Float = 1.0f,
)
  46 +
39 data class OfflineTtsModelConfig( 47 data class OfflineTtsModelConfig(
40 var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(), 48 var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(),
41 var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(), 49 var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(),
42 var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(), 50 var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(),
  51 + var kitten: OfflineTtsKittenModelConfig = OfflineTtsKittenModelConfig(),
43 var numThreads: Int = 1, 52 var numThreads: Int = 1,
44 var debug: Boolean = false, 53 var debug: Boolean = false,
45 var provider: String = "cpu", 54 var provider: String = "cpu",
@@ -189,13 +198,14 @@ fun getOfflineTtsConfig( @@ -189,13 +198,14 @@ fun getOfflineTtsConfig(
189 modelName: String, // for VITS 198 modelName: String, // for VITS
190 acousticModelName: String, // for Matcha 199 acousticModelName: String, // for Matcha
191 vocoder: String, // for Matcha 200 vocoder: String, // for Matcha
192 - voices: String, // for Kokoro 201 + voices: String, // for Kokoro or kitten
193 lexicon: String, 202 lexicon: String,
194 dataDir: String, 203 dataDir: String,
195 dictDir: String, 204 dictDir: String,
196 ruleFsts: String, 205 ruleFsts: String,
197 ruleFars: String, 206 ruleFars: String,
198 - numThreads: Int? = null 207 + numThreads: Int? = null,
  208 + isKitten: Boolean = false
199 ): OfflineTtsConfig { 209 ): OfflineTtsConfig {
200 // For Matcha TTS, please set 210 // For Matcha TTS, please set
201 // acousticModelName, vocoder 211 // acousticModelName, vocoder
@@ -203,13 +213,16 @@ fun getOfflineTtsConfig( @@ -203,13 +213,16 @@ fun getOfflineTtsConfig(
203 // For Kokoro TTS, please set 213 // For Kokoro TTS, please set
204 // modelName, voices 214 // modelName, voices
205 215
  216 + // For Kitten TTS, please set
  217 + // modelName, voices, isKitten
  218 +
206 // For VITS, please set 219 // For VITS, please set
207 // modelName 220 // modelName
208 221
209 val numberOfThreads = if (numThreads != null) { 222 val numberOfThreads = if (numThreads != null) {
210 numThreads 223 numThreads
211 } else if (voices.isNotEmpty()) { 224 } else if (voices.isNotEmpty()) {
212 - // for Kokoro TTS models, we use more threads 225 + // for Kokoro and Kitten TTS models, we use more threads
213 4 226 4
214 } else { 227 } else {
215 2 228 2
@@ -252,7 +265,7 @@ fun getOfflineTtsConfig( @@ -252,7 +265,7 @@ fun getOfflineTtsConfig(
252 OfflineTtsMatchaModelConfig() 265 OfflineTtsMatchaModelConfig()
253 } 266 }
254 267
255 - val kokoro = if (voices.isNotEmpty()) { 268 + val kokoro = if (voices.isNotEmpty() && !isKitten) {
256 OfflineTtsKokoroModelConfig( 269 OfflineTtsKokoroModelConfig(
257 model = "$modelDir/$modelName", 270 model = "$modelDir/$modelName",
258 voices = "$modelDir/$voices", 271 voices = "$modelDir/$voices",
@@ -269,11 +282,23 @@ fun getOfflineTtsConfig( @@ -269,11 +282,23 @@ fun getOfflineTtsConfig(
269 OfflineTtsKokoroModelConfig() 282 OfflineTtsKokoroModelConfig()
270 } 283 }
271 284
  285 + val kitten = if (isKitten) {
  286 + OfflineTtsKittenModelConfig(
  287 + model = "$modelDir/$modelName",
  288 + voices = "$modelDir/$voices",
  289 + tokens = "$modelDir/tokens.txt",
  290 + dataDir = dataDir,
  291 + )
  292 + } else {
  293 + OfflineTtsKittenModelConfig()
  294 + }
  295 +
272 return OfflineTtsConfig( 296 return OfflineTtsConfig(
273 model = OfflineTtsModelConfig( 297 model = OfflineTtsModelConfig(
274 vits = vits, 298 vits = vits,
275 matcha = matcha, 299 matcha = matcha,
276 kokoro = kokoro, 300 kokoro = kokoro,
  301 + kitten = kitten,
277 numThreads = numberOfThreads, 302 numThreads = numberOfThreads,
278 debug = true, 303 debug = true,
279 provider = "cpu", 304 provider = "cpu",