Committed by GitHub
Add Kotlin and Java API for KittenTTS (#2461)
正在显示 19 个修改的文件，包含 297 行增加和 14 行删除
| @@ -309,12 +309,14 @@ jobs: | @@ -309,12 +309,14 @@ jobs: | ||
| 309 | run: | | 309 | run: | |
| 310 | cd ./java-api-examples | 310 | cd ./java-api-examples |
| 311 | 311 | ||
| 312 | + ./run-non-streaming-tts-kitten-en.sh | ||
| 312 | ./run-non-streaming-tts-kokoro-zh-en.sh | 313 | ./run-non-streaming-tts-kokoro-zh-en.sh |
| 313 | ./run-non-streaming-tts-kokoro-en.sh | 314 | ./run-non-streaming-tts-kokoro-en.sh |
| 314 | ./run-non-streaming-tts-matcha-zh.sh | 315 | ./run-non-streaming-tts-matcha-zh.sh |
| 315 | ./run-non-streaming-tts-matcha-en.sh | 316 | ./run-non-streaming-tts-matcha-en.sh |
| 316 | ls -lh | 317 | ls -lh |
| 317 | 318 | ||
| 319 | + rm -rf kitten-nano-en-* | ||
| 318 | rm -rf kokoro-multi-* | 320 | rm -rf kokoro-multi-* |
| 319 | rm -rf kokoro-en-* | 321 | rm -rf kokoro-en-* |
| 320 | 322 |
| @@ -40,7 +40,7 @@ public class NonStreamingTtsCoquiDe { | @@ -40,7 +40,7 @@ public class NonStreamingTtsCoquiDe { | ||
| 40 | String waveFilename = "tts-coqui-de.wav"; | 40 | String waveFilename = "tts-coqui-de.wav"; |
| 41 | audio.save(waveFilename); | 41 | audio.save(waveFilename); |
| 42 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 42 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 43 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 43 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 44 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 44 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 45 | System.out.printf("-- text: %s\n", text); | 45 | System.out.printf("-- text: %s\n", text); |
| 46 | System.out.printf("-- Saved to %s\n", waveFilename); | 46 | System.out.printf("-- Saved to %s\n", waveFilename); |
| 1 | +// Copyright 2025 Xiaomi Corporation | ||
| 2 | + | ||
| 3 | +// This file shows how to use a KittenTTS English model | ||
| 4 | +// to convert text to speech | ||
| 5 | +import com.k2fsa.sherpa.onnx.*; | ||
| 6 | + | ||
| 7 | +public class NonStreamingTtsKittenEn { | ||
| 8 | + public static void main(String[] args) { | ||
| 9 | + // please visit | ||
| 10 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html | ||
| 11 | + // to download model files | ||
| 12 | + String model = "./kitten-nano-en-v0_1-fp16/model.fp16.onnx"; | ||
| 13 | + String voices = "./kitten-nano-en-v0_1-fp16/voices.bin"; | ||
| 14 | + String tokens = "./kitten-nano-en-v0_1-fp16/tokens.txt"; | ||
| 15 | + String dataDir = "./kitten-nano-en-v0_1-fp16/espeak-ng-data"; | ||
| 16 | + String text = | ||
| 17 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have" | ||
| 18 | + + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a" | ||
| 19 | + + " businessman, an official, or a scholar."; | ||
| 20 | + | ||
| 21 | + OfflineTtsKittenModelConfig kittenModelConfig = | ||
| 22 | + OfflineTtsKittenModelConfig.builder() | ||
| 23 | + .setModel(model) | ||
| 24 | + .setVoices(voices) | ||
| 25 | + .setTokens(tokens) | ||
| 26 | + .setDataDir(dataDir) | ||
| 27 | + .build(); | ||
| 28 | + | ||
| 29 | + OfflineTtsModelConfig modelConfig = | ||
| 30 | + OfflineTtsModelConfig.builder() | ||
| 31 | + .setKitten(kittenModelConfig) | ||
| 32 | + .setNumThreads(2) | ||
| 33 | + .setDebug(true) | ||
| 34 | + .build(); | ||
| 35 | + | ||
| 36 | + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build(); | ||
| 37 | + OfflineTts tts = new OfflineTts(config); | ||
| 38 | + | ||
| 39 | + int sid = 7; | ||
| 40 | + float speed = 1.0f; | ||
| 41 | + long start = System.currentTimeMillis(); | ||
| 42 | + GeneratedAudio audio = tts.generate(text, sid, speed); | ||
| 43 | + long stop = System.currentTimeMillis(); | ||
| 44 | + | ||
| 45 | + float timeElapsedSeconds = (stop - start) / 1000.0f; | ||
| 46 | + | ||
| 47 | + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate(); | ||
| 48 | + float real_time_factor = timeElapsedSeconds / audioDuration; | ||
| 49 | + | ||
| 50 | + String waveFilename = "tts-kitten-en.wav"; | ||
| 51 | + audio.save(waveFilename); | ||
| 52 | + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | ||
| 53 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); | ||
| 54 | + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | ||
| 55 | + System.out.printf("-- text: %s\n", text); | ||
| 56 | + System.out.printf("-- Saved to %s\n", waveFilename); | ||
| 57 | + | ||
| 58 | + tts.release(); | ||
| 59 | + } | ||
| 60 | +} |
| @@ -50,7 +50,7 @@ public class NonStreamingTtsKokoroEn { | @@ -50,7 +50,7 @@ public class NonStreamingTtsKokoroEn { | ||
| 50 | String waveFilename = "tts-kokoro-en.wav"; | 50 | String waveFilename = "tts-kokoro-en.wav"; |
| 51 | audio.save(waveFilename); | 51 | audio.save(waveFilename); |
| 52 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 52 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 53 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 53 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 54 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 54 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 55 | System.out.printf("-- text: %s\n", text); | 55 | System.out.printf("-- text: %s\n", text); |
| 56 | System.out.printf("-- Saved to %s\n", waveFilename); | 56 | System.out.printf("-- Saved to %s\n", waveFilename); |
| @@ -54,7 +54,7 @@ public class NonStreamingTtsKokoroZhEn { | @@ -54,7 +54,7 @@ public class NonStreamingTtsKokoroZhEn { | ||
| 54 | String waveFilename = "tts-kokoro-zh-en.wav"; | 54 | String waveFilename = "tts-kokoro-zh-en.wav"; |
| 55 | audio.save(waveFilename); | 55 | audio.save(waveFilename); |
| 56 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 56 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 57 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 57 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 58 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 58 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 59 | System.out.printf("-- text: %s\n", text); | 59 | System.out.printf("-- text: %s\n", text); |
| 60 | System.out.printf("-- Saved to %s\n", waveFilename); | 60 | System.out.printf("-- Saved to %s\n", waveFilename); |
| @@ -50,7 +50,7 @@ public class NonStreamingTtsMatchaEn { | @@ -50,7 +50,7 @@ public class NonStreamingTtsMatchaEn { | ||
| 50 | String waveFilename = "tts-matcha-en.wav"; | 50 | String waveFilename = "tts-matcha-en.wav"; |
| 51 | audio.save(waveFilename); | 51 | audio.save(waveFilename); |
| 52 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 52 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 53 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 53 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 54 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 54 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 55 | System.out.printf("-- text: %s\n", text); | 55 | System.out.printf("-- text: %s\n", text); |
| 56 | System.out.printf("-- Saved to %s\n", waveFilename); | 56 | System.out.printf("-- Saved to %s\n", waveFilename); |
| @@ -56,7 +56,7 @@ public class NonStreamingTtsMatchaZh { | @@ -56,7 +56,7 @@ public class NonStreamingTtsMatchaZh { | ||
| 56 | String waveFilename = "tts-matcha-zh.wav"; | 56 | String waveFilename = "tts-matcha-zh.wav"; |
| 57 | audio.save(waveFilename); | 57 | audio.save(waveFilename); |
| 58 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 58 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 59 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 59 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 60 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 60 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 61 | System.out.printf("-- text: %s\n", text); | 61 | System.out.printf("-- text: %s\n", text); |
| 62 | System.out.printf("-- Saved to %s\n", waveFilename); | 62 | System.out.printf("-- Saved to %s\n", waveFilename); |
| @@ -48,7 +48,7 @@ public class NonStreamingTtsPiperEn { | @@ -48,7 +48,7 @@ public class NonStreamingTtsPiperEn { | ||
| 48 | String waveFilename = "tts-piper-en.wav"; | 48 | String waveFilename = "tts-piper-en.wav"; |
| 49 | audio.save(waveFilename); | 49 | audio.save(waveFilename); |
| 50 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 50 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 51 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 51 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 52 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 52 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 53 | System.out.printf("-- text: %s\n", text); | 53 | System.out.printf("-- text: %s\n", text); |
| 54 | System.out.printf("-- Saved to %s\n", waveFilename); | 54 | System.out.printf("-- Saved to %s\n", waveFilename); |
| @@ -176,7 +176,7 @@ public class NonStreamingTtsPiperEn { | @@ -176,7 +176,7 @@ public class NonStreamingTtsPiperEn { | ||
| 176 | String waveFilename = "tts-piper-en.wav"; | 176 | String waveFilename = "tts-piper-en.wav"; |
| 177 | audio.save(waveFilename); | 177 | audio.save(waveFilename); |
| 178 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 178 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 179 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 179 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 180 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 180 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 181 | System.out.printf("-- text: %s\n", text); | 181 | System.out.printf("-- text: %s\n", text); |
| 182 | System.out.printf("-- Saved to %s\n", waveFilename); | 182 | System.out.printf("-- Saved to %s\n", waveFilename); |
| @@ -54,7 +54,7 @@ public class NonStreamingTtsPiperEn { | @@ -54,7 +54,7 @@ public class NonStreamingTtsPiperEn { | ||
| 54 | String waveFilename = "tts-vits-zh.wav"; | 54 | String waveFilename = "tts-vits-zh.wav"; |
| 55 | audio.save(waveFilename); | 55 | audio.save(waveFilename); |
| 56 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | 56 | System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); |
| 57 | - System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | 57 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); |
| 58 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | 58 | System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); |
| 59 | System.out.printf("-- text: %s\n", text); | 59 | System.out.printf("-- text: %s\n", text); |
| 60 | System.out.printf("-- Saved to %s\n", waveFilename); | 60 | System.out.printf("-- Saved to %s\n", waveFilename); |
#!/usr/bin/env bash
#
# Runs the KittenTTS English example (NonStreamingTtsKittenEn.java).
#
# Steps:
#   1. Build the JNI shared library in ../build if it is not there yet.
#   2. Build sherpa-onnx.jar in ../sherpa-onnx/java-api if it is not there yet.
#   3. Download and unpack the kitten-nano-en-v0_1-fp16 model if it is missing.
#   4. Run the example with the java launcher.

set -ex

# Step 1: build the JNI library when neither the macOS nor the Linux artifact exists.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Step 2: build the Java API jar when it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
# to download more models

# Step 3: fetch the model archive only when the model file is not already present.
if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  rm kitten-nano-en-v0_1-fp16.tar.bz2
fi

# Step 4: run the example. $PWD is quoted so that a checkout path containing
# spaces is passed to java as a single argument.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsKittenEn.java
| @@ -140,6 +140,12 @@ function testTts() { | @@ -140,6 +140,12 @@ function testTts() { | ||
| 140 | rm kokoro-en-v0_19.tar.bz2 | 140 | rm kokoro-en-v0_19.tar.bz2 |
| 141 | fi | 141 | fi |
| 142 | 142 | ||
| 143 | + if [ ! -f ./kitten-nano-en-v0_1-fp16/model.fp16.onnx ]; then | ||
| 144 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 145 | + tar xf kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 146 | + rm kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 147 | + fi | ||
| 148 | + | ||
| 143 | out_filename=test_tts.jar | 149 | out_filename=test_tts.jar |
| 144 | kotlinc-jvm -include-runtime -d $out_filename \ | 150 | kotlinc-jvm -include-runtime -d $out_filename \ |
| 145 | test_tts.kt \ | 151 | test_tts.kt \ |
| @@ -477,7 +483,7 @@ function testOfflineNeMoCanary() { | @@ -477,7 +483,7 @@ function testOfflineNeMoCanary() { | ||
| 477 | java -Djava.library.path=../build/lib -jar $out_filename | 483 | java -Djava.library.path=../build/lib -jar $out_filename |
| 478 | } | 484 | } |
| 479 | 485 | ||
| 480 | -# testVersion | 486 | +testVersion |
| 481 | 487 | ||
| 482 | testOfflineNeMoCanary | 488 | testOfflineNeMoCanary |
| 483 | testOfflineSenseVoiceWithHr | 489 | testOfflineSenseVoiceWithHr |
| @@ -5,6 +5,7 @@ fun main() { | @@ -5,6 +5,7 @@ fun main() { | ||
| 5 | testMatcha() | 5 | testMatcha() |
| 6 | testKokoroEn() | 6 | testKokoroEn() |
| 7 | testKokoroZhEn() | 7 | testKokoroZhEn() |
| 8 | + testKittenEn() | ||
| 8 | } | 9 | } |
| 9 | 10 | ||
| 10 | fun testKokoroZhEn() { | 11 | fun testKokoroZhEn() { |
| @@ -96,6 +97,27 @@ fun testVits() { | @@ -96,6 +97,27 @@ fun testVits() { | ||
| 96 | println("Saved to test-en.wav") | 97 | println("Saved to test-en.wav") |
| 97 | } | 98 | } |
| 98 | 99 | ||
/**
 * Generates speech for a short English sentence with a KittenTTS model
 * (speaker id 7) and saves the result to test-kitten-en.wav.
 */
fun testKittenEn() {
  // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  // The configuration is never reassigned, so it is declared with val.
  val config = OfflineTtsConfig(
    model=OfflineTtsModelConfig(
      kitten=OfflineTtsKittenModelConfig(
        model="./kitten-nano-en-v0_1-fp16/model.fp16.onnx",
        voices="./kitten-nano-en-v0_1-fp16/voices.bin",
        tokens="./kitten-nano-en-v0_1-fp16/tokens.txt",
        dataDir="./kitten-nano-en-v0_1-fp16/espeak-ng-data",
      ),
      numThreads=2,
      debug=true,
    ),
  )
  val tts = OfflineTts(config=config)
  try {
    val audio = tts.generateWithCallback(text="How are you doing today?", sid=7, callback=::callback)
    audio.save(filename="test-kitten-en.wav")
  } finally {
    // Release the TTS object even when generation or saving throws.
    tts.release()
  }
  println("Saved to test-kitten-en.wav")
}
| 120 | + | ||
| 99 | /* | 121 | /* |
| 100 | 1. Unzip test_tts.jar | 122 | 1. Unzip test_tts.jar |
| 101 | 2. | 123 | 2. |
| @@ -43,6 +43,7 @@ java_files += OfflineRecognizerResult.java | @@ -43,6 +43,7 @@ java_files += OfflineRecognizerResult.java | ||
| 43 | java_files += OfflineStream.java | 43 | java_files += OfflineStream.java |
| 44 | java_files += OfflineRecognizer.java | 44 | java_files += OfflineRecognizer.java |
| 45 | 45 | ||
| 46 | +java_files += OfflineTtsKittenModelConfig.java | ||
| 46 | java_files += OfflineTtsKokoroModelConfig.java | 47 | java_files += OfflineTtsKokoroModelConfig.java |
| 47 | java_files += OfflineTtsMatchaModelConfig.java | 48 | java_files += OfflineTtsMatchaModelConfig.java |
| 48 | java_files += OfflineTtsVitsModelConfig.java | 49 | java_files += OfflineTtsVitsModelConfig.java |
| 1 | +// Copyright 2025 Xiaomi Corporation | ||
| 2 | +package com.k2fsa.sherpa.onnx; | ||
| 3 | + | ||
/**
 * Immutable configuration of a KittenTTS offline text-to-speech model.
 *
 * <p>Instances are created with {@link #builder()}. The JNI layer looks up the private fields of
 * this class by name ({@code model}, {@code voices}, {@code tokens}, {@code dataDir},
 * {@code lengthScale}), so their names and types must stay as they are.
 */
public class OfflineTtsKittenModelConfig {
  private final String model;
  private final String voices;
  private final String tokens;
  private final String dataDir;
  private final float lengthScale;

  private OfflineTtsKittenModelConfig(Builder builder) {
    this.model = builder.model;
    this.voices = builder.voices;
    this.tokens = builder.tokens;
    this.dataDir = builder.dataDir;
    this.lengthScale = builder.lengthScale;
  }

  /**
   * Creates a builder whose string options default to the empty string and whose length scale
   * defaults to {@code 1.0f}.
   *
   * @return a new builder
   */
  public static Builder builder() {
    return new Builder();
  }

  /** @return the configured model file path; empty if not set */
  public String getModel() {
    return model;
  }

  /** @return the configured voices file path; empty if not set */
  public String getVoices() {
    return voices;
  }

  /** @return the configured tokens file path; empty if not set */
  public String getTokens() {
    return tokens;
  }

  /** @return the configured data directory; empty if not set */
  public String getDataDir() {
    return dataDir;
  }

  /** @return the configured length scale; {@code 1.0f} if not set */
  public float getLengthScale() {
    return lengthScale;
  }

  /**
   * Fluent builder for {@link OfflineTtsKittenModelConfig}.
   *
   * <p>String setters reject {@code null}: the native code converts these fields with
   * {@code GetStringUTFChars} without a null check, so a null value is reported here with a
   * named {@link NullPointerException} instead of reaching the JNI layer.
   */
  public static class Builder {
    private String model = "";
    private String voices = "";
    private String tokens = "";
    private String dataDir = "";
    private float lengthScale = 1.0f;

    /**
     * @return a new configuration holding the values set so far
     */
    public OfflineTtsKittenModelConfig build() {
      return new OfflineTtsKittenModelConfig(this);
    }

    /**
     * @param model model file path; must not be null
     * @return this builder
     * @throws NullPointerException if {@code model} is null
     */
    public Builder setModel(String model) {
      this.model = java.util.Objects.requireNonNull(model, "model");
      return this;
    }

    /**
     * @param voices voices file path; must not be null
     * @return this builder
     * @throws NullPointerException if {@code voices} is null
     */
    public Builder setVoices(String voices) {
      this.voices = java.util.Objects.requireNonNull(voices, "voices");
      return this;
    }

    /**
     * @param tokens tokens file path; must not be null
     * @return this builder
     * @throws NullPointerException if {@code tokens} is null
     */
    public Builder setTokens(String tokens) {
      this.tokens = java.util.Objects.requireNonNull(tokens, "tokens");
      return this;
    }

    /**
     * @param dataDir data directory path; must not be null
     * @return this builder
     * @throws NullPointerException if {@code dataDir} is null
     */
    public Builder setDataDir(String dataDir) {
      this.dataDir = java.util.Objects.requireNonNull(dataDir, "dataDir");
      return this;
    }

    /**
     * @param lengthScale length scale passed to the model
     * @return this builder
     */
    public Builder setLengthScale(float lengthScale) {
      this.lengthScale = lengthScale;
      return this;
    }
  }
}
| @@ -6,6 +6,7 @@ public class OfflineTtsModelConfig { | @@ -6,6 +6,7 @@ public class OfflineTtsModelConfig { | ||
| 6 | private final OfflineTtsVitsModelConfig vits; | 6 | private final OfflineTtsVitsModelConfig vits; |
| 7 | private final OfflineTtsMatchaModelConfig matcha; | 7 | private final OfflineTtsMatchaModelConfig matcha; |
| 8 | private final OfflineTtsKokoroModelConfig kokoro; | 8 | private final OfflineTtsKokoroModelConfig kokoro; |
| 9 | + private final OfflineTtsKittenModelConfig kitten; | ||
| 9 | private final int numThreads; | 10 | private final int numThreads; |
| 10 | private final boolean debug; | 11 | private final boolean debug; |
| 11 | private final String provider; | 12 | private final String provider; |
| @@ -14,6 +15,7 @@ public class OfflineTtsModelConfig { | @@ -14,6 +15,7 @@ public class OfflineTtsModelConfig { | ||
| 14 | this.vits = builder.vits; | 15 | this.vits = builder.vits; |
| 15 | this.matcha = builder.matcha; | 16 | this.matcha = builder.matcha; |
| 16 | this.kokoro = builder.kokoro; | 17 | this.kokoro = builder.kokoro; |
| 18 | + this.kitten = builder.kitten; | ||
| 17 | this.numThreads = builder.numThreads; | 19 | this.numThreads = builder.numThreads; |
| 18 | this.debug = builder.debug; | 20 | this.debug = builder.debug; |
| 19 | this.provider = builder.provider; | 21 | this.provider = builder.provider; |
| @@ -35,10 +37,15 @@ public class OfflineTtsModelConfig { | @@ -35,10 +37,15 @@ public class OfflineTtsModelConfig { | ||
| 35 | return kokoro; | 37 | return kokoro; |
| 36 | } | 38 | } |
| 37 | 39 | ||
| 40 | + public OfflineTtsKittenModelConfig getKitten() { | ||
| 41 | + return kitten; | ||
| 42 | + } | ||
| 43 | + | ||
| 38 | public static class Builder { | 44 | public static class Builder { |
| 39 | private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build(); | 45 | private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build(); |
| 40 | private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build(); | 46 | private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build(); |
| 41 | private OfflineTtsKokoroModelConfig kokoro = OfflineTtsKokoroModelConfig.builder().build(); | 47 | private OfflineTtsKokoroModelConfig kokoro = OfflineTtsKokoroModelConfig.builder().build(); |
| 48 | + private OfflineTtsKittenModelConfig kitten = OfflineTtsKittenModelConfig.builder().build(); | ||
| 42 | private int numThreads = 1; | 49 | private int numThreads = 1; |
| 43 | private boolean debug = true; | 50 | private boolean debug = true; |
| 44 | private String provider = "cpu"; | 51 | private String provider = "cpu"; |
| @@ -62,6 +69,11 @@ public class OfflineTtsModelConfig { | @@ -62,6 +69,11 @@ public class OfflineTtsModelConfig { | ||
| 62 | return this; | 69 | return this; |
| 63 | } | 70 | } |
| 64 | 71 | ||
| 72 | + public Builder setKitten(OfflineTtsKittenModelConfig kitten) { | ||
| 73 | + this.kitten = kitten; | ||
| 74 | + return this; | ||
| 75 | + } | ||
| 76 | + | ||
| 65 | public Builder setNumThreads(int numThreads) { | 77 | public Builder setNumThreads(int numThreads) { |
| 66 | this.numThreads = numThreads; | 78 | this.numThreads = numThreads; |
| 67 | return this; | 79 | return this; |
| @@ -77,4 +89,4 @@ public class OfflineTtsModelConfig { | @@ -77,4 +89,4 @@ public class OfflineTtsModelConfig { | ||
| 77 | return this; | 89 | return this; |
| 78 | } | 90 | } |
| 79 | } | 91 | } |
| 80 | -} | 92 | +} |
| @@ -166,6 +166,39 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | @@ -166,6 +166,39 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | ||
| 166 | fid = env->GetFieldID(kokoro_cls, "lengthScale", "F"); | 166 | fid = env->GetFieldID(kokoro_cls, "lengthScale", "F"); |
| 167 | ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid); | 167 | ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid); |
| 168 | 168 | ||
| 169 | + // kitten | ||
| 170 | + fid = env->GetFieldID(model_config_cls, "kitten", | ||
| 171 | + "Lcom/k2fsa/sherpa/onnx/OfflineTtsKittenModelConfig;"); | ||
| 172 | + jobject kitten = env->GetObjectField(model, fid); | ||
| 173 | + jclass kitten_cls = env->GetObjectClass(kitten); | ||
| 174 | + | ||
| 175 | + fid = env->GetFieldID(kitten_cls, "model", "Ljava/lang/String;"); | ||
| 176 | + s = (jstring)env->GetObjectField(kitten, fid); | ||
| 177 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 178 | + ans.model.kitten.model = p; | ||
| 179 | + env->ReleaseStringUTFChars(s, p); | ||
| 180 | + | ||
| 181 | + fid = env->GetFieldID(kitten_cls, "voices", "Ljava/lang/String;"); | ||
| 182 | + s = (jstring)env->GetObjectField(kitten, fid); | ||
| 183 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 184 | + ans.model.kitten.voices = p; | ||
| 185 | + env->ReleaseStringUTFChars(s, p); | ||
| 186 | + | ||
| 187 | + fid = env->GetFieldID(kitten_cls, "tokens", "Ljava/lang/String;"); | ||
| 188 | + s = (jstring)env->GetObjectField(kitten, fid); | ||
| 189 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 190 | + ans.model.kitten.tokens = p; | ||
| 191 | + env->ReleaseStringUTFChars(s, p); | ||
| 192 | + | ||
| 193 | + fid = env->GetFieldID(kitten_cls, "dataDir", "Ljava/lang/String;"); | ||
| 194 | + s = (jstring)env->GetObjectField(kitten, fid); | ||
| 195 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 196 | + ans.model.kitten.data_dir = p; | ||
| 197 | + env->ReleaseStringUTFChars(s, p); | ||
| 198 | + | ||
| 199 | + fid = env->GetFieldID(kitten_cls, "lengthScale", "F"); | ||
| 200 | + ans.model.kitten.length_scale = env->GetFloatField(kitten, fid); | ||
| 201 | + | ||
| 169 | fid = env->GetFieldID(model_config_cls, "numThreads", "I"); | 202 | fid = env->GetFieldID(model_config_cls, "numThreads", "I"); |
| 170 | ans.model.num_threads = env->GetIntField(model, fid); | 203 | ans.model.num_threads = env->GetIntField(model, fid); |
| 171 | 204 |
| @@ -36,10 +36,19 @@ data class OfflineTtsKokoroModelConfig( | @@ -36,10 +36,19 @@ data class OfflineTtsKokoroModelConfig( | ||
| 36 | var lengthScale: Float = 1.0f, | 36 | var lengthScale: Float = 1.0f, |
| 37 | ) | 37 | ) |
| 38 | 38 | ||
/**
 * Configuration of a KittenTTS model.
 *
 * Every string property defaults to the empty string and [lengthScale]
 * defaults to 1.0f.
 */
data class OfflineTtsKittenModelConfig(
  var model: String = "",
  var voices: String = "",
  var tokens: String = "",
  var dataDir: String = "",
  var lengthScale: Float = 1.0f,
)
| 46 | + | ||
| 39 | data class OfflineTtsModelConfig( | 47 | data class OfflineTtsModelConfig( |
| 40 | var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(), | 48 | var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(), |
| 41 | var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(), | 49 | var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(), |
| 42 | var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(), | 50 | var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(), |
| 51 | + var kitten: OfflineTtsKittenModelConfig = OfflineTtsKittenModelConfig(), | ||
| 43 | var numThreads: Int = 1, | 52 | var numThreads: Int = 1, |
| 44 | var debug: Boolean = false, | 53 | var debug: Boolean = false, |
| 45 | var provider: String = "cpu", | 54 | var provider: String = "cpu", |
| @@ -189,13 +198,14 @@ fun getOfflineTtsConfig( | @@ -189,13 +198,14 @@ fun getOfflineTtsConfig( | ||
| 189 | modelName: String, // for VITS | 198 | modelName: String, // for VITS |
| 190 | acousticModelName: String, // for Matcha | 199 | acousticModelName: String, // for Matcha |
| 191 | vocoder: String, // for Matcha | 200 | vocoder: String, // for Matcha |
| 192 | - voices: String, // for Kokoro | 201 | + voices: String, // for Kokoro or kitten |
| 193 | lexicon: String, | 202 | lexicon: String, |
| 194 | dataDir: String, | 203 | dataDir: String, |
| 195 | dictDir: String, | 204 | dictDir: String, |
| 196 | ruleFsts: String, | 205 | ruleFsts: String, |
| 197 | ruleFars: String, | 206 | ruleFars: String, |
| 198 | - numThreads: Int? = null | 207 | + numThreads: Int? = null, |
| 208 | + isKitten: Boolean = false | ||
| 199 | ): OfflineTtsConfig { | 209 | ): OfflineTtsConfig { |
| 200 | // For Matcha TTS, please set | 210 | // For Matcha TTS, please set |
| 201 | // acousticModelName, vocoder | 211 | // acousticModelName, vocoder |
| @@ -203,13 +213,16 @@ fun getOfflineTtsConfig( | @@ -203,13 +213,16 @@ fun getOfflineTtsConfig( | ||
| 203 | // For Kokoro TTS, please set | 213 | // For Kokoro TTS, please set |
| 204 | // modelName, voices | 214 | // modelName, voices |
| 205 | 215 | ||
| 216 | + // For Kitten TTS, please set | ||
| 217 | + // modelName, voices, isKitten | ||
| 218 | + | ||
| 206 | // For VITS, please set | 219 | // For VITS, please set |
| 207 | // modelName | 220 | // modelName |
| 208 | 221 | ||
| 209 | val numberOfThreads = if (numThreads != null) { | 222 | val numberOfThreads = if (numThreads != null) { |
| 210 | numThreads | 223 | numThreads |
| 211 | } else if (voices.isNotEmpty()) { | 224 | } else if (voices.isNotEmpty()) { |
| 212 | - // for Kokoro TTS models, we use more threads | 225 | + // for Kokoro and Kitten TTS models, we use more threads |
| 213 | 4 | 226 | 4 |
| 214 | } else { | 227 | } else { |
| 215 | 2 | 228 | 2 |
| @@ -252,7 +265,7 @@ fun getOfflineTtsConfig( | @@ -252,7 +265,7 @@ fun getOfflineTtsConfig( | ||
| 252 | OfflineTtsMatchaModelConfig() | 265 | OfflineTtsMatchaModelConfig() |
| 253 | } | 266 | } |
| 254 | 267 | ||
| 255 | - val kokoro = if (voices.isNotEmpty()) { | 268 | + val kokoro = if (voices.isNotEmpty() && !isKitten) { |
| 256 | OfflineTtsKokoroModelConfig( | 269 | OfflineTtsKokoroModelConfig( |
| 257 | model = "$modelDir/$modelName", | 270 | model = "$modelDir/$modelName", |
| 258 | voices = "$modelDir/$voices", | 271 | voices = "$modelDir/$voices", |
| @@ -269,11 +282,23 @@ fun getOfflineTtsConfig( | @@ -269,11 +282,23 @@ fun getOfflineTtsConfig( | ||
| 269 | OfflineTtsKokoroModelConfig() | 282 | OfflineTtsKokoroModelConfig() |
| 270 | } | 283 | } |
| 271 | 284 | ||
| 285 | + val kitten = if (isKitten) { | ||
| 286 | + OfflineTtsKittenModelConfig( | ||
| 287 | + model = "$modelDir/$modelName", | ||
| 288 | + voices = "$modelDir/$voices", | ||
| 289 | + tokens = "$modelDir/tokens.txt", | ||
| 290 | + dataDir = dataDir, | ||
| 291 | + ) | ||
| 292 | + } else { | ||
| 293 | + OfflineTtsKittenModelConfig() | ||
| 294 | + } | ||
| 295 | + | ||
| 272 | return OfflineTtsConfig( | 296 | return OfflineTtsConfig( |
| 273 | model = OfflineTtsModelConfig( | 297 | model = OfflineTtsModelConfig( |
| 274 | vits = vits, | 298 | vits = vits, |
| 275 | matcha = matcha, | 299 | matcha = matcha, |
| 276 | kokoro = kokoro, | 300 | kokoro = kokoro, |
| 301 | + kitten = kitten, | ||
| 277 | numThreads = numberOfThreads, | 302 | numThreads = numberOfThreads, |
| 278 | debug = true, | 303 | debug = true, |
| 279 | provider = "cpu", | 304 | provider = "cpu", |
-
请 注册 或 登录 后发表评论