Committed by
GitHub
Add Java and Kotlin API for Kokoro TTS 1.0 (#1798)
正在显示
9 个修改的文件
包含
171 行增加
和
3 行删除
| @@ -234,11 +234,13 @@ jobs: | @@ -234,11 +234,13 @@ jobs: | ||
| 234 | run: | | 234 | run: | |
| 235 | cd ./java-api-examples | 235 | cd ./java-api-examples |
| 236 | 236 | ||
| 237 | + ./run-non-streaming-tts-kokoro-zh-en.sh | ||
| 237 | ./run-non-streaming-tts-kokoro-en.sh | 238 | ./run-non-streaming-tts-kokoro-en.sh |
| 238 | ./run-non-streaming-tts-matcha-zh.sh | 239 | ./run-non-streaming-tts-matcha-zh.sh |
| 239 | ./run-non-streaming-tts-matcha-en.sh | 240 | ./run-non-streaming-tts-matcha-en.sh |
| 240 | ls -lh | 241 | ls -lh |
| 241 | 242 | ||
| 243 | + rm -rf kokoro-multi-* | ||
| 242 | rm -rf kokoro-en-* | 244 | rm -rf kokoro-en-* |
| 243 | 245 | ||
| 244 | rm -rf matcha-icefall-* | 246 | rm -rf matcha-icefall-* |
| 1 | +// Copyright 2025 Xiaomi Corporation | ||
| 2 | + | ||
| 3 | +// This file shows how to use a Kokoro multi-lingual model | ||
| 4 | +// to convert Chinese and English text to speech | ||
| 5 | +import com.k2fsa.sherpa.onnx.*; | ||
| 6 | + | ||
| 7 | +public class NonStreamingTtsKokoroZhEn { | ||
| 8 | + /** | ||
| 9 | + * Synthesizes a mixed Chinese/English sentence with the Kokoro multi-lingual model, saves it | ||
| 10 | + * to tts-kokoro-zh-en.wav, and prints the elapsed time, audio duration and real-time factor. | ||
| 11 | + * | ||
| 12 | + * @param args unused | ||
| 13 | + */ | ||
| 14 | + public static void main(String[] args) { | ||
| 15 | + // please visit | ||
| 16 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html | ||
| 17 | + // to download model files | ||
| 18 | + String model = "./kokoro-multi-lang-v1_0/model.onnx"; | ||
| 19 | + String voices = "./kokoro-multi-lang-v1_0/voices.bin"; | ||
| 20 | + String tokens = "./kokoro-multi-lang-v1_0/tokens.txt"; | ||
| 21 | + String dataDir = "./kokoro-multi-lang-v1_0/espeak-ng-data"; | ||
| 22 | + String dictDir = "./kokoro-multi-lang-v1_0/dict"; | ||
| 23 | + // Comma-separated list of lexicon files: US English first, then Chinese. | ||
| 24 | + String lexicon = | ||
| 25 | + "./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt"; | ||
| 26 | + String text = | ||
| 27 | + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki." | ||
| 28 | + + " 你觉得中英文说的如何呢?"; | ||
| 29 | + | ||
| 30 | + OfflineTtsKokoroModelConfig kokoroModelConfig = | ||
| 31 | + OfflineTtsKokoroModelConfig.builder() | ||
| 32 | + .setModel(model) | ||
| 33 | + .setVoices(voices) | ||
| 34 | + .setTokens(tokens) | ||
| 35 | + .setDataDir(dataDir) | ||
| 36 | + .setDictDir(dictDir) | ||
| 37 | + .setLexicon(lexicon) | ||
| 38 | + .build(); | ||
| 39 | + | ||
| 40 | + OfflineTtsModelConfig modelConfig = | ||
| 41 | + OfflineTtsModelConfig.builder() | ||
| 42 | + .setKokoro(kokoroModelConfig) | ||
| 43 | + .setNumThreads(2) | ||
| 44 | + .setDebug(true) | ||
| 45 | + .build(); | ||
| 46 | + | ||
| 47 | + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build(); | ||
| 48 | + OfflineTts tts = new OfflineTts(config); | ||
| 49 | + | ||
| 50 | + // Release the TTS instance even if synthesis or saving throws. | ||
| 51 | + try { | ||
| 52 | + int sid = 0; // this model has 53 speakers. You can use sid in the range 0-52 | ||
| 53 | + float speed = 1.0f; | ||
| 54 | + long start = System.currentTimeMillis(); | ||
| 55 | + GeneratedAudio audio = tts.generate(text, sid, speed); | ||
| 56 | + long stop = System.currentTimeMillis(); | ||
| 57 | + | ||
| 58 | + float timeElapsedSeconds = (stop - start) / 1000.0f; | ||
| 59 | + | ||
| 60 | + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate(); | ||
| 61 | + float realTimeFactor = timeElapsedSeconds / audioDuration; | ||
| 62 | + | ||
| 63 | + String waveFilename = "tts-kokoro-zh-en.wav"; | ||
| 64 | + audio.save(waveFilename); | ||
| 65 | + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | ||
| 66 | + // Report the synthesized audio length here, not the wall-clock time. | ||
| 67 | + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration); | ||
| 68 | + System.out.printf("-- real-time factor (RTF): %.3f\n", realTimeFactor); | ||
| 69 | + System.out.printf("-- text: %s\n", text); | ||
| 70 | + System.out.printf("-- Saved to %s\n", waveFilename); | ||
| 71 | + } finally { | ||
| 72 | + tts.release(); | ||
| 73 | + } | ||
| 74 | + } | ||
| 75 | +} |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +# Runs the Kokoro Chinese+English non-streaming TTS Java example. Builds the JNI | ||
| 4 | +# library and the Java API jar, and downloads the model, when they are missing. | ||
| 5 | + | ||
| 6 | +set -ex | ||
| 7 | + | ||
| 8 | +# Build the JNI shared library when neither a .dylib nor a .so build is present. | ||
| 9 | +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then | ||
| 10 | + mkdir -p ../build | ||
| 11 | + pushd ../build | ||
| 12 | + cmake \ | ||
| 13 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 14 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 15 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 16 | + -DBUILD_SHARED_LIBS=ON \ | ||
| 17 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 18 | + -DSHERPA_ONNX_ENABLE_JNI=ON \ | ||
| 19 | + .. | ||
| 20 | + | ||
| 21 | + make -j4 | ||
| 22 | + ls -lh lib | ||
| 23 | + popd | ||
| 24 | +fi | ||
| 25 | + | ||
| 26 | +# Build the Java API jar when it is missing. | ||
| 27 | +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then | ||
| 28 | + pushd ../sherpa-onnx/java-api | ||
| 29 | + make | ||
| 30 | + popd | ||
| 31 | +fi | ||
| 32 | + | ||
| 33 | +# please visit | ||
| 34 | +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html | ||
| 35 | +# to download more models | ||
| 36 | +if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then | ||
| 37 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 | ||
| 38 | + tar xf kokoro-multi-lang-v1_0.tar.bz2 | ||
| 39 | + rm kokoro-multi-lang-v1_0.tar.bz2 | ||
| 40 | +fi | ||
| 41 | + | ||
| 42 | +# Quote the library path so a working directory containing spaces stays one argument. | ||
| 43 | +java \ | ||
| 44 | + -Djava.library.path="$PWD/../build/lib" \ | ||
| 45 | + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ | ||
| 46 | + NonStreamingTtsKokoroZhEn.java |
| @@ -115,6 +115,12 @@ function testTts() { | @@ -115,6 +115,12 @@ function testTts() { | ||
| 115 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | 115 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx |
| 116 | fi | 116 | fi |
| 117 | 117 | ||
| 118 | + if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then | ||
| 119 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 | ||
| 120 | + tar xf kokoro-multi-lang-v1_0.tar.bz2 | ||
| 121 | + rm kokoro-multi-lang-v1_0.tar.bz2 | ||
| 122 | + fi | ||
| 123 | + | ||
| 118 | if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then | 124 | if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then |
| 119 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | 125 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 |
| 120 | tar xf kokoro-en-v0_19.tar.bz2 | 126 | tar xf kokoro-en-v0_19.tar.bz2 |
| @@ -3,10 +3,34 @@ package com.k2fsa.sherpa.onnx | @@ -3,10 +3,34 @@ package com.k2fsa.sherpa.onnx | ||
| 3 | fun main() { | 3 | fun main() { |
| 4 | testVits() | 4 | testVits() |
| 5 | testMatcha() | 5 | testMatcha() |
| 6 | - testKokoro() | 6 | + testKokoroEn() |
| 7 | + testKokoroZhEn() | ||
| 7 | } | 8 | } |
| 8 | 9 | ||
| 9 | -fun testKokoro() { | 10 | +fun testKokoroZhEn() { |
| 11 | + // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | ||
| 12 | + var config = OfflineTtsConfig( | ||
| 13 | + model=OfflineTtsModelConfig( | ||
| 14 | + kokoro=OfflineTtsKokoroModelConfig( | ||
| 15 | + model="./kokoro-multi-lang-v1_0/model.onnx", | ||
| 16 | + voices="./kokoro-multi-lang-v1_0/voices.bin", | ||
| 17 | + tokens="./kokoro-multi-lang-v1_0/tokens.txt", | ||
| 18 | + dataDir="./kokoro-multi-lang-v1_0/espeak-ng-data", | ||
| 19 | + dictDir="./kokoro-multi-lang-v1_0/dict", | ||
| 20 | + lexicon="./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt", | ||
| 21 | + ), | ||
| 22 | + numThreads=2, | ||
| 23 | + debug=true, | ||
| 24 | + ), | ||
| 25 | + ) | ||
| 26 | + val tts = OfflineTts(config=config) | ||
| 27 | + val audio = tts.generateWithCallback(text="中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?", callback=::callback) | ||
| 28 | + audio.save(filename="test-kokoro-zh-en.wav") | ||
| 29 | + tts.release() | ||
| 30 | + println("Saved to test-kokoro-zh-en.wav") | ||
| 31 | +} | ||
| 32 | + | ||
| 33 | +fun testKokoroEn() { | ||
| 10 | // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 34 | // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 11 | var config = OfflineTtsConfig( | 35 | var config = OfflineTtsConfig( |
| 12 | model=OfflineTtsModelConfig( | 36 | model=OfflineTtsModelConfig( |
| @@ -27,7 +27,7 @@ def main(): | @@ -27,7 +27,7 @@ def main(): | ||
| 27 | 27 | ||
| 28 | meta_data = { | 28 | meta_data = { |
| 29 | "model_type": "kokoro", | 29 | "model_type": "kokoro", |
| 30 | - "language": "English", | 30 | + "language": "multi-lang, e.g., English, Chinese", |
| 31 | "has_espeak": 1, | 31 | "has_espeak": 1, |
| 32 | "sample_rate": 24000, | 32 | "sample_rate": 24000, |
| 33 | "version": 2, | 33 | "version": 2, |
| @@ -5,14 +5,18 @@ public class OfflineTtsKokoroModelConfig { | @@ -5,14 +5,18 @@ public class OfflineTtsKokoroModelConfig { | ||
| 5 | private final String model; | 5 | private final String model; |
| 6 | private final String voices; | 6 | private final String voices; |
| 7 | private final String tokens; | 7 | private final String tokens; |
| 8 | + private final String lexicon; | ||
| 8 | private final String dataDir; | 9 | private final String dataDir; |
| 10 | + private final String dictDir; | ||
| 9 | private final float lengthScale; | 11 | private final float lengthScale; |
| 10 | 12 | ||
| 11 | private OfflineTtsKokoroModelConfig(Builder builder) { | 13 | private OfflineTtsKokoroModelConfig(Builder builder) { |
| 12 | this.model = builder.model; | 14 | this.model = builder.model; |
| 13 | this.voices = builder.voices; | 15 | this.voices = builder.voices; |
| 14 | this.tokens = builder.tokens; | 16 | this.tokens = builder.tokens; |
| 17 | + this.lexicon = builder.lexicon; | ||
| 15 | this.dataDir = builder.dataDir; | 18 | this.dataDir = builder.dataDir; |
| 19 | + this.dictDir = builder.dictDir; | ||
| 16 | this.lengthScale = builder.lengthScale; | 20 | this.lengthScale = builder.lengthScale; |
| 17 | } | 21 | } |
| 18 | 22 | ||
| @@ -45,7 +49,9 @@ public class OfflineTtsKokoroModelConfig { | @@ -45,7 +49,9 @@ public class OfflineTtsKokoroModelConfig { | ||
| 45 | private String model = ""; | 49 | private String model = ""; |
| 46 | private String voices = ""; | 50 | private String voices = ""; |
| 47 | private String tokens = ""; | 51 | private String tokens = ""; |
| 52 | + private String lexicon = ""; | ||
| 48 | private String dataDir = ""; | 53 | private String dataDir = ""; |
| 54 | + private String dictDir = ""; | ||
| 49 | private float lengthScale = 1.0f; | 55 | private float lengthScale = 1.0f; |
| 50 | 56 | ||
| 51 | public OfflineTtsKokoroModelConfig build() { | 57 | public OfflineTtsKokoroModelConfig build() { |
| @@ -67,11 +73,21 @@ public class OfflineTtsKokoroModelConfig { | @@ -67,11 +73,21 @@ public class OfflineTtsKokoroModelConfig { | ||
| 67 | return this; | 73 | return this; |
| 68 | } | 74 | } |
| 69 | 75 | ||
| 76 | + public Builder setLexicon(String lexicon) { | ||
| 77 | + this.lexicon = lexicon; | ||
| 78 | + return this; | ||
| 79 | + } | ||
| 80 | + | ||
| 70 | public Builder setDataDir(String dataDir) { | 81 | public Builder setDataDir(String dataDir) { |
| 71 | this.dataDir = dataDir; | 82 | this.dataDir = dataDir; |
| 72 | return this; | 83 | return this; |
| 73 | } | 84 | } |
| 74 | 85 | ||
| 86 | + public Builder setDictDir(String dictDir) { | ||
| 87 | + this.dictDir = dictDir; | ||
| 88 | + return this; | ||
| 89 | + } | ||
| 90 | + | ||
| 75 | public Builder setLengthScale(float lengthScale) { | 91 | public Builder setLengthScale(float lengthScale) { |
| 76 | this.lengthScale = lengthScale; | 92 | this.lengthScale = lengthScale; |
| 77 | return this; | 93 | return this; |
| @@ -137,12 +137,24 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | @@ -137,12 +137,24 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | ||
| 137 | ans.model.kokoro.tokens = p; | 137 | ans.model.kokoro.tokens = p; |
| 138 | env->ReleaseStringUTFChars(s, p); | 138 | env->ReleaseStringUTFChars(s, p); |
| 139 | 139 | ||
| 140 | + fid = env->GetFieldID(kokoro_cls, "lexicon", "Ljava/lang/String;"); | ||
| 141 | + s = (jstring)env->GetObjectField(kokoro, fid); | ||
| 142 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 143 | + ans.model.kokoro.lexicon = p; | ||
| 144 | + env->ReleaseStringUTFChars(s, p); | ||
| 145 | + | ||
| 140 | fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;"); | 146 | fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;"); |
| 141 | s = (jstring)env->GetObjectField(kokoro, fid); | 147 | s = (jstring)env->GetObjectField(kokoro, fid); |
| 142 | p = env->GetStringUTFChars(s, nullptr); | 148 | p = env->GetStringUTFChars(s, nullptr); |
| 143 | ans.model.kokoro.data_dir = p; | 149 | ans.model.kokoro.data_dir = p; |
| 144 | env->ReleaseStringUTFChars(s, p); | 150 | env->ReleaseStringUTFChars(s, p); |
| 145 | 151 | ||
| 152 | + fid = env->GetFieldID(kokoro_cls, "dictDir", "Ljava/lang/String;"); | ||
| 153 | + s = (jstring)env->GetObjectField(kokoro, fid); | ||
| 154 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 155 | + ans.model.kokoro.dict_dir = p; | ||
| 156 | + env->ReleaseStringUTFChars(s, p); | ||
| 157 | + | ||
| 146 | fid = env->GetFieldID(kokoro_cls, "lengthScale", "F"); | 158 | fid = env->GetFieldID(kokoro_cls, "lengthScale", "F"); |
| 147 | ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid); | 159 | ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid); |
| 148 | 160 |
| @@ -30,6 +30,8 @@ data class OfflineTtsKokoroModelConfig( | @@ -30,6 +30,8 @@ data class OfflineTtsKokoroModelConfig( | ||
| 30 | var voices: String = "", | 30 | var voices: String = "", |
| 31 | var tokens: String = "", | 31 | var tokens: String = "", |
| 32 | var dataDir: String = "", | 32 | var dataDir: String = "", |
| 33 | + var lexicon: String = "", | ||
| 34 | + var dictDir: String = "", | ||
| 33 | var lengthScale: Float = 1.0f, | 35 | var lengthScale: Float = 1.0f, |
| 34 | ) | 36 | ) |
| 35 | 37 | ||
| @@ -254,6 +256,8 @@ fun getOfflineTtsConfig( | @@ -254,6 +256,8 @@ fun getOfflineTtsConfig( | ||
| 254 | voices = "$modelDir/$voices", | 256 | voices = "$modelDir/$voices", |
| 255 | tokens = "$modelDir/tokens.txt", | 257 | tokens = "$modelDir/tokens.txt", |
| 256 | dataDir = dataDir, | 258 | dataDir = dataDir, |
| 259 | + lexicon = if ("," in lexicon) lexicon else "$modelDir/$lexicon", | ||
| 260 | + dictDir = dictDir, | ||
| 257 | ) | 261 | ) |
| 258 | } else { | 262 | } else { |
| 259 | OfflineTtsKokoroModelConfig() | 263 | OfflineTtsKokoroModelConfig() |
-
请 注册 或 登录 后发表评论