Fangjun Kuang
Committed by GitHub

Add Java and Kotlin API for Kokoro TTS 1.0 (#1798)

@@ -234,11 +234,13 @@ jobs: @@ -234,11 +234,13 @@ jobs:
234 run: | 234 run: |
235 cd ./java-api-examples 235 cd ./java-api-examples
236 236
  237 + ./run-non-streaming-tts-kokoro-zh-en.sh
237 ./run-non-streaming-tts-kokoro-en.sh 238 ./run-non-streaming-tts-kokoro-en.sh
238 ./run-non-streaming-tts-matcha-zh.sh 239 ./run-non-streaming-tts-matcha-zh.sh
239 ./run-non-streaming-tts-matcha-en.sh 240 ./run-non-streaming-tts-matcha-en.sh
240 ls -lh 241 ls -lh
241 242
  243 + rm -rf kokoro-multi-*
242 rm -rf kokoro-en-* 244 rm -rf kokoro-en-*
243 245
244 rm -rf matcha-icefall-* 246 rm -rf matcha-icefall-*
  1 +// Copyright 2025 Xiaomi Corporation
  2 +
  3 +// This file shows how to use a Kokoro multi-lingual model
  4 +// to convert Chinese and English text to speech
  5 +import com.k2fsa.sherpa.onnx.*;
  6 +
  7 +public class NonStreamingTtsKokoroZhEn {
  8 + public static void main(String[] args) {
  9 + // please visit
  10 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
  11 + // to download model files
  12 + String model = "./kokoro-multi-lang-v1_0/model.onnx";
  13 + String voices = "./kokoro-multi-lang-v1_0/voices.bin";
  14 + String tokens = "./kokoro-multi-lang-v1_0/tokens.txt";
  15 + String dataDir = "./kokoro-multi-lang-v1_0/espeak-ng-data";
  16 + String dictDir = "./kokoro-multi-lang-v1_0/dict";
  17 + String lexicon =
  18 + "./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt";
  19 + String text =
  20 + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki."
  21 + + " 你觉得中英文说的如何呢?";
  22 +
  23 + OfflineTtsKokoroModelConfig kokoroModelConfig =
  24 + OfflineTtsKokoroModelConfig.builder()
  25 + .setModel(model)
  26 + .setVoices(voices)
  27 + .setTokens(tokens)
  28 + .setDataDir(dataDir)
  29 + .setDictDir(dictDir)
  30 + .setLexicon(lexicon)
  31 + .build();
  32 +
  33 + OfflineTtsModelConfig modelConfig =
  34 + OfflineTtsModelConfig.builder()
  35 + .setKokoro(kokoroModelConfig)
  36 + .setNumThreads(2)
  37 + .setDebug(true)
  38 + .build();
  39 +
  40 + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
  41 + OfflineTts tts = new OfflineTts(config);
  42 +
  43 + int sid = 0; // this model has 53 speakers. You can use sid in the range 0-52
  44 + float speed = 1.0f;
  45 + long start = System.currentTimeMillis();
  46 + GeneratedAudio audio = tts.generate(text, sid, speed);
  47 + long stop = System.currentTimeMillis();
  48 +
  49 + float timeElapsedSeconds = (stop - start) / 1000.0f;
  50 +
  51 + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
  52 + float real_time_factor = timeElapsedSeconds / audioDuration;
  53 +
  54 + String waveFilename = "tts-kokoro-zh-en.wav";
  55 + audio.save(waveFilename);
  56 + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
  57 + System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
  58 + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
  59 + System.out.printf("-- text: %s\n", text);
  60 + System.out.printf("-- Saved to %s\n", waveFilename);
  61 +
  62 + tts.release();
  63 + }
  64 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  6 + mkdir -p ../build
  7 + pushd ../build
  8 + cmake \
  9 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  10 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  11 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  12 + -DBUILD_SHARED_LIBS=ON \
  13 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  14 + -DSHERPA_ONNX_ENABLE_JNI=ON \
  15 + ..
  16 +
  17 + make -j4
  18 + ls -lh lib
  19 + popd
  20 +fi
  21 +
  22 +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  23 + pushd ../sherpa-onnx/java-api
  24 + make
  25 + popd
  26 +fi
  27 +
  28 +# please visit
  29 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
  30 +# to download more models
  31 +if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then
  32 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  33 + tar xf kokoro-multi-lang-v1_0.tar.bz2
  34 + rm kokoro-multi-lang-v1_0.tar.bz2
  35 +fi
  36 +
  37 +java \
  38 + -Djava.library.path=$PWD/../build/lib \
  39 + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  40 + NonStreamingTtsKokoroZhEn.java
@@ -115,6 +115,12 @@ function testTts() { @@ -115,6 +115,12 @@ function testTts() {
115 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx 115 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
116 fi 116 fi
117 117
  118 + if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then
  119 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  120 + tar xf kokoro-multi-lang-v1_0.tar.bz2
  121 + rm kokoro-multi-lang-v1_0.tar.bz2
  122 + fi
  123 +
118 if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then 124 if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
119 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 125 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
120 tar xf kokoro-en-v0_19.tar.bz2 126 tar xf kokoro-en-v0_19.tar.bz2
@@ -3,10 +3,34 @@ package com.k2fsa.sherpa.onnx @@ -3,10 +3,34 @@ package com.k2fsa.sherpa.onnx
3 fun main() { 3 fun main() {
4 testVits() 4 testVits()
5 testMatcha() 5 testMatcha()
6 - testKokoro() 6 + testKokoroEn()
  7 + testKokoroZhEn()
7 } 8 }
8 9
/**
 * Generates speech for a mixed Chinese/English sentence with the Kokoro
 * multi-lingual model, saves it to test-kokoro-zh-en.wav and prints the
 * output location.
 */
fun testKokoroZhEn() {
  // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  val kokoroConfig = OfflineTtsKokoroModelConfig(
    model="./kokoro-multi-lang-v1_0/model.onnx",
    voices="./kokoro-multi-lang-v1_0/voices.bin",
    tokens="./kokoro-multi-lang-v1_0/tokens.txt",
    dataDir="./kokoro-multi-lang-v1_0/espeak-ng-data",
    dictDir="./kokoro-multi-lang-v1_0/dict",
    // Two lexicon files passed as a single comma-separated string.
    lexicon="./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt",
  )
  val modelConfig = OfflineTtsModelConfig(
    kokoro=kokoroConfig,
    numThreads=2,
    debug=true,
  )
  val ttsConfig = OfflineTtsConfig(model=modelConfig)

  val sentence = "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"

  val tts = OfflineTts(config=ttsConfig)
  val generated = tts.generateWithCallback(text=sentence, callback=::callback)
  generated.save(filename="test-kokoro-zh-en.wav")
  tts.release()
  println("Saved to test-kokoro-zh-en.wav")
}
  32 +
  33 +fun testKokoroEn() {
10 // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models 34 // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
11 var config = OfflineTtsConfig( 35 var config = OfflineTtsConfig(
12 model=OfflineTtsModelConfig( 36 model=OfflineTtsModelConfig(
@@ -27,7 +27,7 @@ def main(): @@ -27,7 +27,7 @@ def main():
27 27
28 meta_data = { 28 meta_data = {
29 "model_type": "kokoro", 29 "model_type": "kokoro",
30 - "language": "English", 30 + "language": "multi-lang, e.g., English, Chinese",
31 "has_espeak": 1, 31 "has_espeak": 1,
32 "sample_rate": 24000, 32 "sample_rate": 24000,
33 "version": 2, 33 "version": 2,
@@ -5,14 +5,18 @@ public class OfflineTtsKokoroModelConfig { @@ -5,14 +5,18 @@ public class OfflineTtsKokoroModelConfig {
5 private final String model; 5 private final String model;
6 private final String voices; 6 private final String voices;
7 private final String tokens; 7 private final String tokens;
  8 + private final String lexicon;
8 private final String dataDir; 9 private final String dataDir;
  10 + private final String dictDir;
9 private final float lengthScale; 11 private final float lengthScale;
10 12
11 private OfflineTtsKokoroModelConfig(Builder builder) { 13 private OfflineTtsKokoroModelConfig(Builder builder) {
12 this.model = builder.model; 14 this.model = builder.model;
13 this.voices = builder.voices; 15 this.voices = builder.voices;
14 this.tokens = builder.tokens; 16 this.tokens = builder.tokens;
  17 + this.lexicon = builder.lexicon;
15 this.dataDir = builder.dataDir; 18 this.dataDir = builder.dataDir;
  19 + this.dictDir = builder.dictDir;
16 this.lengthScale = builder.lengthScale; 20 this.lengthScale = builder.lengthScale;
17 } 21 }
18 22
@@ -45,7 +49,9 @@ public class OfflineTtsKokoroModelConfig { @@ -45,7 +49,9 @@ public class OfflineTtsKokoroModelConfig {
45 private String model = ""; 49 private String model = "";
46 private String voices = ""; 50 private String voices = "";
47 private String tokens = ""; 51 private String tokens = "";
  52 + private String lexicon = "";
48 private String dataDir = ""; 53 private String dataDir = "";
  54 + private String dictDir = "";
49 private float lengthScale = 1.0f; 55 private float lengthScale = 1.0f;
50 56
51 public OfflineTtsKokoroModelConfig build() { 57 public OfflineTtsKokoroModelConfig build() {
@@ -67,11 +73,21 @@ public class OfflineTtsKokoroModelConfig { @@ -67,11 +73,21 @@ public class OfflineTtsKokoroModelConfig {
67 return this; 73 return this;
68 } 74 }
69 75
  76 + public Builder setLexicon(String lexicon) {
  77 + this.lexicon = lexicon;
  78 + return this;
  79 + }
  80 +
70 public Builder setDataDir(String dataDir) { 81 public Builder setDataDir(String dataDir) {
71 this.dataDir = dataDir; 82 this.dataDir = dataDir;
72 return this; 83 return this;
73 } 84 }
74 85
  86 + public Builder setDictDir(String dictDir) {
  87 + this.dictDir = dictDir;
  88 + return this;
  89 + }
  90 +
75 public Builder setLengthScale(float lengthScale) { 91 public Builder setLengthScale(float lengthScale) {
76 this.lengthScale = lengthScale; 92 this.lengthScale = lengthScale;
77 return this; 93 return this;
@@ -137,12 +137,24 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { @@ -137,12 +137,24 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
137 ans.model.kokoro.tokens = p; 137 ans.model.kokoro.tokens = p;
138 env->ReleaseStringUTFChars(s, p); 138 env->ReleaseStringUTFChars(s, p);
139 139
  140 + fid = env->GetFieldID(kokoro_cls, "lexicon", "Ljava/lang/String;");
  141 + s = (jstring)env->GetObjectField(kokoro, fid);
  142 + p = env->GetStringUTFChars(s, nullptr);
  143 + ans.model.kokoro.lexicon = p;
  144 + env->ReleaseStringUTFChars(s, p);
  145 +
140 fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;"); 146 fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;");
141 s = (jstring)env->GetObjectField(kokoro, fid); 147 s = (jstring)env->GetObjectField(kokoro, fid);
142 p = env->GetStringUTFChars(s, nullptr); 148 p = env->GetStringUTFChars(s, nullptr);
143 ans.model.kokoro.data_dir = p; 149 ans.model.kokoro.data_dir = p;
144 env->ReleaseStringUTFChars(s, p); 150 env->ReleaseStringUTFChars(s, p);
145 151
  152 + fid = env->GetFieldID(kokoro_cls, "dictDir", "Ljava/lang/String;");
  153 + s = (jstring)env->GetObjectField(kokoro, fid);
  154 + p = env->GetStringUTFChars(s, nullptr);
  155 + ans.model.kokoro.dict_dir = p;
  156 + env->ReleaseStringUTFChars(s, p);
  157 +
146 fid = env->GetFieldID(kokoro_cls, "lengthScale", "F"); 158 fid = env->GetFieldID(kokoro_cls, "lengthScale", "F");
147 ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid); 159 ans.model.kokoro.length_scale = env->GetFloatField(kokoro, fid);
148 160
@@ -30,6 +30,8 @@ data class OfflineTtsKokoroModelConfig( @@ -30,6 +30,8 @@ data class OfflineTtsKokoroModelConfig(
30 var voices: String = "", 30 var voices: String = "",
31 var tokens: String = "", 31 var tokens: String = "",
32 var dataDir: String = "", 32 var dataDir: String = "",
  33 + var lexicon: String = "",
  34 + var dictDir: String = "",
33 var lengthScale: Float = 1.0f, 35 var lengthScale: Float = 1.0f,
34 ) 36 )
35 37
@@ -254,6 +256,8 @@ fun getOfflineTtsConfig( @@ -254,6 +256,8 @@ fun getOfflineTtsConfig(
254 voices = "$modelDir/$voices", 256 voices = "$modelDir/$voices",
255 tokens = "$modelDir/tokens.txt", 257 tokens = "$modelDir/tokens.txt",
256 dataDir = dataDir, 258 dataDir = dataDir,
  259 + lexicon = if ("," in lexicon) lexicon else "$modelDir/$lexicon",
  260 + dictDir = dictDir,
257 ) 261 )
258 } else { 262 } else {
259 OfflineTtsKokoroModelConfig() 263 OfflineTtsKokoroModelConfig()