Fangjun Kuang
Committed by GitHub

Add Java API for Matcha-TTS models. (#1673)

@@ -235,6 +235,13 @@ jobs: @@ -235,6 +235,13 @@ jobs:
235 shell: bash 235 shell: bash
236 run: | 236 run: |
237 cd ./java-api-examples 237 cd ./java-api-examples
  238 +
  239 + ./run-non-streaming-tts-matcha-zh.sh
  240 + ./run-non-streaming-tts-matcha-en.sh
  241 +
  242 + rm -rf matcha-icefall-*
  243 + rm hifigan_v2.onnx
  244 +
238 ./run-non-streaming-tts-piper-en.sh 245 ./run-non-streaming-tts-piper-en.sh
239 rm -rf vits-piper-* 246 rm -rf vits-piper-*
240 247
@@ -126,3 +126,4 @@ sherpa-onnx-moonshine-base-en-int8 @@ -126,3 +126,4 @@ sherpa-onnx-moonshine-base-en-int8
126 harmony-os/SherpaOnnxHar/sherpa_onnx/LICENSE 126 harmony-os/SherpaOnnxHar/sherpa_onnx/LICENSE
127 harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md 127 harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md
128 matcha-icefall-zh-baker 128 matcha-icefall-zh-baker
  129 +matcha-icefall-en_US-ljspeech
  1 +// Copyright 2025 Xiaomi Corporation
  2 +
  3 +// This file shows how to use a matcha English model
  4 +// to convert text to speech
  5 +import com.k2fsa.sherpa.onnx.*;
  6 +
  7 +public class NonStreamingTtsMatchaEn {
  8 + public static void main(String[] args) {
  9 + // please visit
  10 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  11 + // to download model files
  12 + String acousticModel = "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx";
  13 + String vocoder = "./hifigan_v2.onnx";
  14 + String tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt";
  15 + String dataDir = "./matcha-icefall-en_US-ljspeech/espeak-ng-data";
  16 + String text =
  17 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
  18 + + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
  19 + + " businessman, an official, or a scholar.";
  20 +
  21 + OfflineTtsMatchaModelConfig matchaModelConfig =
  22 + OfflineTtsMatchaModelConfig.builder()
  23 + .setAcousticModel(acousticModel)
  24 + .setVocoder(vocoder)
  25 + .setTokens(tokens)
  26 + .setDataDir(dataDir)
  27 + .build();
  28 +
  29 + OfflineTtsModelConfig modelConfig =
  30 + OfflineTtsModelConfig.builder()
  31 + .setMatcha(matchaModelConfig)
  32 + .setNumThreads(1)
  33 + .setDebug(true)
  34 + .build();
  35 +
  36 + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
  37 + OfflineTts tts = new OfflineTts(config);
  38 +
  39 + int sid = 0;
  40 + float speed = 1.0f;
  41 + long start = System.currentTimeMillis();
  42 + GeneratedAudio audio = tts.generate(text, sid, speed);
  43 + long stop = System.currentTimeMillis();
  44 +
  45 + float timeElapsedSeconds = (stop - start) / 1000.0f;
  46 +
  47 + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
  48 + float real_time_factor = timeElapsedSeconds / audioDuration;
  49 +
  50 + String waveFilename = "tts-matcha-en.wav";
  51 + audio.save(waveFilename);
  52 + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
  53 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
  54 + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
  55 + System.out.printf("-- text: %s\n", text);
  56 + System.out.printf("-- Saved to %s\n", waveFilename);
  57 +
  58 + tts.release();
  59 + }
  60 +}
  1 +// Copyright 2025 Xiaomi Corporation
  2 +
  3 +// This file shows how to use a matcha Chinese TTS model
  4 +// to convert text to speech
  5 +import com.k2fsa.sherpa.onnx.*;
  6 +
  7 +public class NonStreamingTtsMatchaZh {
  8 + public static void main(String[] args) {
  9 + // please visit
  10 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  11 + // to download model files
  12 + String acousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx";
  13 + String vocoder = "./hifigan_v2.onnx";
  14 + String tokens = "./matcha-icefall-zh-baker/tokens.txt";
  15 + String lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
  16 + String dictDir = "./matcha-icefall-zh-baker/dict";
  17 + String ruleFsts =
  18 + "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst";
  19 + String text =
  20 + "某某银行的副行长和一些行政领导表示,他们去过长江"
  21 + + "和长白山; 经济不断增长。"
  22 + + "2024年12月31号,拨打110或者18920240511。"
  23 + + "123456块钱。";
  24 +
  25 + OfflineTtsMatchaModelConfig matchaModelConfig =
  26 + OfflineTtsMatchaModelConfig.builder()
  27 + .setAcousticModel(acousticModel)
  28 + .setVocoder(vocoder)
  29 + .setTokens(tokens)
  30 + .setLexicon(lexicon)
  31 + .setDictDir(dictDir)
  32 + .build();
  33 +
  34 + OfflineTtsModelConfig modelConfig =
  35 + OfflineTtsModelConfig.builder()
  36 + .setMatcha(matchaModelConfig)
  37 + .setNumThreads(1)
  38 + .setDebug(true)
  39 + .build();
  40 +
  41 + OfflineTtsConfig config =
  42 + OfflineTtsConfig.builder().setModel(modelConfig).setRuleFsts(ruleFsts).build();
  43 + OfflineTts tts = new OfflineTts(config);
  44 +
  45 + int sid = 0;
  46 + float speed = 1.0f;
  47 + long start = System.currentTimeMillis();
  48 + GeneratedAudio audio = tts.generate(text, sid, speed);
  49 + long stop = System.currentTimeMillis();
  50 +
  51 + float timeElapsedSeconds = (stop - start) / 1000.0f;
  52 +
  53 + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
  54 + float real_time_factor = timeElapsedSeconds / audioDuration;
  55 +
  56 + String waveFilename = "tts-matcha-zh.wav";
  57 + audio.save(waveFilename);
  58 + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
  59 + System.out.printf("-- audio duration: %.3f seconds\n", audioDuration);
  60 + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
  61 + System.out.printf("-- text: %s\n", text);
  62 + System.out.printf("-- Saved to %s\n", waveFilename);
  63 +
  64 + tts.release();
  65 + }
  66 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  6 + mkdir -p ../build
  7 + pushd ../build
  8 + cmake \
  9 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  10 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  11 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  12 + -DBUILD_SHARED_LIBS=ON \
  13 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  14 + -DSHERPA_ONNX_ENABLE_JNI=ON \
  15 + ..
  16 +
  17 + make -j4
  18 + ls -lh lib
  19 + popd
  20 +fi
  21 +
  22 +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  23 + pushd ../sherpa-onnx/java-api
  24 + make
  25 + popd
  26 +fi
  27 +
  28 +# please visit
  29 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  30 +#
  31 +# to download more models
  32 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  33 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  34 + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  35 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  36 +fi
  37 +
  38 +if [ ! -f ./hifigan_v2.onnx ]; then
  39 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  40 +fi
  41 +
  42 +java \
  43 + -Djava.library.path=$PWD/../build/lib \
  44 + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  45 + NonStreamingTtsMatchaEn.java
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  6 + mkdir -p ../build
  7 + pushd ../build
  8 + cmake \
  9 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  10 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  11 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  12 + -DBUILD_SHARED_LIBS=ON \
  13 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  14 + -DSHERPA_ONNX_ENABLE_JNI=ON \
  15 + ..
  16 +
  17 + make -j4
  18 + ls -lh lib
  19 + popd
  20 +fi
  21 +
  22 +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  23 + pushd ../sherpa-onnx/java-api
  24 + make
  25 + popd
  26 +fi
  27 +
  28 +# please visit
  29 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  30 +# to download more models
  31 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  32 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  33 + tar xvf matcha-icefall-zh-baker.tar.bz2
  34 + rm matcha-icefall-zh-baker.tar.bz2
  35 +fi
  36 +
  37 +if [ ! -f ./hifigan_v2.onnx ]; then
  38 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  39 +fi
  40 +
  41 +java \
  42 + -Djava.library.path=$PWD/../build/lib \
  43 + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  44 + NonStreamingTtsMatchaZh.java
@@ -369,6 +369,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( @@ -369,6 +369,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
369 template PiperPhonemizeLexicon::PiperPhonemizeLexicon( 369 template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
370 NativeResourceManager *mgr, const std::string &tokens, 370 NativeResourceManager *mgr, const std::string &tokens,
371 const std::string &data_dir, 371 const std::string &data_dir,
  372 + const OfflineTtsVitsModelMetaData &vits_meta_data);
  373 +
  374 +template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
  375 + NativeResourceManager *mgr, const std::string &tokens,
  376 + const std::string &data_dir,
372 const OfflineTtsMatchaModelMetaData &matcha_meta_data); 377 const OfflineTtsMatchaModelMetaData &matcha_meta_data);
373 #endif 378 #endif
374 379
@@ -35,6 +35,7 @@ java_files += OfflineRecognizerResult.java @@ -35,6 +35,7 @@ java_files += OfflineRecognizerResult.java
35 java_files += OfflineStream.java 35 java_files += OfflineStream.java
36 java_files += OfflineRecognizer.java 36 java_files += OfflineRecognizer.java
37 37
  38 +java_files += OfflineTtsMatchaModelConfig.java
38 java_files += OfflineTtsVitsModelConfig.java 39 java_files += OfflineTtsVitsModelConfig.java
39 java_files += OfflineTtsModelConfig.java 40 java_files += OfflineTtsModelConfig.java
40 java_files += OfflineTtsConfig.java 41 java_files += OfflineTtsConfig.java
  1 +// Copyright 2025 Xiaomi Corporation
  2 +
  3 +package com.k2fsa.sherpa.onnx;
  4 +
  5 +public class OfflineTtsMatchaModelConfig {
  6 + private final String acousticModel;
  7 + private final String vocoder;
  8 + private final String lexicon;
  9 + private final String tokens;
  10 + private final String dataDir;
  11 + private final String dictDir;
  12 + private final float noiseScale;
  13 + private final float lengthScale;
  14 +
  15 + private OfflineTtsMatchaModelConfig(Builder builder) {
  16 + this.acousticModel = builder.acousticModel;
  17 + this.vocoder = builder.vocoder;
  18 + this.lexicon = builder.lexicon;
  19 + this.tokens = builder.tokens;
  20 + this.dataDir = builder.dataDir;
  21 + this.dictDir = builder.dictDir;
  22 + this.noiseScale = builder.noiseScale;
  23 + this.lengthScale = builder.lengthScale;
  24 + }
  25 +
  26 + public static Builder builder() {
  27 + return new Builder();
  28 + }
  29 +
  30 + public String getAcousticModel() {
  31 + return acousticModel;
  32 + }
  33 +
  34 + public String getVocoder() {
  35 + return vocoder;
  36 + }
  37 +
  38 + public String getLexicon() {
  39 + return lexicon;
  40 + }
  41 +
  42 + public String getTokens() {
  43 + return tokens;
  44 + }
  45 +
  46 + public String getDataDir() {
  47 + return dataDir;
  48 + }
  49 +
  50 + public String getDictDir() {
  51 + return dictDir;
  52 + }
  53 +
  54 + public float getLengthScale() {
  55 + return lengthScale;
  56 + }
  57 +
  58 + public float getNoiseScale() {
  59 + return noiseScale;
  60 + }
  61 +
  62 + public static class Builder {
  63 + private String acousticModel = "";
  64 + private String vocoder = "";
  65 + private String lexicon = "";
  66 + private String tokens = "";
  67 + private String dataDir = "";
  68 + private String dictDir = "";
  69 + private float noiseScale = 1.0f;
  70 + private float lengthScale = 1.0f;
  71 +
  72 + public OfflineTtsMatchaModelConfig build() {
  73 + return new OfflineTtsMatchaModelConfig(this);
  74 + }
  75 +
  76 + public Builder setAcousticModel(String acousticModel) {
  77 + this.acousticModel = acousticModel;
  78 + return this;
  79 + }
  80 +
  81 + public Builder setVocoder(String vocoder) {
  82 + this.vocoder = vocoder;
  83 + return this;
  84 + }
  85 +
  86 + public Builder setTokens(String tokens) {
  87 + this.tokens = tokens;
  88 + return this;
  89 + }
  90 +
  91 + public Builder setLexicon(String lexicon) {
  92 + this.lexicon = lexicon;
  93 + return this;
  94 + }
  95 +
  96 + public Builder setDataDir(String dataDir) {
  97 + this.dataDir = dataDir;
  98 + return this;
  99 + }
  100 +
  101 + public Builder setDictDir(String dictDir) {
  102 + this.dictDir = dictDir;
  103 + return this;
  104 + }
  105 +
  106 + public Builder setNoiseScale(float noiseScale) {
  107 + this.noiseScale = noiseScale;
  108 + return this;
  109 + }
  110 +
  111 + public Builder setLengthScale(float lengthScale) {
  112 + this.lengthScale = lengthScale;
  113 + return this;
  114 + }
  115 + }
  116 +}
@@ -4,12 +4,14 @@ package com.k2fsa.sherpa.onnx; @@ -4,12 +4,14 @@ package com.k2fsa.sherpa.onnx;
4 4
5 public class OfflineTtsModelConfig { 5 public class OfflineTtsModelConfig {
6 private final OfflineTtsVitsModelConfig vits; 6 private final OfflineTtsVitsModelConfig vits;
  7 + private final OfflineTtsMatchaModelConfig matcha;
7 private final int numThreads; 8 private final int numThreads;
8 private final boolean debug; 9 private final boolean debug;
9 private final String provider; 10 private final String provider;
10 11
11 private OfflineTtsModelConfig(Builder builder) { 12 private OfflineTtsModelConfig(Builder builder) {
12 this.vits = builder.vits; 13 this.vits = builder.vits;
  14 + this.matcha = builder.matcha;
13 this.numThreads = builder.numThreads; 15 this.numThreads = builder.numThreads;
14 this.debug = builder.debug; 16 this.debug = builder.debug;
15 this.provider = builder.provider; 17 this.provider = builder.provider;
@@ -23,8 +25,13 @@ public class OfflineTtsModelConfig { @@ -23,8 +25,13 @@ public class OfflineTtsModelConfig {
23 return vits; 25 return vits;
24 } 26 }
25 27
  28 + public OfflineTtsMatchaModelConfig getMatcha() {
  29 + return matcha;
  30 + }
  31 +
26 public static class Builder { 32 public static class Builder {
27 private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build(); 33 private OfflineTtsVitsModelConfig vits = OfflineTtsVitsModelConfig.builder().build();
  34 + private OfflineTtsMatchaModelConfig matcha = OfflineTtsMatchaModelConfig.builder().build();
28 private int numThreads = 1; 35 private int numThreads = 1;
29 private boolean debug = true; 36 private boolean debug = true;
30 private String provider = "cpu"; 37 private String provider = "cpu";
@@ -38,6 +45,11 @@ public class OfflineTtsModelConfig { @@ -38,6 +45,11 @@ public class OfflineTtsModelConfig {
38 return this; 45 return this;
39 } 46 }
40 47
  48 + public Builder setMatcha(OfflineTtsMatchaModelConfig matcha) {
  49 + this.matcha = matcha;
  50 + return this;
  51 + }
  52 +
41 public Builder setNumThreads(int numThreads) { 53 public Builder setNumThreads(int numThreads) {
42 this.numThreads = numThreads; 54 this.numThreads = numThreads;
43 return this; 55 return this;
@@ -60,9 +60,9 @@ public class OfflineTtsVitsModelConfig { @@ -60,9 +60,9 @@ public class OfflineTtsVitsModelConfig {
60 } 60 }
61 61
62 public static class Builder { 62 public static class Builder {
63 - private String model; 63 + private String model = "";
64 private String lexicon = ""; 64 private String lexicon = "";
65 - private String tokens; 65 + private String tokens = "";
66 private String dataDir = ""; 66 private String dataDir = "";
67 private String dictDir = ""; 67 private String dictDir = "";
68 private float noiseScale = 0.667f; 68 private float noiseScale = 0.667f;