Fangjun Kuang
Committed by GitHub

Add TTS example for Java API. (#1176)

It plays the generated audio while generation is still in progress.
  1 +// Copyright 2024 Xiaomi Corporation
  2 +//
  3 +// References
  4 +// https://www.baeldung.com/java-passing-method-parameter
  5 +// https://www.geeksforgeeks.org/how-to-create-a-thread-safe-queue-in-java/
  6 +// https://stackoverflow.com/questions/74077394/java-audio-how-to-continuously-write-bytes-to-an-audio-file-as-they-are-being-g
  7 +
  8 +// This file shows how to use a piper VITS English TTS model
  9 +// to convert text to speech. You can pass a callback to the generation call,
  10 +// which is invoked whenever max_num_sentences sentences have
  11 +// finished generating.
  12 +//
  13 +// The callback saves the generated samples into a queue, which are played
  14 +// by a separate thread.
  15 +
  16 +import com.k2fsa.sherpa.onnx.*;
  17 +import java.util.Queue;
  18 +import java.util.concurrent.*;
  19 +import java.util.concurrent.ConcurrentLinkedQueue;
  20 +import javax.sound.sampled.*;
  21 +
  22 +public class NonStreamingTtsPiperEn {
  23 + public static void main(String[] args) {
  24 + // please visit
  25 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  26 + // to download model files
  27 + String model = "./vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx";
  28 + String tokens = "./vits-piper-en_GB-cori-medium/tokens.txt";
  29 + String dataDir = "./vits-piper-en_GB-cori-medium/espeak-ng-data";
  30 + String text =
  31 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have"
  32 + + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a"
  33 + + " businessman, an official, or a scholar.";
  34 +
  35 + OfflineTtsVitsModelConfig vitsModelConfig =
  36 + OfflineTtsVitsModelConfig.builder()
  37 + .setModel(model)
  38 + .setTokens(tokens)
  39 + .setDataDir(dataDir)
  40 + .build();
  41 +
  42 + OfflineTtsModelConfig modelConfig =
  43 + OfflineTtsModelConfig.builder()
  44 + .setVits(vitsModelConfig)
  45 + .setNumThreads(1)
  46 + .setDebug(true)
  47 + .build();
  48 +
  49 + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build();
  50 + OfflineTts tts = new OfflineTts(config);
  51 +
  52 + Queue<byte[]> samplesQueue = new ConcurrentLinkedQueue<>();
  53 +
  54 + Semaphore canPlaySem = new Semaphore(1);
  55 + try {
  56 + canPlaySem.acquire();
  57 + } catch (InterruptedException ex) {
  58 + System.out.println("Failed to acquire the play semaphore in the main thread");
  59 + return;
  60 + }
  61 +
  62 + Runnable playRuannable =
  63 + () -> {
  64 + try {
  65 + canPlaySem.acquire();
  66 + } catch (InterruptedException e) {
  67 + System.out.println("Failed to get canPlay semaphore in the play thread");
  68 + return;
  69 + }
  70 +
  71 + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
  72 + AudioFormat format =
  73 + new AudioFormat(
  74 + tts.getSampleRate(), // sampleRate
  75 + 16, // sampleSizeInBits
  76 + 1, // channels
  77 + true, // signed
  78 + false // bigEndian
  79 + );
  80 + DataLine.Info info = new DataLine.Info(SourceDataLine.class, format);
  81 + SourceDataLine line;
  82 + try {
  83 + line = (SourceDataLine) AudioSystem.getLine(info);
  84 +
  85 + int bufferSizeInBytes = tts.getSampleRate(); // 0.5 seconds
  86 + line.open(format, bufferSizeInBytes);
  87 + } catch (LineUnavailableException ex) {
  88 + System.out.println("Failed to open a device for playing");
  89 + return;
  90 + }
  91 + line.start();
  92 +
  93 + while (true) {
  94 + if (samplesQueue.isEmpty()) {
  95 + // Do nothing.
  96 + //
  97 + // If the generating speed is very slow, we can sleep
  98 + // for some time here to save some CPU.
  99 + } else {
  100 + byte[] samples = samplesQueue.poll();
  101 + if (samples.length == 1) {
  102 + // end of the generating
  103 + break;
  104 + }
  105 + line.write(samples, 0, samples.length);
  106 + }
  107 + }
  108 +
  109 + line.drain();
  110 + line.close();
  111 + };
  112 +
  113 + Thread playThread = new Thread(playRuannable);
  114 + playThread.start();
  115 +
  116 + int sid = 0;
  117 + float speed = 1.0f;
  118 + long start = System.currentTimeMillis();
  119 + GeneratedAudio audio =
  120 + tts.generateWithCallback(
  121 + text,
  122 + sid,
  123 + speed,
  124 + (float[] samples) -> {
  125 +
  126 + // we use a byte array to save int16 samples
  127 + byte[] samplesInt16 = new byte[samples.length * 2];
  128 + for (int i = 0; i < samples.length; ++i) {
  129 + float s = samples[i];
  130 + if (s > 1) {
  131 + s = 1;
  132 + }
  133 +
  134 + if (s < -1) {
  135 + s = -1;
  136 + }
  137 +
  138 + short t = (short) (s * 32767);
  139 +
  140 + // we use little endian
  141 + samplesInt16[2 * i] = (byte) (t & 0xff);
  142 + samplesInt16[2 * i + 1] = (byte) ((t & 0xff00) >> 8);
  143 + }
  144 +
  145 + samplesQueue.add(samplesInt16);
  146 +
  147 + canPlaySem.release();
  148 +
  149 + // Note: You can play the samples.
  150 + // warning: You need to save a copy of samples since it is freed
  151 + // when this function returns
  152 +
  153 + // return 1 to continue generation
  154 + // return 0 to stop generation
  155 + return 1;
  156 + });
  157 +
  158 + // Since a sample always has two bytes. We put a single byte
  159 + // into the queue to indicate that we have finished processing.
  160 + samplesQueue.add(new byte[1]);
  161 +
  162 + long stop = System.currentTimeMillis();
  163 +
  164 + float timeElapsedSeconds = (stop - start) / 1000.0f;
  165 +
  166 + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate();
  167 + float real_time_factor = timeElapsedSeconds / audioDuration;
  168 +
  169 + try {
  170 + playThread.join();
  171 + } catch (InterruptedException ex) {
  172 + System.out.println("Failed to join the play thread");
  173 + return;
  174 + }
  175 +
  176 + String waveFilename = "tts-piper-en.wav";
  177 + audio.save(waveFilename);
  178 + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds);
  179 + System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds);
  180 + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor);
  181 + System.out.printf("-- text: %s\n", text);
  182 + System.out.printf("-- Saved to %s\n", waveFilename);
  183 +
  184 + tts.release();
  185 + }
  186 +}
@@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx. @@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx.
24 ./run-non-streaming-decode-file-nemo.sh 24 ./run-non-streaming-decode-file-nemo.sh
25 ``` 25 ```
26 26
  27 +
27 ## Non-Streaming text-to-speech 28 ## Non-Streaming text-to-speech
28 29
29 ```bash 30 ```bash
@@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx. @@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx.
32 ./run-non-streaming-tts-vits-zh.sh 33 ./run-non-streaming-tts-vits-zh.sh
33 ``` 34 ```
34 35
  36 +## Non-Streaming text-to-speech (Play as it is generating)
  37 +
  38 +```bash
  39 +./run-non-streaming-tts-piper-en-with-callback.sh
  40 +```
  41 +
35 ## Spoken language identification 42 ## Spoken language identification
36 43
37 ```bash 44 ```bash
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  6 + mkdir -p ../build
  7 + pushd ../build
  8 + cmake \
  9 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  10 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  11 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  12 + -DBUILD_SHARED_LIBS=ON \
  13 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  14 + -DSHERPA_ONNX_ENABLE_JNI=ON \
  15 + ..
  16 +
  17 + make -j4
  18 + ls -lh lib
  19 + popd
  20 +fi
  21 +
  22 +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  23 + pushd ../sherpa-onnx/java-api
  24 + make
  25 + popd
  26 +fi
  27 +
  28 +# please visit
  29 +# https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  30 +# to download more models
  31 +if [ ! -f ./vits-piper-en_GB-cori-medium/tokens.txt ]; then
  32 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
  33 + tar xf vits-piper-en_GB-cori-medium.tar.bz2
  34 + rm vits-piper-en_GB-cori-medium.tar.bz2
  35 +fi
  36 +
  37 +java \
  38 + -Djava.library.path=$PWD/../build/lib \
  39 + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  40 + NonStreamingTtsPiperEnWithCallback.java
@@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java @@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java
38 java_files += OfflineTtsModelConfig.java 38 java_files += OfflineTtsModelConfig.java
39 java_files += OfflineTtsConfig.java 39 java_files += OfflineTtsConfig.java
40 java_files += GeneratedAudio.java 40 java_files += GeneratedAudio.java
  41 +java_files += OfflineTtsCallback.java
41 java_files += OfflineTts.java 42 java_files += OfflineTts.java
42 43
43 java_files += SpokenLanguageIdentificationWhisperConfig.java 44 java_files += SpokenLanguageIdentificationWhisperConfig.java
@@ -2,6 +2,7 @@ @@ -2,6 +2,7 @@
2 2
3 package com.k2fsa.sherpa.onnx; 3 package com.k2fsa.sherpa.onnx;
4 4
  5 +
5 public class OfflineTts { 6 public class OfflineTts {
6 static { 7 static {
7 System.loadLibrary("sherpa-onnx-jni"); 8 System.loadLibrary("sherpa-onnx-jni");
@@ -13,6 +14,10 @@ public class OfflineTts { @@ -13,6 +14,10 @@ public class OfflineTts {
13 ptr = newFromFile(config); 14 ptr = newFromFile(config);
14 } 15 }
15 16
  17 + public int getSampleRate() { // returns the sample rate reported for this instance's native handle
  18 + return getSampleRate(ptr); // delegates to the getSampleRate(long) overload, which is not shown in this hunk
  19 + }
  20 +
16 public GeneratedAudio generate(String text) { 21 public GeneratedAudio generate(String text) {
17 return generate(text, 0, 1.0f); 22 return generate(text, 0, 1.0f);
18 } 23 }
@@ -28,6 +33,21 @@ public class OfflineTts { @@ -28,6 +33,21 @@ public class OfflineTts {
28 return new GeneratedAudio(samples, sampleRate); 33 return new GeneratedAudio(samples, sampleRate);
29 } 34 }
30 35
  36 + public GeneratedAudio generateWithCallback(String text, OfflineTtsCallback callback) { // convenience overload: sid 0, speed 1.0f
  37 + return generateWithCallback(text, 0, 1.0f, callback);
  38 + }
  39 +
  40 + public GeneratedAudio generateWithCallback(String text, int sid, OfflineTtsCallback callback) { // convenience overload: speed 1.0f
  41 + return generateWithCallback(text, sid, 1.0f, callback);
  42 + }
  43 +
  44 + public GeneratedAudio generateWithCallback(String text, int sid, float speed, OfflineTtsCallback callback) { // full form that the other overloads delegate to
  45 + Object[] arr = generateWithCallbackImpl(ptr, text, sid, speed, callback); // native call; the callback is passed through unchanged
  46 + float[] samples = (float[]) arr[0]; // element 0 is cast to the sample array
  47 + int sampleRate = (int) arr[1]; // element 1 is cast to the sample rate
  48 + return new GeneratedAudio(samples, sampleRate);
  49 + }
  50 +
31 @Override 51 @Override
32 protected void finalize() throws Throwable { 52 protected void finalize() throws Throwable {
33 release(); 53 release();
@@ -49,5 +69,7 @@ public class OfflineTts { @@ -49,5 +69,7 @@ public class OfflineTts {
49 69
50 private native Object[] generateImpl(long ptr, String text, int sid, float speed); 70 private native Object[] generateImpl(long ptr, String text, int sid, float speed);
51 71
  72 + private native Object[] generateWithCallbackImpl(long ptr, String text, int sid, float speed, OfflineTtsCallback callback); // native method; generateWithCallback reads element 0 as float[] samples and element 1 as the int sample rate
  73 +
52 private native long newFromFile(OfflineTtsConfig config); 74 private native long newFromFile(OfflineTtsConfig config);
53 } 75 }
  1 +package com.k2fsa.sherpa.onnx;
  2 +
  3 +@FunctionalInterface
  4 +public interface OfflineTtsCallback { // callback type accepted by OfflineTts.generateWithCallback
  5 + Integer invoke(float[] samples); // the example callback returns 1 to continue generation and 0 to stop; it copies samples because the array is freed after the call returns
  6 +}