Committed by
GitHub
Add TTS example for Java API. (#1176)
It plays the generated audio as it is still generating.
正在显示
6 个修改的文件
包含
262 行增加
和
0 行删除
| 1 | +// Copyright 2024 Xiaomi Corporation | ||
| 2 | +// | ||
| 3 | +// References | ||
| 4 | +// https://www.baeldung.com/java-passing-method-parameter | ||
| 5 | +// https://www.geeksforgeeks.org/how-to-create-a-thread-safe-queue-in-java/ | ||
| 6 | +// https://stackoverflow.com/questions/74077394/java-audio-how-to-continuously-write-bytes-to-an-audio-file-as-they-are-being-g | ||
| 7 | + | ||
| 8 | +// This file shows how to use a piper VITS English TTS model | ||
| 9 | +// to convert text to speech. You can pass a callback to the generation call, | ||
| 10 | +// which is invoked each time max_num_sentences sentences have finished | ||
| 11 | +// generating. | ||
| 12 | +// | ||
| 13 | +// The callback saves the generated samples into a queue, which are played | ||
| 14 | +// by a separate thread. | ||
| 15 | + | ||
| 16 | +import com.k2fsa.sherpa.onnx.*; | ||
| 17 | +import java.util.Queue; | ||
| 18 | +import java.util.concurrent.*; | ||
| 19 | +import java.util.concurrent.ConcurrentLinkedQueue; | ||
| 20 | +import javax.sound.sampled.*; | ||
| 21 | + | ||
| 22 | +public class NonStreamingTtsPiperEn { | ||
| 23 | + public static void main(String[] args) { | ||
| 24 | + // please visit | ||
| 25 | + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | ||
| 26 | + // to download model files | ||
| 27 | + String model = "./vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx"; | ||
| 28 | + String tokens = "./vits-piper-en_GB-cori-medium/tokens.txt"; | ||
| 29 | + String dataDir = "./vits-piper-en_GB-cori-medium/espeak-ng-data"; | ||
| 30 | + String text = | ||
| 31 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have" | ||
| 32 | + + " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a" | ||
| 33 | + + " businessman, an official, or a scholar."; | ||
| 34 | + | ||
| 35 | + OfflineTtsVitsModelConfig vitsModelConfig = | ||
| 36 | + OfflineTtsVitsModelConfig.builder() | ||
| 37 | + .setModel(model) | ||
| 38 | + .setTokens(tokens) | ||
| 39 | + .setDataDir(dataDir) | ||
| 40 | + .build(); | ||
| 41 | + | ||
| 42 | + OfflineTtsModelConfig modelConfig = | ||
| 43 | + OfflineTtsModelConfig.builder() | ||
| 44 | + .setVits(vitsModelConfig) | ||
| 45 | + .setNumThreads(1) | ||
| 46 | + .setDebug(true) | ||
| 47 | + .build(); | ||
| 48 | + | ||
| 49 | + OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build(); | ||
| 50 | + OfflineTts tts = new OfflineTts(config); | ||
| 51 | + | ||
| 52 | + Queue<byte[]> samplesQueue = new ConcurrentLinkedQueue<>(); | ||
| 53 | + | ||
| 54 | + Semaphore canPlaySem = new Semaphore(1); | ||
| 55 | + try { | ||
| 56 | + canPlaySem.acquire(); | ||
| 57 | + } catch (InterruptedException ex) { | ||
| 58 | + System.out.println("Failed to acquire the play semaphore in the main thread"); | ||
| 59 | + return; | ||
| 60 | + } | ||
| 61 | + | ||
| 62 | + Runnable playRuannable = | ||
| 63 | + () -> { | ||
| 64 | + try { | ||
| 65 | + canPlaySem.acquire(); | ||
| 66 | + } catch (InterruptedException e) { | ||
| 67 | + System.out.println("Failed to get canPlay semaphore in the play thread"); | ||
| 68 | + return; | ||
| 69 | + } | ||
| 70 | + | ||
| 71 | + // https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html | ||
| 72 | + AudioFormat format = | ||
| 73 | + new AudioFormat( | ||
| 74 | + tts.getSampleRate(), // sampleRate | ||
| 75 | + 16, // sampleSizeInBits | ||
| 76 | + 1, // channels | ||
| 77 | + true, // signed | ||
| 78 | + false // bigEndian | ||
| 79 | + ); | ||
| 80 | + DataLine.Info info = new DataLine.Info(SourceDataLine.class, format); | ||
| 81 | + SourceDataLine line; | ||
| 82 | + try { | ||
| 83 | + line = (SourceDataLine) AudioSystem.getLine(info); | ||
| 84 | + | ||
| 85 | + int bufferSizeInBytes = tts.getSampleRate(); // 0.5 seconds | ||
| 86 | + line.open(format, bufferSizeInBytes); | ||
| 87 | + } catch (LineUnavailableException ex) { | ||
| 88 | + System.out.println("Failed to open a device for playing"); | ||
| 89 | + return; | ||
| 90 | + } | ||
| 91 | + line.start(); | ||
| 92 | + | ||
| 93 | + while (true) { | ||
| 94 | + if (samplesQueue.isEmpty()) { | ||
| 95 | + // Do nothing. | ||
| 96 | + // | ||
| 97 | + // If the generating speed is very slow, we can sleep | ||
| 98 | + // for some time here to save some CPU. | ||
| 99 | + } else { | ||
| 100 | + byte[] samples = samplesQueue.poll(); | ||
| 101 | + if (samples.length == 1) { | ||
| 102 | + // end of the generating | ||
| 103 | + break; | ||
| 104 | + } | ||
| 105 | + line.write(samples, 0, samples.length); | ||
| 106 | + } | ||
| 107 | + } | ||
| 108 | + | ||
| 109 | + line.drain(); | ||
| 110 | + line.close(); | ||
| 111 | + }; | ||
| 112 | + | ||
| 113 | + Thread playThread = new Thread(playRuannable); | ||
| 114 | + playThread.start(); | ||
| 115 | + | ||
| 116 | + int sid = 0; | ||
| 117 | + float speed = 1.0f; | ||
| 118 | + long start = System.currentTimeMillis(); | ||
| 119 | + GeneratedAudio audio = | ||
| 120 | + tts.generateWithCallback( | ||
| 121 | + text, | ||
| 122 | + sid, | ||
| 123 | + speed, | ||
| 124 | + (float[] samples) -> { | ||
| 125 | + | ||
| 126 | + // we use a byte array to save int16 samples | ||
| 127 | + byte[] samplesInt16 = new byte[samples.length * 2]; | ||
| 128 | + for (int i = 0; i < samples.length; ++i) { | ||
| 129 | + float s = samples[i]; | ||
| 130 | + if (s > 1) { | ||
| 131 | + s = 1; | ||
| 132 | + } | ||
| 133 | + | ||
| 134 | + if (s < -1) { | ||
| 135 | + s = -1; | ||
| 136 | + } | ||
| 137 | + | ||
| 138 | + short t = (short) (s * 32767); | ||
| 139 | + | ||
| 140 | + // we use little endian | ||
| 141 | + samplesInt16[2 * i] = (byte) (t & 0xff); | ||
| 142 | + samplesInt16[2 * i + 1] = (byte) ((t & 0xff00) >> 8); | ||
| 143 | + } | ||
| 144 | + | ||
| 145 | + samplesQueue.add(samplesInt16); | ||
| 146 | + | ||
| 147 | + canPlaySem.release(); | ||
| 148 | + | ||
| 149 | + // Note: You can play the samples. | ||
| 150 | + // warning: You need to save a copy of samples since it is freed | ||
| 151 | + // when this function returns | ||
| 152 | + | ||
| 153 | + // return 1 to continue generation | ||
| 154 | + // return 0 to stop generation | ||
| 155 | + return 1; | ||
| 156 | + }); | ||
| 157 | + | ||
| 158 | + // Since a sample always has two bytes. We put a single byte | ||
| 159 | + // into the queue to indicate that we have finished processing. | ||
| 160 | + samplesQueue.add(new byte[1]); | ||
| 161 | + | ||
| 162 | + long stop = System.currentTimeMillis(); | ||
| 163 | + | ||
| 164 | + float timeElapsedSeconds = (stop - start) / 1000.0f; | ||
| 165 | + | ||
| 166 | + float audioDuration = audio.getSamples().length / (float) audio.getSampleRate(); | ||
| 167 | + float real_time_factor = timeElapsedSeconds / audioDuration; | ||
| 168 | + | ||
| 169 | + try { | ||
| 170 | + playThread.join(); | ||
| 171 | + } catch (InterruptedException ex) { | ||
| 172 | + System.out.println("Failed to join the play thread"); | ||
| 173 | + return; | ||
| 174 | + } | ||
| 175 | + | ||
| 176 | + String waveFilename = "tts-piper-en.wav"; | ||
| 177 | + audio.save(waveFilename); | ||
| 178 | + System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | ||
| 179 | + System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | ||
| 180 | + System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | ||
| 181 | + System.out.printf("-- text: %s\n", text); | ||
| 182 | + System.out.printf("-- Saved to %s\n", waveFilename); | ||
| 183 | + | ||
| 184 | + tts.release(); | ||
| 185 | + } | ||
| 186 | +} |
| @@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx. | @@ -24,6 +24,7 @@ This directory contains examples for the JAVA API of sherpa-onnx. | ||
| 24 | ./run-non-streaming-decode-file-nemo.sh | 24 | ./run-non-streaming-decode-file-nemo.sh |
| 25 | ``` | 25 | ``` |
| 26 | 26 | ||
| 27 | + | ||
| 27 | ## Non-Streaming text-to-speech | 28 | ## Non-Streaming text-to-speech |
| 28 | 29 | ||
| 29 | ```bash | 30 | ```bash |
| @@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx. | @@ -32,6 +33,12 @@ This directory contains examples for the JAVA API of sherpa-onnx. | ||
| 32 | ./run-non-streaming-tts-vits-zh.sh | 33 | ./run-non-streaming-tts-vits-zh.sh |
| 33 | ``` | 34 | ``` |
| 34 | 35 | ||
| 36 | +## Non-Streaming text-to-speech (Play as it is generating) | ||
| 37 | + | ||
| 38 | +```bash | ||
| 39 | +./run-non-streaming-tts-piper-en-with-callback.sh | ||
| 40 | +``` | ||
| 41 | + | ||
| 35 | ## Spoken language identification | 42 | ## Spoken language identification |
| 36 | 43 | ||
| 37 | ```bash | 44 | ```bash |
#!/usr/bin/env bash
#
# Runs the Java text-to-speech example that plays audio while it is generated.
#
# Steps, each skipped when its output already exists:
#   1. build the sherpa-onnx JNI shared library with cmake
#   2. build sherpa-onnx.jar
#   3. download and unpack the vits-piper-en_GB-cori-medium model
# Finally the example is started with java.
#
# Paths are relative, so run this script from the directory that contains it.

set -ex

if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# please visit
# https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
# to download more models
if [ ! -f ./vits-piper-en_GB-cori-medium/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
  tar xf vits-piper-en_GB-cori-medium.tar.bz2
  rm vits-piper-en_GB-cori-medium.tar.bz2
fi

# Quote the library path so that a working directory containing spaces still works.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsPiperEnWithCallback.java
| @@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java | @@ -38,6 +38,7 @@ java_files += OfflineTtsVitsModelConfig.java | ||
| 38 | java_files += OfflineTtsModelConfig.java | 38 | java_files += OfflineTtsModelConfig.java |
| 39 | java_files += OfflineTtsConfig.java | 39 | java_files += OfflineTtsConfig.java |
| 40 | java_files += GeneratedAudio.java | 40 | java_files += GeneratedAudio.java |
| 41 | +java_files += OfflineTtsCallback.java | ||
| 41 | java_files += OfflineTts.java | 42 | java_files += OfflineTts.java |
| 42 | 43 | ||
| 43 | java_files += SpokenLanguageIdentificationWhisperConfig.java | 44 | java_files += SpokenLanguageIdentificationWhisperConfig.java |
| @@ -2,6 +2,7 @@ | @@ -2,6 +2,7 @@ | ||
| 2 | 2 | ||
| 3 | package com.k2fsa.sherpa.onnx; | 3 | package com.k2fsa.sherpa.onnx; |
| 4 | 4 | ||
| 5 | + | ||
| 5 | public class OfflineTts { | 6 | public class OfflineTts { |
| 6 | static { | 7 | static { |
| 7 | System.loadLibrary("sherpa-onnx-jni"); | 8 | System.loadLibrary("sherpa-onnx-jni"); |
| @@ -13,6 +14,10 @@ public class OfflineTts { | @@ -13,6 +14,10 @@ public class OfflineTts { | ||
| 13 | ptr = newFromFile(config); | 14 | ptr = newFromFile(config); |
| 14 | } | 15 | } |
| 15 | 16 | ||
| 17 | + public int getSampleRate() { | ||
| 18 | + return getSampleRate(ptr); | ||
| 19 | + } | ||
| 20 | + | ||
| 16 | public GeneratedAudio generate(String text) { | 21 | public GeneratedAudio generate(String text) { |
| 17 | return generate(text, 0, 1.0f); | 22 | return generate(text, 0, 1.0f); |
| 18 | } | 23 | } |
| @@ -28,6 +33,21 @@ public class OfflineTts { | @@ -28,6 +33,21 @@ public class OfflineTts { | ||
| 28 | return new GeneratedAudio(samples, sampleRate); | 33 | return new GeneratedAudio(samples, sampleRate); |
| 29 | } | 34 | } |
| 30 | 35 | ||
| 36 | + public GeneratedAudio generateWithCallback(String text, OfflineTtsCallback callback) { | ||
| 37 | + return generateWithCallback(text, 0, 1.0f, callback); | ||
| 38 | + } | ||
| 39 | + | ||
| 40 | + public GeneratedAudio generateWithCallback(String text, int sid, OfflineTtsCallback callback) { | ||
| 41 | + return generateWithCallback(text, sid, 1.0f, callback); | ||
| 42 | + } | ||
| 43 | + | ||
| 44 | + public GeneratedAudio generateWithCallback(String text, int sid, float speed, OfflineTtsCallback callback) { | ||
| 45 | + Object[] arr = generateWithCallbackImpl(ptr, text, sid, speed, callback); | ||
| 46 | + float[] samples = (float[]) arr[0]; | ||
| 47 | + int sampleRate = (int) arr[1]; | ||
| 48 | + return new GeneratedAudio(samples, sampleRate); | ||
| 49 | + } | ||
| 50 | + | ||
| 31 | @Override | 51 | @Override |
| 32 | protected void finalize() throws Throwable { | 52 | protected void finalize() throws Throwable { |
| 33 | release(); | 53 | release(); |
| @@ -49,5 +69,7 @@ public class OfflineTts { | @@ -49,5 +69,7 @@ public class OfflineTts { | ||
| 49 | 69 | ||
| 50 | private native Object[] generateImpl(long ptr, String text, int sid, float speed); | 70 | private native Object[] generateImpl(long ptr, String text, int sid, float speed); |
| 51 | 71 | ||
| 72 | + private native Object[] generateWithCallbackImpl(long ptr, String text, int sid, float speed, OfflineTtsCallback callback); | ||
| 73 | + | ||
| 52 | private native long newFromFile(OfflineTtsConfig config); | 74 | private native long newFromFile(OfflineTtsConfig config); |
| 53 | } | 75 | } |
-
请 注册 或 登录 后发表评论