Fangjun Kuang
Committed by GitHub

Add CXX API for KittenTTS (#2469)

@@ -87,6 +87,45 @@ jobs: @@ -87,6 +87,45 @@ jobs:
87 otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib 87 otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
88 fi 88 fi
89 89
  90 + - name: Test KittenTTS
  91 + shell: bash
  92 + run: |
  93 + name=kitten-tts-en-cxx-api
  94 + g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \
  95 + -I ./build/install/include \
  96 + -L ./build/install/lib/ \
  97 + -l sherpa-onnx-cxx-api \
  98 + -l sherpa-onnx-c-api \
  99 + -l onnxruntime
  100 +
  101 + ls -lh $name
  102 +
  103 + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
  104 + ls -lh ./$name
  105 + ldd ./$name
  106 + echo "----"
  107 + readelf -d ./$name
  108 + fi
  109 +
  110 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  111 + tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  112 + rm kitten-nano-en-v0_1-fp16.tar.bz2
  113 +
  114 + echo "---"
  115 +
  116 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  117 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  118 +
  119 + ./$name
  120 +
  121 + rm -rf kitten-nano-en-v0_1-fp16
  122 + rm -v ./$name
  123 +
  124 + - uses: actions/upload-artifact@v4
  125 + with:
  126 + name: kitten-tts-wavs-${{ matrix.os }}
  127 + path: ./generated-kitten-en-cxx.wav
  128 +
90 - name: Test NeMo Canary 129 - name: Test NeMo Canary
91 shell: bash 130 shell: bash
92 run: | 131 run: |
@@ -185,7 +185,7 @@ libsherpa-onnx-cxx-api.so. @@ -185,7 +185,7 @@ libsherpa-onnx-cxx-api.so.
185 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users 185 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users
186 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed. 186 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed.
187 187
188 -In any case, libonnxruntime.is is always needed. 188 +In any case, libonnxruntime.so is always needed.
189 EOF 189 EOF
190 ls -lh install/lib/README.md 190 ls -lh install/lib/README.md
191 fi 191 fi
@@ -185,7 +185,7 @@ libsherpa-onnx-cxx-api.so. @@ -185,7 +185,7 @@ libsherpa-onnx-cxx-api.so.
185 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users 185 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users
186 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed. 186 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed.
187 187
188 -In any case, libonnxruntime.is is always needed. 188 +In any case, libonnxruntime.so is always needed.
189 EOF 189 EOF
190 ls -lh install/lib/README.md 190 ls -lh install/lib/README.md
191 fi 191 fi
@@ -164,7 +164,7 @@ libsherpa-onnx-cxx-api.so. @@ -164,7 +164,7 @@ libsherpa-onnx-cxx-api.so.
164 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users 164 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users
165 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed. 165 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed.
166 166
167 -In any case, libonnxruntime.is is always needed. 167 +In any case, libonnxruntime.so is always needed.
168 EOF 168 EOF
169 ls -lh install/lib/README.md 169 ls -lh install/lib/README.md
170 fi 170 fi
@@ -125,7 +125,7 @@ libsherpa-onnx-cxx-api.so. @@ -125,7 +125,7 @@ libsherpa-onnx-cxx-api.so.
125 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users 125 libsherpa-onnx-c-api.so and libsherpa-onnx-cxx-api.so are for users
126 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed. 126 who don't use JNI. In that case, libsherpa-onnx-jni.so is not needed.
127 127
128 -In any case, libonnxruntime.is is always needed. 128 +In any case, libonnxruntime.so is always needed.
129 EOF 129 EOF
130 ls -lh install/lib/README.md 130 ls -lh install/lib/README.md
131 fi 131 fi
@@ -55,8 +55,9 @@ int32_t main(int32_t argc, char *argv[]) { @@ -55,8 +55,9 @@ int32_t main(int32_t argc, char *argv[]) {
55 55
56 const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); 56 const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
57 // mapping of sid to voice name 57 // mapping of sid to voice name
58 - // 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam  
59 - // 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis 58 + // 0->expr-voice-2-m, 1->expr-voice-2-f, 2->expr-voice-3-m
  59 + // 3->expr-voice-3-f, 4->expr-voice-4-m, 5->expr-voice-4-f
  60 + // 6->expr-voice-5-m, 7->expr-voice-5-f
60 int32_t sid = 0; 61 int32_t sid = 0;
61 float speed = 1.0; // larger -> faster in speech speed 62 float speed = 1.0; // larger -> faster in speech speed
62 63
@@ -124,6 +124,9 @@ if(SHERPA_ONNX_ENABLE_TTS) @@ -124,6 +124,9 @@ if(SHERPA_ONNX_ENABLE_TTS)
124 add_executable(kokoro-tts-en-cxx-api ./kokoro-tts-en-cxx-api.cc) 124 add_executable(kokoro-tts-en-cxx-api ./kokoro-tts-en-cxx-api.cc)
125 target_link_libraries(kokoro-tts-en-cxx-api sherpa-onnx-cxx-api) 125 target_link_libraries(kokoro-tts-en-cxx-api sherpa-onnx-cxx-api)
126 126
  127 + add_executable(kitten-tts-en-cxx-api ./kitten-tts-en-cxx-api.cc)
  128 + target_link_libraries(kitten-tts-en-cxx-api sherpa-onnx-cxx-api)
  129 +
127 add_executable(kokoro-tts-zh-en-cxx-api ./kokoro-tts-zh-en-cxx-api.cc) 130 add_executable(kokoro-tts-zh-en-cxx-api ./kokoro-tts-zh-en-cxx-api.cc)
128 target_link_libraries(kokoro-tts-zh-en-cxx-api sherpa-onnx-cxx-api) 131 target_link_libraries(kokoro-tts-zh-en-cxx-api sherpa-onnx-cxx-api)
129 endif() 132 endif()
  1 +// cxx-api-examples/kitten-tts-en-cxx-api.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +// This file shows how to use sherpa-onnx CXX API
  6 +// for English TTS with Kitten.
  7 +//
  8 +// clang-format off
  9 +/*
  10 +Usage
  11 +
  12 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  13 +tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  14 +rm kitten-nano-en-v0_1-fp16.tar.bz2
  15 +
  16 +./kitten-tts-en-cxx-api
  17 +
  18 + */
  19 +// clang-format on
  20 +
  21 +#include <cstdint>
  22 +#include <cstdio>
  23 +#include <string>
  24 +
  25 +#include "sherpa-onnx/c-api/cxx-api.h"
  26 +
// Progress callback passed to OfflineTts::Generate().
//
// Prints the completion percentage to stderr each time it is invoked.
//
// @param samples      Audio samples handed to the callback (unused here).
// @param num_samples  Number of entries in `samples` (unused here).
// @param progress     Fraction completed; multiplied by 100 for display.
// @param arg          Opaque user pointer (unused here).
// @return 1 to continue generating, 0 to stop generating. This example
//         always continues.
static int32_t ProgressCallback(const float *samples, int32_t num_samples,
                                float progress, void *arg) {
  fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
  // return 1 to continue generating
  // return 0 to stop generating
  return 1;
}
  34 +
  35 +int32_t main(int32_t argc, char *argv[]) {
  36 + using namespace sherpa_onnx::cxx; // NOLINT
  37 + OfflineTtsConfig config;
  38 +
  39 + config.model.kitten.model = "./kitten-nano-en-v0_1-fp16/model.fp16.onnx";
  40 + config.model.kitten.voices = "./kitten-nano-en-v0_1-fp16/voices.bin";
  41 + config.model.kitten.tokens = "./kitten-nano-en-v0_1-fp16/tokens.txt";
  42 + config.model.kitten.data_dir = "./kitten-nano-en-v0_1-fp16/espeak-ng-data";
  43 +
  44 + config.model.num_threads = 2;
  45 +
  46 + // If you don't want to see debug messages, please set it to 0
  47 + config.model.debug = 1;
  48 +
  49 + std::string filename = "./generated-kitten-en-cxx.wav";
  50 + std::string text =
  51 + "Today as always, men fall into two groups: slaves and free men. Whoever "
  52 + "does not have two-thirds of his day for himself, is a slave, whatever "
  53 + "he may be: a statesman, a businessman, an official, or a scholar. "
  54 + "Friends fell out often because life was changing so fast. The easiest "
  55 + "thing in the world was to lose touch with someone.";
  56 +
  57 + auto tts = OfflineTts::Create(config);
  58 + int32_t sid = 0;
  59 + float speed = 1.0; // larger -> faster in speech speed
  60 +
  61 +#if 0
  62 + // If you don't want to use a callback, then please enable this branch
  63 + GeneratedAudio audio = tts.Generate(text, sid, speed);
  64 +#else
  65 + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback);
  66 +#endif
  67 +
  68 + WriteWave(filename, {audio.samples, audio.sample_rate});
  69 +
  70 + fprintf(stderr, "Input text is: %s\n", text.c_str());
  71 + fprintf(stderr, "Speaker ID is: %d\n", sid);
  72 + fprintf(stderr, "Saved to: %s\n", filename.c_str());
  73 +
  74 + return 0;
  75 +}
1 -// cxx-api-examples/kokoro-tts-en-cxx-api.c 1 +// cxx-api-examples/kokoro-tts-en-cxx-api.cc
2 // 2 //
3 // Copyright (c) 2025 Xiaomi Corporation 3 // Copyright (c) 2025 Xiaomi Corporation
4 4
@@ -18,6 +18,8 @@ rm kokoro-en-v0_19.tar.bz2 @@ -18,6 +18,8 @@ rm kokoro-en-v0_19.tar.bz2
18 */ 18 */
19 // clang-format on 19 // clang-format on
20 20
  21 +#include <cstdint>
  22 +#include <cstdio>
21 #include <string> 23 #include <string>
22 24
23 #include "sherpa-onnx/c-api/cxx-api.h" 25 #include "sherpa-onnx/c-api/cxx-api.h"
@@ -66,7 +68,7 @@ int32_t main(int32_t argc, char *argv[]) { @@ -66,7 +68,7 @@ int32_t main(int32_t argc, char *argv[]) {
66 WriteWave(filename, {audio.samples, audio.sample_rate}); 68 WriteWave(filename, {audio.samples, audio.sample_rate});
67 69
68 fprintf(stderr, "Input text is: %s\n", text.c_str()); 70 fprintf(stderr, "Input text is: %s\n", text.c_str());
69 - fprintf(stderr, "Speaker ID is is: %d\n", sid); 71 + fprintf(stderr, "Speaker ID is: %d\n", sid);
70 fprintf(stderr, "Saved to: %s\n", filename.c_str()); 72 fprintf(stderr, "Saved to: %s\n", filename.c_str());
71 73
72 return 0; 74 return 0;
@@ -18,6 +18,8 @@ rm kokoro-multi-lang-v1_0.tar.bz2 @@ -18,6 +18,8 @@ rm kokoro-multi-lang-v1_0.tar.bz2
18 */ 18 */
19 // clang-format on 19 // clang-format on
20 20
  21 +#include <cstdint>
  22 +#include <cstdio>
21 #include <string> 23 #include <string>
22 24
23 #include "sherpa-onnx/c-api/cxx-api.h" 25 #include "sherpa-onnx/c-api/cxx-api.h"
@@ -67,7 +69,7 @@ int32_t main(int32_t argc, char *argv[]) { @@ -67,7 +69,7 @@ int32_t main(int32_t argc, char *argv[]) {
67 WriteWave(filename, {audio.samples, audio.sample_rate}); 69 WriteWave(filename, {audio.samples, audio.sample_rate});
68 70
69 fprintf(stderr, "Input text is: %s\n", text.c_str()); 71 fprintf(stderr, "Input text is: %s\n", text.c_str());
70 - fprintf(stderr, "Speaker ID is is: %d\n", sid); 72 + fprintf(stderr, "Speaker ID is: %d\n", sid);
71 fprintf(stderr, "Saved to: %s\n", filename.c_str()); 73 fprintf(stderr, "Saved to: %s\n", filename.c_str());
72 74
73 return 0; 75 return 0;
@@ -20,6 +20,8 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/voco @@ -20,6 +20,8 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/voco
20 */ 20 */
21 // clang-format on 21 // clang-format on
22 22
  23 +#include <cstdint>
  24 +#include <cstdio>
23 #include <string> 25 #include <string>
24 26
25 #include "sherpa-onnx/c-api/cxx-api.h" 27 #include "sherpa-onnx/c-api/cxx-api.h"
@@ -73,7 +75,7 @@ int32_t main(int32_t argc, char *argv[]) { @@ -73,7 +75,7 @@ int32_t main(int32_t argc, char *argv[]) {
73 WriteWave(filename, {audio.samples, audio.sample_rate}); 75 WriteWave(filename, {audio.samples, audio.sample_rate});
74 76
75 fprintf(stderr, "Input text is: %s\n", text.c_str()); 77 fprintf(stderr, "Input text is: %s\n", text.c_str());
76 - fprintf(stderr, "Speaker ID is is: %d\n", sid); 78 + fprintf(stderr, "Speaker ID is: %d\n", sid);
77 fprintf(stderr, "Saved to: %s\n", filename.c_str()); 79 fprintf(stderr, "Saved to: %s\n", filename.c_str());
78 80
79 return 0; 81 return 0;
@@ -20,6 +20,8 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/voco @@ -20,6 +20,8 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/voco
20 */ 20 */
21 // clang-format on 21 // clang-format on
22 22
  23 +#include <cstdint>
  24 +#include <cstdio>
23 #include <string> 25 #include <string>
24 26
25 #include "sherpa-onnx/c-api/cxx-api.h" 27 #include "sherpa-onnx/c-api/cxx-api.h"
@@ -72,7 +74,7 @@ int32_t main(int32_t argc, char *argv[]) { @@ -72,7 +74,7 @@ int32_t main(int32_t argc, char *argv[]) {
72 WriteWave(filename, {audio.samples, audio.sample_rate}); 74 WriteWave(filename, {audio.samples, audio.sample_rate});
73 75
74 fprintf(stderr, "Input text is: %s\n", text.c_str()); 76 fprintf(stderr, "Input text is: %s\n", text.c_str());
75 - fprintf(stderr, "Speaker ID is is: %d\n", sid); 77 + fprintf(stderr, "Speaker ID is: %d\n", sid);
76 fprintf(stderr, "Saved to: %s\n", filename.c_str()); 78 fprintf(stderr, "Saved to: %s\n", filename.c_str());
77 79
78 return 0; 80 return 0;
@@ -392,6 +392,12 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { @@ -392,6 +392,12 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
392 c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str(); 392 c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str();
393 c.model.kokoro.lang = config.model.kokoro.lang.c_str(); 393 c.model.kokoro.lang = config.model.kokoro.lang.c_str();
394 394
  395 + c.model.kitten.model = config.model.kitten.model.c_str();
  396 + c.model.kitten.voices = config.model.kitten.voices.c_str();
  397 + c.model.kitten.tokens = config.model.kitten.tokens.c_str();
  398 + c.model.kitten.data_dir = config.model.kitten.data_dir.c_str();
  399 + c.model.kitten.length_scale = config.model.kitten.length_scale;
  400 +
395 c.model.num_threads = config.model.num_threads; 401 c.model.num_threads = config.model.num_threads;
396 c.model.debug = config.model.debug; 402 c.model.debug = config.model.debug;
397 c.model.provider = config.model.provider.c_str(); 403 c.model.provider = config.model.provider.c_str();
@@ -394,10 +394,20 @@ struct OfflineTtsKokoroModelConfig { @@ -394,10 +394,20 @@ struct OfflineTtsKokoroModelConfig {
394 float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed 394 float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
395 }; 395 };
396 396
// Configuration for a KittenTTS model.
//
// All string fields default to empty. The example program fills them with
// paths into the extracted model directory.
struct OfflineTtsKittenModelConfig {
  std::string model;     // path to the ONNX model file
  std::string voices;    // path to voices.bin
  std::string tokens;    // path to tokens.txt
  std::string data_dir;  // path to the espeak-ng-data directory

  float length_scale = 1.0;  // < 1, faster in speed; > 1, slower in speed
};
  405 +
397 struct OfflineTtsModelConfig { 406 struct OfflineTtsModelConfig {
398 OfflineTtsVitsModelConfig vits; 407 OfflineTtsVitsModelConfig vits;
399 OfflineTtsMatchaModelConfig matcha; 408 OfflineTtsMatchaModelConfig matcha;
400 OfflineTtsKokoroModelConfig kokoro; 409 OfflineTtsKokoroModelConfig kokoro;
  410 + OfflineTtsKittenModelConfig kitten;
401 int32_t num_threads = 1; 411 int32_t num_threads = 1;
402 bool debug = false; 412 bool debug = false;
403 std::string provider = "cpu"; 413 std::string provider = "cpu";