Fangjun Kuang
Committed by GitHub

Add CXX API for Kokoro TTS 1.0 (#1802)

@@ -103,6 +103,28 @@ jobs: @@ -103,6 +103,28 @@ jobs:
103 rm kws-cxx-api 103 rm kws-cxx-api
104 rm -rf sherpa-onnx-kws-* 104 rm -rf sherpa-onnx-kws-*
105 105
  106 + - name: Test Kokoro TTS (zh+en)
  107 + shell: bash
  108 + run: |
  109 + g++ -std=c++17 -o kokoro-tts-zh-en-cxx-api ./cxx-api-examples/kokoro-tts-zh-en-cxx-api.cc \
  110 + -I ./build/install/include \
  111 + -L ./build/install/lib/ \
  112 + -l sherpa-onnx-cxx-api \
  113 + -l sherpa-onnx-c-api \
  114 + -l onnxruntime
  115 +
  116 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  117 + tar xf kokoro-multi-lang-v1_0.tar.bz2
  118 + rm kokoro-multi-lang-v1_0.tar.bz2
  119 +
  120 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  121 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  122 +
  123 + ./kokoro-tts-zh-en-cxx-api
  124 +
  125 + rm kokoro-tts-zh-en-cxx-api
  126 + rm -rf kokoro-*
  127 +
106 - name: Test Kokoro TTS (en) 128 - name: Test Kokoro TTS (en)
107 shell: bash 129 shell: bash
108 run: | 130 run: |
@@ -26,7 +26,7 @@ int32_t main() { @@ -26,7 +26,7 @@ int32_t main() {
26 memset(&config, 0, sizeof(config)); 26 memset(&config, 0, sizeof(config));
27 config.model_config.transducer.encoder = 27 config.model_config.transducer.encoder =
28 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 28 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
29 - "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; 29 + "encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx";
30 30
31 config.model_config.transducer.decoder = 31 config.model_config.transducer.decoder =
32 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 32 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
@@ -34,7 +34,7 @@ int32_t main() { @@ -34,7 +34,7 @@ int32_t main() {
34 34
35 config.model_config.transducer.joiner = 35 config.model_config.transducer.joiner =
36 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 36 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
37 - "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; 37 + "joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx";
38 38
39 config.model_config.tokens = 39 config.model_config.tokens =
40 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 40 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
@@ -58,7 +58,8 @@ int32_t main() { @@ -58,7 +58,8 @@ int32_t main() {
58 "--Test pre-defined keywords from test_wavs/test_keywords.txt--\n"); 58 "--Test pre-defined keywords from test_wavs/test_keywords.txt--\n");
59 59
60 const char *wav_filename = 60 const char *wav_filename =
61 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav"; 61 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
  62 + "test_wavs/3.wav";
62 63
63 float tail_paddings[8000] = {0}; // 0.5 seconds 64 float tail_paddings[8000] = {0}; // 0.5 seconds
64 65
@@ -27,4 +27,7 @@ if(SHERPA_ONNX_ENABLE_TTS) @@ -27,4 +27,7 @@ if(SHERPA_ONNX_ENABLE_TTS)
27 27
28 add_executable(kokoro-tts-en-cxx-api ./kokoro-tts-en-cxx-api.cc) 28 add_executable(kokoro-tts-en-cxx-api ./kokoro-tts-en-cxx-api.cc)
29 target_link_libraries(kokoro-tts-en-cxx-api sherpa-onnx-cxx-api) 29 target_link_libraries(kokoro-tts-en-cxx-api sherpa-onnx-cxx-api)
  30 +
  31 + add_executable(kokoro-tts-zh-en-cxx-api ./kokoro-tts-zh-en-cxx-api.cc)
  32 + target_link_libraries(kokoro-tts-zh-en-cxx-api sherpa-onnx-cxx-api)
30 endif() 33 endif()
  1 +// cxx-api-examples/kokoro-tts-zh-en-cxx-api.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +// This file shows how to use the sherpa-onnx CXX API
  6 +// for mixed Chinese and English TTS with Kokoro.
  7 +//
  8 +// clang-format off
  9 +/*
  10 +Usage
  11 +
  12 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  13 +tar xf kokoro-multi-lang-v1_0.tar.bz2
  14 +rm kokoro-multi-lang-v1_0.tar.bz2
  15 +
  16 +./kokoro-tts-zh-en-cxx-api
  17 +
  18 + */
  19 +// clang-format on
  20 +
  21 +#include <string>
  22 +
  23 +#include "sherpa-onnx/c-api/cxx-api.h"
  24 +
// Progress callback handed to OfflineTts::Generate() in main().
//
// This example only reports how far generation has progressed; it does not
// consume the audio samples or the user argument, so those parameters are
// left unnamed to avoid unused-parameter warnings.
//
// @param progress  Fraction of the work done so far; it is multiplied by 100
//                  and printed to stderr as a percentage.
// @return 1 to continue generating, 0 to stop generating. This example
//         always continues.
static int32_t ProgressCallback(const float * /*samples*/,
                                int32_t /*num_samples*/, float progress,
                                void * /*arg*/) {
  fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
  return 1;
}
  32 +
  33 +int32_t main(int32_t argc, char *argv[]) {
  34 + using namespace sherpa_onnx::cxx; // NOLINT
  35 + OfflineTtsConfig config;
  36 +
  37 + config.model.kokoro.model = "./kokoro-multi-lang-v1_0/model.onnx";
  38 + config.model.kokoro.voices = "./kokoro-multi-lang-v1_0/voices.bin";
  39 + config.model.kokoro.tokens = "./kokoro-multi-lang-v1_0/tokens.txt";
  40 + config.model.kokoro.data_dir = "./kokoro-multi-lang-v1_0/espeak-ng-data";
  41 + config.model.kokoro.dict_dir = "./kokoro-multi-lang-v1_0/dict";
  42 + config.model.kokoro.lexicon =
  43 + "./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/"
  44 + "lexicon-zh.txt";
  45 +
  46 + config.model.num_threads = 2;
  47 +
  48 + // If you don't want to see debug messages, please set it to 0
  49 + config.model.debug = 1;
  50 +
  51 + std::string filename = "./generated-kokoro-zh-en-cxx.wav";
  52 + std::string text =
  53 + "中英文语音合成测试。This is generated by next generation Kaldi using "
  54 + "Kokoro without Misaki. 你觉得中英文说的如何呢?";
  55 +
  56 + auto tts = OfflineTts::Create(config);
  57 + int32_t sid = 50;
  58 + float speed = 1.0; // larger -> faster in speech speed
  59 +
  60 +#if 0
  61 + // If you don't want to use a callback, then please enable this branch
  62 + GeneratedAudio audio = tts.Generate(text, sid, speed);
  63 +#else
  64 + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback);
  65 +#endif
  66 +
  67 + WriteWave(filename, {audio.samples, audio.sample_rate});
  68 +
  69 + fprintf(stderr, "Input text is: %s\n", text.c_str());
  70 + fprintf(stderr, "Speaker ID is is: %d\n", sid);
  71 + fprintf(stderr, "Saved to: %s\n", filename.c_str());
  72 +
  73 + return 0;
  74 +}
@@ -25,7 +25,7 @@ int32_t main() { @@ -25,7 +25,7 @@ int32_t main() {
25 KeywordSpotterConfig config; 25 KeywordSpotterConfig config;
26 config.model_config.transducer.encoder = 26 config.model_config.transducer.encoder =
27 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 27 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
28 - "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; 28 + "encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx";
29 29
30 config.model_config.transducer.decoder = 30 config.model_config.transducer.decoder =
31 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 31 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
@@ -33,7 +33,7 @@ int32_t main() { @@ -33,7 +33,7 @@ int32_t main() {
33 33
34 config.model_config.transducer.joiner = 34 config.model_config.transducer.joiner =
35 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 35 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
36 - "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; 36 + "joiner-epoch-12-avg-2-chunk-16-left-64.int8.onnx";
37 37
38 config.model_config.tokens = 38 config.model_config.tokens =
39 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" 39 "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
@@ -57,7 +57,8 @@ int32_t main() { @@ -57,7 +57,8 @@ int32_t main() {
57 << "--Test pre-defined keywords from test_wavs/test_keywords.txt--\n"; 57 << "--Test pre-defined keywords from test_wavs/test_keywords.txt--\n";
58 58
59 std::string wave_filename = 59 std::string wave_filename =
60 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav"; 60 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
  61 + "test_wavs/3.wav";
61 62
62 std::array<float, 8000> tail_paddings = {0}; // 0.5 seconds 63 std::array<float, 8000> tail_paddings = {0}; // 0.5 seconds
63 64
@@ -343,6 +343,8 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { @@ -343,6 +343,8 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
343 c.model.kokoro.tokens = config.model.kokoro.tokens.c_str(); 343 c.model.kokoro.tokens = config.model.kokoro.tokens.c_str();
344 c.model.kokoro.data_dir = config.model.kokoro.data_dir.c_str(); 344 c.model.kokoro.data_dir = config.model.kokoro.data_dir.c_str();
345 c.model.kokoro.length_scale = config.model.kokoro.length_scale; 345 c.model.kokoro.length_scale = config.model.kokoro.length_scale;
  346 + c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str();
  347 + c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str();
346 348
347 c.model.num_threads = config.model.num_threads; 349 c.model.num_threads = config.model.num_threads;
348 c.model.debug = config.model.debug; 350 c.model.debug = config.model.debug;
@@ -343,6 +343,8 @@ struct OfflineTtsKokoroModelConfig { @@ -343,6 +343,8 @@ struct OfflineTtsKokoroModelConfig {
343 std::string voices; 343 std::string voices;
344 std::string tokens; 344 std::string tokens;
345 std::string data_dir; 345 std::string data_dir;
  346 + std::string dict_dir;
  347 + std::string lexicon;
346 348
347 float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed 349 float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
348 }; 350 };