Fangjun Kuang
Committed by GitHub

Add CXX API for MatchaTTS models (#1676)

@@ -101,6 +101,10 @@ jobs: @@ -101,6 +101,10 @@ jobs:
101 101
102 ./matcha-tts-zh-c-api 102 ./matcha-tts-zh-c-api
103 103
  104 + rm ./matcha-tts-zh-c-api
  105 + rm -rf matcha-icefall-*
  106 + rm hifigan_v2.onnx
  107 +
104 - name: Test Matcha TTS (en) 108 - name: Test Matcha TTS (en)
105 shell: bash 109 shell: bash
106 run: | 110 run: |
@@ -121,6 +125,10 @@ jobs: @@ -121,6 +125,10 @@ jobs:
121 125
122 ./matcha-tts-en-c-api 126 ./matcha-tts-en-c-api
123 127
  128 + rm ./matcha-tts-en-c-api
  129 + rm -rf matcha-icefall-*
  130 + rm hifigan_v2.onnx
  131 +
124 - uses: actions/upload-artifact@v4 132 - uses: actions/upload-artifact@v4
125 with: 133 with:
126 name: matcha-tts-${{ matrix.os }} 134 name: matcha-tts-${{ matrix.os }}
@@ -83,6 +83,61 @@ jobs: @@ -83,6 +83,61 @@ jobs:
83 otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib 83 otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
84 fi 84 fi
85 85
  86 + - name: Test Matcha TTS (zh)
  87 + shell: bash
  88 + run: |
  89 + g++ -std=c++17 -o matcha-tts-zh-cxx-api ./cxx-api-examples/matcha-tts-zh-cxx-api.cc \
  90 + -I ./build/install/include \
  91 + -L ./build/install/lib/ \
  92 + -l sherpa-onnx-cxx-api \
  93 + -l sherpa-onnx-c-api \
  94 + -l onnxruntime
  95 +
  96 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  97 + tar xvf matcha-icefall-zh-baker.tar.bz2
  98 + rm matcha-icefall-zh-baker.tar.bz2
  99 +
  100 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  101 +
  102 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  103 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  104 +
  105 + ./matcha-tts-zh-cxx-api
  106 +
  107 + rm -rf matcha-icefall-*
  108 + rm hifigan_v2.onnx
  109 + rm matcha-tts-zh-cxx-api
  110 +
  111 + - name: Test Matcha TTS (en)
  112 + shell: bash
  113 + run: |
  114 + g++ -std=c++17 -o matcha-tts-en-cxx-api ./cxx-api-examples/matcha-tts-en-cxx-api.cc \
  115 + -I ./build/install/include \
  116 + -L ./build/install/lib/ \
  117 + -l sherpa-onnx-cxx-api \
  118 + -l sherpa-onnx-c-api \
  119 + -l onnxruntime
  120 +
  121 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  122 + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  123 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  124 +
  125 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  126 +
  127 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  128 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  129 +
  130 + ./matcha-tts-en-cxx-api
  131 +
  132 + rm matcha-tts-en-cxx-api
  133 + rm -rf matcha-icefall-*
  134 + rm hifigan_v2.onnx
  135 +
  136 + - uses: actions/upload-artifact@v4
  137 + with:
  138 + name: matcha-tts-${{ matrix.os }}
  139 + path: ./generated-matcha-*.wav
  140 +
86 - name: Test Moonshine tiny 141 - name: Test Moonshine tiny
87 shell: bash 142 shell: bash
88 run: | 143 run: |
@@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) {
60 "Friends fell out often because life was changing so fast. The easiest " 60 "Friends fell out often because life was changing so fast. The easiest "
61 "thing in the world was to lose touch with someone."; 61 "thing in the world was to lose touch with someone.";
62 62
63 - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); 63 + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
64 int32_t sid = 0; 64 int32_t sid = 0;
65 float speed = 1.0; // larger -> faster in speech speed 65 float speed = 1.0; // larger -> faster in speech speed
66 66
@@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) {
60 "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; " 60 "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; "
61 "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"; 61 "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。";
62 62
63 - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); 63 + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
64 int32_t sid = 0; 64 int32_t sid = 0;
65 float speed = 1.0; // larger -> faster in speech speed 65 float speed = 1.0; // larger -> faster in speech speed
66 66
@@ -229,7 +229,7 @@ int32_t main(int32_t argc, char *argv[]) { @@ -229,7 +229,7 @@ int32_t main(int32_t argc, char *argv[]) {
229 ShowUsage(); 229 ShowUsage();
230 } 230 }
231 231
232 - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); 232 + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
233 233
234 const SherpaOnnxGeneratedAudio *audio = 234 const SherpaOnnxGeneratedAudio *audio =
235 SherpaOnnxOfflineTtsGenerate(tts, text, sid, 1.0); 235 SherpaOnnxOfflineTtsGenerate(tts, text, sid, 1.0);
@@ -14,3 +14,11 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) @@ -14,3 +14,11 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api)
14 14
15 add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) 15 add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc)
16 target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) 16 target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api)
  17 +
  18 +if(SHERPA_ONNX_ENABLE_TTS)
  19 + add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc)
  20 + target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api)
  21 +
  22 + add_executable(matcha-tts-en-cxx-api ./matcha-tts-en-cxx-api.cc)
  23 + target_link_libraries(matcha-tts-en-cxx-api sherpa-onnx-cxx-api)
  24 +endif()
  1 +// cxx-api-examples/matcha-tts-en-cxx-api.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +// This file shows how to use sherpa-onnx CXX API
  6 +// for English TTS with MatchaTTS.
  7 +//
  8 +// clang-format off
  9 +/*
  10 +Usage
  11 +
  12 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  13 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  14 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  15 +
  16 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  17 +
  18 +./matcha-tts-en-cxx-api
  19 +
  20 + */
  21 +// clang-format on
  22 +
  23 +#include <string>
  24 +
  25 +#include "sherpa-onnx/c-api/cxx-api.h"
  26 +
  27 +static int32_t ProgressCallback(const float *samples, int32_t num_samples,
  28 + float progress, void *arg) {
  29 + fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
  30 + // return 1 to continue generating
  31 + // return 0 to stop generating
  32 + return 1;
  33 +}
  34 +
  35 +int32_t main(int32_t argc, char *argv[]) {
  36 + using namespace sherpa_onnx::cxx; // NOLINT
  37 + OfflineTtsConfig config;
  38 +
  39 + config.model.matcha.acoustic_model =
  40 + "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx";
  41 +
  42 + config.model.matcha.vocoder = "./hifigan_v2.onnx";
  43 +
  44 + config.model.matcha.tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt";
  45 +
  46 + config.model.matcha.data_dir =
  47 + "./matcha-icefall-en_US-ljspeech/espeak-ng-data";
  48 +
  49 + config.model.num_threads = 1;
  50 +
  51 + // If you don't want to see debug messages, please set it to 0
  52 + config.model.debug = 1;
  53 +
  54 + std::string filename = "./generated-matcha-en-cxx.wav";
  55 + std::string text =
  56 + "Today as always, men fall into two groups: slaves and free men. Whoever "
  57 + "does not have two-thirds of his day for himself, is a slave, whatever "
  58 + "he may be: a statesman, a businessman, an official, or a scholar. "
  59 + "Friends fell out often because life was changing so fast. The easiest "
  60 + "thing in the world was to lose touch with someone.";
  61 +
  62 + auto tts = OfflineTts::Create(config);
  63 + int32_t sid = 0;
  64 + float speed = 1.0; // larger -> faster in speech speed
  65 +
  66 +#if 0
  67 + // If you don't want to use a callback, then please enable this branch
  68 + GeneratedAudio audio = tts.Generate(text, sid, speed);
  69 +#else
  70 + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback);
  71 +#endif
  72 +
  73 + WriteWave(filename, {audio.samples, audio.sample_rate});
  74 +
  75 + fprintf(stderr, "Input text is: %s\n", text.c_str());
  76 + fprintf(stderr, "Speaker ID is is: %d\n", sid);
  77 + fprintf(stderr, "Saved to: %s\n", filename.c_str());
  78 +
  79 + return 0;
  80 +}
  1 +// cxx-api-examples/matcha-tts-zh-cxx-api.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +// This file shows how to use sherpa-onnx CXX API
  6 +// for Chinese TTS with MatchaTTS.
  7 +//
  8 +// clang-format off
  9 +/*
  10 +Usage
  11 +
  12 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  13 +tar xvf matcha-icefall-zh-baker.tar.bz2
  14 +rm matcha-icefall-zh-baker.tar.bz2
  15 +
  16 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  17 +
  18 +./matcha-tts-zh-cxx-api
  19 +
  20 + */
  21 +// clang-format on
  22 +
  23 +#include <string>
  24 +
  25 +#include "sherpa-onnx/c-api/cxx-api.h"
  26 +
  27 +static int32_t ProgressCallback(const float *samples, int32_t num_samples,
  28 + float progress, void *arg) {
  29 + fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
  30 + // return 1 to continue generating
  31 + // return 0 to stop generating
  32 + return 1;
  33 +}
  34 +
  35 +int32_t main(int32_t argc, char *argv[]) {
  36 + using namespace sherpa_onnx::cxx; // NOLINT
  37 + OfflineTtsConfig config;
  38 + config.model.matcha.acoustic_model =
  39 + "./matcha-icefall-zh-baker/model-steps-3.onnx";
  40 + config.model.matcha.vocoder = "./hifigan_v2.onnx";
  41 + config.model.matcha.lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
  42 + config.model.matcha.tokens = "./matcha-icefall-zh-baker/tokens.txt";
  43 + config.model.matcha.dict_dir = "./matcha-icefall-zh-baker/dict";
  44 + config.model.num_threads = 1;
  45 +
  46 + // If you don't want to see debug messages, please set it to 0
  47 + config.model.debug = 1;
  48 +
  49 + // clang-format off
  50 + config.rule_fsts = "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst"; // NOLINT
  51 + // clang-format on
  52 +
  53 + std::string filename = "./generated-matcha-zh-cxx.wav";
  54 + std::string text =
  55 + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如"
  56 + "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感"
  57 + "受着生命的奇迹与温柔."
  58 + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; "
  59 + "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。";
  60 +
  61 + auto tts = OfflineTts::Create(config);
  62 + int32_t sid = 0;
  63 + float speed = 1.0; // larger -> faster in speech speed
  64 +
  65 +#if 0
  66 + // If you don't want to use a callback, then please enable this branch
  67 + GeneratedAudio audio = tts.Generate(text, sid, speed);
  68 +#else
  69 + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback);
  70 +#endif
  71 +
  72 + WriteWave(filename, {audio.samples, audio.sample_rate});
  73 +
  74 + fprintf(stderr, "Input text is: %s\n", text.c_str());
  75 + fprintf(stderr, "Speaker ID is is: %d\n", sid);
  76 + fprintf(stderr, "Saved to: %s\n", filename.c_str());
  77 +
  78 + return 0;
  79 +}
@@ -1114,7 +1114,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( @@ -1114,7 +1114,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
1114 return tts_config; 1114 return tts_config;
1115 } 1115 }
1116 1116
1117 -SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( 1117 +const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
1118 const SherpaOnnxOfflineTtsConfig *config) { 1118 const SherpaOnnxOfflineTtsConfig *config) {
1119 auto tts_config = GetOfflineTtsConfig(config); 1119 auto tts_config = GetOfflineTtsConfig(config);
1120 1120
@@ -1130,7 +1130,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( @@ -1130,7 +1130,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
1130 return tts; 1130 return tts;
1131 } 1131 }
1132 1132
1133 -void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; } 1133 +void SherpaOnnxDestroyOfflineTts(const SherpaOnnxOfflineTts *tts) {
  1134 + delete tts;
  1135 +}
1134 1136
1135 int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) { 1137 int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) {
1136 return tts->impl->SampleRate(); 1138 return tts->impl->SampleRate();
@@ -950,11 +950,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; @@ -950,11 +950,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;
950 950
951 // Create an instance of offline TTS. The user has to use DestroyOfflineTts() 951 // Create an instance of offline TTS. The user has to use DestroyOfflineTts()
952 // to free the returned pointer to avoid memory leak. 952 // to free the returned pointer to avoid memory leak.
953 -SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( 953 +SHERPA_ONNX_API const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
954 const SherpaOnnxOfflineTtsConfig *config); 954 const SherpaOnnxOfflineTtsConfig *config);
955 955
956 // Free the pointer returned by SherpaOnnxCreateOfflineTts() 956 // Free the pointer returned by SherpaOnnxCreateOfflineTts()
957 -SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); 957 +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(
  958 + const SherpaOnnxOfflineTts *tts);
958 959
959 // Return the sample rate of the current TTS object 960 // Return the sample rate of the current TTS object
960 SHERPA_ONNX_API int32_t 961 SHERPA_ONNX_API int32_t
@@ -984,7 +985,6 @@ SHERPA_ONNX_API @@ -984,7 +985,6 @@ SHERPA_ONNX_API
984 const SherpaOnnxGeneratedAudio * 985 const SherpaOnnxGeneratedAudio *
985 SherpaOnnxOfflineTtsGenerateWithProgressCallback( 986 SherpaOnnxOfflineTtsGenerateWithProgressCallback(
986 const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, 987 const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
987 -  
988 SherpaOnnxGeneratedAudioProgressCallback callback); 988 SherpaOnnxGeneratedAudioProgressCallback callback);
989 989
990 SHERPA_ONNX_API 990 SHERPA_ONNX_API
@@ -24,6 +24,11 @@ Wave ReadWave(const std::string &filename) { @@ -24,6 +24,11 @@ Wave ReadWave(const std::string &filename) {
24 return ans; 24 return ans;
25 } 25 }
26 26
  27 +bool WriteWave(const std::string &filename, const Wave &wave) {
  28 + return SherpaOnnxWriteWave(wave.samples.data(), wave.samples.size(),
  29 + wave.sample_rate, filename.c_str());
  30 +}
  31 +
27 OnlineStream::OnlineStream(const SherpaOnnxOnlineStream *p) 32 OnlineStream::OnlineStream(const SherpaOnnxOnlineStream *p)
28 : MoveOnly<OnlineStream, SherpaOnnxOnlineStream>(p) {} 33 : MoveOnly<OnlineStream, SherpaOnnxOnlineStream>(p) {}
29 34
@@ -311,4 +316,73 @@ OfflineRecognizerResult OfflineRecognizer::GetResult( @@ -311,4 +316,73 @@ OfflineRecognizerResult OfflineRecognizer::GetResult(
311 return ans; 316 return ans;
312 } 317 }
313 318
  319 +OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
  320 + struct SherpaOnnxOfflineTtsConfig c;
  321 + memset(&c, 0, sizeof(c));
  322 +
  323 + c.model.vits.model = config.model.vits.model.c_str();
  324 + c.model.vits.lexicon = config.model.vits.lexicon.c_str();
  325 + c.model.vits.tokens = config.model.vits.tokens.c_str();
  326 + c.model.vits.data_dir = config.model.vits.data_dir.c_str();
  327 + c.model.vits.noise_scale = config.model.vits.noise_scale;
  328 + c.model.vits.noise_scale_w = config.model.vits.noise_scale_w;
  329 + c.model.vits.length_scale = config.model.vits.length_scale;
  330 + c.model.vits.dict_dir = config.model.vits.dict_dir.c_str();
  331 +
  332 + c.model.matcha.acoustic_model = config.model.matcha.acoustic_model.c_str();
  333 + c.model.matcha.vocoder = config.model.matcha.vocoder.c_str();
  334 + c.model.matcha.lexicon = config.model.matcha.lexicon.c_str();
  335 + c.model.matcha.tokens = config.model.matcha.tokens.c_str();
  336 + c.model.matcha.data_dir = config.model.matcha.data_dir.c_str();
  337 + c.model.matcha.noise_scale = config.model.matcha.noise_scale;
  338 + c.model.matcha.length_scale = config.model.matcha.length_scale;
  339 + c.model.matcha.dict_dir = config.model.matcha.dict_dir.c_str();
  340 +
  341 + c.model.num_threads = config.model.num_threads;
  342 + c.model.debug = config.model.debug;
  343 + c.model.provider = config.model.provider.c_str();
  344 +
  345 + c.rule_fsts = config.rule_fsts.c_str();
  346 + c.max_num_sentences = config.max_num_sentences;
  347 + c.rule_fars = config.rule_fars.c_str();
  348 +
  349 + auto p = SherpaOnnxCreateOfflineTts(&c);
  350 + return OfflineTts(p);
  351 +}
  352 +
  353 +OfflineTts::OfflineTts(const SherpaOnnxOfflineTts *p)
  354 + : MoveOnly<OfflineTts, SherpaOnnxOfflineTts>(p) {}
  355 +
  356 +void OfflineTts::Destroy(const SherpaOnnxOfflineTts *p) const {
  357 + SherpaOnnxDestroyOfflineTts(p);
  358 +}
  359 +
  360 +int32_t OfflineTts::SampleRate() const {
  361 + return SherpaOnnxOfflineTtsSampleRate(p_);
  362 +}
  363 +
  364 +int32_t OfflineTts::NumSpeakers() const {
  365 + return SherpaOnnxOfflineTtsNumSpeakers(p_);
  366 +}
  367 +
  368 +GeneratedAudio OfflineTts::Generate(const std::string &text,
  369 + int32_t sid /*= 0*/, float speed /*= 1.0*/,
  370 + OfflineTtsCallback callback /*= nullptr*/,
  371 + void *arg /*= nullptr*/) const {
  372 + const SherpaOnnxGeneratedAudio *audio;
  373 + if (!callback) {
  374 + audio = SherpaOnnxOfflineTtsGenerate(p_, text.c_str(), sid, speed);
  375 + } else {
  376 + audio = SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg(
  377 + p_, text.c_str(), sid, speed, callback, arg);
  378 + }
  379 +
  380 + GeneratedAudio ans;
  381 + ans.samples = std::vector<float>{audio->samples, audio->samples + audio->n};
  382 + ans.sample_rate = audio->sample_rate;
  383 +
  384 + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
  385 + return ans;
  386 +}
  387 +
314 } // namespace sherpa_onnx::cxx 388 } // namespace sherpa_onnx::cxx
@@ -97,6 +97,10 @@ struct Wave { @@ -97,6 +97,10 @@ struct Wave {
97 97
98 SHERPA_ONNX_API Wave ReadWave(const std::string &filename); 98 SHERPA_ONNX_API Wave ReadWave(const std::string &filename);
99 99
  100 +// Return true on success;
  101 +// Return false on failure
  102 +SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave);
  103 +
100 template <typename Derived, typename T> 104 template <typename Derived, typename T>
101 class SHERPA_ONNX_API MoveOnly { 105 class SHERPA_ONNX_API MoveOnly {
102 public: 106 public:
@@ -307,6 +311,91 @@ class SHERPA_ONNX_API OfflineRecognizer @@ -307,6 +311,91 @@ class SHERPA_ONNX_API OfflineRecognizer
307 explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p); 311 explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p);
308 }; 312 };
309 313
  314 +// ============================================================================
  315 +// Non-streaming TTS
  316 +// ============================================================================
  317 +struct OfflineTtsVitsModelConfig {
  318 + std::string model;
  319 + std::string lexicon;
  320 + std::string tokens;
  321 + std::string data_dir;
  322 + std::string dict_dir;
  323 +
  324 + float noise_scale = 0.667;
  325 + float noise_scale_w = 0.8;
  326 + float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
  327 +};
  328 +
  329 +struct OfflineTtsMatchaModelConfig {
  330 + std::string acoustic_model;
  331 + std::string vocoder;
  332 + std::string lexicon;
  333 + std::string tokens;
  334 + std::string data_dir;
  335 + std::string dict_dir;
  336 +
  337 + float noise_scale = 0.667;
  338 + float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
  339 +};
  340 +
  341 +struct OfflineTtsModelConfig {
  342 + OfflineTtsVitsModelConfig vits;
  343 + OfflineTtsMatchaModelConfig matcha;
  344 + int32_t num_threads = 1;
  345 + bool debug = false;
  346 + std::string provider = "cpu";
  347 +};
  348 +
  349 +struct OfflineTtsConfig {
  350 + OfflineTtsModelConfig model;
  351 + std::string rule_fsts;
  352 + std::string rule_fars;
  353 + int32_t max_num_sentences = 1;
  354 +};
  355 +
  356 +struct GeneratedAudio {
  357 + std::vector<float> samples; // in the range [-1, 1]
  358 + int32_t sample_rate;
  359 +};
  360 +
  361 +// Return 1 to continue generating
  362 +// Return 0 to stop generating
  363 +using OfflineTtsCallback = int32_t (*)(const float *samples,
  364 + int32_t num_samples, float progress,
  365 + void *arg);
  366 +
  367 +class SHERPA_ONNX_API OfflineTts
  368 + : public MoveOnly<OfflineTts, SherpaOnnxOfflineTts> {
  369 + public:
  370 + static OfflineTts Create(const OfflineTtsConfig &config);
  371 +
  372 + void Destroy(const SherpaOnnxOfflineTts *p) const;
  373 +
  374 + // Return the sample rate of the generated audio
  375 + int32_t SampleRate() const;
  376 +
  377 + // Number of supported speakers.
  378 + // If it supports only a single speaker, then it returns 0 or 1.
  379 + int32_t NumSpeakers() const;
  380 +
  381 + // @param text A string containing words separated by spaces
  382 + // @param sid Speaker ID. Used only for multi-speaker models, e.g., models
  383 + // trained using the VCTK dataset. It is not used for
  384 + // single-speaker models, e.g., models trained using the ljspeech
  385 + // dataset.
  386 + // @param speed The speed for the generated speech. E.g., 2 means 2x faster.
  387 + // @param callback If not NULL, it is called whenever config.max_num_sentences
  388 + // sentences have been processed. The callback is called in
  389 + // the current thread.
  390 + GeneratedAudio Generate(const std::string &text, int32_t sid = 0,
  391 + float speed = 1.0,
  392 + OfflineTtsCallback callback = nullptr,
  393 + void *arg = nullptr) const;
  394 +
  395 + private:
  396 + explicit OfflineTts(const SherpaOnnxOfflineTts *p);
  397 +};
  398 +
310 } // namespace sherpa_onnx::cxx 399 } // namespace sherpa_onnx::cxx
311 400
312 #endif // SHERPA_ONNX_C_API_CXX_API_H_ 401 #endif // SHERPA_ONNX_C_API_CXX_API_H_