正在显示
12 个修改的文件
包含
403 行增加
和
8 行删除
| @@ -101,6 +101,10 @@ jobs: | @@ -101,6 +101,10 @@ jobs: | ||
| 101 | 101 | ||
| 102 | ./matcha-tts-zh-c-api | 102 | ./matcha-tts-zh-c-api |
| 103 | 103 | ||
| 104 | + rm ./matcha-tts-zh-c-api | ||
| 105 | + rm -rf matcha-icefall-* | ||
| 106 | + rm hifigan_v2.onnx | ||
| 107 | + | ||
| 104 | - name: Test Matcha TTS (en) | 108 | - name: Test Matcha TTS (en) |
| 105 | shell: bash | 109 | shell: bash |
| 106 | run: | | 110 | run: | |
| @@ -121,6 +125,10 @@ jobs: | @@ -121,6 +125,10 @@ jobs: | ||
| 121 | 125 | ||
| 122 | ./matcha-tts-en-c-api | 126 | ./matcha-tts-en-c-api |
| 123 | 127 | ||
| 128 | + rm ./matcha-tts-en-c-api | ||
| 129 | + rm -rf matcha-icefall-* | ||
| 130 | + rm hifigan_v2.onnx | ||
| 131 | + | ||
| 124 | - uses: actions/upload-artifact@v4 | 132 | - uses: actions/upload-artifact@v4 |
| 125 | with: | 133 | with: |
| 126 | name: matcha-tts-${{ matrix.os }} | 134 | name: matcha-tts-${{ matrix.os }} |
| @@ -83,6 +83,61 @@ jobs: | @@ -83,6 +83,61 @@ jobs: | ||
| 83 | otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib | 83 | otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib |
| 84 | fi | 84 | fi |
| 85 | 85 | ||
| 86 | + - name: Test Matcha TTS (zh) | ||
| 87 | + shell: bash | ||
| 88 | + run: | | ||
| 89 | + g++ -std=c++17 -o matcha-tts-zh-cxx-api ./cxx-api-examples/matcha-tts-zh-cxx-api.cc \ | ||
| 90 | + -I ./build/install/include \ | ||
| 91 | + -L ./build/install/lib/ \ | ||
| 92 | + -l sherpa-onnx-cxx-api \ | ||
| 93 | + -l sherpa-onnx-c-api \ | ||
| 94 | + -l onnxruntime | ||
| 95 | + | ||
| 96 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 97 | + tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 98 | + rm matcha-icefall-zh-baker.tar.bz2 | ||
| 99 | + | ||
| 100 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 101 | + | ||
| 102 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 103 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 104 | + | ||
| 105 | + ./matcha-tts-zh-cxx-api | ||
| 106 | + | ||
| 107 | + rm -rf matcha-icefall-* | ||
| 108 | + rm hifigan_v2.onnx | ||
| 109 | + rm matcha-tts-zh-cxx-api | ||
| 110 | + | ||
| 111 | + - name: Test Matcha TTS (en) | ||
| 112 | + shell: bash | ||
| 113 | + run: | | ||
| 114 | + g++ -std=c++17 -o matcha-tts-en-cxx-api ./cxx-api-examples/matcha-tts-en-cxx-api.cc \ | ||
| 115 | + -I ./build/install/include \ | ||
| 116 | + -L ./build/install/lib/ \ | ||
| 117 | + -l sherpa-onnx-cxx-api \ | ||
| 118 | + -l sherpa-onnx-c-api \ | ||
| 119 | + -l onnxruntime | ||
| 120 | + | ||
| 121 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 122 | + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 123 | + rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 124 | + | ||
| 125 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 126 | + | ||
| 127 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 128 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 129 | + | ||
| 130 | + ./matcha-tts-en-cxx-api | ||
| 131 | + | ||
| 132 | + rm matcha-tts-en-cxx-api | ||
| 133 | + rm -rf matcha-icefall-* | ||
| 134 | + rm hifigan_v2.onnx | ||
| 135 | + | ||
| 136 | + - uses: actions/upload-artifact@v4 | ||
| 137 | + with: | ||
| 138 | + name: matcha-tts-${{ matrix.os }} | ||
| 139 | + path: ./generated-matcha-*.wav | ||
| 140 | + | ||
| 86 | - name: Test Moonshine tiny | 141 | - name: Test Moonshine tiny |
| 87 | shell: bash | 142 | shell: bash |
| 88 | run: | | 143 | run: | |
| @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { | @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { | ||
| 60 | "Friends fell out often because life was changing so fast. The easiest " | 60 | "Friends fell out often because life was changing so fast. The easiest " |
| 61 | "thing in the world was to lose touch with someone."; | 61 | "thing in the world was to lose touch with someone."; |
| 62 | 62 | ||
| 63 | - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); | 63 | + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); |
| 64 | int32_t sid = 0; | 64 | int32_t sid = 0; |
| 65 | float speed = 1.0; // larger -> faster in speech speed | 65 | float speed = 1.0; // larger -> faster in speech speed |
| 66 | 66 |
| @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { | @@ -60,7 +60,7 @@ int32_t main(int32_t argc, char *argv[]) { | ||
| 60 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; " | 60 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; " |
| 61 | "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"; | 61 | "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"; |
| 62 | 62 | ||
| 63 | - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); | 63 | + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); |
| 64 | int32_t sid = 0; | 64 | int32_t sid = 0; |
| 65 | float speed = 1.0; // larger -> faster in speech speed | 65 | float speed = 1.0; // larger -> faster in speech speed |
| 66 | 66 |
| @@ -229,7 +229,7 @@ int32_t main(int32_t argc, char *argv[]) { | @@ -229,7 +229,7 @@ int32_t main(int32_t argc, char *argv[]) { | ||
| 229 | ShowUsage(); | 229 | ShowUsage(); |
| 230 | } | 230 | } |
| 231 | 231 | ||
| 232 | - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); | 232 | + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); |
| 233 | 233 | ||
| 234 | const SherpaOnnxGeneratedAudio *audio = | 234 | const SherpaOnnxGeneratedAudio *audio = |
| 235 | SherpaOnnxOfflineTtsGenerate(tts, text, sid, 1.0); | 235 | SherpaOnnxOfflineTtsGenerate(tts, text, sid, 1.0); |
| @@ -14,3 +14,11 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) | @@ -14,3 +14,11 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) | ||
| 14 | 14 | ||
| 15 | add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) | 15 | add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) |
| 16 | target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) | 16 | target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) |
| 17 | + | ||
| 18 | +if(SHERPA_ONNX_ENABLE_TTS) | ||
| 19 | + add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc) | ||
| 20 | + target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api) | ||
| 21 | + | ||
| 22 | + add_executable(matcha-tts-en-cxx-api ./matcha-tts-en-cxx-api.cc) | ||
| 23 | + target_link_libraries(matcha-tts-en-cxx-api sherpa-onnx-cxx-api) | ||
| 24 | +endif() |
cxx-api-examples/matcha-tts-en-cxx-api.cc
0 → 100644
| 1 | +// cxx-api-examples/matcha-tts-en-cxx-api.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// This file shows how to use sherpa-onnx CXX API | ||
| 6 | +// for English TTS with MatchaTTS. | ||
| 7 | +// | ||
| 8 | +// clang-format off | ||
| 9 | +/* | ||
| 10 | +Usage | ||
| 11 | + | ||
| 12 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 13 | +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 14 | +rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 15 | + | ||
| 16 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 17 | + | ||
| 18 | +./matcha-tts-en-cxx-api | ||
| 19 | + | ||
| 20 | + */ | ||
| 21 | +// clang-format on | ||
| 22 | + | ||
| 23 | +#include <string> | ||
| 24 | + | ||
| 25 | +#include "sherpa-onnx/c-api/cxx-api.h" | ||
| 26 | + | ||
| 27 | +static int32_t ProgressCallback(const float *samples, int32_t num_samples, | ||
| 28 | + float progress, void *arg) { | ||
| 29 | + fprintf(stderr, "Progress: %.3f%%\n", progress * 100); | ||
| 30 | + // Only progress is used here; samples, num_samples and arg are ignored. | ||
| 31 | + // Return 1 to continue generating, 0 to stop generating. | ||
| 32 | + return 1; | ||
| 33 | +} | ||
| 34 | + | ||
| 35 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 36 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 37 | + OfflineTtsConfig config;  // all model paths below are relative to the current directory | ||
| 38 | + | ||
| 39 | + config.model.matcha.acoustic_model = | ||
| 40 | + "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx"; | ||
| 41 | + | ||
| 42 | + config.model.matcha.vocoder = "./hifigan_v2.onnx"; | ||
| 43 | + | ||
| 44 | + config.model.matcha.tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt"; | ||
| 45 | + | ||
| 46 | + config.model.matcha.data_dir = | ||
| 47 | + "./matcha-icefall-en_US-ljspeech/espeak-ng-data"; | ||
| 48 | + | ||
| 49 | + config.model.num_threads = 1; | ||
| 50 | + | ||
| 51 | + // If you don't want to see debug messages, please set it to 0 | ||
| 52 | + config.model.debug = 1; | ||
| 53 | + | ||
| 54 | + std::string filename = "./generated-matcha-en-cxx.wav"; | ||
| 55 | + std::string text = | ||
| 56 | + "Today as always, men fall into two groups: slaves and free men. Whoever " | ||
| 57 | + "does not have two-thirds of his day for himself, is a slave, whatever " | ||
| 58 | + "he may be: a statesman, a businessman, an official, or a scholar. " | ||
| 59 | + "Friends fell out often because life was changing so fast. The easiest " | ||
| 60 | + "thing in the world was to lose touch with someone."; | ||
| 61 | + | ||
| 62 | + auto tts = OfflineTts::Create(config); | ||
| 63 | + int32_t sid = 0; | ||
| 64 | + float speed = 1.0; // larger -> faster in speech speed | ||
| 65 | + | ||
| 66 | +#if 0 | ||
| 67 | + // If you don't want to use a callback, then please enable this branch | ||
| 68 | + GeneratedAudio audio = tts.Generate(text, sid, speed); | ||
| 69 | +#else | ||
| 70 | + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback); | ||
| 71 | +#endif | ||
| 72 | + // WriteWave() returns false on failure; report it and reflect it in the exit code. | ||
| 73 | + const bool ok = WriteWave(filename, {audio.samples, audio.sample_rate}); | ||
| 74 | + | ||
| 75 | + fprintf(stderr, "Input text is: %s\n", text.c_str()); | ||
| 76 | + fprintf(stderr, "Speaker ID is: %d\n", sid); | ||
| 77 | + fprintf(stderr, "%s: %s\n", ok ? "Saved to" : "Failed to save to", | ||
| 78 | + filename.c_str()); | ||
| 79 | + return ok ? 0 : 1; | ||
| 80 | +} | ||
cxx-api-examples/matcha-tts-zh-cxx-api.cc
0 → 100644
| 1 | +// cxx-api-examples/matcha-tts-zh-cxx-api.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// This file shows how to use sherpa-onnx CXX API | ||
| 6 | +// for Chinese TTS with MatchaTTS. | ||
| 7 | +// | ||
| 8 | +// clang-format off | ||
| 9 | +/* | ||
| 10 | +Usage | ||
| 11 | + | ||
| 12 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 13 | +tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 14 | +rm matcha-icefall-zh-baker.tar.bz2 | ||
| 15 | + | ||
| 16 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 17 | + | ||
| 18 | +./matcha-tts-zh-cxx-api | ||
| 19 | + | ||
| 20 | + */ | ||
| 21 | +// clang-format on | ||
| 22 | + | ||
| 23 | +#include <string> | ||
| 24 | + | ||
| 25 | +#include "sherpa-onnx/c-api/cxx-api.h" | ||
| 26 | + | ||
| 27 | +static int32_t ProgressCallback(const float *samples, int32_t num_samples, | ||
| 28 | + float progress, void *arg) { | ||
| 29 | + fprintf(stderr, "Progress: %.3f%%\n", progress * 100); | ||
| 30 | + // Only progress is used here; samples, num_samples and arg are ignored. | ||
| 31 | + // Return 1 to continue generating, 0 to stop generating. | ||
| 32 | + return 1; | ||
| 33 | +} | ||
| 34 | + | ||
| 35 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 36 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 37 | + OfflineTtsConfig config;  // all model paths below are relative to the current directory | ||
| 38 | + config.model.matcha.acoustic_model = | ||
| 39 | + "./matcha-icefall-zh-baker/model-steps-3.onnx"; | ||
| 40 | + config.model.matcha.vocoder = "./hifigan_v2.onnx"; | ||
| 41 | + config.model.matcha.lexicon = "./matcha-icefall-zh-baker/lexicon.txt"; | ||
| 42 | + config.model.matcha.tokens = "./matcha-icefall-zh-baker/tokens.txt"; | ||
| 43 | + config.model.matcha.dict_dir = "./matcha-icefall-zh-baker/dict"; | ||
| 44 | + config.model.num_threads = 1; | ||
| 45 | + | ||
| 46 | + // If you don't want to see debug messages, please set it to 0 | ||
| 47 | + config.model.debug = 1; | ||
| 48 | + | ||
| 49 | + // clang-format off | ||
| 50 | + config.rule_fsts = "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst"; // NOLINT | ||
| 51 | + // clang-format on | ||
| 52 | + | ||
| 53 | + std::string filename = "./generated-matcha-zh-cxx.wav"; | ||
| 54 | + std::string text = | ||
| 55 | + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如" | ||
| 56 | + "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感" | ||
| 57 | + "受着生命的奇迹与温柔." | ||
| 58 | + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; " | ||
| 59 | + "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"; | ||
| 60 | + | ||
| 61 | + auto tts = OfflineTts::Create(config); | ||
| 62 | + int32_t sid = 0; | ||
| 63 | + float speed = 1.0; // larger -> faster in speech speed | ||
| 64 | + | ||
| 65 | +#if 0 | ||
| 66 | + // If you don't want to use a callback, then please enable this branch | ||
| 67 | + GeneratedAudio audio = tts.Generate(text, sid, speed); | ||
| 68 | +#else | ||
| 69 | + GeneratedAudio audio = tts.Generate(text, sid, speed, ProgressCallback); | ||
| 70 | +#endif | ||
| 71 | + // WriteWave() returns false on failure; report it and reflect it in the exit code. | ||
| 72 | + const bool ok = WriteWave(filename, {audio.samples, audio.sample_rate}); | ||
| 73 | + | ||
| 74 | + fprintf(stderr, "Input text is: %s\n", text.c_str()); | ||
| 75 | + fprintf(stderr, "Speaker ID is: %d\n", sid); | ||
| 76 | + fprintf(stderr, "%s: %s\n", ok ? "Saved to" : "Failed to save to", | ||
| 77 | + filename.c_str()); | ||
| 78 | + return ok ? 0 : 1; | ||
| 79 | +} | ||
| @@ -1114,7 +1114,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | @@ -1114,7 +1114,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | ||
| 1114 | return tts_config; | 1114 | return tts_config; |
| 1115 | } | 1115 | } |
| 1116 | 1116 | ||
| 1117 | -SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | 1117 | +const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( |
| 1118 | const SherpaOnnxOfflineTtsConfig *config) { | 1118 | const SherpaOnnxOfflineTtsConfig *config) { |
| 1119 | auto tts_config = GetOfflineTtsConfig(config); | 1119 | auto tts_config = GetOfflineTtsConfig(config); |
| 1120 | 1120 | ||
| @@ -1130,7 +1130,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | @@ -1130,7 +1130,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | ||
| 1130 | return tts; | 1130 | return tts; |
| 1131 | } | 1131 | } |
| 1132 | 1132 | ||
| 1133 | -void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; } | 1133 | +void SherpaOnnxDestroyOfflineTts(const SherpaOnnxOfflineTts *tts) { |
| 1134 | + delete tts; | ||
| 1135 | +} | ||
| 1134 | 1136 | ||
| 1135 | int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) { | 1137 | int32_t SherpaOnnxOfflineTtsSampleRate(const SherpaOnnxOfflineTts *tts) { |
| 1136 | return tts->impl->SampleRate(); | 1138 | return tts->impl->SampleRate(); |
| @@ -950,11 +950,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; | @@ -950,11 +950,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; | ||
| 950 | 950 | ||
| 951 | // Create an instance of offline TTS. The user has to use DestroyOfflineTts() | 951 | // Create an instance of offline TTS. The user has to use DestroyOfflineTts() |
| 952 | // to free the returned pointer to avoid memory leak. | 952 | // to free the returned pointer to avoid memory leak. |
| 953 | -SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | 953 | +SHERPA_ONNX_API const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( |
| 954 | const SherpaOnnxOfflineTtsConfig *config); | 954 | const SherpaOnnxOfflineTtsConfig *config); |
| 955 | 955 | ||
| 956 | // Free the pointer returned by SherpaOnnxCreateOfflineTts() | 956 | // Free the pointer returned by SherpaOnnxCreateOfflineTts() |
| 957 | -SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); | 957 | +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts( |
| 958 | + const SherpaOnnxOfflineTts *tts); | ||
| 958 | 959 | ||
| 959 | // Return the sample rate of the current TTS object | 960 | // Return the sample rate of the current TTS object |
| 960 | SHERPA_ONNX_API int32_t | 961 | SHERPA_ONNX_API int32_t |
| @@ -984,7 +985,6 @@ SHERPA_ONNX_API | @@ -984,7 +985,6 @@ SHERPA_ONNX_API | ||
| 984 | const SherpaOnnxGeneratedAudio * | 985 | const SherpaOnnxGeneratedAudio * |
| 985 | SherpaOnnxOfflineTtsGenerateWithProgressCallback( | 986 | SherpaOnnxOfflineTtsGenerateWithProgressCallback( |
| 986 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, | 987 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, |
| 987 | - | ||
| 988 | SherpaOnnxGeneratedAudioProgressCallback callback); | 988 | SherpaOnnxGeneratedAudioProgressCallback callback); |
| 989 | 989 | ||
| 990 | SHERPA_ONNX_API | 990 | SHERPA_ONNX_API |
| @@ -24,6 +24,11 @@ Wave ReadWave(const std::string &filename) { | @@ -24,6 +24,11 @@ Wave ReadWave(const std::string &filename) { | ||
| 24 | return ans; | 24 | return ans; |
| 25 | } | 25 | } |
| 26 | 26 | ||
| 27 | +bool WriteWave(const std::string &filename, const Wave &wave) {  // wrapper over the C API SherpaOnnxWriteWave() | ||
| 28 | + return SherpaOnnxWriteWave(wave.samples.data(), wave.samples.size(), | ||
| 29 | + wave.sample_rate, filename.c_str());  // result is converted to bool; presumably non-zero means success — confirm | ||
| 30 | +} | ||
| 31 | + | ||
| 27 | OnlineStream::OnlineStream(const SherpaOnnxOnlineStream *p) | 32 | OnlineStream::OnlineStream(const SherpaOnnxOnlineStream *p) |
| 28 | : MoveOnly<OnlineStream, SherpaOnnxOnlineStream>(p) {} | 33 | : MoveOnly<OnlineStream, SherpaOnnxOnlineStream>(p) {} |
| 29 | 34 | ||
| @@ -311,4 +316,73 @@ OfflineRecognizerResult OfflineRecognizer::GetResult( | @@ -311,4 +316,73 @@ OfflineRecognizerResult OfflineRecognizer::GetResult( | ||
| 311 | return ans; | 316 | return ans; |
| 312 | } | 317 | } |
| 313 | 318 | ||
| 319 | +OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {  // builds the C config from the C++ one and wraps the created handle | ||
| 320 | + struct SherpaOnnxOfflineTtsConfig c; | ||
| 321 | + memset(&c, 0, sizeof(c));  // any field not assigned below stays zero/NULL | ||
| 322 | + | ||
| 323 | + c.model.vits.model = config.model.vits.model.c_str(); | ||
| 324 | + c.model.vits.lexicon = config.model.vits.lexicon.c_str(); | ||
| 325 | + c.model.vits.tokens = config.model.vits.tokens.c_str(); | ||
| 326 | + c.model.vits.data_dir = config.model.vits.data_dir.c_str(); | ||
| 327 | + c.model.vits.noise_scale = config.model.vits.noise_scale; | ||
| 328 | + c.model.vits.noise_scale_w = config.model.vits.noise_scale_w; | ||
| 329 | + c.model.vits.length_scale = config.model.vits.length_scale; | ||
| 330 | + c.model.vits.dict_dir = config.model.vits.dict_dir.c_str(); | ||
| 331 | + | ||
| 332 | + c.model.matcha.acoustic_model = config.model.matcha.acoustic_model.c_str(); | ||
| 333 | + c.model.matcha.vocoder = config.model.matcha.vocoder.c_str(); | ||
| 334 | + c.model.matcha.lexicon = config.model.matcha.lexicon.c_str(); | ||
| 335 | + c.model.matcha.tokens = config.model.matcha.tokens.c_str(); | ||
| 336 | + c.model.matcha.data_dir = config.model.matcha.data_dir.c_str(); | ||
| 337 | + c.model.matcha.noise_scale = config.model.matcha.noise_scale; | ||
| 338 | + c.model.matcha.length_scale = config.model.matcha.length_scale; | ||
| 339 | + c.model.matcha.dict_dir = config.model.matcha.dict_dir.c_str(); | ||
| 340 | + | ||
| 341 | + c.model.num_threads = config.model.num_threads; | ||
| 342 | + c.model.debug = config.model.debug; | ||
| 343 | + c.model.provider = config.model.provider.c_str(); | ||
| 344 | + | ||
| 345 | + c.rule_fsts = config.rule_fsts.c_str(); | ||
| 346 | + c.max_num_sentences = config.max_num_sentences; | ||
| 347 | + c.rule_fars = config.rule_fars.c_str(); | ||
| 348 | + | ||
| 349 | + auto p = SherpaOnnxCreateOfflineTts(&c);  // c only borrows pointers into config's strings, which are alive for this call | ||
| 350 | + return OfflineTts(p);  // NOTE(review): p is not checked here; confirm whether the C API can return NULL | ||
| 351 | +} | ||
| 352 | + | ||
| 353 | +OfflineTts::OfflineTts(const SherpaOnnxOfflineTts *p) | ||
| 354 | + : MoveOnly<OfflineTts, SherpaOnnxOfflineTts>(p) {}  // passes the C handle to the MoveOnly base | ||
| 355 | + | ||
| 356 | +void OfflineTts::Destroy(const SherpaOnnxOfflineTts *p) const { | ||
| 357 | + SherpaOnnxDestroyOfflineTts(p);  // frees a handle obtained from SherpaOnnxCreateOfflineTts() | ||
| 358 | +} | ||
| 359 | + | ||
| 360 | +int32_t OfflineTts::SampleRate() const { | ||
| 361 | + return SherpaOnnxOfflineTtsSampleRate(p_);  // p_ is declared outside this view; presumably the handle held by MoveOnly — confirm | ||
| 362 | +} | ||
| 363 | + | ||
| 364 | +int32_t OfflineTts::NumSpeakers() const { | ||
| 365 | + return SherpaOnnxOfflineTtsNumSpeakers(p_);  // forwards to the C API with the stored handle | ||
| 366 | +} | ||
| 367 | + | ||
| 368 | +GeneratedAudio OfflineTts::Generate(const std::string &text, | ||
| 369 | + int32_t sid /*= 0*/, float speed /*= 1.0*/, | ||
| 370 | + OfflineTtsCallback callback /*= nullptr*/, | ||
| 371 | + void *arg /*= nullptr*/) const { | ||
| 372 | + const SherpaOnnxGeneratedAudio *audio = nullptr; | ||
| 373 | + if (!callback) {  // no callback: arg is not used | ||
| 374 | + audio = SherpaOnnxOfflineTtsGenerate(p_, text.c_str(), sid, speed); | ||
| 375 | + } else { | ||
| 376 | + audio = SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg( | ||
| 377 | + p_, text.c_str(), sid, speed, callback, arg); | ||
| 378 | + } | ||
| 379 | + if (!audio) return {};  // nothing was generated: return empty samples and sample_rate 0 instead of dereferencing NULL | ||
| 380 | + GeneratedAudio ans; | ||
| 381 | + ans.samples.assign(audio->samples, audio->samples + audio->n);  // copy out of the C buffer before it is freed | ||
| 382 | + ans.sample_rate = audio->sample_rate; | ||
| 383 | + | ||
| 384 | + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); | ||
| 385 | + return ans; | ||
| 386 | +} | ||
| 387 | + | ||
| 314 | } // namespace sherpa_onnx::cxx | 388 | } // namespace sherpa_onnx::cxx |
| @@ -97,6 +97,10 @@ struct Wave { | @@ -97,6 +97,10 @@ struct Wave { | ||
| 97 | 97 | ||
| 98 | SHERPA_ONNX_API Wave ReadWave(const std::string &filename); | 98 | SHERPA_ONNX_API Wave ReadWave(const std::string &filename); |
| 99 | 99 | ||
| 100 | +// Return true on success; | ||
| 101 | +// Return false on failure | ||
| 102 | +SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave); | ||
| 103 | + | ||
| 100 | template <typename Derived, typename T> | 104 | template <typename Derived, typename T> |
| 101 | class SHERPA_ONNX_API MoveOnly { | 105 | class SHERPA_ONNX_API MoveOnly { |
| 102 | public: | 106 | public: |
| @@ -307,6 +311,91 @@ class SHERPA_ONNX_API OfflineRecognizer | @@ -307,6 +311,91 @@ class SHERPA_ONNX_API OfflineRecognizer | ||
| 307 | explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p); | 311 | explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p); |
| 308 | }; | 312 | }; |
| 309 | 313 | ||
| 314 | +// ============================================================================ | ||
| 315 | +// Non-streaming TTS | ||
| 316 | +// ============================================================================ | ||
| 317 | +struct OfflineTtsVitsModelConfig { | ||
| 318 | + std::string model; | ||
| 319 | + std::string lexicon; | ||
| 320 | + std::string tokens; | ||
| 321 | + std::string data_dir; | ||
| 322 | + std::string dict_dir; | ||
| 323 | + | ||
| 324 | + float noise_scale = 0.667; | ||
| 325 | + float noise_scale_w = 0.8; | ||
| 326 | + float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed | ||
| 327 | +}; | ||
| 328 | + | ||
| 329 | +struct OfflineTtsMatchaModelConfig { | ||
| 330 | + std::string acoustic_model; | ||
| 331 | + std::string vocoder; | ||
| 332 | + std::string lexicon; | ||
| 333 | + std::string tokens; | ||
| 334 | + std::string data_dir; | ||
| 335 | + std::string dict_dir; | ||
| 336 | + | ||
| 337 | + float noise_scale = 0.667; | ||
| 338 | + float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed | ||
| 339 | +}; | ||
| 340 | + | ||
| 341 | +struct OfflineTtsModelConfig { | ||
| 342 | + OfflineTtsVitsModelConfig vits; | ||
| 343 | + OfflineTtsMatchaModelConfig matcha; | ||
| 344 | + int32_t num_threads = 1; | ||
| 345 | + bool debug = false; | ||
| 346 | + std::string provider = "cpu"; | ||
| 347 | +}; | ||
| 348 | + | ||
| 349 | +struct OfflineTtsConfig { | ||
| 350 | + OfflineTtsModelConfig model; | ||
| 351 | + std::string rule_fsts; | ||
| 352 | + std::string rule_fars; | ||
| 353 | + int32_t max_num_sentences = 1; | ||
| 354 | +}; | ||
| 355 | + | ||
| 356 | +struct GeneratedAudio { | ||
| 357 | + std::vector<float> samples; // in the range [-1, 1] | ||
| 358 | + int32_t sample_rate; | ||
| 359 | +}; | ||
| 360 | + | ||
| 361 | +// Return 1 to continue generating | ||
| 362 | +// Return 0 to stop generating | ||
| 363 | +using OfflineTtsCallback = int32_t (*)(const float *samples, | ||
| 364 | + int32_t num_samples, float progress, | ||
| 365 | + void *arg); | ||
| 366 | + | ||
| 367 | +class SHERPA_ONNX_API OfflineTts | ||
| 368 | + : public MoveOnly<OfflineTts, SherpaOnnxOfflineTts> { | ||
| 369 | + public: | ||
| 370 | + static OfflineTts Create(const OfflineTtsConfig &config); | ||
| 371 | + | ||
| 372 | + void Destroy(const SherpaOnnxOfflineTts *p) const; | ||
| 373 | + | ||
| 374 | + // Return the sample rate of the generated audio | ||
| 375 | + int32_t SampleRate() const; | ||
| 376 | + | ||
| 377 | + // Number of supported speakers. | ||
| 378 | + // If it supports only a single speaker, then it returns 0 or 1. | ||
| 379 | + int32_t NumSpeakers() const; | ||
| 380 | + | ||
| 381 | + // @param text The text to convert to speech. | ||
| 382 | + // @param sid Speaker ID. Used only for multi-speaker models, e.g., models | ||
| 383 | + // trained using the VCTK dataset. It is not used for | ||
| 384 | + // single-speaker models, e.g., models trained using the ljspeech | ||
| 385 | + // dataset. | ||
| 386 | + // @param speed The speed for the generated speech. E.g., 2 means 2x faster. | ||
| 387 | + // @param callback If not NULL, it is called whenever config.max_num_sentences | ||
| 388 | + // sentences have been processed. The callback is called in | ||
| 389 | + // the current thread. | ||
| 390 | + GeneratedAudio Generate(const std::string &text, int32_t sid = 0, | ||
| 391 | + float speed = 1.0, | ||
| 392 | + OfflineTtsCallback callback = nullptr, | ||
| 393 | + void *arg = nullptr) const;  // arg is handed to the C API together with callback; unused when callback is null | ||
| 394 | + | ||
| 395 | + private: | ||
| 396 | + explicit OfflineTts(const SherpaOnnxOfflineTts *p); | ||
| 397 | +}; | ||
| 398 | + | ||
| 310 | } // namespace sherpa_onnx::cxx | 399 | } // namespace sherpa_onnx::cxx |
| 311 | 400 | ||
| 312 | #endif // SHERPA_ONNX_C_API_CXX_API_H_ | 401 | #endif // SHERPA_ONNX_C_API_CXX_API_H_ |
-
请 注册 或 登录 后发表评论