正在显示
7 个修改的文件
包含
260 行增加
和
3 行删除
| @@ -81,6 +81,51 @@ jobs: | @@ -81,6 +81,51 @@ jobs: | ||
| 81 | otool -L ./install/lib/libsherpa-onnx-c-api.dylib | 81 | otool -L ./install/lib/libsherpa-onnx-c-api.dylib |
| 82 | fi | 82 | fi |
| 83 | 83 | ||
| 84 | + - name: Test Matcha TTS (zh) | ||
| 85 | + shell: bash | ||
| 86 | + run: | | ||
| 87 | + gcc -o matcha-tts-zh-c-api ./c-api-examples/matcha-tts-zh-c-api.c \ | ||
| 88 | + -I ./build/install/include \ | ||
| 89 | + -L ./build/install/lib/ \ | ||
| 90 | + -l sherpa-onnx-c-api \ | ||
| 91 | + -l onnxruntime | ||
| 92 | + | ||
| 93 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 94 | + tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 95 | + rm matcha-icefall-zh-baker.tar.bz2 | ||
| 96 | + | ||
| 97 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 98 | + | ||
| 99 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 100 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 101 | + | ||
| 102 | + ./matcha-tts-zh-c-api | ||
| 103 | + | ||
| 104 | + - name: Test Matcha TTS (en) | ||
| 105 | + shell: bash | ||
| 106 | + run: | | ||
| 107 | + gcc -o matcha-tts-en-c-api ./c-api-examples/matcha-tts-en-c-api.c \ | ||
| 108 | + -I ./build/install/include \ | ||
| 109 | + -L ./build/install/lib/ \ | ||
| 110 | + -l sherpa-onnx-c-api \ | ||
| 111 | + -l onnxruntime | ||
| 112 | + | ||
| 113 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 114 | + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 115 | + rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 116 | + | ||
| 117 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 118 | + | ||
| 119 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 120 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 121 | + | ||
| 122 | + ./matcha-tts-en-c-api | ||
| 123 | + | ||
| 124 | + - uses: actions/upload-artifact@v4 | ||
| 125 | + with: | ||
| 126 | + name: matcha-tts-${{ matrix.os }} | ||
| 127 | + path: ./generated-matcha-*.wav | ||
| 128 | + | ||
| 84 | - name: Test vad + Whisper tiny.en | 129 | - name: Test vad + Whisper tiny.en |
| 85 | shell: bash | 130 | shell: bash |
| 86 | run: | | 131 | run: | |
| @@ -7,6 +7,12 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs) | @@ -7,6 +7,12 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs) | ||
| 7 | if(SHERPA_ONNX_ENABLE_TTS) | 7 | if(SHERPA_ONNX_ENABLE_TTS) |
| 8 | add_executable(offline-tts-c-api offline-tts-c-api.c) | 8 | add_executable(offline-tts-c-api offline-tts-c-api.c) |
| 9 | target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) | 9 | target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) |
| 10 | + | ||
| 11 | + add_executable(matcha-tts-zh-c-api matcha-tts-zh-c-api.c) | ||
| 12 | + target_link_libraries(matcha-tts-zh-c-api sherpa-onnx-c-api) | ||
| 13 | + | ||
| 14 | + add_executable(matcha-tts-en-c-api matcha-tts-en-c-api.c) | ||
| 15 | + target_link_libraries(matcha-tts-en-c-api sherpa-onnx-c-api) | ||
| 10 | endif() | 16 | endif() |
| 11 | 17 | ||
| 12 | if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION) | 18 | if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION) |
c-api-examples/matcha-tts-en-c-api.c
0 → 100644
| 1 | +// c-api-examples/matcha-tts-en-c-api.c | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// This file shows how to use sherpa-onnx C API | ||
| 6 | +// for English TTS with MatchaTTS. | ||
| 7 | +// | ||
| 8 | +// clang-format off | ||
| 9 | +/* | ||
| 10 | +Usage | ||
| 11 | + | ||
| 12 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 13 | +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 14 | +rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 15 | + | ||
| 16 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 17 | + | ||
| 18 | +./matcha-tts-en-c-api | ||
| 19 | + | ||
| 20 | + */ | ||
| 21 | +// clang-format on | ||
| 22 | + | ||
| 23 | +#include <stdio.h> | ||
| 24 | +#include <stdlib.h> | ||
| 25 | +#include <string.h> | ||
| 26 | + | ||
| 27 | +#include "sherpa-onnx/c-api/c-api.h" | ||
| 28 | + | ||
| 29 | +static int32_t ProgressCallback(const float *samples, int32_t num_samples, | ||
| 30 | + float progress) { | ||
| 31 | + fprintf(stderr, "Progress: %.3f%%\n", progress * 100); | ||
| 32 | + // return 1 to continue generating | ||
| 33 | + // return 0 to stop generating | ||
| 34 | + return 1; | ||
| 35 | +} | ||
| 36 | + | ||
| 37 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 38 | + SherpaOnnxOfflineTtsConfig config; | ||
| 39 | + memset(&config, 0, sizeof(config)); | ||
| 40 | + config.model.matcha.acoustic_model = | ||
| 41 | + "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx"; | ||
| 42 | + | ||
| 43 | + config.model.matcha.vocoder = "./hifigan_v2.onnx"; | ||
| 44 | + | ||
| 45 | + config.model.matcha.tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt"; | ||
| 46 | + | ||
| 47 | + config.model.matcha.data_dir = | ||
| 48 | + "./matcha-icefall-en_US-ljspeech/espeak-ng-data"; | ||
| 49 | + | ||
| 50 | + config.model.num_threads = 1; | ||
| 51 | + | ||
| 52 | + // If you don't want to see debug messages, please set it to 0 | ||
| 53 | + config.model.debug = 1; | ||
| 54 | + | ||
| 55 | + const char *filename = "./generated-matcha-en.wav"; | ||
| 56 | + const char *text = | ||
| 57 | + "Today as always, men fall into two groups: slaves and free men. Whoever " | ||
| 58 | + "does not have two-thirds of his day for himself, is a slave, whatever " | ||
| 59 | + "he may be: a statesman, a businessman, an official, or a scholar. " | ||
| 60 | + "Friends fell out often because life was changing so fast. The easiest " | ||
| 61 | + "thing in the world was to lose touch with someone."; | ||
| 62 | + | ||
| 63 | + SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); | ||
| 64 | + int32_t sid = 0; | ||
| 65 | + float speed = 1.0; // larger -> faster in speech speed | ||
| 66 | + | ||
| 67 | +#if 0 | ||
| 68 | + // If you don't want to use a callback, then please enable this branch | ||
| 69 | + const SherpaOnnxGeneratedAudio *audio = | ||
| 70 | + SherpaOnnxOfflineTtsGenerate(tts, text, sid, speed); | ||
| 71 | +#else | ||
| 72 | + const SherpaOnnxGeneratedAudio *audio = | ||
| 73 | + SherpaOnnxOfflineTtsGenerateWithProgressCallback(tts, text, sid, speed, | ||
| 74 | + ProgressCallback); | ||
| 75 | +#endif | ||
| 76 | + | ||
| 77 | + SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename); | ||
| 78 | + | ||
| 79 | + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); | ||
| 80 | + SherpaOnnxDestroyOfflineTts(tts); | ||
| 81 | + | ||
| 82 | + fprintf(stderr, "Input text is: %s\n", text); | ||
| 83 | + fprintf(stderr, "Speaker ID is is: %d\n", sid); | ||
| 84 | + fprintf(stderr, "Saved to: %s\n", filename); | ||
| 85 | + | ||
| 86 | + return 0; | ||
| 87 | +} |
c-api-examples/matcha-tts-zh-c-api.c
0 → 100644
| 1 | +// c-api-examples/matcha-tts-zh-c-api.c | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// This file shows how to use sherpa-onnx C API | ||
| 6 | +// for Chinese TTS with MatchaTTS. | ||
| 7 | +// | ||
| 8 | +// clang-format off | ||
| 9 | +/* | ||
| 10 | +Usage | ||
| 11 | + | ||
| 12 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 13 | +tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 14 | +rm matcha-icefall-zh-baker.tar.bz2 | ||
| 15 | + | ||
| 16 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 17 | + | ||
| 18 | +./matcha-tts-zh-c-api | ||
| 19 | + | ||
| 20 | + */ | ||
| 21 | +// clang-format on | ||
| 22 | + | ||
| 23 | +#include <stdio.h> | ||
| 24 | +#include <stdlib.h> | ||
| 25 | +#include <string.h> | ||
| 26 | + | ||
| 27 | +#include "sherpa-onnx/c-api/c-api.h" | ||
| 28 | + | ||
| 29 | +static int32_t ProgressCallback(const float *samples, int32_t num_samples, | ||
| 30 | + float progress) { | ||
| 31 | + fprintf(stderr, "Progress: %.3f%%\n", progress * 100); | ||
| 32 | + // return 1 to continue generating | ||
| 33 | + // return 0 to stop generating | ||
| 34 | + return 1; | ||
| 35 | +} | ||
| 36 | + | ||
| 37 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 38 | + SherpaOnnxOfflineTtsConfig config; | ||
| 39 | + memset(&config, 0, sizeof(config)); | ||
| 40 | + config.model.matcha.acoustic_model = | ||
| 41 | + "./matcha-icefall-zh-baker/model-steps-3.onnx"; | ||
| 42 | + config.model.matcha.vocoder = "./hifigan_v2.onnx"; | ||
| 43 | + config.model.matcha.lexicon = "./matcha-icefall-zh-baker/lexicon.txt"; | ||
| 44 | + config.model.matcha.tokens = "./matcha-icefall-zh-baker/tokens.txt"; | ||
| 45 | + config.model.matcha.dict_dir = "./matcha-icefall-zh-baker/dict"; | ||
| 46 | + config.model.num_threads = 1; | ||
| 47 | + | ||
| 48 | + // If you don't want to see debug messages, please set it to 0 | ||
| 49 | + config.model.debug = 1; | ||
| 50 | + | ||
| 51 | + // clang-format off | ||
| 52 | + config.rule_fsts = "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst"; | ||
| 53 | + // clang-format on | ||
| 54 | + | ||
| 55 | + const char *filename = "./generated-matcha-zh.wav"; | ||
| 56 | + const char *text = | ||
| 57 | + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如" | ||
| 58 | + "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感" | ||
| 59 | + "受着生命的奇迹与温柔." | ||
| 60 | + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; " | ||
| 61 | + "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"; | ||
| 62 | + | ||
| 63 | + SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); | ||
| 64 | + int32_t sid = 0; | ||
| 65 | + float speed = 1.0; // larger -> faster in speech speed | ||
| 66 | + | ||
| 67 | +#if 0 | ||
| 68 | + // If you don't want to use a callback, then please enable this branch | ||
| 69 | + const SherpaOnnxGeneratedAudio *audio = | ||
| 70 | + SherpaOnnxOfflineTtsGenerate(tts, text, sid, speed); | ||
| 71 | +#else | ||
| 72 | + const SherpaOnnxGeneratedAudio *audio = | ||
| 73 | + SherpaOnnxOfflineTtsGenerateWithProgressCallback(tts, text, sid, speed, | ||
| 74 | + ProgressCallback); | ||
| 75 | +#endif | ||
| 76 | + | ||
| 77 | + SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename); | ||
| 78 | + | ||
| 79 | + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); | ||
| 80 | + SherpaOnnxDestroyOfflineTts(tts); | ||
| 81 | + | ||
| 82 | + fprintf(stderr, "Input text is: %s\n", text); | ||
| 83 | + fprintf(stderr, "Speaker ID is is: %d\n", sid); | ||
| 84 | + fprintf(stderr, "Saved to: %s\n", filename); | ||
| 85 | + | ||
| 86 | + return 0; | ||
| 87 | +} |
| @@ -1058,6 +1058,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | @@ -1058,6 +1058,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | ||
| 1058 | const SherpaOnnxOfflineTtsConfig *config) { | 1058 | const SherpaOnnxOfflineTtsConfig *config) { |
| 1059 | sherpa_onnx::OfflineTtsConfig tts_config; | 1059 | sherpa_onnx::OfflineTtsConfig tts_config; |
| 1060 | 1060 | ||
| 1061 | + // vits | ||
| 1061 | tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, ""); | 1062 | tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, ""); |
| 1062 | tts_config.model.vits.lexicon = | 1063 | tts_config.model.vits.lexicon = |
| 1063 | SHERPA_ONNX_OR(config->model.vits.lexicon, ""); | 1064 | SHERPA_ONNX_OR(config->model.vits.lexicon, ""); |
| @@ -1073,6 +1074,24 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | @@ -1073,6 +1074,24 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | ||
| 1073 | tts_config.model.vits.dict_dir = | 1074 | tts_config.model.vits.dict_dir = |
| 1074 | SHERPA_ONNX_OR(config->model.vits.dict_dir, ""); | 1075 | SHERPA_ONNX_OR(config->model.vits.dict_dir, ""); |
| 1075 | 1076 | ||
| 1077 | + // matcha | ||
| 1078 | + tts_config.model.matcha.acoustic_model = | ||
| 1079 | + SHERPA_ONNX_OR(config->model.matcha.acoustic_model, ""); | ||
| 1080 | + tts_config.model.matcha.vocoder = | ||
| 1081 | + SHERPA_ONNX_OR(config->model.matcha.vocoder, ""); | ||
| 1082 | + tts_config.model.matcha.lexicon = | ||
| 1083 | + SHERPA_ONNX_OR(config->model.matcha.lexicon, ""); | ||
| 1084 | + tts_config.model.matcha.tokens = | ||
| 1085 | + SHERPA_ONNX_OR(config->model.matcha.tokens, ""); | ||
| 1086 | + tts_config.model.matcha.data_dir = | ||
| 1087 | + SHERPA_ONNX_OR(config->model.matcha.data_dir, ""); | ||
| 1088 | + tts_config.model.matcha.noise_scale = | ||
| 1089 | + SHERPA_ONNX_OR(config->model.matcha.noise_scale, 0.667); | ||
| 1090 | + tts_config.model.matcha.length_scale = | ||
| 1091 | + SHERPA_ONNX_OR(config->model.matcha.length_scale, 1.0); | ||
| 1092 | + tts_config.model.matcha.dict_dir = | ||
| 1093 | + SHERPA_ONNX_OR(config->model.matcha.dict_dir, ""); | ||
| 1094 | + | ||
| 1076 | tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); | 1095 | tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); |
| 1077 | tts_config.model.debug = config->model.debug; | 1096 | tts_config.model.debug = config->model.debug; |
| 1078 | tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); | 1097 | tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); |
| @@ -1082,7 +1101,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | @@ -1082,7 +1101,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | ||
| 1082 | 1101 | ||
| 1083 | tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); | 1102 | tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); |
| 1084 | tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); | 1103 | tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); |
| 1085 | - tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2); | 1104 | + tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1); |
| 1086 | 1105 | ||
| 1087 | if (tts_config.model.debug) { | 1106 | if (tts_config.model.debug) { |
| 1088 | #if __OHOS__ | 1107 | #if __OHOS__ |
| @@ -894,15 +894,28 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { | @@ -894,15 +894,28 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { | ||
| 894 | 894 | ||
| 895 | float noise_scale; | 895 | float noise_scale; |
| 896 | float noise_scale_w; | 896 | float noise_scale_w; |
| 897 | - float length_scale; // < 1, faster in speed; > 1, slower in speed | 897 | + float length_scale; // < 1, faster in speech speed; > 1, slower in speech speed |
| 898 | const char *dict_dir; | 898 | const char *dict_dir; |
| 899 | } SherpaOnnxOfflineTtsVitsModelConfig; | 899 | } SherpaOnnxOfflineTtsVitsModelConfig; |
| 900 | 900 | ||
| 901 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsMatchaModelConfig { | ||
| 902 | + const char *acoustic_model; | ||
| 903 | + const char *vocoder; | ||
| 904 | + const char *lexicon; | ||
| 905 | + const char *tokens; | ||
| 906 | + const char *data_dir; | ||
| 907 | + | ||
| 908 | + float noise_scale; | ||
| 909 | + float length_scale; // < 1, faster in speech speed; > 1, slower in speech speed | ||
| 910 | + const char *dict_dir; | ||
| 911 | +} SherpaOnnxOfflineTtsMatchaModelConfig; | ||
| 912 | + | ||
| 901 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { | 913 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { |
| 902 | SherpaOnnxOfflineTtsVitsModelConfig vits; | 914 | SherpaOnnxOfflineTtsVitsModelConfig vits; |
| 903 | int32_t num_threads; | 915 | int32_t num_threads; |
| 904 | int32_t debug; | 916 | int32_t debug; |
| 905 | const char *provider; | 917 | const char *provider; |
| 918 | + SherpaOnnxOfflineTtsMatchaModelConfig matcha; | ||
| 906 | } SherpaOnnxOfflineTtsModelConfig; | 919 | } SherpaOnnxOfflineTtsModelConfig; |
| 907 | 920 | ||
| 908 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { | 921 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { |
| @@ -30,7 +30,7 @@ struct OfflineTtsConfig { | @@ -30,7 +30,7 @@ struct OfflineTtsConfig { | ||
| 30 | // Maximum number of sentences that we process at a time. | 30 | // Maximum number of sentences that we process at a time. |
| 31 | // This is to avoid OOM for very long input text. | 31 | // This is to avoid OOM for very long input text. |
| 32 | // If you set it to -1, then we process all sentences in a single batch. | 32 | // If you set it to -1, then we process all sentences in a single batch. |
| 33 | - int32_t max_num_sentences = 2; | 33 | + int32_t max_num_sentences = 1; |
| 34 | 34 | ||
| 35 | OfflineTtsConfig() = default; | 35 | OfflineTtsConfig() = default; |
| 36 | OfflineTtsConfig(const OfflineTtsModelConfig &model, | 36 | OfflineTtsConfig(const OfflineTtsModelConfig &model, |
-
请 注册 或 登录 后发表评论