Fangjun Kuang
Committed by GitHub

Add C API for MatchaTTS models (#1675)

@@ -81,6 +81,51 @@ jobs: @@ -81,6 +81,51 @@ jobs:
81 otool -L ./install/lib/libsherpa-onnx-c-api.dylib 81 otool -L ./install/lib/libsherpa-onnx-c-api.dylib
82 fi 82 fi
83 83
  84 + - name: Test Matcha TTS (zh)
  85 + shell: bash
  86 + run: |
  87 + gcc -o matcha-tts-zh-c-api ./c-api-examples/matcha-tts-zh-c-api.c \
  88 + -I ./build/install/include \
  89 + -L ./build/install/lib/ \
  90 + -l sherpa-onnx-c-api \
  91 + -l onnxruntime
  92 +
  93 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  94 + tar xvf matcha-icefall-zh-baker.tar.bz2
  95 + rm matcha-icefall-zh-baker.tar.bz2
  96 +
  97 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  98 +
  99 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  100 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  101 +
  102 + ./matcha-tts-zh-c-api
  103 +
  104 + - name: Test Matcha TTS (en)
  105 + shell: bash
  106 + run: |
  107 + gcc -o matcha-tts-en-c-api ./c-api-examples/matcha-tts-en-c-api.c \
  108 + -I ./build/install/include \
  109 + -L ./build/install/lib/ \
  110 + -l sherpa-onnx-c-api \
  111 + -l onnxruntime
  112 +
  113 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  114 + tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  115 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  116 +
  117 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  118 +
  119 + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
  120 + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
  121 +
  122 + ./matcha-tts-en-c-api
  123 +
  124 + - uses: actions/upload-artifact@v4
  125 + with:
  126 + name: matcha-tts-${{ matrix.os }}
  127 + path: ./generated-matcha-*.wav
  128 +
84 - name: Test vad + Whisper tiny.en 129 - name: Test vad + Whisper tiny.en
85 shell: bash 130 shell: bash
86 run: | 131 run: |
@@ -7,6 +7,12 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs) @@ -7,6 +7,12 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
7 if(SHERPA_ONNX_ENABLE_TTS) 7 if(SHERPA_ONNX_ENABLE_TTS)
8 add_executable(offline-tts-c-api offline-tts-c-api.c) 8 add_executable(offline-tts-c-api offline-tts-c-api.c)
9 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) 9 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)
  10 +
  11 + add_executable(matcha-tts-zh-c-api matcha-tts-zh-c-api.c)
  12 + target_link_libraries(matcha-tts-zh-c-api sherpa-onnx-c-api)
  13 +
  14 + add_executable(matcha-tts-en-c-api matcha-tts-en-c-api.c)
  15 + target_link_libraries(matcha-tts-en-c-api sherpa-onnx-c-api)
10 endif() 16 endif()
11 17
12 if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION) 18 if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION)
  1 +// c-api-examples/matcha-tts-en-c-api.c
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +// This file shows how to use sherpa-onnx C API
  6 +// for English TTS with MatchaTTS.
  7 +//
  8 +// clang-format off
  9 +/*
  10 +Usage
  11 +
  12 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  13 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  14 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  15 +
  16 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  17 +
  18 +./matcha-tts-en-c-api
  19 +
  20 + */
  21 +// clang-format on
  22 +
  23 +#include <stdio.h>
  24 +#include <stdlib.h>
  25 +#include <string.h>
  26 +
  27 +#include "sherpa-onnx/c-api/c-api.h"
  28 +
  29 +static int32_t ProgressCallback(const float *samples, int32_t num_samples,
  30 + float progress) {
  31 + fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
  32 + // return 1 to continue generating
  33 + // return 0 to stop generating
  34 + return 1;
  35 +}
  36 +
  37 +int32_t main(int32_t argc, char *argv[]) {
  38 + SherpaOnnxOfflineTtsConfig config;
  39 + memset(&config, 0, sizeof(config));
  40 + config.model.matcha.acoustic_model =
  41 + "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx";
  42 +
  43 + config.model.matcha.vocoder = "./hifigan_v2.onnx";
  44 +
  45 + config.model.matcha.tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt";
  46 +
  47 + config.model.matcha.data_dir =
  48 + "./matcha-icefall-en_US-ljspeech/espeak-ng-data";
  49 +
  50 + config.model.num_threads = 1;
  51 +
  52 + // If you don't want to see debug messages, please set it to 0
  53 + config.model.debug = 1;
  54 +
  55 + const char *filename = "./generated-matcha-en.wav";
  56 + const char *text =
  57 + "Today as always, men fall into two groups: slaves and free men. Whoever "
  58 + "does not have two-thirds of his day for himself, is a slave, whatever "
  59 + "he may be: a statesman, a businessman, an official, or a scholar. "
  60 + "Friends fell out often because life was changing so fast. The easiest "
  61 + "thing in the world was to lose touch with someone.";
  62 +
  63 + SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
  64 + int32_t sid = 0;
  65 + float speed = 1.0; // larger -> faster in speech speed
  66 +
  67 +#if 0
  68 + // If you don't want to use a callback, then please enable this branch
  69 + const SherpaOnnxGeneratedAudio *audio =
  70 + SherpaOnnxOfflineTtsGenerate(tts, text, sid, speed);
  71 +#else
  72 + const SherpaOnnxGeneratedAudio *audio =
  73 + SherpaOnnxOfflineTtsGenerateWithProgressCallback(tts, text, sid, speed,
  74 + ProgressCallback);
  75 +#endif
  76 +
  77 + SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename);
  78 +
  79 + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
  80 + SherpaOnnxDestroyOfflineTts(tts);
  81 +
  82 + fprintf(stderr, "Input text is: %s\n", text);
  83 + fprintf(stderr, "Speaker ID is is: %d\n", sid);
  84 + fprintf(stderr, "Saved to: %s\n", filename);
  85 +
  86 + return 0;
  87 +}
  1 +// c-api-examples/matcha-tts-zh-c-api.c
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +// This file shows how to use sherpa-onnx C API
  6 +// for Chinese TTS with MatchaTTS.
  7 +//
  8 +// clang-format off
  9 +/*
  10 +Usage
  11 +
  12 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  13 +tar xvf matcha-icefall-zh-baker.tar.bz2
  14 +rm matcha-icefall-zh-baker.tar.bz2
  15 +
  16 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  17 +
  18 +./matcha-tts-zh-c-api
  19 +
  20 + */
  21 +// clang-format on
  22 +
  23 +#include <stdio.h>
  24 +#include <stdlib.h>
  25 +#include <string.h>
  26 +
  27 +#include "sherpa-onnx/c-api/c-api.h"
  28 +
  29 +static int32_t ProgressCallback(const float *samples, int32_t num_samples,
  30 + float progress) {
  31 + fprintf(stderr, "Progress: %.3f%%\n", progress * 100);
  32 + // return 1 to continue generating
  33 + // return 0 to stop generating
  34 + return 1;
  35 +}
  36 +
  37 +int32_t main(int32_t argc, char *argv[]) {
  38 + SherpaOnnxOfflineTtsConfig config;
  39 + memset(&config, 0, sizeof(config));
  40 + config.model.matcha.acoustic_model =
  41 + "./matcha-icefall-zh-baker/model-steps-3.onnx";
  42 + config.model.matcha.vocoder = "./hifigan_v2.onnx";
  43 + config.model.matcha.lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
  44 + config.model.matcha.tokens = "./matcha-icefall-zh-baker/tokens.txt";
  45 + config.model.matcha.dict_dir = "./matcha-icefall-zh-baker/dict";
  46 + config.model.num_threads = 1;
  47 +
  48 + // If you don't want to see debug messages, please set it to 0
  49 + config.model.debug = 1;
  50 +
  51 + // clang-format off
  52 + config.rule_fsts = "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst";
  53 + // clang-format on
  54 +
  55 + const char *filename = "./generated-matcha-zh.wav";
  56 + const char *text =
  57 + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如"
  58 + "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感"
  59 + "受着生命的奇迹与温柔."
  60 + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; "
  61 + "经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。";
  62 +
  63 + SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
  64 + int32_t sid = 0;
  65 + float speed = 1.0; // larger -> faster in speech speed
  66 +
  67 +#if 0
  68 + // If you don't want to use a callback, then please enable this branch
  69 + const SherpaOnnxGeneratedAudio *audio =
  70 + SherpaOnnxOfflineTtsGenerate(tts, text, sid, speed);
  71 +#else
  72 + const SherpaOnnxGeneratedAudio *audio =
  73 + SherpaOnnxOfflineTtsGenerateWithProgressCallback(tts, text, sid, speed,
  74 + ProgressCallback);
  75 +#endif
  76 +
  77 + SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename);
  78 +
  79 + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
  80 + SherpaOnnxDestroyOfflineTts(tts);
  81 +
  82 + fprintf(stderr, "Input text is: %s\n", text);
  83 + fprintf(stderr, "Speaker ID is is: %d\n", sid);
  84 + fprintf(stderr, "Saved to: %s\n", filename);
  85 +
  86 + return 0;
  87 +}
@@ -1058,6 +1058,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( @@ -1058,6 +1058,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
1058 const SherpaOnnxOfflineTtsConfig *config) { 1058 const SherpaOnnxOfflineTtsConfig *config) {
1059 sherpa_onnx::OfflineTtsConfig tts_config; 1059 sherpa_onnx::OfflineTtsConfig tts_config;
1060 1060
  1061 + // vits
1061 tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, ""); 1062 tts_config.model.vits.model = SHERPA_ONNX_OR(config->model.vits.model, "");
1062 tts_config.model.vits.lexicon = 1063 tts_config.model.vits.lexicon =
1063 SHERPA_ONNX_OR(config->model.vits.lexicon, ""); 1064 SHERPA_ONNX_OR(config->model.vits.lexicon, "");
@@ -1073,6 +1074,24 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( @@ -1073,6 +1074,24 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
1073 tts_config.model.vits.dict_dir = 1074 tts_config.model.vits.dict_dir =
1074 SHERPA_ONNX_OR(config->model.vits.dict_dir, ""); 1075 SHERPA_ONNX_OR(config->model.vits.dict_dir, "");
1075 1076
  1077 + // matcha
  1078 + tts_config.model.matcha.acoustic_model =
  1079 + SHERPA_ONNX_OR(config->model.matcha.acoustic_model, "");
  1080 + tts_config.model.matcha.vocoder =
  1081 + SHERPA_ONNX_OR(config->model.matcha.vocoder, "");
  1082 + tts_config.model.matcha.lexicon =
  1083 + SHERPA_ONNX_OR(config->model.matcha.lexicon, "");
  1084 + tts_config.model.matcha.tokens =
  1085 + SHERPA_ONNX_OR(config->model.matcha.tokens, "");
  1086 + tts_config.model.matcha.data_dir =
  1087 + SHERPA_ONNX_OR(config->model.matcha.data_dir, "");
  1088 + tts_config.model.matcha.noise_scale =
  1089 + SHERPA_ONNX_OR(config->model.matcha.noise_scale, 0.667);
  1090 + tts_config.model.matcha.length_scale =
  1091 + SHERPA_ONNX_OR(config->model.matcha.length_scale, 1.0);
  1092 + tts_config.model.matcha.dict_dir =
  1093 + SHERPA_ONNX_OR(config->model.matcha.dict_dir, "");
  1094 +
1076 tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); 1095 tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
1077 tts_config.model.debug = config->model.debug; 1096 tts_config.model.debug = config->model.debug;
1078 tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); 1097 tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
@@ -1082,7 +1101,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( @@ -1082,7 +1101,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
1082 1101
1083 tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); 1102 tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
1084 tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); 1103 tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
1085 - tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2); 1104 + tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1);
1086 1105
1087 if (tts_config.model.debug) { 1106 if (tts_config.model.debug) {
1088 #if __OHOS__ 1107 #if __OHOS__
@@ -894,15 +894,28 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { @@ -894,15 +894,28 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig {
894 894
895 float noise_scale; 895 float noise_scale;
896 float noise_scale_w; 896 float noise_scale_w;
897 - float length_scale; // < 1, faster in speed; > 1, slower in speed 897 + float length_scale; // < 1, faster in speech speed; > 1, slower in speed
898 const char *dict_dir; 898 const char *dict_dir;
899 } SherpaOnnxOfflineTtsVitsModelConfig; 899 } SherpaOnnxOfflineTtsVitsModelConfig;
900 900
  901 +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsMatchaModelConfig { // C API configuration of a MatchaTTS model
  902 + const char *acoustic_model; // acoustic model file, e.g., model-steps-3.onnx
  903 + const char *vocoder; // vocoder model file, e.g., hifigan_v2.onnx
  904 + const char *lexicon; // e.g., lexicon.txt; set by matcha-tts-zh-c-api.c, left NULL by matcha-tts-en-c-api.c
  905 + const char *tokens; // e.g., tokens.txt
  906 + const char *data_dir; // e.g., an espeak-ng-data directory; set by matcha-tts-en-c-api.c only
  907 +
  908 + float noise_scale; // NOTE(review): 0 appears to be replaced by 0.667 in GetOfflineTtsConfig() -- confirm
  909 + float length_scale; // < 1: faster speech; > 1: slower speech. NOTE(review): 0 appears to be replaced by 1.0 in GetOfflineTtsConfig() -- confirm
  910 + const char *dict_dir; // e.g., the dict directory of matcha-icefall-zh-baker; set by matcha-tts-zh-c-api.c only
  911 +} SherpaOnnxOfflineTtsMatchaModelConfig;
  912 +
901 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { 913 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
902 SherpaOnnxOfflineTtsVitsModelConfig vits; 914 SherpaOnnxOfflineTtsVitsModelConfig vits;
903 int32_t num_threads; 915 int32_t num_threads;
904 int32_t debug; 916 int32_t debug;
905 const char *provider; 917 const char *provider;
  918 + SherpaOnnxOfflineTtsMatchaModelConfig matcha;
906 } SherpaOnnxOfflineTtsModelConfig; 919 } SherpaOnnxOfflineTtsModelConfig;
907 920
908 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { 921 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
@@ -30,7 +30,7 @@ struct OfflineTtsConfig { @@ -30,7 +30,7 @@ struct OfflineTtsConfig {
30 // Maximum number of sentences that we process at a time. 30 // Maximum number of sentences that we process at a time.
31 // This is to avoid OOM for very long input text. 31 // This is to avoid OOM for very long input text.
32 // If you set it to -1, then we process all sentences in a single batch. 32 // If you set it to -1, then we process all sentences in a single batch.
33 - int32_t max_num_sentences = 2; 33 + int32_t max_num_sentences = 1;
34 34
35 OfflineTtsConfig() = default; 35 OfflineTtsConfig() = default;
36 OfflineTtsConfig(const OfflineTtsModelConfig &model, 36 OfflineTtsConfig(const OfflineTtsModelConfig &model,