Fangjun Kuang
Committed by GitHub

Add C++ runtime for kitten-tts (#2460)

@@ -192,6 +192,8 @@ if(SHERPA_ONNX_ENABLE_TTS)
192   offline-tts-character-frontend.cc
193   offline-tts-frontend.cc
194   offline-tts-impl.cc
  195 + offline-tts-kitten-model-config.cc
  196 + offline-tts-kitten-model.cc
197   offline-tts-kokoro-model-config.cc
198   offline-tts-kokoro-model.cc
199   offline-tts-matcha-model-config.cc
@@ -260,7 +260,7 @@ class KokoroMultiLangLexicon::Impl {
260
261   std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
262   const std::string &text, const std::string &voice) const {
263 - auto temp = ConvertTextToTokenIdsKokoro(
  263 + auto temp = ConvertTextToTokenIdsKokoroOrKitten(
264   phoneme2id_, meta_data_.max_token_len, text, voice);
265   std::vector<std::vector<int32_t>> ans;
266   ans.reserve(temp.size());
@@ -59,7 +59,7 @@ class OfflineTtsFrontend {
59   void InitEspeak(const std::string &data_dir);
60
61   // implementation in ./piper-phonemize-lexicon.cc
62 - std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
  62 + std::vector<TokenIDs> ConvertTextToTokenIdsKokoroOrKitten(
63   const std::unordered_map<char32_t, int32_t> &token2id,
64   int32_t max_token_len, const std::string &text,
65   const std::string &voice = "");
@@ -16,6 +16,7 @@
16   #include "rawfile/raw_file_manager.h"
17   #endif
18
  19 +#include "sherpa-onnx/csrc/offline-tts-kitten-impl.h"
20   #include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h"
21   #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
22   #include "sherpa-onnx/csrc/offline-tts-vits-impl.h"
@@ -40,9 +41,15 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
41   return std::make_unique<OfflineTtsVitsImpl>(config);
42   } else if (!config.model.matcha.acoustic_model.empty()) {
43   return std::make_unique<OfflineTtsMatchaImpl>(config);
  44 + } else if (!config.model.kokoro.model.empty()) {
  45 + return std::make_unique<OfflineTtsKokoroImpl>(config);
  46 + } else if (!config.model.kitten.model.empty()) {
  47 + return std::make_unique<OfflineTtsKittenImpl>(config);
48   }
49
45 - return std::make_unique<OfflineTtsKokoroImpl>(config);
  50 + SHERPA_ONNX_LOGE("Please provide a tts model.");
  51 +
  52 + return {};
53   }
54
55   template <typename Manager>
@@ -52,9 +59,14 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
59   return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
60   } else if (!config.model.matcha.acoustic_model.empty()) {
61   return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
  62 + } else if (!config.model.kokoro.model.empty()) {
  63 + return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
  64 + } else if (!config.model.kitten.model.empty()) {
  65 + return std::make_unique<OfflineTtsKittenImpl>(mgr, config);
66   }
67
57 - return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
  68 + SHERPA_ONNX_LOGE("Please provide a tts model.");
  69 + return {};
70   }
71
72   #if __ANDROID_API__ >= 9
  1 +// sherpa-onnx/csrc/offline-tts-kitten-impl.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_IMPL_H_
  5 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_IMPL_H_
  6 +
  7 +#include <iomanip>
  8 +#include <ios>
  9 +#include <memory>
  10 +#include <string>
  11 +#include <strstream>
  12 +#include <utility>
  13 +#include <vector>
  14 +
  15 +#include "fst/extensions/far/far.h"
  16 +#include "kaldifst/csrc/kaldi-fst-io.h"
  17 +#include "kaldifst/csrc/text-normalizer.h"
  18 +#include "sherpa-onnx/csrc/file-utils.h"
  19 +#include "sherpa-onnx/csrc/lexicon.h"
  20 +#include "sherpa-onnx/csrc/macros.h"
  21 +#include "sherpa-onnx/csrc/offline-tts-frontend.h"
  22 +#include "sherpa-onnx/csrc/offline-tts-impl.h"
  23 +#include "sherpa-onnx/csrc/offline-tts-kitten-model.h"
  24 +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
  25 +#include "sherpa-onnx/csrc/text-utils.h"
  26 +
  27 +namespace sherpa_onnx {
  28 +
  29 +class OfflineTtsKittenImpl : public OfflineTtsImpl {
  30 + public:
  31 + explicit OfflineTtsKittenImpl(const OfflineTtsConfig &config)
  32 + : config_(config),
  33 + model_(std::make_unique<OfflineTtsKittenModel>(config.model)) {
  34 + InitFrontend();
  35 +
  36 + if (!config.rule_fsts.empty()) {
  37 + std::vector<std::string> files;
  38 + SplitStringToVector(config.rule_fsts, ",", false, &files);
  39 + tn_list_.reserve(files.size());
  40 + for (const auto &f : files) {
  41 + if (config.model.debug) {
  42 +#if __OHOS__
  43 + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
  44 +#else
  45 + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
  46 +#endif
  47 + }
  48 + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
  49 + }
  50 + }
  51 +
  52 + if (!config.rule_fars.empty()) {
  53 + if (config.model.debug) {
  54 + SHERPA_ONNX_LOGE("Loading FST archives");
  55 + }
  56 + std::vector<std::string> files;
  57 + SplitStringToVector(config.rule_fars, ",", false, &files);
  58 +
  59 + tn_list_.reserve(files.size() + tn_list_.size());
  60 +
  61 + for (const auto &f : files) {
  62 + if (config.model.debug) {
  63 +#if __OHOS__
  64 + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
  65 +#else
  66 + SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
  67 +#endif
  68 + }
  69 + std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
  70 + fst::FarReader<fst::StdArc>::Open(f));
  71 + for (; !reader->Done(); reader->Next()) {
  72 + std::unique_ptr<fst::StdConstFst> r(
  73 + fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
  74 +
  75 + tn_list_.push_back(
  76 + std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
  77 + }
  78 + }
  79 +
  80 + if (config.model.debug) {
  81 + SHERPA_ONNX_LOGE("FST archives loaded!");
  82 + }
  83 + }
  84 + }
  85 +
  86 + template <typename Manager>
  87 + OfflineTtsKittenImpl(Manager *mgr, const OfflineTtsConfig &config)
  88 + : config_(config),
  89 + model_(std::make_unique<OfflineTtsKittenModel>(mgr, config.model)) {
  90 + InitFrontend(mgr);
  91 +
  92 + if (!config.rule_fsts.empty()) {
  93 + std::vector<std::string> files;
  94 + SplitStringToVector(config.rule_fsts, ",", false, &files);
  95 + tn_list_.reserve(files.size());
  96 + for (const auto &f : files) {
  97 + if (config.model.debug) {
  98 +#if __OHOS__
  99 + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
  100 +#else
  101 + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
  102 +#endif
  103 + }
  104 + auto buf = ReadFile(mgr, f);
  105 + std::istrstream is(buf.data(), buf.size());
  106 + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is));
  107 + }
  108 + }
  109 +
  110 + if (!config.rule_fars.empty()) {
  111 + std::vector<std::string> files;
  112 + SplitStringToVector(config.rule_fars, ",", false, &files);
  113 + tn_list_.reserve(files.size() + tn_list_.size());
  114 +
  115 + for (const auto &f : files) {
  116 + if (config.model.debug) {
  117 +#if __OHOS__
  118 + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
  119 +#else
  120 + SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
  121 +#endif
  122 + }
  123 +
  124 + auto buf = ReadFile(mgr, f);
  125 +
  126 + std::unique_ptr<std::istream> s(
  127 + new std::istrstream(buf.data(), buf.size()));
  128 +
  129 + std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
  130 + fst::FarReader<fst::StdArc>::Open(std::move(s)));
  131 +
  132 + for (; !reader->Done(); reader->Next()) {
  133 + std::unique_ptr<fst::StdConstFst> r(
  134 + fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
  135 +
  136 + tn_list_.push_back(
  137 + std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
  138 + } // for (; !reader->Done(); reader->Next())
  139 + } // for (const auto &f : files)
  140 + } // if (!config.rule_fars.empty())
  141 + }
  142 +
  143 + int32_t SampleRate() const override {
  144 + return model_->GetMetaData().sample_rate;
  145 + }
  146 +
  147 + int32_t NumSpeakers() const override {
  148 + return model_->GetMetaData().num_speakers;
  149 + }
  150 +
  151 + GeneratedAudio Generate(
  152 + const std::string &_text, int64_t sid = 0, float speed = 1.0,
  153 + GeneratedAudioCallback callback = nullptr) const override {
  154 + const auto &meta_data = model_->GetMetaData();
  155 + int32_t num_speakers = meta_data.num_speakers;
  156 +
  157 + if (num_speakers == 0 && sid != 0) {
  158 +#if __OHOS__
  159 + SHERPA_ONNX_LOGE(
  160 + "This is a single-speaker model and supports only sid 0. Given sid: "
  161 + "%{public}d. sid is ignored",
  162 + static_cast<int32_t>(sid));
  163 +#else
  164 + SHERPA_ONNX_LOGE(
  165 + "This is a single-speaker model and supports only sid 0. Given sid: "
  166 + "%d. sid is ignored",
  167 + static_cast<int32_t>(sid));
  168 +#endif
  169 + }
  170 +
  171 + if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) {
  172 +#if __OHOS__
  173 + SHERPA_ONNX_LOGE(
  174 + "This model contains only %{public}d speakers. sid should be in the "
  175 + "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0",
  176 + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
  177 +#else
  178 + SHERPA_ONNX_LOGE(
  179 + "This model contains only %d speakers. sid should be in the range "
  180 + "[%d, %d]. Given: %d. Use sid=0",
  181 + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
  182 +#endif
  183 + sid = 0;
  184 + }
  185 +
  186 + std::string text = _text;
  187 + if (config_.model.debug) {
  188 +#if __OHOS__
  189 + SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str());
  190 +#else
  191 + SHERPA_ONNX_LOGE("Raw text: %s", text.c_str());
  192 +#endif
  193 + std::ostringstream os;
  194 + os << "In bytes (hex):\n";
  195 + const auto p = reinterpret_cast<const uint8_t *>(text.c_str());
  196 + for (int32_t i = 0; i != text.size(); ++i) {
  197 + os << std::setw(2) << std::setfill('0') << std::hex
  198 + << static_cast<uint32_t>(p[i]) << " ";
  199 + }
  200 + os << "\n";
  201 +
  202 +#if __OHOS__
  203 + SHERPA_ONNX_LOGE("%{public}s", os.str().c_str());
  204 +#else
  205 + SHERPA_ONNX_LOGE("%s", os.str().c_str());
  206 +#endif
  207 + }
  208 +
  209 + if (!tn_list_.empty()) {
  210 + for (const auto &tn : tn_list_) {
  211 + text = tn->Normalize(text);
  212 + if (config_.model.debug) {
  213 +#if __OHOS__
  214 + SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str());
  215 +#else
  216 + SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str());
  217 +#endif
  218 + }
  219 + }
  220 + }
  221 +
  222 + std::vector<TokenIDs> token_ids =
  223 + frontend_->ConvertTextToTokenIds(text, meta_data.voice);
  224 +
  225 + if (token_ids.empty() ||
  226 + (token_ids.size() == 1 && token_ids[0].tokens.empty())) {
  227 +#if __OHOS__
  228 + SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs",
  229 + text.c_str());
  230 +#else
  231 + SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str());
  232 +#endif
  233 + return {};
  234 + }
  235 +
  236 + std::vector<std::vector<int64_t>> x;
  237 +
  238 + x.reserve(token_ids.size());
  239 +
  240 + for (auto &i : token_ids) {
  241 + x.push_back(std::move(i.tokens));
  242 + }
  243 +
  244 + int32_t x_size = static_cast<int32_t>(x.size());
  245 +
  246 + if (config_.max_num_sentences != 1) {
  247 +#if __OHOS__
  248 + SHERPA_ONNX_LOGE(
  249 + "max_num_sentences (%{public}d) != 1 is ignored for Kitten TTS "
  250 + "models",
  251 + config_.max_num_sentences);
  252 +#else
  253 + SHERPA_ONNX_LOGE(
  254 + "max_num_sentences (%d) != 1 is ignored for Kitten TTS models",
  255 + config_.max_num_sentences);
  256 +#endif
  257 + }
  258 +
  259 + // If the input text is long, we process its sentences in batches to
  260 + // avoid OOM. For Kitten TTS the batch size is fixed at 1.
  261 + std::vector<std::vector<int64_t>> batch_x;
  262 +
  263 + int32_t batch_size = 1;
  264 + batch_x.reserve(batch_size);
  265 + int32_t num_batches = x_size / batch_size;
  266 +
  267 + if (config_.model.debug) {
  268 +#if __OHOS__
  269 + SHERPA_ONNX_LOGE(
  270 + "Split it into %{public}d batches. batch size: "
  271 + "%{public}d. Number of sentences: %{public}d",
  272 + num_batches, batch_size, x_size);
  273 +#else
  274 + SHERPA_ONNX_LOGE(
  275 + "Split it into %d batches. batch size: %d. Number "
  276 + "of sentences: %d",
  277 + num_batches, batch_size, x_size);
  278 +#endif
  279 + }
  280 +
  281 + GeneratedAudio ans;
  282 +
  283 + int32_t should_continue = 1;
  284 +
  285 + int32_t k = 0;
  286 +
  287 + for (int32_t b = 0; b != num_batches && should_continue; ++b) {
  288 + batch_x.clear();
  289 + for (int32_t i = 0; i != batch_size; ++i, ++k) {
  290 + batch_x.push_back(std::move(x[k]));
  291 + }
  292 +
  293 + auto audio = Process(batch_x, sid, speed);
  294 + ans.sample_rate = audio.sample_rate;
  295 + ans.samples.insert(ans.samples.end(), audio.samples.begin(),
  296 + audio.samples.end());
  297 + if (callback) {
  298 + should_continue = callback(audio.samples.data(), audio.samples.size(),
  299 + (b + 1) * 1.0 / num_batches);
  300 + // Caution(fangjun): audio is freed when the callback returns, so users
  301 + // should copy the data if they want to access the data after
  302 + // the callback returns to avoid segmentation fault.
  303 + }
  304 + }
  305 +
  306 + batch_x.clear();
  307 + while (k < static_cast<int32_t>(x.size()) && should_continue) {
  308 + batch_x.push_back(std::move(x[k]));
  309 +
  310 + ++k;
  311 + }
  312 +
  313 + if (!batch_x.empty()) {
  314 + auto audio = Process(batch_x, sid, speed);
  315 + ans.sample_rate = audio.sample_rate;
  316 + ans.samples.insert(ans.samples.end(), audio.samples.begin(),
  317 + audio.samples.end());
  318 + if (callback) {
  319 + callback(audio.samples.data(), audio.samples.size(), 1.0);
  320 + // Caution(fangjun): audio is freed when the callback returns, so users
  321 + // should copy the data if they want to access the data after
  322 + // the callback returns to avoid segmentation fault.
  323 + }
  324 + }
  325 +
  326 + return ans;
  327 + }
  328 +
  329 + private:
  330 + template <typename Manager>
  331 + void InitFrontend(Manager *mgr) {
  332 + const auto &meta_data = model_->GetMetaData();
  333 + frontend_ = std::make_unique<PiperPhonemizeLexicon>(
  334 + mgr, config_.model.kitten.tokens, config_.model.kitten.data_dir,
  335 + meta_data);
  336 + }
  337 +
  338 + void InitFrontend() {
  339 + const auto &meta_data = model_->GetMetaData();
  340 + frontend_ = std::make_unique<PiperPhonemizeLexicon>(
  341 + config_.model.kitten.tokens, config_.model.kitten.data_dir, meta_data);
  342 + }
  343 +
  344 + GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
  345 + int32_t sid, float speed) const {
  346 + int32_t num_tokens = 0;
  347 + for (const auto &k : tokens) {
  348 + num_tokens += k.size();
  349 + }
  350 +
  351 + std::vector<int64_t> x;
  352 + x.reserve(num_tokens);
  353 + for (const auto &k : tokens) {
  354 + x.insert(x.end(), k.begin(), k.end());
  355 + }
  356 +
  357 + auto memory_info =
  358 + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  359 +
  360 + std::array<int64_t, 2> x_shape = {1, static_cast<int32_t>(x.size())};
  361 + Ort::Value x_tensor = Ort::Value::CreateTensor(
  362 + memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
  363 +
  364 + Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed);
  365 +
  366 + std::vector<int64_t> audio_shape =
  367 + audio.GetTensorTypeAndShapeInfo().GetShape();
  368 +
  369 + int64_t total = 1;
  370 + // The output shape may be (1, 1, total) or (1, total) or (total,)
  371 + for (auto i : audio_shape) {
  372 + total *= i;
  373 + }
  374 +
  375 + const float *p = audio.GetTensorData<float>();
  376 +
  377 + GeneratedAudio ans;
  378 + ans.sample_rate = model_->GetMetaData().sample_rate;
  379 + ans.samples = std::vector<float>(p, p + total);
  380 +
  381 + float silence_scale = config_.silence_scale;
  382 + if (silence_scale != 1) {
  383 + ans = ans.ScaleSilence(silence_scale);
  384 + }
  385 +
  386 + return ans;
  387 + }
  388 +
  389 + private:
  390 + OfflineTtsConfig config_;
  391 + std::unique_ptr<OfflineTtsKittenModel> model_;
  392 + std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_;
  393 + std::unique_ptr<OfflineTtsFrontend> frontend_;
  394 +};
  395 +
  396 +} // namespace sherpa_onnx
  397 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_IMPL_H_
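For readers who want to see how this new impl is reached, here is a minimal end-to-end sketch (not part of this PR) that goes through the factory extended in offline-tts-impl.cc above. The include path for OfflineTtsConfig and all file paths below are assumptions.

// Sketch only. OfflineTtsConfig is assumed to live in offline-tts.h; the
// paths are placeholders for a real kitten-tts model.
#include <memory>

#include "sherpa-onnx/csrc/offline-tts-impl.h"
#include "sherpa-onnx/csrc/offline-tts.h"  // assumed home of OfflineTtsConfig

int main() {
  sherpa_onnx::OfflineTtsConfig config;
  config.model.kitten.model = "./model.onnx";
  config.model.kitten.voices = "./voices.bin";
  config.model.kitten.tokens = "./tokens.txt";
  config.model.kitten.data_dir = "./espeak-ng-data";
  config.model.num_threads = 2;

  // Create() dispatches to OfflineTtsKittenImpl because kitten.model is
  // non-empty; with no model set it logs an error and returns nullptr.
  auto tts = sherpa_onnx::OfflineTtsImpl::Create(config);
  if (!tts) {
    return -1;
  }

  auto audio = tts->Generate("Hello from Kitten TTS.", /*sid=*/0, /*speed=*/1.0);
  // audio.samples holds float32 PCM at audio.sample_rate Hz.
  return 0;
}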
  1 +// sherpa-onnx/csrc/offline-tts-kitten-model-config.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/offline-tts-kitten-model-config.h"
  6 +
  7 +#include <vector>
  8 +
  9 +#include "sherpa-onnx/csrc/file-utils.h"
  10 +#include "sherpa-onnx/csrc/macros.h"
  11 +#include "sherpa-onnx/csrc/text-utils.h"
  12 +
  13 +namespace sherpa_onnx {
  14 +
  15 +void OfflineTtsKittenModelConfig::Register(ParseOptions *po) {
  16 + po->Register("kitten-model", &model, "Path to kitten model");
  17 + po->Register("kitten-voices", &voices,
  18 + "Path to voices.bin for kitten models");
  19 + po->Register("kitten-tokens", &tokens,
  20 + "Path to tokens.txt for kitten models");
  21 + po->Register("kitten-data-dir", &data_dir,
  22 + "Path to the directory containing dict for espeak-ng.");
  23 + po->Register("kitten-length-scale", &length_scale,
  24 + "Inverse of speech speed. Larger->Slower; Smaller->faster.");
  25 +}
  26 +
  27 +bool OfflineTtsKittenModelConfig::Validate() const {
  28 + if (model.empty()) {
  29 + SHERPA_ONNX_LOGE("Please provide --kitten-model");
  30 + return false;
  31 + }
  32 +
  33 + if (!FileExists(model)) {
  34 + SHERPA_ONNX_LOGE("--kitten-model: '%s' does not exist", model.c_str());
  35 + return false;
  36 + }
  37 +
  38 + if (voices.empty()) {
  39 + SHERPA_ONNX_LOGE("Please provide --kitten-voices");
  40 + return false;
  41 + }
  42 +
  43 + if (!FileExists(voices)) {
  44 + SHERPA_ONNX_LOGE("--kitten-voices: '%s' does not exist", voices.c_str());
  45 + return false;
  46 + }
  47 +
  48 + if (tokens.empty()) {
  49 + SHERPA_ONNX_LOGE("Please provide --kitten-tokens");
  50 + return false;
  51 + }
  52 +
  53 + if (!FileExists(tokens)) {
  54 + SHERPA_ONNX_LOGE("--kitten-tokens: '%s' does not exist", tokens.c_str());
  55 + return false;
  56 + }
  57 +
  58 + if (data_dir.empty()) {
  59 + SHERPA_ONNX_LOGE("Please provide --kitten-data-dir");
  60 + return false;
  61 + }
  62 +
  63 + if (!FileExists(data_dir + "/phontab")) {
  64 + SHERPA_ONNX_LOGE(
  65 + "'%s/phontab' does not exist. Please check --kitten-data-dir",
  66 + data_dir.c_str());
  67 + return false;
  68 + }
  69 +
  70 + if (!FileExists(data_dir + "/phonindex")) {
  71 + SHERPA_ONNX_LOGE(
  72 + "'%s/phonindex' does not exist. Please check --kitten-data-dir",
  73 + data_dir.c_str());
  74 + return false;
  75 + }
  76 +
  77 + if (!FileExists(data_dir + "/phondata")) {
  78 + SHERPA_ONNX_LOGE(
  79 + "'%s/phondata' does not exist. Please check --kitten-data-dir",
  80 + data_dir.c_str());
  81 + return false;
  82 + }
  83 +
  84 + if (!FileExists(data_dir + "/intonations")) {
  85 + SHERPA_ONNX_LOGE(
  86 + "'%s/intonations' does not exist. Please check --kitten-data-dir",
  87 + data_dir.c_str());
  88 + return false;
  89 + }
  90 +
  91 + if (length_scale <= 0) {
  92 + SHERPA_ONNX_LOGE(
  93 + "Please provide a positive length_scale for --kitten-length-scale. "
  94 + "Given: %.3f",
  95 + length_scale);
  96 + return false;
  97 + }
  98 +
  99 + return true;
  100 +}
  101 +
  102 +std::string OfflineTtsKittenModelConfig::ToString() const {
  103 + std::ostringstream os;
  104 +
  105 + os << "OfflineTtsKittenModelConfig(";
  106 + os << "model=\"" << model << "\", ";
  107 + os << "voices=\"" << voices << "\", ";
  108 + os << "tokens=\"" << tokens << "\", ";
  109 + os << "data_dir=\"" << data_dir << "\", ";
  110 + os << "length_scale=" << length_scale << ")";
  111 +
  112 + return os.str();
  113 +}
  114 +
  115 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/offline-tts-kitten-model-config.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_
  7 +
  8 +#include <string>
  9 +
  10 +#include "sherpa-onnx/csrc/parse-options.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
  14 +struct OfflineTtsKittenModelConfig {
  15 + std::string model;
  16 + std::string voices;
  17 + std::string tokens;
  18 +
  19 + std::string data_dir;
  20 + // speed = 1 / length_scale
  21 + float length_scale = 1.0;
  22 +
  23 + OfflineTtsKittenModelConfig() = default;
  24 +
  25 + OfflineTtsKittenModelConfig(const std::string &model,
  26 + const std::string &voices,
  27 + const std::string &tokens,
  28 + const std::string &data_dir, float length_scale)
  29 + : model(model),
  30 + voices(voices),
  31 + tokens(tokens),
  32 + data_dir(data_dir),
  33 + length_scale(length_scale) {}
  34 +
  35 + void Register(ParseOptions *po);
  36 + bool Validate() const;
  37 +
  38 + std::string ToString() const;
  39 +};
  40 +
  41 +} // namespace sherpa_onnx
  42 +
  43 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_
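A minimal sketch (not part of the PR) of filling in the new config struct and validating it, using only the constructor and methods declared above; every path is a placeholder.

// Sketch only: exercises OfflineTtsKittenModelConfig::Validate()/ToString().
#include <cstdio>

#include "sherpa-onnx/csrc/offline-tts-kitten-model-config.h"

int main() {
  sherpa_onnx::OfflineTtsKittenModelConfig config(
      /*model=*/"./model.onnx", /*voices=*/"./voices.bin",
      /*tokens=*/"./tokens.txt", /*data_dir=*/"./espeak-ng-data",
      /*length_scale=*/1.0f);

  // Validate() checks that every file exists and that data_dir contains the
  // espeak-ng files phontab, phonindex, phondata and intonations.
  if (!config.Validate()) {
    return -1;
  }

  printf("%s\n", config.ToString().c_str());
  return 0;
}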
  1 +// sherpa-onnx/csrc/offline-tts-kitten-model-meta-data.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_META_DATA_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_META_DATA_H_
  7 +
  8 +#include <cstdint>
  9 +#include <string>
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +// please refer to
  14 +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kitten-tts/nano_v0_1/add_meta_data.py
  15 +struct OfflineTtsKittenModelMetaData {
  16 + int32_t sample_rate = 0;
  17 + int32_t num_speakers = 0;
  18 + int32_t version = 1;
  19 + int32_t has_espeak = 1;
  20 +
  21 + int32_t max_token_len = 256;
  22 +
  23 + std::string voice;
  24 +};
  25 +
  26 +} // namespace sherpa_onnx
  27 +
  28 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_META_DATA_H_
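These fields are populated from the ONNX model's custom metadata map by offline-tts-kitten-model.cc further down in this diff. As a hedged illustration (not part of the PR), the same keys can be inspected with plain ONNX Runtime calls; the key list is taken from that loader, and the exact keys written by add_meta_data.py are assumed to match.

// Sketch only: dump the custom metadata keys the kitten loader consumes.
#include <cstdio>
#include <initializer_list>

#include "onnxruntime_cxx_api.h"  // NOLINT

void DumpKittenMetaData(Ort::Session &sess) {  // NOLINT
  Ort::ModelMetadata meta = sess.GetModelMetadata();
  Ort::AllocatorWithDefaultOptions allocator;

  for (const char *key :
       {"model_type", "sample_rate", "version", "n_speakers", "has_espeak",
        "voice", "style_dim", "speaker_names"}) {
    auto value = meta.LookupCustomMetadataMapAllocated(key, allocator);
    printf("%s = %s\n", key, value ? value.get() : "(not set)");
  }
}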
  1 +// sherpa-onnx/csrc/offline-tts-kitten-model.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/offline-tts-kitten-model.h"
  6 +
  7 +#include <algorithm>
  8 +#include <string>
  9 +#include <utility>
  10 +#include <vector>
  11 +
  12 +#if __ANDROID_API__ >= 9
  13 +#include "android/asset_manager.h"
  14 +#include "android/asset_manager_jni.h"
  15 +#endif
  16 +
  17 +#if __OHOS__
  18 +#include "rawfile/raw_file_manager.h"
  19 +#endif
  20 +
  21 +#include "sherpa-onnx/csrc/file-utils.h"
  22 +#include "sherpa-onnx/csrc/macros.h"
  23 +#include "sherpa-onnx/csrc/onnx-utils.h"
  24 +#include "sherpa-onnx/csrc/session.h"
  25 +#include "sherpa-onnx/csrc/text-utils.h"
  26 +
  27 +namespace sherpa_onnx {
  28 +
  29 +class OfflineTtsKittenModel::Impl {
  30 + public:
  31 + explicit Impl(const OfflineTtsModelConfig &config)
  32 + : config_(config),
  33 + env_(ORT_LOGGING_LEVEL_ERROR),
  34 + sess_opts_(GetSessionOptions(config)),
  35 + allocator_{} {
  36 + auto model_buf = ReadFile(config.kitten.model);
  37 + auto voices_buf = ReadFile(config.kitten.voices);
  38 + Init(model_buf.data(), model_buf.size(), voices_buf.data(),
  39 + voices_buf.size());
  40 + }
  41 +
  42 + template <typename Manager>
  43 + Impl(Manager *mgr, const OfflineTtsModelConfig &config)
  44 + : config_(config),
  45 + env_(ORT_LOGGING_LEVEL_ERROR),
  46 + sess_opts_(GetSessionOptions(config)),
  47 + allocator_{} {
  48 + auto model_buf = ReadFile(mgr, config.kitten.model);
  49 + auto voices_buf = ReadFile(mgr, config.kitten.voices);
  50 + Init(model_buf.data(), model_buf.size(), voices_buf.data(),
  51 + voices_buf.size());
  52 + }
  53 +
  54 + const OfflineTtsKittenModelMetaData &GetMetaData() const {
  55 + return meta_data_;
  56 + }
  57 +
  58 + Ort::Value Run(Ort::Value x, int32_t sid, float speed) {
  59 + auto memory_info =
  60 + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  61 +
  62 + std::vector<int64_t> x_shape = x.GetTensorTypeAndShapeInfo().GetShape();
  63 + if (x_shape[0] != 1) {
  64 + SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d",
  65 + static_cast<int32_t>(x_shape[0]));
  66 + SHERPA_ONNX_EXIT(-1);
  67 + }
  68 +
  69 + int32_t num_speakers = meta_data_.num_speakers;
  70 + int32_t dim1 = style_dim_[1];
  71 +
  72 + /*const*/ float *p = styles_.data() + sid * dim1;
  73 +
  74 + std::array<int64_t, 2> style_embedding_shape = {1, dim1};
  75 + Ort::Value style_embedding = Ort::Value::CreateTensor(
  76 + memory_info, p, dim1, style_embedding_shape.data(),
  77 + style_embedding_shape.size());
  78 +
  79 + int64_t speed_shape = 1;
  80 + if (config_.kitten.length_scale != 1 && speed == 1) {
  81 + speed = 1. / config_.kitten.length_scale;
  82 + }
  83 +
  84 + Ort::Value speed_tensor =
  85 + Ort::Value::CreateTensor(memory_info, &speed, 1, &speed_shape, 1);
  86 +
  87 + std::array<Ort::Value, 3> inputs = {
  88 + std::move(x), std::move(style_embedding), std::move(speed_tensor)};
  89 +
  90 + auto out =
  91 + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
  92 + output_names_ptr_.data(), output_names_ptr_.size());
  93 +
  94 + return std::move(out[0]);
  95 + }
  96 +
  97 + private:
  98 + void Init(void *model_data, size_t model_data_length, const char *voices_data,
  99 + size_t voices_data_length) {
  100 + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
  101 + sess_opts_);
  102 +
  103 + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
  104 +
  105 + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
  106 + // get meta data
  107 + Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
  108 + if (config_.debug) {
  109 + std::ostringstream os;
  110 + os << "---kitten model---\n";
  111 + PrintModelMetadata(os, meta_data);
  112 +
  113 + os << "----------input names----------\n";
  114 + int32_t i = 0;
  115 + for (const auto &s : input_names_) {
  116 + os << i << " " << s << "\n";
  117 + ++i;
  118 + }
  119 + os << "----------output names----------\n";
  120 + i = 0;
  121 + for (const auto &s : output_names_) {
  122 + os << i << " " << s << "\n";
  123 + ++i;
  124 + }
  125 +
  126 +#if __OHOS__
  127 + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
  128 +#else
  129 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
  130 +#endif
  131 + }
  132 +
  133 + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
  134 +
  135 + std::string model_type;
  136 + SHERPA_ONNX_READ_META_DATA_STR(model_type, "model_type");
  137 + if (model_type != "kitten-tts") {
  138 + SHERPA_ONNX_LOGE(
  139 + "Please download the kitten tts model from us containing meta data");
  140 + SHERPA_ONNX_EXIT(-1);
  141 + }
  142 +
  143 + SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate");
  144 + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
  145 + SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
  146 + SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
  147 + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
  148 + "en-us");
  149 + if (meta_data_.has_espeak != 1) {
  150 + SHERPA_ONNX_LOGE("It should require espeak-ng");
  151 + SHERPA_ONNX_EXIT(-1);
  152 + }
  153 +
  154 + if (config_.debug) {
  155 + std::vector<std::string> speaker_names;
  156 + SHERPA_ONNX_READ_META_DATA_VEC_STRING(speaker_names, "speaker_names");
  157 + std::ostringstream os;
  158 + os << "\n";
  159 + for (int32_t i = 0; i != speaker_names.size(); ++i) {
  160 + os << i << "->" << speaker_names[i] << ", ";
  161 + }
  162 + os << "\n";
  163 +
  164 +#if __OHOS__
  165 + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
  166 +#else
  167 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
  168 +#endif
  169 + }
  170 +
  171 + SHERPA_ONNX_READ_META_DATA_VEC(style_dim_, "style_dim");
  172 + if (style_dim_.size() != 2) {
  173 + SHERPA_ONNX_LOGE("style_dim should be 2-d, given: %d",
  174 + static_cast<int32_t>(style_dim_.size()));
  175 + SHERPA_ONNX_EXIT(-1);
  176 + }
  177 +
  178 + if (style_dim_[0] != 1) {
  179 + SHERPA_ONNX_LOGE("style_dim[0] should be 1, given: %d", style_dim_[0]);
  180 + SHERPA_ONNX_EXIT(-1);
  181 + }
  182 +
  183 + int32_t actual_num_floats = voices_data_length / sizeof(float);
  184 + int32_t expected_num_floats =
  185 + style_dim_[0] * style_dim_[1] * meta_data_.num_speakers;
  186 +
  187 + if (actual_num_floats != expected_num_floats) {
  188 +#if __OHOS__
  189 + SHERPA_ONNX_LOGE(
  190 + "Corrupted --kitten-voices '%{public}s'. Expected #floats: "
  191 + "%{public}d, actual: %{public}d",
  192 + config_.kitten.voices.c_str(), expected_num_floats,
  193 + actual_num_floats);
  194 +#else
  195 + SHERPA_ONNX_LOGE(
  196 + "Corrupted --kitten-voices '%s'. Expected #floats: %d, actual: %d",
  197 + config_.kitten.voices.c_str(), expected_num_floats,
  198 + actual_num_floats);
  199 +#endif
  200 +
  201 + SHERPA_ONNX_EXIT(-1);
  202 + }
  203 +
  204 + styles_ = std::vector<float>(
  205 + reinterpret_cast<const float *>(voices_data),
  206 + reinterpret_cast<const float *>(voices_data) + expected_num_floats);
  207 + }
  208 +
  209 + private:
  210 + OfflineTtsModelConfig config_;
  211 + Ort::Env env_;
  212 + Ort::SessionOptions sess_opts_;
  213 + Ort::AllocatorWithDefaultOptions allocator_;
  214 +
  215 + std::unique_ptr<Ort::Session> sess_;
  216 +
  217 + std::vector<std::string> input_names_;
  218 + std::vector<const char *> input_names_ptr_;
  219 +
  220 + std::vector<std::string> output_names_;
  221 + std::vector<const char *> output_names_ptr_;
  222 +
  223 + OfflineTtsKittenModelMetaData meta_data_;
  224 + std::vector<int32_t> style_dim_;
  225 +
  226 + // (num_speakers, style_dim_[1])
  227 + std::vector<float> styles_;
  228 +};
  229 +
  230 +OfflineTtsKittenModel::OfflineTtsKittenModel(
  231 + const OfflineTtsModelConfig &config)
  232 + : impl_(std::make_unique<Impl>(config)) {}
  233 +
  234 +template <typename Manager>
  235 +OfflineTtsKittenModel::OfflineTtsKittenModel(
  236 + Manager *mgr, const OfflineTtsModelConfig &config)
  237 + : impl_(std::make_unique<Impl>(mgr, config)) {}
  238 +
  239 +OfflineTtsKittenModel::~OfflineTtsKittenModel() = default;
  240 +
  241 +const OfflineTtsKittenModelMetaData &OfflineTtsKittenModel::GetMetaData()
  242 + const {
  243 + return impl_->GetMetaData();
  244 +}
  245 +
  246 +Ort::Value OfflineTtsKittenModel::Run(Ort::Value x, int64_t sid /*= 0*/,
  247 + float speed /*= 1.0*/) const {
  248 + return impl_->Run(std::move(x), sid, speed);
  249 +}
  250 +
  251 +#if __ANDROID_API__ >= 9
  252 +template OfflineTtsKittenModel::OfflineTtsKittenModel(
  253 + AAssetManager *mgr, const OfflineTtsModelConfig &config);
  254 +#endif
  255 +
  256 +#if __OHOS__
  257 +template OfflineTtsKittenModel::OfflineTtsKittenModel(
  258 + NativeResourceManager *mgr, const OfflineTtsModelConfig &config);
  259 +#endif
  260 +
  261 +} // namespace sherpa_onnx
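One detail worth calling out from Init() above: --kitten-voices is read as a raw float32 matrix of shape (num_speakers, style_dim[1]), and Run() picks row sid of that matrix as the style embedding. A small sketch of the corresponding size check (not part of the PR), using made-up example numbers:

// Sketch only: mirrors the consistency check in Init(). The numbers are
// examples; the real values come from the model metadata.
#include <cstdint>
#include <cstdio>

int main() {
  int32_t num_speakers = 8;   // example value
  int32_t style_dim1 = 256;   // example value; style_dim = {1, 256}

  int64_t expected_bytes =
      static_cast<int64_t>(num_speakers) * style_dim1 * sizeof(float);
  printf("voices.bin should be %lld bytes\n",
         static_cast<long long>(expected_bytes));

  // Speaker sid uses floats [sid * style_dim1, (sid + 1) * style_dim1) as its
  // style embedding, exactly as Run() does with styles_.data() + sid * dim1.
  return 0;
}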
  1 +// sherpa-onnx/csrc/offline-tts-kitten-model.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_H_
  7 +
  8 +#include <memory>
  9 +#include <string>
  10 +
  11 +#include "onnxruntime_cxx_api.h" // NOLINT
  12 +#include "sherpa-onnx/csrc/offline-tts-kitten-model-meta-data.h"
  13 +#include "sherpa-onnx/csrc/offline-tts-model-config.h"
  14 +
  15 +namespace sherpa_onnx {
  16 +
  17 +class OfflineTtsKittenModel {
  18 + public:
  19 + ~OfflineTtsKittenModel();
  20 +
  21 + explicit OfflineTtsKittenModel(const OfflineTtsModelConfig &config);
  22 +
  23 + template <typename Manager>
  24 + OfflineTtsKittenModel(Manager *mgr, const OfflineTtsModelConfig &config);
  25 +
  26 + // @param x An int64 tensor of shape (1, num_tokens)
  27 + // @return Return a float32 tensor containing the
  28 + // samples of shape (num_samples,)
  29 + Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const;
  30 +
  31 + const OfflineTtsKittenModelMetaData &GetMetaData() const;
  32 +
  33 + private:
  34 + class Impl;
  35 + std::unique_ptr<Impl> impl_;
  36 +};
  37 +
  38 +} // namespace sherpa_onnx
  39 +
  40 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_H_
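To make the Run() contract above concrete, here is a short sketch (not part of the PR) that builds the (1, num_tokens) int64 input and flattens the float output, mirroring what Process() in offline-tts-kitten-impl.h does.

// Sketch only: drive OfflineTtsKittenModel directly with a token-ID sequence.
#include <array>
#include <cstdint>
#include <utility>
#include <vector>

#include "onnxruntime_cxx_api.h"  // NOLINT
#include "sherpa-onnx/csrc/offline-tts-kitten-model.h"

std::vector<float> Synthesize(const sherpa_onnx::OfflineTtsKittenModel &model,
                              std::vector<int64_t> token_ids, int64_t sid) {
  auto memory_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

  std::array<int64_t, 2> shape = {1, static_cast<int64_t>(token_ids.size())};
  Ort::Value x = Ort::Value::CreateTensor(memory_info, token_ids.data(),
                                          token_ids.size(), shape.data(),
                                          shape.size());

  Ort::Value audio = model.Run(std::move(x), sid, /*speed=*/1.0f);

  // The output may be (1, 1, n), (1, n) or (n,); flatten it.
  int64_t total = 1;
  for (auto d : audio.GetTensorTypeAndShapeInfo().GetShape()) {
    total *= d;
  }

  const float *p = audio.GetTensorData<float>();
  return std::vector<float>(p, p + total);
}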
@@ -11,7 +11,9 @@
11   namespace sherpa_onnx {
12
13   // please refer to
14 -// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/add-meta-data.py
  14 +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/v0.19/add_meta_data.py
  15 +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/v1.0/add_meta_data.py
  16 +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/v1.1-zh/add_meta_data.py
17   struct OfflineTtsKokoroModelMetaData {
18   int32_t sample_rate = 0;
19   int32_t num_speakers = 0;
@@ -170,7 +170,7 @@ class OfflineTtsKokoroModel::Impl {
170   }
171
172   if (style_dim_[1] != 1) {
173 - SHERPA_ONNX_LOGE("style_dim[0] should be 1, given: %d", style_dim_[1]);
  173 + SHERPA_ONNX_LOGE("style_dim[1] should be 1, given: %d", style_dim_[1]);
174   SHERPA_ONNX_EXIT(-1);
175   }
176
@@ -23,8 +23,8 @@ class OfflineTtsKokoroModel {
23   template <typename Manager>
24   OfflineTtsKokoroModel(Manager *mgr, const OfflineTtsModelConfig &config);
25
26 - // Return a float32 tensor containing the mel
27 - // of shape (batch_size, mel_dim, num_frames)
  26 + // Return a float32 tensor containing the samples
  27 + // of shape (batch_size, num_samples)
28   Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const;
29
30   const OfflineTtsKokoroModelMetaData &GetMetaData() const;
@@ -12,6 +12,7 @@ void OfflineTtsModelConfig::Register(ParseOptions *po) {
12   vits.Register(po);
13   matcha.Register(po);
14   kokoro.Register(po);
  15 + kitten.Register(po);
16
17   po->Register("num-threads", &num_threads,
18   "Number of threads to run the neural network");
@@ -37,7 +38,17 @@ bool OfflineTtsModelConfig::Validate() const {
38   return matcha.Validate();
39   }
40
40 - return kokoro.Validate();
  41 + if (!kokoro.model.empty()) {
  42 + return kokoro.Validate();
  43 + }
  44 +
  45 + if (!kitten.model.empty()) {
  46 + return kitten.Validate();
  47 + }
  48 +
  49 + SHERPA_ONNX_LOGE("Please provide at exactly one tts model.");
  50 +
  51 + return false;
52   }
53
54   std::string OfflineTtsModelConfig::ToString() const {
@@ -47,6 +58,7 @@ std::string OfflineTtsModelConfig::ToString() const {
58   os << "vits=" << vits.ToString() << ", ";
59   os << "matcha=" << matcha.ToString() << ", ";
60   os << "kokoro=" << kokoro.ToString() << ", ";
  61 + os << "kitten=" << kitten.ToString() << ", ";
62   os << "num_threads=" << num_threads << ", ";
63   os << "debug=" << (debug ? "True" : "False") << ", ";
64   os << "provider=\"" << provider << "\")";
@@ -7,6 +7,7 @@
7
8   #include <string>
9
  10 +#include "sherpa-onnx/csrc/offline-tts-kitten-model-config.h"
11   #include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
12   #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h"
13   #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"
@@ -18,6 +19,7 @@ struct OfflineTtsModelConfig {
19   OfflineTtsVitsModelConfig vits;
20   OfflineTtsMatchaModelConfig matcha;
21   OfflineTtsKokoroModelConfig kokoro;
  22 + OfflineTtsKittenModelConfig kitten;
23
24   int32_t num_threads = 1;
25   bool debug = false;
@@ -28,11 +30,13 @@ struct OfflineTtsModelConfig {
30   OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits,
31   const OfflineTtsMatchaModelConfig &matcha,
32   const OfflineTtsKokoroModelConfig &kokoro,
  33 + const OfflineTtsKittenModelConfig &kitten,
34   int32_t num_threads, bool debug,
35   const std::string &provider)
36   : vits(vits),
37   matcha(matcha),
38   kokoro(kokoro),
  39 + kitten(kitten),
40   num_threads(num_threads),
41   debug(debug),
42   provider(provider) {}
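A small sketch (not part of the PR) of how the extended OfflineTtsModelConfig is meant to be filled: exactly one of vits, matcha, kokoro or kitten should carry a model path, and Validate() dispatches accordingly.

// Sketch only: populate the kitten slot of the extended model config.
#include "sherpa-onnx/csrc/offline-tts-model-config.h"

sherpa_onnx::OfflineTtsModelConfig MakeKittenModelConfig() {
  sherpa_onnx::OfflineTtsModelConfig config;
  config.kitten.model = "./model.onnx";        // placeholder paths
  config.kitten.voices = "./voices.bin";
  config.kitten.tokens = "./tokens.txt";
  config.kitten.data_dir = "./espeak-ng-data";
  config.num_threads = 2;

  // Validate() reaches kitten.Validate() because kitten.model is set;
  // leaving every model field empty makes it log an error and return false.
  return config;
}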
@@ -180,7 +180,7 @@ static std::vector<int64_t> PiperPhonemesToIdsMatcha(
180   return ans;
181   }
182
183 - static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
  183 + static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoroOrKitten(
184   const std::unordered_map<char32_t, int32_t> &token2id,
185   const std::vector<piper::Phoneme> &phonemes, int32_t max_len) {
186   std::vector<std::vector<int64_t>> ans;
@@ -277,7 +277,6 @@ static std::vector<int64_t> CoquiPhonemesToIds(
277   void InitEspeak(const std::string &data_dir) {
278   static std::once_flag init_flag;
279   std::call_once(init_flag, [data_dir]() {
280 -
280   #if __ANDROID_API__ >= 9 || defined(__OHOS__)
281   if (data_dir[0] != '/') {
282   SHERPA_ONNX_LOGE(
@@ -358,6 +357,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
357   InitEspeak(data_dir);
358   }
359
  360 +PiperPhonemizeLexicon::PiperPhonemizeLexicon(
  361 + const std::string &tokens, const std::string &data_dir,
  362 + const OfflineTtsKittenModelMetaData &kitten_meta_data)
  363 + : kitten_meta_data_(kitten_meta_data), is_kitten_(true) {
  364 + {
  365 + std::ifstream is(tokens);
  366 + token2id_ = ReadTokens(is);
  367 + }
  368 +
  369 + InitEspeak(data_dir);
  370 +}
  371 +
372   template <typename Manager>
373   PiperPhonemizeLexicon::PiperPhonemizeLexicon(
374   Manager *mgr, const std::string &tokens, const std::string &data_dir,
@@ -392,13 +403,33 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
403   InitEspeak(data_dir);
404   }
405
  406 +template <typename Manager>
  407 +PiperPhonemizeLexicon::PiperPhonemizeLexicon(
  408 + Manager *mgr, const std::string &tokens, const std::string &data_dir,
  409 + const OfflineTtsKittenModelMetaData &kitten_meta_data)
  410 + : kitten_meta_data_(kitten_meta_data), is_kitten_(true) {
  411 + {
  412 + auto buf = ReadFile(mgr, tokens);
  413 + std::istrstream is(buf.data(), buf.size());
  414 + token2id_ = ReadTokens(is);
  415 + }
  416 +
  417 + // We should copy the directory of espeak-ng-data from the asset to
  418 + // some internal or external storage and then pass the directory to
  419 + // data_dir.
  420 + InitEspeak(data_dir);
  421 +}
  422 +
423   std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
424   const std::string &text, const std::string &voice /*= ""*/) const {
425   if (is_matcha_) {
426   return ConvertTextToTokenIdsMatcha(text, voice);
427   } else if (is_kokoro_) {
400 - return ConvertTextToTokenIdsKokoro(
  428 + return ConvertTextToTokenIdsKokoroOrKitten(
429   token2id_, kokoro_meta_data_.max_token_len, text, voice);
  430 + } else if (is_kitten_) {
  431 + return ConvertTextToTokenIdsKokoroOrKitten(
  432 + token2id_, kitten_meta_data_.max_token_len, text, voice);
433   } else {
434   return ConvertTextToTokenIdsVits(text, voice);
435   }
@@ -429,7 +460,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
460   return ans;
461   }
462
432 - std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
  463 + std::vector<TokenIDs> ConvertTextToTokenIdsKokoroOrKitten(
464   const std::unordered_map<char32_t, int32_t> &token2id,
465   int32_t max_token_len, const std::string &text,
466   const std::string &voice /*= ""*/) {
@@ -446,7 +477,8 @@ std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
477   std::vector<TokenIDs> ans;
478
479   for (const auto &p : phonemes) {
449 - auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len);
  480 + auto phoneme_ids =
  481 + PiperPhonemesToIdsKokoroOrKitten(token2id, p, max_token_len);
482
483   for (auto &ids : phoneme_ids) {
484   ans.emplace_back(std::move(ids));
@@ -10,6 +10,7 @@
10   #include <vector>
11
12   #include "sherpa-onnx/csrc/offline-tts-frontend.h"
  13 +#include "sherpa-onnx/csrc/offline-tts-kitten-model-meta-data.h"
14   #include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
15   #include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h"
16   #include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
@@ -27,6 +28,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
28   PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
29   const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
30
  31 + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
  32 + const OfflineTtsKittenModelMetaData &kitten_meta_data);
  33 +
34   template <typename Manager>
35   PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
36   const std::string &data_dir,
@@ -42,6 +46,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
46   const std::string &data_dir,
47   const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
48
  49 + template <typename Manager>
  50 + PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
  51 + const std::string &data_dir,
  52 + const OfflineTtsKittenModelMetaData &kitten_meta_data);
  53 +
54   std::vector<TokenIDs> ConvertTextToTokenIds(
55   const std::string &text, const std::string &voice = "") const override;
56
@@ -58,8 +67,10 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
67   OfflineTtsVitsModelMetaData vits_meta_data_;
68   OfflineTtsMatchaModelMetaData matcha_meta_data_;
69   OfflineTtsKokoroModelMetaData kokoro_meta_data_;
  70 + OfflineTtsKittenModelMetaData kitten_meta_data_;
71   bool is_matcha_ = false;
72   bool is_kokoro_ = false;
  73 + bool is_kitten_ = false;
74   };
75
76   } // namespace sherpa_onnx
@@ -101,6 +101,7 @@ or details.
101   float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
102
103   float rtf = elapsed_seconds / duration;
  104 + fprintf(stderr, "Number of threads: %d\n", config.model.num_threads);
105   fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
106   fprintf(stderr, "Audio duration: %.3f s\n", duration);
107   fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
@@ -67,6 +67,7 @@ endif()
67
68   if(SHERPA_ONNX_ENABLE_TTS)
69   list(APPEND srcs
  70 + offline-tts-kitten-model-config.cc
71   offline-tts-kokoro-model-config.cc
72   offline-tts-matcha-model-config.cc
73   offline-tts-model-config.cc
  1 +// sherpa-onnx/python/csrc/offline-tts-kitten-model-config.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/offline-tts-kitten-model-config.h"
  6 +
  7 +#include <string>
  8 +
  9 +#include "sherpa-onnx/csrc/offline-tts-kitten-model-config.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +void PybindOfflineTtsKittenModelConfig(py::module *m) {
  14 + using PyClass = OfflineTtsKittenModelConfig;
  15 +
  16 + py::class_<PyClass>(*m, "OfflineTtsKittenModelConfig")
  17 + .def(py::init<>())
  18 + .def(py::init<const std::string &, const std::string &,
  19 + const std::string &, const std::string &, float>(),
  20 + py::arg("model"), py::arg("voices"), py::arg("tokens"),
  21 + py::arg("data_dir"), py::arg("length_scale") = 1.0)
  22 + .def_readwrite("model", &PyClass::model)
  23 + .def_readwrite("voices", &PyClass::voices)
  24 + .def_readwrite("tokens", &PyClass::tokens)
  25 + .def_readwrite("data_dir", &PyClass::data_dir)
  26 + .def_readwrite("length_scale", &PyClass::length_scale)
  27 + .def("__str__", &PyClass::ToString)
  28 + .def("validate", &PyClass::Validate);
  29 +}
  30 +
  31 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/python/csrc/offline-tts-kitten-model-config.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_
  6 +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_
  7 +
  8 +#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
  9 +
  10 +namespace sherpa_onnx {
  11 +
  12 +void PybindOfflineTtsKittenModelConfig(py::module *m);
  13 +
  14 +}
  15 +
  16 +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_
@@ -7,6 +7,7 @@
7   #include <string>
8
9   #include "sherpa-onnx/csrc/offline-tts-model-config.h"
  10 +#include "sherpa-onnx/python/csrc/offline-tts-kitten-model-config.h"
11   #include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h"
12   #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h"
13   #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h"
@@ -17,6 +18,7 @@ void PybindOfflineTtsModelConfig(py::module *m) {
18   PybindOfflineTtsVitsModelConfig(m);
19   PybindOfflineTtsMatchaModelConfig(m);
20   PybindOfflineTtsKokoroModelConfig(m);
  21 + PybindOfflineTtsKittenModelConfig(m);
22
23   using PyClass = OfflineTtsModelConfig;
24
@@ -24,16 +26,19 @@ void PybindOfflineTtsModelConfig(py::module *m) {
26   .def(py::init<>())
27   .def(py::init<const OfflineTtsVitsModelConfig &,
28   const OfflineTtsMatchaModelConfig &,
27 - const OfflineTtsKokoroModelConfig &, int32_t, bool,
  29 + const OfflineTtsKokoroModelConfig &,
  30 + const OfflineTtsKittenModelConfig &, int32_t, bool,
31   const std::string &>(),
32   py::arg("vits") = OfflineTtsVitsModelConfig{},
33   py::arg("matcha") = OfflineTtsMatchaModelConfig{},
34   py::arg("kokoro") = OfflineTtsKokoroModelConfig{},
  35 + py::arg("kitten") = OfflineTtsKittenModelConfig{},
36   py::arg("num_threads") = 1, py::arg("debug") = false,
37   py::arg("provider") = "cpu")
38   .def_readwrite("vits", &PyClass::vits)
39   .def_readwrite("matcha", &PyClass::matcha)
40   .def_readwrite("kokoro", &PyClass::kokoro)
  41 + .def_readwrite("kitten", &PyClass::kitten)
42   .def_readwrite("num_threads", &PyClass::num_threads)
43   .def_readwrite("debug", &PyClass::debug)
44   .def_readwrite("provider", &PyClass::provider)