Showing 22 changed files with 1,027 additions and 14 deletions
@@ -192,6 +192,8 @@ if(SHERPA_ONNX_ENABLE_TTS)
     offline-tts-character-frontend.cc
     offline-tts-frontend.cc
     offline-tts-impl.cc
+    offline-tts-kitten-model-config.cc
+    offline-tts-kitten-model.cc
     offline-tts-kokoro-model-config.cc
     offline-tts-kokoro-model.cc
     offline-tts-matcha-model-config.cc
@@ -260,7 +260,7 @@ class KokoroMultiLangLexicon::Impl {
 
   std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
       const std::string &text, const std::string &voice) const {
-    auto temp = ConvertTextToTokenIdsKokoro(
+    auto temp = ConvertTextToTokenIdsKokoroOrKitten(
        phoneme2id_, meta_data_.max_token_len, text, voice);
    std::vector<std::vector<int32_t>> ans;
    ans.reserve(temp.size());
@@ -59,7 +59,7 @@ class OfflineTtsFrontend {
 void InitEspeak(const std::string &data_dir);
 
 // implementation in ./piper-phonemize-lexicon.cc
-std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
+std::vector<TokenIDs> ConvertTextToTokenIdsKokoroOrKitten(
     const std::unordered_map<char32_t, int32_t> &token2id,
     int32_t max_token_len, const std::string &text,
     const std::string &voice = "");
@@ -16,6 +16,7 @@
 #include "rawfile/raw_file_manager.h"
 #endif
 
+#include "sherpa-onnx/csrc/offline-tts-kitten-impl.h"
 #include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h"
 #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
 #include "sherpa-onnx/csrc/offline-tts-vits-impl.h"
@@ -40,9 +41,15 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
     return std::make_unique<OfflineTtsVitsImpl>(config);
   } else if (!config.model.matcha.acoustic_model.empty()) {
     return std::make_unique<OfflineTtsMatchaImpl>(config);
+  } else if (!config.model.kokoro.model.empty()) {
+    return std::make_unique<OfflineTtsKokoroImpl>(config);
+  } else if (!config.model.kitten.model.empty()) {
+    return std::make_unique<OfflineTtsKittenImpl>(config);
   }
 
-  return std::make_unique<OfflineTtsKokoroImpl>(config);
+  SHERPA_ONNX_LOGE("Please provide a tts model.");
+
+  return {};
 }
 
 template <typename Manager>
@@ -52,9 +59,14 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
     return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
   } else if (!config.model.matcha.acoustic_model.empty()) {
     return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
+  } else if (!config.model.kokoro.model.empty()) {
+    return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
+  } else if (!config.model.kitten.model.empty()) {
+    return std::make_unique<OfflineTtsKittenImpl>(mgr, config);
   }
 
-  return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
+  SHERPA_ONNX_LOGE("Please provide a tts model.");
+  return {};
 }
 
 #if __ANDROID_API__ >= 9
sherpa-onnx/csrc/offline-tts-kitten-impl.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kitten-impl.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_IMPL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_IMPL_H_ | ||
| 6 | + | ||
| 7 | +#include <iomanip> | ||
| 8 | +#include <ios> | ||
| 9 | +#include <memory> | ||
| 10 | +#include <string> | ||
| 11 | +#include <strstream> | ||
| 12 | +#include <utility> | ||
| 13 | +#include <vector> | ||
| 14 | + | ||
| 15 | +#include "fst/extensions/far/far.h" | ||
| 16 | +#include "kaldifst/csrc/kaldi-fst-io.h" | ||
| 17 | +#include "kaldifst/csrc/text-normalizer.h" | ||
| 18 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 19 | +#include "sherpa-onnx/csrc/lexicon.h" | ||
| 20 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 21 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 22 | +#include "sherpa-onnx/csrc/offline-tts-impl.h" | ||
| 23 | +#include "sherpa-onnx/csrc/offline-tts-kitten-model.h" | ||
| 24 | +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" | ||
| 25 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 26 | + | ||
| 27 | +namespace sherpa_onnx { | ||
| 28 | + | ||
| 29 | +class OfflineTtsKittenImpl : public OfflineTtsImpl { | ||
| 30 | + public: | ||
| 31 | + explicit OfflineTtsKittenImpl(const OfflineTtsConfig &config) | ||
| 32 | + : config_(config), | ||
| 33 | + model_(std::make_unique<OfflineTtsKittenModel>(config.model)) { | ||
| 34 | + InitFrontend(); | ||
| 35 | + | ||
| 36 | + if (!config.rule_fsts.empty()) { | ||
| 37 | + std::vector<std::string> files; | ||
| 38 | + SplitStringToVector(config.rule_fsts, ",", false, &files); | ||
| 39 | + tn_list_.reserve(files.size()); | ||
| 40 | + for (const auto &f : files) { | ||
| 41 | + if (config.model.debug) { | ||
| 42 | +#if __OHOS__ | ||
| 43 | + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); | ||
| 44 | +#else | ||
| 45 | + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); | ||
| 46 | +#endif | ||
| 47 | + } | ||
| 48 | + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f)); | ||
| 49 | + } | ||
| 50 | + } | ||
| 51 | + | ||
| 52 | + if (!config.rule_fars.empty()) { | ||
| 53 | + if (config.model.debug) { | ||
| 54 | + SHERPA_ONNX_LOGE("Loading FST archives"); | ||
| 55 | + } | ||
| 56 | + std::vector<std::string> files; | ||
| 57 | + SplitStringToVector(config.rule_fars, ",", false, &files); | ||
| 58 | + | ||
| 59 | + tn_list_.reserve(files.size() + tn_list_.size()); | ||
| 60 | + | ||
| 61 | + for (const auto &f : files) { | ||
| 62 | + if (config.model.debug) { | ||
| 63 | +#if __OHOS__ | ||
| 64 | + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); | ||
| 65 | +#else | ||
| 66 | + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); | ||
| 67 | +#endif | ||
| 68 | + } | ||
| 69 | + std::unique_ptr<fst::FarReader<fst::StdArc>> reader( | ||
| 70 | + fst::FarReader<fst::StdArc>::Open(f)); | ||
| 71 | + for (; !reader->Done(); reader->Next()) { | ||
| 72 | + std::unique_ptr<fst::StdConstFst> r( | ||
| 73 | + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); | ||
| 74 | + | ||
| 75 | + tn_list_.push_back( | ||
| 76 | + std::make_unique<kaldifst::TextNormalizer>(std::move(r))); | ||
| 77 | + } | ||
| 78 | + } | ||
| 79 | + | ||
| 80 | + if (config.model.debug) { | ||
| 81 | + SHERPA_ONNX_LOGE("FST archives loaded!"); | ||
| 82 | + } | ||
| 83 | + } | ||
| 84 | + } | ||
| 85 | + | ||
| 86 | + template <typename Manager> | ||
| 87 | + OfflineTtsKittenImpl(Manager *mgr, const OfflineTtsConfig &config) | ||
| 88 | + : config_(config), | ||
| 89 | + model_(std::make_unique<OfflineTtsKittenModel>(mgr, config.model)) { | ||
| 90 | + InitFrontend(mgr); | ||
| 91 | + | ||
| 92 | + if (!config.rule_fsts.empty()) { | ||
| 93 | + std::vector<std::string> files; | ||
| 94 | + SplitStringToVector(config.rule_fsts, ",", false, &files); | ||
| 95 | + tn_list_.reserve(files.size()); | ||
| 96 | + for (const auto &f : files) { | ||
| 97 | + if (config.model.debug) { | ||
| 98 | +#if __OHOS__ | ||
| 99 | + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); | ||
| 100 | +#else | ||
| 101 | + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); | ||
| 102 | +#endif | ||
| 103 | + } | ||
| 104 | + auto buf = ReadFile(mgr, f); | ||
| 105 | + std::istrstream is(buf.data(), buf.size()); | ||
| 106 | + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is)); | ||
| 107 | + } | ||
| 108 | + } | ||
| 109 | + | ||
| 110 | + if (!config.rule_fars.empty()) { | ||
| 111 | + std::vector<std::string> files; | ||
| 112 | + SplitStringToVector(config.rule_fars, ",", false, &files); | ||
| 113 | + tn_list_.reserve(files.size() + tn_list_.size()); | ||
| 114 | + | ||
| 115 | + for (const auto &f : files) { | ||
| 116 | + if (config.model.debug) { | ||
| 117 | +#if __OHOS__ | ||
| 118 | + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); | ||
| 119 | +#else | ||
| 120 | + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); | ||
| 121 | +#endif | ||
| 122 | + } | ||
| 123 | + | ||
| 124 | + auto buf = ReadFile(mgr, f); | ||
| 125 | + | ||
| 126 | + std::unique_ptr<std::istream> s( | ||
| 127 | + new std::istrstream(buf.data(), buf.size())); | ||
| 128 | + | ||
| 129 | + std::unique_ptr<fst::FarReader<fst::StdArc>> reader( | ||
| 130 | + fst::FarReader<fst::StdArc>::Open(std::move(s))); | ||
| 131 | + | ||
| 132 | + for (; !reader->Done(); reader->Next()) { | ||
| 133 | + std::unique_ptr<fst::StdConstFst> r( | ||
| 134 | + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); | ||
| 135 | + | ||
| 136 | + tn_list_.push_back( | ||
| 137 | + std::make_unique<kaldifst::TextNormalizer>(std::move(r))); | ||
| 138 | + } // for (; !reader->Done(); reader->Next()) | ||
| 139 | + } // for (const auto &f : files) | ||
| 140 | + } // if (!config.rule_fars.empty()) | ||
| 141 | + } | ||
| 142 | + | ||
| 143 | + int32_t SampleRate() const override { | ||
| 144 | + return model_->GetMetaData().sample_rate; | ||
| 145 | + } | ||
| 146 | + | ||
| 147 | + int32_t NumSpeakers() const override { | ||
| 148 | + return model_->GetMetaData().num_speakers; | ||
| 149 | + } | ||
| 150 | + | ||
| 151 | + GeneratedAudio Generate( | ||
| 152 | + const std::string &_text, int64_t sid = 0, float speed = 1.0, | ||
| 153 | + GeneratedAudioCallback callback = nullptr) const override { | ||
| 154 | + const auto &meta_data = model_->GetMetaData(); | ||
| 155 | + int32_t num_speakers = meta_data.num_speakers; | ||
| 156 | + | ||
| 157 | + if (num_speakers == 0 && sid != 0) { | ||
| 158 | +#if __OHOS__ | ||
| 159 | + SHERPA_ONNX_LOGE( | ||
| 160 | + "This is a single-speaker model and supports only sid 0. Given sid: " | ||
| 161 | + "%{public}d. sid is ignored", | ||
| 162 | + static_cast<int32_t>(sid)); | ||
| 163 | +#else | ||
| 164 | + SHERPA_ONNX_LOGE( | ||
| 165 | + "This is a single-speaker model and supports only sid 0. Given sid: " | ||
| 166 | + "%d. sid is ignored", | ||
| 167 | + static_cast<int32_t>(sid)); | ||
| 168 | +#endif | ||
| 169 | + } | ||
| 170 | + | ||
| 171 | + if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) { | ||
| 172 | +#if __OHOS__ | ||
| 173 | + SHERPA_ONNX_LOGE( | ||
| 174 | + "This model contains only %{public}d speakers. sid should be in the " | ||
| 175 | + "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0", | ||
| 176 | + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | ||
| 177 | +#else | ||
| 178 | + SHERPA_ONNX_LOGE( | ||
| 179 | + "This model contains only %d speakers. sid should be in the range " | ||
| 180 | + "[%d, %d]. Given: %d. Use sid=0", | ||
| 181 | + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | ||
| 182 | +#endif | ||
| 183 | + sid = 0; | ||
| 184 | + } | ||
| 185 | + | ||
| 186 | + std::string text = _text; | ||
| 187 | + if (config_.model.debug) { | ||
| 188 | +#if __OHOS__ | ||
| 189 | + SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str()); | ||
| 190 | +#else | ||
| 191 | + SHERPA_ONNX_LOGE("Raw text: %s", text.c_str()); | ||
| 192 | +#endif | ||
| 193 | + std::ostringstream os; | ||
| 194 | + os << "In bytes (hex):\n"; | ||
| 195 | + const auto p = reinterpret_cast<const uint8_t *>(text.c_str()); | ||
| 196 | + for (int32_t i = 0; i != text.size(); ++i) { | ||
| 197 | + os << std::setw(2) << std::setfill('0') << std::hex | ||
| 198 | + << static_cast<uint32_t>(p[i]) << " "; | ||
| 199 | + } | ||
| 200 | + os << "\n"; | ||
| 201 | + | ||
| 202 | +#if __OHOS__ | ||
| 203 | + SHERPA_ONNX_LOGE("%{public}s", os.str().c_str()); | ||
| 204 | +#else | ||
| 205 | + SHERPA_ONNX_LOGE("%s", os.str().c_str()); | ||
| 206 | +#endif | ||
| 207 | + } | ||
| 208 | + | ||
| 209 | + if (!tn_list_.empty()) { | ||
| 210 | + for (const auto &tn : tn_list_) { | ||
| 211 | + text = tn->Normalize(text); | ||
| 212 | + if (config_.model.debug) { | ||
| 213 | +#if __OHOS__ | ||
| 214 | + SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str()); | ||
| 215 | +#else | ||
| 216 | + SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str()); | ||
| 217 | +#endif | ||
| 218 | + } | ||
| 219 | + } | ||
| 220 | + } | ||
| 221 | + | ||
| 222 | + std::vector<TokenIDs> token_ids = | ||
| 223 | + frontend_->ConvertTextToTokenIds(text, meta_data.voice); | ||
| 224 | + | ||
| 225 | + if (token_ids.empty() || | ||
| 226 | + (token_ids.size() == 1 && token_ids[0].tokens.empty())) { | ||
| 227 | +#if __OHOS__ | ||
| 228 | + SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs", | ||
| 229 | + text.c_str()); | ||
| 230 | +#else | ||
| 231 | + SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str()); | ||
| 232 | +#endif | ||
| 233 | + return {}; | ||
| 234 | + } | ||
| 235 | + | ||
| 236 | + std::vector<std::vector<int64_t>> x; | ||
| 237 | + | ||
| 238 | + x.reserve(token_ids.size()); | ||
| 239 | + | ||
| 240 | + for (auto &i : token_ids) { | ||
| 241 | + x.push_back(std::move(i.tokens)); | ||
| 242 | + } | ||
| 243 | + | ||
| 244 | + int32_t x_size = static_cast<int32_t>(x.size()); | ||
| 245 | + | ||
| 246 | + if (config_.max_num_sentences != 1) { | ||
| 247 | +#if __OHOS__ | ||
| 248 | + SHERPA_ONNX_LOGE( | ||
| 249 | + "max_num_sentences (%{public}d) != 1 is ignored for Kitten TTS " | ||
| 250 | + "models", | ||
| 251 | + config_.max_num_sentences); | ||
| 252 | +#else | ||
| 253 | + SHERPA_ONNX_LOGE( | ||
| 254 | + "max_num_sentences (%d) != 1 is ignored for Kitten TTS models", | ||
| 255 | + config_.max_num_sentences); | ||
| 256 | +#endif | ||
| 257 | + } | ||
| 258 | + | ||
| 259 | + // If the input text is too long, we process its sentences in batches | ||
| 260 | + // to avoid OOM. For Kitten TTS models the batch size is fixed to 1. | ||
| 261 | + std::vector<std::vector<int64_t>> batch_x; | ||
| 262 | + | ||
| 263 | + int32_t batch_size = 1; | ||
| 264 | + batch_x.reserve(batch_size); | ||
| 265 | + int32_t num_batches = x_size / batch_size; | ||
| 266 | + | ||
| 267 | + if (config_.model.debug) { | ||
| 268 | +#if __OHOS__ | ||
| 269 | + SHERPA_ONNX_LOGE( | ||
| 270 | + "Split it into %{public}d batches. batch size: " | ||
| 271 | + "%{public}d. Number of sentences: %{public}d", | ||
| 272 | + num_batches, batch_size, x_size); | ||
| 273 | +#else | ||
| 274 | + SHERPA_ONNX_LOGE( | ||
| 275 | + "Split it into %d batches. batch size: %d. Number " | ||
| 276 | + "of sentences: %d", | ||
| 277 | + num_batches, batch_size, x_size); | ||
| 278 | +#endif | ||
| 279 | + } | ||
| 280 | + | ||
| 281 | + GeneratedAudio ans; | ||
| 282 | + | ||
| 283 | + int32_t should_continue = 1; | ||
| 284 | + | ||
| 285 | + int32_t k = 0; | ||
| 286 | + | ||
| 287 | + for (int32_t b = 0; b != num_batches && should_continue; ++b) { | ||
| 288 | + batch_x.clear(); | ||
| 289 | + for (int32_t i = 0; i != batch_size; ++i, ++k) { | ||
| 290 | + batch_x.push_back(std::move(x[k])); | ||
| 291 | + } | ||
| 292 | + | ||
| 293 | + auto audio = Process(batch_x, sid, speed); | ||
| 294 | + ans.sample_rate = audio.sample_rate; | ||
| 295 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 296 | + audio.samples.end()); | ||
| 297 | + if (callback) { | ||
| 298 | + should_continue = callback(audio.samples.data(), audio.samples.size(), | ||
| 299 | + (b + 1) * 1.0 / num_batches); | ||
| 300 | + // Caution(fangjun): audio is freed when the callback returns, so users | ||
| 301 | + // should copy the data if they want to access the data after | ||
| 302 | + // the callback returns to avoid segmentation fault. | ||
| 303 | + } | ||
| 304 | + } | ||
| 305 | + | ||
| 306 | + batch_x.clear(); | ||
| 307 | + while (k < static_cast<int32_t>(x.size()) && should_continue) { | ||
| 308 | + batch_x.push_back(std::move(x[k])); | ||
| 309 | + | ||
| 310 | + ++k; | ||
| 311 | + } | ||
| 312 | + | ||
| 313 | + if (!batch_x.empty()) { | ||
| 314 | + auto audio = Process(batch_x, sid, speed); | ||
| 315 | + ans.sample_rate = audio.sample_rate; | ||
| 316 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 317 | + audio.samples.end()); | ||
| 318 | + if (callback) { | ||
| 319 | + callback(audio.samples.data(), audio.samples.size(), 1.0); | ||
| 320 | + // Caution(fangjun): audio is freed when the callback returns, so users | ||
| 321 | + // should copy the data if they want to access the data after | ||
| 322 | + // the callback returns to avoid segmentation fault. | ||
| 323 | + } | ||
| 324 | + } | ||
| 325 | + | ||
| 326 | + return ans; | ||
| 327 | + } | ||
| 328 | + | ||
| 329 | + private: | ||
| 330 | + template <typename Manager> | ||
| 331 | + void InitFrontend(Manager *mgr) { | ||
| 332 | + const auto &meta_data = model_->GetMetaData(); | ||
| 333 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 334 | + mgr, config_.model.kitten.tokens, config_.model.kitten.data_dir, | ||
| 335 | + meta_data); | ||
| 336 | + } | ||
| 337 | + | ||
| 338 | + void InitFrontend() { | ||
| 339 | + const auto &meta_data = model_->GetMetaData(); | ||
| 340 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 341 | + config_.model.kitten.tokens, config_.model.kitten.data_dir, meta_data); | ||
| 342 | + } | ||
| 343 | + | ||
| 344 | + GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, | ||
| 345 | + int32_t sid, float speed) const { | ||
| 346 | + int32_t num_tokens = 0; | ||
| 347 | + for (const auto &k : tokens) { | ||
| 348 | + num_tokens += k.size(); | ||
| 349 | + } | ||
| 350 | + | ||
| 351 | + std::vector<int64_t> x; | ||
| 352 | + x.reserve(num_tokens); | ||
| 353 | + for (const auto &k : tokens) { | ||
| 354 | + x.insert(x.end(), k.begin(), k.end()); | ||
| 355 | + } | ||
| 356 | + | ||
| 357 | + auto memory_info = | ||
| 358 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 359 | + | ||
| 360 | + std::array<int64_t, 2> x_shape = {1, static_cast<int32_t>(x.size())}; | ||
| 361 | + Ort::Value x_tensor = Ort::Value::CreateTensor( | ||
| 362 | + memory_info, x.data(), x.size(), x_shape.data(), x_shape.size()); | ||
| 363 | + | ||
| 364 | + Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed); | ||
| 365 | + | ||
| 366 | + std::vector<int64_t> audio_shape = | ||
| 367 | + audio.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 368 | + | ||
| 369 | + int64_t total = 1; | ||
| 370 | + // The output shape may be (1, 1, total) or (1, total) or (total,) | ||
| 371 | + for (auto i : audio_shape) { | ||
| 372 | + total *= i; | ||
| 373 | + } | ||
| 374 | + | ||
| 375 | + const float *p = audio.GetTensorData<float>(); | ||
| 376 | + | ||
| 377 | + GeneratedAudio ans; | ||
| 378 | + ans.sample_rate = model_->GetMetaData().sample_rate; | ||
| 379 | + ans.samples = std::vector<float>(p, p + total); | ||
| 380 | + | ||
| 381 | + float silence_scale = config_.silence_scale; | ||
| 382 | + if (silence_scale != 1) { | ||
| 383 | + ans = ans.ScaleSilence(silence_scale); | ||
| 384 | + } | ||
| 385 | + | ||
| 386 | + return ans; | ||
| 387 | + } | ||
| 388 | + | ||
| 389 | + private: | ||
| 390 | + OfflineTtsConfig config_; | ||
| 391 | + std::unique_ptr<OfflineTtsKittenModel> model_; | ||
| 392 | + std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_; | ||
| 393 | + std::unique_ptr<OfflineTtsFrontend> frontend_; | ||
| 394 | +}; | ||
| 395 | + | ||
| 396 | +} // namespace sherpa_onnx | ||
| 397 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_IMPL_H_ |
sherpa-onnx/csrc/offline-tts-kitten-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kitten-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-kitten-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 10 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 11 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 12 | + | ||
| 13 | +namespace sherpa_onnx { | ||
| 14 | + | ||
| 15 | +void OfflineTtsKittenModelConfig::Register(ParseOptions *po) { | ||
| 16 | + po->Register("kitten-model", &model, "Path to kitten model"); | ||
| 17 | + po->Register("kitten-voices", &voices, | ||
| 18 | + "Path to voices.bin for kitten models"); | ||
| 19 | + po->Register("kitten-tokens", &tokens, | ||
| 20 | + "Path to tokens.txt for kitten models"); | ||
| 21 | + po->Register("kitten-data-dir", &data_dir, | ||
| 22 | + "Path to the directory containing dict for espeak-ng."); | ||
| 23 | + po->Register("kitten-length-scale", &length_scale, | ||
| 24 | + "Inverse of speech speed. Larger->Slower; Smaller->faster."); | ||
| 25 | +} | ||
| 26 | + | ||
| 27 | +bool OfflineTtsKittenModelConfig::Validate() const { | ||
| 28 | + if (model.empty()) { | ||
| 29 | + SHERPA_ONNX_LOGE("Please provide --kitten-model"); | ||
| 30 | + return false; | ||
| 31 | + } | ||
| 32 | + | ||
| 33 | + if (!FileExists(model)) { | ||
| 34 | + SHERPA_ONNX_LOGE("--kitten-model: '%s' does not exist", model.c_str()); | ||
| 35 | + return false; | ||
| 36 | + } | ||
| 37 | + | ||
| 38 | + if (voices.empty()) { | ||
| 39 | + SHERPA_ONNX_LOGE("Please provide --kitten-voices"); | ||
| 40 | + return false; | ||
| 41 | + } | ||
| 42 | + | ||
| 43 | + if (!FileExists(voices)) { | ||
| 44 | + SHERPA_ONNX_LOGE("--kitten-voices: '%s' does not exist", voices.c_str()); | ||
| 45 | + return false; | ||
| 46 | + } | ||
| 47 | + | ||
| 48 | + if (tokens.empty()) { | ||
| 49 | + SHERPA_ONNX_LOGE("Please provide --kitten-tokens"); | ||
| 50 | + return false; | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | + if (!FileExists(tokens)) { | ||
| 54 | + SHERPA_ONNX_LOGE("--kitten-tokens: '%s' does not exist", tokens.c_str()); | ||
| 55 | + return false; | ||
| 56 | + } | ||
| 57 | + | ||
| 58 | + if (data_dir.empty()) { | ||
| 59 | + SHERPA_ONNX_LOGE("Please provide --kitten-data-dir"); | ||
| 60 | + return false; | ||
| 61 | + } | ||
| 62 | + | ||
| 63 | + if (!FileExists(data_dir + "/phontab")) { | ||
| 64 | + SHERPA_ONNX_LOGE( | ||
| 65 | + "'%s/phontab' does not exist. Please check --kitten-data-dir", | ||
| 66 | + data_dir.c_str()); | ||
| 67 | + return false; | ||
| 68 | + } | ||
| 69 | + | ||
| 70 | + if (!FileExists(data_dir + "/phonindex")) { | ||
| 71 | + SHERPA_ONNX_LOGE( | ||
| 72 | + "'%s/phonindex' does not exist. Please check --kitten-data-dir", | ||
| 73 | + data_dir.c_str()); | ||
| 74 | + return false; | ||
| 75 | + } | ||
| 76 | + | ||
| 77 | + if (!FileExists(data_dir + "/phondata")) { | ||
| 78 | + SHERPA_ONNX_LOGE( | ||
| 79 | + "'%s/phondata' does not exist. Please check --kitten-data-dir", | ||
| 80 | + data_dir.c_str()); | ||
| 81 | + return false; | ||
| 82 | + } | ||
| 83 | + | ||
| 84 | + if (!FileExists(data_dir + "/intonations")) { | ||
| 85 | + SHERPA_ONNX_LOGE( | ||
| 86 | + "'%s/intonations' does not exist. Please check --kitten-data-dir", | ||
| 87 | + data_dir.c_str()); | ||
| 88 | + return false; | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + if (length_scale <= 0) { | ||
| 92 | + SHERPA_ONNX_LOGE( | ||
| 93 | + "Please provide a positive length_scale for --kitten-length-scale. " | ||
| 94 | + "Given: %.3f", | ||
| 95 | + length_scale); | ||
| 96 | + return false; | ||
| 97 | + } | ||
| 98 | + | ||
| 99 | + return true; | ||
| 100 | +} | ||
| 101 | + | ||
| 102 | +std::string OfflineTtsKittenModelConfig::ToString() const { | ||
| 103 | + std::ostringstream os; | ||
| 104 | + | ||
| 105 | + os << "OfflineTtsKittenModelConfig("; | ||
| 106 | + os << "model=\"" << model << "\", "; | ||
| 107 | + os << "voices=\"" << voices << "\", "; | ||
| 108 | + os << "tokens=\"" << tokens << "\", "; | ||
| 109 | + os << "data_dir=\"" << data_dir << "\", "; | ||
| 110 | + os << "length_scale=" << length_scale << ")"; | ||
| 111 | + | ||
| 112 | + return os.str(); | ||
| 113 | +} | ||
| 114 | + | ||
| 115 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/offline-tts-kitten-model-config.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kitten-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include <string> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +struct OfflineTtsKittenModelConfig { | ||
| 15 | + std::string model; | ||
| 16 | + std::string voices; | ||
| 17 | + std::string tokens; | ||
| 18 | + | ||
| 19 | + std::string data_dir; | ||
| 20 | + // speed = 1 / length_scale | ||
| 21 | + float length_scale = 1.0; | ||
| 22 | + | ||
| 23 | + OfflineTtsKittenModelConfig() = default; | ||
| 24 | + | ||
| 25 | + OfflineTtsKittenModelConfig(const std::string &model, | ||
| 26 | + const std::string &voices, | ||
| 27 | + const std::string &tokens, | ||
| 28 | + const std::string &data_dir, float length_scale) | ||
| 29 | + : model(model), | ||
| 30 | + voices(voices), | ||
| 31 | + tokens(tokens), | ||
| 32 | + data_dir(data_dir), | ||
| 33 | + length_scale(length_scale) {} | ||
| 34 | + | ||
| 35 | + void Register(ParseOptions *po); | ||
| 36 | + bool Validate() const; | ||
| 37 | + | ||
| 38 | + std::string ToString() const; | ||
| 39 | +}; | ||
| 40 | + | ||
| 41 | +} // namespace sherpa_onnx | ||
| 42 | + | ||
| 43 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_ |
sherpa-onnx/csrc/offline-tts-kitten-model-meta-data.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kitten-model-meta-data.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_META_DATA_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_META_DATA_H_ | ||
| 7 | + | ||
| 8 | +#include <cstdint> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +// please refer to | ||
| 14 | +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kitten-tts/nano_v0_1/add_meta_data.py | ||
| 15 | +struct OfflineTtsKittenModelMetaData { | ||
| 16 | + int32_t sample_rate = 0; | ||
| 17 | + int32_t num_speakers = 0; | ||
| 18 | + int32_t version = 1; | ||
| 19 | + int32_t has_espeak = 1; | ||
| 20 | + | ||
| 21 | + int32_t max_token_len = 256; | ||
| 22 | + | ||
| 23 | + std::string voice; | ||
| 24 | +}; | ||
| 25 | + | ||
| 26 | +} // namespace sherpa_onnx | ||
| 27 | + | ||
| 28 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_META_DATA_H_ |
sherpa-onnx/csrc/offline-tts-kitten-model.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kitten-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-kitten-model.h" | ||
| 6 | + | ||
| 7 | +#include <algorithm> | ||
| 8 | +#include <string> | ||
| 9 | +#include <utility> | ||
| 10 | +#include <vector> | ||
| 11 | + | ||
| 12 | +#if __ANDROID_API__ >= 9 | ||
| 13 | +#include "android/asset_manager.h" | ||
| 14 | +#include "android/asset_manager_jni.h" | ||
| 15 | +#endif | ||
| 16 | + | ||
| 17 | +#if __OHOS__ | ||
| 18 | +#include "rawfile/raw_file_manager.h" | ||
| 19 | +#endif | ||
| 20 | + | ||
| 21 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 22 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 23 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 24 | +#include "sherpa-onnx/csrc/session.h" | ||
| 25 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 26 | + | ||
| 27 | +namespace sherpa_onnx { | ||
| 28 | + | ||
| 29 | +class OfflineTtsKittenModel::Impl { | ||
| 30 | + public: | ||
| 31 | + explicit Impl(const OfflineTtsModelConfig &config) | ||
| 32 | + : config_(config), | ||
| 33 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 34 | + sess_opts_(GetSessionOptions(config)), | ||
| 35 | + allocator_{} { | ||
| 36 | + auto model_buf = ReadFile(config.kitten.model); | ||
| 37 | + auto voices_buf = ReadFile(config.kitten.voices); | ||
| 38 | + Init(model_buf.data(), model_buf.size(), voices_buf.data(), | ||
| 39 | + voices_buf.size()); | ||
| 40 | + } | ||
| 41 | + | ||
| 42 | + template <typename Manager> | ||
| 43 | + Impl(Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 44 | + : config_(config), | ||
| 45 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 46 | + sess_opts_(GetSessionOptions(config)), | ||
| 47 | + allocator_{} { | ||
| 48 | + auto model_buf = ReadFile(mgr, config.kitten.model); | ||
| 49 | + auto voices_buf = ReadFile(mgr, config.kitten.voices); | ||
| 50 | + Init(model_buf.data(), model_buf.size(), voices_buf.data(), | ||
| 51 | + voices_buf.size()); | ||
| 52 | + } | ||
| 53 | + | ||
| 54 | + const OfflineTtsKittenModelMetaData &GetMetaData() const { | ||
| 55 | + return meta_data_; | ||
| 56 | + } | ||
| 57 | + | ||
| 58 | + Ort::Value Run(Ort::Value x, int32_t sid, float speed) { | ||
| 59 | + auto memory_info = | ||
| 60 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 61 | + | ||
| 62 | + std::vector<int64_t> x_shape = x.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 63 | + if (x_shape[0] != 1) { | ||
| 64 | + SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d", | ||
| 65 | + static_cast<int32_t>(x_shape[0])); | ||
| 66 | + SHERPA_ONNX_EXIT(-1); | ||
| 67 | + } | ||
| 68 | + | ||
| 69 | + int32_t num_speakers = meta_data_.num_speakers; | ||
| 70 | + int32_t dim1 = style_dim_[1]; | ||
| 71 | + | ||
| 72 | + /*const*/ float *p = styles_.data() + sid * dim1; | ||
| 73 | + | ||
| 74 | + std::array<int64_t, 2> style_embedding_shape = {1, dim1}; | ||
| 75 | + Ort::Value style_embedding = Ort::Value::CreateTensor( | ||
| 76 | + memory_info, p, dim1, style_embedding_shape.data(), | ||
| 77 | + style_embedding_shape.size()); | ||
| 78 | + | ||
| 79 | + int64_t speed_shape = 1; | ||
| 80 | + if (config_.kitten.length_scale != 1 && speed == 1) { | ||
| 81 | + speed = 1. / config_.kitten.length_scale; | ||
| 82 | + } | ||
| 83 | + | ||
| 84 | + Ort::Value speed_tensor = | ||
| 85 | + Ort::Value::CreateTensor(memory_info, &speed, 1, &speed_shape, 1); | ||
| 86 | + | ||
| 87 | + std::array<Ort::Value, 3> inputs = { | ||
| 88 | + std::move(x), std::move(style_embedding), std::move(speed_tensor)}; | ||
| 89 | + | ||
| 90 | + auto out = | ||
| 91 | + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), | ||
| 92 | + output_names_ptr_.data(), output_names_ptr_.size()); | ||
| 93 | + | ||
| 94 | + return std::move(out[0]); | ||
| 95 | + } | ||
| 96 | + | ||
| 97 | + private: | ||
| 98 | + void Init(void *model_data, size_t model_data_length, const char *voices_data, | ||
| 99 | + size_t voices_data_length) { | ||
| 100 | + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | ||
| 101 | + sess_opts_); | ||
| 102 | + | ||
| 103 | + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); | ||
| 104 | + | ||
| 105 | + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); | ||
| 106 | + // get meta data | ||
| 107 | + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); | ||
| 108 | + if (config_.debug) { | ||
| 109 | + std::ostringstream os; | ||
| 110 | + os << "---kitten model---\n"; | ||
| 111 | + PrintModelMetadata(os, meta_data); | ||
| 112 | + | ||
| 113 | + os << "----------input names----------\n"; | ||
| 114 | + int32_t i = 0; | ||
| 115 | + for (const auto &s : input_names_) { | ||
| 116 | + os << i << " " << s << "\n"; | ||
| 117 | + ++i; | ||
| 118 | + } | ||
| 119 | + os << "----------output names----------\n"; | ||
| 120 | + i = 0; | ||
| 121 | + for (const auto &s : output_names_) { | ||
| 122 | + os << i << " " << s << "\n"; | ||
| 123 | + ++i; | ||
| 124 | + } | ||
| 125 | + | ||
| 126 | +#if __OHOS__ | ||
| 127 | + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); | ||
| 128 | +#else | ||
| 129 | + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); | ||
| 130 | +#endif | ||
| 131 | + } | ||
| 132 | + | ||
| 133 | + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below | ||
| 134 | + | ||
| 135 | + std::string model_type; | ||
| 136 | + SHERPA_ONNX_READ_META_DATA_STR(model_type, "model_type"); | ||
| 137 | + if (model_type != "kitten-tts") { | ||
| 138 | + SHERPA_ONNX_LOGE( | ||
| 139 | + "Please download the kitten tts model from us containing meta data"); | ||
| 140 | + SHERPA_ONNX_EXIT(-1); | ||
| 141 | + } | ||
| 142 | + | ||
| 143 | + SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate"); | ||
| 144 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); | ||
| 145 | + SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); | ||
| 146 | + SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); | ||
| 147 | + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice", | ||
| 148 | + "en-us"); | ||
| 149 | + if (meta_data_.has_espeak != 1) { | ||
| 150 | + SHERPA_ONNX_LOGE("It should require espeak-ng"); | ||
| 151 | + SHERPA_ONNX_EXIT(-1); | ||
| 152 | + } | ||
| 153 | + | ||
| 154 | + if (config_.debug) { | ||
| 155 | + std::vector<std::string> speaker_names; | ||
| 156 | + SHERPA_ONNX_READ_META_DATA_VEC_STRING(speaker_names, "speaker_names"); | ||
| 157 | + std::ostringstream os; | ||
| 158 | + os << "\n"; | ||
| 159 | + for (int32_t i = 0; i != speaker_names.size(); ++i) { | ||
| 160 | + os << i << "->" << speaker_names[i] << ", "; | ||
| 161 | + } | ||
| 162 | + os << "\n"; | ||
| 163 | + | ||
| 164 | +#if __OHOS__ | ||
| 165 | + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); | ||
| 166 | +#else | ||
| 167 | + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); | ||
| 168 | +#endif | ||
| 169 | + } | ||
| 170 | + | ||
| 171 | + SHERPA_ONNX_READ_META_DATA_VEC(style_dim_, "style_dim"); | ||
| 172 | + if (style_dim_.size() != 2) { | ||
| 173 | + SHERPA_ONNX_LOGE("style_dim should be 2-d, given: %d", | ||
| 174 | + static_cast<int32_t>(style_dim_.size())); | ||
| 175 | + SHERPA_ONNX_EXIT(-1); | ||
| 176 | + } | ||
| 177 | + | ||
| 178 | + if (style_dim_[0] != 1) { | ||
| 179 | + SHERPA_ONNX_LOGE("style_dim[0] should be 1, given: %d", style_dim_[0]); | ||
| 180 | + SHERPA_ONNX_EXIT(-1); | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + int32_t actual_num_floats = voices_data_length / sizeof(float); | ||
| 184 | + int32_t expected_num_floats = | ||
| 185 | + style_dim_[0] * style_dim_[1] * meta_data_.num_speakers; | ||
| 186 | + | ||
| 187 | + if (actual_num_floats != expected_num_floats) { | ||
| 188 | +#if __OHOS__ | ||
| 189 | + SHERPA_ONNX_LOGE( | ||
| 190 | + "Corrupted --kitten-voices '%{public}s'. Expected #floats: " | ||
| 191 | + "%{public}d, actual: %{public}d", | ||
| 192 | + config_.kitten.voices.c_str(), expected_num_floats, | ||
| 193 | + actual_num_floats); | ||
| 194 | +#else | ||
| 195 | + SHERPA_ONNX_LOGE( | ||
| 196 | + "Corrupted --kitten-voices '%s'. Expected #floats: %d, actual: %d", | ||
| 197 | + config_.kitten.voices.c_str(), expected_num_floats, | ||
| 198 | + actual_num_floats); | ||
| 199 | +#endif | ||
| 200 | + | ||
| 201 | + SHERPA_ONNX_EXIT(-1); | ||
| 202 | + } | ||
| 203 | + | ||
| 204 | + styles_ = std::vector<float>( | ||
| 205 | + reinterpret_cast<const float *>(voices_data), | ||
| 206 | + reinterpret_cast<const float *>(voices_data) + expected_num_floats); | ||
| 207 | + } | ||
| 208 | + | ||
| 209 | + private: | ||
| 210 | + OfflineTtsModelConfig config_; | ||
| 211 | + Ort::Env env_; | ||
| 212 | + Ort::SessionOptions sess_opts_; | ||
| 213 | + Ort::AllocatorWithDefaultOptions allocator_; | ||
| 214 | + | ||
| 215 | + std::unique_ptr<Ort::Session> sess_; | ||
| 216 | + | ||
| 217 | + std::vector<std::string> input_names_; | ||
| 218 | + std::vector<const char *> input_names_ptr_; | ||
| 219 | + | ||
| 220 | + std::vector<std::string> output_names_; | ||
| 221 | + std::vector<const char *> output_names_ptr_; | ||
| 222 | + | ||
| 223 | + OfflineTtsKittenModelMetaData meta_data_; | ||
| 224 | + std::vector<int32_t> style_dim_; | ||
| 225 | + | ||
| 226 | + // (num_speakers, style_dim_[1]) | ||
| 227 | + std::vector<float> styles_; | ||
| 228 | +}; | ||
| 229 | + | ||
| 230 | +OfflineTtsKittenModel::OfflineTtsKittenModel( | ||
| 231 | + const OfflineTtsModelConfig &config) | ||
| 232 | + : impl_(std::make_unique<Impl>(config)) {} | ||
| 233 | + | ||
| 234 | +template <typename Manager> | ||
| 235 | +OfflineTtsKittenModel::OfflineTtsKittenModel( | ||
| 236 | + Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 237 | + : impl_(std::make_unique<Impl>(mgr, config)) {} | ||
| 238 | + | ||
| 239 | +OfflineTtsKittenModel::~OfflineTtsKittenModel() = default; | ||
| 240 | + | ||
| 241 | +const OfflineTtsKittenModelMetaData &OfflineTtsKittenModel::GetMetaData() | ||
| 242 | + const { | ||
| 243 | + return impl_->GetMetaData(); | ||
| 244 | +} | ||
| 245 | + | ||
| 246 | +Ort::Value OfflineTtsKittenModel::Run(Ort::Value x, int64_t sid /*= 0*/, | ||
| 247 | + float speed /*= 1.0*/) const { | ||
| 248 | + return impl_->Run(std::move(x), sid, speed); | ||
| 249 | +} | ||
| 250 | + | ||
| 251 | +#if __ANDROID_API__ >= 9 | ||
| 252 | +template OfflineTtsKittenModel::OfflineTtsKittenModel( | ||
| 253 | + AAssetManager *mgr, const OfflineTtsModelConfig &config); | ||
| 254 | +#endif | ||
| 255 | + | ||
| 256 | +#if __OHOS__ | ||
| 257 | +template OfflineTtsKittenModel::OfflineTtsKittenModel( | ||
| 258 | + NativeResourceManager *mgr, const OfflineTtsModelConfig &config); | ||
| 259 | +#endif | ||
| 260 | + | ||
| 261 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/offline-tts-kitten-model.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kitten-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_H_ | ||
| 7 | + | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +#include "onnxruntime_cxx_api.h" // NOLINT | ||
| 12 | +#include "sherpa-onnx/csrc/offline-tts-kitten-model-meta-data.h" | ||
| 13 | +#include "sherpa-onnx/csrc/offline-tts-model-config.h" | ||
| 14 | + | ||
| 15 | +namespace sherpa_onnx { | ||
| 16 | + | ||
| 17 | +class OfflineTtsKittenModel { | ||
| 18 | + public: | ||
| 19 | + ~OfflineTtsKittenModel(); | ||
| 20 | + | ||
| 21 | + explicit OfflineTtsKittenModel(const OfflineTtsModelConfig &config); | ||
| 22 | + | ||
| 23 | + template <typename Manager> | ||
| 24 | + OfflineTtsKittenModel(Manager *mgr, const OfflineTtsModelConfig &config); | ||
| 25 | + | ||
| 26 | + // @params x An int64 tensor of shape (1, num_tokens) | ||
| 27 | + // @return Return a float32 tensor containing the | ||
| 28 | + // samples of shape (num_samples,) | ||
| 29 | + Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const; | ||
| 30 | + | ||
| 31 | + const OfflineTtsKittenModelMetaData &GetMetaData() const; | ||
| 32 | + | ||
| 33 | + private: | ||
| 34 | + class Impl; | ||
| 35 | + std::unique_ptr<Impl> impl_; | ||
| 36 | +}; | ||
| 37 | + | ||
| 38 | +} // namespace sherpa_onnx | ||
| 39 | + | ||
| 40 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KITTEN_MODEL_H_ |
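For context, a minimal C++ usage sketch of the new Kitten TTS configuration. It is not part of this change set; it assumes the existing OfflineTts wrapper declared in sherpa-onnx/csrc/offline-tts.h, and every file path below is a placeholder.

// Minimal sketch, assuming the OfflineTts wrapper in
// sherpa-onnx/csrc/offline-tts.h; all paths below are placeholders.
#include "sherpa-onnx/csrc/offline-tts.h"

int main() {
  sherpa_onnx::OfflineTtsConfig config;

  // Hypothetical file names; point these at your exported Kitten TTS assets.
  config.model.kitten.model = "./kitten-tts/model.onnx";
  config.model.kitten.voices = "./kitten-tts/voices.bin";
  config.model.kitten.tokens = "./kitten-tts/tokens.txt";
  config.model.kitten.data_dir = "./kitten-tts/espeak-ng-data";
  config.model.num_threads = 2;

  sherpa_onnx::OfflineTts tts(config);

  // sid selects a speaker listed in the model meta data; the value of
  // --kitten-length-scale is applied only when speed == 1 (see Run() above).
  auto audio = tts.Generate("Hello from Kitten TTS.", /*sid=*/0, /*speed=*/1.0);

  // audio.samples holds float samples at audio.sample_rate Hz.
  return audio.samples.empty() ? 1 : 0;
}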
@@ -11,7 +11,9 @@
 namespace sherpa_onnx {
 
 // please refer to
-// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/add-meta-data.py
+// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/v0.19/add_meta_data.py
+// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/v1.0/add_meta_data.py
+// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/v1.1-zh/add_meta_data.py
 struct OfflineTtsKokoroModelMetaData {
   int32_t sample_rate = 0;
   int32_t num_speakers = 0;
@@ -170,7 +170,7 @@ class OfflineTtsKokoroModel::Impl {
     }
 
     if (style_dim_[1] != 1) {
-      SHERPA_ONNX_LOGE("style_dim[0] should be 1, given: %d", style_dim_[1]);
+      SHERPA_ONNX_LOGE("style_dim[1] should be 1, given: %d", style_dim_[1]);
       SHERPA_ONNX_EXIT(-1);
     }
 
@@ -23,8 +23,8 @@ class OfflineTtsKokoroModel {
   template <typename Manager>
   OfflineTtsKokoroModel(Manager *mgr, const OfflineTtsModelConfig &config);
 
-  // Return a float32 tensor containing the mel
-  // of shape (batch_size, mel_dim, num_frames)
+  // Return a float32 tensor containing the samples
+  // of shape (batch_size, num_samples)
   Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const;
 
   const OfflineTtsKokoroModelMetaData &GetMetaData() const;
@@ -12,6 +12,7 @@ void OfflineTtsModelConfig::Register(ParseOptions *po) {
   vits.Register(po);
   matcha.Register(po);
   kokoro.Register(po);
+  kitten.Register(po);
 
   po->Register("num-threads", &num_threads,
                "Number of threads to run the neural network");
@@ -37,7 +38,17 @@ bool OfflineTtsModelConfig::Validate() const {
     return matcha.Validate();
   }
 
+  if (!kokoro.model.empty()) {
   return kokoro.Validate();
+  }
+
+  if (!kitten.model.empty()) {
+    return kitten.Validate();
+  }
+
+  SHERPA_ONNX_LOGE("Please provide exactly one tts model.");
+
+  return false;
 }
 
 std::string OfflineTtsModelConfig::ToString() const {
@@ -47,6 +58,7 @@ std::string OfflineTtsModelConfig::ToString() const {
   os << "vits=" << vits.ToString() << ", ";
   os << "matcha=" << matcha.ToString() << ", ";
   os << "kokoro=" << kokoro.ToString() << ", ";
+  os << "kitten=" << kitten.ToString() << ", ";
   os << "num_threads=" << num_threads << ", ";
   os << "debug=" << (debug ? "True" : "False") << ", ";
   os << "provider=\"" << provider << "\")";
@@ -7,6 +7,7 @@
 
 #include <string>
 
+#include "sherpa-onnx/csrc/offline-tts-kitten-model-config.h"
 #include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
 #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h"
 #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"
@@ -18,6 +19,7 @@ struct OfflineTtsModelConfig {
   OfflineTtsVitsModelConfig vits;
   OfflineTtsMatchaModelConfig matcha;
   OfflineTtsKokoroModelConfig kokoro;
+  OfflineTtsKittenModelConfig kitten;
 
   int32_t num_threads = 1;
   bool debug = false;
@@ -28,11 +30,13 @@ struct OfflineTtsModelConfig {
   OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits,
                         const OfflineTtsMatchaModelConfig &matcha,
                         const OfflineTtsKokoroModelConfig &kokoro,
+                        const OfflineTtsKittenModelConfig &kitten,
                         int32_t num_threads, bool debug,
                         const std::string &provider)
       : vits(vits),
         matcha(matcha),
         kokoro(kokoro),
+        kitten(kitten),
         num_threads(num_threads),
         debug(debug),
         provider(provider) {}
@@ -180,7 +180,7 @@ static std::vector<int64_t> PiperPhonemesToIdsMatcha(
   return ans;
 }
 
-static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
+static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoroOrKitten(
     const std::unordered_map<char32_t, int32_t> &token2id,
     const std::vector<piper::Phoneme> &phonemes, int32_t max_len) {
   std::vector<std::vector<int64_t>> ans;
@@ -277,7 +277,6 @@ static std::vector<int64_t> CoquiPhonemesToIds(
 void InitEspeak(const std::string &data_dir) {
   static std::once_flag init_flag;
   std::call_once(init_flag, [data_dir]() {
-
 #if __ANDROID_API__ >= 9 || defined(__OHOS__)
     if (data_dir[0] != '/') {
       SHERPA_ONNX_LOGE(
@@ -358,6 +357,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
   InitEspeak(data_dir);
 }
 
+PiperPhonemizeLexicon::PiperPhonemizeLexicon(
+    const std::string &tokens, const std::string &data_dir,
+    const OfflineTtsKittenModelMetaData &kitten_meta_data)
+    : kitten_meta_data_(kitten_meta_data), is_kitten_(true) {
+  {
+    std::ifstream is(tokens);
+    token2id_ = ReadTokens(is);
+  }
+
+  InitEspeak(data_dir);
+}
+
 template <typename Manager>
 PiperPhonemizeLexicon::PiperPhonemizeLexicon(
     Manager *mgr, const std::string &tokens, const std::string &data_dir,
@@ -392,13 +403,33 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
   InitEspeak(data_dir);
 }
 
+template <typename Manager>
+PiperPhonemizeLexicon::PiperPhonemizeLexicon(
+    Manager *mgr, const std::string &tokens, const std::string &data_dir,
+    const OfflineTtsKittenModelMetaData &kitten_meta_data)
+    : kitten_meta_data_(kitten_meta_data), is_kitten_(true) {
+  {
+    auto buf = ReadFile(mgr, tokens);
+    std::istrstream is(buf.data(), buf.size());
+    token2id_ = ReadTokens(is);
+  }
+
+  // We should copy the directory of espeak-ng-data from the asset to
+  // some internal or external storage and then pass the directory to
+  // data_dir.
+  InitEspeak(data_dir);
+}
+
 std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
     const std::string &text, const std::string &voice /*= ""*/) const {
   if (is_matcha_) {
     return ConvertTextToTokenIdsMatcha(text, voice);
   } else if (is_kokoro_) {
-    return ConvertTextToTokenIdsKokoro(
+    return ConvertTextToTokenIdsKokoroOrKitten(
         token2id_, kokoro_meta_data_.max_token_len, text, voice);
+  } else if (is_kitten_) {
+    return ConvertTextToTokenIdsKokoroOrKitten(
+        token2id_, kitten_meta_data_.max_token_len, text, voice);
   } else {
     return ConvertTextToTokenIdsVits(text, voice);
   }
@@ -429,7 +460,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
   return ans;
 }
 
-std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
+std::vector<TokenIDs> ConvertTextToTokenIdsKokoroOrKitten(
     const std::unordered_map<char32_t, int32_t> &token2id,
     int32_t max_token_len, const std::string &text,
     const std::string &voice /*= ""*/) {
@@ -446,7 +477,8 @@ std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
   std::vector<TokenIDs> ans;
 
   for (const auto &p : phonemes) {
-    auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len);
+    auto phoneme_ids =
+        PiperPhonemesToIdsKokoroOrKitten(token2id, p, max_token_len);
 
     for (auto &ids : phoneme_ids) {
       ans.emplace_back(std::move(ids));
@@ -10,6 +10,7 @@
 #include <vector>
 
 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
+#include "sherpa-onnx/csrc/offline-tts-kitten-model-meta-data.h"
 #include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
 #include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h"
 #include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
@@ -27,6 +28,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
   PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
                         const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
 
+  PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
+                        const OfflineTtsKittenModelMetaData &kitten_meta_data);
+
   template <typename Manager>
   PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
                         const std::string &data_dir,
@@ -42,6 +46,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
                         const std::string &data_dir,
                         const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
 
+  template <typename Manager>
+  PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
+                        const std::string &data_dir,
+                        const OfflineTtsKittenModelMetaData &kitten_meta_data);
+
   std::vector<TokenIDs> ConvertTextToTokenIds(
       const std::string &text, const std::string &voice = "") const override;
 
@@ -58,8 +67,10 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
   OfflineTtsVitsModelMetaData vits_meta_data_;
   OfflineTtsMatchaModelMetaData matcha_meta_data_;
   OfflineTtsKokoroModelMetaData kokoro_meta_data_;
+  OfflineTtsKittenModelMetaData kitten_meta_data_;
   bool is_matcha_ = false;
   bool is_kokoro_ = false;
+  bool is_kitten_ = false;
 };
 
 }  // namespace sherpa_onnx
@@ -101,6 +101,7 @@ or details.
   float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
 
   float rtf = elapsed_seconds / duration;
+  fprintf(stderr, "Number of threads: %d\n", config.model.num_threads);
   fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
   fprintf(stderr, "Audio duration: %.3f s\n", duration);
   fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
@@ -67,6 +67,7 @@ endif()
 
 if(SHERPA_ONNX_ENABLE_TTS)
   list(APPEND srcs
+    offline-tts-kitten-model-config.cc
     offline-tts-kokoro-model-config.cc
     offline-tts-matcha-model-config.cc
     offline-tts-model-config.cc
sherpa-onnx/python/csrc/offline-tts-kitten-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/offline-tts-kitten-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/offline-tts-kitten-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/offline-tts-kitten-model-config.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +void PybindOfflineTtsKittenModelConfig(py::module *m) { | ||
| 14 | + using PyClass = OfflineTtsKittenModelConfig; | ||
| 15 | + | ||
| 16 | + py::class_<PyClass>(*m, "OfflineTtsKittenModelConfig") | ||
| 17 | + .def(py::init<>()) | ||
| 18 | + .def(py::init<const std::string &, const std::string &, | ||
| 19 | + const std::string &, const std::string &, float>(), | ||
| 20 | + py::arg("model"), py::arg("voices"), py::arg("tokens"), | ||
| 21 | + py::arg("data_dir"), py::arg("length_scale") = 1.0) | ||
| 22 | + .def_readwrite("model", &PyClass::model) | ||
| 23 | + .def_readwrite("voices", &PyClass::voices) | ||
| 24 | + .def_readwrite("tokens", &PyClass::tokens) | ||
| 25 | + .def_readwrite("data_dir", &PyClass::data_dir) | ||
| 26 | + .def_readwrite("length_scale", &PyClass::length_scale) | ||
| 27 | + .def("__str__", &PyClass::ToString) | ||
| 28 | + .def("validate", &PyClass::Validate); | ||
| 29 | +} | ||
| 30 | + | ||
| 31 | +} // namespace sherpa_onnx |
sherpa-onnx/python/csrc/offline-tts-kitten-model-config.h
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/offline-tts-kitten-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindOfflineTtsKittenModelConfig(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KITTEN_MODEL_CONFIG_H_ |
@@ -7,6 +7,7 @@
 #include <string>
 
 #include "sherpa-onnx/csrc/offline-tts-model-config.h"
+#include "sherpa-onnx/python/csrc/offline-tts-kitten-model-config.h"
 #include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h"
 #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h"
 #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h"
@@ -17,6 +18,7 @@ void PybindOfflineTtsModelConfig(py::module *m) {
   PybindOfflineTtsVitsModelConfig(m);
   PybindOfflineTtsMatchaModelConfig(m);
   PybindOfflineTtsKokoroModelConfig(m);
+  PybindOfflineTtsKittenModelConfig(m);
 
   using PyClass = OfflineTtsModelConfig;
 
@@ -24,16 +26,19 @@ void PybindOfflineTtsModelConfig(py::module *m) {
       .def(py::init<>())
       .def(py::init<const OfflineTtsVitsModelConfig &,
                     const OfflineTtsMatchaModelConfig &,
-                    const OfflineTtsKokoroModelConfig &, int32_t, bool,
+                    const OfflineTtsKokoroModelConfig &,
+                    const OfflineTtsKittenModelConfig &, int32_t, bool,
                     const std::string &>(),
           py::arg("vits") = OfflineTtsVitsModelConfig{},
           py::arg("matcha") = OfflineTtsMatchaModelConfig{},
           py::arg("kokoro") = OfflineTtsKokoroModelConfig{},
+          py::arg("kitten") = OfflineTtsKittenModelConfig{},
          py::arg("num_threads") = 1, py::arg("debug") = false,
          py::arg("provider") = "cpu")
      .def_readwrite("vits", &PyClass::vits)
      .def_readwrite("matcha", &PyClass::matcha)
      .def_readwrite("kokoro", &PyClass::kokoro)
+      .def_readwrite("kitten", &PyClass::kitten)
      .def_readwrite("num_threads", &PyClass::num_threads)
      .def_readwrite("debug", &PyClass::debug)
      .def_readwrite("provider", &PyClass::provider)