Committed by
GitHub
Support Ukrainian VITS models from coqui-ai/TTS (#469)
正在显示
9 个修改的文件
包含
303 行增加
和
32 行删除
| @@ -431,15 +431,12 @@ void CNonStreamingTextToSpeechDlg::Init() { | @@ -431,15 +431,12 @@ void CNonStreamingTextToSpeechDlg::Init() { | ||
| 431 | ok = false; | 431 | ok = false; |
| 432 | } | 432 | } |
| 433 | 433 | ||
| 434 | - if (!Exists("./lexicon.txt") && !Exists("./espeak-ng-data/phontab")) { | ||
| 435 | - error_message += "Cannot find espeak-ng-data directory or ./lexicon.txt\r\n"; | ||
| 436 | - ok = false; | ||
| 437 | - } | ||
| 438 | - | ||
| 439 | if (!Exists("./tokens.txt")) { | 434 | if (!Exists("./tokens.txt")) { |
| 440 | error_message += "Cannot find ./tokens.txt\r\n"; | 435 | error_message += "Cannot find ./tokens.txt\r\n"; |
| 441 | ok = false; | 436 | ok = false; |
| 442 | } | 437 | } |
| 438 | + // it is OK to leave lexicon.txt and espeak-ng-data empty | ||
| 439 | + // since models using characters don't need them | ||
| 443 | 440 | ||
| 444 | if (!ok) { | 441 | if (!ok) { |
| 445 | generate_btn_.EnableWindow(FALSE); | 442 | generate_btn_.EnableWindow(FALSE); |
| @@ -470,7 +467,7 @@ void CNonStreamingTextToSpeechDlg::Init() { | @@ -470,7 +467,7 @@ void CNonStreamingTextToSpeechDlg::Init() { | ||
| 470 | config.model.vits.model = "./model.onnx"; | 467 | config.model.vits.model = "./model.onnx"; |
| 471 | if (Exists("./espeak-ng-data/phontab")) { | 468 | if (Exists("./espeak-ng-data/phontab")) { |
| 472 | config.model.vits.data_dir = "./espeak-ng-data"; | 469 | config.model.vits.data_dir = "./espeak-ng-data"; |
| 473 | - } else { | 470 | + } else if (Exists("./lexicon.txt")) { |
| 474 | config.model.vits.lexicon = "./lexicon.txt"; | 471 | config.model.vits.lexicon = "./lexicon.txt"; |
| 475 | } | 472 | } |
| 476 | config.model.vits.tokens = "./tokens.txt"; | 473 | config.model.vits.tokens = "./tokens.txt"; |
| @@ -41,6 +41,7 @@ set(sources | @@ -41,6 +41,7 @@ set(sources | ||
| 41 | offline-transducer-model-config.cc | 41 | offline-transducer-model-config.cc |
| 42 | offline-transducer-model.cc | 42 | offline-transducer-model.cc |
| 43 | offline-transducer-modified-beam-search-decoder.cc | 43 | offline-transducer-modified-beam-search-decoder.cc |
| 44 | + offline-tts-character-frontend.cc | ||
| 44 | offline-wenet-ctc-model-config.cc | 45 | offline-wenet-ctc-model-config.cc |
| 45 | offline-wenet-ctc-model.cc | 46 | offline-wenet-ctc-model.cc |
| 46 | offline-whisper-greedy-search-decoder.cc | 47 | offline-whisper-greedy-search-decoder.cc |
| 1 | +// sherpa-onnx/csrc/offline-tts-character-frontend.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#if __ANDROID_API__ >= 9 | ||
| 6 | +#include <strstream> | ||
| 7 | + | ||
| 8 | +#include "android/asset_manager.h" | ||
| 9 | +#include "android/asset_manager_jni.h" | ||
| 10 | +#endif | ||
| 11 | +#include <algorithm> | ||
| 12 | +#include <cctype> | ||
| 13 | +#include <codecvt> | ||
| 14 | +#include <fstream> | ||
| 15 | +#include <locale> | ||
| 16 | +#include <sstream> | ||
| 17 | +#include <utility> | ||
| 18 | + | ||
| 19 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 20 | +#include "sherpa-onnx/csrc/offline-tts-character-frontend.h" | ||
| 21 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 22 | + | ||
| 23 | +namespace sherpa_onnx { | ||
| 24 | + | ||
| 25 | +static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) { | ||
| 26 | + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; | ||
| 27 | + std::unordered_map<char32_t, int32_t> token2id; | ||
| 28 | + | ||
| 29 | + std::string line; | ||
| 30 | + | ||
| 31 | + std::string sym; | ||
| 32 | + std::u32string s; | ||
| 33 | + int32_t id; | ||
| 34 | + while (std::getline(is, line)) { | ||
| 35 | + std::istringstream iss(line); | ||
| 36 | + iss >> sym; | ||
| 37 | + if (iss.eof()) { | ||
| 38 | + id = atoi(sym.c_str()); | ||
| 39 | + sym = " "; | ||
| 40 | + } else { | ||
| 41 | + iss >> id; | ||
| 42 | + } | ||
| 43 | + | ||
| 44 | + // eat the trailing \r\n on windows | ||
| 45 | + iss >> std::ws; | ||
| 46 | + if (!iss.eof()) { | ||
| 47 | + SHERPA_ONNX_LOGE("Error when reading tokens: %s", line.c_str()); | ||
| 48 | + exit(-1); | ||
| 49 | + } | ||
| 50 | + | ||
| 51 | + // For models from coqui-ai/TTS, we have saved the IDs of the following | ||
| 52 | + // symbols in OfflineTtsVitsModelMetaData, so it is safe to skip them here. | ||
| 53 | + if (sym == "<PAD>" || sym == "<EOS>" || sym == "<BOS>" || sym == "<BLNK>") { | ||
| 54 | + continue; | ||
| 55 | + } | ||
| 56 | + | ||
| 57 | + s = conv.from_bytes(sym); | ||
| 58 | + if (s.size() != 1) { | ||
| 59 | + SHERPA_ONNX_LOGE("Error when reading tokens at Line %s. size: %d", | ||
| 60 | + line.c_str(), static_cast<int32_t>(s.size())); | ||
| 61 | + exit(-1); | ||
| 62 | + } | ||
| 63 | + | ||
| 64 | + char32_t c = s[0]; | ||
| 65 | + | ||
| 66 | + if (token2id.count(c)) { | ||
| 67 | + SHERPA_ONNX_LOGE("Duplicated token %s. Line %s. Existing ID: %d", | ||
| 68 | + sym.c_str(), line.c_str(), token2id.at(c)); | ||
| 69 | + exit(-1); | ||
| 70 | + } | ||
| 71 | + | ||
| 72 | + token2id.insert({c, id}); | ||
| 73 | + } | ||
| 74 | + | ||
| 75 | + return token2id; | ||
| 76 | +} | ||
| 77 | + | ||
| 78 | +OfflineTtsCharacterFrontend::OfflineTtsCharacterFrontend( | ||
| 79 | + const std::string &tokens, const OfflineTtsVitsModelMetaData &meta_data) | ||
| 80 | + : meta_data_(meta_data) { | ||
| 81 | + std::ifstream is(tokens); | ||
| 82 | + token2id_ = ReadTokens(is); | ||
| 83 | +} | ||
| 84 | + | ||
| 85 | +#if __ANDROID_API__ >= 9 | ||
| 86 | +OfflineTtsCharacterFrontend::OfflineTtsCharacterFrontend( | ||
| 87 | + AAssetManager *mgr, const std::string &tokens, | ||
| 88 | + const OfflineTtsVitsModelMetaData &meta_data) | ||
| 89 | + : meta_data_(meta_data) { | ||
| 90 | + auto buf = ReadFile(mgr, tokens); | ||
| 91 | + std::istrstream is(buf.data(), buf.size()); | ||
| 92 | + token2id_ = ReadTokens(is); | ||
| 93 | +} | ||
| 94 | + | ||
| 95 | +#endif | ||
| 96 | + | ||
| 97 | +std::vector<std::vector<int64_t>> | ||
| 98 | +OfflineTtsCharacterFrontend::ConvertTextToTokenIds( | ||
| 99 | + const std::string &_text, const std::string &voice /*= ""*/) const { | ||
| 100 | + // see | ||
| 101 | + // https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py#L87 | ||
| 102 | + int32_t use_eos_bos = meta_data_.use_eos_bos; | ||
| 103 | + int32_t bos_id = meta_data_.bos_id; | ||
| 104 | + int32_t eos_id = meta_data_.eos_id; | ||
| 105 | + int32_t blank_id = meta_data_.blank_id; | ||
| 106 | + int32_t add_blank = meta_data_.add_blank; | ||
| 107 | + | ||
| 108 | + std::string text(_text.size(), 0); | ||
| 109 | + std::transform(_text.begin(), _text.end(), text.begin(), | ||
| 110 | + [](auto c) { return std::tolower(c); }); | ||
| 111 | + | ||
| 112 | + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; | ||
| 113 | + std::u32string s = conv.from_bytes(text); | ||
| 114 | + | ||
| 115 | + std::vector<std::vector<int64_t>> ans; | ||
| 116 | + | ||
| 117 | + std::vector<int64_t> this_sentence; | ||
| 118 | + if (add_blank) { | ||
| 119 | + if (use_eos_bos) { | ||
| 120 | + this_sentence.push_back(bos_id); | ||
| 121 | + } | ||
| 122 | + | ||
| 123 | + this_sentence.push_back(blank_id); | ||
| 124 | + | ||
| 125 | + for (char32_t c : s) { | ||
| 126 | + if (token2id_.count(c)) { | ||
| 127 | + this_sentence.push_back(token2id_.at(c)); | ||
| 128 | + this_sentence.push_back(blank_id); | ||
| 129 | + } else { | ||
| 130 | + SHERPA_ONNX_LOGE("Skip unknown character. Unicode codepoint: \\U+%04x.", | ||
| 131 | + static_cast<uint32_t>(c)); | ||
| 132 | + } | ||
| 133 | + | ||
| 134 | + if (c == '.' || c == ':' || c == '?' || c == '!') { | ||
| 135 | + // end of a sentence | ||
| 136 | + if (use_eos_bos) { | ||
| 137 | + this_sentence.push_back(eos_id); | ||
| 138 | + } | ||
| 139 | + | ||
| 140 | + ans.push_back(std::move(this_sentence)); | ||
| 141 | + | ||
| 142 | + // re-initialize this_sentence | ||
| 143 | + if (use_eos_bos) { | ||
| 144 | + this_sentence.push_back(bos_id); | ||
| 145 | + } | ||
| 146 | + this_sentence.push_back(blank_id); | ||
| 147 | + } | ||
| 148 | + } | ||
| 149 | + | ||
| 150 | + if (use_eos_bos) { | ||
| 151 | + this_sentence.push_back(eos_id); | ||
| 152 | + } | ||
| 153 | + | ||
| 154 | + if (this_sentence.size() > 1 + use_eos_bos) { | ||
| 155 | + ans.push_back(std::move(this_sentence)); | ||
| 156 | + } | ||
| 157 | + } else { | ||
| 158 | + // not adding blank | ||
| 159 | + if (use_eos_bos) { | ||
| 160 | + this_sentence.push_back(bos_id); | ||
| 161 | + } | ||
| 162 | + | ||
| 163 | + for (char32_t c : s) { | ||
| 164 | + if (token2id_.count(c)) { | ||
| 165 | + this_sentence.push_back(token2id_.at(c)); | ||
| 166 | + } | ||
| 167 | + | ||
| 168 | + if (c == '.' || c == ':' || c == '?' || c == '!') { | ||
| 169 | + // end of a sentence | ||
| 170 | + if (use_eos_bos) { | ||
| 171 | + this_sentence.push_back(eos_id); | ||
| 172 | + } | ||
| 173 | + | ||
| 174 | + ans.push_back(std::move(this_sentence)); | ||
| 175 | + | ||
| 176 | + // re-initialize this_sentence | ||
| 177 | + if (use_eos_bos) { | ||
| 178 | + this_sentence.push_back(bos_id); | ||
| 179 | + } | ||
| 180 | + } | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + if (this_sentence.size() > 1) { | ||
| 184 | + ans.push_back(std::move(this_sentence)); | ||
| 185 | + } | ||
| 186 | + } | ||
| 187 | + | ||
| 188 | + return ans; | ||
| 189 | +} | ||
| 190 | + | ||
| 191 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/csrc/offline-tts-character-frontend.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_CHARACTER_FRONTEND_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_CHARACTER_FRONTEND_H_ | ||
| 7 | +#include <cstdint> | ||
| 8 | +#include <string> | ||
| 9 | +#include <unordered_map> | ||
| 10 | +#include <vector> | ||
| 11 | + | ||
| 12 | +#if __ANDROID_API__ >= 9 | ||
| 13 | +#include "android/asset_manager.h" | ||
| 14 | +#include "android/asset_manager_jni.h" | ||
| 15 | +#endif | ||
| 16 | + | ||
| 17 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 18 | +#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | ||
| 19 | + | ||
| 20 | +namespace sherpa_onnx { | ||
| 21 | + | ||
| 22 | +class OfflineTtsCharacterFrontend : public OfflineTtsFrontend { | ||
| 23 | + public: | ||
| 24 | + OfflineTtsCharacterFrontend(const std::string &tokens, | ||
| 25 | + const OfflineTtsVitsModelMetaData &meta_data); | ||
| 26 | + | ||
| 27 | +#if __ANDROID_API__ >= 9 | ||
| 28 | + OfflineTtsCharacterFrontend(AAssetManager *mgr, const std::string &tokens, | ||
| 29 | + const OfflineTtsVitsModelMetaData &meta_data); | ||
| 30 | + | ||
| 31 | +#endif | ||
| 32 | + /** Convert a string to token IDs. | ||
| 33 | + * | ||
| 34 | + * @param text The input text. | ||
| 35 | + * Example 1: "This is the first sample sentence; this is the | ||
| 36 | + * second one." Example 2: "这是第一句。这是第二句。" | ||
| 37 | + * @param voice Optional. It is for espeak-ng. | ||
| 38 | + * | ||
| 39 | + * @return Return a vector-of-vector of token IDs. Each subvector contains | ||
| 40 | + * a sentence that can be processed independently. | ||
| 41 | + * If a frontend does not support splitting the text into | ||
| 42 | + * sentences, the resulting vector contains only one subvector. | ||
| 43 | + */ | ||
| 44 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIds( | ||
| 45 | + const std::string &text, const std::string &voice = "") const override; | ||
| 46 | + | ||
| 47 | + private: | ||
| 48 | + OfflineTtsVitsModelMetaData meta_data_; | ||
| 49 | + std::unordered_map<char32_t, int32_t> token2id_; | ||
| 50 | +}; | ||
| 51 | + | ||
| 52 | +} // namespace sherpa_onnx | ||
| 53 | + | ||
| 54 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_CHARACTER_FRONTEND_H_ |
| @@ -18,6 +18,7 @@ | @@ -18,6 +18,7 @@ | ||
| 18 | #include "kaldifst/csrc/text-normalizer.h" | 18 | #include "kaldifst/csrc/text-normalizer.h" |
| 19 | #include "sherpa-onnx/csrc/lexicon.h" | 19 | #include "sherpa-onnx/csrc/lexicon.h" |
| 20 | #include "sherpa-onnx/csrc/macros.h" | 20 | #include "sherpa-onnx/csrc/macros.h" |
| 21 | +#include "sherpa-onnx/csrc/offline-tts-character-frontend.h" | ||
| 21 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 22 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| 22 | #include "sherpa-onnx/csrc/offline-tts-impl.h" | 23 | #include "sherpa-onnx/csrc/offline-tts-impl.h" |
| 23 | #include "sherpa-onnx/csrc/offline-tts-vits-model.h" | 24 | #include "sherpa-onnx/csrc/offline-tts-vits-model.h" |
| @@ -116,7 +117,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -116,7 +117,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 116 | return {}; | 117 | return {}; |
| 117 | } | 118 | } |
| 118 | 119 | ||
| 119 | - if (meta_data.add_blank && config_.model.vits.data_dir.empty()) { | 120 | + // TODO(fangjun): add blank inside the frontend, not here |
| 121 | + if (meta_data.add_blank && config_.model.vits.data_dir.empty() && | ||
| 122 | + meta_data.frontend != "characters") { | ||
| 120 | for (auto &k : x) { | 123 | for (auto &k : x) { |
| 121 | k = AddBlank(k); | 124 | k = AddBlank(k); |
| 122 | } | 125 | } |
| @@ -195,12 +198,22 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -195,12 +198,22 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 195 | void InitFrontend(AAssetManager *mgr) { | 198 | void InitFrontend(AAssetManager *mgr) { |
| 196 | const auto &meta_data = model_->GetMetaData(); | 199 | const auto &meta_data = model_->GetMetaData(); |
| 197 | 200 | ||
| 198 | - if ((meta_data.is_piper || meta_data.is_coqui) && | ||
| 199 | - !config_.model.vits.data_dir.empty()) { | 201 | + if (meta_data.frontend == "characters") { |
| 202 | + frontend_ = std::make_unique<OfflineTtsCharacterFrontend>( | ||
| 203 | + mgr, config_.model.vits.tokens, meta_data); | ||
| 204 | + } else if ((meta_data.is_piper || meta_data.is_coqui) && | ||
| 205 | + !config_.model.vits.data_dir.empty()) { | ||
| 200 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( | 206 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( |
| 201 | mgr, config_.model.vits.tokens, config_.model.vits.data_dir, | 207 | mgr, config_.model.vits.tokens, config_.model.vits.data_dir, |
| 202 | meta_data); | 208 | meta_data); |
| 203 | } else { | 209 | } else { |
| 210 | + if (config_.model.vits.lexicon.empty()) { | ||
| 211 | + SHERPA_ONNX_LOGE( | ||
| 212 | + "Not a model using characters as modeling unit. Please provide " | ||
| 213 | + "--vits-lexicon if you leave --vits-data-dir empty"); | ||
| 214 | + exit(-1); | ||
| 215 | + } | ||
| 216 | + | ||
| 204 | frontend_ = std::make_unique<Lexicon>( | 217 | frontend_ = std::make_unique<Lexicon>( |
| 205 | mgr, config_.model.vits.lexicon, config_.model.vits.tokens, | 218 | mgr, config_.model.vits.lexicon, config_.model.vits.tokens, |
| 206 | meta_data.punctuations, meta_data.language, config_.model.debug); | 219 | meta_data.punctuations, meta_data.language, config_.model.debug); |
| @@ -211,12 +224,21 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -211,12 +224,21 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 211 | void InitFrontend() { | 224 | void InitFrontend() { |
| 212 | const auto &meta_data = model_->GetMetaData(); | 225 | const auto &meta_data = model_->GetMetaData(); |
| 213 | 226 | ||
| 214 | - if ((meta_data.is_piper || meta_data.is_coqui) && | ||
| 215 | - !config_.model.vits.data_dir.empty()) { | 227 | + if (meta_data.frontend == "characters") { |
| 228 | + frontend_ = std::make_unique<OfflineTtsCharacterFrontend>( | ||
| 229 | + config_.model.vits.tokens, meta_data); | ||
| 230 | + } else if ((meta_data.is_piper || meta_data.is_coqui) && | ||
| 231 | + !config_.model.vits.data_dir.empty()) { | ||
| 216 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( | 232 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( |
| 217 | config_.model.vits.tokens, config_.model.vits.data_dir, | 233 | config_.model.vits.tokens, config_.model.vits.data_dir, |
| 218 | model_->GetMetaData()); | 234 | model_->GetMetaData()); |
| 219 | } else { | 235 | } else { |
| 236 | + if (config_.model.vits.lexicon.empty()) { | ||
| 237 | + SHERPA_ONNX_LOGE( | ||
| 238 | + "Not a model using characters as modeling unit. Please provide " | ||
| 239 | + "--vits-lexicon if you leave --vits-data-dir empty"); | ||
| 240 | + exit(-1); | ||
| 241 | + } | ||
| 220 | frontend_ = std::make_unique<Lexicon>( | 242 | frontend_ = std::make_unique<Lexicon>( |
| 221 | config_.model.vits.lexicon, config_.model.vits.tokens, | 243 | config_.model.vits.lexicon, config_.model.vits.tokens, |
| 222 | meta_data.punctuations, meta_data.language, config_.model.debug); | 244 | meta_data.punctuations, meta_data.language, config_.model.debug); |
| @@ -44,19 +44,7 @@ bool OfflineTtsVitsModelConfig::Validate() const { | @@ -44,19 +44,7 @@ bool OfflineTtsVitsModelConfig::Validate() const { | ||
| 44 | return false; | 44 | return false; |
| 45 | } | 45 | } |
| 46 | 46 | ||
| 47 | - if (data_dir.empty()) { | ||
| 48 | - if (lexicon.empty()) { | ||
| 49 | - SHERPA_ONNX_LOGE( | ||
| 50 | - "Please provide --vits-lexicon if you leave --vits-data-dir empty"); | ||
| 51 | - return false; | ||
| 52 | - } | ||
| 53 | - | ||
| 54 | - if (!FileExists(lexicon)) { | ||
| 55 | - SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str()); | ||
| 56 | - return false; | ||
| 57 | - } | ||
| 58 | - | ||
| 59 | - } else { | 47 | + if (!data_dir.empty()) { |
| 60 | if (!FileExists(data_dir + "/phontab")) { | 48 | if (!FileExists(data_dir + "/phontab")) { |
| 61 | SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test", | 49 | SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test", |
| 62 | data_dir.c_str()); | 50 | data_dir.c_str()); |
| @@ -10,15 +10,14 @@ | @@ -10,15 +10,14 @@ | ||
| 10 | 10 | ||
| 11 | namespace sherpa_onnx { | 11 | namespace sherpa_onnx { |
| 12 | 12 | ||
| 13 | +// If you are not sure what each field means, please | ||
| 14 | +// have a look at the Python file in the model directory that | ||
| 15 | +// you have downloaded. | ||
| 13 | struct OfflineTtsVitsModelMetaData { | 16 | struct OfflineTtsVitsModelMetaData { |
| 14 | - int32_t sample_rate; | 17 | + int32_t sample_rate = 0; |
| 15 | int32_t add_blank = 0; | 18 | int32_t add_blank = 0; |
| 16 | int32_t num_speakers = 0; | 19 | int32_t num_speakers = 0; |
| 17 | 20 | ||
| 18 | - std::string punctuations; | ||
| 19 | - std::string language; | ||
| 20 | - std::string voice; | ||
| 21 | - | ||
| 22 | bool is_piper = false; | 21 | bool is_piper = false; |
| 23 | bool is_coqui = false; | 22 | bool is_coqui = false; |
| 24 | 23 | ||
| @@ -27,6 +26,12 @@ struct OfflineTtsVitsModelMetaData { | @@ -27,6 +26,12 @@ struct OfflineTtsVitsModelMetaData { | ||
| 27 | int32_t bos_id = 0; | 26 | int32_t bos_id = 0; |
| 28 | int32_t eos_id = 0; | 27 | int32_t eos_id = 0; |
| 29 | int32_t use_eos_bos = 0; | 28 | int32_t use_eos_bos = 0; |
| 29 | + int32_t pad_id = 0; | ||
| 30 | + | ||
| 31 | + std::string punctuations; | ||
| 32 | + std::string language; | ||
| 33 | + std::string voice; | ||
| 34 | + std::string frontend; // characters | ||
| 30 | }; | 35 | }; |
| 31 | 36 | ||
| 32 | } // namespace sherpa_onnx | 37 | } // namespace sherpa_onnx |
| @@ -87,13 +87,18 @@ class OfflineTtsVitsModel::Impl { | @@ -87,13 +87,18 @@ class OfflineTtsVitsModel::Impl { | ||
| 87 | SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.punctuations, | 87 | SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.punctuations, |
| 88 | "punctuation", ""); | 88 | "punctuation", ""); |
| 89 | SHERPA_ONNX_READ_META_DATA_STR(meta_data_.language, "language"); | 89 | SHERPA_ONNX_READ_META_DATA_STR(meta_data_.language, "language"); |
| 90 | + | ||
| 90 | SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice", ""); | 91 | SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice", ""); |
| 91 | 92 | ||
| 93 | + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.frontend, "frontend", | ||
| 94 | + ""); | ||
| 95 | + | ||
| 92 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.blank_id, "blank_id", 0); | 96 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.blank_id, "blank_id", 0); |
| 93 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); | 97 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); |
| 94 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); | 98 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); |
| 95 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_eos_bos, | 99 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_eos_bos, |
| 96 | "use_eos_bos", 0); | 100 | "use_eos_bos", 0); |
| 101 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.pad_id, "pad_id", 0); | ||
| 97 | 102 | ||
| 98 | std::string comment; | 103 | std::string comment; |
| 99 | SHERPA_ONNX_READ_META_DATA_STR(comment, "comment"); | 104 | SHERPA_ONNX_READ_META_DATA_STR(comment, "comment"); |
| @@ -142,16 +147,25 @@ class OfflineTtsVitsModel::Impl { | @@ -142,16 +147,25 @@ class OfflineTtsVitsModel::Impl { | ||
| 142 | Ort::Value sid_tensor = | 147 | Ort::Value sid_tensor = |
| 143 | Ort::Value::CreateTensor(memory_info, &sid, 1, &sid_shape, 1); | 148 | Ort::Value::CreateTensor(memory_info, &sid, 1, &sid_shape, 1); |
| 144 | 149 | ||
| 150 | + int64_t lang_id_shape = 1; | ||
| 151 | + int64_t lang_id = 0; | ||
| 152 | + Ort::Value lang_id_tensor = | ||
| 153 | + Ort::Value::CreateTensor(memory_info, &lang_id, 1, &lang_id_shape, 1); | ||
| 154 | + | ||
| 145 | std::vector<Ort::Value> inputs; | 155 | std::vector<Ort::Value> inputs; |
| 146 | - inputs.reserve(4); | 156 | + inputs.reserve(5); |
| 147 | inputs.push_back(std::move(x)); | 157 | inputs.push_back(std::move(x)); |
| 148 | inputs.push_back(std::move(x_length)); | 158 | inputs.push_back(std::move(x_length)); |
| 149 | inputs.push_back(std::move(scales_tensor)); | 159 | inputs.push_back(std::move(scales_tensor)); |
| 150 | 160 | ||
| 151 | - if (input_names_.size() == 4 && input_names_.back() == "sid") { | 161 | + if (input_names_.size() >= 4 && input_names_[3] == "sid") { |
| 152 | inputs.push_back(std::move(sid_tensor)); | 162 | inputs.push_back(std::move(sid_tensor)); |
| 153 | } | 163 | } |
| 154 | 164 | ||
| 165 | + if (input_names_.size() >= 5 && input_names_[4] == "langid") { | ||
| 166 | + inputs.push_back(std::move(lang_id_tensor)); | ||
| 167 | + } | ||
| 168 | + | ||
| 155 | auto out = | 169 | auto out = |
| 156 | sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), | 170 | sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), |
| 157 | output_names_ptr_.data(), output_names_ptr_.size()); | 171 | output_names_ptr_.data(), output_names_ptr_.size()); |
| @@ -123,7 +123,6 @@ static std::vector<int64_t> CoquiPhonemesToIds( | @@ -123,7 +123,6 @@ static std::vector<int64_t> CoquiPhonemesToIds( | ||
| 123 | int32_t blank_id = meta_data.blank_id; | 123 | int32_t blank_id = meta_data.blank_id; |
| 124 | int32_t add_blank = meta_data.add_blank; | 124 | int32_t add_blank = meta_data.add_blank; |
| 125 | int32_t comma_id = token2id.at(','); | 125 | int32_t comma_id = token2id.at(','); |
| 126 | - SHERPA_ONNX_LOGE("comma id: %d", comma_id); | ||
| 127 | 126 | ||
| 128 | std::vector<int64_t> ans; | 127 | std::vector<int64_t> ans; |
| 129 | if (add_blank) { | 128 | if (add_blank) { |
-
请 注册 或 登录 后发表评论