Fangjun Kuang
Committed by GitHub

Add jieba for Chinese TTS models (#797)

CMakeLists.txt
@@ -260,6 +260,7 @@ if(SHERPA_ONNX_ENABLE_TTS)
   set(ESPEAK_NG_DIR ${espeak_ng_SOURCE_DIR})
   message(STATUS "ESPEAK_NG_DIR: ${ESPEAK_NG_DIR}")
   include(piper-phonemize)
+  include(cppjieba) # For Chinese TTS. It is a header-only C++ library
 endif()

 add_subdirectory(sherpa-onnx)
cmake/cppjieba.cmake (new file)

function(download_cppjieba)
  include(FetchContent)

  set(cppjieba_URL  "https://github.com/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz")
  set(cppjieba_URL2 "https://hub.nuaa.cf/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz")
  set(cppjieba_HASH "SHA256=03e5264687f0efaef05487a07d49c3f4c0f743347bfbf825df4b30cc75ac5288")

  # If you don't have access to the Internet,
  # please pre-download cppjieba
  set(possible_file_locations
    $ENV{HOME}/Downloads/cppjieba-sherpa-onnx-2024-04-19.tar.gz
    ${CMAKE_SOURCE_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz
    ${CMAKE_BINARY_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz
    /tmp/cppjieba-sherpa-onnx-2024-04-19.tar.gz
    /star-fj/fangjun/download/github/cppjieba-sherpa-onnx-2024-04-19.tar.gz
  )

  foreach(f IN LISTS possible_file_locations)
    if(EXISTS ${f})
      set(cppjieba_URL "${f}")
      file(TO_CMAKE_PATH "${cppjieba_URL}" cppjieba_URL)
      message(STATUS "Found local downloaded cppjieba: ${cppjieba_URL}")
      set(cppjieba_URL2)
      break()
    endif()
  endforeach()

  FetchContent_Declare(cppjieba
    URL
      ${cppjieba_URL}
      ${cppjieba_URL2}
    URL_HASH
      ${cppjieba_HASH}
  )

  FetchContent_GetProperties(cppjieba)
  if(NOT cppjieba_POPULATED)
    message(STATUS "Downloading cppjieba ${cppjieba_URL}")
    FetchContent_Populate(cppjieba)
  endif()
  message(STATUS "cppjieba is downloaded to ${cppjieba_SOURCE_DIR}")
  add_subdirectory(${cppjieba_SOURCE_DIR} ${cppjieba_BINARY_DIR} EXCLUDE_FROM_ALL)
endfunction()

download_cppjieba()
sherpa-onnx/csrc/CMakeLists.txt
@@ -132,6 +132,7 @@ list(APPEND sources

 if(SHERPA_ONNX_ENABLE_TTS)
   list(APPEND sources
+    jieba-lexicon.cc
     lexicon.cc
     offline-tts-character-frontend.cc
     offline-tts-impl.cc

@@ -184,6 +185,7 @@ endif()
 if(SHERPA_ONNX_ENABLE_TTS)
   target_link_libraries(sherpa-onnx-core piper_phonemize)
   target_link_libraries(sherpa-onnx-core fstfar fst)
+  target_link_libraries(sherpa-onnx-core cppjieba)
 endif()

 if(SHERPA_ONNX_ENABLE_CHECK)

@@ -491,6 +493,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
   )
   if(SHERPA_ONNX_ENABLE_TTS)
     list(APPEND sherpa_onnx_test_srcs
+      cppjieba-test.cc
       piper-phonemize-test.cc
     )
   endif()
sherpa-onnx/csrc/cppjieba-test.cc (new file)

// sherpa-onnx/csrc/cppjieba-test.cc
//
// Copyright (c)  2024  Xiaomi Corporation
#include <iostream>
#include <regex>  // NOLINT
#include <sstream>
#include <string>
#include <vector>

#include "cppjieba/Jieba.hpp"
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"

namespace sherpa_onnx {

// Please download the dict files from
// https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
const char *const kDictPath = "./dict/jieba.dict.utf8";
const char *const kHmmPath = "./dict/hmm_model.utf8";
const char *const kUserDictPath = "./dict/user.dict.utf8";
const char *const kIdfPath = "./dict/idf.utf8";
const char *const kStopWordPath = "./dict/stop_words.utf8";

TEST(CppJieBa, Case1) {
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                        kStopWordPath);

  std::vector<std::string> words;
  std::vector<cppjieba::Word> jiebawords;

  std::string s = "他来到了网易杭研大厦";
  std::cout << s << std::endl;
  std::cout << "[demo] Cut With HMM" << std::endl;
  jieba.Cut(s, words, true);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;
  /*
  他来到了网易杭研大厦
  [demo] Cut With HMM
  他/来到/了/网易/杭研/大厦
  */

  s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
  std::cout << s << std::endl;
  std::cout << "[demo] CutForSearch" << std::endl;
  jieba.CutForSearch(s, words);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;
  /*
  小明硕士毕业于中国科学院计算所,后在日本京都大学深造
  [demo] CutForSearch
  小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
  */

  std::cout << "[demo] Insert User Word" << std::endl;
  jieba.Cut("男默女泪", words);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;
  jieba.InsertUserWord("男默女泪");
  jieba.Cut("男默女泪", words);
  std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl;
  /*
  [demo] Insert User Word
  男默/女泪
  男默女泪
  */

  std::cout << "[demo] CutForSearch Word With Offset" << std::endl;
  jieba.CutForSearch(s, jiebawords, true);
  std::cout << jiebawords << std::endl;
  /*
  [demo] CutForSearch Word With Offset
  [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6},
   {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18},
   {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27},
   {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27},
   {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36},
   {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45},
   {"word": "后", "offset": 48}, {"word": "在", "offset": 51},
   {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60},
   {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54},
   {"word": "深造", "offset": 72}]
  */
  // see more tests at
  // https://github.com/yanyiwu/cppjieba/blob/master/test/demo.cpp
}

TEST(CppJieBa, Case2) {
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                        kStopWordPath);
  std::string s =
      "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如"
      "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感"
      "受着生命的奇迹与温柔";
  std::vector<std::string> words;
  bool is_hmm = true;
  jieba.Cut(s, words, is_hmm);
  {
    std::ostringstream os;
    std::string sep = "";
    for (const auto &w : words) {
      os << sep << w;
      sep = "_";
    }

    std::cout << os.str() << "\n";
  }
  /*
  当_夜幕降临_,_星光点点_,_伴随_着_微风_拂面_,_我_在_静谧_中_感受_着_时光_的_流转_,
  _思念_如_涟漪_荡漾_,_梦境_如_画卷_展开_,_我_与_自然_融为一体_,
  _沉静_在_这_片_宁静_的_美丽_之中_,_感受_着_生命_的_奇迹_与_温柔
  */

  s = "这里有:红的、绿的、蓝的;各种各样的颜色都有!你想要什么呢?测试.";
  std::regex punct_re(":|、|;");
  std::string s2 = std::regex_replace(s, punct_re, ",");

  std::regex punct_re2("[.]");
  s2 = std::regex_replace(s2, punct_re2, "。");

  std::regex punct_re3("[?]");
  s2 = std::regex_replace(s2, punct_re3, "?");

  std::regex punct_re4("[!]");
  s2 = std::regex_replace(s2, punct_re4, "!");
  std::cout << s << "\n" << s2 << "\n";

  words.clear();
  jieba.Cut(s2, words, is_hmm);
  {
    std::ostringstream os;
    std::string sep = "";
    for (const auto &w : words) {
      os << sep << w;
      sep = "_";
    }

    std::cout << os.str() << "\n";
  }
}

}  // namespace sherpa_onnx
sherpa-onnx/csrc/file-utils.cc
@@ -18,6 +18,7 @@ bool FileExists(const std::string &filename) {
 void AssertFileExists(const std::string &filename) {
   if (!FileExists(filename)) {
     SHERPA_ONNX_LOG(FATAL) << filename << " does not exist!";
+    exit(-1);
   }
 }

sherpa-onnx/csrc/jieba-lexicon.cc (new file)

// sherpa-onnx/csrc/jieba-lexicon.cc
//
// Copyright (c)  2022-2024  Xiaomi Corporation

#include "sherpa-onnx/csrc/jieba-lexicon.h"

#include <fstream>
#include <regex>  // NOLINT
#include <sstream>
#include <utility>

#include "cppjieba/Jieba.hpp"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/text-utils.h"

namespace sherpa_onnx {

// implemented in ./lexicon.cc
std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is);
std::vector<int32_t> ConvertTokensToIds(
    const std::unordered_map<std::string, int32_t> &token2id,
    const std::vector<std::string> &tokens);

class JiebaLexicon::Impl {
 public:
  Impl(const std::string &lexicon, const std::string &tokens,
       const std::string &dict_dir,
       const OfflineTtsVitsModelMetaData &meta_data, bool debug)
      : meta_data_(meta_data), debug_(debug) {
    std::string dict = dict_dir + "/jieba.dict.utf8";
    std::string hmm = dict_dir + "/hmm_model.utf8";
    std::string user_dict = dict_dir + "/user.dict.utf8";
    std::string idf = dict_dir + "/idf.utf8";
    std::string stop_word = dict_dir + "/stop_words.utf8";

    AssertFileExists(dict);
    AssertFileExists(hmm);
    AssertFileExists(user_dict);
    AssertFileExists(idf);
    AssertFileExists(stop_word);

    jieba_ =
        std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);

    {
      std::ifstream is(tokens);
      InitTokens(is);
    }

    {
      std::ifstream is(lexicon);
      InitLexicon(is);
    }
  }

  std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
      const std::string &text) const {
    // see
    // https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/text/mandarin.py#L244
    std::regex punct_re{":|、|;"};
    std::string s = std::regex_replace(text, punct_re, ",");

    std::regex punct_re2("[.]");
    s = std::regex_replace(s, punct_re2, "。");

    std::regex punct_re3("[?]");
    s = std::regex_replace(s, punct_re3, "?");

    std::regex punct_re4("[!]");
    s = std::regex_replace(s, punct_re4, "!");

    std::vector<std::string> words;
    bool is_hmm = true;
    // Segment the punctuation-normalized text so that the sentence-splitting
    // check below sees the full-width punctuation inserted above
    jieba_->Cut(s, words, is_hmm);

    if (debug_) {
      SHERPA_ONNX_LOGE("input text: %s", text.c_str());
      SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str());

      std::ostringstream os;
      std::string sep = "";
      for (const auto &w : words) {
        os << sep << w;
        sep = "_";
      }

      SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str());
    }

    std::vector<std::vector<int64_t>> ans;
    std::vector<int64_t> this_sentence;

    int32_t blank = token2id_.at(" ");
    for (const auto &w : words) {
      auto ids = ConvertWordToIds(w);
      if (ids.empty()) {
        SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str());
        continue;
      }

      this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
      this_sentence.push_back(blank);

      if (w == "。" || w == "!" || w == "?" || w == ",") {
        ans.push_back(std::move(this_sentence));
      }
    }  // for (const auto &w : words)

    if (!this_sentence.empty()) {
      ans.push_back(std::move(this_sentence));
    }

    return ans;
  }

 private:
  std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
    if (word2ids_.count(w)) {
      return word2ids_.at(w);
    }

    if (token2id_.count(w)) {
      return {token2id_.at(w)};
    }

    std::vector<int32_t> ans;

    std::vector<std::string> words = SplitUtf8(w);
    for (const auto &word : words) {
      if (word2ids_.count(word)) {
        auto ids = ConvertWordToIds(word);
        ans.insert(ans.end(), ids.begin(), ids.end());
      }
    }

    return ans;
  }

  void InitTokens(std::istream &is) {
    token2id_ = ReadTokens(is);

    std::vector<std::pair<std::string, std::string>> puncts = {
        {",", ","}, {".", "。"}, {"!", "!"}, {"?", "?"}};

    for (const auto &p : puncts) {
      if (token2id_.count(p.first) && !token2id_.count(p.second)) {
        token2id_[p.second] = token2id_[p.first];
      }
    }
  }

  void InitLexicon(std::istream &is) {
    std::string word;
    std::vector<std::string> token_list;
    std::string line;
    std::string phone;
    int32_t line_num = 0;

    while (std::getline(is, line)) {
      ++line_num;

      std::istringstream iss(line);

      token_list.clear();

      iss >> word;
      ToLowerCase(&word);

      if (word2ids_.count(word)) {
        SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
                         word.c_str(), line_num, line.c_str());
        continue;
      }

      while (iss >> phone) {
        token_list.push_back(std::move(phone));
      }

      std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
      if (ids.empty()) {
        continue;
      }

      word2ids_.insert({std::move(word), std::move(ids)});
    }
  }

 private:
  // lexicon.txt is saved in word2ids_
  std::unordered_map<std::string, std::vector<int32_t>> word2ids_;

  // tokens.txt is saved in token2id_
  std::unordered_map<std::string, int32_t> token2id_;

  OfflineTtsVitsModelMetaData meta_data_;

  std::unique_ptr<cppjieba::Jieba> jieba_;
  bool debug_ = false;
};

JiebaLexicon::~JiebaLexicon() = default;

JiebaLexicon::JiebaLexicon(const std::string &lexicon,
                           const std::string &tokens,
                           const std::string &dict_dir,
                           const OfflineTtsVitsModelMetaData &meta_data,
                           bool debug)
    : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, meta_data,
                                   debug)) {}

std::vector<std::vector<int64_t>> JiebaLexicon::ConvertTextToTokenIds(
    const std::string &text, const std::string &unused_voice /*= ""*/) const {
  return impl_->ConvertTextToTokenIds(text);
}

}  // namespace sherpa_onnx
sherpa-onnx/csrc/jieba-lexicon.h (new file)

// sherpa-onnx/csrc/jieba-lexicon.h
//
// Copyright (c)  2022-2024  Xiaomi Corporation

#ifndef SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_
#define SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif

#include "sherpa-onnx/csrc/offline-tts-frontend.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"

namespace sherpa_onnx {

class JiebaLexicon : public OfflineTtsFrontend {
 public:
  ~JiebaLexicon() override;
  JiebaLexicon(const std::string &lexicon, const std::string &tokens,
               const std::string &dict_dir,
               const OfflineTtsVitsModelMetaData &meta_data, bool debug);

#if __ANDROID_API__ >= 9
  JiebaLexicon(AAssetManager *mgr, const std::string &lexicon,
               const std::string &tokens, const std::string &dict_dir,
               const OfflineTtsVitsModelMetaData &meta_data);
#endif

  std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
      const std::string &text,
      const std::string &unused_voice = "") const override;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_
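
Taken together, jieba-lexicon.{h,cc} provide a frontend that segments Chinese text with cppjieba and maps each word to token IDs through the lexicon. Below is a minimal usage sketch; it is not part of the commit, and every file path in it is a hypothetical placeholder:

// usage sketch only -- all paths below are placeholders, not files from this commit
#include <cstdio>

#include "sherpa-onnx/csrc/jieba-lexicon.h"

int main() {
  sherpa_onnx::OfflineTtsVitsModelMetaData meta_data;  // defaults suffice here

  // lexicon.txt and tokens.txt come with the TTS model; ./dict must hold the
  // five cppjieba files checked by AssertFileExists() in the Impl constructor.
  sherpa_onnx::JiebaLexicon lexicon("./lexicon.txt", "./tokens.txt", "./dict",
                                    meta_data, /*debug=*/true);

  // Returns one vector of token IDs per sentence; sentences are split at
  // the full-width punctuation marks 。 ! ? ,
  auto sentences = lexicon.ConvertTextToTokenIds("他来到了网易杭研大厦。");
  for (const auto &sentence : sentences) {
    for (auto id : sentence) {
      printf("%d ", static_cast<int>(id));
    }
    printf("\n");
  }

  return 0;
}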
sherpa-onnx/csrc/lexicon.cc
@@ -76,7 +76,7 @@ static std::vector<std::string> ProcessHeteronyms(

 // Note: We don't use SymbolTable here since tokens may contain a blank
 // in the first column
-static std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) {
+std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) {
   std::unordered_map<std::string, int32_t> token2id;

   std::string line;

@@ -113,7 +113,7 @@ static std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) {
   return token2id;
 }

-static std::vector<int32_t> ConvertTokensToIds(
+std::vector<int32_t> ConvertTokensToIds(
     const std::unordered_map<std::string, int32_t> &token2id,
     const std::vector<std::string> &tokens) {
   std::vector<int32_t> ids;
sherpa-onnx/csrc/offline-tts-vits-impl.h
@@ -19,6 +19,7 @@
 #include "fst/extensions/far/far.h"
 #include "kaldifst/csrc/kaldi-fst-io.h"
 #include "kaldifst/csrc/text-normalizer.h"
+#include "sherpa-onnx/csrc/jieba-lexicon.h"
 #include "sherpa-onnx/csrc/lexicon.h"
 #include "sherpa-onnx/csrc/macros.h"
 #include "sherpa-onnx/csrc/offline-tts-character-frontend.h"

@@ -290,9 +291,26 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
   void InitFrontend() {
     const auto &meta_data = model_->GetMetaData();

+    if (meta_data.jieba && config_.model.vits.dict_dir.empty()) {
+      SHERPA_ONNX_LOGE(
+          "Please provide --vits-dict-dir for Chinese TTS models using jieba");
+      exit(-1);
+    }
+
+    if (!meta_data.jieba && !config_.model.vits.dict_dir.empty()) {
+      SHERPA_ONNX_LOGE(
+          "Current model is not using jieba but you provided --vits-dict-dir");
+      exit(-1);
+    }
+
     if (meta_data.frontend == "characters") {
       frontend_ = std::make_unique<OfflineTtsCharacterFrontend>(
           config_.model.vits.tokens, meta_data);
+    } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) {
+      frontend_ = std::make_unique<JiebaLexicon>(
+          config_.model.vits.lexicon, config_.model.vits.tokens,
+          config_.model.vits.dict_dir, model_->GetMetaData(),
+          config_.model.debug);
     } else if ((meta_data.is_piper || meta_data.is_coqui ||
                 meta_data.is_icefall) &&
                !config_.model.vits.data_dir.empty()) {
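
The dispatch above can be hard to read inside the diff, so here is a condensed paraphrase of the new selection rules as a standalone sketch (simplified names; this is not code from the commit): a jieba model must be given --vits-dict-dir, the flag is rejected for any other model, and JiebaLexicon is chosen after the "characters" frontend check.

// Paraphrase of the InitFrontend() dispatch -- illustration only.
#include <cstdlib>
#include <string>

enum class Frontend { kCharacters, kJieba, kPiperPhonemize, kLexicon };

Frontend SelectFrontend(bool jieba, const std::string &frontend,
                        bool is_piper_coqui_or_icefall, bool has_dict_dir,
                        bool has_data_dir) {
  if (jieba != has_dict_dir) {
    // Either a jieba model without --vits-dict-dir, or --vits-dict-dir
    // given for a model that does not use jieba: both are fatal errors.
    std::exit(-1);
  }
  if (frontend == "characters") return Frontend::kCharacters;
  if (jieba && has_dict_dir) return Frontend::kJieba;
  if (is_piper_coqui_or_icefall && has_data_dir) {
    return Frontend::kPiperPhonemize;
  }
  return Frontend::kLexicon;
}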
sherpa-onnx/csrc/offline-tts-vits-model-config.cc
@@ -4,6 +4,8 @@

 #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"

+#include <vector>
+
 #include "sherpa-onnx/csrc/file-utils.h"
 #include "sherpa-onnx/csrc/macros.h"

@@ -16,6 +18,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
   po->Register("vits-data-dir", &data_dir,
                "Path to the directory containing dict for espeak-ng. If it is "
                "given, --vits-lexicon is ignored.");
+  po->Register("vits-dict-dir", &dict_dir,
+               "Path to the directory containing dict for jieba. Used only for "
+               "Chinese TTS models using jieba");
   po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models");
   po->Register("vits-noise-scale-w", &noise_scale_w,
                "noise_scale_w for VITS models");
@@ -64,12 +69,24 @@ bool OfflineTtsVitsModelConfig::Validate() const {
     }

     if (!FileExists(data_dir + "/intonations")) {
-      SHERPA_ONNX_LOGE("%s/intonations does not exist. Skipping test",
-                       data_dir.c_str());
+      SHERPA_ONNX_LOGE("%s/intonations does not exist.", data_dir.c_str());
       return false;
     }
   }

+  if (!dict_dir.empty()) {
+    std::vector<std::string> required_files = {
+        "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8",
+        "idf.utf8",        "stop_words.utf8",
+    };
+
+    for (const auto &f : required_files) {
+      if (!FileExists(dict_dir + "/" + f)) {
+        SHERPA_ONNX_LOGE("%s/%s does not exist.", dict_dir.c_str(), f.c_str());
+        return false;
+      }
+    }
+  }
+
   return true;
 }

@@ -81,6 +98,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const {
   os << "lexicon=\"" << lexicon << "\", ";
   os << "tokens=\"" << tokens << "\", ";
   os << "data_dir=\"" << data_dir << "\", ";
+  os << "dict_dir=\"" << dict_dir << "\", ";
   os << "noise_scale=" << noise_scale << ", ";
   os << "noise_scale_w=" << noise_scale_w << ", ";
   os << "length_scale=" << length_scale << ")";
sherpa-onnx/csrc/offline-tts-vits-model-config.h
@@ -20,6 +20,9 @@ struct OfflineTtsVitsModelConfig {
   // data_dir is for piper-phonemize, which uses espeak-ng
   std::string data_dir;

+  // Used for Chinese TTS models using jieba
+  std::string dict_dir;
+
   float noise_scale = 0.667;
   float noise_scale_w = 0.8;
   float length_scale = 1;

@@ -33,12 +36,14 @@ struct OfflineTtsVitsModelConfig {
                             const std::string &lexicon,
                             const std::string &tokens,
                             const std::string &data_dir,
+                            const std::string &dict_dir,
                             float noise_scale = 0.667,
                             float noise_scale_w = 0.8, float length_scale = 1)
       : model(model),
         lexicon(lexicon),
         tokens(tokens),
         data_dir(data_dir),
+        dict_dir(dict_dir),
         noise_scale(noise_scale),
         noise_scale_w(noise_scale_w),
         length_scale(length_scale) {}
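
Because dict_dir is inserted between data_dir and noise_scale, positional callers of this constructor need updating. A hedged sketch of the new call shape follows (the file names are placeholders for a real jieba-based VITS model):

// illustration only -- paths are placeholders
#include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"

int main() {
  sherpa_onnx::OfflineTtsVitsModelConfig config(
      "./vits-zh.onnx",  // model
      "./lexicon.txt",   // lexicon
      "./tokens.txt",    // tokens
      "",                // data_dir: espeak-ng data, unused for jieba models
      "./dict",          // dict_dir: the new argument, cppjieba dict files
      0.667f,            // noise_scale
      0.8f,              // noise_scale_w
      1.0f);             // length_scale

  // Validate() now also checks the five dict files under dict_dir.
  return config.Validate() ? 0 : 1;
}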
sherpa-onnx/csrc/offline-tts-vits-model-metadata.h
@@ -22,6 +22,10 @@ struct OfflineTtsVitsModelMetaData {
   bool is_coqui = false;
   bool is_icefall = false;

+  // for Chinese TTS models from
+  // https://github.com/Plachtaa/VITS-fast-fine-tuning
+  int32_t jieba = 0;
+
   // the following options are for models from coqui-ai/TTS
   int32_t blank_id = 0;
   int32_t bos_id = 0;
sherpa-onnx/csrc/offline-tts-vits-model.cc
@@ -93,6 +93,7 @@ class OfflineTtsVitsModel::Impl {
     SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.frontend, "frontend",
                                                 "");

+    SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.jieba, "jieba", 0);
     SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.blank_id, "blank_id", 0);
     SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0);
     SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0);
sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc
@@ -16,15 +16,17 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) {
   py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig")
       .def(py::init<>())
       .def(py::init<const std::string &, const std::string &,
-                    const std::string &, const std::string &, float, float,
-                    float>(),
+                    const std::string &, const std::string &,
+                    const std::string &, float, float, float>(),
            py::arg("model"), py::arg("lexicon"), py::arg("tokens"),
-           py::arg("data_dir") = "", py::arg("noise_scale") = 0.667,
-           py::arg("noise_scale_w") = 0.8, py::arg("length_scale") = 1.0)
+           py::arg("data_dir") = "", py::arg("dict_dir") = "",
+           py::arg("noise_scale") = 0.667, py::arg("noise_scale_w") = 0.8,
+           py::arg("length_scale") = 1.0)
       .def_readwrite("model", &PyClass::model)
       .def_readwrite("lexicon", &PyClass::lexicon)
       .def_readwrite("tokens", &PyClass::tokens)
       .def_readwrite("data_dir", &PyClass::data_dir)
+      .def_readwrite("dict_dir", &PyClass::dict_dir)
       .def_readwrite("noise_scale", &PyClass::noise_scale)
       .def_readwrite("noise_scale_w", &PyClass::noise_scale_w)
       .def_readwrite("length_scale", &PyClass::length_scale)