正在显示
14 个修改的文件
包含
513 行增加
和
8 行删除
| @@ -260,6 +260,7 @@ if(SHERPA_ONNX_ENABLE_TTS) | @@ -260,6 +260,7 @@ if(SHERPA_ONNX_ENABLE_TTS) | ||
| 260 | set(ESPEAK_NG_DIR ${espeak_ng_SOURCE_DIR}) | 260 | set(ESPEAK_NG_DIR ${espeak_ng_SOURCE_DIR}) |
| 261 | message(STATUS "ESPEAK_NG_DIR: ${ESPEAK_NG_DIR}") | 261 | message(STATUS "ESPEAK_NG_DIR: ${ESPEAK_NG_DIR}") |
| 262 | include(piper-phonemize) | 262 | include(piper-phonemize) |
| 263 | + include(cppjieba) # For Chinese TTS. It is a header-only C++ library | ||
| 263 | endif() | 264 | endif() |
| 264 | 265 | ||
| 265 | add_subdirectory(sherpa-onnx) | 266 | add_subdirectory(sherpa-onnx) |
cmake/cppjieba.cmake
0 → 100644
| 1 | +function(download_cppjieba) | ||
| 2 | + include(FetchContent) | ||
| 3 | + | ||
| 4 | + set(cppjieba_URL "https://github.com/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz") | ||
| 5 | + set(cppjieba_URL2 "https://hub.nuaa.cf/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz") | ||
| 6 | + set(cppjieba_HASH "SHA256=03e5264687f0efaef05487a07d49c3f4c0f743347bfbf825df4b30cc75ac5288") | ||
| 7 | + | ||
| 8 | + # If you don't have access to the Internet, | ||
| 9 | + # please pre-download cppjieba | ||
| 10 | + set(possible_file_locations | ||
| 11 | + $ENV{HOME}/Downloads/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
| 12 | + ${CMAKE_SOURCE_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
| 13 | + ${CMAKE_BINARY_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
| 14 | + /tmp/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
| 15 | + /star-fj/fangjun/download/github/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
| 16 | + ) | ||
| 17 | + | ||
| 18 | + foreach(f IN LISTS possible_file_locations) | ||
| 19 | + if(EXISTS ${f}) | ||
| 20 | + set(cppjieba_URL "${f}") | ||
| 21 | + file(TO_CMAKE_PATH "${cppjieba_URL}" cppjieba_URL) | ||
| 22 | + message(STATUS "Found local downloaded cppjieba: ${cppjieba_URL}") | ||
| 23 | + set(cppjieba_URL2) | ||
| 24 | + break() | ||
| 25 | + endif() | ||
| 26 | + endforeach() | ||
| 27 | + | ||
| 28 | + FetchContent_Declare(cppjieba | ||
| 29 | + URL | ||
| 30 | + ${cppjieba_URL} | ||
| 31 | + ${cppjieba_URL2} | ||
| 32 | + URL_HASH | ||
| 33 | + ${cppjieba_HASH} | ||
| 34 | + ) | ||
| 35 | + | ||
| 36 | + FetchContent_GetProperties(cppjieba) | ||
| 37 | + if(NOT cppjieba_POPULATED) | ||
| 38 | + message(STATUS "Downloading cppjieba ${cppjieba_URL}") | ||
| 39 | + FetchContent_Populate(cppjieba) | ||
| 40 | + endif() | ||
| 41 | + message(STATUS "cppjieba is downloaded to ${cppjieba_SOURCE_DIR}") | ||
| 42 | + add_subdirectory(${cppjieba_SOURCE_DIR} ${cppjieba_BINARY_DIR} EXCLUDE_FROM_ALL) | ||
| 43 | +endfunction() | ||
| 44 | + | ||
| 45 | +download_cppjieba() |
| @@ -132,6 +132,7 @@ list(APPEND sources | @@ -132,6 +132,7 @@ list(APPEND sources | ||
| 132 | 132 | ||
| 133 | if(SHERPA_ONNX_ENABLE_TTS) | 133 | if(SHERPA_ONNX_ENABLE_TTS) |
| 134 | list(APPEND sources | 134 | list(APPEND sources |
| 135 | + jieba-lexicon.cc | ||
| 135 | lexicon.cc | 136 | lexicon.cc |
| 136 | offline-tts-character-frontend.cc | 137 | offline-tts-character-frontend.cc |
| 137 | offline-tts-impl.cc | 138 | offline-tts-impl.cc |
| @@ -184,6 +185,7 @@ endif() | @@ -184,6 +185,7 @@ endif() | ||
| 184 | if(SHERPA_ONNX_ENABLE_TTS) | 185 | if(SHERPA_ONNX_ENABLE_TTS) |
| 185 | target_link_libraries(sherpa-onnx-core piper_phonemize) | 186 | target_link_libraries(sherpa-onnx-core piper_phonemize) |
| 186 | target_link_libraries(sherpa-onnx-core fstfar fst) | 187 | target_link_libraries(sherpa-onnx-core fstfar fst) |
| 188 | + target_link_libraries(sherpa-onnx-core cppjieba) | ||
| 187 | endif() | 189 | endif() |
| 188 | 190 | ||
| 189 | if(SHERPA_ONNX_ENABLE_CHECK) | 191 | if(SHERPA_ONNX_ENABLE_CHECK) |
| @@ -491,6 +493,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) | @@ -491,6 +493,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) | ||
| 491 | ) | 493 | ) |
| 492 | if(SHERPA_ONNX_ENABLE_TTS) | 494 | if(SHERPA_ONNX_ENABLE_TTS) |
| 493 | list(APPEND sherpa_onnx_test_srcs | 495 | list(APPEND sherpa_onnx_test_srcs |
| 496 | + cppjieba-test.cc | ||
| 494 | piper-phonemize-test.cc | 497 | piper-phonemize-test.cc |
| 495 | ) | 498 | ) |
| 496 | endif() | 499 | endif() |
sherpa-onnx/csrc/cppjieba-test.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/cppjieba-test.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | +#include <iostream> | ||
| 5 | +#include <regex> // NOLINT | ||
| 6 | +#include <string> | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "cppjieba/Jieba.hpp" | ||
| 10 | +#include "gtest/gtest.h" | ||
| 11 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 12 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 13 | + | ||
| 14 | +namespace sherpa_onnx { | ||
| 15 | + | ||
| 16 | +// Please download dict files from | ||
| 17 | +// https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2 | ||
| 18 | +const char *const kDictPath = "./dict/jieba.dict.utf8"; | ||
| 19 | +const char *const kHmmPath = "./dict/hmm_model.utf8"; | ||
| 20 | +const char *const kUserDictPath = "./dict/user.dict.utf8"; | ||
| 21 | +const char *const kIdfPath = "./dict/idf.utf8"; | ||
| 22 | +const char *const kStopWordPath = "./dict/stop_words.utf8"; | ||
| 23 | + | ||
| 24 | +TEST(CppJieBa, Case1) { | ||
| 25 | + if (!FileExists(kDictPath)) { | ||
| 26 | + SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath); | ||
| 27 | + return; | ||
| 28 | + } | ||
| 29 | + | ||
| 30 | + cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath, | ||
| 31 | + kStopWordPath); | ||
| 32 | + | ||
| 33 | + std::vector<std::string> words; | ||
| 34 | + std::vector<cppjieba::Word> jiebawords; | ||
| 35 | + | ||
| 36 | + std::string s = "他来到了网易杭研大厦"; | ||
| 37 | + std::cout << s << std::endl; | ||
| 38 | + std::cout << "[demo] Cut With HMM" << std::endl; | ||
| 39 | + jieba.Cut(s, words, true); | ||
| 40 | + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
| 41 | + /* | ||
| 42 | + 他来到了网易杭研大厦 | ||
| 43 | + [demo] Cut With HMM | ||
| 44 | + 他/来到/了/网易/杭研/大厦 | ||
| 45 | + */ | ||
| 46 | + s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; | ||
| 47 | + std::cout << s << std::endl; | ||
| 48 | + std::cout << "[demo] CutForSearch" << std::endl; | ||
| 49 | + jieba.CutForSearch(s, words); | ||
| 50 | + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
| 51 | + /* | ||
| 52 | + 小明硕士毕业于中国科学院计算所,后在日本京都大学深造 | ||
| 53 | + [demo] CutForSearch | ||
| 54 | + 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造 | ||
| 55 | + */ | ||
| 56 | + std::cout << "[demo] Insert User Word" << std::endl; | ||
| 57 | + jieba.Cut("男默女泪", words); | ||
| 58 | + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
| 59 | + jieba.InsertUserWord("男默女泪"); | ||
| 60 | + jieba.Cut("男默女泪", words); | ||
| 61 | + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
| 62 | + /* | ||
| 63 | + [demo] Insert User Word | ||
| 64 | + 男默/女泪 | ||
| 65 | + 男默女泪 | ||
| 66 | + */ | ||
| 67 | + std::cout << "[demo] CutForSearch Word With Offset" << std::endl; | ||
| 68 | + jieba.CutForSearch(s, jiebawords, true); | ||
| 69 | + std::cout << jiebawords << std::endl; | ||
| 70 | + /* | ||
| 71 | +[demo] CutForSearch Word With Offset | ||
| 72 | +[{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", | ||
| 73 | +"offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, | ||
| 74 | +{"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": | ||
| 75 | +"科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", | ||
| 76 | +"offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, | ||
| 77 | +{"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", | ||
| 78 | +"offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, | ||
| 79 | +{"word": "日本京都大学", "offset": 54}, {"word": " 深造", "offset": 72}] | ||
| 80 | + */ | ||
| 81 | + // see more tests at | ||
| 82 | + // https://github.com/yanyiwu/cppjieba/blob/master/test/demo.cpp | ||
| 83 | +} | ||
| 84 | + | ||
| 85 | +TEST(CppJieBa, Case2) { | ||
| 86 | + if (!FileExists(kDictPath)) { | ||
| 87 | + SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath); | ||
| 88 | + return; | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath, | ||
| 92 | + kStopWordPath); | ||
| 93 | + std::string s = | ||
| 94 | + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如" | ||
| 95 | + "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感" | ||
| 96 | + "受着生命的奇迹与温柔"; | ||
| 97 | + std::vector<std::string> words; | ||
| 98 | + bool is_hmm = true; | ||
| 99 | + jieba.Cut(s, words, is_hmm); | ||
| 100 | + { | ||
| 101 | + std::ostringstream os; | ||
| 102 | + std::string sep = ""; | ||
| 103 | + for (const auto &w : words) { | ||
| 104 | + os << sep << w; | ||
| 105 | + sep = "_"; | ||
| 106 | + } | ||
| 107 | + | ||
| 108 | + std::cout << os.str() << "\n"; | ||
| 109 | + } | ||
| 110 | + /* | ||
| 111 | +当_夜幕降临_,_星光点点_,_伴随_着_微风_拂面_, | ||
| 112 | +_我_在_静谧_中_感受_着_时光_的_流转_, | ||
| 113 | +_思念_如_涟漪_荡漾_,_梦境_如_画卷_展开_,_我_与_自然_融为一体_, | ||
| 114 | +_沉静_在_这_片_宁静_的_美丽_之中_,_感受_着_生命_的_奇迹_与_温柔 | ||
| 115 | + */ | ||
| 116 | + s = "这里有:红的、绿的、蓝的;各种各样的颜色都有!你想要什么呢?测试."; | ||
| 117 | + std::regex punct_re(":|、|;"); | ||
| 118 | + std::string s2 = std::regex_replace(s, punct_re, ","); | ||
| 119 | + | ||
| 120 | + std::regex punct_re2("[.]"); | ||
| 121 | + s2 = std::regex_replace(s2, punct_re2, "。"); | ||
| 122 | + | ||
| 123 | + std::regex punct_re3("[?]"); | ||
| 124 | + s2 = std::regex_replace(s2, punct_re3, "?"); | ||
| 125 | + | ||
| 126 | + std::regex punct_re4("[!]"); | ||
| 127 | + s2 = std::regex_replace(s2, punct_re4, "!"); | ||
| 128 | + std::cout << s << "\n" << s2 << "\n"; | ||
| 129 | + | ||
| 130 | + words.clear(); | ||
| 131 | + jieba.Cut(s2, words, is_hmm); | ||
| 132 | + { | ||
| 133 | + std::ostringstream os; | ||
| 134 | + std::string sep = ""; | ||
| 135 | + for (const auto &w : words) { | ||
| 136 | + os << sep << w; | ||
| 137 | + sep = "_"; | ||
| 138 | + } | ||
| 139 | + | ||
| 140 | + std::cout << os.str() << "\n"; | ||
| 141 | + } | ||
| 142 | +} | ||
| 143 | + | ||
| 144 | +} // namespace sherpa_onnx |
| @@ -18,6 +18,7 @@ bool FileExists(const std::string &filename) { | @@ -18,6 +18,7 @@ bool FileExists(const std::string &filename) { | ||
| 18 | void AssertFileExists(const std::string &filename) { | 18 | void AssertFileExists(const std::string &filename) { |
| 19 | if (!FileExists(filename)) { | 19 | if (!FileExists(filename)) { |
| 20 | SHERPA_ONNX_LOG(FATAL) << filename << " does not exist!"; | 20 | SHERPA_ONNX_LOG(FATAL) << filename << " does not exist!"; |
| 21 | + exit(-1); | ||
| 21 | } | 22 | } |
| 22 | } | 23 | } |
| 23 | 24 |
sherpa-onnx/csrc/jieba-lexicon.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/jieba-lexicon.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/jieba-lexicon.h" | ||
| 6 | + | ||
| 7 | +#include <fstream> | ||
| 8 | +#include <regex> // NOLINT | ||
| 9 | +#include <utility> | ||
| 10 | + | ||
| 11 | +#include "cppjieba/Jieba.hpp" | ||
| 12 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 13 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 14 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 15 | + | ||
| 16 | +namespace sherpa_onnx { | ||
| 17 | + | ||
| 18 | +// implemented in ./lexicon.cc | ||
| 19 | +std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is); | ||
| 20 | +std::vector<int32_t> ConvertTokensToIds( | ||
| 21 | + const std::unordered_map<std::string, int32_t> &token2id, | ||
| 22 | + const std::vector<std::string> &tokens); | ||
| 23 | + | ||
| 24 | +class JiebaLexicon::Impl { | ||
| 25 | + public: | ||
| 26 | + Impl(const std::string &lexicon, const std::string &tokens, | ||
| 27 | + const std::string &dict_dir, | ||
| 28 | + const OfflineTtsVitsModelMetaData &meta_data, bool debug) | ||
| 29 | + : meta_data_(meta_data), debug_(debug) { | ||
| 30 | + std::string dict = dict_dir + "/jieba.dict.utf8"; | ||
| 31 | + std::string hmm = dict_dir + "/hmm_model.utf8"; | ||
| 32 | + std::string user_dict = dict_dir + "/user.dict.utf8"; | ||
| 33 | + std::string idf = dict_dir + "/idf.utf8"; | ||
| 34 | + std::string stop_word = dict_dir + "/stop_words.utf8"; | ||
| 35 | + | ||
| 36 | + AssertFileExists(dict); | ||
| 37 | + AssertFileExists(hmm); | ||
| 38 | + AssertFileExists(user_dict); | ||
| 39 | + AssertFileExists(idf); | ||
| 40 | + AssertFileExists(stop_word); | ||
| 41 | + | ||
| 42 | + jieba_ = | ||
| 43 | + std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word); | ||
| 44 | + | ||
| 45 | + { | ||
| 46 | + std::ifstream is(tokens); | ||
| 47 | + InitTokens(is); | ||
| 48 | + } | ||
| 49 | + | ||
| 50 | + { | ||
| 51 | + std::ifstream is(lexicon); | ||
| 52 | + InitLexicon(is); | ||
| 53 | + } | ||
| 54 | + } | ||
| 55 | + | ||
| 56 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIds( | ||
| 57 | + const std::string &text) const { | ||
| 58 | + // see | ||
| 59 | + // https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/text/mandarin.py#L244 | ||
| 60 | + std::regex punct_re{":|、|;"}; | ||
| 61 | + std::string s = std::regex_replace(text, punct_re, ","); | ||
| 62 | + | ||
| 63 | + std::regex punct_re2("[.]"); | ||
| 64 | + s = std::regex_replace(s, punct_re2, "。"); | ||
| 65 | + | ||
| 66 | + std::regex punct_re3("[?]"); | ||
| 67 | + s = std::regex_replace(s, punct_re3, "?"); | ||
| 68 | + | ||
| 69 | + std::regex punct_re4("[!]"); | ||
| 70 | + s = std::regex_replace(s, punct_re4, "!"); | ||
| 71 | + | ||
| 72 | + std::vector<std::string> words; | ||
| 73 | + bool is_hmm = true; | ||
| 74 | + jieba_->Cut(text, words, is_hmm); | ||
| 75 | + | ||
| 76 | + if (debug_) { | ||
| 77 | + SHERPA_ONNX_LOGE("input text: %s", text.c_str()); | ||
| 78 | + SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str()); | ||
| 79 | + | ||
| 80 | + std::ostringstream os; | ||
| 81 | + std::string sep = ""; | ||
| 82 | + for (const auto &w : words) { | ||
| 83 | + os << sep << w; | ||
| 84 | + sep = "_"; | ||
| 85 | + } | ||
| 86 | + | ||
| 87 | + SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str()); | ||
| 88 | + } | ||
| 89 | + | ||
| 90 | + std::vector<std::vector<int64_t>> ans; | ||
| 91 | + std::vector<int64_t> this_sentence; | ||
| 92 | + | ||
| 93 | + int32_t blank = token2id_.at(" "); | ||
| 94 | + for (const auto &w : words) { | ||
| 95 | + auto ids = ConvertWordToIds(w); | ||
| 96 | + if (ids.empty()) { | ||
| 97 | + SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str()); | ||
| 98 | + continue; | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); | ||
| 102 | + this_sentence.push_back(blank); | ||
| 103 | + | ||
| 104 | + if (w == "。" || w == "!" || w == "?" || w == ",") { | ||
| 105 | + ans.push_back(std::move(this_sentence)); | ||
| 106 | + } | ||
| 107 | + } // for (const auto &w : words) | ||
| 108 | + | ||
| 109 | + if (!this_sentence.empty()) { | ||
| 110 | + ans.push_back(std::move(this_sentence)); | ||
| 111 | + } | ||
| 112 | + | ||
| 113 | + return ans; | ||
| 114 | + } | ||
| 115 | + | ||
| 116 | + private: | ||
| 117 | + std::vector<int32_t> ConvertWordToIds(const std::string &w) const { | ||
| 118 | + if (word2ids_.count(w)) { | ||
| 119 | + return word2ids_.at(w); | ||
| 120 | + } | ||
| 121 | + | ||
| 122 | + if (token2id_.count(w)) { | ||
| 123 | + return {token2id_.at(w)}; | ||
| 124 | + } | ||
| 125 | + | ||
| 126 | + std::vector<int32_t> ans; | ||
| 127 | + | ||
| 128 | + std::vector<std::string> words = SplitUtf8(w); | ||
| 129 | + for (const auto &word : words) { | ||
| 130 | + if (word2ids_.count(word)) { | ||
| 131 | + auto ids = ConvertWordToIds(word); | ||
| 132 | + ans.insert(ans.end(), ids.begin(), ids.end()); | ||
| 133 | + } | ||
| 134 | + } | ||
| 135 | + | ||
| 136 | + return ans; | ||
| 137 | + } | ||
| 138 | + | ||
| 139 | + void InitTokens(std::istream &is) { | ||
| 140 | + token2id_ = ReadTokens(is); | ||
| 141 | + | ||
| 142 | + std::vector<std::pair<std::string, std::string>> puncts = { | ||
| 143 | + {",", ","}, {".", "。"}, {"!", "!"}, {"?", "?"}}; | ||
| 144 | + | ||
| 145 | + for (const auto &p : puncts) { | ||
| 146 | + if (token2id_.count(p.first) && !token2id_.count(p.second)) { | ||
| 147 | + token2id_[p.second] = token2id_[p.first]; | ||
| 148 | + } | ||
| 149 | + } | ||
| 150 | + } | ||
| 151 | + | ||
| 152 | + void InitLexicon(std::istream &is) { | ||
| 153 | + std::string word; | ||
| 154 | + std::vector<std::string> token_list; | ||
| 155 | + std::string line; | ||
| 156 | + std::string phone; | ||
| 157 | + int32_t line_num = 0; | ||
| 158 | + | ||
| 159 | + while (std::getline(is, line)) { | ||
| 160 | + ++line_num; | ||
| 161 | + | ||
| 162 | + std::istringstream iss(line); | ||
| 163 | + | ||
| 164 | + token_list.clear(); | ||
| 165 | + | ||
| 166 | + iss >> word; | ||
| 167 | + ToLowerCase(&word); | ||
| 168 | + | ||
| 169 | + if (word2ids_.count(word)) { | ||
| 170 | + SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.", | ||
| 171 | + word.c_str(), line_num, line.c_str()); | ||
| 172 | + continue; | ||
| 173 | + } | ||
| 174 | + | ||
| 175 | + while (iss >> phone) { | ||
| 176 | + token_list.push_back(std::move(phone)); | ||
| 177 | + } | ||
| 178 | + | ||
| 179 | + std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list); | ||
| 180 | + if (ids.empty()) { | ||
| 181 | + continue; | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + word2ids_.insert({std::move(word), std::move(ids)}); | ||
| 185 | + } | ||
| 186 | + } | ||
| 187 | + | ||
| 188 | + private: | ||
| 189 | + // lexicon.txt is saved in word2ids_ | ||
| 190 | + std::unordered_map<std::string, std::vector<int32_t>> word2ids_; | ||
| 191 | + | ||
| 192 | + // tokens.txt is saved in token2id_ | ||
| 193 | + std::unordered_map<std::string, int32_t> token2id_; | ||
| 194 | + | ||
| 195 | + OfflineTtsVitsModelMetaData meta_data_; | ||
| 196 | + | ||
| 197 | + std::unique_ptr<cppjieba::Jieba> jieba_; | ||
| 198 | + bool debug_ = false; | ||
| 199 | +}; | ||
| 200 | + | ||
| 201 | +JiebaLexicon::~JiebaLexicon() = default; | ||
| 202 | + | ||
| 203 | +JiebaLexicon::JiebaLexicon(const std::string &lexicon, | ||
| 204 | + const std::string &tokens, | ||
| 205 | + const std::string &dict_dir, | ||
| 206 | + const OfflineTtsVitsModelMetaData &meta_data, | ||
| 207 | + bool debug) | ||
| 208 | + : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, meta_data, | ||
| 209 | + debug)) {} | ||
| 210 | + | ||
| 211 | +std::vector<std::vector<int64_t>> JiebaLexicon::ConvertTextToTokenIds( | ||
| 212 | + const std::string &text, const std::string &unused_voice /*= ""*/) const { | ||
| 213 | + return impl_->ConvertTextToTokenIds(text); | ||
| 214 | +} | ||
| 215 | + | ||
| 216 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/jieba-lexicon.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/jieba-lexicon.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_ | ||
| 7 | + | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | +#include <unordered_map> | ||
| 11 | +#include <vector> | ||
| 12 | + | ||
| 13 | +#if __ANDROID_API__ >= 9 | ||
| 14 | +#include "android/asset_manager.h" | ||
| 15 | +#include "android/asset_manager_jni.h" | ||
| 16 | +#endif | ||
| 17 | + | ||
| 18 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 19 | +#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | ||
| 20 | + | ||
| 21 | +namespace sherpa_onnx { | ||
| 22 | + | ||
| 23 | +class JiebaLexicon : public OfflineTtsFrontend { | ||
| 24 | + public: | ||
| 25 | + ~JiebaLexicon() override; | ||
| 26 | + JiebaLexicon(const std::string &lexicon, const std::string &tokens, | ||
| 27 | + const std::string &dict_dir, | ||
| 28 | + const OfflineTtsVitsModelMetaData &meta_data, bool debug); | ||
| 29 | + | ||
| 30 | +#if __ANDROID_API__ >= 9 | ||
| 31 | + JiebaLexicon(AAssetManager *mgr, const std::string &lexicon, | ||
| 32 | + const std::string &tokens, const std::string &dict_dir, | ||
| 33 | + const OfflineTtsVitsModelMetaData &meta_data); | ||
| 34 | +#endif | ||
| 35 | + | ||
| 36 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIds( | ||
| 37 | + const std::string &text, | ||
| 38 | + const std::string &unused_voice = "") const override; | ||
| 39 | + | ||
| 40 | + private: | ||
| 41 | + class Impl; | ||
| 42 | + std::unique_ptr<Impl> impl_; | ||
| 43 | +}; | ||
| 44 | + | ||
| 45 | +} // namespace sherpa_onnx | ||
| 46 | + | ||
| 47 | +#endif // SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_ |
| @@ -76,7 +76,7 @@ static std::vector<std::string> ProcessHeteronyms( | @@ -76,7 +76,7 @@ static std::vector<std::string> ProcessHeteronyms( | ||
| 76 | 76 | ||
| 77 | // Note: We don't use SymbolTable here since tokens may contain a blank | 77 | // Note: We don't use SymbolTable here since tokens may contain a blank |
| 78 | // in the first column | 78 | // in the first column |
| 79 | -static std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) { | 79 | +std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) { |
| 80 | std::unordered_map<std::string, int32_t> token2id; | 80 | std::unordered_map<std::string, int32_t> token2id; |
| 81 | 81 | ||
| 82 | std::string line; | 82 | std::string line; |
| @@ -113,7 +113,7 @@ static std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) { | @@ -113,7 +113,7 @@ static std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) { | ||
| 113 | return token2id; | 113 | return token2id; |
| 114 | } | 114 | } |
| 115 | 115 | ||
| 116 | -static std::vector<int32_t> ConvertTokensToIds( | 116 | +std::vector<int32_t> ConvertTokensToIds( |
| 117 | const std::unordered_map<std::string, int32_t> &token2id, | 117 | const std::unordered_map<std::string, int32_t> &token2id, |
| 118 | const std::vector<std::string> &tokens) { | 118 | const std::vector<std::string> &tokens) { |
| 119 | std::vector<int32_t> ids; | 119 | std::vector<int32_t> ids; |
| @@ -19,6 +19,7 @@ | @@ -19,6 +19,7 @@ | ||
| 19 | #include "fst/extensions/far/far.h" | 19 | #include "fst/extensions/far/far.h" |
| 20 | #include "kaldifst/csrc/kaldi-fst-io.h" | 20 | #include "kaldifst/csrc/kaldi-fst-io.h" |
| 21 | #include "kaldifst/csrc/text-normalizer.h" | 21 | #include "kaldifst/csrc/text-normalizer.h" |
| 22 | +#include "sherpa-onnx/csrc/jieba-lexicon.h" | ||
| 22 | #include "sherpa-onnx/csrc/lexicon.h" | 23 | #include "sherpa-onnx/csrc/lexicon.h" |
| 23 | #include "sherpa-onnx/csrc/macros.h" | 24 | #include "sherpa-onnx/csrc/macros.h" |
| 24 | #include "sherpa-onnx/csrc/offline-tts-character-frontend.h" | 25 | #include "sherpa-onnx/csrc/offline-tts-character-frontend.h" |
| @@ -290,9 +291,26 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -290,9 +291,26 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 290 | void InitFrontend() { | 291 | void InitFrontend() { |
| 291 | const auto &meta_data = model_->GetMetaData(); | 292 | const auto &meta_data = model_->GetMetaData(); |
| 292 | 293 | ||
| 294 | + if (meta_data.jieba && config_.model.vits.dict_dir.empty()) { | ||
| 295 | + SHERPA_ONNX_LOGE( | ||
| 296 | + "Please provide --vits-dict-dir for Chinese TTS models using jieba"); | ||
| 297 | + exit(-1); | ||
| 298 | + } | ||
| 299 | + | ||
| 300 | + if (!meta_data.jieba && !config_.model.vits.dict_dir.empty()) { | ||
| 301 | + SHERPA_ONNX_LOGE( | ||
| 302 | + "Current model is not using jieba but you provided --vits-dict-dir"); | ||
| 303 | + exit(-1); | ||
| 304 | + } | ||
| 305 | + | ||
| 293 | if (meta_data.frontend == "characters") { | 306 | if (meta_data.frontend == "characters") { |
| 294 | frontend_ = std::make_unique<OfflineTtsCharacterFrontend>( | 307 | frontend_ = std::make_unique<OfflineTtsCharacterFrontend>( |
| 295 | config_.model.vits.tokens, meta_data); | 308 | config_.model.vits.tokens, meta_data); |
| 309 | + } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) { | ||
| 310 | + frontend_ = std::make_unique<JiebaLexicon>( | ||
| 311 | + config_.model.vits.lexicon, config_.model.vits.tokens, | ||
| 312 | + config_.model.vits.dict_dir, model_->GetMetaData(), | ||
| 313 | + config_.model.debug); | ||
| 296 | } else if ((meta_data.is_piper || meta_data.is_coqui || | 314 | } else if ((meta_data.is_piper || meta_data.is_coqui || |
| 297 | meta_data.is_icefall) && | 315 | meta_data.is_icefall) && |
| 298 | !config_.model.vits.data_dir.empty()) { | 316 | !config_.model.vits.data_dir.empty()) { |
| @@ -4,6 +4,8 @@ | @@ -4,6 +4,8 @@ | ||
| 4 | 4 | ||
| 5 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" | 5 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" |
| 6 | 6 | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 7 | #include "sherpa-onnx/csrc/file-utils.h" | 9 | #include "sherpa-onnx/csrc/file-utils.h" |
| 8 | #include "sherpa-onnx/csrc/macros.h" | 10 | #include "sherpa-onnx/csrc/macros.h" |
| 9 | 11 | ||
| @@ -16,6 +18,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { | @@ -16,6 +18,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { | ||
| 16 | po->Register("vits-data-dir", &data_dir, | 18 | po->Register("vits-data-dir", &data_dir, |
| 17 | "Path to the directory containing dict for espeak-ng. If it is " | 19 | "Path to the directory containing dict for espeak-ng. If it is " |
| 18 | "given, --vits-lexicon is ignored."); | 20 | "given, --vits-lexicon is ignored."); |
| 21 | + po->Register("vits-dict-dir", &dict_dir, | ||
| 22 | + "Path to the directory containing dict for jieba. Used only for " | ||
| 23 | + "Chinese TTS models using jieba"); | ||
| 19 | po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); | 24 | po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); |
| 20 | po->Register("vits-noise-scale-w", &noise_scale_w, | 25 | po->Register("vits-noise-scale-w", &noise_scale_w, |
| 21 | "noise_scale_w for VITS models"); | 26 | "noise_scale_w for VITS models"); |
| @@ -64,12 +69,24 @@ bool OfflineTtsVitsModelConfig::Validate() const { | @@ -64,12 +69,24 @@ bool OfflineTtsVitsModelConfig::Validate() const { | ||
| 64 | } | 69 | } |
| 65 | 70 | ||
| 66 | if (!FileExists(data_dir + "/intonations")) { | 71 | if (!FileExists(data_dir + "/intonations")) { |
| 67 | - SHERPA_ONNX_LOGE("%s/intonations does not exist. Skipping test", | ||
| 68 | - data_dir.c_str()); | 72 | + SHERPA_ONNX_LOGE("%s/intonations does not exist.", data_dir.c_str()); |
| 69 | return false; | 73 | return false; |
| 70 | } | 74 | } |
| 71 | } | 75 | } |
| 72 | 76 | ||
| 77 | + if (!dict_dir.empty()) { | ||
| 78 | + std::vector<std::string> required_files = { | ||
| 79 | + "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8", | ||
| 80 | + "idf.utf8", "stop_words.utf8", | ||
| 81 | + }; | ||
| 82 | + | ||
| 83 | + for (const auto &f : required_files) { | ||
| 84 | + if (!FileExists(dict_dir + "/" + f)) { | ||
| 85 | + SHERPA_ONNX_LOGE("%s/%s does not exist.", data_dir.c_str(), f.c_str()); | ||
| 86 | + return false; | ||
| 87 | + } | ||
| 88 | + } | ||
| 89 | + } | ||
| 73 | return true; | 90 | return true; |
| 74 | } | 91 | } |
| 75 | 92 | ||
| @@ -81,6 +98,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const { | @@ -81,6 +98,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const { | ||
| 81 | os << "lexicon=\"" << lexicon << "\", "; | 98 | os << "lexicon=\"" << lexicon << "\", "; |
| 82 | os << "tokens=\"" << tokens << "\", "; | 99 | os << "tokens=\"" << tokens << "\", "; |
| 83 | os << "data_dir=\"" << data_dir << "\", "; | 100 | os << "data_dir=\"" << data_dir << "\", "; |
| 101 | + os << "dict_dir=\"" << dict_dir << "\", "; | ||
| 84 | os << "noise_scale=" << noise_scale << ", "; | 102 | os << "noise_scale=" << noise_scale << ", "; |
| 85 | os << "noise_scale_w=" << noise_scale_w << ", "; | 103 | os << "noise_scale_w=" << noise_scale_w << ", "; |
| 86 | os << "length_scale=" << length_scale << ")"; | 104 | os << "length_scale=" << length_scale << ")"; |
| @@ -20,6 +20,9 @@ struct OfflineTtsVitsModelConfig { | @@ -20,6 +20,9 @@ struct OfflineTtsVitsModelConfig { | ||
| 20 | // data_dir is for piper-phonemize, which uses espeak-ng | 20 | // data_dir is for piper-phonemize, which uses espeak-ng |
| 21 | std::string data_dir; | 21 | std::string data_dir; |
| 22 | 22 | ||
| 23 | + // Used for Chinese TTS models using jieba | ||
| 24 | + std::string dict_dir; | ||
| 25 | + | ||
| 23 | float noise_scale = 0.667; | 26 | float noise_scale = 0.667; |
| 24 | float noise_scale_w = 0.8; | 27 | float noise_scale_w = 0.8; |
| 25 | float length_scale = 1; | 28 | float length_scale = 1; |
| @@ -33,12 +36,14 @@ struct OfflineTtsVitsModelConfig { | @@ -33,12 +36,14 @@ struct OfflineTtsVitsModelConfig { | ||
| 33 | const std::string &lexicon, | 36 | const std::string &lexicon, |
| 34 | const std::string &tokens, | 37 | const std::string &tokens, |
| 35 | const std::string &data_dir, | 38 | const std::string &data_dir, |
| 39 | + const std::string &dict_dir, | ||
| 36 | float noise_scale = 0.667, | 40 | float noise_scale = 0.667, |
| 37 | float noise_scale_w = 0.8, float length_scale = 1) | 41 | float noise_scale_w = 0.8, float length_scale = 1) |
| 38 | : model(model), | 42 | : model(model), |
| 39 | lexicon(lexicon), | 43 | lexicon(lexicon), |
| 40 | tokens(tokens), | 44 | tokens(tokens), |
| 41 | data_dir(data_dir), | 45 | data_dir(data_dir), |
| 46 | + dict_dir(dict_dir), | ||
| 42 | noise_scale(noise_scale), | 47 | noise_scale(noise_scale), |
| 43 | noise_scale_w(noise_scale_w), | 48 | noise_scale_w(noise_scale_w), |
| 44 | length_scale(length_scale) {} | 49 | length_scale(length_scale) {} |
| @@ -22,6 +22,10 @@ struct OfflineTtsVitsModelMetaData { | @@ -22,6 +22,10 @@ struct OfflineTtsVitsModelMetaData { | ||
| 22 | bool is_coqui = false; | 22 | bool is_coqui = false; |
| 23 | bool is_icefall = false; | 23 | bool is_icefall = false; |
| 24 | 24 | ||
| 25 | + // for Chinese TTS models from | ||
| 26 | + // https://github.com/Plachtaa/VITS-fast-fine-tuning | ||
| 27 | + int32_t jieba = 0; | ||
| 28 | + | ||
| 25 | // the following options are for models from coqui-ai/TTS | 29 | // the following options are for models from coqui-ai/TTS |
| 26 | int32_t blank_id = 0; | 30 | int32_t blank_id = 0; |
| 27 | int32_t bos_id = 0; | 31 | int32_t bos_id = 0; |
| @@ -93,6 +93,7 @@ class OfflineTtsVitsModel::Impl { | @@ -93,6 +93,7 @@ class OfflineTtsVitsModel::Impl { | ||
| 93 | SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.frontend, "frontend", | 93 | SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.frontend, "frontend", |
| 94 | ""); | 94 | ""); |
| 95 | 95 | ||
| 96 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.jieba, "jieba", 0); | ||
| 96 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.blank_id, "blank_id", 0); | 97 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.blank_id, "blank_id", 0); |
| 97 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); | 98 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); |
| 98 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); | 99 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); |
| @@ -16,15 +16,17 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { | @@ -16,15 +16,17 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { | ||
| 16 | py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig") | 16 | py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig") |
| 17 | .def(py::init<>()) | 17 | .def(py::init<>()) |
| 18 | .def(py::init<const std::string &, const std::string &, | 18 | .def(py::init<const std::string &, const std::string &, |
| 19 | - const std::string &, const std::string &, float, float, | ||
| 20 | - float>(), | 19 | + const std::string &, const std::string &, |
| 20 | + const std::string &, float, float, float>(), | ||
| 21 | py::arg("model"), py::arg("lexicon"), py::arg("tokens"), | 21 | py::arg("model"), py::arg("lexicon"), py::arg("tokens"), |
| 22 | - py::arg("data_dir") = "", py::arg("noise_scale") = 0.667, | ||
| 23 | - py::arg("noise_scale_w") = 0.8, py::arg("length_scale") = 1.0) | 22 | + py::arg("data_dir") = "", py::arg("dict_dir") = "", |
| 23 | + py::arg("noise_scale") = 0.667, py::arg("noise_scale_w") = 0.8, | ||
| 24 | + py::arg("length_scale") = 1.0) | ||
| 24 | .def_readwrite("model", &PyClass::model) | 25 | .def_readwrite("model", &PyClass::model) |
| 25 | .def_readwrite("lexicon", &PyClass::lexicon) | 26 | .def_readwrite("lexicon", &PyClass::lexicon) |
| 26 | .def_readwrite("tokens", &PyClass::tokens) | 27 | .def_readwrite("tokens", &PyClass::tokens) |
| 27 | .def_readwrite("data_dir", &PyClass::data_dir) | 28 | .def_readwrite("data_dir", &PyClass::data_dir) |
| 29 | + .def_readwrite("dict_dir", &PyClass::dict_dir) | ||
| 28 | .def_readwrite("noise_scale", &PyClass::noise_scale) | 30 | .def_readwrite("noise_scale", &PyClass::noise_scale) |
| 29 | .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) | 31 | .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) |
| 30 | .def_readwrite("length_scale", &PyClass::length_scale) | 32 | .def_readwrite("length_scale", &PyClass::length_scale) |
-
请 注册 或 登录 后发表评论