Fangjun Kuang
Committed by GitHub

Support specifying pronunciations of phrases in Chinese TTS. (#2507)

This PR implements support for specifying pronunciations of phrases in Chinese TTS by modifying the lexicon processing logic. The change introduces a greedy longest-match algorithm that attempts to match multi-word phrases before falling back to individual word processing.
... ... @@ -36,6 +36,22 @@ static bool IsPunct(const std::string &s) {
return puncts.count(s);
}
// Concatenates words[start], words[start + 1], ..., words[end] into one string.
//
// @param words  The list of words to join. Nothing is inserted between them.
// @param start  Index of the first word to include.
// @param end    Index of the last word to include. Note that end is inclusive.
// @return The concatenated string. An empty string is returned if either
//         index is negative or not less than words.size(), or if start > end.
static std::string GetWord(const std::vector<std::string> &words, int32_t start,
                           int32_t end) {
  std::string ans;

  // Compare in a signed domain. Comparing int32_t directly against the
  // unsigned words.size() rejects negative indices only through implicit
  // wraparound; make that rejection explicit instead.
  const auto num_words = static_cast<int64_t>(words.size());
  if (start < 0 || end < 0 || start >= num_words || end >= num_words) {
    return ans;
  }

  // If start > end the loop body never runs and ans stays empty.
  for (int32_t i = start; i <= end; ++i) {
    ans += words[i];
  }

  return ans;
}
class JiebaLexicon::Impl {
public:
Impl(const std::string &lexicon, const std::string &tokens,
... ... @@ -160,7 +176,46 @@ class JiebaLexicon::Impl {
std::vector<TokenIDs> ans;
std::vector<int64_t> this_sentence;
for (const auto &w : words) {
int32_t num_words = static_cast<int32_t>(words.size());
int32_t max_len = 10;
for (int32_t i = 0; i < num_words;) {
int32_t start = i;
int32_t end = std::min(i + max_len, num_words - 1);
std::string w;
while (end > start) {
auto this_word = GetWord(words, start, end);
if (debug_) {
#if __OHOS__
SHERPA_ONNX_LOGE("%{public}d-%{public}d: %{public}s", start, end,
this_word.c_str());
#else
SHERPA_ONNX_LOGE("%d-%d: %s", start, end, this_word.c_str());
#endif
}
if (word2ids_.count(this_word)) {
i = end + 1;
w = std::move(this_word);
if (debug_) {
#if __OHOS__
SHERPA_ONNX_LOGE("matched %{public}d-%{public}d: %{public}s", start,
end, w.c_str());
#else
SHERPA_ONNX_LOGE("matched %d-%d: %s", start, end, w.c_str());
#endif
}
break;
}
end -= 1;
}
if (w.empty()) {
w = words[i];
i += 1;
}
auto ids = ConvertWordToIds(w);
if (ids.empty()) {
#if __OHOS__
... ...