Committed by
GitHub
Fix splitting text by languages for kokoro tts. (#1849)
正在显示
7 个修改的文件
包含
203 行增加
和
35 行删除
| @@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const { | @@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const { | ||
| 270 | return OfflineStream{s}; | 270 | return OfflineStream{s}; |
| 271 | } | 271 | } |
| 272 | 272 | ||
| 273 | -OfflineStream OfflineRecognizer::CreateStream(const std::string &hotwords) const { | 273 | +OfflineStream OfflineRecognizer::CreateStream( |
| 274 | + const std::string &hotwords) const { | ||
| 274 | auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str()); | 275 | auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str()); |
| 275 | return OfflineStream{s}; | 276 | return OfflineStream{s}; |
| 276 | } | 277 | } |
| @@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) | @@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) | ||
| 549 | context-graph-test.cc | 549 | context-graph-test.cc |
| 550 | packed-sequence-test.cc | 550 | packed-sequence-test.cc |
| 551 | pad-sequence-test.cc | 551 | pad-sequence-test.cc |
| 552 | + regex-lang-test.cc | ||
| 552 | slice-test.cc | 553 | slice-test.cc |
| 553 | stack-test.cc | 554 | stack-test.cc |
| 554 | text-utils-test.cc | 555 | text-utils-test.cc |
| @@ -4,9 +4,7 @@ | @@ -4,9 +4,7 @@ | ||
| 4 | 4 | ||
| 5 | #include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" | 5 | #include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" |
| 6 | 6 | ||
| 7 | -#include <codecvt> | ||
| 8 | #include <fstream> | 7 | #include <fstream> |
| 9 | -#include <locale> | ||
| 10 | #include <regex> // NOLINT | 8 | #include <regex> // NOLINT |
| 11 | #include <sstream> | 9 | #include <sstream> |
| 12 | #include <strstream> | 10 | #include <strstream> |
| @@ -22,6 +20,8 @@ | @@ -22,6 +20,8 @@ | ||
| 22 | #include "rawfile/raw_file_manager.h" | 20 | #include "rawfile/raw_file_manager.h" |
| 23 | #endif | 21 | #endif |
| 24 | 22 | ||
| 23 | +#include <codecvt> | ||
| 24 | + | ||
| 25 | #include "cppjieba/Jieba.hpp" | 25 | #include "cppjieba/Jieba.hpp" |
| 26 | #include "espeak-ng/speak_lib.h" | 26 | #include "espeak-ng/speak_lib.h" |
| 27 | #include "phoneme_ids.hpp" | 27 | #include "phoneme_ids.hpp" |
| @@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text, | @@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text, | ||
| 37 | piper::eSpeakPhonemeConfig &config, // NOLINT | 37 | piper::eSpeakPhonemeConfig &config, // NOLINT |
| 38 | std::vector<std::vector<piper::Phoneme>> *phonemes); | 38 | std::vector<std::vector<piper::Phoneme>> *phonemes); |
| 39 | 39 | ||
| 40 | -static std::wstring ToWideString(const std::string &s) { | ||
| 41 | - // see | ||
| 42 | - // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t | ||
| 43 | - std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; | ||
| 44 | - return converter.from_bytes(s); | ||
| 45 | -} | ||
| 46 | - | ||
| 47 | -static std::string ToString(const std::wstring &s) { | ||
| 48 | - // see | ||
| 49 | - // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t | ||
| 50 | - std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; | ||
| 51 | - return converter.to_bytes(s); | ||
| 52 | -} | ||
| 53 | - | ||
| 54 | class KokoroMultiLangLexicon::Impl { | 40 | class KokoroMultiLangLexicon::Impl { |
| 55 | public: | 41 | public: |
| 56 | Impl(const std::string &tokens, const std::string &lexicon, | 42 | Impl(const std::string &tokens, const std::string &lexicon, |
| @@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl { | @@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl { | ||
| 103 | 89 | ||
| 104 | // https://en.cppreference.com/w/cpp/regex | 90 | // https://en.cppreference.com/w/cpp/regex |
| 105 | // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex | 91 | // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex |
| 106 | - std::string expr = | ||
| 107 | - "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+" | ||
| 108 | - ")"; | 92 | + std::string expr_chinese = "([\\u4e00-\\u9fff]+)"; |
| 93 | + std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)"; | ||
| 94 | + | ||
| 95 | + std::string expr_both = expr_chinese + "|" + expr_not_chinese; | ||
| 109 | 96 | ||
| 110 | auto ws = ToWideString(text); | 97 | auto ws = ToWideString(text); |
| 111 | - std::wstring wexpr = ToWideString(expr); | ||
| 112 | - std::wregex we(wexpr); | 98 | + std::wstring wexpr_both = ToWideString(expr_both); |
| 99 | + std::wregex we_both(wexpr_both); | ||
| 100 | + | ||
| 101 | + std::wstring wexpr_zh = ToWideString(expr_chinese); | ||
| 102 | + std::wregex we_zh(wexpr_zh); | ||
| 113 | 103 | ||
| 114 | - auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we); | 104 | + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both); |
| 115 | auto end = std::wsregex_iterator(); | 105 | auto end = std::wsregex_iterator(); |
| 116 | 106 | ||
| 117 | std::vector<TokenIDs> ans; | 107 | std::vector<TokenIDs> ans; |
| @@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl { | @@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl { | ||
| 119 | for (std::wsregex_iterator i = begin; i != end; ++i) { | 109 | for (std::wsregex_iterator i = begin; i != end; ++i) { |
| 120 | std::wsmatch match = *i; | 110 | std::wsmatch match = *i; |
| 121 | std::wstring match_str = match.str(); | 111 | std::wstring match_str = match.str(); |
| 112 | + | ||
| 122 | auto ms = ToString(match_str); | 113 | auto ms = ToString(match_str); |
| 123 | uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0]; | 114 | uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0]; |
| 124 | 115 | ||
| 125 | std::vector<std::vector<int32_t>> ids_vec; | 116 | std::vector<std::vector<int32_t>> ids_vec; |
| 126 | - | ||
| 127 | - if (c < 0x80) { | 117 | + if (std::regex_match(match_str, we_zh)) { |
| 128 | if (debug_) { | 118 | if (debug_) { |
| 129 | - SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); | 119 | + SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); |
| 130 | } | 120 | } |
| 131 | - ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); | 121 | + ids_vec = ConvertChineseToTokenIDs(ms); |
| 132 | } else { | 122 | } else { |
| 133 | if (debug_) { | 123 | if (debug_) { |
| 134 | - SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); | 124 | + SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); |
| 135 | } | 125 | } |
| 136 | - ids_vec = ConvertChineseToTokenIDs(ms); | 126 | + |
| 127 | + ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); | ||
| 137 | } | 128 | } |
| 138 | 129 | ||
| 139 | for (const auto &ids : ids_vec) { | 130 | for (const auto &ids : ids_vec) { |
| @@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl { | @@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl { | ||
| 315 | this_sentence.push_back(space_id); | 306 | this_sentence.push_back(space_id); |
| 316 | } else { | 307 | } else { |
| 317 | if (debug_) { | 308 | if (debug_) { |
| 318 | - SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str()); | 309 | + SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", |
| 310 | + word.c_str()); | ||
| 319 | } | 311 | } |
| 320 | - | 312 | + |
| 321 | piper::eSpeakPhonemeConfig config; | 313 | piper::eSpeakPhonemeConfig config; |
| 322 | 314 | ||
| 323 | config.voice = voice; | 315 | config.voice = voice; |
sherpa-onnx/csrc/regex-lang-test.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/regex-lang-test.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <regex> // NOLINT | ||
| 6 | + | ||
| 7 | +#include "gtest/gtest.h" | ||
| 8 | +#include "sherpa-onnx/csrc/text-utils.cc" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +static void TestLang(const std::string &expr, const std::string &text, | ||
| 13 | + const std::vector<std::string> &expected) { | ||
| 14 | + auto ws = ToWideString(text); | ||
| 15 | + std::wstring wexpr = ToWideString(expr); | ||
| 16 | + std::wregex we(wexpr); | ||
| 17 | + | ||
| 18 | + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we); | ||
| 19 | + auto end = std::wsregex_iterator(); | ||
| 20 | + int32_t k = 0; | ||
| 21 | + for (std::wsregex_iterator i = begin; i != end; ++i) { | ||
| 22 | + std::wsmatch match = *i; | ||
| 23 | + std::wstring match_str = match.str(); | ||
| 24 | + auto ms = ToString(match_str); | ||
| 25 | + std::cout << ms << "\n"; | ||
| 26 | + EXPECT_EQ(ms, expected[k]); | ||
| 27 | + k++; | ||
| 28 | + } | ||
| 29 | + EXPECT_EQ(k, expected.size()); | ||
| 30 | +} | ||
| 31 | + | ||
| 32 | +TEST(German, Case1) { | ||
| 33 | + std::cout << "----------Test German----------"; | ||
| 34 | + // see https://character-table.netlify.app/german/ | ||
| 35 | + std::string expr = | ||
| 36 | + "([\\u0020-\\u005f\\u0061-" | ||
| 37 | + "\\u007d\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb\\u00c4\\u00d6\\u00dc\\u00df\\" | ||
| 38 | + "u00e4\\u00f6\\u00fc\\u2010-\\u2011\\u2013-" | ||
| 39 | + "\\u2014\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)"; | ||
| 40 | + | ||
| 41 | + std::string text = | ||
| 42 | + "开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€"; | ||
| 43 | + | ||
| 44 | + std::vector<std::string> expected = {"Übeltäter übergibt Ärzten ", | ||
| 45 | + "öfters äußerst ätzende Öle", "3€"}; | ||
| 46 | + | ||
| 47 | + TestLang(expr, text, expected); | ||
| 48 | +} | ||
| 49 | + | ||
| 50 | +TEST(French, Case1) { | ||
| 51 | + std::string expr = | ||
| 52 | + "([\\u0020-\\u005f\\u0061-" | ||
| 53 | + "\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-" | ||
| 54 | + "\\u00b3\\u00bb\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-" | ||
| 55 | + "\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc\\u00e0\\u00e2\\u00e6-" | ||
| 56 | + "\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff\\u0152-" | ||
| 57 | + "\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49\\u2010-\\u2011\\u2013-" | ||
| 58 | + "\\u2014\\u2019\\u201c-\\u201d\\u2020-\\u2021\\u2026\\u202f-" | ||
| 59 | + "\\u2030\\u20ac\\u2212]+)"; | ||
| 60 | + std::string text = | ||
| 61 | + "L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon"; | ||
| 62 | + std::vector<std::string> expected = { | ||
| 63 | + "L'été, ", | ||
| 64 | + "avec son ciel bleuâtre, ", | ||
| 65 | + "est un moment où, ", | ||
| 66 | + "Noël, maçon", | ||
| 67 | + }; | ||
| 68 | + TestLang(expr, text, expected); | ||
| 69 | +} | ||
| 70 | + | ||
| 71 | +TEST(English, Case1) { | ||
| 72 | + // https://character-table.netlify.app/english/ | ||
| 73 | + std::string expr = | ||
| 74 | + "([\\u0020-\\u005f\\u0061-\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u2010-" | ||
| 75 | + "\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d\\u2020-" | ||
| 76 | + "\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)"; | ||
| 77 | + std::string text = "一how are you doing? 二Thank you!"; | ||
| 78 | + | ||
| 79 | + std::vector<std::string> expected = { | ||
| 80 | + "how are you doing? ", | ||
| 81 | + "Thank you!", | ||
| 82 | + }; | ||
| 83 | + TestLang(expr, text, expected); | ||
| 84 | +} | ||
| 85 | + | ||
| 86 | +} // namespace sherpa_onnx |
| @@ -8,6 +8,14 @@ | @@ -8,6 +8,14 @@ | ||
| 8 | 8 | ||
| 9 | namespace sherpa_onnx { | 9 | namespace sherpa_onnx { |
| 10 | 10 | ||
| 11 | +TEST(ToLowerCase, WideString) { | ||
| 12 | + std::string text = | ||
| 13 | + "Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€"; | ||
| 14 | + auto t = ToLowerCase(text); | ||
| 15 | + std::cout << text << "\n"; | ||
| 16 | + std::cout << t << "\n"; | ||
| 17 | +} | ||
| 18 | + | ||
| 11 | TEST(RemoveInvalidUtf8Sequences, Case1) { | 19 | TEST(RemoveInvalidUtf8Sequences, Case1) { |
| 12 | std::vector<uint8_t> v = { | 20 | std::vector<uint8_t> v = { |
| 13 | 0xe4, 0xbb, 0x8a, // 今 | 21 | 0xe4, 0xbb, 0x8a, // 今 |
| @@ -8,8 +8,11 @@ | @@ -8,8 +8,11 @@ | ||
| 8 | #include <algorithm> | 8 | #include <algorithm> |
| 9 | #include <cassert> | 9 | #include <cassert> |
| 10 | #include <cctype> | 10 | #include <cctype> |
| 11 | +#include <codecvt> | ||
| 11 | #include <cstdint> | 12 | #include <cstdint> |
| 13 | +#include <cwctype> | ||
| 12 | #include <limits> | 14 | #include <limits> |
| 15 | +#include <locale> | ||
| 13 | #include <sstream> | 16 | #include <sstream> |
| 14 | #include <string> | 17 | #include <string> |
| 15 | #include <unordered_map> | 18 | #include <unordered_map> |
| @@ -389,10 +392,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) { | @@ -389,10 +392,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) { | ||
| 389 | } | 392 | } |
| 390 | 393 | ||
| 391 | std::string ToLowerCase(const std::string &s) { | 394 | std::string ToLowerCase(const std::string &s) { |
| 392 | - std::string ans(s.size(), 0); | ||
| 393 | - std::transform(s.begin(), s.end(), ans.begin(), | ||
| 394 | - [](unsigned char c) { return std::tolower(c); }); | ||
| 395 | - return ans; | 395 | + return ToString(ToLowerCase(ToWideString(s))); |
| 396 | } | 396 | } |
| 397 | 397 | ||
| 398 | void ToLowerCase(std::string *in_out) { | 398 | void ToLowerCase(std::string *in_out) { |
| @@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) { | @@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) { | ||
| 400 | [](unsigned char c) { return std::tolower(c); }); | 400 | [](unsigned char c) { return std::tolower(c); }); |
| 401 | } | 401 | } |
| 402 | 402 | ||
| 403 | +std::wstring ToLowerCase(const std::wstring &s) { | ||
| 404 | + std::wstring ans(s.size(), 0); | ||
| 405 | + std::transform(s.begin(), s.end(), ans.begin(), [](wchar_t c) -> wchar_t { | ||
| 406 | + switch (c) { | ||
| 407 | + // French | ||
| 408 | + case L'À': | ||
| 409 | + return L'à'; | ||
| 410 | + case L'Â': | ||
| 411 | + return L'â'; | ||
| 412 | + case L'Æ': | ||
| 413 | + return L'æ'; | ||
| 414 | + case L'Ç': | ||
| 415 | + return L'ç'; | ||
| 416 | + case L'È': | ||
| 417 | + return L'è'; | ||
| 418 | + case L'É': | ||
| 419 | + return L'é'; | ||
| 420 | + case L'Ë': | ||
| 421 | + return L'ë'; | ||
| 422 | + case L'Î': | ||
| 423 | + return L'î'; | ||
| 424 | + case L'Ï': | ||
| 425 | + return L'ï'; | ||
| 426 | + case L'Ô': | ||
| 427 | + return L'ô'; | ||
| 428 | + case L'Ù': | ||
| 429 | + return L'ù'; | ||
| 430 | + case L'Û': | ||
| 431 | + return L'û'; | ||
| 432 | + case L'Ü': | ||
| 433 | + return L'ü'; | ||
| 434 | + | ||
| 435 | + // others | ||
| 436 | + case L'Á': | ||
| 437 | + return L'á'; | ||
| 438 | + case L'Í': | ||
| 439 | + return L'í'; | ||
| 440 | + case L'Ó': | ||
| 441 | + return L'ó'; | ||
| 442 | + case L'Ú': | ||
| 443 | + return L'ú'; | ||
| 444 | + case L'Ñ': | ||
| 445 | + return L'ñ'; | ||
| 446 | + case L'Ì': | ||
| 447 | + return L'ì'; | ||
| 448 | + case L'Ò': | ||
| 449 | + return L'ò'; | ||
| 450 | + case L'Ä': | ||
| 451 | + return L'ä'; | ||
| 452 | + case L'Ö': | ||
| 453 | + return L'ö'; | ||
| 454 | + // TODO(fangjun): Add more | ||
| 455 | + | ||
| 456 | + default: | ||
| 457 | + return std::towlower(c); | ||
| 458 | + } | ||
| 459 | + }); | ||
| 460 | + return ans; | ||
| 461 | +} | ||
| 462 | + | ||
| 403 | static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) { | 463 | static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) { |
| 404 | return low <= x && x <= high; | 464 | return low <= x && x <= high; |
| 405 | } | 465 | } |
| @@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) { | @@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) { | ||
| 625 | } | 685 | } |
| 626 | #endif | 686 | #endif |
| 627 | 687 | ||
| 688 | +std::wstring ToWideString(const std::string &s) { | ||
| 689 | + // see | ||
| 690 | + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t | ||
| 691 | + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; | ||
| 692 | + return converter.from_bytes(s); | ||
| 693 | +} | ||
| 694 | + | ||
| 695 | +std::string ToString(const std::wstring &s) { | ||
| 696 | + // see | ||
| 697 | + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t | ||
| 698 | + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; | ||
| 699 | + return converter.to_bytes(s); | ||
| 700 | +} | ||
| 701 | + | ||
| 628 | } // namespace sherpa_onnx | 702 | } // namespace sherpa_onnx |
| @@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text); | @@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text); | ||
| 124 | std::string ToLowerCase(const std::string &s); | 124 | std::string ToLowerCase(const std::string &s); |
| 125 | void ToLowerCase(std::string *in_out); | 125 | void ToLowerCase(std::string *in_out); |
| 126 | 126 | ||
| 127 | +std::wstring ToLowerCase(const std::wstring &s); | ||
| 128 | + | ||
| 127 | std::string RemoveInvalidUtf8Sequences(const std::string &text, | 129 | std::string RemoveInvalidUtf8Sequences(const std::string &text, |
| 128 | bool show_debug_msg = false); | 130 | bool show_debug_msg = false); |
| 129 | 131 | ||
| @@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text); | @@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text); | ||
| 139 | std::string Gb2312ToUtf8(const std::string &text); | 141 | std::string Gb2312ToUtf8(const std::string &text); |
| 140 | #endif | 142 | #endif |
| 141 | 143 | ||
| 144 | +std::wstring ToWideString(const std::string &s); | ||
| 145 | + | ||
| 146 | +std::string ToString(const std::wstring &s); | ||
| 147 | + | ||
| 142 | } // namespace sherpa_onnx | 148 | } // namespace sherpa_onnx |
| 143 | 149 | ||
| 144 | #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ | 150 | #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ |
-
请 注册 或 登录 后发表评论