Fangjun Kuang
Committed by GitHub

Fix splitting text by languages for kokoro tts. (#1849)

@@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const { @@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const {
270 return OfflineStream{s}; 270 return OfflineStream{s};
271 } 271 }
272 272
273 -OfflineStream OfflineRecognizer::CreateStream(const std::string &hotwords) const { 273 +OfflineStream OfflineRecognizer::CreateStream(
  274 + const std::string &hotwords) const {
274 auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str()); 275 auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str());
275 return OfflineStream{s}; 276 return OfflineStream{s};
276 } 277 }
@@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) @@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
549 context-graph-test.cc 549 context-graph-test.cc
550 packed-sequence-test.cc 550 packed-sequence-test.cc
551 pad-sequence-test.cc 551 pad-sequence-test.cc
  552 + regex-lang-test.cc
552 slice-test.cc 553 slice-test.cc
553 stack-test.cc 554 stack-test.cc
554 text-utils-test.cc 555 text-utils-test.cc
@@ -4,9 +4,7 @@ @@ -4,9 +4,7 @@
4 4
5 #include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" 5 #include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
6 6
7 -#include <codecvt>  
8 #include <fstream> 7 #include <fstream>
9 -#include <locale>  
10 #include <regex> // NOLINT 8 #include <regex> // NOLINT
11 #include <sstream> 9 #include <sstream>
12 #include <strstream> 10 #include <strstream>
@@ -22,6 +20,8 @@ @@ -22,6 +20,8 @@
22 #include "rawfile/raw_file_manager.h" 20 #include "rawfile/raw_file_manager.h"
23 #endif 21 #endif
24 22
  23 +#include <codecvt>
  24 +
25 #include "cppjieba/Jieba.hpp" 25 #include "cppjieba/Jieba.hpp"
26 #include "espeak-ng/speak_lib.h" 26 #include "espeak-ng/speak_lib.h"
27 #include "phoneme_ids.hpp" 27 #include "phoneme_ids.hpp"
@@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text, @@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text,
37 piper::eSpeakPhonemeConfig &config, // NOLINT 37 piper::eSpeakPhonemeConfig &config, // NOLINT
38 std::vector<std::vector<piper::Phoneme>> *phonemes); 38 std::vector<std::vector<piper::Phoneme>> *phonemes);
39 39
40 -static std::wstring ToWideString(const std::string &s) {  
41 - // see  
42 - // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t  
43 - std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;  
44 - return converter.from_bytes(s);  
45 -}  
46 -  
47 -static std::string ToString(const std::wstring &s) {  
48 - // see  
49 - // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t  
50 - std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;  
51 - return converter.to_bytes(s);  
52 -}  
53 -  
54 class KokoroMultiLangLexicon::Impl { 40 class KokoroMultiLangLexicon::Impl {
55 public: 41 public:
56 Impl(const std::string &tokens, const std::string &lexicon, 42 Impl(const std::string &tokens, const std::string &lexicon,
@@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl { @@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl {
103 89
104 // https://en.cppreference.com/w/cpp/regex 90 // https://en.cppreference.com/w/cpp/regex
105 // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex 91 // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
106 - std::string expr =  
107 - "([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"  
108 - ")"; 92 + std::string expr_chinese = "([\\u4e00-\\u9fff]+)";
  93 + std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)";
  94 +
  95 + std::string expr_both = expr_chinese + "|" + expr_not_chinese;
109 96
110 auto ws = ToWideString(text); 97 auto ws = ToWideString(text);
111 - std::wstring wexpr = ToWideString(expr);  
112 - std::wregex we(wexpr); 98 + std::wstring wexpr_both = ToWideString(expr_both);
  99 + std::wregex we_both(wexpr_both);
  100 +
  101 + std::wstring wexpr_zh = ToWideString(expr_chinese);
  102 + std::wregex we_zh(wexpr_zh);
113 103
114 - auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we); 104 + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both);
115 auto end = std::wsregex_iterator(); 105 auto end = std::wsregex_iterator();
116 106
117 std::vector<TokenIDs> ans; 107 std::vector<TokenIDs> ans;
@@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl { @@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl {
119 for (std::wsregex_iterator i = begin; i != end; ++i) { 109 for (std::wsregex_iterator i = begin; i != end; ++i) {
120 std::wsmatch match = *i; 110 std::wsmatch match = *i;
121 std::wstring match_str = match.str(); 111 std::wstring match_str = match.str();
  112 +
122 auto ms = ToString(match_str); 113 auto ms = ToString(match_str);
123 uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0]; 114 uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
124 115
125 std::vector<std::vector<int32_t>> ids_vec; 116 std::vector<std::vector<int32_t>> ids_vec;
126 -  
127 - if (c < 0x80) { 117 + if (std::regex_match(match_str, we_zh)) {
128 if (debug_) { 118 if (debug_) {
129 - SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); 119 + SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
130 } 120 }
131 - ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); 121 + ids_vec = ConvertChineseToTokenIDs(ms);
132 } else { 122 } else {
133 if (debug_) { 123 if (debug_) {
134 - SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); 124 + SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
135 } 125 }
136 - ids_vec = ConvertChineseToTokenIDs(ms); 126 +
  127 + ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
137 } 128 }
138 129
139 for (const auto &ids : ids_vec) { 130 for (const auto &ids : ids_vec) {
@@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl { @@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl {
315 this_sentence.push_back(space_id); 306 this_sentence.push_back(space_id);
316 } else { 307 } else {
317 if (debug_) { 308 if (debug_) {
318 - SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str()); 309 + SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'",
  310 + word.c_str());
319 } 311 }
320 - 312 +
321 piper::eSpeakPhonemeConfig config; 313 piper::eSpeakPhonemeConfig config;
322 314
323 config.voice = voice; 315 config.voice = voice;
  1 +// sherpa-onnx/csrc/regex-lang-test.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include <regex> // NOLINT
  6 +
  7 +#include "gtest/gtest.h"
  8 +#include "sherpa-onnx/csrc/text-utils.cc"
  9 +
  10 +namespace sherpa_onnx {
  11 +
  12 +static void TestLang(const std::string &expr, const std::string &text,
  13 + const std::vector<std::string> &expected) {
  14 + auto ws = ToWideString(text);
  15 + std::wstring wexpr = ToWideString(expr);
  16 + std::wregex we(wexpr);
  17 +
  18 + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
  19 + auto end = std::wsregex_iterator();
  20 + int32_t k = 0;
  21 + for (std::wsregex_iterator i = begin; i != end; ++i) {
  22 + std::wsmatch match = *i;
  23 + std::wstring match_str = match.str();
  24 + auto ms = ToString(match_str);
  25 + std::cout << ms << "\n";
  26 + EXPECT_EQ(ms, expected[k]);
  27 + k++;
  28 + }
  29 + EXPECT_EQ(k, expected.size());
  30 +}
  31 +
  32 +TEST(German, Case1) {
  33 + std::cout << "----------Test German----------";
  34 + // see https://character-table.netlify.app/german/
  35 + std::string expr =
  36 + "([\\u0020-\\u005f\\u0061-"
  37 + "\\u007d\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb\\u00c4\\u00d6\\u00dc\\u00df\\"
  38 + "u00e4\\u00f6\\u00fc\\u2010-\\u2011\\u2013-"
  39 + "\\u2014\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)";
  40 +
  41 + std::string text =
  42 + "开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€";
  43 +
  44 + std::vector<std::string> expected = {"Übeltäter übergibt Ärzten ",
  45 + "öfters äußerst ätzende Öle", "3€"};
  46 +
  47 + TestLang(expr, text, expected);
  48 +}
  49 +
  50 +TEST(French, Case1) {
  51 + std::string expr =
  52 + "([\\u0020-\\u005f\\u0061-"
  53 + "\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-"
  54 + "\\u00b3\\u00bb\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-"
  55 + "\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc\\u00e0\\u00e2\\u00e6-"
  56 + "\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff\\u0152-"
  57 + "\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49\\u2010-\\u2011\\u2013-"
  58 + "\\u2014\\u2019\\u201c-\\u201d\\u2020-\\u2021\\u2026\\u202f-"
  59 + "\\u2030\\u20ac\\u2212]+)";
  60 + std::string text =
  61 + "L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon";
  62 + std::vector<std::string> expected = {
  63 + "L'été, ",
  64 + "avec son ciel bleuâtre, ",
  65 + "est un moment où, ",
  66 + "Noël, maçon",
  67 + };
  68 + TestLang(expr, text, expected);
  69 +}
  70 +
  71 +TEST(English, Case1) {
  72 + // https://character-table.netlify.app/english/
  73 + std::string expr =
  74 + "([\\u0020-\\u005f\\u0061-\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u2010-"
  75 + "\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d\\u2020-"
  76 + "\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)";
  77 + std::string text = "一how are you doing? 二Thank you!";
  78 +
  79 + std::vector<std::string> expected = {
  80 + "how are you doing? ",
  81 + "Thank you!",
  82 + };
  83 + TestLang(expr, text, expected);
  84 +}
  85 +
  86 +} // namespace sherpa_onnx
@@ -8,6 +8,14 @@ @@ -8,6 +8,14 @@
8 8
9 namespace sherpa_onnx { 9 namespace sherpa_onnx {
10 10
  11 +TEST(ToLowerCase, WideString) {
  12 + std::string text =
  13 + "Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€";
  14 + auto t = ToLowerCase(text);
  15 + std::cout << text << "\n";
  16 + std::cout << t << "\n";
  17 +}
  18 +
11 TEST(RemoveInvalidUtf8Sequences, Case1) { 19 TEST(RemoveInvalidUtf8Sequences, Case1) {
12 std::vector<uint8_t> v = { 20 std::vector<uint8_t> v = {
13 0xe4, 0xbb, 0x8a, // 今 21 0xe4, 0xbb, 0x8a, // 今
@@ -8,8 +8,11 @@ @@ -8,8 +8,11 @@
8 #include <algorithm> 8 #include <algorithm>
9 #include <cassert> 9 #include <cassert>
10 #include <cctype> 10 #include <cctype>
  11 +#include <codecvt>
11 #include <cstdint> 12 #include <cstdint>
  13 +#include <cwctype>
12 #include <limits> 14 #include <limits>
  15 +#include <locale>
13 #include <sstream> 16 #include <sstream>
14 #include <string> 17 #include <string>
15 #include <unordered_map> 18 #include <unordered_map>
@@ -389,10 +392,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) { @@ -389,10 +392,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
389 } 392 }
390 393
391 std::string ToLowerCase(const std::string &s) { 394 std::string ToLowerCase(const std::string &s) {
392 - std::string ans(s.size(), 0);  
393 - std::transform(s.begin(), s.end(), ans.begin(),  
394 - [](unsigned char c) { return std::tolower(c); });  
395 - return ans; 395 + return ToString(ToLowerCase(ToWideString(s)));
396 } 396 }
397 397
398 void ToLowerCase(std::string *in_out) { 398 void ToLowerCase(std::string *in_out) {
@@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) { @@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) {
400 [](unsigned char c) { return std::tolower(c); }); 400 [](unsigned char c) { return std::tolower(c); });
401 } 401 }
402 402
  403 +std::wstring ToLowerCase(const std::wstring &s) {
  404 + std::wstring ans(s.size(), 0);
  405 + std::transform(s.begin(), s.end(), ans.begin(), [](wchar_t c) -> wchar_t {
  406 + switch (c) {
  407 + // French
  408 + case L'À':
  409 + return L'à';
  410 + case L'Â':
  411 + return L'â';
  412 + case L'Æ':
  413 + return L'æ';
  414 + case L'Ç':
  415 + return L'ç';
  416 + case L'È':
  417 + return L'è';
  418 + case L'É':
  419 + return L'é';
  420 + case L'Ë':
  421 + return L'ë';
  422 + case L'Î':
  423 + return L'î';
  424 + case L'Ï':
  425 + return L'ï';
  426 + case L'Ô':
  427 + return L'ô';
  428 + case L'Ù':
  429 + return L'ù';
  430 + case L'Û':
  431 + return L'û';
  432 + case L'Ü':
  433 + return L'ü';
  434 +
  435 + // others
  436 + case L'Á':
  437 + return L'á';
  438 + case L'Í':
  439 + return L'í';
  440 + case L'Ó':
  441 + return L'ó';
  442 + case L'Ú':
  443 + return L'ú';
  444 + case L'Ñ':
  445 + return L'ñ';
  446 + case L'Ì':
  447 + return L'ì';
  448 + case L'Ò':
  449 + return L'ò';
  450 + case L'Ä':
  451 + return L'ä';
  452 + case L'Ö':
  453 + return L'ö';
  454 + // TODO(fangjun): Add more
  455 +
  456 + default:
  457 + return std::towlower(c);
  458 + }
  459 + });
  460 + return ans;
  461 +}
  462 +
403 static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) { 463 static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
404 return low <= x && x <= high; 464 return low <= x && x <= high;
405 } 465 }
@@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) { @@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) {
625 } 685 }
626 #endif 686 #endif
627 687
  688 +std::wstring ToWideString(const std::string &s) {
  689 + // see
  690 + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  691 + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  692 + return converter.from_bytes(s);
  693 +}
  694 +
  695 +std::string ToString(const std::wstring &s) {
  696 + // see
  697 + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  698 + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  699 + return converter.to_bytes(s);
  700 +}
  701 +
628 } // namespace sherpa_onnx 702 } // namespace sherpa_onnx
@@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text); @@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text);
124 std::string ToLowerCase(const std::string &s); 124 std::string ToLowerCase(const std::string &s);
125 void ToLowerCase(std::string *in_out); 125 void ToLowerCase(std::string *in_out);
126 126
  127 +std::wstring ToLowerCase(const std::wstring &s);
  128 +
127 std::string RemoveInvalidUtf8Sequences(const std::string &text, 129 std::string RemoveInvalidUtf8Sequences(const std::string &text,
128 bool show_debug_msg = false); 130 bool show_debug_msg = false);
129 131
@@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text); @@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text);
139 std::string Gb2312ToUtf8(const std::string &text); 141 std::string Gb2312ToUtf8(const std::string &text);
140 #endif 142 #endif
141 143
  144 +std::wstring ToWideString(const std::string &s);
  145 +
  146 +std::string ToString(const std::wstring &s);
  147 +
142 } // namespace sherpa_onnx 148 } // namespace sherpa_onnx
143 149
144 #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ 150 #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_