Fangjun Kuang
Committed by GitHub

Support French in TTS (#397)

1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.8.6") 4 +set(SHERPA_ONNX_VERSION "1.8.7")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -133,6 +133,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( @@ -133,6 +133,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
133 return ConvertTextToTokenIdsGerman(text); 133 return ConvertTextToTokenIdsGerman(text);
134 case Language::kSpanish: 134 case Language::kSpanish:
135 return ConvertTextToTokenIdsSpanish(text); 135 return ConvertTextToTokenIdsSpanish(text);
  136 + case Language::kFrench:
  137 + return ConvertTextToTokenIdsFrench(text);
136 case Language::kChinese: 138 case Language::kChinese:
137 return ConvertTextToTokenIdsChinese(text); 139 return ConvertTextToTokenIdsChinese(text);
138 default: 140 default:
@@ -254,6 +256,8 @@ void Lexicon::InitLanguage(const std::string &_lang) { @@ -254,6 +256,8 @@ void Lexicon::InitLanguage(const std::string &_lang) {
254 language_ = Language::kGerman; 256 language_ = Language::kGerman;
255 } else if (lang == "spanish") { 257 } else if (lang == "spanish") {
256 language_ = Language::kSpanish; 258 language_ = Language::kSpanish;
  259 + } else if (lang == "french") {
  260 + language_ = Language::kFrench;
257 } else if (lang == "chinese") { 261 } else if (lang == "chinese") {
258 language_ = Language::kChinese; 262 language_ = Language::kChinese;
259 } else { 263 } else {
@@ -46,6 +46,11 @@ class Lexicon { @@ -46,6 +46,11 @@ class Lexicon {
46 return ConvertTextToTokenIdsEnglish(text); 46 return ConvertTextToTokenIdsEnglish(text);
47 } 47 }
48 48
  49 + std::vector<int64_t> ConvertTextToTokenIdsFrench(
  50 + const std::string &text) const {
  51 + return ConvertTextToTokenIdsEnglish(text);
  52 + }
  53 +
49 std::vector<int64_t> ConvertTextToTokenIdsEnglish( 54 std::vector<int64_t> ConvertTextToTokenIdsEnglish(
50 const std::string &text) const; 55 const std::string &text) const;
51 56
@@ -62,6 +67,7 @@ class Lexicon { @@ -62,6 +67,7 @@ class Lexicon {
62 kEnglish, 67 kEnglish,
63 kGerman, 68 kGerman,
64 kSpanish, 69 kSpanish,
  70 + kFrench,
65 kChinese, 71 kChinese,
66 kUnknown, 72 kUnknown,
67 }; 73 };
@@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, @@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
164 std::vector<double> *out); 164 std::vector<double> *out);
165 165
// Returns true if `c` is a punctuation character according to std::ispunct,
// with the ASCII apostrophe (') explicitly excluded.
static bool IsPunct(char c) {
  // The <cctype> functions have undefined behavior when given a negative
  // value other than EOF. Bytes of multi-byte UTF-8 sequences (e.g. 0xC3)
  // are negative wherever `char` is signed, so convert to unsigned char
  // before classifying.
  return c != '\'' && std::ispunct(static_cast<unsigned char>(c)) != 0;
}
167 -static bool IsGermanUmlauts(const std::string &word) { 167 +static bool IsGermanUmlaut(const std::string &word) {
168 // ä 0xC3 0xA4 168 // ä 0xC3 0xA4
169 // ö 0xC3 0xB6 169 // ö 0xC3 0xB6
170 // ü 0xC3 0xBC 170 // ü 0xC3 0xBC
@@ -187,6 +187,7 @@ static bool IsGermanUmlauts(const std::string &word) { @@ -187,6 +187,7 @@ static bool IsGermanUmlauts(const std::string &word) {
187 } 187 }
188 188
189 // see https://www.tandem.net/blog/spanish-accents 189 // see https://www.tandem.net/blog/spanish-accents
  190 +// https://www.compart.com/en/unicode/U+00DC
190 static bool IsSpanishDiacritic(const std::string &word) { 191 static bool IsSpanishDiacritic(const std::string &word) {
191 // á 0xC3 0xA1 192 // á 0xC3 0xA1
192 // é 0xC3 0xA9 193 // é 0xC3 0xA9
@@ -195,6 +196,16 @@ static bool IsSpanishDiacritic(const std::string &word) { @@ -195,6 +196,16 @@ static bool IsSpanishDiacritic(const std::string &word) {
195 // ú 0xC3 0xBA 196 // ú 0xC3 0xBA
196 // ü 0xC3 0xBC 197 // ü 0xC3 0xBC
197 // ñ 0xC3 0xB1 198 // ñ 0xC3 0xB1
  199 + //
  200 + // uppercase
  201 + //
  202 + // Á 0xC3 0x81
  203 + // É 0xC3 0x89
  204 + // Í 0xC3 0x8D
  205 + // Ó 0xC3 0x93
  206 + // Ú 0xC3 0x9A
  207 + // Ü 0xC3 0x9C
  208 + // Ñ 0xC3 0x91
198 209
199 if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) { 210 if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) {
200 return false; 211 return false;
@@ -202,15 +213,86 @@ static bool IsSpanishDiacritic(const std::string &word) { @@ -202,15 +213,86 @@ static bool IsSpanishDiacritic(const std::string &word) {
202 213
203 auto c = static_cast<uint8_t>(word[1]); 214 auto c = static_cast<uint8_t>(word[1]);
204 if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba || 215 if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba ||
205 - c == 0xbc || c == 0xb1) { 216 + c == 0xbc || c == 0xb1 || c == 0x81 || c == 0x89 || c == 0x8d ||
  217 + c == 0x93 || c == 0x9a || c == 0x9c || c == 0x91) {
206 return true; 218 return true;
207 } 219 }
208 220
209 return false; 221 return false;
210 } 222 }
211 223
// see https://www.busuu.com/en/french/accent-marks
//
// Returns true if `word` consists of exactly one two-byte UTF-8 sequence
// encoding a French accented letter or ligature, in either case.
// Any other input (including the empty string) yields false.
static bool IsFrenchDiacritic(const std::string &word) {
  // Every character recognized here occupies exactly two bytes in UTF-8.
  if (word.size() != 2) {
    return false;
  }

  const auto lead = static_cast<uint8_t>(word[0]);
  const auto trail = static_cast<uint8_t>(word[1]);

  if (lead == 0xc3) {
    // Latin-1 Supplement block (U+00C0 .. U+00FF)
    switch (trail) {
      // acute accent
      case 0xa9:  // é
      case 0x89:  // É
      // grave accent
      case 0xa0:  // à
      case 0xa8:  // è
      case 0xb9:  // ù
      case 0x80:  // À
      case 0x88:  // È
      case 0x99:  // Ù
      // cedilla
      case 0xa7:  // ç
      case 0x87:  // Ç
      // circumflex
      case 0xa2:  // â
      case 0xaa:  // ê
      case 0xae:  // î
      case 0xb4:  // ô
      case 0xbb:  // û
      case 0x82:  // Â
      case 0x8a:  // Ê
      case 0x8e:  // Î
      case 0x94:  // Ô
      case 0x9b:  // Û
      // trema
      case 0xab:  // ë
      case 0xaf:  // ï
      case 0xbc:  // ü
      case 0xbf:  // ÿ
      case 0x8b:  // Ë
      case 0x8f:  // Ï
      case 0x9c:  // Ü
      // ligature
      case 0xa6:  // æ
      case 0x86:  // Æ
        return true;
      default:
        return false;
    }
  }

  if (lead == 0xc5) {
    // Latin Extended-A block: these French letters do not share the 0xC3
    // lead byte used by all the others.
    //
    // œ 0xC5 0x93
    // Œ 0xC5 0x92
    // Ÿ 0xC5 0xB8
    return trail == 0x93 || trail == 0x92 || trail == 0xb8;
  }

  return false;
}
  279 +
212 static bool IsSpecial(const std::string &w) { 280 static bool IsSpecial(const std::string &w) {
213 - return IsGermanUmlauts(w) || IsSpanishDiacritic(w); 281 + bool ans = IsGermanUmlaut(w) || IsSpanishDiacritic(w) || IsFrenchDiacritic(w);
  282 +
  283 + // for french d’impossible
  284 + // ’ 0xE2 0x80 0x99
  285 + bool ans2 = false;
  286 + if (w.size() == 3) {
  287 + auto c0 = static_cast<uint8_t>(w[0]);
  288 + auto c1 = static_cast<uint8_t>(w[1]);
  289 + auto c2 = static_cast<uint8_t>(w[2]);
  290 + if (c0 == 0xe2 && c1 == 0x80 && c2 == 0x99) {
  291 + ans2 = true;
  292 + }
  293 + }
  294 +
  295 + return ans || ans2;
214 } 296 }
215 297
216 static std::vector<std::string> MergeCharactersIntoWords( 298 static std::vector<std::string> MergeCharactersIntoWords(