Support Spanish in TTS (#396)

Fangjun Kuang · GitHub
Commit 64ab1ea9f8f8ad649dee6f1b4725e034ad08ebb2 64ab1ea9 1 parent 69e985f7
sherpa-onnx/csrc/lexicon.cc
sherpa-onnx/csrc/lexicon.h
sherpa-onnx/csrc/text-utils.cc
--- a/sherpa-onnx/csrc/lexicon.cc
查看文件 @64ab1ea
+++ b/sherpa-onnx/csrc/lexicon.cc
查看文件 @64ab1ea
@@ -131,6 +131,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
       return ConvertTextToTokenIdsEnglish(text);
     case Language::kGerman:
       return ConvertTextToTokenIdsGerman(text);
+     case Language::kSpanish:
+       return ConvertTextToTokenIdsSpanish(text);
     case Language::kChinese:
       return ConvertTextToTokenIdsChinese(text);
     default:
@@ -250,6 +252,8 @@ void Lexicon::InitLanguage(const std::string &_lang) {
     language_ = Language::kEnglish;
   } else if (lang == "german") {
     language_ = Language::kGerman;
+   } else if (lang == "spanish") {
+     language_ = Language::kSpanish;
   } else if (lang == "chinese") {
     language_ = Language::kChinese;
   } else {
--- a/sherpa-onnx/csrc/lexicon.h
查看文件 @64ab1ea
+++ b/sherpa-onnx/csrc/lexicon.h
查看文件 @64ab1ea
@@ -41,6 +41,11 @@ class Lexicon {
     return ConvertTextToTokenIdsEnglish(text);
   }
 
+   std::vector<int64_t> ConvertTextToTokenIdsSpanish(
+       const std::string &text) const {
+     return ConvertTextToTokenIdsEnglish(text);  // Spanish text is handed to the English conversion routine unchanged
+   }
+ 
   std::vector<int64_t> ConvertTextToTokenIdsEnglish(
       const std::string &text) const;
 
@@ -56,6 +61,7 @@ class Lexicon {
   enum class Language {  // decides which ConvertTextToTokenIds* routine ConvertTextToTokenIds dispatches to
     kEnglish,
     kGerman,
+     kSpanish,  // set by InitLanguage when the language string is "spanish"
     kChinese,
     kUnknown,
   };
--- a/sherpa-onnx/csrc/text-utils.cc
查看文件 @64ab1ea
+++ b/sherpa-onnx/csrc/text-utils.cc
查看文件 @64ab1ea
@@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
                                   std::vector<double> *out);
 
 static bool IsPunct(char c) { return c != '\'' && std::ispunct(c); }
- static bool IsGermanUmlauts(const std::string &words) {
+ static bool IsGermanUmlauts(const std::string &word) {
   // ä 0xC3 0xA4
   // ö 0xC3 0xB6
   // ü 0xC3 0xBC
@@ -173,12 +173,12 @@ static bool IsGermanUmlauts(const std::string &words) {
   // Ü 0xC3 0x9C
   // ß 0xC3 0x9F
 
-   if (words.size() != 2 || static_cast<uint8_t>(words[0]) != 0xc3) {
+   if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) {
     return false;
   }
 
-   auto c = static_cast<uint8_t>(words[1]);
-   if (c == 0xa4 || c == 0xb6 || c == 0xbC || c == 0x84 || c == 0x96 ||
+   auto c = static_cast<uint8_t>(word[1]);
+   if (c == 0xa4 || c == 0xb6 || c == 0xbc || c == 0x84 || c == 0x96 ||
       c == 0x9c || c == 0x9f) {
     return true;
   }
@@ -186,6 +186,33 @@ static bool IsGermanUmlauts(const std::string &words) {
   return false;
 }
 
+ // True iff word is one of the 2-byte UTF-8 letters below; see https://www.tandem.net/blog/spanish-accents
+ static bool IsSpanishDiacritic(const std::string &word) {
+   // á 0xC3 0xA1
+   // é 0xC3 0xA9
+   // í 0xC3 0xAD
+   // ó 0xC3 0xB3
+   // ú 0xC3 0xBA
+   // ü 0xC3 0xBC (the same byte pair IsGermanUmlauts accepts for ü)
+   // ñ 0xC3 0xB1
+   // All seven letters share the lead byte 0xC3, so any other size or lead byte is rejected up front.
+   if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) {
+     return false;
+   }
+   // The second byte alone identifies which of the listed letters this is.
+   auto c = static_cast<uint8_t>(word[1]);
+   if (c == 0xa1 || c == 0xa9 || c == 0xad || c == 0xb3 || c == 0xba ||
+       c == 0xbc || c == 0xb1) {
+     return true;
+   }
+   // NOTE(review): uppercase forms such as Ñ (0xC3 0x91) are not in the list and fall through to false.
+   return false;
+ }
+ // True if w is a two-byte sequence accepted by IsGermanUmlauts or by IsSpanishDiacritic.
+ static bool IsSpecial(const std::string &w) {
+   return IsGermanUmlauts(w) || IsSpanishDiacritic(w);  // MergeCharactersIntoWords groups such a w with 1-byte characters
+ }
+ 
 static std::vector<std::string> MergeCharactersIntoWords(
     const std::vector<std::string> &words) {
   std::vector<std::string> ans;
@@ -196,7 +223,7 @@ static std::vector<std::string> MergeCharactersIntoWords(
 
   while (i < n) {
     const auto &w = words[i];
-     if (w.size() >= 3 || (w.size() == 2 && !IsGermanUmlauts(w)) ||
+     if (w.size() >= 3 || (w.size() == 2 && !IsSpecial(w)) ||
         (w.size() == 1 && (IsPunct(w[0]) || std::isspace(w[0])))) {
       if (prev != -1) {
         std::string t;
@@ -215,7 +242,7 @@ static std::vector<std::string> MergeCharactersIntoWords(
     }
 
     // e.g., öffnen
-     if (w.size() == 1 || (w.size() == 2 && IsGermanUmlauts(w))) {
+     if (w.size() == 1 || (w.size() == 2 && IsSpecial(w))) {
       if (prev == -1) {
         prev = i;
       }