Fangjun Kuang
Committed by GitHub

Support extra languages in multi-lang kokoro tts (#2303)

@@ -35,18 +35,18 @@ jobs: @@ -35,18 +35,18 @@ jobs:
35 matrix: 35 matrix:
36 # See https://github.com/actions/runner-images 36 # See https://github.com/actions/runner-images
37 include: 37 include:
38 - - os: ubuntu-22.04  
39 - python-version: "3.7"  
40 - - os: ubuntu-22.04 38 + - os: ubuntu-latest
41 python-version: "3.8" 39 python-version: "3.8"
42 - - os: ubuntu-22.04 40 + - os: ubuntu-latest
43 python-version: "3.9" 41 python-version: "3.9"
44 - - os: ubuntu-22.04 42 + - os: ubuntu-latest
45 python-version: "3.10" 43 python-version: "3.10"
46 - - os: ubuntu-22.04 44 + - os: ubuntu-latest
47 python-version: "3.11" 45 python-version: "3.11"
48 - - os: ubuntu-22.04 46 + - os: ubuntu-latest
49 python-version: "3.12" 47 python-version: "3.12"
  48 + - os: ubuntu-latest
  49 + python-version: "3.13"
50 50
51 - os: macos-13 51 - os: macos-13
52 python-version: "3.8" 52 python-version: "3.8"
@@ -103,7 +103,7 @@ jobs: @@ -103,7 +103,7 @@ jobs:
103 export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" 103 export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
104 cmake --version 104 cmake --version
105 105
106 - export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j" 106 + export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j2"
107 107
108 python3 setup.py bdist_wheel 108 python3 setup.py bdist_wheel
109 ls -lh dist 109 ls -lh dist
1 ### Supported functions 1 ### Supported functions
2 2
3 -|Speech recognition| Speech synthesis | Source separation | 3 +|Speech recognition| [Speech synthesis][tts-url] | [Source separation][ss-url] |
4 |------------------|------------------|-------------------| 4 |------------------|------------------|-------------------|
5 | ✔️ | ✔️ | ✔️ | 5 | ✔️ | ✔️ | ✔️ |
6 6
7 -|Speaker identification| Speaker diarization | Speaker verification | 7 +|Speaker identification| [Speaker diarization][sd-url] | Speaker verification |
8 |----------------------|-------------------- |------------------------| 8 |----------------------|-------------------- |------------------------|
9 | ✔️ | ✔️ | ✔️ | 9 | ✔️ | ✔️ | ✔️ |
10 10
11 -| Spoken Language identification | Audio tagging | Voice activity detection | 11 +| [Spoken Language identification][slid-url] | [Audio tagging][at-url] | [Voice activity detection][vad-url] |
12 |--------------------------------|---------------|--------------------------| 12 |--------------------------------|---------------|--------------------------|
13 | ✔️ | ✔️ | ✔️ | 13 | ✔️ | ✔️ | ✔️ |
14 14
15 -| Keyword spotting | Add punctuation | Speech enhancement | 15 +| [Keyword spotting][kws-url] | [Add punctuation][punct-url] | [Speech enhancement][se-url] |
16 |------------------|-----------------|--------------------| 16 |------------------|-----------------|--------------------|
17 | ✔️ | ✔️ | ✔️ | 17 | ✔️ | ✔️ | ✔️ |
18 18
@@ -501,3 +501,12 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. @@ -501,3 +501,12 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
501 [spleeter]: https://github.com/deezer/spleeter 501 [spleeter]: https://github.com/deezer/spleeter
502 [UVR]: https://github.com/Anjok07/ultimatevocalremovergui 502 [UVR]: https://github.com/Anjok07/ultimatevocalremovergui
503 [gtcrn]: https://github.com/Xiaobin-Rong/gtcrn 503 [gtcrn]: https://github.com/Xiaobin-Rong/gtcrn
  504 +[tts-url]: https://k2-fsa.github.io/sherpa/onnx/tts/all-in-one.html
  505 +[ss-url]: https://k2-fsa.github.io/sherpa/onnx/source-separation/index.html
  506 +[sd-url]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/index.html
  507 +[slid-url]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/index.html
  508 +[at-url]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/index.html
  509 +[vad-url]: https://k2-fsa.github.io/sherpa/onnx/vad/index.html
  510 +[kws-url]: https://k2-fsa.github.io/sherpa/onnx/kws/index.html
  511 +[punct-url]: https://k2-fsa.github.io/sherpa/onnx/punctuation/index.html
  512 +[se-url]: https://k2-fsa.github.io/sherpa/onnx/speech-enhancment/index.html
@@ -201,6 +201,7 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct { @@ -201,6 +201,7 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
201 external double lengthScale; 201 external double lengthScale;
202 external Pointer<Utf8> dictDir; 202 external Pointer<Utf8> dictDir;
203 external Pointer<Utf8> lexicon; 203 external Pointer<Utf8> lexicon;
  204 + external Pointer<Utf8> lang;
204 } 205 }
205 206
206 final class SherpaOnnxOfflineTtsModelConfig extends Struct { 207 final class SherpaOnnxOfflineTtsModelConfig extends Struct {
@@ -117,6 +117,7 @@ class OfflineTtsKokoroModelConfig { @@ -117,6 +117,7 @@ class OfflineTtsKokoroModelConfig {
117 this.lengthScale = 1.0, 117 this.lengthScale = 1.0,
118 this.dictDir = '', 118 this.dictDir = '',
119 this.lexicon = '', 119 this.lexicon = '',
  120 + this.lang = '',
120 }); 121 });
121 122
122 factory OfflineTtsKokoroModelConfig.fromJson(Map<String, dynamic> json) { 123 factory OfflineTtsKokoroModelConfig.fromJson(Map<String, dynamic> json) {
@@ -128,12 +129,13 @@ class OfflineTtsKokoroModelConfig { @@ -128,12 +129,13 @@ class OfflineTtsKokoroModelConfig {
128 lengthScale: (json['lengthScale'] as num?)?.toDouble() ?? 1.0, 129 lengthScale: (json['lengthScale'] as num?)?.toDouble() ?? 1.0,
129 dictDir: json['dictDir'] as String? ?? '', 130 dictDir: json['dictDir'] as String? ?? '',
130 lexicon: json['lexicon'] as String? ?? '', 131 lexicon: json['lexicon'] as String? ?? '',
  132 + lang: json['lang'] as String? ?? '',
131 ); 133 );
132 } 134 }
133 135
134 @override 136 @override
135 String toString() { 137 String toString() {
136 - return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon)'; 138 + return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon, lang: $lang)';
137 } 139 }
138 140
139 Map<String, dynamic> toJson() => { 141 Map<String, dynamic> toJson() => {
@@ -144,6 +146,7 @@ class OfflineTtsKokoroModelConfig { @@ -144,6 +146,7 @@ class OfflineTtsKokoroModelConfig {
144 'lengthScale': lengthScale, 146 'lengthScale': lengthScale,
145 'dictDir': dictDir, 147 'dictDir': dictDir,
146 'lexicon': lexicon, 148 'lexicon': lexicon,
  149 + 'lang': lang,
147 }; 150 };
148 151
149 final String model; 152 final String model;
@@ -153,6 +156,7 @@ class OfflineTtsKokoroModelConfig { @@ -153,6 +156,7 @@ class OfflineTtsKokoroModelConfig {
153 final double lengthScale; 156 final double lengthScale;
154 final String dictDir; 157 final String dictDir;
155 final String lexicon; 158 final String lexicon;
  159 + final String lang;
156 } 160 }
157 161
158 class OfflineTtsModelConfig { 162 class OfflineTtsModelConfig {
@@ -286,6 +290,7 @@ class OfflineTts { @@ -286,6 +290,7 @@ class OfflineTts {
286 c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale; 290 c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale;
287 c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8(); 291 c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8();
288 c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8(); 292 c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8();
  293 + c.ref.model.kokoro.lang = config.model.kokoro.lang.toNativeUtf8();
289 294
290 c.ref.model.numThreads = config.model.numThreads; 295 c.ref.model.numThreads = config.model.numThreads;
291 c.ref.model.debug = config.model.debug ? 1 : 0; 296 c.ref.model.debug = config.model.debug ? 1 : 0;
@@ -302,6 +307,7 @@ class OfflineTts { @@ -302,6 +307,7 @@ class OfflineTts {
302 calloc.free(c.ref.ruleFsts); 307 calloc.free(c.ref.ruleFsts);
303 calloc.free(c.ref.model.provider); 308 calloc.free(c.ref.model.provider);
304 309
  310 + calloc.free(c.ref.model.kokoro.lang);
305 calloc.free(c.ref.model.kokoro.lexicon); 311 calloc.free(c.ref.model.kokoro.lexicon);
306 calloc.free(c.ref.model.kokoro.dictDir); 312 calloc.free(c.ref.model.kokoro.dictDir);
307 calloc.free(c.ref.model.kokoro.dataDir); 313 calloc.free(c.ref.model.kokoro.dataDir);
@@ -70,6 +70,7 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig( @@ -70,6 +70,7 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig(
70 SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale); 70 SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
71 SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir); 71 SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
72 SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon); 72 SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
  73 + SHERPA_ONNX_ASSIGN_ATTR_STR(lang, lang);
73 74
74 return c; 75 return c;
75 } 76 }
@@ -177,6 +178,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( @@ -177,6 +178,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
177 SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.data_dir); 178 SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.data_dir);
178 SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.dict_dir); 179 SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.dict_dir);
179 SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lexicon); 180 SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lexicon);
  181 + SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lang);
180 182
181 SHERPA_ONNX_DELETE_C_STR(c.model.provider); 183 SHERPA_ONNX_DELETE_C_STR(c.model.provider);
182 184
@@ -36,6 +36,7 @@ export class OfflineTtsKokoroModelConfig { @@ -36,6 +36,7 @@ export class OfflineTtsKokoroModelConfig {
36 public lengthScale: number = 1.0; 36 public lengthScale: number = 1.0;
37 public dictDir: string = ''; 37 public dictDir: string = '';
38 public lexicon: string = ''; 38 public lexicon: string = '';
  39 + public lang: string = '';
39 } 40 }
40 41
41 export class OfflineTtsModelConfig { 42 export class OfflineTtsModelConfig {
@@ -18,6 +18,7 @@ namespace SherpaOnnx @@ -18,6 +18,7 @@ namespace SherpaOnnx
18 18
19 DictDir = ""; 19 DictDir = "";
20 Lexicon = ""; 20 Lexicon = "";
  21 + Lang = "";
21 } 22 }
22 [MarshalAs(UnmanagedType.LPStr)] 23 [MarshalAs(UnmanagedType.LPStr)]
23 public string Model; 24 public string Model;
@@ -38,5 +39,8 @@ namespace SherpaOnnx @@ -38,5 +39,8 @@ namespace SherpaOnnx
38 39
39 [MarshalAs(UnmanagedType.LPStr)] 40 [MarshalAs(UnmanagedType.LPStr)]
40 public string Lexicon; 41 public string Lexicon;
  42 +
  43 + [MarshalAs(UnmanagedType.LPStr)]
  44 + public string Lang;
41 } 45 }
42 } 46 }
@@ -857,6 +857,7 @@ type OfflineTtsKokoroModelConfig struct { @@ -857,6 +857,7 @@ type OfflineTtsKokoroModelConfig struct {
857 DataDir string // Path to espeak-ng-data directory 857 DataDir string // Path to espeak-ng-data directory
858 DictDir string // Path to dict directory 858 DictDir string // Path to dict directory
859 Lexicon string // Path to lexicon files 859 Lexicon string // Path to lexicon files
  860 + Lang string // Example: es for Spanish, fr-fr for French. Can be empty
860 LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed 861 LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
861 } 862 }
862 863
@@ -1006,6 +1007,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { @@ -1006,6 +1007,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
1006 c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon) 1007 c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon)
1007 defer C.free(unsafe.Pointer(c.model.kokoro.lexicon)) 1008 defer C.free(unsafe.Pointer(c.model.kokoro.lexicon))
1008 1009
  1010 + c.model.kokoro.lang = C.CString(config.Model.Kokoro.Lang)
  1011 + defer C.free(unsafe.Pointer(c.model.kokoro.lang))
  1012 +
1009 c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale) 1013 c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
1010 1014
1011 c.model.num_threads = C.int(config.Model.NumThreads) 1015 c.model.num_threads = C.int(config.Model.NumThreads)
@@ -1164,6 +1164,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( @@ -1164,6 +1164,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
1164 SHERPA_ONNX_OR(config->model.kokoro.dict_dir, ""); 1164 SHERPA_ONNX_OR(config->model.kokoro.dict_dir, "");
1165 tts_config.model.kokoro.lexicon = 1165 tts_config.model.kokoro.lexicon =
1166 SHERPA_ONNX_OR(config->model.kokoro.lexicon, ""); 1166 SHERPA_ONNX_OR(config->model.kokoro.lexicon, "");
  1167 + tts_config.model.kokoro.lang = SHERPA_ONNX_OR(config->model.kokoro.lang, "");
1167 1168
1168 tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); 1169 tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
1169 tts_config.model.debug = config->model.debug; 1170 tts_config.model.debug = config->model.debug;
@@ -958,6 +958,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsKokoroModelConfig { @@ -958,6 +958,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsKokoroModelConfig {
958 float length_scale; // < 1, faster in speech speed; > 1, slower in speed 958 float length_scale; // < 1, faster in speech speed; > 1, slower in speed
959 const char *dict_dir; 959 const char *dict_dir;
960 const char *lexicon; 960 const char *lexicon;
  961 + const char *lang;
961 } SherpaOnnxOfflineTtsKokoroModelConfig; 962 } SherpaOnnxOfflineTtsKokoroModelConfig;
962 963
963 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { 964 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
@@ -366,6 +366,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { @@ -366,6 +366,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
366 c.model.kokoro.length_scale = config.model.kokoro.length_scale; 366 c.model.kokoro.length_scale = config.model.kokoro.length_scale;
367 c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str(); 367 c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str();
368 c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str(); 368 c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str();
  369 + c.model.kokoro.lang = config.model.kokoro.lang.c_str();
369 370
370 c.model.num_threads = config.model.num_threads; 371 c.model.num_threads = config.model.num_threads;
371 c.model.debug = config.model.debug; 372 c.model.debug = config.model.debug;
@@ -367,6 +367,7 @@ struct OfflineTtsKokoroModelConfig { @@ -367,6 +367,7 @@ struct OfflineTtsKokoroModelConfig {
367 std::string data_dir; 367 std::string data_dir;
368 std::string dict_dir; 368 std::string dict_dir;
369 std::string lexicon; 369 std::string lexicon;
  370 + std::string lang;
370 371
371 float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed 372 float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
372 }; 373 };
@@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl { @@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl {
67 InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc 67 InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
68 } 68 }
69 69
70 - std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const { 70 + std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
  71 + const std::string &voice) const {
71 std::string text = ToLowerCase(_text); 72 std::string text = ToLowerCase(_text);
72 if (debug_) { 73 if (debug_) {
73 SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str()); 74 SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
@@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl { @@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl {
124 SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); 125 SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
125 } 126 }
126 127
127 - ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice); 128 + ids_vec = ConvertNonChineseToTokenIDs(ms, voice);
128 } 129 }
129 130
130 for (const auto &ids : ids_vec) { 131 for (const auto &ids : ids_vec) {
@@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl { @@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl {
255 return ans; 256 return ans;
256 } 257 }
257 258
258 - std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs( 259 + std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
259 const std::string &text, const std::string &voice) const { 260 const std::string &text, const std::string &voice) const {
  261 + auto temp = ConvertTextToTokenIdsKokoro(
  262 + phoneme2id_, meta_data_.max_token_len, text, voice);
  263 + std::vector<std::vector<int32_t>> ans;
  264 + ans.reserve(temp.size());
  265 +
  266 + for (const auto &i : temp) {
  267 + ans.emplace_back(i.tokens.begin(), i.tokens.end());
  268 + }
  269 +
  270 + return ans;
  271 + }
  272 +
  273 + std::vector<std::vector<int32_t>> ConvertNonChineseToTokenIDs(
  274 + const std::string &text, const std::string &voice) const {
  275 + if (!voice.empty()) {
  276 + return ConvertTextToTokenIDsWithEspeak(text, voice);
  277 + }
  278 +
  279 + // If voice is empty, we split the text into words and use the lexicon
  280 + // to lookup the pronunciation of each word, fallback to espeak if
  281 + // a word is not in the lexicon.
  282 +
260 std::vector<std::string> words = SplitUtf8(text); 283 std::vector<std::string> words = SplitUtf8(text);
261 if (debug_) { 284 if (debug_) {
262 std::ostringstream os; 285 std::ostringstream os;
@@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl { @@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl {
317 340
318 piper::eSpeakPhonemeConfig config; 341 piper::eSpeakPhonemeConfig config;
319 342
320 - config.voice = voice; 343 + config.voice = meta_data_.voice;
321 344
322 std::vector<std::vector<piper::Phoneme>> phonemes; 345 std::vector<std::vector<piper::Phoneme>> phonemes;
323 346
@@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl { @@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl {
391 414
392 void InitTokens(std::istream &is) { 415 void InitTokens(std::istream &is) {
393 token2id_ = ReadTokens(is); // defined in ./symbol-table.cc 416 token2id_ = ReadTokens(is); // defined in ./symbol-table.cc
  417 +
  418 + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
  419 + std::u32string s;
  420 + for (const auto &p : token2id_) {
  421 + s = conv.from_bytes(p.first);
  422 +
  423 + if (s.size() != 1) {
  424 + SHERPA_ONNX_LOGE("Error for token %s with id %d", p.first.c_str(),
  425 + p.second);
  426 + SHERPA_ONNX_EXIT(-1);
  427 + }
  428 +
  429 + char32_t c = s[0];
  430 + phoneme2id_.insert({c, p.second});
  431 + }
394 } 432 }
395 433
396 void InitLexicon(const std::string &lexicon) { 434 void InitLexicon(const std::string &lexicon) {
  435 + if (lexicon.empty()) {
  436 + return;
  437 + }
  438 +
397 std::vector<std::string> files; 439 std::vector<std::string> files;
398 SplitStringToVector(lexicon, ",", false, &files); 440 SplitStringToVector(lexicon, ",", false, &files);
399 for (const auto &f : files) { 441 for (const auto &f : files) {
@@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl { @@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl {
404 446
405 template <typename Manager> 447 template <typename Manager>
406 void InitLexicon(Manager *mgr, const std::string &lexicon) { 448 void InitLexicon(Manager *mgr, const std::string &lexicon) {
  449 + if (lexicon.empty()) {
  450 + return;
  451 + }
  452 +
407 std::vector<std::string> files; 453 std::vector<std::string> files;
408 SplitStringToVector(lexicon, ",", false, &files); 454 SplitStringToVector(lexicon, ",", false, &files);
409 for (const auto &f : files) { 455 for (const auto &f : files) {
@@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl { @@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl {
445 491
446 std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list); 492 std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
447 493
448 - if (ids.empty()) { 494 + if (ids.empty() && word != "呣") {
449 SHERPA_ONNX_LOGE( 495 SHERPA_ONNX_LOGE(
450 "Invalid pronunciation for word '%s' at line %d:%s. Ignore it", 496 "Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
451 word.c_str(), line_num, line.c_str()); 497 word.c_str(), line_num, line.c_str());
@@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl { @@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl {
465 // tokens.txt is saved in token2id_ 511 // tokens.txt is saved in token2id_
466 std::unordered_map<std::string, int32_t> token2id_; 512 std::unordered_map<std::string, int32_t> token2id_;
467 513
  514 + std::unordered_map<char32_t, int32_t> phoneme2id_;
  515 +
468 std::unique_ptr<cppjieba::Jieba> jieba_; 516 std::unique_ptr<cppjieba::Jieba> jieba_;
469 bool debug_ = false; 517 bool debug_ = false;
470 }; 518 };
@@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon( @@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon(
487 meta_data, debug)) {} 535 meta_data, debug)) {}
488 536
489 std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds( 537 std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
490 - const std::string &text, const std::string & /*unused_voice = ""*/) const {  
491 - return impl_->ConvertTextToTokenIds(text); 538 + const std::string &text, const std::string &voice /*= ""*/) const {
  539 + return impl_->ConvertTextToTokenIds(text, voice);
492 } 540 }
493 541
494 #if __ANDROID_API__ >= 9 542 #if __ANDROID_API__ >= 9
@@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig { @@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig {
20 20
21 OfflineSpeechDenoiserModelConfig() = default; 21 OfflineSpeechDenoiserModelConfig() = default;
22 22
23 - OfflineSpeechDenoiserModelConfig(OfflineSpeechDenoiserGtcrnModelConfig gtcrn,  
24 - int32_t num_threads, bool debug,  
25 - const std::string &provider) 23 + OfflineSpeechDenoiserModelConfig(
  24 + const OfflineSpeechDenoiserGtcrnModelConfig &gtcrn, int32_t num_threads,
  25 + bool debug, const std::string &provider)
26 : gtcrn(gtcrn), 26 : gtcrn(gtcrn),
27 num_threads(num_threads), 27 num_threads(num_threads),
28 debug(debug), 28 debug(debug),
@@ -6,6 +6,7 @@ @@ -6,6 +6,7 @@
6 #define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ 6 #define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
7 #include <cstdint> 7 #include <cstdint>
8 #include <string> 8 #include <string>
  9 +#include <unordered_map>
9 #include <utility> 10 #include <utility>
10 #include <vector> 11 #include <vector>
11 12
@@ -57,6 +58,12 @@ class OfflineTtsFrontend { @@ -57,6 +58,12 @@ class OfflineTtsFrontend {
57 // implementation is in ./piper-phonemize-lexicon.cc 58 // implementation is in ./piper-phonemize-lexicon.cc
58 void InitEspeak(const std::string &data_dir); 59 void InitEspeak(const std::string &data_dir);
59 60
  61 +// implementation in ./piper-phonemize-lexicon.cc
  62 +std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
  63 + const std::unordered_map<char32_t, int32_t> &token2id,
  64 + int32_t max_token_len, const std::string &text,
  65 + const std::string &voice = "");
  66 +
60 } // namespace sherpa_onnx 67 } // namespace sherpa_onnx
61 68
62 #endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ 69 #endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
@@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { @@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
220 } 220 }
221 } 221 }
222 222
223 - std::vector<TokenIDs> token_ids =  
224 - frontend_->ConvertTextToTokenIds(text, meta_data.voice); 223 + std::vector<TokenIDs> token_ids = frontend_->ConvertTextToTokenIds(
  224 + text, config_.model.kokoro.lang.empty() ? meta_data.voice
  225 + : config_.model.kokoro.lang);
225 226
226 if (token_ids.empty() || 227 if (token_ids.empty() ||
227 (token_ids.size() == 1 && token_ids[0].tokens.empty())) { 228 (token_ids.size() == 1 && token_ids[0].tokens.empty())) {
@@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { @@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
335 if (meta_data.version >= 2) { 336 if (meta_data.version >= 2) {
336 // this is a multi-lingual model, we require that you pass lexicon 337 // this is a multi-lingual model, we require that you pass lexicon
337 // and dict_dir 338 // and dict_dir
338 - if (config_.model.kokoro.lexicon.empty() || 339 + if ((config_.model.kokoro.lexicon.empty() &&
  340 + config_.model.kokoro.lang.empty()) ||
339 config_.model.kokoro.dict_dir.empty()) { 341 config_.model.kokoro.dict_dir.empty()) {
340 SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); 342 SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
341 SHERPA_ONNX_LOGE( 343 SHERPA_ONNX_LOGE(
342 "You are using a multi-lingual Kokoro model (e.g., Kokoro >= " 344 "You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
343 - "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir"); 345 + "v1.0). Please pass --kokoro-lexicon and --kokoro-dict-dir or "
  346 + "provide --kokoro-lang and --kokoro-dict-dir");
344 SHERPA_ONNX_EXIT(-1); 347 SHERPA_ONNX_EXIT(-1);
345 } 348 }
346 349
@@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { @@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
362 if (meta_data.version >= 2) { 365 if (meta_data.version >= 2) {
363 // this is a multi-lingual model, we require that you pass lexicon 366 // this is a multi-lingual model, we require that you pass lexicon
364 // and dict_dir 367 // and dict_dir
365 - if (config_.model.kokoro.lexicon.empty() || 368 + if ((config_.model.kokoro.lexicon.empty() &&
  369 + config_.model.kokoro.lang.empty()) ||
366 config_.model.kokoro.dict_dir.empty()) { 370 config_.model.kokoro.dict_dir.empty()) {
367 SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); 371 SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
368 SHERPA_ONNX_LOGE( 372 SHERPA_ONNX_LOGE(
@@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) { @@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
18 "Path to voices.bin for Kokoro models"); 18 "Path to voices.bin for Kokoro models");
19 po->Register("kokoro-tokens", &tokens, 19 po->Register("kokoro-tokens", &tokens,
20 "Path to tokens.txt for Kokoro models"); 20 "Path to tokens.txt for Kokoro models");
  21 + po->Register("kokoro-lang", &lang,
  22 + "Used only by kokoro >= 1.0. Example values: "
  23 + "en (English), "
  24 + "es (Spanish), fr (French), hi (Hindi), it (Italian), "
  25 + "pt-br (Brazilian Portuguese). "
  26 + "You can leave it empty, in which case you need to provide "
  27 + "--kokoro-lexicon.");
21 po->Register( 28 po->Register(
22 "kokoro-lexicon", &lexicon, 29 "kokoro-lexicon", &lexicon,
23 "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0" 30 "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0"
@@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const { @@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
127 os << "lexicon=\"" << lexicon << "\", "; 134 os << "lexicon=\"" << lexicon << "\", ";
128 os << "data_dir=\"" << data_dir << "\", "; 135 os << "data_dir=\"" << data_dir << "\", ";
129 os << "dict_dir=\"" << dict_dir << "\", "; 136 os << "dict_dir=\"" << dict_dir << "\", ";
130 - os << "length_scale=" << length_scale << ")"; 137 + os << "length_scale=" << length_scale << ", ";
  138 + os << "lang=\"" << lang << "\")";
131 139
132 return os.str(); 140 return os.str();
133 } 141 }
@@ -27,6 +27,13 @@ struct OfflineTtsKokoroModelConfig { @@ -27,6 +27,13 @@ struct OfflineTtsKokoroModelConfig {
27 // speed = 1 / length_scale 27 // speed = 1 / length_scale
28 float length_scale = 1.0; 28 float length_scale = 1.0;
29 29
  30 + // Used only for Kokoro >= 1.0.
  31 + //
  32 + // If it is not empty, meta_data.voice is ignored.
  33 + // Example values: es (Spanish), fr (French), pt (Portuguese)
  34 + // See https://hf-mirror.com/hexgrad/Kokoro-82M/blob/main/VOICES.md
  35 + std::string lang;
  36 +
30 OfflineTtsKokoroModelConfig() = default; 37 OfflineTtsKokoroModelConfig() = default;
31 38
32 OfflineTtsKokoroModelConfig(const std::string &model, 39 OfflineTtsKokoroModelConfig(const std::string &model,
@@ -34,14 +41,16 @@ struct OfflineTtsKokoroModelConfig { @@ -34,14 +41,16 @@ struct OfflineTtsKokoroModelConfig {
34 const std::string &tokens, 41 const std::string &tokens,
35 const std::string &lexicon, 42 const std::string &lexicon,
36 const std::string &data_dir, 43 const std::string &data_dir,
37 - const std::string &dict_dir, float length_scale) 44 + const std::string &dict_dir, float length_scale,
  45 + const std::string &lang)
38 : model(model), 46 : model(model),
39 voices(voices), 47 voices(voices),
40 tokens(tokens), 48 tokens(tokens),
41 lexicon(lexicon), 49 lexicon(lexicon),
42 data_dir(data_dir), 50 data_dir(data_dir),
43 dict_dir(dict_dir), 51 dict_dir(dict_dir),
44 - length_scale(length_scale) {} 52 + length_scale(length_scale),
  53 + lang(lang) {}
45 54
46 void Register(ParseOptions *po); 55 void Register(ParseOptions *po);
47 bool Validate() const; 56 bool Validate() const;
@@ -351,7 +351,8 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( @@ -351,7 +351,8 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
351 if (is_matcha_) { 351 if (is_matcha_) {
352 return ConvertTextToTokenIdsMatcha(text, voice); 352 return ConvertTextToTokenIdsMatcha(text, voice);
353 } else if (is_kokoro_) { 353 } else if (is_kokoro_) {
354 - return ConvertTextToTokenIdsKokoro(text, voice); 354 + return ConvertTextToTokenIdsKokoro(
  355 + token2id_, kokoro_meta_data_.max_token_len, text, voice);
355 } else { 356 } else {
356 return ConvertTextToTokenIdsVits(text, voice); 357 return ConvertTextToTokenIdsVits(text, voice);
357 } 358 }
@@ -382,8 +383,10 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha( @@ -382,8 +383,10 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
382 return ans; 383 return ans;
383 } 384 }
384 385
385 -std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(  
386 - const std::string &text, const std::string &voice /*= ""*/) const { 386 +std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
  387 + const std::unordered_map<char32_t, int32_t> &token2id,
  388 + int32_t max_token_len, const std::string &text,
  389 + const std::string &voice /*= ""*/) {
387 piper::eSpeakPhonemeConfig config; 390 piper::eSpeakPhonemeConfig config;
388 391
389 // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices 392 // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
@@ -397,8 +400,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro( @@ -397,8 +400,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
397 std::vector<TokenIDs> ans; 400 std::vector<TokenIDs> ans;
398 401
399 for (const auto &p : phonemes) { 402 for (const auto &p : phonemes) {
400 - auto phoneme_ids =  
401 - PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len); 403 + auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len);
402 404
403 for (auto &ids : phoneme_ids) { 405 for (auto &ids : phoneme_ids) {
404 ans.emplace_back(std::move(ids)); 406 ans.emplace_back(std::move(ids));
@@ -52,9 +52,6 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { @@ -52,9 +52,6 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
52 std::vector<TokenIDs> ConvertTextToTokenIdsMatcha( 52 std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
53 const std::string &text, const std::string &voice = "") const; 53 const std::string &text, const std::string &voice = "") const;
54 54
55 - std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(  
56 - const std::string &text, const std::string &voice = "") const;  
57 -  
58 private: 55 private:
59 // map unicode codepoint to an integer ID 56 // map unicode codepoint to an integer ID
60 std::unordered_map<char32_t, int32_t> token2id_; 57 std::unordered_map<char32_t, int32_t> token2id_;
@@ -6,6 +6,7 @@ public class OfflineTtsKokoroModelConfig { @@ -6,6 +6,7 @@ public class OfflineTtsKokoroModelConfig {
6 private final String voices; 6 private final String voices;
7 private final String tokens; 7 private final String tokens;
8 private final String lexicon; 8 private final String lexicon;
  9 + private final String lang;
9 private final String dataDir; 10 private final String dataDir;
10 private final String dictDir; 11 private final String dictDir;
11 private final float lengthScale; 12 private final float lengthScale;
@@ -15,6 +16,7 @@ public class OfflineTtsKokoroModelConfig { @@ -15,6 +16,7 @@ public class OfflineTtsKokoroModelConfig {
15 this.voices = builder.voices; 16 this.voices = builder.voices;
16 this.tokens = builder.tokens; 17 this.tokens = builder.tokens;
17 this.lexicon = builder.lexicon; 18 this.lexicon = builder.lexicon;
  19 + this.lang = builder.lang;
18 this.dataDir = builder.dataDir; 20 this.dataDir = builder.dataDir;
19 this.dictDir = builder.dictDir; 21 this.dictDir = builder.dictDir;
20 this.lengthScale = builder.lengthScale; 22 this.lengthScale = builder.lengthScale;
@@ -50,6 +52,7 @@ public class OfflineTtsKokoroModelConfig { @@ -50,6 +52,7 @@ public class OfflineTtsKokoroModelConfig {
50 private String voices = ""; 52 private String voices = "";
51 private String tokens = ""; 53 private String tokens = "";
52 private String lexicon = ""; 54 private String lexicon = "";
  55 + private String lang = "";
53 private String dataDir = ""; 56 private String dataDir = "";
54 private String dictDir = ""; 57 private String dictDir = "";
55 private float lengthScale = 1.0f; 58 private float lengthScale = 1.0f;
@@ -78,6 +81,11 @@ public class OfflineTtsKokoroModelConfig { @@ -78,6 +81,11 @@ public class OfflineTtsKokoroModelConfig {
78 return this; 81 return this;
79 } 82 }
80 83
  84 + public Builder setLang(String lang) {
  85 + this.lang = lang;
  86 + return this;
  87 + }
  88 +
81 public Builder setDataDir(String dataDir) { 89 public Builder setDataDir(String dataDir) {
82 this.dataDir = dataDir; 90 this.dataDir = dataDir;
83 return this; 91 return this;
@@ -145,6 +145,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { @@ -145,6 +145,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
145 ans.model.kokoro.lexicon = p; 145 ans.model.kokoro.lexicon = p;
146 env->ReleaseStringUTFChars(s, p); 146 env->ReleaseStringUTFChars(s, p);
147 147
  148 + fid = env->GetFieldID(kokoro_cls, "lang", "Ljava/lang/String;");
  149 + s = (jstring)env->GetObjectField(kokoro, fid);
  150 + p = env->GetStringUTFChars(s, nullptr);
  151 + ans.model.kokoro.lang = p;
  152 + env->ReleaseStringUTFChars(s, p);
  153 +
148 fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;"); 154 fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;");
149 s = (jstring)env->GetObjectField(kokoro, fid); 155 s = (jstring)env->GetObjectField(kokoro, fid);
150 p = env->GetStringUTFChars(s, nullptr); 156 p = env->GetStringUTFChars(s, nullptr);
@@ -31,6 +31,7 @@ data class OfflineTtsKokoroModelConfig( @@ -31,6 +31,7 @@ data class OfflineTtsKokoroModelConfig(
31 var tokens: String = "", 31 var tokens: String = "",
32 var dataDir: String = "", 32 var dataDir: String = "",
33 var lexicon: String = "", 33 var lexicon: String = "",
  34 + var lang: String = "",
34 var dictDir: String = "", 35 var dictDir: String = "",
35 var lengthScale: Float = 1.0f, 36 var lengthScale: Float = 1.0f,
36 ) 37 )
@@ -84,6 +84,7 @@ type @@ -84,6 +84,7 @@ type
84 LengthScale: Single; 84 LengthScale: Single;
85 DictDir: AnsiString; 85 DictDir: AnsiString;
86 Lexicon: AnsiString; 86 Lexicon: AnsiString;
  87 + Lang: AnsiString;
87 88
88 function ToString: AnsiString; 89 function ToString: AnsiString;
89 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig); 90 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
@@ -841,6 +842,7 @@ type @@ -841,6 +842,7 @@ type
841 LengthScale: cfloat; 842 LengthScale: cfloat;
842 DictDir: PAnsiChar; 843 DictDir: PAnsiChar;
843 Lexicon: PAnsiChar; 844 Lexicon: PAnsiChar;
  845 + Lang: PAnsiChar;
844 end; 846 end;
845 847
846 SherpaOnnxOfflineTtsModelConfig = record 848 SherpaOnnxOfflineTtsModelConfig = record
@@ -2096,10 +2098,11 @@ begin @@ -2096,10 +2098,11 @@ begin
2096 'DataDir := %s, ' + 2098 'DataDir := %s, ' +
2097 'LengthScale := %.2f, ' + 2099 'LengthScale := %.2f, ' +
2098 'DictDir := %s, ' + 2100 'DictDir := %s, ' +
2099 - 'Lexicon := %s' + 2101 + 'Lexicon := %s, ' +
  2102 + 'Lang := %s' +
2100 ')', 2103 ')',
2101 [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale, 2104 [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale,
2102 - Self.DictDir, Self.Lexicon]); 2105 + Self.DictDir, Self.Lexicon, Self.Lang]);
2103 end; 2106 end;
2104 2107
2105 class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig); 2108 class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
@@ -2180,6 +2183,7 @@ begin @@ -2180,6 +2183,7 @@ begin
2180 C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale; 2183 C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
2181 C.Model.Kokoro.DictDir := PAnsiChar(Config.Model.Kokoro.DictDir); 2184 C.Model.Kokoro.DictDir := PAnsiChar(Config.Model.Kokoro.DictDir);
2182 C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon); 2185 C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon);
  2186 + C.Model.Kokoro.Lang := PAnsiChar(Config.Model.Kokoro.Lang);
2183 2187
2184 C.Model.NumThreads := Config.Model.NumThreads; 2188 C.Model.NumThreads := Config.Model.NumThreads;
2185 C.Model.Provider := PAnsiChar(Config.Model.Provider); 2189 C.Model.Provider := PAnsiChar(Config.Model.Provider);
@@ -17,10 +17,12 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) { @@ -17,10 +17,12 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
17 .def(py::init<>()) 17 .def(py::init<>())
18 .def(py::init<const std::string &, const std::string &, 18 .def(py::init<const std::string &, const std::string &,
19 const std::string &, const std::string &, 19 const std::string &, const std::string &,
20 - const std::string &, const std::string &, float>(), 20 + const std::string &, const std::string &, float,
  21 + const std::string &>(),
21 py::arg("model"), py::arg("voices"), py::arg("tokens"), 22 py::arg("model"), py::arg("voices"), py::arg("tokens"),
22 py::arg("lexicon") = "", py::arg("data_dir"), 23 py::arg("lexicon") = "", py::arg("data_dir"),
23 - py::arg("dict_dir") = "", py::arg("length_scale") = 1.0) 24 + py::arg("dict_dir") = "", py::arg("length_scale") = 1.0,
  25 + py::arg("lang") = "")
24 .def_readwrite("model", &PyClass::model) 26 .def_readwrite("model", &PyClass::model)
25 .def_readwrite("voices", &PyClass::voices) 27 .def_readwrite("voices", &PyClass::voices)
26 .def_readwrite("tokens", &PyClass::tokens) 28 .def_readwrite("tokens", &PyClass::tokens)
@@ -28,6 +30,7 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) { @@ -28,6 +30,7 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
28 .def_readwrite("data_dir", &PyClass::data_dir) 30 .def_readwrite("data_dir", &PyClass::data_dir)
29 .def_readwrite("dict_dir", &PyClass::dict_dir) 31 .def_readwrite("dict_dir", &PyClass::dict_dir)
30 .def_readwrite("length_scale", &PyClass::length_scale) 32 .def_readwrite("length_scale", &PyClass::length_scale)
  33 + .def_readwrite("lang", &PyClass::lang)
31 .def("__str__", &PyClass::ToString) 34 .def("__str__", &PyClass::ToString)
32 .def("validate", &PyClass::Validate); 35 .def("validate", &PyClass::Validate);
33 } 36 }
@@ -806,7 +806,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig( @@ -806,7 +806,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
806 dataDir: String = "", 806 dataDir: String = "",
807 lengthScale: Float = 1.0, 807 lengthScale: Float = 1.0,
808 dictDir: String = "", 808 dictDir: String = "",
809 - lexicon: String = "" 809 + lexicon: String = "",
  810 + lang: String = ""
810 ) -> SherpaOnnxOfflineTtsKokoroModelConfig { 811 ) -> SherpaOnnxOfflineTtsKokoroModelConfig {
811 return SherpaOnnxOfflineTtsKokoroModelConfig( 812 return SherpaOnnxOfflineTtsKokoroModelConfig(
812 model: toCPointer(model), 813 model: toCPointer(model),
@@ -815,7 +816,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig( @@ -815,7 +816,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
815 data_dir: toCPointer(dataDir), 816 data_dir: toCPointer(dataDir),
816 length_scale: lengthScale, 817 length_scale: lengthScale,
817 dict_dir: toCPointer(dictDir), 818 dict_dir: toCPointer(dictDir),
818 - lexicon: toCPointer(lexicon) 819 + lexicon: toCPointer(lexicon),
  820 + lang: toCPointer(lang)
819 ) 821 )
820 } 822 }
821 823
@@ -143,13 +143,14 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { @@ -143,13 +143,14 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
143 const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; 143 const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
144 const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1; 144 const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1;
145 const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; 145 const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
  146 + const langLen = Module.lengthBytesUTF8(config.lang || '') + 1;
146 147
147 - const n =  
148 - modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + lexiconLen; 148 + const n = modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen +
  149 + lexiconLen + langLen;
149 150
150 const buffer = Module._malloc(n); 151 const buffer = Module._malloc(n);
151 152
152 - const len = 7 * 4; 153 + const len = 8 * 4;
153 const ptr = Module._malloc(len); 154 const ptr = Module._malloc(len);
154 155
155 let offset = 0; 156 let offset = 0;
@@ -171,6 +172,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { @@ -171,6 +172,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
171 Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); 172 Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen);
172 offset += lexiconLen; 173 offset += lexiconLen;
173 174
  175 + Module.stringToUTF8(config.lang || '', buffer + offset, langLen);
  176 + offset += langLen;
  177 +
174 offset = 0; 178 offset = 0;
175 Module.setValue(ptr, buffer + offset, 'i8*'); 179 Module.setValue(ptr, buffer + offset, 'i8*');
176 offset += modelLen; 180 offset += modelLen;
@@ -192,6 +196,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { @@ -192,6 +196,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
192 Module.setValue(ptr + 24, buffer + offset, 'i8*'); 196 Module.setValue(ptr + 24, buffer + offset, 'i8*');
193 offset += lexiconLen; 197 offset += lexiconLen;
194 198
  199 + Module.setValue(ptr + 28, buffer + offset, 'i8*');
  200 + offset += langLen;
  201 +
195 return { 202 return {
196 buffer: buffer, ptr: ptr, len: len, 203 buffer: buffer, ptr: ptr, len: len,
197 } 204 }
@@ -233,6 +240,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -233,6 +240,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
233 dataDir: '', 240 dataDir: '',
234 dictDir: '', 241 dictDir: '',
235 lexicon: '', 242 lexicon: '',
  243 + lang: '',
236 }; 244 };
237 } 245 }
238 246
@@ -15,7 +15,7 @@ extern "C" { @@ -15,7 +15,7 @@ extern "C" {
15 15
16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); 16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); 17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
18 -static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 7 * 4, ""); 18 +static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, "");
19 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == 19 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
20 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 20 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
21 sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 21 sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +