Fangjun Kuang
Committed by GitHub

Add C++ and Python API for Kokoro 1.0 multilingual TTS model (#1795)
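For anyone wiring the new API into their own code rather than the bundled examples, here is a minimal sketch (not part of this commit) of how the pieces added in this PR fit together from Python. It assumes a sherpa-onnx Python wheel built from this branch, that `kokoro-multi-lang-v1_0.tar.bz2` has been downloaded and unpacked as in the CI test below, and that `soundfile` is installed just to save the wav. The `lexicon` and `dict_dir` fields are the ones introduced here; everything else follows the existing `offline-tts.py` example.

```python
# Minimal sketch (not part of this commit): synthesize mixed Chinese/English
# speech with the multilingual Kokoro v1.0 model via the Python API.
import sherpa_onnx
import soundfile as sf  # only used to write the generated audio to disk

d = "./kokoro-multi-lang-v1_0"  # unpacked from kokoro-multi-lang-v1_0.tar.bz2

config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
            model=f"{d}/model.onnx",
            voices=f"{d}/voices.bin",
            tokens=f"{d}/tokens.txt",
            data_dir=f"{d}/espeak-ng-data",  # espeak-ng data, used for English OOV words
            dict_dir=f"{d}/dict",            # jieba dict for Chinese, new in this PR
            # multiple lexicon files, separated by ",", new in this PR
            lexicon=f"{d}/lexicon-us-en.txt,{d}/lexicon-zh.txt",
        ),
        provider="cpu",
        num_threads=2,
        debug=True,
    ),
)

tts = sherpa_onnx.OfflineTts(config)
audio = tts.generate(
    "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki.",
    sid=18,
    speed=1.0,
)
sf.write(
    "./kokoro-18-zh-en.wav",
    audio.samples,
    samplerate=audio.sample_rate,
    subtype="PCM_16",
)
```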

@@ -267,6 +267,27 @@ log "Offline TTS test"
267 267 # test waves are saved in ./tts
268 268 mkdir ./tts
269 269
  270 +log "kokoro-multi-lang-v1_0 test"
  271 +
  272 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  273 +tar xf kokoro-multi-lang-v1_0.tar.bz2
  274 +rm kokoro-multi-lang-v1_0.tar.bz2
  275 +
  276 +python3 ./python-api-examples/offline-tts.py \
  277 + --debug=1 \
  278 + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
  279 + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
  280 + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
  281 + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
  282 + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
  283 + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
  284 + --num-threads=2 \
  285 + --sid=18 \
  286 + --output-filename="./tts/kokoro-18-zh-en.wav" \
  287 + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
  288 +
  289 +rm -rf kokoro-multi-lang-v1_0
  290 +
270 291 log "kokoro-en-v0_19 test"
271 292
272 293 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
@@ -580,13 +601,10 @@ if [[ x$OS != x'windows-latest' ]]; then
580 601 repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
581 602 log "Start testing ${repo}"
582 603
583 - pushd $dir
584 604 curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
585 605 tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
586 606 rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz
587 - popd
588 607
589 - repo=$dir/$repo
590 608 ls -lh $repo
591 609
592 610 python3 ./python-api-examples/keyword-spotter.py
@@ -4,7 +4,6 @@ on:
4 4 push:
5 5 branches:
6 6 - export-kokoro
7 - - kokoro-1.0-2
8 7
9 8 workflow_dispatch:
10 9
@@ -76,6 +75,14 @@ jobs:
76 75 if: matrix.version == '1.0'
77 76 shell: bash
78 77 run: |
  78 + curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
  79 + tar xvf dict.tar.bz2
  80 + rm dict.tar.bz2
  81 +
  82 + curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
  83 + curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
  84 + curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
  85 +
79 86 src=scripts/kokoro/v1.0
80 87
81 88 d=kokoro-multi-lang-v1_0
@@ -87,7 +94,12 @@ jobs:
87 94 cp -v $src/tokens.txt $d/
88 95 cp -v $src/lexicon*.txt $d/
89 96 cp -v $src/README.md $d/README.md
  97 + cp -av dict $d/
  98 + cp -v ./*.fst $d/
90 99 ls -lh $d/
  100 + echo "---"
  101 + ls -lh $d/dict
  102 +
91 103 tar cjfv $d.tar.bz2 $d
92 104 rm -rf $d
93 105
@@ -180,6 +192,8 @@ jobs:
180 192 cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
181 193 cp -v ../scripts/kokoro/v1.0/README.md ./README.md
182 194 cp -v ../LICENSE ./
  195 + cp -av ../dict ./
  196 + cp -v ../*.fst $d/
183 197
184 198 git lfs track "*.onnx"
185 199 git add .
@@ -132,3 +132,4 @@ kokoro-en-v0_19
132 132 lexicon.txt
133 133 us_gold.json
134 134 us_silver.json
  135 +kokoro-multi-lang-v1_0
@@ -25,27 +25,28 @@ int32_t main() {
25 25
26 26 memset(&config, 0, sizeof(config));
27 27 config.model_config.transducer.encoder =
28 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
  28 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
29 29 "encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
30 30
31 31 config.model_config.transducer.decoder =
32 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
  32 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
33 33 "decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
34 34
35 35 config.model_config.transducer.joiner =
36 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
  36 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
37 37 "joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
38 38
39 39 config.model_config.tokens =
40 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
  40 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
  41 + "tokens.txt";
41 42
42 43 config.model_config.provider = "cpu";
43 44 config.model_config.num_threads = 1;
44 45 config.model_config.debug = 1;
45 46
46 47 config.keywords_file =
47 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
48 - "test_keywords.txt";
  48 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
  49 + "test_wavs/test_keywords.txt";
49 50
50 51 const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config);
51 52 if (!kws) {
@@ -24,27 +24,28 @@ int32_t main() {
24 24
25 25 KeywordSpotterConfig config;
26 26 config.model_config.transducer.encoder =
27 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
  27 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
28 28 "encoder-epoch-12-avg-2-chunk-16-left-64.onnx";
29 29
30 30 config.model_config.transducer.decoder =
31 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
  31 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
32 32 "decoder-epoch-12-avg-2-chunk-16-left-64.onnx";
33 33
34 34 config.model_config.transducer.joiner =
35 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/"
  35 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
36 36 "joiner-epoch-12-avg-2-chunk-16-left-64.onnx";
37 37
38 38 config.model_config.tokens =
39 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt";
  39 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
  40 + "tokens.txt";
40 41
41 42 config.model_config.provider = "cpu";
42 43 config.model_config.num_threads = 1;
43 44 config.model_config.debug = 1;
44 45
45 46 config.keywords_file =
46 - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/"
47 - "test_keywords.txt";
  47 + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/"
  48 + "test_wavs/test_keywords.txt";
48 49
49 50 KeywordSpotter kws = KeywordSpotter::Create(config);
50 51 if (!kws.Get()) {
@@ -11,7 +11,7 @@ while the model is still generating.
11 11
12 12 Usage:
13 13
14 -Example (1/6)
  14 +Example (1/7)
15 15
16 16 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
17 17 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \
23 23 --output-filename=./generated.wav \
24 24 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
25 25
26 -Example (2/6)
  26 +Example (2/7)
27 27
28 28 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
29 29 tar xvf vits-zh-aishell3.tar.bz2
@@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \
37 37 --output-filename=./liubei-21.wav \
38 38 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
39 39
40 -Example (3/6)
  40 +Example (3/7)
41 41
42 42 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
43 43 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \
53 53 --output-filename=./test-2.wav \
54 54 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
55 55
56 -Example (4/6)
  56 +Example (4/7)
57 57
58 58 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
59 59 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \
71 71 --output-filename=./test-matcha.wav \
72 72 "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
73 73
74 -Example (5/6)
  74 +Example (5/7)
75 75
76 76 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
77 77 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -88,7 +88,9 @@ python3 ./python-api-examples/offline-tts-play.py \
88 88 --num-threads=2 \
89 89 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
90 90
91 -Example (6/6)
  91 +Example (6/7)
  92 +
  93 +(This version of kokoro supports only English)
92 94
93 95 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
94 96 tar xf kokoro-en-v0_19.tar.bz2
@@ -105,6 +107,27 @@ python3 ./python-api-examples/offline-tts.py \
105 107 --output-filename="./kokoro-10.wav" \
106 108 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
107 109
  110 +Example (7/7)
  111 +
  112 +(This version of kokoro supports English, Chinese, etc.)
  113 +
  114 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  115 +tar xf kokoro-multi-lang-v1_0.tar.bz2
  116 +rm kokoro-multi-lang-v1_0.tar.bz2
  117 +
  118 +python3 ./python-api-examples/offline-tts-play.py \
  119 + --debug=1 \
  120 + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
  121 + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
  122 + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
  123 + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
  124 + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
  125 + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
  126 + --num-threads=2 \
  127 + --sid=18 \
  128 + --output-filename="./kokoro-18-zh-en.wav" \
  129 + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
  130 +
108 131 You can find more models at
109 132 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
110 133
@@ -247,6 +270,20 @@ def add_kokoro_args(parser):
247 270 help="Path to the dict directory of espeak-ng.",
248 271 )
249 272
  273 + parser.add_argument(
  274 + "--kokoro-dict-dir",
  275 + type=str,
  276 + default="",
  277 + help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
  278 + )
  279 +
  280 + parser.add_argument(
  281 + "--kokoro-lexicon",
  282 + type=str,
  283 + default="",
  284 + help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
  285 + )
  286 +
250 287
251 288 def get_args():
252 289 parser = argparse.ArgumentParser(
@@ -459,6 +496,8 @@ def main():
459 496 voices=args.kokoro_voices,
460 497 tokens=args.kokoro_tokens,
461 498 data_dir=args.kokoro_data_dir,
  499 + dict_dir=args.kokoro_dict_dir,
  500 + lexicon=args.kokoro_lexicon,
462 501 ),
463 502 provider=args.provider,
464 503 debug=args.debug,
@@ -12,7 +12,7 @@ generated audio.
12 12
13 13 Usage:
14 14
15 -Example (1/6)
  15 +Example (1/7)
16 16
17 17 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
18 18 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \
24 24 --output-filename=./generated.wav \
25 25 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
26 26
27 -Example (2/6)
  27 +Example (2/7)
28 28
29 29 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
30 30 tar xvf vits-icefall-zh-aishell3.tar.bz2
@@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \
38 38 --output-filename=./liubei-21.wav \
39 39 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
40 40
41 -Example (3/6)
  41 +Example (3/7)
42 42
43 43 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
44 44 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \
54 54 --output-filename=./test-2.wav \
55 55 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
56 56
57 -Example (4/6)
  57 +Example (4/7)
58 58
59 59 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
60 60 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \
72 72 --output-filename=./test-matcha.wav \
73 73 "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
74 74
75 -Example (5/6)
  75 +Example (5/7)
76 76
77 77 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
78 78 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -89,7 +89,9 @@ python3 ./python-api-examples/offline-tts.py \
89 89 --num-threads=2 \
90 90 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
91 91
92 -Example (6/6)
  92 +Example (6/7)
  93 +
  94 +(This version of kokoro supports only English)
93 95
94 96 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
95 97 tar xf kokoro-en-v0_19.tar.bz2
@@ -106,6 +108,27 @@ python3 ./python-api-examples/offline-tts.py \
106 108 --output-filename="./kokoro-10.wav" \
107 109 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
108 110
  111 +Example (7/7)
  112 +
  113 +(This version of kokoro supports English, Chinese, etc.)
  114 +
  115 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  116 +tar xf kokoro-multi-lang-v1_0.tar.bz2
  117 +rm kokoro-multi-lang-v1_0.tar.bz2
  118 +
  119 +python3 ./python-api-examples/offline-tts.py \
  120 + --debug=1 \
  121 + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
  122 + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
  123 + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
  124 + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
  125 + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
  126 + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
  127 + --num-threads=2 \
  128 + --sid=18 \
  129 + --output-filename="./kokoro-18-zh-en.wav" \
  130 + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
  131 +
109 132 You can find more models at
110 133 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
111 134
@@ -234,6 +257,20 @@ def add_kokoro_args(parser):
234 257 help="Path to the dict directory of espeak-ng.",
235 258 )
236 259
  260 + parser.add_argument(
  261 + "--kokoro-dict-dir",
  262 + type=str,
  263 + default="",
  264 + help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro",
  265 + )
  266 +
  267 + parser.add_argument(
  268 + "--kokoro-lexicon",
  269 + type=str,
  270 + default="",
  271 + help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro",
  272 + )
  273 +
237 274
238 275 def get_args():
239 276 parser = argparse.ArgumentParser(
@@ -342,6 +379,8 @@ def main():
342 379 voices=args.kokoro_voices,
343 380 tokens=args.kokoro_tokens,
344 381 data_dir=args.kokoro_data_dir,
  382 + dict_dir=args.kokoro_dict_dir,
  383 + lexicon=args.kokoro_lexicon,
345 384 ),
346 385 provider=args.provider,
347 386 debug=args.debug,
@@ -71,7 +71,7 @@ def main():
71 71 with open("voices.bin", "wb") as f:
72 72 for _, speaker in id2speaker.items():
73 73 m = torch.load(
74 - f"{speaker}.pt",
  74 + f"voices/{speaker}.pt",
75 75 weights_only=True,
76 76 map_location="cpu",
77 77 ).numpy()
@@ -153,6 +153,7 @@ if(SHERPA_ONNX_ENABLE_TTS)
153 153 list(APPEND sources
154 154 hifigan-vocoder.cc
155 155 jieba-lexicon.cc
  156 + kokoro-multi-lang-lexicon.cc
156 157 lexicon.cc
157 158 melo-tts-lexicon.cc
158 159 offline-tts-character-frontend.cc
  1 +// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
  6 +
  7 +#include <codecvt>
  8 +#include <fstream>
  9 +#include <locale>
  10 +#include <regex> // NOLINT
  11 +#include <sstream>
  12 +#include <strstream>
  13 +#include <unordered_map>
  14 +#include <utility>
  15 +
  16 +#if __ANDROID_API__ >= 9
  17 +#include "android/asset_manager.h"
  18 +#include "android/asset_manager_jni.h"
  19 +#endif
  20 +
  21 +#if __OHOS__
  22 +#include "rawfile/raw_file_manager.h"
  23 +#endif
  24 +
  25 +#include "cppjieba/Jieba.hpp"
  26 +#include "espeak-ng/speak_lib.h"
  27 +#include "phoneme_ids.hpp"
  28 +#include "phonemize.hpp"
  29 +#include "sherpa-onnx/csrc/file-utils.h"
  30 +#include "sherpa-onnx/csrc/onnx-utils.h"
  31 +#include "sherpa-onnx/csrc/symbol-table.h"
  32 +#include "sherpa-onnx/csrc/text-utils.h"
  33 +
  34 +namespace sherpa_onnx {
  35 +
  36 +void CallPhonemizeEspeak(const std::string &text,
  37 + piper::eSpeakPhonemeConfig &config, // NOLINT
  38 + std::vector<std::vector<piper::Phoneme>> *phonemes);
  39 +
  40 +static std::wstring ToWideString(const std::string &s) {
  41 + // see
  42 + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  43 + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  44 + return converter.from_bytes(s);
  45 +}
  46 +
  47 +static std::string ToString(const std::wstring &s) {
  48 + // see
  49 + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  50 + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  51 + return converter.to_bytes(s);
  52 +}
  53 +
  54 +class KokoroMultiLangLexicon::Impl {
  55 + public:
  56 + Impl(const std::string &tokens, const std::string &lexicon,
  57 + const std::string &dict_dir, const std::string &data_dir,
  58 + const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
  59 + : meta_data_(meta_data), debug_(debug) {
  60 + InitTokens(tokens);
  61 +
  62 + InitLexicon(lexicon);
  63 +
  64 + InitJieba(dict_dir);
  65 +
  66 + InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
  67 + }
  68 +
  69 + template <typename Manager>
  70 + Impl(Manager *mgr, const std::string &tokens, const std::string &lexicon,
  71 + const std::string &dict_dir, const std::string &data_dir,
  72 + const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
  73 + : meta_data_(meta_data), debug_(debug) {
  74 + InitTokens(mgr, tokens);
  75 +
  76 + InitLexicon(mgr, lexicon);
  77 +
  78 + // we assume you have copied dict_dir and data_dir from assets to some path
  79 + InitJieba(dict_dir);
  80 +
  81 + InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
  82 + }
  83 +
  84 + std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
  85 + std::string text = ToLowerCase(_text);
  86 + if (debug_) {
  87 + SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
  88 + }
  89 +
  90 + std::vector<std::pair<std::string, std::string>> replace_str_pairs = {
  91 + {",", ","}, {":", ","}, {"、", ","}, {";", ";"}, {":", ":"},
  92 + {"。", "."}, {"?", "?"}, {"!", "!"}, {"\\s+", " "},
  93 + };
  94 + for (const auto &p : replace_str_pairs) {
  95 + std::regex re(p.first);
  96 + text = std::regex_replace(text, re, p.second);
  97 + }
  98 +
  99 + if (debug_) {
  100 + SHERPA_ONNX_LOGE("After replacing punctuations and merging spaces:\n%s",
  101 + text.c_str());
  102 + }
  103 +
  104 + // https://en.cppreference.com/w/cpp/regex
  105 + // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
  106 + std::string expr =
  107 + "([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
  108 +
  109 + auto ws = ToWideString(text);
  110 + std::wstring wexpr = ToWideString(expr);
  111 + std::wregex we(wexpr);
  112 +
  113 + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
  114 + auto end = std::wsregex_iterator();
  115 +
  116 + std::vector<TokenIDs> ans;
  117 +
  118 + for (std::wsregex_iterator i = begin; i != end; ++i) {
  119 + std::wsmatch match = *i;
  120 + std::wstring match_str = match.str();
  121 + auto ms = ToString(match_str);
  122 + uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
  123 +
  124 + std::vector<std::vector<int32_t>> ids_vec;
  125 +
  126 + if (c < 0x80) {
  127 + if (debug_) {
  128 + SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
  129 + }
  130 + ids_vec = ConvertEnglishToTokenIDs(ms);
  131 + } else {
  132 + if (debug_) {
  133 + SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
  134 + }
  135 + ids_vec = ConvertChineseToTokenIDs(ms);
  136 + }
  137 +
  138 + for (const auto &ids : ids_vec) {
  139 + if (ids.size() > 4) {
  140 + ans.emplace_back(ids);
  141 + } else {
  142 + if (ans.empty()) {
  143 + ans.emplace_back(ids);
  144 + } else {
  145 + ans.back().tokens.back() = ids[1];
  146 + ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
  147 + ids.end());
  148 + }
  149 + }
  150 + }
  151 + }
  152 +
  153 + if (debug_) {
  154 + for (const auto &v : ans) {
  155 + std::ostringstream os;
  156 + os << "\n";
  157 + std::string sep;
  158 + for (auto i : v.tokens) {
  159 + os << sep << i;
  160 + sep = " ";
  161 + }
  162 + os << "\n";
  163 + SHERPA_ONNX_LOGE("%s", os.str().c_str());
  164 + }
  165 + }
  166 +
  167 + return ans;
  168 + }
  169 +
  170 + private:
  171 + bool IsPunctuation(const std::string &text) const {
  172 + if (text == ";" || text == ":" || text == "," || text == "." ||
  173 + text == "!" || text == "?" || text == "—" || text == "…" ||
  174 + text == "\"" || text == "(" || text == ")" || text == "“" ||
  175 + text == "”") {
  176 + return true;
  177 + }
  178 +
  179 + return false;
  180 + }
  181 +
  182 + std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
  183 + std::vector<int32_t> ans;
  184 + if (word2ids_.count(w)) {
  185 + ans = word2ids_.at(w);
  186 + return ans;
  187 + }
  188 +
  189 + std::vector<std::string> words = SplitUtf8(w);
  190 + for (const auto &word : words) {
  191 + if (word2ids_.count(word)) {
  192 + auto ids = ConvertWordToIds(word);
  193 + ans.insert(ans.end(), ids.begin(), ids.end());
  194 + } else {
  195 + SHERPA_ONNX_LOGE("Skip OOV: '%s'", word.c_str());
  196 + }
  197 + }
  198 +
  199 + return ans;
  200 + }
  201 +
  202 + std::vector<std::vector<int32_t>> ConvertChineseToTokenIDs(
  203 + const std::string &text) const {
  204 + bool is_hmm = true;
  205 +
  206 + std::vector<std::string> words;
  207 + jieba_->Cut(text, words, is_hmm);
  208 + if (debug_) {
  209 + std::ostringstream os;
  210 + os << "After jieba processing:\n";
  211 +
  212 + std::string sep;
  213 + for (const auto &w : words) {
  214 + os << sep << w;
  215 + sep = "_";
  216 + }
  217 + SHERPA_ONNX_LOGE("%s", os.str().c_str());
  218 + }
  219 +
  220 + std::vector<std::vector<int32_t>> ans;
  221 + std::vector<int32_t> this_sentence;
  222 + int32_t max_len = meta_data_.max_token_len;
  223 +
  224 + this_sentence.push_back(0);
  225 + for (const auto &w : words) {
  226 + auto ids = ConvertWordToIds(w);
  227 + if (this_sentence.size() + ids.size() > max_len - 2) {
  228 + this_sentence.push_back(0);
  229 + ans.push_back(std::move(this_sentence));
  230 +
  231 + this_sentence.push_back(0);
  232 + }
  233 +
  234 + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
  235 + }
  236 +
  237 + if (this_sentence.size() > 1) {
  238 + this_sentence.push_back(0);
  239 + ans.push_back(std::move(this_sentence));
  240 + }
  241 +
  242 + if (debug_) {
  243 + for (const auto &v : ans) {
  244 + std::ostringstream os;
  245 + os << "\n";
  246 + std::string sep;
  247 + for (auto i : v) {
  248 + os << sep << i;
  249 + sep = " ";
  250 + }
  251 + os << "\n";
  252 + SHERPA_ONNX_LOGE("%s", os.str().c_str());
  253 + }
  254 + }
  255 +
  256 + return ans;
  257 + }
  258 +
  259 + std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
  260 + const std::string &text) const {
  261 + std::vector<std::string> words = SplitUtf8(text);
  262 + if (debug_) {
  263 + std::ostringstream os;
  264 + os << "After splitting to words: ";
  265 + std::string sep;
  266 + for (const auto &w : words) {
  267 + os << sep << w;
  268 + sep = "_";
  269 + }
  270 + SHERPA_ONNX_LOGE("%s", os.str().c_str());
  271 + }
  272 +
  273 + std::vector<std::vector<int32_t>> ans;
  274 + int32_t max_len = meta_data_.max_token_len;
  275 + std::vector<int32_t> this_sentence;
  276 +
  277 + int32_t space_id = token2id_.at(" ");
  278 +
  279 + this_sentence.push_back(0);
  280 +
  281 + for (const auto &word : words) {
  282 + if (IsPunctuation(word)) {
  283 + this_sentence.push_back(token2id_.at(word));
  284 +
  285 + if (this_sentence.size() > max_len - 2) {
  286 + // this sentence is too long, split it
  287 + this_sentence.push_back(0);
  288 + ans.push_back(std::move(this_sentence));
  289 +
  290 + this_sentence.push_back(0);
  291 + continue;
  292 + }
  293 +
  294 + if (word == "." || word == "!" || word == "?" || word == ";") {
  295 + // Note: You can add more punctuations here to split the text
  296 + // into sentences. We just use four here: .!?;
  297 + this_sentence.push_back(0);
  298 + ans.push_back(std::move(this_sentence));
  299 +
  300 + this_sentence.push_back(0);
  301 + }
  302 + } else if (word2ids_.count(word)) {
  303 + const auto &ids = word2ids_.at(word);
  304 + if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
  305 + this_sentence.push_back(0);
  306 + ans.push_back(std::move(this_sentence));
  307 +
  308 + this_sentence.push_back(0);
  309 + }
  310 +
  311 + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
  312 + this_sentence.push_back(space_id);
  313 + } else {
  314 + SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
  315 +
  316 + piper::eSpeakPhonemeConfig config;
  317 +
  318 + config.voice = "en-us";
  319 +
  320 + std::vector<std::vector<piper::Phoneme>> phonemes;
  321 +
  322 + CallPhonemizeEspeak(word, config, &phonemes);
  323 + // Note phonemes[i] contains a vector of unicode codepoints;
  324 + // we need to convert them to utf8
  325 +
  326 + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
  327 +
  328 + std::vector<int32_t> ids;
  329 + for (const auto &v : phonemes) {
  330 + for (const auto p : v) {
  331 + auto token = conv.to_bytes(p);
  332 + if (token2id_.count(token)) {
  333 + ids.push_back(token2id_.at(token));
  334 + } else {
  335 + SHERPA_ONNX_LOGE("Skip OOV token '%s' from '%s'", token.c_str(),
  336 + word.c_str());
  337 + }
  338 + }
  339 + }
  340 +
  341 + if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
  342 + this_sentence.push_back(0);
  343 + ans.push_back(std::move(this_sentence));
  344 +
  345 + this_sentence.push_back(0);
  346 + }
  347 +
  348 + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
  349 + this_sentence.push_back(space_id);
  350 + }
  351 + }
  352 +
  353 + if (this_sentence.size() > 1) {
  354 + this_sentence.push_back(0);
  355 + ans.push_back(std::move(this_sentence));
  356 + }
  357 +
  358 + if (debug_) {
  359 + for (const auto &v : ans) {
  360 + std::ostringstream os;
  361 + os << "\n";
  362 + std::string sep;
  363 + for (auto i : v) {
  364 + os << sep << i;
  365 + sep = " ";
  366 + }
  367 + os << "\n";
  368 + SHERPA_ONNX_LOGE("%s", os.str().c_str());
  369 + }
  370 + }
  371 +
  372 + return ans;
  373 + }
  374 +
  375 + void InitTokens(const std::string &tokens) {
  376 + std::ifstream is(tokens);
  377 + InitTokens(is);
  378 + }
  379 +
  380 + template <typename Manager>
  381 + void InitTokens(Manager *mgr, const std::string &tokens) {
  382 + auto buf = ReadFile(mgr, tokens);
  383 +
  384 + std::istrstream is(buf.data(), buf.size());
  385 + InitTokens(is);
  386 + }
  387 +
  388 + void InitTokens(std::istream &is) {
  389 + token2id_ = ReadTokens(is); // defined in ./symbol-table.cc
  390 + }
  391 +
  392 + void InitLexicon(const std::string &lexicon) {
  393 + std::vector<std::string> files;
  394 + SplitStringToVector(lexicon, ",", false, &files);
  395 + for (const auto &f : files) {
  396 + std::ifstream is(f);
  397 + InitLexicon(is);
  398 + }
  399 + }
  400 +
  401 + template <typename Manager>
  402 + void InitLexicon(Manager *mgr, const std::string &lexicon) {
  403 + std::vector<std::string> files;
  404 + SplitStringToVector(lexicon, ",", false, &files);
  405 + for (const auto &f : files) {
  406 + auto buf = ReadFile(mgr, f);
  407 +
  408 + std::istrstream is(buf.data(), buf.size());
  409 + InitLexicon(is);
  410 + }
  411 + }
  412 +
  413 + void InitLexicon(std::istream &is) {
  414 + std::string word;
  415 + std::vector<std::string> token_list;
  416 + std::string token;
  417 +
  418 + std::string line;
  419 + int32_t line_num = 0;
  420 + int32_t num_warn = 0;
  421 + while (std::getline(is, line)) {
  422 + ++line_num;
  423 + std::istringstream iss(line);
  424 +
  425 + token_list.clear();
  426 + iss >> word;
  427 + ToLowerCase(&word);
  428 +
  429 + if (word2ids_.count(word)) {
  430 + num_warn += 1;
  431 + if (num_warn < 10) {
  432 + SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
  433 + word.c_str(), line_num, line.c_str());
  434 + }
  435 + continue;
  436 + }
  437 +
  438 + while (iss >> token) {
  439 + token_list.push_back(std::move(token));
  440 + }
  441 +
  442 + std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
  443 +
  444 + if (ids.empty()) {
  445 + SHERPA_ONNX_LOGE(
  446 + "Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
  447 + word.c_str(), line_num, line.c_str());
  448 + continue;
  449 + }
  450 +
  451 + word2ids_.insert({std::move(word), std::move(ids)});
  452 + }
  453 + }
  454 +
  455 + void InitJieba(const std::string &dict_dir) {
  456 + std::string dict = dict_dir + "/jieba.dict.utf8";
  457 + std::string hmm = dict_dir + "/hmm_model.utf8";
  458 + std::string user_dict = dict_dir + "/user.dict.utf8";
  459 + std::string idf = dict_dir + "/idf.utf8";
  460 + std::string stop_word = dict_dir + "/stop_words.utf8";
  461 +
  462 + AssertFileExists(dict);
  463 + AssertFileExists(hmm);
  464 + AssertFileExists(user_dict);
  465 + AssertFileExists(idf);
  466 + AssertFileExists(stop_word);
  467 +
  468 + jieba_ =
  469 + std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
  470 + }
  471 +
  472 + private:
  473 + OfflineTtsKokoroModelMetaData meta_data_;
  474 +
  475 + // word to token IDs
  476 + std::unordered_map<std::string, std::vector<int32_t>> word2ids_;
  477 +
  478 + // tokens.txt is saved in token2id_
  479 + std::unordered_map<std::string, int32_t> token2id_;
  480 +
  481 + std::unique_ptr<cppjieba::Jieba> jieba_;
  482 + bool debug_ = false;
  483 +};
  484 +
  485 +KokoroMultiLangLexicon::~KokoroMultiLangLexicon() = default;
  486 +
  487 +KokoroMultiLangLexicon::KokoroMultiLangLexicon(
  488 + const std::string &tokens, const std::string &lexicon,
  489 + const std::string &dict_dir, const std::string &data_dir,
  490 + const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
  491 + : impl_(std::make_unique<Impl>(tokens, lexicon, dict_dir, data_dir,
  492 + meta_data, debug)) {}
  493 +
  494 +template <typename Manager>
  495 +KokoroMultiLangLexicon::KokoroMultiLangLexicon(
  496 + Manager *mgr, const std::string &tokens, const std::string &lexicon,
  497 + const std::string &dict_dir, const std::string &data_dir,
  498 + const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
  499 + : impl_(std::make_unique<Impl>(mgr, tokens, lexicon, dict_dir, data_dir,
  500 + meta_data, debug)) {}
  501 +
  502 +std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
  503 + const std::string &text, const std::string & /*unused_voice = ""*/) const {
  504 + return impl_->ConvertTextToTokenIds(text);
  505 +}
  506 +
  507 +#if __ANDROID_API__ >= 9
  508 +template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
  509 + AAssetManager *mgr, const std::string &tokens, const std::string &lexicon,
  510 + const std::string &dict_dir, const std::string &data_dir,
  511 + const OfflineTtsKokoroModelMetaData &meta_data, bool debug);
  512 +#endif
  513 +
  514 +#if __OHOS__
  515 +template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
  516 + NativeResourceManager *mgr, const std::string &tokens,
  517 + const std::string &lexicon, const std::string &dict_dir,
  518 + const std::string &data_dir, const OfflineTtsKokoroModelMetaData &meta_data,
  519 + bool debug);
  520 +#endif
  521 +
  522 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
  6 +#define SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
  7 +
  8 +#include <memory>
  9 +#include <string>
  10 +#include <vector>
  11 +
  12 +#include "sherpa-onnx/csrc/offline-tts-frontend.h"
  13 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
  14 +
  15 +namespace sherpa_onnx {
  16 +
  17 +class KokoroMultiLangLexicon : public OfflineTtsFrontend {
  18 + public:
  19 + ~KokoroMultiLangLexicon() override;
  20 +
  21 + KokoroMultiLangLexicon(const std::string &tokens, const std::string &lexicon,
  22 + const std::string &dict_dir,
  23 + const std::string &data_dir,
  24 + const OfflineTtsKokoroModelMetaData &meta_data,
  25 + bool debug);
  26 +
  27 + template <typename Manager>
  28 + KokoroMultiLangLexicon(Manager *mgr, const std::string &tokens,
  29 + const std::string &lexicon,
  30 + const std::string &dict_dir,
  31 + const std::string &data_dir,
  32 + const OfflineTtsKokoroModelMetaData &meta_data,
  33 + bool debug);
  34 +
  35 + std::vector<TokenIDs> ConvertTextToTokenIds(
  36 + const std::string &text, const std::string &voice = "") const override;
  37 +
  38 + private:
  39 + class Impl;
  40 + std::unique_ptr<Impl> impl_;
  41 +};
  42 +
  43 +} // namespace sherpa_onnx
  44 +
  45 +#endif // SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_
@@ -6,7 +6,9 @@
6 6
7 7 #include <fstream>
8 8 #include <regex> // NOLINT
  9 +#include <sstream>
9 10 #include <strstream>
  11 +#include <unordered_map>
10 12 #include <utility>
11 13 #if __ANDROID_API__ >= 9
12 14 #include "android/asset_manager.h"
@@ -7,7 +7,6 @@
7 7
8 8 #include <memory>
9 9 #include <string>
10 -#include <unordered_map>
11 10 #include <vector>
12 11
13 12 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
@@ -19,6 +19,9 @@ struct TokenIDs {
19 19 /*implicit*/ TokenIDs(std::vector<int64_t> tokens) // NOLINT
20 20 : tokens{std::move(tokens)} {}
21 21
  22 + /*implicit*/ TokenIDs(const std::vector<int32_t> &tokens) // NOLINT
  23 + : tokens{tokens.begin(), tokens.end()} {}
  24 +
22 25 TokenIDs(std::vector<int64_t> tokens, // NOLINT
23 26 std::vector<int64_t> tones) // NOLINT
24 27 : tokens{std::move(tokens)}, tones{std::move(tones)} {}
@@ -51,6 +54,9 @@ class OfflineTtsFrontend {
51 54 const std::string &text, const std::string &voice = "") const = 0;
52 55 };
53 56
  57 +// implementation is in ./piper-phonemize-lexicon.cc
  58 +void InitEspeak(const std::string &data_dir);
  59 +
54 60 } // namespace sherpa_onnx
55 61
56 62 #endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
@@ -13,6 +13,7 @@
13 13 #include "fst/extensions/far/far.h"
14 14 #include "kaldifst/csrc/kaldi-fst-io.h"
15 15 #include "kaldifst/csrc/text-normalizer.h"
  16 +#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
16 17 #include "sherpa-onnx/csrc/lexicon.h"
17 18 #include "sherpa-onnx/csrc/macros.h"
18 19 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
@@ -314,6 +315,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
314 315 template <typename Manager>
315 316 void InitFrontend(Manager *mgr) {
316 317 const auto &meta_data = model_->GetMetaData();
  318 +
  319 + if (meta_data.version >= 2) {
  320 + // this is a multi-lingual model, we require that you pass lexicon
  321 + // and dict_dir
  322 + if (config_.model.kokoro.lexicon.empty() ||
  323 + config_.model.kokoro.dict_dir.empty()) {
  324 + SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
  325 + SHERPA_ONNX_LOGE(
  326 + "You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
  327 + "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
  328 + SHERPA_ONNX_EXIT(-1);
  329 + }
  330 +
  331 + frontend_ = std::make_unique<KokoroMultiLangLexicon>(
  332 + mgr, config_.model.kokoro.tokens, config_.model.kokoro.lexicon,
  333 + config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir,
  334 + meta_data, config_.model.debug);
  335 +
  336 + return;
  337 + }
  338 +
317 339 frontend_ = std::make_unique<PiperPhonemizeLexicon>(
318 340 mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir,
319 341 meta_data);
@@ -321,7 +343,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
321 343
322 344 void InitFrontend() {
323 345 const auto &meta_data = model_->GetMetaData();
  346 + if (meta_data.version >= 2) {
  347 + // this is a multi-lingual model, we require that you pass lexicon
  348 + // and dict_dir
  349 + if (config_.model.kokoro.lexicon.empty() ||
  350 + config_.model.kokoro.dict_dir.empty()) {
  351 + SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
  352 + SHERPA_ONNX_LOGE(
  353 + "You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
  354 + "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
  355 + SHERPA_ONNX_EXIT(-1);
  356 + }
  357 +
  358 + frontend_ = std::make_unique<KokoroMultiLangLexicon>(
  359 + config_.model.kokoro.tokens, config_.model.kokoro.lexicon,
  360 + config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir,
  361 + meta_data, config_.model.debug);
  362 +
  363 + return;
  364 + }
324 365
  366 + // this is for kokoro v0.19, which supports only English
325 367 frontend_ = std::make_unique<PiperPhonemizeLexicon>(
326 368 config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data);
327 369 }
@@ -8,6 +8,7 @@
8 8
9 9 #include "sherpa-onnx/csrc/file-utils.h"
10 10 #include "sherpa-onnx/csrc/macros.h"
  11 +#include "sherpa-onnx/csrc/text-utils.h"
11 12
12 13 namespace sherpa_onnx {
13 14
@@ -17,8 +18,16 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
17 18 "Path to voices.bin for Kokoro models");
18 19 po->Register("kokoro-tokens", &tokens,
19 20 "Path to tokens.txt for Kokoro models");
  21 + po->Register(
  22 + "kokoro-lexicon", &lexicon,
  23 + "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0"
  24 + "You can pass multiple files, separated by ','. Example: "
  25 + "./lexicon-us-en.txt,./lexicon-zh.txt");
20 26 po->Register("kokoro-data-dir", &data_dir,
21 27 "Path to the directory containing dict for espeak-ng.");
  28 + po->Register("kokoro-dict-dir", &dict_dir,
  29 + "Path to the directory containing dict for jieba. "
  30 + "Used only for Kokoro >= v1.0");
22 31 po->Register("kokoro-length-scale", &length_scale,
23 32 "Speech speed. Larger->Slower; Smaller->faster.");
24 33 }
@@ -44,6 +53,19 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
44 53 return false;
45 54 }
46 55
  56 + if (!lexicon.empty()) {
  57 + std::vector<std::string> files;
  58 + SplitStringToVector(lexicon, ",", false, &files);
  59 + for (const auto &f : files) {
  60 + if (!FileExists(f)) {
  61 + SHERPA_ONNX_LOGE(
  62 + "lexicon '%s' does not exist. Please re-check --kokoro-lexicon",
  63 + f.c_str());
  64 + return false;
  65 + }
  66 + }
  67 + }
  68 +
47 69 if (data_dir.empty()) {
48 70 SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir");
49 71 return false;
@@ -77,6 +99,21 @@ bool OfflineTtsKokoroModelConfig::Validate() const {
77 99 return false;
78 100 }
79 101
  102 + if (!dict_dir.empty()) {
  103 + std::vector<std::string> required_files = {
  104 + "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8",
  105 + "idf.utf8", "stop_words.utf8",
  106 + };
  107 +
  108 + for (const auto &f : required_files) {
  109 + if (!FileExists(dict_dir + "/" + f)) {
  110 + SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check kokoro-dict-dir",
  111 + dict_dir.c_str(), f.c_str());
  112 + return false;
  113 + }
  114 + }
  115 + }
  116 +
80 117 return true;
81 118 }
82 119
@@ -87,7 +124,9 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
87 124 os << "model=\"" << model << "\", ";
88 125 os << "voices=\"" << voices << "\", ";
89 126 os << "tokens=\"" << tokens << "\", ";
  127 + os << "lexicon=\"" << lexicon << "\", ";
90 128 os << "data_dir=\"" << data_dir << "\", ";
  129 + os << "dict_dir=\"" << dict_dir << "\", ";
91 130 os << "length_scale=" << length_scale << ")";
92 131
93 132 return os.str();
@@ -16,8 +16,14 @@ struct OfflineTtsKokoroModelConfig {
16 16 std::string voices;
17 17 std::string tokens;
18 18
  19 + // Note: You can pass multiple files, separated by ",", to lexicon
  20 + // Example: lexicon = "./lexicon-gb-en.txt,./lexicon-zh.txt";
  21 + std::string lexicon;
  22 +
19 23 std::string data_dir;
20 24
  25 + std::string dict_dir;
  26 +
21 27 // speed = 1 / length_scale
22 28 float length_scale = 1.0;
23 29
@@ -26,11 +32,15 @@ struct OfflineTtsKokoroModelConfig {
26 32 OfflineTtsKokoroModelConfig(const std::string &model,
27 33 const std::string &voices,
28 34 const std::string &tokens,
29 - const std::string &data_dir, float length_scale)
  35 + const std::string &lexicon,
  36 + const std::string &data_dir,
  37 + const std::string &dict_dir, float length_scale)
30 38 : model(model),
31 39 voices(voices),
32 40 tokens(tokens),
  41 + lexicon(lexicon),
33 42 data_dir(data_dir),
  43 + dict_dir(dict_dir),
34 44 length_scale(length_scale) {}
35 45
36 46 void Register(ParseOptions *po);
@@ -32,8 +32,7 @@
32 32
33 33 namespace sherpa_onnx {
34 34
35 -static void CallPhonemizeEspeak(
36 - const std::string &text,
  35 +void CallPhonemizeEspeak(const std::string &text,
37 36 piper::eSpeakPhonemeConfig &config, // NOLINT
38 37 std::vector<std::vector<piper::Phoneme>> *phonemes) {
39 38 static std::mutex espeak_mutex;
@@ -245,7 +244,7 @@ static std::vector<int64_t> CoquiPhonemesToIds(
245 244 return ans;
246 245 }
247 246
248 -static void InitEspeak(const std::string &data_dir) {
  247 +void InitEspeak(const std::string &data_dir) {
249 248 static std::once_flag init_flag;
250 249 std::call_once(init_flag, [data_dir]() {
251 250 int32_t result =
@@ -241,7 +241,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/,
241 241 jlong ptr, jstring text,
242 242 jint sid, jfloat speed) {
243 243 const char *p_text = env->GetStringUTFChars(text, nullptr);
244 - SHERPA_ONNX_LOGE("string is: %s", p_text);
245 244
246 245 auto audio = reinterpret_cast<sherpa_onnx::OfflineTts *>(ptr)->Generate(
247 246 p_text, sid, speed);
@@ -267,7 +266,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl(
267 266 JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid,
268 267 jfloat speed, jobject callback) {
269 268 const char *p_text = env->GetStringUTFChars(text, nullptr);
270 - SHERPA_ONNX_LOGE("string is: %s", p_text);
271 269
272 270 std::function<int32_t(const float *, int32_t, float)> callback_wrapper =
273 271 [env, callback](const float *samples, int32_t n,
@@ -16,13 +16,17 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
16 16 py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig")
17 17 .def(py::init<>())
18 18 .def(py::init<const std::string &, const std::string &,
  19 + const std::string &, const std::string &,
19 20 const std::string &, const std::string &, float>(),
20 21 py::arg("model"), py::arg("voices"), py::arg("tokens"),
21 - py::arg("data_dir"), py::arg("length_scale") = 1.0)
  22 + py::arg("lexicon") = "", py::arg("data_dir"),
  23 + py::arg("dict_dir") = "", py::arg("length_scale") = 1.0)
22 24 .def_readwrite("model", &PyClass::model)
23 25 .def_readwrite("voices", &PyClass::voices)
24 26 .def_readwrite("tokens", &PyClass::tokens)
  27 + .def_readwrite("lexicon", &PyClass::lexicon)
25 28 .def_readwrite("data_dir", &PyClass::data_dir)
  29 + .def_readwrite("dict_dir", &PyClass::dict_dir)
26 30 .def_readwrite("length_scale", &PyClass::length_scale)
27 31 .def("__str__", &PyClass::ToString)
28 32 .def("validate", &PyClass::Validate);