Fangjun Kuang

Add C++ and Python API for Kokoro TTS models. (#1715)

@@ -19,6 +19,31 @@ which $EXE
19 mkdir ./tts
20
21 log "------------------------------------------------------------"
  22 +log "kokoro-en-v0_19"
  23 +log "------------------------------------------------------------"
  24 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  25 +tar xf kokoro-en-v0_19.tar.bz2
  26 +rm kokoro-en-v0_19.tar.bz2
  27 +
  28 +# mapping of sid to voice name
  29 +# 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
  30 +# 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis
  31 +
  32 +for sid in $(seq 0 10); do
  33 + $EXE \
  34 + --debug=1 \
  35 + --kokoro-model=./kokoro-en-v0_19/model.onnx \
  36 + --kokoro-voices=./kokoro-en-v0_19/voices.bin \
  37 + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
  38 + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
  39 + --num-threads=2 \
  40 + --sid=$sid \
  41 + --output-filename="./tts/kokoro-$sid.wav" \
  42 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
  43 +done
  44 +rm -rf kokoro-en-v0_19
  45 +
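
For scripting convenience, the sid/voice mapping in the comment above can be kept as a small lookup table; a minimal Python sketch that simply transcribes that comment so a script can resolve the value to pass to --sid:

# Minimal sketch: transcribes the sid -> voice comment above.
KOKORO_EN_V0_19_SIDS = {
    "af": 0, "af_bella": 1, "af_nicole": 2, "af_sarah": 3, "af_sky": 4,
    "am_adam": 5, "am_michael": 6, "bf_emma": 7, "bf_isabella": 8,
    "bm_george": 9, "bm_lewis": 10,
}

print(KOKORO_EN_V0_19_SIDS["bf_emma"])  # 7 -> pass as --sid=7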
  46 +log "------------------------------------------------------------"
22 log "matcha-icefall-en_US-ljspeech"
23 log "------------------------------------------------------------"
24 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
@@ -267,6 +267,25 @@ log "Offline TTS test"
267 # test waves are saved in ./tts
268 mkdir ./tts
269
  270 +log "kokoro-en-v0_19 test"
  271 +
  272 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  273 +tar xf kokoro-en-v0_19.tar.bz2
  274 +rm kokoro-en-v0_19.tar.bz2
  275 +
  276 +python3 ./python-api-examples/offline-tts.py \
  277 + --debug=1 \
  278 + --kokoro-model=./kokoro-en-v0_19/model.onnx \
  279 + --kokoro-voices=./kokoro-en-v0_19/voices.bin \
  280 + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
  281 + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
  282 + --num-threads=2 \
  283 + --sid=10 \
  284 + --output-filename="./tts/kokoro-10.wav" \
  285 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
  286 +
  287 +rm -rf kokoro-en-v0_19
  288 +
270 log "matcha-ljspeech-en test"
271
272 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
@@ -11,7 +11,7 @@ while the model is still generating.
11
12 Usage:
13
14 -Example (1/5)
14 +Example (1/6)
15
16 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
17 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \
23 --output-filename=./generated.wav \
24 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
25
26 -Example (2/5)
26 +Example (2/6)
27
28 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
29 tar xvf vits-zh-aishell3.tar.bz2
@@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \
37 --output-filename=./liubei-21.wav \
38 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
39
40 -Example (3/5)
40 +Example (3/6)
41
42 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
43 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \
53 --output-filename=./test-2.wav \
54 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
55
56 -Example (4/5)
56 +Example (4/6)
57
58 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
59 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \
71 --output-filename=./test-matcha.wav \
72 "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
73
74 -Example (5/5)
74 +Example (5/6)
75
76 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
77 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -88,6 +88,22 @@ python3 ./python-api-examples/offline-tts-play.py \
88 --num-threads=2 \
89 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
90
  91 +Example (6/6)
  92 +
  93 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  94 +tar xf kokoro-en-v0_19.tar.bz2
  95 +rm kokoro-en-v0_19.tar.bz2
  96 +
  97 +python3 ./python-api-examples/offline-tts.py \
  98 + --debug=1 \
  99 + --kokoro-model=./kokoro-en-v0_19/model.onnx \
  100 + --kokoro-voices=./kokoro-en-v0_19/voices.bin \
  101 + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
  102 + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
  103 + --num-threads=2 \
  104 + --sid=10 \
  105 + --output-filename="./kokoro-10.wav" \
  106 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
91
92 You can find more models at
93 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
@@ -202,6 +218,36 @@ def add_matcha_args(parser):
202 )
203
204
  221 +def add_kokoro_args(parser):
  222 + parser.add_argument(
  223 + "--kokoro-model",
  224 + type=str,
  225 + default="",
  226 + help="Path to model.onnx for kokoro",
  227 + )
  228 +
  229 + parser.add_argument(
  230 + "--kokoro-voices",
  231 + type=str,
  232 + default="",
  233 + help="Path to voices.bin for kokoro",
  234 + )
  235 +
  236 + parser.add_argument(
  237 + "--kokoro-tokens",
  238 + type=str,
  239 + default="",
  240 + help="Path to tokens.txt for kokoro",
  241 + )
  242 +
  243 + parser.add_argument(
  244 + "--kokoro-data-dir",
  245 + type=str,
  246 + default="",
  247 + help="Path to the dict directory of espeak-ng.",
  248 + )
  249 +
  250 +
205 def get_args():
206 parser = argparse.ArgumentParser(
207 formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -209,6 +255,7 @@ def get_args():
209
210 add_vits_args(parser)
211 add_matcha_args(parser)
  258 + add_kokoro_args(parser)
212
213 parser.add_argument(
214 "--tts-rule-fsts",
@@ -407,6 +454,12 @@ def main():
407 data_dir=args.matcha_data_dir,
408 dict_dir=args.matcha_dict_dir,
409 ),
  457 + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
  458 + model=args.kokoro_model,
  459 + voices=args.kokoro_voices,
  460 + tokens=args.kokoro_tokens,
  461 + data_dir=args.kokoro_data_dir,
  462 + ),
410 provider=args.provider,
411 debug=args.debug,
412 num_threads=args.num_threads,
@@ -12,7 +12,7 @@ generated audio.
12
13 Usage:
14
15 -Example (1/5)
15 +Example (1/6)
16
17 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
18 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \
24 --output-filename=./generated.wav \
25 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
26
27 -Example (2/5)
27 +Example (2/6)
28
29 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
30 tar xvf vits-icefall-zh-aishell3.tar.bz2
@@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \
38 --output-filename=./liubei-21.wav \
39 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
40
41 -Example (3/5)
41 +Example (3/6)
42
43 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
44 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \
54 --output-filename=./test-2.wav \
55 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
56
57 -Example (4/5)
57 +Example (4/6)
58
59 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
60 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \
72 --output-filename=./test-matcha.wav \
73 "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
74
75 -Example (5/5)
75 +Example (5/6)
76
77 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
78 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
@@ -89,6 +89,23 @@ python3 ./python-api-examples/offline-tts.py \
89 --num-threads=2 \
90 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
91
  92 +Example (6/6)
  93 +
  94 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  95 +tar xf kokoro-en-v0_19.tar.bz2
  96 +rm kokoro-en-v0_19.tar.bz2
  97 +
  98 +python3 ./python-api-examples/offline-tts.py \
  99 + --debug=1 \
  100 + --kokoro-model=./kokoro-en-v0_19/model.onnx \
  101 + --kokoro-voices=./kokoro-en-v0_19/voices.bin \
  102 + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
  103 + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
  104 + --num-threads=2 \
  105 + --sid=10 \
  106 + --output-filename="./kokoro-10.wav" \
  107 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
  108 +
92 You can find more models at
93 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
94
@@ -188,6 +205,36 @@ def add_matcha_args(parser):
188 )
189
190
  208 +def add_kokoro_args(parser):
  209 + parser.add_argument(
  210 + "--kokoro-model",
  211 + type=str,
  212 + default="",
  213 + help="Path to model.onnx for kokoro",
  214 + )
  215 +
  216 + parser.add_argument(
  217 + "--kokoro-voices",
  218 + type=str,
  219 + default="",
  220 + help="Path to voices.bin for kokoro",
  221 + )
  222 +
  223 + parser.add_argument(
  224 + "--kokoro-tokens",
  225 + type=str,
  226 + default="",
  227 + help="Path to tokens.txt for kokoro",
  228 + )
  229 +
  230 + parser.add_argument(
  231 + "--kokoro-data-dir",
  232 + type=str,
  233 + default="",
  234 + help="Path to the dict directory of espeak-ng.",
  235 + )
  236 +
  237 +
191 def get_args():
192 parser = argparse.ArgumentParser(
193 formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -195,6 +242,7 @@ def get_args():
195
196 add_vits_args(parser)
197 add_matcha_args(parser)
  245 + add_kokoro_args(parser)
198
199 parser.add_argument(
200 "--tts-rule-fsts",
@@ -206,7 +254,7 @@ def get_args():
206 parser.add_argument(
207 "--max-num-sentences",
208 type=int,
209 - default=2,
257 + default=1,
210 help="""Max number of sentences in a batch to avoid OOM if the input
211 text is very long. Set it to -1 to process all the sentences in a
212 single batch. A smaller value does not mean it is slower compared
@@ -289,6 +337,12 @@ def main():
289 data_dir=args.matcha_data_dir,
290 dict_dir=args.matcha_dict_dir,
291 ),
  340 + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
  341 + model=args.kokoro_model,
  342 + voices=args.kokoro_voices,
  343 + tokens=args.kokoro_tokens,
  344 + data_dir=args.kokoro_data_dir,
  345 + ),
292 provider=args.provider,
293 debug=args.debug,
294 num_threads=args.num_threads,
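
offline-tts.py above builds the full configuration from command-line flags; the new Python API can also be called directly. A minimal sketch follows, assuming the bindings provide defaults for the omitted vits/matcha sub-configs (offline-tts.py itself relies on keyword defaults), that soundfile is installed, and with the extracted kokoro-en-v0_19 directory used as placeholder paths:

import sherpa_onnx
import soundfile as sf

# Minimal sketch: only the Kokoro sub-config is filled in, so the offline
# TTS dispatcher falls through vits/matcha (their model paths stay empty)
# and selects the Kokoro implementation added in this patch.
config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
            model="./kokoro-en-v0_19/model.onnx",
            voices="./kokoro-en-v0_19/voices.bin",
            tokens="./kokoro-en-v0_19/tokens.txt",
            data_dir="./kokoro-en-v0_19/espeak-ng-data",
        ),
        num_threads=2,
        debug=True,
    ),
)

tts = sherpa_onnx.OfflineTts(config)
audio = tts.generate(
    "Today as always, men fall into two groups: slaves and free men.",
    sid=7,
    speed=1.0,
)
sf.write("./kokoro-7.wav", audio.samples, samplerate=audio.sample_rate,
         subtype="PCM_16")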
@@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS)
158 offline-tts-character-frontend.cc
159 offline-tts-frontend.cc
160 offline-tts-impl.cc
  161 + offline-tts-kokoro-model-config.cc
  162 + offline-tts-kokoro-model.cc
161 offline-tts-matcha-model-config.cc
162 offline-tts-matcha-model.cc
163 offline-tts-model-config.cc
@@ -11,7 +11,7 @@
11 #include <vector>
12
13 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
14 -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
14 +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
15
16 namespace sherpa_onnx {
17
@@ -10,7 +10,7 @@
10 #include <vector>
11
12 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
13 -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
13 +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
14
15 namespace sherpa_onnx {
16
@@ -16,6 +16,7 @@
16 #include "rawfile/raw_file_manager.h"
17 #endif
18
  19 +#include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h"
19 #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
20 #include "sherpa-onnx/csrc/offline-tts-vits-impl.h"
21
@@ -37,8 +38,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
37 const OfflineTtsConfig &config) {
38 if (!config.model.vits.model.empty()) {
39 return std::make_unique<OfflineTtsVitsImpl>(config);
  41 + } else if (!config.model.matcha.acoustic_model.empty()) {
  42 + return std::make_unique<OfflineTtsMatchaImpl>(config);
40 }
41 - return std::make_unique<OfflineTtsMatchaImpl>(config);
44 +
  45 + return std::make_unique<OfflineTtsKokoroImpl>(config);
42 }
43
44 template <typename Manager>
@@ -46,9 +50,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
46 Manager *mgr, const OfflineTtsConfig &config) {
47 if (!config.model.vits.model.empty()) {
48 return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
  53 + } else if (!config.model.matcha.acoustic_model.empty()) {
  54 + return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
49 }
50
51 - return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
57 + return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
52 }
53
54 #if __ANDROID_API__ >= 9
  1 +// sherpa-onnx/csrc/offline-tts-kokoro-impl.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_
  5 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_
  6 +
  7 +#include <memory>
  8 +#include <string>
  9 +#include <strstream>
  10 +#include <utility>
  11 +#include <vector>
  12 +
  13 +#include "fst/extensions/far/far.h"
  14 +#include "kaldifst/csrc/kaldi-fst-io.h"
  15 +#include "kaldifst/csrc/text-normalizer.h"
  16 +#include "sherpa-onnx/csrc/lexicon.h"
  17 +#include "sherpa-onnx/csrc/macros.h"
  18 +#include "sherpa-onnx/csrc/offline-tts-frontend.h"
  19 +#include "sherpa-onnx/csrc/offline-tts-impl.h"
  20 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h"
  21 +#include "sherpa-onnx/csrc/onnx-utils.h"
  22 +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
  23 +#include "sherpa-onnx/csrc/text-utils.h"
  24 +
  25 +namespace sherpa_onnx {
  26 +
  27 +class OfflineTtsKokoroImpl : public OfflineTtsImpl {
  28 + public:
  29 + explicit OfflineTtsKokoroImpl(const OfflineTtsConfig &config)
  30 + : config_(config),
  31 + model_(std::make_unique<OfflineTtsKokoroModel>(config.model)) {
  32 + InitFrontend();
  33 +
  34 + if (!config.rule_fsts.empty()) {
  35 + std::vector<std::string> files;
  36 + SplitStringToVector(config.rule_fsts, ",", false, &files);
  37 + tn_list_.reserve(files.size());
  38 + for (const auto &f : files) {
  39 + if (config.model.debug) {
  40 +#if __OHOS__
  41 + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
  42 +#else
  43 + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
  44 +#endif
  45 + }
  46 + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
  47 + }
  48 + }
  49 +
  50 + if (!config.rule_fars.empty()) {
  51 + if (config.model.debug) {
  52 + SHERPA_ONNX_LOGE("Loading FST archives");
  53 + }
  54 + std::vector<std::string> files;
  55 + SplitStringToVector(config.rule_fars, ",", false, &files);
  56 +
  57 + tn_list_.reserve(files.size() + tn_list_.size());
  58 +
  59 + for (const auto &f : files) {
  60 + if (config.model.debug) {
  61 +#if __OHOS__
  62 + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
  63 +#else
  64 + SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
  65 +#endif
  66 + }
  67 + std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
  68 + fst::FarReader<fst::StdArc>::Open(f));
  69 + for (; !reader->Done(); reader->Next()) {
  70 + std::unique_ptr<fst::StdConstFst> r(
  71 + fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
  72 +
  73 + tn_list_.push_back(
  74 + std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
  75 + }
  76 + }
  77 +
  78 + if (config.model.debug) {
  79 + SHERPA_ONNX_LOGE("FST archives loaded!");
  80 + }
  81 + }
  82 + }
  83 +
  84 + template <typename Manager>
  85 + OfflineTtsKokoroImpl(Manager *mgr, const OfflineTtsConfig &config)
  86 + : config_(config),
  87 + model_(std::make_unique<OfflineTtsKokoroModel>(mgr, config.model)) {
  88 + InitFrontend(mgr);
  89 +
  90 + if (!config.rule_fsts.empty()) {
  91 + std::vector<std::string> files;
  92 + SplitStringToVector(config.rule_fsts, ",", false, &files);
  93 + tn_list_.reserve(files.size());
  94 + for (const auto &f : files) {
  95 + if (config.model.debug) {
  96 +#if __OHOS__
  97 + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
  98 +#else
  99 + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
  100 +#endif
  101 + }
  102 + auto buf = ReadFile(mgr, f);
  103 + std::istrstream is(buf.data(), buf.size());
  104 + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is));
  105 + }
  106 + }
  107 +
  108 + if (!config.rule_fars.empty()) {
  109 + std::vector<std::string> files;
  110 + SplitStringToVector(config.rule_fars, ",", false, &files);
  111 + tn_list_.reserve(files.size() + tn_list_.size());
  112 +
  113 + for (const auto &f : files) {
  114 + if (config.model.debug) {
  115 +#if __OHOS__
  116 + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
  117 +#else
  118 + SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
  119 +#endif
  120 + }
  121 +
  122 + auto buf = ReadFile(mgr, f);
  123 +
  124 + std::unique_ptr<std::istream> s(
  125 + new std::istrstream(buf.data(), buf.size()));
  126 +
  127 + std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
  128 + fst::FarReader<fst::StdArc>::Open(std::move(s)));
  129 +
  130 + for (; !reader->Done(); reader->Next()) {
  131 + std::unique_ptr<fst::StdConstFst> r(
  132 + fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
  133 +
  134 + tn_list_.push_back(
  135 + std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
  136 + } // for (; !reader->Done(); reader->Next())
  137 + } // for (const auto &f : files)
  138 + } // if (!config.rule_fars.empty())
  139 + }
  140 +
  141 + int32_t SampleRate() const override {
  142 + return model_->GetMetaData().sample_rate;
  143 + }
  144 +
  145 + int32_t NumSpeakers() const override {
  146 + return model_->GetMetaData().num_speakers;
  147 + }
  148 +
  149 + GeneratedAudio Generate(
  150 + const std::string &_text, int64_t sid = 0, float speed = 1.0,
  151 + GeneratedAudioCallback callback = nullptr) const override {
  152 + const auto &meta_data = model_->GetMetaData();
  153 + int32_t num_speakers = meta_data.num_speakers;
  154 +
  155 + if (num_speakers == 0 && sid != 0) {
  156 +#if __OHOS__
  157 + SHERPA_ONNX_LOGE(
  158 + "This is a single-speaker model and supports only sid 0. Given sid: "
  159 + "%{public}d. sid is ignored",
  160 + static_cast<int32_t>(sid));
  161 +#else
  162 + SHERPA_ONNX_LOGE(
  163 + "This is a single-speaker model and supports only sid 0. Given sid: "
  164 + "%d. sid is ignored",
  165 + static_cast<int32_t>(sid));
  166 +#endif
  167 + }
  168 +
  169 + if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) {
  170 +#if __OHOS__
  171 + SHERPA_ONNX_LOGE(
  172 + "This model contains only %{public}d speakers. sid should be in the "
  173 + "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0",
  174 + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
  175 +#else
  176 + SHERPA_ONNX_LOGE(
  177 + "This model contains only %d speakers. sid should be in the range "
  178 + "[%d, %d]. Given: %d. Use sid=0",
  179 + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
  180 +#endif
  181 + sid = 0;
  182 + }
  183 +
  184 + std::string text = _text;
  185 + if (config_.model.debug) {
  186 +#if __OHOS__
  187 + SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str());
  188 +#else
  189 + SHERPA_ONNX_LOGE("Raw text: %s", text.c_str());
  190 +#endif
  191 + }
  192 +
  193 + if (!tn_list_.empty()) {
  194 + for (const auto &tn : tn_list_) {
  195 + text = tn->Normalize(text);
  196 + if (config_.model.debug) {
  197 +#if __OHOS__
  198 + SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str());
  199 +#else
  200 + SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str());
  201 +#endif
  202 + }
  203 + }
  204 + }
  205 +
  206 + std::vector<TokenIDs> token_ids =
  207 + frontend_->ConvertTextToTokenIds(text, "en-us");
  208 +
  209 + if (token_ids.empty() ||
  210 + (token_ids.size() == 1 && token_ids[0].tokens.empty())) {
  211 +#if __OHOS__
  212 + SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs",
  213 + text.c_str());
  214 +#else
  215 + SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str());
  216 +#endif
  217 + return {};
  218 + }
  219 +
  220 + std::vector<std::vector<int64_t>> x;
  221 +
  222 + x.reserve(token_ids.size());
  223 +
  224 + for (auto &i : token_ids) {
  225 + x.push_back(std::move(i.tokens));
  226 + }
  227 +
  228 + int32_t x_size = static_cast<int32_t>(x.size());
  229 +
  230 + if (config_.max_num_sentences != 1) {
  231 +#if __OHOS__
  232 + SHERPA_ONNX_LOGE(
  233 + "max_num_sentences (%{public}d) != 1 is ignored for Kokoro TTS "
  234 + "models",
  235 + config_.max_num_sentences);
  236 +#else
  237 + SHERPA_ONNX_LOGE(
  238 + "max_num_sentences (%d) != 1 is ignored for Kokoro TTS models",
  239 + config_.max_num_sentences);
  240 +#endif
  241 + }
  242 +
  243 + // If the input text is long, its sentences are synthesized one at a time
  244 + // to avoid OOM; for Kokoro the batch size is fixed to 1 (see batch_size below).
  245 + std::vector<std::vector<int64_t>> batch_x;
  246 +
  247 + int32_t batch_size = 1;
  248 + batch_x.reserve(config_.max_num_sentences);
  249 + int32_t num_batches = x_size / batch_size;
  250 +
  251 + if (config_.model.debug) {
  252 +#if __OHOS__
  253 + SHERPA_ONNX_LOGE(
  254 + "Split it into %{public}d batches. batch size: "
  255 + "%{public}d. Number of sentences: %{public}d",
  256 + num_batches, batch_size, x_size);
  257 +#else
  258 + SHERPA_ONNX_LOGE(
  259 + "Split it into %d batches. batch size: %d. Number "
  260 + "of sentences: %d",
  261 + num_batches, batch_size, x_size);
  262 +#endif
  263 + }
  264 +
  265 + GeneratedAudio ans;
  266 +
  267 + int32_t should_continue = 1;
  268 +
  269 + int32_t k = 0;
  270 +
  271 + for (int32_t b = 0; b != num_batches && should_continue; ++b) {
  272 + batch_x.clear();
  273 + for (int32_t i = 0; i != batch_size; ++i, ++k) {
  274 + batch_x.push_back(std::move(x[k]));
  275 + }
  276 +
  277 + auto audio = Process(batch_x, sid, speed);
  278 + ans.sample_rate = audio.sample_rate;
  279 + ans.samples.insert(ans.samples.end(), audio.samples.begin(),
  280 + audio.samples.end());
  281 + if (callback) {
  282 + should_continue = callback(audio.samples.data(), audio.samples.size(),
  283 + (b + 1) * 1.0 / num_batches);
  284 + // Caution(fangjun): audio is freed when the callback returns, so users
  285 + // should copy the data if they want to access the data after
  286 + // the callback returns to avoid segmentation fault.
  287 + }
  288 + }
  289 +
  290 + batch_x.clear();
  291 + while (k < static_cast<int32_t>(x.size()) && should_continue) {
  292 + batch_x.push_back(std::move(x[k]));
  293 +
  294 + ++k;
  295 + }
  296 +
  297 + if (!batch_x.empty()) {
  298 + auto audio = Process(batch_x, sid, speed);
  299 + ans.sample_rate = audio.sample_rate;
  300 + ans.samples.insert(ans.samples.end(), audio.samples.begin(),
  301 + audio.samples.end());
  302 + if (callback) {
  303 + callback(audio.samples.data(), audio.samples.size(), 1.0);
  304 + // Caution(fangjun): audio is freed when the callback returns, so users
  305 + // should copy the data if they want to access the data after
  306 + // the callback returns to avoid segmentation fault.
  307 + }
  308 + }
  309 +
  310 + return ans;
  311 + }
  312 +
  313 + private:
  314 + template <typename Manager>
  315 + void InitFrontend(Manager *mgr) {
  316 + const auto &meta_data = model_->GetMetaData();
  317 + frontend_ = std::make_unique<PiperPhonemizeLexicon>(
  318 + mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir,
  319 + meta_data);
  320 + }
  321 +
  322 + void InitFrontend() {
  323 + const auto &meta_data = model_->GetMetaData();
  324 +
  325 + frontend_ = std::make_unique<PiperPhonemizeLexicon>(
  326 + config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data);
  327 + }
  328 +
  329 + GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
  330 + int32_t sid, float speed) const {
  331 + int32_t num_tokens = 0;
  332 + for (const auto &k : tokens) {
  333 + num_tokens += k.size();
  334 + }
  335 +
  336 + std::vector<int64_t> x;
  337 + x.reserve(num_tokens);
  338 + for (const auto &k : tokens) {
  339 + x.insert(x.end(), k.begin(), k.end());
  340 + }
  341 +
  342 + auto memory_info =
  343 + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  344 +
  345 + std::array<int64_t, 2> x_shape = {1, static_cast<int32_t>(x.size())};
  346 + Ort::Value x_tensor = Ort::Value::CreateTensor(
  347 + memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
  348 +
  349 + Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed);
  350 +
  351 + std::vector<int64_t> audio_shape =
  352 + audio.GetTensorTypeAndShapeInfo().GetShape();
  353 +
  354 + int64_t total = 1;
  355 + // The output shape may be (1, 1, total) or (1, total) or (total,)
  356 + for (auto i : audio_shape) {
  357 + total *= i;
  358 + }
  359 +
  360 + const float *p = audio.GetTensorData<float>();
  361 +
  362 + GeneratedAudio ans;
  363 + ans.sample_rate = model_->GetMetaData().sample_rate;
  364 + ans.samples = std::vector<float>(p, p + total);
  365 + return ans;
  366 + }
  367 +
  368 + private:
  369 + OfflineTtsConfig config_;
  370 + std::unique_ptr<OfflineTtsKokoroModel> model_;
  371 + std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_;
  372 + std::unique_ptr<OfflineTtsFrontend> frontend_;
  373 +};
  374 +
  375 +} // namespace sherpa_onnx
  376 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_
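
Because Generate() above synthesizes one sentence per batch and invokes the callback after each one, playback can start before the whole text is finished. A rough Python sketch, assuming the same callback signature that offline-tts-play.py already uses and that the bindings default the omitted config fields:

import numpy as np
import sherpa_onnx

config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
            model="./kokoro-en-v0_19/model.onnx",
            voices="./kokoro-en-v0_19/voices.bin",
            tokens="./kokoro-en-v0_19/tokens.txt",
            data_dir="./kokoro-en-v0_19/espeak-ng-data",
        ),
    ),
)
tts = sherpa_onnx.OfflineTts(config)

def on_sentence(samples: np.ndarray, progress: float) -> int:
    # Called once per synthesized sentence. Copy `samples` if it is needed
    # after returning (see the Caution notes in Generate() above).
    print(f"got {len(samples)} samples, progress {progress:.0%}")
    return 1  # return 0 to stop generation early

audio = tts.generate(
    "Today as always, men fall into two groups: slaves and free men.",
    sid=0,
    speed=1.0,
    callback=on_sentence,
)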
  1 +// sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
  6 +
  7 +#include <vector>
  8 +
  9 +#include "sherpa-onnx/csrc/file-utils.h"
  10 +#include "sherpa-onnx/csrc/macros.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
  14 +void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
  15 + po->Register("kokoro-model", &model, "Path to Kokoro model");
  16 + po->Register("kokoro-voices", &voices,
  17 + "Path to voices.bin for Kokoro models");
  18 + po->Register("kokoro-tokens", &tokens,
  19 + "Path to tokens.txt for Kokoro models");
  20 + po->Register("kokoro-data-dir", &data_dir,
  21 + "Path to the directory containing dict for espeak-ng.");
  22 + po->Register("kokoro-length-scale", &length_scale,
  23 + "Speech speed. Larger->Slower; Smaller->faster.");
  24 +}
  25 +
  26 +bool OfflineTtsKokoroModelConfig::Validate() const {
  27 + if (model.empty()) {
  28 + SHERPA_ONNX_LOGE("Please provide --kokoro-model");
  29 + return false;
  30 + }
  31 +
  32 + if (!FileExists(model)) {
  33 + SHERPA_ONNX_LOGE("--kokoro-model: '%s' does not exist", model.c_str());
  34 + return false;
  35 + }
  36 +
  37 + if (tokens.empty()) {
  38 + SHERPA_ONNX_LOGE("Please provide --kokoro-tokens");
  39 + return false;
  40 + }
  41 +
  42 + if (!FileExists(tokens)) {
  43 + SHERPA_ONNX_LOGE("--kokoro-tokens: '%s' does not exist", tokens.c_str());
  44 + return false;
  45 + }
  46 +
  47 + if (data_dir.empty()) {
  48 + SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir");
  49 + return false;
  50 + }
  51 +
  52 + if (!FileExists(data_dir + "/phontab")) {
  53 + SHERPA_ONNX_LOGE(
  54 + "'%s/phontab' does not exist. Please check --kokoro-data-dir",
  55 + data_dir.c_str());
  56 + return false;
  57 + }
  58 +
  59 + if (!FileExists(data_dir + "/phonindex")) {
  60 + SHERPA_ONNX_LOGE(
  61 + "'%s/phonindex' does not exist. Please check --kokoro-data-dir",
  62 + data_dir.c_str());
  63 + return false;
  64 + }
  65 +
  66 + if (!FileExists(data_dir + "/phondata")) {
  67 + SHERPA_ONNX_LOGE(
  68 + "'%s/phondata' does not exist. Please check --kokoro-data-dir",
  69 + data_dir.c_str());
  70 + return false;
  71 + }
  72 +
  73 + if (!FileExists(data_dir + "/intonations")) {
  74 + SHERPA_ONNX_LOGE(
  75 + "'%s/intonations' does not exist. Please check --kokoro-data-dir",
  76 + data_dir.c_str());
  77 + return false;
  78 + }
  79 +
  80 + return true;
  81 +}
  82 +
  83 +std::string OfflineTtsKokoroModelConfig::ToString() const {
  84 + std::ostringstream os;
  85 +
  86 + os << "OfflineTtsKokoroModelConfig(";
  87 + os << "model=\"" << model << "\", ";
  88 + os << "voices=\"" << voices << "\", ";
  89 + os << "tokens=\"" << tokens << "\", ";
  90 + os << "data_dir=\"" << data_dir << "\", ";
  91 + os << "length_scale=" << length_scale << ")";
  92 +
  93 + return os.str();
  94 +}
  95 +
  96 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
  7 +
  8 +#include <string>
  9 +
  10 +#include "sherpa-onnx/csrc/parse-options.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
  14 +struct OfflineTtsKokoroModelConfig {
  15 + std::string model;
  16 + std::string voices;
  17 + std::string tokens;
  18 +
  19 + std::string data_dir;
  20 +
  21 + // speed = 1 / length_scale
  22 + float length_scale = 1.0;
  23 +
  24 + OfflineTtsKokoroModelConfig() = default;
  25 +
  26 + OfflineTtsKokoroModelConfig(const std::string &model,
  27 + const std::string &voices,
  28 + const std::string &tokens,
  29 + const std::string &data_dir, float length_scale)
  30 + : model(model),
  31 + voices(voices),
  32 + tokens(tokens),
  33 + data_dir(data_dir),
  34 + length_scale(length_scale) {}
  35 +
  36 + void Register(ParseOptions *po);
  37 + bool Validate() const;
  38 +
  39 + std::string ToString() const;
  40 +};
  41 +
  42 +} // namespace sherpa_onnx
  43 +
  44 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
  1 +// sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_
  7 +
  8 +#include <cstdint>
  9 +#include <string>
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +// please refer to
  14 +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/add-meta-data.py
  15 +struct OfflineTtsKokoroModelMetaData {
  16 + int32_t sample_rate = 0;
  17 + int32_t num_speakers = 0;
  18 + int32_t version = 1;
  19 + int32_t has_espeak = 1;
  20 + int32_t max_token_len = 0;
  21 +};
  22 +
  23 +} // namespace sherpa_onnx
  24 +
  25 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_
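
The fields of this struct are filled from the custom metadata of model.onnx (see the SHERPA_ONNX_READ_META_DATA calls in offline-tts-kokoro-model.cc below). A quick way to inspect those keys, assuming onnxruntime is installed:

import onnxruntime as ort

sess = ort.InferenceSession("./kokoro-en-v0_19/model.onnx")
meta = sess.get_modelmeta().custom_metadata_map

# Keys consumed by OfflineTtsKokoroModel::Impl::Init(): sample_rate,
# n_speakers, has_espeak, style_dim, speaker_names, and an optional version.
for key in ("sample_rate", "n_speakers", "has_espeak", "version", "style_dim"):
    print(key, "=", meta.get(key))
print("speaker_names =", meta.get("speaker_names"))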
  1 +// sherpa-onnx/csrc/offline-tts-kokoro-model.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h"
  6 +
  7 +#include <algorithm>
  8 +#include <string>
  9 +#include <utility>
  10 +#include <vector>
  11 +
  12 +#if __ANDROID_API__ >= 9
  13 +#include "android/asset_manager.h"
  14 +#include "android/asset_manager_jni.h"
  15 +#endif
  16 +
  17 +#if __OHOS__
  18 +#include "rawfile/raw_file_manager.h"
  19 +#endif
  20 +
  21 +#include "sherpa-onnx/csrc/macros.h"
  22 +#include "sherpa-onnx/csrc/onnx-utils.h"
  23 +#include "sherpa-onnx/csrc/session.h"
  24 +#include "sherpa-onnx/csrc/text-utils.h"
  25 +
  26 +namespace sherpa_onnx {
  27 +
  28 +class OfflineTtsKokoroModel::Impl {
  29 + public:
  30 + explicit Impl(const OfflineTtsModelConfig &config)
  31 + : config_(config),
  32 + env_(ORT_LOGGING_LEVEL_ERROR),
  33 + sess_opts_(GetSessionOptions(config)),
  34 + allocator_{} {
  35 + auto model_buf = ReadFile(config.kokoro.model);
  36 + auto voices_buf = ReadFile(config.kokoro.voices);
  37 + Init(model_buf.data(), model_buf.size(), voices_buf.data(),
  38 + voices_buf.size());
  39 + }
  40 +
  41 + template <typename Manager>
  42 + Impl(Manager *mgr, const OfflineTtsModelConfig &config)
  43 + : config_(config),
  44 + env_(ORT_LOGGING_LEVEL_ERROR),
  45 + sess_opts_(GetSessionOptions(config)),
  46 + allocator_{} {
  47 + auto model_buf = ReadFile(mgr, config.kokoro.model);
  48 + auto voices_buf = ReadFile(mgr, config.kokoro.voices);
  49 + Init(model_buf.data(), model_buf.size(), voices_buf.data(),
  50 + voices_buf.size());
  51 + }
  52 +
  53 + const OfflineTtsKokoroModelMetaData &GetMetaData() const {
  54 + return meta_data_;
  55 + }
  56 +
  57 + Ort::Value Run(Ort::Value x, int32_t sid, float speed) {
  58 + auto memory_info =
  59 + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  60 +
  61 + std::vector<int64_t> x_shape = x.GetTensorTypeAndShapeInfo().GetShape();
  62 + if (x_shape[0] != 1) {
  63 + SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d",
  64 + static_cast<int32_t>(x_shape[0]));
  65 + exit(-1);
  66 + }
  67 +
  68 + // there is a 0 at the front and end of x
  69 + int32_t len = static_cast<int32_t>(x_shape[1]) - 2;
  70 + int32_t num_speakers = meta_data_.num_speakers;
  71 + int32_t dim0 = style_dim_[0];
  72 + int32_t dim1 = style_dim_[2];
  73 + if (len >= dim0) {
  74 + SHERPA_ONNX_LOGE("Bad things happened! %d vs %d", len, dim0);
  75 + SHERPA_ONNX_EXIT(-1);
  76 + }
  77 +
  78 + /*const*/ float *p = styles_.data() + sid * dim0 * dim1 + len * dim1;
  79 +
  80 + std::array<int64_t, 2> style_embedding_shape = {1, dim1};
  81 + Ort::Value style_embedding = Ort::Value::CreateTensor(
  82 + memory_info, p, dim1, style_embedding_shape.data(),
  83 + style_embedding_shape.size());
  84 +
  85 + int64_t speed_shape = 1;
  86 +
  87 + Ort::Value speed_tensor =
  88 + Ort::Value::CreateTensor(memory_info, &speed, 1, &speed_shape, 1);
  89 +
  90 + std::array<Ort::Value, 3> inputs = {
  91 + std::move(x), std::move(style_embedding), std::move(speed_tensor)};
  92 +
  93 + auto out =
  94 + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
  95 + output_names_ptr_.data(), output_names_ptr_.size());
  96 +
  97 + return std::move(out[0]);
  98 + }
  99 +
  100 + private:
  101 + void Init(void *model_data, size_t model_data_length, const char *voices_data,
  102 + size_t voices_data_length) {
  103 + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
  104 + sess_opts_);
  105 +
  106 + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
  107 +
  108 + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
  109 + // get meta data
  110 + Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
  111 + if (config_.debug) {
  112 + std::ostringstream os;
  113 + os << "---kokoro model---\n";
  114 + PrintModelMetadata(os, meta_data);
  115 +
  116 + os << "----------input names----------\n";
  117 + int32_t i = 0;
  118 + for (const auto &s : input_names_) {
  119 + os << i << " " << s << "\n";
  120 + ++i;
  121 + }
  122 + os << "----------output names----------\n";
  123 + i = 0;
  124 + for (const auto &s : output_names_) {
  125 + os << i << " " << s << "\n";
  126 + ++i;
  127 + }
  128 +
  129 +#if __OHOS__
  130 + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
  131 +#else
  132 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
  133 +#endif
  134 + }
  135 +
  136 + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
  137 + SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate");
  138 + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
  139 + SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
  140 + SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
  141 +
  142 + if (config_.debug) {
  143 + std::vector<std::string> speaker_names;
  144 + SHERPA_ONNX_READ_META_DATA_VEC_STRING(speaker_names, "speaker_names");
  145 + std::ostringstream os;
  146 + os << "\n";
  147 + for (int32_t i = 0; i != speaker_names.size(); ++i) {
  148 + os << i << "->" << speaker_names[i] << ", ";
  149 + }
  150 + os << "\n";
  151 +
  152 +#if __OHOS__
  153 + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
  154 +#else
  155 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
  156 +#endif
  157 + }
  158 +
  159 + SHERPA_ONNX_READ_META_DATA_VEC(style_dim_, "style_dim");
  160 + if (style_dim_.size() != 3) {
  161 + SHERPA_ONNX_LOGE("style_dim should be 3-d, given: %d",
  162 + static_cast<int32_t>(style_dim_.size()));
  163 + SHERPA_ONNX_EXIT(-1);
  164 + }
  165 +
  166 + if (style_dim_[1] != 1) {
  167 + SHERPA_ONNX_LOGE("style_dim[0] should be 1, given: %d", style_dim_[1]);
  168 + SHERPA_ONNX_EXIT(-1);
  169 + }
  170 +
  171 + int32_t actual_num_floats = voices_data_length / sizeof(float);
  172 + int32_t expected_num_floats =
  173 + style_dim_[0] * style_dim_[2] * meta_data_.num_speakers;
  174 +
  175 + if (actual_num_floats != expected_num_floats) {
  176 +#if __OHOS__
  177 + SHERPA_ONNX_LOGE(
  178 + "Corrupted --kokoro-voices '%{public}s'. Expected #floats: "
  179 + "%{public}d, actual: %{public}d",
  180 + config_.kokoro.voices.c_str(), expected_num_floats,
  181 + actual_num_floats);
  182 +#else
  183 + SHERPA_ONNX_LOGE(
  184 + "Corrupted --kokoro-voices '%s'. Expected #floats: %d, actual: %d",
  185 + config_.kokoro.voices.c_str(), expected_num_floats,
  186 + actual_num_floats);
  187 +#endif
  188 +
  189 + SHERPA_ONNX_EXIT(-1);
  190 + }
  191 +
  192 + styles_ = std::vector<float>(
  193 + reinterpret_cast<const float *>(voices_data),
  194 + reinterpret_cast<const float *>(voices_data) + expected_num_floats);
  195 +
  196 + meta_data_.max_token_len = style_dim_[0];
  197 + }
  198 +
  199 + private:
  200 + OfflineTtsModelConfig config_;
  201 + Ort::Env env_;
  202 + Ort::SessionOptions sess_opts_;
  203 + Ort::AllocatorWithDefaultOptions allocator_;
  204 +
  205 + std::unique_ptr<Ort::Session> sess_;
  206 +
  207 + std::vector<std::string> input_names_;
  208 + std::vector<const char *> input_names_ptr_;
  209 +
  210 + std::vector<std::string> output_names_;
  211 + std::vector<const char *> output_names_ptr_;
  212 +
  213 + OfflineTtsKokoroModelMetaData meta_data_;
  214 + std::vector<int32_t> style_dim_;
  215 +
  216 + // (num_speakers, style_dim_[0], style_dim_[2])
  217 + std::vector<float> styles_;
  218 +};
  219 +
  220 +OfflineTtsKokoroModel::OfflineTtsKokoroModel(
  221 + const OfflineTtsModelConfig &config)
  222 + : impl_(std::make_unique<Impl>(config)) {}
  223 +
  224 +template <typename Manager>
  225 +OfflineTtsKokoroModel::OfflineTtsKokoroModel(
  226 + Manager *mgr, const OfflineTtsModelConfig &config)
  227 + : impl_(std::make_unique<Impl>(mgr, config)) {}
  228 +
  229 +OfflineTtsKokoroModel::~OfflineTtsKokoroModel() = default;
  230 +
  231 +const OfflineTtsKokoroModelMetaData &OfflineTtsKokoroModel::GetMetaData()
  232 + const {
  233 + return impl_->GetMetaData();
  234 +}
  235 +
  236 +Ort::Value OfflineTtsKokoroModel::Run(Ort::Value x, int64_t sid /*= 0*/,
  237 + float speed /*= 1.0*/) const {
  238 + return impl_->Run(std::move(x), sid, speed);
  239 +}
  240 +
  241 +#if __ANDROID_API__ >= 9
  242 +template OfflineTtsKokoroModel::OfflineTtsKokoroModel(
  243 + AAssetManager *mgr, const OfflineTtsModelConfig &config);
  244 +#endif
  245 +
  246 +#if __OHOS__
  247 +template OfflineTtsKokoroModel::OfflineTtsKokoroModel(
  248 + NativeResourceManager *mgr, const OfflineTtsModelConfig &config);
  249 +#endif
  250 +
  251 +} // namespace sherpa_onnx
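
For reference, the tensor layout used by Run() above can be reproduced outside sherpa-onnx: token ids with a 0 at the front and end, one style row selected by speaker id and token count, and a scalar speed. The sketch below assumes style_dim is stored as a comma-separated triple, that voices.bin is a raw float32 dump of shape (num_speakers, style_dim[0], style_dim[2]), and that the session's input order is (tokens, style, speed) as in Run(); the token ids themselves are placeholders.

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./kokoro-en-v0_19/model.onnx")
meta = sess.get_modelmeta().custom_metadata_map

num_speakers = int(meta["n_speakers"])
dim0, _, dim1 = (int(v) for v in meta["style_dim"].split(","))

# voices.bin: raw float32 array of shape (num_speakers, dim0, dim1)
styles = np.fromfile("./kokoro-en-v0_19/voices.bin", dtype=np.float32)
styles = styles.reshape(num_speakers, dim0, dim1)

sid = 7
tokens = np.array([[0, 50, 83, 54, 156, 57, 135, 0]], dtype=np.int64)  # placeholder ids
style = styles[sid, tokens.shape[1] - 2][np.newaxis]  # (1, dim1), as in Run()
speed = np.array([1.0], dtype=np.float32)

input_names = [i.name for i in sess.get_inputs()]
(audio,) = sess.run(None, dict(zip(input_names, (tokens, style, speed))))
print(audio.shape)  # audio samples at meta["sample_rate"] Hz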
  1 +// sherpa-onnx/csrc/offline-tts-kokoro-model.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_
  7 +
  8 +#include <memory>
  9 +#include <string>
  10 +
  11 +#include "onnxruntime_cxx_api.h" // NOLINT
  12 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
  13 +#include "sherpa-onnx/csrc/offline-tts-model-config.h"
  14 +
  15 +namespace sherpa_onnx {
  16 +
  17 +class OfflineTtsKokoroModel {
  18 + public:
  19 + ~OfflineTtsKokoroModel();
  20 +
  21 + explicit OfflineTtsKokoroModel(const OfflineTtsModelConfig &config);
  22 +
  23 + template <typename Manager>
  24 + OfflineTtsKokoroModel(Manager *mgr, const OfflineTtsModelConfig &config);
  25 +
  26 + // Return a float32 tensor containing the generated audio samples.
  27 + // Its shape may be (1, num_samples), (1, 1, num_samples), or (num_samples,).
  28 + Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const;
  29 +
  30 + const OfflineTtsKokoroModelMetaData &GetMetaData() const;
  31 +
  32 + private:
  33 + class Impl;
  34 + std::unique_ptr<Impl> impl_;
  35 +};
  36 +
  37 +} // namespace sherpa_onnx
  38 +
  39 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_
1 -// sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h
1 +// sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h
2 //
3 // Copyright (c) 2023 Xiaomi Corporation
4
5 -#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
6 -#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_
7
8 #include <cstdint>
9 #include <string>
@@ -25,4 +25,4 @@ struct OfflineTtsMatchaModelMetaData {
25
26 } // namespace sherpa_onnx
27
28 -#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
28 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_
@@ -9,7 +9,7 @@
9 #include <string>
10
11 #include "onnxruntime_cxx_api.h" // NOLINT
12 -#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h"
12 +#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h"
13 #include "sherpa-onnx/csrc/offline-tts-model-config.h"
14
15 namespace sherpa_onnx {
@@ -11,6 +11,7 @@ namespace sherpa_onnx {
11 void OfflineTtsModelConfig::Register(ParseOptions *po) {
12 vits.Register(po);
13 matcha.Register(po);
  14 + kokoro.Register(po);
14
15 po->Register("num-threads", &num_threads,
16 "Number of threads to run the neural network");
@@ -32,7 +33,11 @@ bool OfflineTtsModelConfig::Validate() const {
32 return vits.Validate();
33 }
34
35 - return matcha.Validate();
36 + if (!matcha.acoustic_model.empty()) {
  37 + return matcha.Validate();
  38 + }
  39 +
  40 + return kokoro.Validate();
36 }
37
38 std::string OfflineTtsModelConfig::ToString() const {
@@ -41,6 +46,7 @@ std::string OfflineTtsModelConfig::ToString() const {
41 os << "OfflineTtsModelConfig(";
42 os << "vits=" << vits.ToString() << ", ";
43 os << "matcha=" << matcha.ToString() << ", ";
  49 + os << "kokoro=" << kokoro.ToString() << ", ";
44 os << "num_threads=" << num_threads << ", ";
45 os << "debug=" << (debug ? "True" : "False") << ", ";
46 os << "provider=\"" << provider << "\")";
@@ -7,6 +7,7 @@
7
8 #include <string>
9
  10 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
10 #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h"
11 #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"
12 #include "sherpa-onnx/csrc/parse-options.h"
@@ -16,6 +17,7 @@ namespace sherpa_onnx {
16 struct OfflineTtsModelConfig {
17 OfflineTtsVitsModelConfig vits;
18 OfflineTtsMatchaModelConfig matcha;
  20 + OfflineTtsKokoroModelConfig kokoro;
19
20 int32_t num_threads = 1;
21 bool debug = false;
@@ -25,10 +27,12 @@ struct OfflineTtsModelConfig {
25
26 OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits,
27 const OfflineTtsMatchaModelConfig &matcha,
  30 + const OfflineTtsKokoroModelConfig &kokoro,
28 int32_t num_threads, bool debug,
29 const std::string &provider)
30 : vits(vits),
31 matcha(matcha),
  35 + kokoro(kokoro),
32 num_threads(num_threads),
33 debug(debug),
34 provider(provider) {}
1 -// sherpa-onnx/csrc/offline-tts-vits-model-metadata.h
1 +// sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h
2 //
3 // Copyright (c) 2023 Xiaomi Corporation
4
5 -#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
6 -#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_
7
8 #include <cstdint>
9 #include <string>
@@ -46,4 +46,4 @@ struct OfflineTtsVitsModelMetaData {
46
47 } // namespace sherpa_onnx
48
49 -#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
49 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_
@@ -10,7 +10,7 @@
10
11 #include "onnxruntime_cxx_api.h" // NOLINT
12 #include "sherpa-onnx/csrc/offline-tts-model-config.h"
13 -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
13 +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
14
15 namespace sherpa_onnx {
16
@@ -155,6 +155,36 @@ static std::vector<int64_t> PiperPhonemesToIdsMatcha(
155 return ans;
156 }
157
  158 +static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
  159 + const std::unordered_map<char32_t, int32_t> &token2id,
  160 + const std::vector<piper::Phoneme> &phonemes, int32_t max_len) {
  161 + std::vector<std::vector<int64_t>> ans;
  162 +
  163 + std::vector<int64_t> current;
  164 + current.reserve(phonemes.size());
  165 +
  166 + for (auto p : phonemes) {
  167 + if (token2id.count(p)) {
  168 + if (current.size() > max_len - 1) {
  169 + current.push_back(0);
  170 + ans.push_back(std::move(current));
  171 +
  172 + current.reserve(phonemes.size());
  173 + current.push_back(0);
  174 + }
  175 +
  176 + current.push_back(token2id.at(p));
  177 + } else {
  178 + SHERPA_ONNX_LOGE("Skip unknown phoneme. Unicode codepoint: \\U+%04x.",
  179 + static_cast<uint32_t>(p));
  180 + }
  181 + }
  182 +
  183 + current.push_back(0);
  184 + ans.push_back(std::move(current));
  185 + return ans;
  186 +}
  187 +
158 static std::vector<int64_t> CoquiPhonemesToIds( 188 static std::vector<int64_t> CoquiPhonemesToIds(
159 const std::unordered_map<char32_t, int32_t> &token2id, 189 const std::unordered_map<char32_t, int32_t> &token2id,
160 const std::vector<piper::Phoneme> &phonemes, 190 const std::vector<piper::Phoneme> &phonemes,
@@ -269,6 +299,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( @@ -269,6 +299,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
269 InitEspeak(data_dir); 299 InitEspeak(data_dir);
270 } 300 }
271 301
  302 +PiperPhonemizeLexicon::PiperPhonemizeLexicon(
  303 + const std::string &tokens, const std::string &data_dir,
  304 + const OfflineTtsKokoroModelMetaData &kokoro_meta_data)
  305 + : kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) {
  306 + {
  307 + std::ifstream is(tokens);
  308 + token2id_ = ReadTokens(is);
  309 + }
  310 +
  311 + InitEspeak(data_dir);
  312 +}
  313 +
272 template <typename Manager> 314 template <typename Manager>
273 PiperPhonemizeLexicon::PiperPhonemizeLexicon( 315 PiperPhonemizeLexicon::PiperPhonemizeLexicon(
274 Manager *mgr, const std::string &tokens, const std::string &data_dir, 316 Manager *mgr, const std::string &tokens, const std::string &data_dir,
@@ -286,10 +328,29 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( @@ -286,10 +328,29 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
286 InitEspeak(data_dir); 328 InitEspeak(data_dir);
287 } 329 }
288 330
  331 +template <typename Manager>
  332 +PiperPhonemizeLexicon::PiperPhonemizeLexicon(
  333 + Manager *mgr, const std::string &tokens, const std::string &data_dir,
  334 + const OfflineTtsKokoroModelMetaData &kokoro_meta_data)
  335 + : kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) {
  336 + {
  337 + auto buf = ReadFile(mgr, tokens);
  338 + std::istrstream is(buf.data(), buf.size());
  339 + token2id_ = ReadTokens(is);
  340 + }
  341 +
  342 + // Copy the espeak-ng-data directory from the asset to internal or
  343 + // external storage first, and pass that copied directory as
  344 + // data_dir.
  345 + InitEspeak(data_dir);
  346 +}
  347 +
289 std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( 348 std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
290 const std::string &text, const std::string &voice /*= ""*/) const { 349 const std::string &text, const std::string &voice /*= ""*/) const {
291 if (is_matcha_) { 350 if (is_matcha_) {
292 return ConvertTextToTokenIdsMatcha(text, voice); 351 return ConvertTextToTokenIdsMatcha(text, voice);
  352 + } else if (is_kokoro_) {
  353 + return ConvertTextToTokenIdsKokoro(text, voice);
293 } else { 354 } else {
294 return ConvertTextToTokenIdsVits(text, voice); 355 return ConvertTextToTokenIdsVits(text, voice);
295 } 356 }
@@ -320,6 +381,32 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha( @@ -320,6 +381,32 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
320 return ans; 381 return ans;
321 } 382 }
322 383
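      +// Phonemize the text with espeak-ng, then map each sentence's phonemes
      +// to chunked token IDs via PiperPhonemesToIdsKokoro.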
  384 +std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
  385 + const std::string &text, const std::string &voice /*= ""*/) const {
  386 + piper::eSpeakPhonemeConfig config;
  387 +
  388 + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
  389 + // to list available voices
  390 + config.voice = voice; // e.g., voice is en-us
  391 +
  392 + std::vector<std::vector<piper::Phoneme>> phonemes;
  393 +
  394 + CallPhonemizeEspeak(text, config, &phonemes);
  395 +
  396 + std::vector<TokenIDs> ans;
  397 +
  398 + for (const auto &p : phonemes) {
  399 + auto phoneme_ids =
  400 + PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len);
  401 +
  402 + for (auto &ids : phoneme_ids) {
  403 + ans.emplace_back(std::move(ids));
  404 + }
  405 + }
  406 +
  407 + return ans;
  408 +}
  409 +
323 std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits( 410 std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits(
324 const std::string &text, const std::string &voice /*= ""*/) const { 411 const std::string &text, const std::string &voice /*= ""*/) const {
325 piper::eSpeakPhonemeConfig config; 412 piper::eSpeakPhonemeConfig config;
@@ -363,6 +450,10 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( @@ -363,6 +450,10 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
363 template PiperPhonemizeLexicon::PiperPhonemizeLexicon( 450 template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
364 AAssetManager *mgr, const std::string &tokens, const std::string &data_dir, 451 AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
365 const OfflineTtsMatchaModelMetaData &matcha_meta_data); 452 const OfflineTtsMatchaModelMetaData &matcha_meta_data);
  453 +
  454 +template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
  455 + AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
  456 + const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
366 #endif 457 #endif
367 458
368 #if __OHOS__ 459 #if __OHOS__
@@ -375,6 +466,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( @@ -375,6 +466,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
375 NativeResourceManager *mgr, const std::string &tokens, 466 NativeResourceManager *mgr, const std::string &tokens,
376 const std::string &data_dir, 467 const std::string &data_dir,
377 const OfflineTtsMatchaModelMetaData &matcha_meta_data); 468 const OfflineTtsMatchaModelMetaData &matcha_meta_data);
  469 +
  470 +template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
  471 + NativeResourceManager *mgr, const std::string &tokens,
  472 + const std::string &data_dir,
  473 + const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
378 #endif 474 #endif
379 475
380 } // namespace sherpa_onnx 476 } // namespace sherpa_onnx
@@ -10,8 +10,9 @@ @@ -10,8 +10,9 @@
10 #include <vector> 10 #include <vector>
11 11
12 #include "sherpa-onnx/csrc/offline-tts-frontend.h" 12 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
13 -#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h"  
14 -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" 13 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
  14 +#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h"
  15 +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
15 16
16 namespace sherpa_onnx { 17 namespace sherpa_onnx {
17 18
@@ -23,6 +24,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { @@ -23,6 +24,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
23 PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, 24 PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
24 const OfflineTtsMatchaModelMetaData &matcha_meta_data); 25 const OfflineTtsMatchaModelMetaData &matcha_meta_data);
25 26
  27 + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
  28 + const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
  29 +
26 template <typename Manager> 30 template <typename Manager>
27 PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, 31 PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
28 const std::string &data_dir, 32 const std::string &data_dir,
@@ -33,6 +37,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { @@ -33,6 +37,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
33 const std::string &data_dir, 37 const std::string &data_dir,
34 const OfflineTtsMatchaModelMetaData &matcha_meta_data); 38 const OfflineTtsMatchaModelMetaData &matcha_meta_data);
35 39
  40 + template <typename Manager>
  41 + PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
  42 + const std::string &data_dir,
  43 + const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
  44 +
36 std::vector<TokenIDs> ConvertTextToTokenIds( 45 std::vector<TokenIDs> ConvertTextToTokenIds(
37 const std::string &text, const std::string &voice = "") const override; 46 const std::string &text, const std::string &voice = "") const override;
38 47
@@ -43,12 +52,17 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { @@ -43,12 +52,17 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
43 std::vector<TokenIDs> ConvertTextToTokenIdsMatcha( 52 std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
44 const std::string &text, const std::string &voice = "") const; 53 const std::string &text, const std::string &voice = "") const;
45 54
  55 + std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
  56 + const std::string &text, const std::string &voice = "") const;
  57 +
46 private: 58 private:
47 // map unicode codepoint to an integer ID 59 // map unicode codepoint to an integer ID
48 std::unordered_map<char32_t, int32_t> token2id_; 60 std::unordered_map<char32_t, int32_t> token2id_;
49 OfflineTtsVitsModelMetaData vits_meta_data_; 61 OfflineTtsVitsModelMetaData vits_meta_data_;
50 OfflineTtsMatchaModelMetaData matcha_meta_data_; 62 OfflineTtsMatchaModelMetaData matcha_meta_data_;
  63 + OfflineTtsKokoroModelMetaData kokoro_meta_data_;
51 bool is_matcha_ = false; 64 bool is_matcha_ = false;
  65 + bool is_kokoro_ = false;
52 }; 66 };
53 67
54 } // namespace sherpa_onnx 68 } // namespace sherpa_onnx
@@ -54,6 +54,7 @@ endif() @@ -54,6 +54,7 @@ endif()
54 54
55 if(SHERPA_ONNX_ENABLE_TTS) 55 if(SHERPA_ONNX_ENABLE_TTS)
56 list(APPEND srcs 56 list(APPEND srcs
  57 + offline-tts-kokoro-model-config.cc
57 offline-tts-matcha-model-config.cc 58 offline-tts-matcha-model-config.cc
58 offline-tts-model-config.cc 59 offline-tts-model-config.cc
59 offline-tts-vits-model-config.cc 60 offline-tts-vits-model-config.cc
  1 +// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h"
  6 +
  7 +#include <string>
  8 +
  9 +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +void PybindOfflineTtsKokoroModelConfig(py::module *m) {
  14 + using PyClass = OfflineTtsKokoroModelConfig;
  15 +
  16 + py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig")
  17 + .def(py::init<>())
  18 + .def(py::init<const std::string &, const std::string &,
  19 + const std::string &, const std::string &, float>(),
  20 + py::arg("model"), py::arg("voices"), py::arg("tokens"),
  21 + py::arg("data_dir"), py::arg("length_scale") = 1.0)
  22 + .def_readwrite("model", &PyClass::model)
  23 + .def_readwrite("voices", &PyClass::voices)
  24 + .def_readwrite("tokens", &PyClass::tokens)
  25 + .def_readwrite("data_dir", &PyClass::data_dir)
  26 + .def_readwrite("length_scale", &PyClass::length_scale)
  27 + .def("__str__", &PyClass::ToString)
  28 + .def("validate", &PyClass::Validate);
  29 +}
  30 +
  31 +} // namespace sherpa_onnx
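
With the binding above, the Kokoro config becomes reachable from Python through the existing OfflineTts wrappers. The snippet below is a minimal usage sketch, not part of this patch; it assumes the kokoro-en-v0_19 model layout used in the CI scripts, the existing sherpa_onnx.OfflineTtsConfig / OfflineTtsModelConfig / OfflineTts Python API, and soundfile for writing the output.

import sherpa_onnx
import soundfile as sf

# The kokoro sub-config mirrors the fields bound above:
# model, voices, tokens, data_dir, length_scale.
config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
            model="./kokoro-en-v0_19/model.onnx",
            voices="./kokoro-en-v0_19/voices.bin",
            tokens="./kokoro-en-v0_19/tokens.txt",
            data_dir="./kokoro-en-v0_19/espeak-ng-data",
        ),
        num_threads=2,
        debug=True,
    ),
)

tts = sherpa_onnx.OfflineTts(config)

# sid selects a speaker embedding from voices.bin (0-10 for kokoro-en-v0_19).
audio = tts.generate("This is a test of the Kokoro model.", sid=0, speed=1.0)
sf.write("kokoro-0.wav", audio.samples, samplerate=audio.sample_rate,
         subtype="PCM_16")
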
  1 +// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h
  2 +//
  3 +// Copyright (c) 2025 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
  6 +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
  7 +
  8 +#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
  9 +
  10 +namespace sherpa_onnx {
  11 +
  12 +void PybindOfflineTtsKokoroModelConfig(py::module *m);
  13 +
  14 +} // namespace sherpa_onnx
  15 +
  16 +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
@@ -7,6 +7,7 @@ @@ -7,6 +7,7 @@
7 #include <string> 7 #include <string>
8 8
9 #include "sherpa-onnx/csrc/offline-tts-model-config.h" 9 #include "sherpa-onnx/csrc/offline-tts-model-config.h"
  10 +#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h"
10 #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" 11 #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h"
11 #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" 12 #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h"
12 13
@@ -15,20 +16,24 @@ namespace sherpa_onnx { @@ -15,20 +16,24 @@ namespace sherpa_onnx {
15 void PybindOfflineTtsModelConfig(py::module *m) { 16 void PybindOfflineTtsModelConfig(py::module *m) {
16 PybindOfflineTtsVitsModelConfig(m); 17 PybindOfflineTtsVitsModelConfig(m);
17 PybindOfflineTtsMatchaModelConfig(m); 18 PybindOfflineTtsMatchaModelConfig(m);
  19 + PybindOfflineTtsKokoroModelConfig(m);
18 20
19 using PyClass = OfflineTtsModelConfig; 21 using PyClass = OfflineTtsModelConfig;
20 22
21 py::class_<PyClass>(*m, "OfflineTtsModelConfig") 23 py::class_<PyClass>(*m, "OfflineTtsModelConfig")
22 .def(py::init<>()) 24 .def(py::init<>())
23 .def(py::init<const OfflineTtsVitsModelConfig &, 25 .def(py::init<const OfflineTtsVitsModelConfig &,
24 - const OfflineTtsMatchaModelConfig &, int32_t, bool, 26 + const OfflineTtsMatchaModelConfig &,
  27 + const OfflineTtsKokoroModelConfig &, int32_t, bool,
25 const std::string &>(), 28 const std::string &>(),
26 py::arg("vits") = OfflineTtsVitsModelConfig{}, 29 py::arg("vits") = OfflineTtsVitsModelConfig{},
27 py::arg("matcha") = OfflineTtsMatchaModelConfig{}, 30 py::arg("matcha") = OfflineTtsMatchaModelConfig{},
  31 + py::arg("kokoro") = OfflineTtsKokoroModelConfig{},
28 py::arg("num_threads") = 1, py::arg("debug") = false, 32 py::arg("num_threads") = 1, py::arg("debug") = false,
29 py::arg("provider") = "cpu") 33 py::arg("provider") = "cpu")
30 .def_readwrite("vits", &PyClass::vits) 34 .def_readwrite("vits", &PyClass::vits)
31 .def_readwrite("matcha", &PyClass::matcha) 35 .def_readwrite("matcha", &PyClass::matcha)
  36 + .def_readwrite("kokoro", &PyClass::kokoro)
32 .def_readwrite("num_threads", &PyClass::num_threads) 37 .def_readwrite("num_threads", &PyClass::num_threads)
33 .def_readwrite("debug", &PyClass::debug) 38 .def_readwrite("debug", &PyClass::debug)
34 .def_readwrite("provider", &PyClass::provider) 39 .def_readwrite("provider", &PyClass::provider)
@@ -20,6 +20,7 @@ from _sherpa_onnx import ( @@ -20,6 +20,7 @@ from _sherpa_onnx import (
20 OfflineStream, 20 OfflineStream,
21 OfflineTts, 21 OfflineTts,
22 OfflineTtsConfig, 22 OfflineTtsConfig,
  23 + OfflineTtsKokoroModelConfig,
23 OfflineTtsMatchaModelConfig, 24 OfflineTtsMatchaModelConfig,
24 OfflineTtsModelConfig, 25 OfflineTtsModelConfig,
25 OfflineTtsVitsModelConfig, 26 OfflineTtsVitsModelConfig,