Add C++ and Python API for Kokoro TTS models. (#1715)
Committed by GitHub
Showing 27 changed files with 1,193 additions and 29 deletions.
| @@ -19,6 +19,31 @@ which $EXE | @@ -19,6 +19,31 @@ which $EXE | ||
| 19 | mkdir ./tts | 19 | mkdir ./tts |
| 20 | 20 | ||
| 21 | log "------------------------------------------------------------" | 21 | log "------------------------------------------------------------" |
| 22 | +log "kokoro-en-v0_19" | ||
| 23 | +log "------------------------------------------------------------" | ||
| 24 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | ||
| 25 | +tar xf kokoro-en-v0_19.tar.bz2 | ||
| 26 | +rm kokoro-en-v0_19.tar.bz2 | ||
| 27 | + | ||
| 28 | +# mapping of sid to voice name | ||
| 29 | +# 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam | ||
| 30 | +# 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis | ||
| 31 | + | ||
| 32 | +for sid in $(seq 0 10); do | ||
| 33 | + $EXE \ | ||
| 34 | + --debug=1 \ | ||
| 35 | + --kokoro-model=./kokoro-en-v0_19/model.onnx \ | ||
| 36 | + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ | ||
| 37 | + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ | ||
| 38 | + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ | ||
| 39 | + --num-threads=2 \ | ||
| 40 | + --sid=$sid \ | ||
| 41 | + --output-filename="./tts/kokoro-$sid.wav" \ | ||
| 42 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 43 | +done | ||
| 44 | +rm -rf kokoro-en-v0_19 | ||
| 45 | + | ||
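A convenience sketch (not part of this commit): the sid-to-voice-name mapping from the comment in the script above, written as a Python dict for use in your own scripts. The dict name is hypothetical.

KOKORO_EN_V0_19_SID_TO_VOICE = {
    0: "af", 1: "af_bella", 2: "af_nicole", 3: "af_sarah", 4: "af_sky",
    5: "am_adam", 6: "am_michael", 7: "bf_emma", 8: "bf_isabella",
    9: "bm_george", 10: "bm_lewis",
}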
| 46 | +log "------------------------------------------------------------" | ||
| 22 | log "matcha-icefall-en_US-ljspeech" | 47 | log "matcha-icefall-en_US-ljspeech" |
| 23 | log "------------------------------------------------------------" | 48 | log "------------------------------------------------------------" |
| 24 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | 49 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 |
| @@ -267,6 +267,25 @@ log "Offline TTS test" | @@ -267,6 +267,25 @@ log "Offline TTS test" | ||
| 267 | # test waves are saved in ./tts | 267 | # test waves are saved in ./tts |
| 268 | mkdir ./tts | 268 | mkdir ./tts |
| 269 | 269 | ||
| 270 | +log "kokoro-en-v0_19 test" | ||
| 271 | + | ||
| 272 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | ||
| 273 | +tar xf kokoro-en-v0_19.tar.bz2 | ||
| 274 | +rm kokoro-en-v0_19.tar.bz2 | ||
| 275 | + | ||
| 276 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 277 | + --debug=1 \ | ||
| 278 | + --kokoro-model=./kokoro-en-v0_19/model.onnx \ | ||
| 279 | + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ | ||
| 280 | + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ | ||
| 281 | + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ | ||
| 282 | + --num-threads=2 \ | ||
| 283 | + --sid=10 \ | ||
| 284 | + --output-filename="./tts/kokoro-10.wav" \ | ||
| 285 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 286 | + | ||
| 287 | +rm -rf kokoro-en-v0_19 | ||
| 288 | + | ||
| 270 | log "matcha-ljspeech-en test" | 289 | log "matcha-ljspeech-en test" |
| 271 | 290 | ||
| 272 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | 291 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 |
| @@ -11,7 +11,7 @@ while the model is still generating. | @@ -11,7 +11,7 @@ while the model is still generating. | ||
| 11 | 11 | ||
| 12 | Usage: | 12 | Usage: |
| 13 | 13 | ||
| 14 | -Example (1/5) | 14 | +Example (1/6) |
| 15 | 15 | ||
| 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 23 | --output-filename=./generated.wav \ | 23 | --output-filename=./generated.wav \ |
| 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 25 | 25 | ||
| 26 | -Example (2/5) | 26 | +Example (2/6) |
| 27 | 27 | ||
| 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 | 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 |
| 29 | tar xvf vits-zh-aishell3.tar.bz2 | 29 | tar xvf vits-zh-aishell3.tar.bz2 |
| @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 37 | --output-filename=./liubei-21.wav \ | 37 | --output-filename=./liubei-21.wav \ |
| 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 39 | 39 | ||
| 40 | -Example (3/5) | 40 | +Example (3/6) |
| 41 | 41 | ||
| 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 53 | --output-filename=./test-2.wav \ | 53 | --output-filename=./test-2.wav \ |
| 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 55 | 55 | ||
| 56 | -Example (4/5) | 56 | +Example (4/6) |
| 57 | 57 | ||
| 58 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 58 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 59 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 59 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| @@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 71 | --output-filename=./test-matcha.wav \ | 71 | --output-filename=./test-matcha.wav \ |
| 72 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | 72 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" |
| 73 | 73 | ||
| 74 | -Example (5/5) | 74 | +Example (5/6) |
| 75 | 75 | ||
| 76 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | 76 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 |
| 77 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | 77 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 |
| @@ -88,6 +88,22 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -88,6 +88,22 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 88 | --num-threads=2 \ | 88 | --num-threads=2 \ |
| 89 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 89 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 90 | 90 | ||
| 91 | +Example (6/6) | ||
| 92 | + | ||
| 93 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | ||
| 94 | +tar xf kokoro-en-v0_19.tar.bz2 | ||
| 95 | +rm kokoro-en-v0_19.tar.bz2 | ||
| 96 | + | ||
| 98 | +python3 ./python-api-examples/offline-tts-play.py \ | ||
| 98 | + --debug=1 \ | ||
| 99 | + --kokoro-model=./kokoro-en-v0_19/model.onnx \ | ||
| 100 | + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ | ||
| 101 | + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ | ||
| 102 | + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ | ||
| 103 | + --num-threads=2 \ | ||
| 104 | + --sid=10 \ | ||
| 105 | + --output-filename="./kokoro-10.wav" \ | ||
| 106 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 91 | 107 | ||
| 92 | You can find more models at | 108 | You can find more models at |
| 93 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 109 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| @@ -202,6 +218,36 @@ def add_matcha_args(parser): | @@ -202,6 +218,36 @@ def add_matcha_args(parser): | ||
| 202 | ) | 218 | ) |
| 203 | 219 | ||
| 204 | 220 | ||
| 221 | +def add_kokoro_args(parser): | ||
| 222 | + parser.add_argument( | ||
| 223 | + "--kokoro-model", | ||
| 224 | + type=str, | ||
| 225 | + default="", | ||
| 226 | + help="Path to model.onnx for kokoro", | ||
| 227 | + ) | ||
| 228 | + | ||
| 229 | + parser.add_argument( | ||
| 230 | + "--kokoro-voices", | ||
| 231 | + type=str, | ||
| 232 | + default="", | ||
| 233 | + help="Path to voices.bin for kokoro", | ||
| 234 | + ) | ||
| 235 | + | ||
| 236 | + parser.add_argument( | ||
| 237 | + "--kokoro-tokens", | ||
| 238 | + type=str, | ||
| 239 | + default="", | ||
| 240 | + help="Path to tokens.txt for kokoro", | ||
| 241 | + ) | ||
| 242 | + | ||
| 243 | + parser.add_argument( | ||
| 244 | + "--kokoro-data-dir", | ||
| 245 | + type=str, | ||
| 246 | + default="", | ||
| 247 | + help="Path to the dict directory of espeak-ng.", | ||
| 248 | + ) | ||
| 249 | + | ||
| 250 | + | ||
| 205 | def get_args(): | 251 | def get_args(): |
| 206 | parser = argparse.ArgumentParser( | 252 | parser = argparse.ArgumentParser( |
| 207 | formatter_class=argparse.ArgumentDefaultsHelpFormatter | 253 | formatter_class=argparse.ArgumentDefaultsHelpFormatter |
| @@ -209,6 +255,7 @@ def get_args(): | @@ -209,6 +255,7 @@ def get_args(): | ||
| 209 | 255 | ||
| 210 | add_vits_args(parser) | 256 | add_vits_args(parser) |
| 211 | add_matcha_args(parser) | 257 | add_matcha_args(parser) |
| 258 | + add_kokoro_args(parser) | ||
| 212 | 259 | ||
| 213 | parser.add_argument( | 260 | parser.add_argument( |
| 214 | "--tts-rule-fsts", | 261 | "--tts-rule-fsts", |
| @@ -407,6 +454,12 @@ def main(): | @@ -407,6 +454,12 @@ def main(): | ||
| 407 | data_dir=args.matcha_data_dir, | 454 | data_dir=args.matcha_data_dir, |
| 408 | dict_dir=args.matcha_dict_dir, | 455 | dict_dir=args.matcha_dict_dir, |
| 409 | ), | 456 | ), |
| 457 | + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig( | ||
| 458 | + model=args.kokoro_model, | ||
| 459 | + voices=args.kokoro_voices, | ||
| 460 | + tokens=args.kokoro_tokens, | ||
| 461 | + data_dir=args.kokoro_data_dir, | ||
| 462 | + ), | ||
| 410 | provider=args.provider, | 463 | provider=args.provider, |
| 411 | debug=args.debug, | 464 | debug=args.debug, |
| 412 | num_threads=args.num_threads, | 465 | num_threads=args.num_threads, |
| @@ -12,7 +12,7 @@ generated audio. | @@ -12,7 +12,7 @@ generated audio. | ||
| 12 | 12 | ||
| 13 | Usage: | 13 | Usage: |
| 14 | 14 | ||
| 15 | -Example (1/5) | 15 | +Example (1/6) |
| 16 | 16 | ||
| 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 24 | --output-filename=./generated.wav \ | 24 | --output-filename=./generated.wav \ |
| 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 26 | 26 | ||
| 27 | -Example (2/5) | 27 | +Example (2/6) |
| 28 | 28 | ||
| 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 | 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 |
| @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 38 | --output-filename=./liubei-21.wav \ | 38 | --output-filename=./liubei-21.wav \ |
| 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 40 | 40 | ||
| 41 | -Example (3/5) | 41 | +Example (3/6) |
| 42 | 42 | ||
| 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 54 | --output-filename=./test-2.wav \ | 54 | --output-filename=./test-2.wav \ |
| 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 56 | 56 | ||
| 57 | -Example (4/5) | 57 | +Example (4/6) |
| 58 | 58 | ||
| 59 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 59 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 60 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 60 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| @@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 72 | --output-filename=./test-matcha.wav \ | 72 | --output-filename=./test-matcha.wav \ |
| 73 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | 73 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" |
| 74 | 74 | ||
| 75 | -Example (5/5) | 75 | +Example (5/6) |
| 76 | 76 | ||
| 77 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | 77 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 |
| 78 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | 78 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 |
| @@ -89,6 +89,23 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -89,6 +89,23 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 89 | --num-threads=2 \ | 89 | --num-threads=2 \ |
| 90 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 90 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 91 | 91 | ||
| 92 | +Example (6/6) | ||
| 93 | + | ||
| 94 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | ||
| 95 | +tar xf kokoro-en-v0_19.tar.bz2 | ||
| 96 | +rm kokoro-en-v0_19.tar.bz2 | ||
| 97 | + | ||
| 98 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 99 | + --debug=1 \ | ||
| 100 | + --kokoro-model=./kokoro-en-v0_19/model.onnx \ | ||
| 101 | + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ | ||
| 102 | + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ | ||
| 103 | + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ | ||
| 104 | + --num-threads=2 \ | ||
| 105 | + --sid=10 \ | ||
| 106 | + --output-filename="./kokoro-10.wav" \ | ||
| 107 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 108 | + | ||
| 92 | You can find more models at | 109 | You can find more models at |
| 93 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 110 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 94 | 111 | ||
| @@ -188,6 +205,36 @@ def add_matcha_args(parser): | @@ -188,6 +205,36 @@ def add_matcha_args(parser): | ||
| 188 | ) | 205 | ) |
| 189 | 206 | ||
| 190 | 207 | ||
| 208 | +def add_kokoro_args(parser): | ||
| 209 | + parser.add_argument( | ||
| 210 | + "--kokoro-model", | ||
| 211 | + type=str, | ||
| 212 | + default="", | ||
| 213 | + help="Path to model.onnx for kokoro", | ||
| 214 | + ) | ||
| 215 | + | ||
| 216 | + parser.add_argument( | ||
| 217 | + "--kokoro-voices", | ||
| 218 | + type=str, | ||
| 219 | + default="", | ||
| 220 | + help="Path to voices.bin for kokoro", | ||
| 221 | + ) | ||
| 222 | + | ||
| 223 | + parser.add_argument( | ||
| 224 | + "--kokoro-tokens", | ||
| 225 | + type=str, | ||
| 226 | + default="", | ||
| 227 | + help="Path to tokens.txt for kokoro", | ||
| 228 | + ) | ||
| 229 | + | ||
| 230 | + parser.add_argument( | ||
| 231 | + "--kokoro-data-dir", | ||
| 232 | + type=str, | ||
| 233 | + default="", | ||
| 234 | + help="Path to the dict directory of espeak-ng.", | ||
| 235 | + ) | ||
| 236 | + | ||
| 237 | + | ||
| 191 | def get_args(): | 238 | def get_args(): |
| 192 | parser = argparse.ArgumentParser( | 239 | parser = argparse.ArgumentParser( |
| 193 | formatter_class=argparse.ArgumentDefaultsHelpFormatter | 240 | formatter_class=argparse.ArgumentDefaultsHelpFormatter |
| @@ -195,6 +242,7 @@ def get_args(): | @@ -195,6 +242,7 @@ def get_args(): | ||
| 195 | 242 | ||
| 196 | add_vits_args(parser) | 243 | add_vits_args(parser) |
| 197 | add_matcha_args(parser) | 244 | add_matcha_args(parser) |
| 245 | + add_kokoro_args(parser) | ||
| 198 | 246 | ||
| 199 | parser.add_argument( | 247 | parser.add_argument( |
| 200 | "--tts-rule-fsts", | 248 | "--tts-rule-fsts", |
| @@ -206,7 +254,7 @@ def get_args(): | @@ -206,7 +254,7 @@ def get_args(): | ||
| 206 | parser.add_argument( | 254 | parser.add_argument( |
| 207 | "--max-num-sentences", | 255 | "--max-num-sentences", |
| 208 | type=int, | 256 | type=int, |
| 209 | - default=2, | 257 | + default=1, |
| 210 | help="""Max number of sentences in a batch to avoid OOM if the input | 258 | help="""Max number of sentences in a batch to avoid OOM if the input |
| 211 | text is very long. Set it to -1 to process all the sentences in a | 259 | text is very long. Set it to -1 to process all the sentences in a |
| 212 | single batch. A smaller value does not mean it is slower compared | 260 | single batch. A smaller value does not mean it is slower compared |
| @@ -289,6 +337,12 @@ def main(): | @@ -289,6 +337,12 @@ def main(): | ||
| 289 | data_dir=args.matcha_data_dir, | 337 | data_dir=args.matcha_data_dir, |
| 290 | dict_dir=args.matcha_dict_dir, | 338 | dict_dir=args.matcha_dict_dir, |
| 291 | ), | 339 | ), |
| 340 | + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig( | ||
| 341 | + model=args.kokoro_model, | ||
| 342 | + voices=args.kokoro_voices, | ||
| 343 | + tokens=args.kokoro_tokens, | ||
| 344 | + data_dir=args.kokoro_data_dir, | ||
| 345 | + ), | ||
| 292 | provider=args.provider, | 346 | provider=args.provider, |
| 293 | debug=args.debug, | 347 | debug=args.debug, |
| 294 | num_threads=args.num_threads, | 348 | num_threads=args.num_threads, |
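For readers who want to call the new Python API for Kokoro directly instead of going through offline-tts.py, a minimal sketch follows. Only OfflineTtsKokoroModelConfig and the kokoro= field of OfflineTtsModelConfig are taken from this commit; the surrounding pieces (OfflineTtsConfig, OfflineTts, generate, and saving with soundfile) are assumed to behave as in the existing VITS/Matcha examples in this repo, so treat this as a sketch rather than a verified script.

import soundfile as sf
import sherpa_onnx

config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
            model="./kokoro-en-v0_19/model.onnx",
            voices="./kokoro-en-v0_19/voices.bin",
            tokens="./kokoro-en-v0_19/tokens.txt",
            data_dir="./kokoro-en-v0_19/espeak-ng-data",
        ),
        num_threads=2,
        debug=True,
    ),
    max_num_sentences=1,
)

tts = sherpa_onnx.OfflineTts(config)

# sid=10 corresponds to the bm_lewis voice in kokoro-en-v0_19 (see the mapping above)
audio = tts.generate("This is a test of the Kokoro TTS model.", sid=10, speed=1.0)
sf.write("./kokoro-10.wav", audio.samples, samplerate=audio.sample_rate, subtype="PCM_16")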
| @@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS) | @@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS) | ||
| 158 | offline-tts-character-frontend.cc | 158 | offline-tts-character-frontend.cc |
| 159 | offline-tts-frontend.cc | 159 | offline-tts-frontend.cc |
| 160 | offline-tts-impl.cc | 160 | offline-tts-impl.cc |
| 161 | + offline-tts-kokoro-model-config.cc | ||
| 162 | + offline-tts-kokoro-model.cc | ||
| 161 | offline-tts-matcha-model-config.cc | 163 | offline-tts-matcha-model-config.cc |
| 162 | offline-tts-matcha-model.cc | 164 | offline-tts-matcha-model.cc |
| 163 | offline-tts-model-config.cc | 165 | offline-tts-model-config.cc |
| @@ -11,7 +11,7 @@ | @@ -11,7 +11,7 @@ | ||
| 11 | #include <vector> | 11 | #include <vector> |
| 12 | 12 | ||
| 13 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 13 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| 14 | -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | 14 | +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" |
| 15 | 15 | ||
| 16 | namespace sherpa_onnx { | 16 | namespace sherpa_onnx { |
| 17 | 17 |
| @@ -10,7 +10,7 @@ | @@ -10,7 +10,7 @@ | ||
| 10 | #include <vector> | 10 | #include <vector> |
| 11 | 11 | ||
| 12 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 12 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| 13 | -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | 13 | +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" |
| 14 | 14 | ||
| 15 | namespace sherpa_onnx { | 15 | namespace sherpa_onnx { |
| 16 | 16 |
| @@ -16,6 +16,7 @@ | @@ -16,6 +16,7 @@ | ||
| 16 | #include "rawfile/raw_file_manager.h" | 16 | #include "rawfile/raw_file_manager.h" |
| 17 | #endif | 17 | #endif |
| 18 | 18 | ||
| 19 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h" | ||
| 19 | #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h" | 20 | #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h" |
| 20 | #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" | 21 | #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" |
| 21 | 22 | ||
| @@ -37,8 +38,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | @@ -37,8 +38,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | ||
| 37 | const OfflineTtsConfig &config) { | 38 | const OfflineTtsConfig &config) { |
| 38 | if (!config.model.vits.model.empty()) { | 39 | if (!config.model.vits.model.empty()) { |
| 39 | return std::make_unique<OfflineTtsVitsImpl>(config); | 40 | return std::make_unique<OfflineTtsVitsImpl>(config); |
| 41 | + } else if (!config.model.matcha.acoustic_model.empty()) { | ||
| 42 | + return std::make_unique<OfflineTtsMatchaImpl>(config); | ||
| 40 | } | 43 | } |
| 41 | - return std::make_unique<OfflineTtsMatchaImpl>(config); | 44 | + |
| 45 | + return std::make_unique<OfflineTtsKokoroImpl>(config); | ||
| 42 | } | 46 | } |
| 43 | 47 | ||
| 44 | template <typename Manager> | 48 | template <typename Manager> |
| @@ -46,9 +50,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | @@ -46,9 +50,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | ||
| 46 | Manager *mgr, const OfflineTtsConfig &config) { | 50 | Manager *mgr, const OfflineTtsConfig &config) { |
| 47 | if (!config.model.vits.model.empty()) { | 51 | if (!config.model.vits.model.empty()) { |
| 48 | return std::make_unique<OfflineTtsVitsImpl>(mgr, config); | 52 | return std::make_unique<OfflineTtsVitsImpl>(mgr, config); |
| 53 | + } else if (!config.model.matcha.acoustic_model.empty()) { | ||
| 54 | + return std::make_unique<OfflineTtsMatchaImpl>(mgr, config); | ||
| 49 | } | 55 | } |
| 50 | 56 | ||
| 51 | - return std::make_unique<OfflineTtsMatchaImpl>(mgr, config); | 57 | + return std::make_unique<OfflineTtsKokoroImpl>(mgr, config); |
| 52 | } | 58 | } |
| 53 | 59 | ||
| 54 | #if __ANDROID_API__ >= 9 | 60 | #if __ANDROID_API__ >= 9 |
sherpa-onnx/csrc/offline-tts-kokoro-impl.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kokoro-impl.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_ | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | +#include <string> | ||
| 9 | +#include <strstream> | ||
| 10 | +#include <utility> | ||
| 11 | +#include <vector> | ||
| 12 | + | ||
| 13 | +#include "fst/extensions/far/far.h" | ||
| 14 | +#include "kaldifst/csrc/kaldi-fst-io.h" | ||
| 15 | +#include "kaldifst/csrc/text-normalizer.h" | ||
| 16 | +#include "sherpa-onnx/csrc/lexicon.h" | ||
| 17 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 18 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 19 | +#include "sherpa-onnx/csrc/offline-tts-impl.h" | ||
| 20 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h" | ||
| 21 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 22 | +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" | ||
| 23 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 24 | + | ||
| 25 | +namespace sherpa_onnx { | ||
| 26 | + | ||
| 27 | +class OfflineTtsKokoroImpl : public OfflineTtsImpl { | ||
| 28 | + public: | ||
| 29 | + explicit OfflineTtsKokoroImpl(const OfflineTtsConfig &config) | ||
| 30 | + : config_(config), | ||
| 31 | + model_(std::make_unique<OfflineTtsKokoroModel>(config.model)) { | ||
| 32 | + InitFrontend(); | ||
| 33 | + | ||
| 34 | + if (!config.rule_fsts.empty()) { | ||
| 35 | + std::vector<std::string> files; | ||
| 36 | + SplitStringToVector(config.rule_fsts, ",", false, &files); | ||
| 37 | + tn_list_.reserve(files.size()); | ||
| 38 | + for (const auto &f : files) { | ||
| 39 | + if (config.model.debug) { | ||
| 40 | +#if __OHOS__ | ||
| 41 | + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); | ||
| 42 | +#else | ||
| 43 | + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); | ||
| 44 | +#endif | ||
| 45 | + } | ||
| 46 | + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f)); | ||
| 47 | + } | ||
| 48 | + } | ||
| 49 | + | ||
| 50 | + if (!config.rule_fars.empty()) { | ||
| 51 | + if (config.model.debug) { | ||
| 52 | + SHERPA_ONNX_LOGE("Loading FST archives"); | ||
| 53 | + } | ||
| 54 | + std::vector<std::string> files; | ||
| 55 | + SplitStringToVector(config.rule_fars, ",", false, &files); | ||
| 56 | + | ||
| 57 | + tn_list_.reserve(files.size() + tn_list_.size()); | ||
| 58 | + | ||
| 59 | + for (const auto &f : files) { | ||
| 60 | + if (config.model.debug) { | ||
| 61 | +#if __OHOS__ | ||
| 62 | + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); | ||
| 63 | +#else | ||
| 64 | + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); | ||
| 65 | +#endif | ||
| 66 | + } | ||
| 67 | + std::unique_ptr<fst::FarReader<fst::StdArc>> reader( | ||
| 68 | + fst::FarReader<fst::StdArc>::Open(f)); | ||
| 69 | + for (; !reader->Done(); reader->Next()) { | ||
| 70 | + std::unique_ptr<fst::StdConstFst> r( | ||
| 71 | + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); | ||
| 72 | + | ||
| 73 | + tn_list_.push_back( | ||
| 74 | + std::make_unique<kaldifst::TextNormalizer>(std::move(r))); | ||
| 75 | + } | ||
| 76 | + } | ||
| 77 | + | ||
| 78 | + if (config.model.debug) { | ||
| 79 | + SHERPA_ONNX_LOGE("FST archives loaded!"); | ||
| 80 | + } | ||
| 81 | + } | ||
| 82 | + } | ||
| 83 | + | ||
| 84 | + template <typename Manager> | ||
| 85 | + OfflineTtsKokoroImpl(Manager *mgr, const OfflineTtsConfig &config) | ||
| 86 | + : config_(config), | ||
| 87 | + model_(std::make_unique<OfflineTtsKokoroModel>(mgr, config.model)) { | ||
| 88 | + InitFrontend(mgr); | ||
| 89 | + | ||
| 90 | + if (!config.rule_fsts.empty()) { | ||
| 91 | + std::vector<std::string> files; | ||
| 92 | + SplitStringToVector(config.rule_fsts, ",", false, &files); | ||
| 93 | + tn_list_.reserve(files.size()); | ||
| 94 | + for (const auto &f : files) { | ||
| 95 | + if (config.model.debug) { | ||
| 96 | +#if __OHOS__ | ||
| 97 | + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); | ||
| 98 | +#else | ||
| 99 | + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); | ||
| 100 | +#endif | ||
| 101 | + } | ||
| 102 | + auto buf = ReadFile(mgr, f); | ||
| 103 | + std::istrstream is(buf.data(), buf.size()); | ||
| 104 | + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is)); | ||
| 105 | + } | ||
| 106 | + } | ||
| 107 | + | ||
| 108 | + if (!config.rule_fars.empty()) { | ||
| 109 | + std::vector<std::string> files; | ||
| 110 | + SplitStringToVector(config.rule_fars, ",", false, &files); | ||
| 111 | + tn_list_.reserve(files.size() + tn_list_.size()); | ||
| 112 | + | ||
| 113 | + for (const auto &f : files) { | ||
| 114 | + if (config.model.debug) { | ||
| 115 | +#if __OHOS__ | ||
| 116 | + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); | ||
| 117 | +#else | ||
| 118 | + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); | ||
| 119 | +#endif | ||
| 120 | + } | ||
| 121 | + | ||
| 122 | + auto buf = ReadFile(mgr, f); | ||
| 123 | + | ||
| 124 | + std::unique_ptr<std::istream> s( | ||
| 125 | + new std::istrstream(buf.data(), buf.size())); | ||
| 126 | + | ||
| 127 | + std::unique_ptr<fst::FarReader<fst::StdArc>> reader( | ||
| 128 | + fst::FarReader<fst::StdArc>::Open(std::move(s))); | ||
| 129 | + | ||
| 130 | + for (; !reader->Done(); reader->Next()) { | ||
| 131 | + std::unique_ptr<fst::StdConstFst> r( | ||
| 132 | + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); | ||
| 133 | + | ||
| 134 | + tn_list_.push_back( | ||
| 135 | + std::make_unique<kaldifst::TextNormalizer>(std::move(r))); | ||
| 136 | + } // for (; !reader->Done(); reader->Next()) | ||
| 137 | + } // for (const auto &f : files) | ||
| 138 | + } // if (!config.rule_fars.empty()) | ||
| 139 | + } | ||
| 140 | + | ||
| 141 | + int32_t SampleRate() const override { | ||
| 142 | + return model_->GetMetaData().sample_rate; | ||
| 143 | + } | ||
| 144 | + | ||
| 145 | + int32_t NumSpeakers() const override { | ||
| 146 | + return model_->GetMetaData().num_speakers; | ||
| 147 | + } | ||
| 148 | + | ||
| 149 | + GeneratedAudio Generate( | ||
| 150 | + const std::string &_text, int64_t sid = 0, float speed = 1.0, | ||
| 151 | + GeneratedAudioCallback callback = nullptr) const override { | ||
| 152 | + const auto &meta_data = model_->GetMetaData(); | ||
| 153 | + int32_t num_speakers = meta_data.num_speakers; | ||
| 154 | + | ||
| 155 | + if (num_speakers == 0 && sid != 0) { | ||
| 156 | +#if __OHOS__ | ||
| 157 | + SHERPA_ONNX_LOGE( | ||
| 158 | + "This is a single-speaker model and supports only sid 0. Given sid: " | ||
| 159 | + "%{public}d. sid is ignored", | ||
| 160 | + static_cast<int32_t>(sid)); | ||
| 161 | +#else | ||
| 162 | + SHERPA_ONNX_LOGE( | ||
| 163 | + "This is a single-speaker model and supports only sid 0. Given sid: " | ||
| 164 | + "%d. sid is ignored", | ||
| 165 | + static_cast<int32_t>(sid)); | ||
| 166 | +#endif | ||
| 167 | + } | ||
| 168 | + | ||
| 169 | + if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) { | ||
| 170 | +#if __OHOS__ | ||
| 171 | + SHERPA_ONNX_LOGE( | ||
| 172 | + "This model contains only %{public}d speakers. sid should be in the " | ||
| 173 | + "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0", | ||
| 174 | + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | ||
| 175 | +#else | ||
| 176 | + SHERPA_ONNX_LOGE( | ||
| 177 | + "This model contains only %d speakers. sid should be in the range " | ||
| 178 | + "[%d, %d]. Given: %d. Use sid=0", | ||
| 179 | + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | ||
| 180 | +#endif | ||
| 181 | + sid = 0; | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + std::string text = _text; | ||
| 185 | + if (config_.model.debug) { | ||
| 186 | +#if __OHOS__ | ||
| 187 | + SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str()); | ||
| 188 | +#else | ||
| 189 | + SHERPA_ONNX_LOGE("Raw text: %s", text.c_str()); | ||
| 190 | +#endif | ||
| 191 | + } | ||
| 192 | + | ||
| 193 | + if (!tn_list_.empty()) { | ||
| 194 | + for (const auto &tn : tn_list_) { | ||
| 195 | + text = tn->Normalize(text); | ||
| 196 | + if (config_.model.debug) { | ||
| 197 | +#if __OHOS__ | ||
| 198 | + SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str()); | ||
| 199 | +#else | ||
| 200 | + SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str()); | ||
| 201 | +#endif | ||
| 202 | + } | ||
| 203 | + } | ||
| 204 | + } | ||
| 205 | + | ||
| 206 | + std::vector<TokenIDs> token_ids = | ||
| 207 | + frontend_->ConvertTextToTokenIds(text, "en-us"); | ||
| 208 | + | ||
| 209 | + if (token_ids.empty() || | ||
| 210 | + (token_ids.size() == 1 && token_ids[0].tokens.empty())) { | ||
| 211 | +#if __OHOS__ | ||
| 212 | + SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs", | ||
| 213 | + text.c_str()); | ||
| 214 | +#else | ||
| 215 | + SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str()); | ||
| 216 | +#endif | ||
| 217 | + return {}; | ||
| 218 | + } | ||
| 219 | + | ||
| 220 | + std::vector<std::vector<int64_t>> x; | ||
| 221 | + | ||
| 222 | + x.reserve(token_ids.size()); | ||
| 223 | + | ||
| 224 | + for (auto &i : token_ids) { | ||
| 225 | + x.push_back(std::move(i.tokens)); | ||
| 226 | + } | ||
| 227 | + | ||
| 228 | + int32_t x_size = static_cast<int32_t>(x.size()); | ||
| 229 | + | ||
| 230 | + if (config_.max_num_sentences != 1) { | ||
| 231 | +#if __OHOS__ | ||
| 232 | + SHERPA_ONNX_LOGE( | ||
| 233 | + "max_num_sentences (%{public}d) != 1 is ignored for Kokoro TTS " | ||
| 234 | + "models", | ||
| 235 | + config_.max_num_sentences); | ||
| 236 | +#else | ||
| 237 | + SHERPA_ONNX_LOGE( | ||
| 238 | + "max_num_sentences (%d) != 1 is ignored for Kokoro TTS models", | ||
| 239 | + config_.max_num_sentences); | ||
| 240 | +#endif | ||
| 241 | + } | ||
| 242 | + | ||
| 243 | + // If the input text is too long, we process its sentences in batches | ||
| 244 | + // to avoid OOM. For Kokoro models the batch size is fixed at 1. | ||
| 245 | + std::vector<std::vector<int64_t>> batch_x; | ||
| 246 | + | ||
| 247 | + int32_t batch_size = 1; | ||
| 248 | + batch_x.reserve(config_.max_num_sentences); | ||
| 249 | + int32_t num_batches = x_size / batch_size; | ||
| 250 | + | ||
| 251 | + if (config_.model.debug) { | ||
| 252 | +#if __OHOS__ | ||
| 253 | + SHERPA_ONNX_LOGE( | ||
| 254 | + "Split it into %{public}d batches. batch size: " | ||
| 255 | + "%{public}d. Number of sentences: %{public}d", | ||
| 256 | + num_batches, batch_size, x_size); | ||
| 257 | +#else | ||
| 258 | + SHERPA_ONNX_LOGE( | ||
| 259 | + "Split it into %d batches. batch size: %d. Number " | ||
| 260 | + "of sentences: %d", | ||
| 261 | + num_batches, batch_size, x_size); | ||
| 262 | +#endif | ||
| 263 | + } | ||
| 264 | + | ||
| 265 | + GeneratedAudio ans; | ||
| 266 | + | ||
| 267 | + int32_t should_continue = 1; | ||
| 268 | + | ||
| 269 | + int32_t k = 0; | ||
| 270 | + | ||
| 271 | + for (int32_t b = 0; b != num_batches && should_continue; ++b) { | ||
| 272 | + batch_x.clear(); | ||
| 273 | + for (int32_t i = 0; i != batch_size; ++i, ++k) { | ||
| 274 | + batch_x.push_back(std::move(x[k])); | ||
| 275 | + } | ||
| 276 | + | ||
| 277 | + auto audio = Process(batch_x, sid, speed); | ||
| 278 | + ans.sample_rate = audio.sample_rate; | ||
| 279 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 280 | + audio.samples.end()); | ||
| 281 | + if (callback) { | ||
| 282 | + should_continue = callback(audio.samples.data(), audio.samples.size(), | ||
| 283 | + (b + 1) * 1.0 / num_batches); | ||
| 284 | + // Caution(fangjun): audio is freed when the callback returns, so users | ||
| 285 | + // should copy the data if they want to access the data after | ||
| 286 | + // the callback returns to avoid segmentation fault. | ||
| 287 | + } | ||
| 288 | + } | ||
| 289 | + | ||
| 290 | + batch_x.clear(); | ||
| 291 | + while (k < static_cast<int32_t>(x.size()) && should_continue) { | ||
| 292 | + batch_x.push_back(std::move(x[k])); | ||
| 293 | + | ||
| 294 | + ++k; | ||
| 295 | + } | ||
| 296 | + | ||
| 297 | + if (!batch_x.empty()) { | ||
| 298 | + auto audio = Process(batch_x, sid, speed); | ||
| 299 | + ans.sample_rate = audio.sample_rate; | ||
| 300 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 301 | + audio.samples.end()); | ||
| 302 | + if (callback) { | ||
| 303 | + callback(audio.samples.data(), audio.samples.size(), 1.0); | ||
| 304 | + // Caution(fangjun): audio is freed when the callback returns, so users | ||
| 305 | + // should copy the data if they want to access the data after | ||
| 306 | + // the callback returns to avoid segmentation fault. | ||
| 307 | + } | ||
| 308 | + } | ||
| 309 | + | ||
| 310 | + return ans; | ||
| 311 | + } | ||
| 312 | + | ||
| 313 | + private: | ||
| 314 | + template <typename Manager> | ||
| 315 | + void InitFrontend(Manager *mgr) { | ||
| 316 | + const auto &meta_data = model_->GetMetaData(); | ||
| 317 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 318 | + mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir, | ||
| 319 | + meta_data); | ||
| 320 | + } | ||
| 321 | + | ||
| 322 | + void InitFrontend() { | ||
| 323 | + const auto &meta_data = model_->GetMetaData(); | ||
| 324 | + | ||
| 325 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 326 | + config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data); | ||
| 327 | + } | ||
| 328 | + | ||
| 329 | + GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, | ||
| 330 | + int32_t sid, float speed) const { | ||
| 331 | + int32_t num_tokens = 0; | ||
| 332 | + for (const auto &k : tokens) { | ||
| 333 | + num_tokens += k.size(); | ||
| 334 | + } | ||
| 335 | + | ||
| 336 | + std::vector<int64_t> x; | ||
| 337 | + x.reserve(num_tokens); | ||
| 338 | + for (const auto &k : tokens) { | ||
| 339 | + x.insert(x.end(), k.begin(), k.end()); | ||
| 340 | + } | ||
| 341 | + | ||
| 342 | + auto memory_info = | ||
| 343 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 344 | + | ||
| 345 | + std::array<int64_t, 2> x_shape = {1, static_cast<int32_t>(x.size())}; | ||
| 346 | + Ort::Value x_tensor = Ort::Value::CreateTensor( | ||
| 347 | + memory_info, x.data(), x.size(), x_shape.data(), x_shape.size()); | ||
| 348 | + | ||
| 349 | + Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed); | ||
| 350 | + | ||
| 351 | + std::vector<int64_t> audio_shape = | ||
| 352 | + audio.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 353 | + | ||
| 354 | + int64_t total = 1; | ||
| 355 | + // The output shape may be (1, 1, total) or (1, total) or (total,) | ||
| 356 | + for (auto i : audio_shape) { | ||
| 357 | + total *= i; | ||
| 358 | + } | ||
| 359 | + | ||
| 360 | + const float *p = audio.GetTensorData<float>(); | ||
| 361 | + | ||
| 362 | + GeneratedAudio ans; | ||
| 363 | + ans.sample_rate = model_->GetMetaData().sample_rate; | ||
| 364 | + ans.samples = std::vector<float>(p, p + total); | ||
| 365 | + return ans; | ||
| 366 | + } | ||
| 367 | + | ||
| 368 | + private: | ||
| 369 | + OfflineTtsConfig config_; | ||
| 370 | + std::unique_ptr<OfflineTtsKokoroModel> model_; | ||
| 371 | + std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_; | ||
| 372 | + std::unique_ptr<OfflineTtsFrontend> frontend_; | ||
| 373 | +}; | ||
| 374 | + | ||
| 375 | +} // namespace sherpa_onnx | ||
| 376 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_ |
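Generate() above synthesizes one sentence at a time and streams each chunk through the optional callback, whose return value decides whether generation continues; the C++ side frees each chunk once the callback returns. A sketch of consuming this from Python is shown below. It assumes the callback keyword and the (samples, progress) signature used by the existing offline-tts-play.py example; verify against that file before relying on it.

import numpy as np

def on_audio(samples: np.ndarray, progress: float) -> int:
    # Copy the samples if you keep them: the underlying buffer is freed
    # once this callback returns (see the Caution note in the code above).
    chunk = np.array(samples, copy=True)
    print(f"received {len(chunk)} samples, progress {progress:.0%}")
    return 1  # return 0 to stop generation early

# With a configured sherpa_onnx.OfflineTts instance named tts:
# audio = tts.generate(text, sid=0, speed=1.0, callback=on_audio)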
sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 10 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) { | ||
| 15 | + po->Register("kokoro-model", &model, "Path to Kokoro model"); | ||
| 16 | + po->Register("kokoro-voices", &voices, | ||
| 17 | + "Path to voices.bin for Kokoro models"); | ||
| 18 | + po->Register("kokoro-tokens", &tokens, | ||
| 19 | + "Path to tokens.txt for Kokoro models"); | ||
| 20 | + po->Register("kokoro-data-dir", &data_dir, | ||
| 21 | + "Path to the directory containing dict for espeak-ng."); | ||
| 22 | + po->Register("kokoro-length-scale", &length_scale, | ||
| 23 | + "Speech speed. Larger->Slower; Smaller->faster."); | ||
| 24 | +} | ||
| 25 | + | ||
| 26 | +bool OfflineTtsKokoroModelConfig::Validate() const { | ||
| 27 | + if (model.empty()) { | ||
| 28 | + SHERPA_ONNX_LOGE("Please provide --kokoro-model"); | ||
| 29 | + return false; | ||
| 30 | + } | ||
| 31 | + | ||
| 32 | + if (!FileExists(model)) { | ||
| 33 | + SHERPA_ONNX_LOGE("--kokoro-model: '%s' does not exist", model.c_str()); | ||
| 34 | + return false; | ||
| 35 | + } | ||
| 36 | + | ||
| 37 | + if (tokens.empty()) { | ||
| 38 | + SHERPA_ONNX_LOGE("Please provide --kokoro-tokens"); | ||
| 39 | + return false; | ||
| 40 | + } | ||
| 41 | + | ||
| 42 | + if (!FileExists(tokens)) { | ||
| 43 | + SHERPA_ONNX_LOGE("--kokoro-tokens: '%s' does not exist", tokens.c_str()); | ||
| 44 | + return false; | ||
| 45 | + } | ||
| 46 | + | ||
| 47 | + if (data_dir.empty()) { | ||
| 48 | + SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir"); | ||
| 49 | + return false; | ||
| 50 | + } | ||
| 51 | + | ||
| 52 | + if (!FileExists(data_dir + "/phontab")) { | ||
| 53 | + SHERPA_ONNX_LOGE( | ||
| 54 | + "'%s/phontab' does not exist. Please check --kokoro-data-dir", | ||
| 55 | + data_dir.c_str()); | ||
| 56 | + return false; | ||
| 57 | + } | ||
| 58 | + | ||
| 59 | + if (!FileExists(data_dir + "/phonindex")) { | ||
| 60 | + SHERPA_ONNX_LOGE( | ||
| 61 | + "'%s/phonindex' does not exist. Please check --kokoro-data-dir", | ||
| 62 | + data_dir.c_str()); | ||
| 63 | + return false; | ||
| 64 | + } | ||
| 65 | + | ||
| 66 | + if (!FileExists(data_dir + "/phondata")) { | ||
| 67 | + SHERPA_ONNX_LOGE( | ||
| 68 | + "'%s/phondata' does not exist. Please check --kokoro-data-dir", | ||
| 69 | + data_dir.c_str()); | ||
| 70 | + return false; | ||
| 71 | + } | ||
| 72 | + | ||
| 73 | + if (!FileExists(data_dir + "/intonations")) { | ||
| 74 | + SHERPA_ONNX_LOGE( | ||
| 75 | + "'%s/intonations' does not exist. Please check --kokoro-data-dir", | ||
| 76 | + data_dir.c_str()); | ||
| 77 | + return false; | ||
| 78 | + } | ||
| 79 | + | ||
| 80 | + return true; | ||
| 81 | +} | ||
| 82 | + | ||
| 83 | +std::string OfflineTtsKokoroModelConfig::ToString() const { | ||
| 84 | + std::ostringstream os; | ||
| 85 | + | ||
| 86 | + os << "OfflineTtsKokoroModelConfig("; | ||
| 87 | + os << "model=\"" << model << "\", "; | ||
| 88 | + os << "voices=\"" << voices << "\", "; | ||
| 89 | + os << "tokens=\"" << tokens << "\", "; | ||
| 90 | + os << "data_dir=\"" << data_dir << "\", "; | ||
| 91 | + os << "length_scale=" << length_scale << ")"; | ||
| 92 | + | ||
| 93 | + return os.str(); | ||
| 94 | +} | ||
| 95 | + | ||
| 96 | +} // namespace sherpa_onnx |
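Validate() above requires four espeak-ng files inside --kokoro-data-dir. A small Python preflight check mirroring those checks (a sketch; the directory path is an example, and the file names come from the code above):

import os

data_dir = "./kokoro-en-v0_19/espeak-ng-data"  # example path
for name in ("phontab", "phonindex", "phondata", "intonations"):
    path = os.path.join(data_dir, name)
    if not os.path.exists(path):
        raise FileNotFoundError(f"'{path}' does not exist; check --kokoro-data-dir")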
sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kokoro-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include <string> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +struct OfflineTtsKokoroModelConfig { | ||
| 15 | + std::string model; | ||
| 16 | + std::string voices; | ||
| 17 | + std::string tokens; | ||
| 18 | + | ||
| 19 | + std::string data_dir; | ||
| 20 | + | ||
| 21 | + // speed = 1 / length_scale | ||
| 22 | + float length_scale = 1.0; | ||
| 23 | + | ||
| 24 | + OfflineTtsKokoroModelConfig() = default; | ||
| 25 | + | ||
| 26 | + OfflineTtsKokoroModelConfig(const std::string &model, | ||
| 27 | + const std::string &voices, | ||
| 28 | + const std::string &tokens, | ||
| 29 | + const std::string &data_dir, float length_scale) | ||
| 30 | + : model(model), | ||
| 31 | + voices(voices), | ||
| 32 | + tokens(tokens), | ||
| 33 | + data_dir(data_dir), | ||
| 34 | + length_scale(length_scale) {} | ||
| 35 | + | ||
| 36 | + void Register(ParseOptions *po); | ||
| 37 | + bool Validate() const; | ||
| 38 | + | ||
| 39 | + std::string ToString() const; | ||
| 40 | +}; | ||
| 41 | + | ||
| 42 | +} // namespace sherpa_onnx | ||
| 43 | + | ||
| 44 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ |
sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_ | ||
| 7 | + | ||
| 8 | +#include <cstdint> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +// please refer to | ||
| 14 | +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/add-meta-data.py | ||
| 15 | +struct OfflineTtsKokoroModelMetaData { | ||
| 16 | + int32_t sample_rate = 0; | ||
| 17 | + int32_t num_speakers = 0; | ||
| 18 | + int32_t version = 1; | ||
| 19 | + int32_t has_espeak = 1; | ||
| 20 | + int32_t max_token_len = 0; | ||
| 21 | +}; | ||
| 22 | + | ||
| 23 | +} // namespace sherpa_onnx | ||
| 24 | + | ||
| 25 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_ |
sherpa-onnx/csrc/offline-tts-kokoro-model.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kokoro-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h" | ||
| 6 | + | ||
| 7 | +#include <algorithm> | ||
| 8 | +#include <string> | ||
| 9 | +#include <utility> | ||
| 10 | +#include <vector> | ||
| 11 | + | ||
| 12 | +#if __ANDROID_API__ >= 9 | ||
| 13 | +#include "android/asset_manager.h" | ||
| 14 | +#include "android/asset_manager_jni.h" | ||
| 15 | +#endif | ||
| 16 | + | ||
| 17 | +#if __OHOS__ | ||
| 18 | +#include "rawfile/raw_file_manager.h" | ||
| 19 | +#endif | ||
| 20 | + | ||
| 21 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 22 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 23 | +#include "sherpa-onnx/csrc/session.h" | ||
| 24 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 25 | + | ||
| 26 | +namespace sherpa_onnx { | ||
| 27 | + | ||
| 28 | +class OfflineTtsKokoroModel::Impl { | ||
| 29 | + public: | ||
| 30 | + explicit Impl(const OfflineTtsModelConfig &config) | ||
| 31 | + : config_(config), | ||
| 32 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 33 | + sess_opts_(GetSessionOptions(config)), | ||
| 34 | + allocator_{} { | ||
| 35 | + auto model_buf = ReadFile(config.kokoro.model); | ||
| 36 | + auto voices_buf = ReadFile(config.kokoro.voices); | ||
| 37 | + Init(model_buf.data(), model_buf.size(), voices_buf.data(), | ||
| 38 | + voices_buf.size()); | ||
| 39 | + } | ||
| 40 | + | ||
| 41 | + template <typename Manager> | ||
| 42 | + Impl(Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 43 | + : config_(config), | ||
| 44 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 45 | + sess_opts_(GetSessionOptions(config)), | ||
| 46 | + allocator_{} { | ||
| 47 | + auto model_buf = ReadFile(mgr, config.kokoro.model); | ||
| 48 | + auto voices_buf = ReadFile(mgr, config.kokoro.voices); | ||
| 49 | + Init(model_buf.data(), model_buf.size(), voices_buf.data(), | ||
| 50 | + voices_buf.size()); | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | + const OfflineTtsKokoroModelMetaData &GetMetaData() const { | ||
| 54 | + return meta_data_; | ||
| 55 | + } | ||
| 56 | + | ||
| 57 | + Ort::Value Run(Ort::Value x, int32_t sid, float speed) { | ||
| 58 | + auto memory_info = | ||
| 59 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 60 | + | ||
| 61 | + std::vector<int64_t> x_shape = x.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 62 | + if (x_shape[0] != 1) { | ||
| 63 | + SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d", | ||
| 64 | + static_cast<int32_t>(x_shape[0])); | ||
| 65 | + exit(-1); | ||
| 66 | + } | ||
| 67 | + | ||
| 68 | + // there is a 0 at the front and end of x | ||
| 69 | + int32_t len = static_cast<int32_t>(x_shape[1]) - 2; | ||
| 70 | + int32_t num_speakers = meta_data_.num_speakers; | ||
| 71 | + int32_t dim0 = style_dim_[0]; | ||
| 72 | + int32_t dim1 = style_dim_[2]; | ||
| 73 | + if (len >= dim0) { | ||
| 74 | + SHERPA_ONNX_LOGE("Too many input tokens: %d. It must be less than %d", len, dim0); | ||
| 75 | + SHERPA_ONNX_EXIT(-1); | ||
| 76 | + } | ||
| 77 | + | ||
| 78 | + /*const*/ float *p = styles_.data() + sid * dim0 * dim1 + len * dim1; | ||
| 79 | + | ||
| 80 | + std::array<int64_t, 2> style_embedding_shape = {1, dim1}; | ||
| 81 | + Ort::Value style_embedding = Ort::Value::CreateTensor( | ||
| 82 | + memory_info, p, dim1, style_embedding_shape.data(), | ||
| 83 | + style_embedding_shape.size()); | ||
| 84 | + | ||
| 85 | + int64_t speed_shape = 1; | ||
| 86 | + | ||
| 87 | + Ort::Value speed_tensor = | ||
| 88 | + Ort::Value::CreateTensor(memory_info, &speed, 1, &speed_shape, 1); | ||
| 89 | + | ||
| 90 | + std::array<Ort::Value, 3> inputs = { | ||
| 91 | + std::move(x), std::move(style_embedding), std::move(speed_tensor)}; | ||
| 92 | + | ||
| 93 | + auto out = | ||
| 94 | + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), | ||
| 95 | + output_names_ptr_.data(), output_names_ptr_.size()); | ||
| 96 | + | ||
| 97 | + return std::move(out[0]); | ||
| 98 | + } | ||
| 99 | + | ||
| 100 | + private: | ||
| 101 | + void Init(void *model_data, size_t model_data_length, const char *voices_data, | ||
| 102 | + size_t voices_data_length) { | ||
| 103 | + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | ||
| 104 | + sess_opts_); | ||
| 105 | + | ||
| 106 | + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); | ||
| 107 | + | ||
| 108 | + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); | ||
| 109 | + // get meta data | ||
| 110 | + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); | ||
| 111 | + if (config_.debug) { | ||
| 112 | + std::ostringstream os; | ||
| 113 | + os << "---kokoro model---\n"; | ||
| 114 | + PrintModelMetadata(os, meta_data); | ||
| 115 | + | ||
| 116 | + os << "----------input names----------\n"; | ||
| 117 | + int32_t i = 0; | ||
| 118 | + for (const auto &s : input_names_) { | ||
| 119 | + os << i << " " << s << "\n"; | ||
| 120 | + ++i; | ||
| 121 | + } | ||
| 122 | + os << "----------output names----------\n"; | ||
| 123 | + i = 0; | ||
| 124 | + for (const auto &s : output_names_) { | ||
| 125 | + os << i << " " << s << "\n"; | ||
| 126 | + ++i; | ||
| 127 | + } | ||
| 128 | + | ||
| 129 | +#if __OHOS__ | ||
| 130 | + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); | ||
| 131 | +#else | ||
| 132 | + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); | ||
| 133 | +#endif | ||
| 134 | + } | ||
| 135 | + | ||
| 136 | + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below | ||
| 137 | + SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate"); | ||
| 138 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); | ||
| 139 | + SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); | ||
| 140 | + SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); | ||
| 141 | + | ||
| 142 | + if (config_.debug) { | ||
| 143 | + std::vector<std::string> speaker_names; | ||
| 144 | + SHERPA_ONNX_READ_META_DATA_VEC_STRING(speaker_names, "speaker_names"); | ||
| 145 | + std::ostringstream os; | ||
| 146 | + os << "\n"; | ||
| 147 | + for (int32_t i = 0; i != speaker_names.size(); ++i) { | ||
| 148 | + os << i << "->" << speaker_names[i] << ", "; | ||
| 149 | + } | ||
| 150 | + os << "\n"; | ||
| 151 | + | ||
| 152 | +#if __OHOS__ | ||
| 153 | + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); | ||
| 154 | +#else | ||
| 155 | + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); | ||
| 156 | +#endif | ||
| 157 | + } | ||
| 158 | + | ||
| 159 | + SHERPA_ONNX_READ_META_DATA_VEC(style_dim_, "style_dim"); | ||
| 160 | + if (style_dim_.size() != 3) { | ||
| 161 | + SHERPA_ONNX_LOGE("style_dim should be 3-d, given: %d", | ||
| 162 | + static_cast<int32_t>(style_dim_.size())); | ||
| 163 | + SHERPA_ONNX_EXIT(-1); | ||
| 164 | + } | ||
| 165 | + | ||
| 166 | + if (style_dim_[1] != 1) { | ||
| 167 | + SHERPA_ONNX_LOGE("style_dim[1] should be 1, given: %d", style_dim_[1]); | ||
| 168 | + SHERPA_ONNX_EXIT(-1); | ||
| 169 | + } | ||
| 170 | + | ||
| 171 | + int32_t actual_num_floats = voices_data_length / sizeof(float); | ||
| 172 | + int32_t expected_num_floats = | ||
| 173 | + style_dim_[0] * style_dim_[2] * meta_data_.num_speakers; | ||
| 174 | + | ||
| 175 | + if (actual_num_floats != expected_num_floats) { | ||
| 176 | +#if __OHOS__ | ||
| 177 | + SHERPA_ONNX_LOGE( | ||
| 178 | + "Corrupted --kokoro-voices '%{public}s'. Expected #floats: " | ||
| 179 | + "%{public}d, actual: %{public}d", | ||
| 180 | + config_.kokoro.voices.c_str(), expected_num_floats, | ||
| 181 | + actual_num_floats); | ||
| 182 | +#else | ||
| 183 | + SHERPA_ONNX_LOGE( | ||
| 184 | + "Corrupted --kokoro-voices '%s'. Expected #floats: %d, actual: %d", | ||
| 185 | + config_.kokoro.voices.c_str(), expected_num_floats, | ||
| 186 | + actual_num_floats); | ||
| 187 | +#endif | ||
| 188 | + | ||
| 189 | + SHERPA_ONNX_EXIT(-1); | ||
| 190 | + } | ||
| 191 | + | ||
| 192 | + styles_ = std::vector<float>( | ||
| 193 | + reinterpret_cast<const float *>(voices_data), | ||
| 194 | + reinterpret_cast<const float *>(voices_data) + expected_num_floats); | ||
| 195 | + | ||
| 196 | + meta_data_.max_token_len = style_dim_[0]; | ||
| 197 | + } | ||
| 198 | + | ||
| 199 | + private: | ||
| 200 | + OfflineTtsModelConfig config_; | ||
| 201 | + Ort::Env env_; | ||
| 202 | + Ort::SessionOptions sess_opts_; | ||
| 203 | + Ort::AllocatorWithDefaultOptions allocator_; | ||
| 204 | + | ||
| 205 | + std::unique_ptr<Ort::Session> sess_; | ||
| 206 | + | ||
| 207 | + std::vector<std::string> input_names_; | ||
| 208 | + std::vector<const char *> input_names_ptr_; | ||
| 209 | + | ||
| 210 | + std::vector<std::string> output_names_; | ||
| 211 | + std::vector<const char *> output_names_ptr_; | ||
| 212 | + | ||
| 213 | + OfflineTtsKokoroModelMetaData meta_data_; | ||
| 214 | + std::vector<int32_t> style_dim_; | ||
| 215 | + | ||
| 216 | + // (num_speakers, style_dim_[0], style_dim_[2]) | ||
| 217 | + std::vector<float> styles_; | ||
| 218 | +}; | ||
| 219 | + | ||
| 220 | +OfflineTtsKokoroModel::OfflineTtsKokoroModel( | ||
| 221 | + const OfflineTtsModelConfig &config) | ||
| 222 | + : impl_(std::make_unique<Impl>(config)) {} | ||
| 223 | + | ||
| 224 | +template <typename Manager> | ||
| 225 | +OfflineTtsKokoroModel::OfflineTtsKokoroModel( | ||
| 226 | + Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 227 | + : impl_(std::make_unique<Impl>(mgr, config)) {} | ||
| 228 | + | ||
| 229 | +OfflineTtsKokoroModel::~OfflineTtsKokoroModel() = default; | ||
| 230 | + | ||
| 231 | +const OfflineTtsKokoroModelMetaData &OfflineTtsKokoroModel::GetMetaData() | ||
| 232 | + const { | ||
| 233 | + return impl_->GetMetaData(); | ||
| 234 | +} | ||
| 235 | + | ||
| 236 | +Ort::Value OfflineTtsKokoroModel::Run(Ort::Value x, int64_t sid /*= 0*/, | ||
| 237 | + float speed /*= 1.0*/) const { | ||
| 238 | + return impl_->Run(std::move(x), sid, speed); | ||
| 239 | +} | ||
| 240 | + | ||
| 241 | +#if __ANDROID_API__ >= 9 | ||
| 242 | +template OfflineTtsKokoroModel::OfflineTtsKokoroModel( | ||
| 243 | + AAssetManager *mgr, const OfflineTtsModelConfig &config); | ||
| 244 | +#endif | ||
| 245 | + | ||
| 246 | +#if __OHOS__ | ||
| 247 | +template OfflineTtsKokoroModel::OfflineTtsKokoroModel( | ||
| 248 | + NativeResourceManager *mgr, const OfflineTtsModelConfig &config); | ||
| 249 | +#endif | ||
| 250 | + | ||
| 251 | +} // namespace sherpa_onnx |
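The size check above also pins down the expected layout of `--kokoro-voices`: a raw float32 dump of `num_speakers` style matrices, each of shape `(style_dim[0], style_dim[2])`, with `style_dim[1]` required to be 1. Below is a minimal verification sketch, not part of this commit; the concrete numbers (11 speakers, 511 x 256 per speaker) are assumptions for kokoro-en-v0_19 and are not read from this diff.

```python
# Sketch only: check that voices.bin matches the layout the C++ loader above
# expects. num_speakers and style_dim come from model.onnx metadata in C++;
# the values below are assumptions for kokoro-en-v0_19.
import numpy as np

num_speakers = 11          # assumed: 11 voices in kokoro-en-v0_19
style_dim = (511, 1, 256)  # assumed "style_dim" metadata; style_dim[1] must be 1

data = np.fromfile("./kokoro-en-v0_19/voices.bin", dtype=np.float32)
expected = num_speakers * style_dim[0] * style_dim[2]
if data.size != expected:
    raise ValueError(f"Corrupted voices.bin: expected {expected} floats, got {data.size}")

# One (max_token_len, style_dim) style matrix per speaker; max_token_len is
# taken from style_dim[0], exactly as in the C++ code above.
styles = data.reshape(num_speakers, style_dim[0], style_dim[2])
print(styles.shape)  # (11, 511, 256) under the assumptions above
```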
sherpa-onnx/csrc/offline-tts-kokoro-model.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-kokoro-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_ | ||
| 7 | + | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +#include "onnxruntime_cxx_api.h" // NOLINT | ||
| 12 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h" | ||
| 13 | +#include "sherpa-onnx/csrc/offline-tts-model-config.h" | ||
| 14 | + | ||
| 15 | +namespace sherpa_onnx { | ||
| 16 | + | ||
| 17 | +class OfflineTtsKokoroModel { | ||
| 18 | + public: | ||
| 19 | + ~OfflineTtsKokoroModel(); | ||
| 20 | + | ||
| 21 | + explicit OfflineTtsKokoroModel(const OfflineTtsModelConfig &config); | ||
| 22 | + | ||
| 23 | + template <typename Manager> | ||
| 24 | + OfflineTtsKokoroModel(Manager *mgr, const OfflineTtsModelConfig &config); | ||
| 25 | + | ||
| 26 | + // Return a float32 tensor containing the generated audio samples | ||
| 27 | + // of shape (batch_size, num_samples) | ||
| 28 | + Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const; | ||
| 29 | + | ||
| 30 | + const OfflineTtsKokoroModelMetaData &GetMetaData() const; | ||
| 31 | + | ||
| 32 | + private: | ||
| 33 | + class Impl; | ||
| 34 | + std::unique_ptr<Impl> impl_; | ||
| 35 | +}; | ||
| 36 | + | ||
| 37 | +} // namespace sherpa_onnx | ||
| 38 | + | ||
| 39 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_ |
| 1 | -// sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h | 1 | +// sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h |
| 2 | // | 2 | // |
| 3 | // Copyright (c) 2023 Xiaomi Corporation | 3 | // Copyright (c) 2023 Xiaomi Corporation |
| 4 | 4 | ||
| 5 | -#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ | ||
| 6 | -#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ | 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_ |
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_ | ||
| 7 | 7 | ||
| 8 | #include <cstdint> | 8 | #include <cstdint> |
| 9 | #include <string> | 9 | #include <string> |
| @@ -25,4 +25,4 @@ struct OfflineTtsMatchaModelMetaData { | @@ -25,4 +25,4 @@ struct OfflineTtsMatchaModelMetaData { | ||
| 25 | 25 | ||
| 26 | } // namespace sherpa_onnx | 26 | } // namespace sherpa_onnx |
| 27 | 27 | ||
| 28 | -#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ | 28 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_ |
| @@ -9,7 +9,7 @@ | @@ -9,7 +9,7 @@ | ||
| 9 | #include <string> | 9 | #include <string> |
| 10 | 10 | ||
| 11 | #include "onnxruntime_cxx_api.h" // NOLINT | 11 | #include "onnxruntime_cxx_api.h" // NOLINT |
| 12 | -#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h" | 12 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h" |
| 13 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" | 13 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" |
| 14 | 14 | ||
| 15 | namespace sherpa_onnx { | 15 | namespace sherpa_onnx { |
| @@ -11,6 +11,7 @@ namespace sherpa_onnx { | @@ -11,6 +11,7 @@ namespace sherpa_onnx { | ||
| 11 | void OfflineTtsModelConfig::Register(ParseOptions *po) { | 11 | void OfflineTtsModelConfig::Register(ParseOptions *po) { |
| 12 | vits.Register(po); | 12 | vits.Register(po); |
| 13 | matcha.Register(po); | 13 | matcha.Register(po); |
| 14 | + kokoro.Register(po); | ||
| 14 | 15 | ||
| 15 | po->Register("num-threads", &num_threads, | 16 | po->Register("num-threads", &num_threads, |
| 16 | "Number of threads to run the neural network"); | 17 | "Number of threads to run the neural network"); |
| @@ -32,7 +33,11 @@ bool OfflineTtsModelConfig::Validate() const { | @@ -32,7 +33,11 @@ bool OfflineTtsModelConfig::Validate() const { | ||
| 32 | return vits.Validate(); | 33 | return vits.Validate(); |
| 33 | } | 34 | } |
| 34 | 35 | ||
| 35 | - return matcha.Validate(); | 36 | + if (!matcha.acoustic_model.empty()) { |
| 37 | + return matcha.Validate(); | ||
| 38 | + } | ||
| 39 | + | ||
| 40 | + return kokoro.Validate(); | ||
| 36 | } | 41 | } |
| 37 | 42 | ||
| 38 | std::string OfflineTtsModelConfig::ToString() const { | 43 | std::string OfflineTtsModelConfig::ToString() const { |
| @@ -41,6 +46,7 @@ std::string OfflineTtsModelConfig::ToString() const { | @@ -41,6 +46,7 @@ std::string OfflineTtsModelConfig::ToString() const { | ||
| 41 | os << "OfflineTtsModelConfig("; | 46 | os << "OfflineTtsModelConfig("; |
| 42 | os << "vits=" << vits.ToString() << ", "; | 47 | os << "vits=" << vits.ToString() << ", "; |
| 43 | os << "matcha=" << matcha.ToString() << ", "; | 48 | os << "matcha=" << matcha.ToString() << ", "; |
| 49 | + os << "kokoro=" << kokoro.ToString() << ", "; | ||
| 44 | os << "num_threads=" << num_threads << ", "; | 50 | os << "num_threads=" << num_threads << ", "; |
| 45 | os << "debug=" << (debug ? "True" : "False") << ", "; | 51 | os << "debug=" << (debug ? "True" : "False") << ", "; |
| 46 | os << "provider=\"" << provider << "\")"; | 52 | os << "provider=\"" << provider << "\")"; |
| @@ -7,6 +7,7 @@ | @@ -7,6 +7,7 @@ | ||
| 7 | 7 | ||
| 8 | #include <string> | 8 | #include <string> |
| 9 | 9 | ||
| 10 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" | ||
| 10 | #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" | 11 | #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" |
| 11 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" | 12 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" |
| 12 | #include "sherpa-onnx/csrc/parse-options.h" | 13 | #include "sherpa-onnx/csrc/parse-options.h" |
| @@ -16,6 +17,7 @@ namespace sherpa_onnx { | @@ -16,6 +17,7 @@ namespace sherpa_onnx { | ||
| 16 | struct OfflineTtsModelConfig { | 17 | struct OfflineTtsModelConfig { |
| 17 | OfflineTtsVitsModelConfig vits; | 18 | OfflineTtsVitsModelConfig vits; |
| 18 | OfflineTtsMatchaModelConfig matcha; | 19 | OfflineTtsMatchaModelConfig matcha; |
| 20 | + OfflineTtsKokoroModelConfig kokoro; | ||
| 19 | 21 | ||
| 20 | int32_t num_threads = 1; | 22 | int32_t num_threads = 1; |
| 21 | bool debug = false; | 23 | bool debug = false; |
| @@ -25,10 +27,12 @@ struct OfflineTtsModelConfig { | @@ -25,10 +27,12 @@ struct OfflineTtsModelConfig { | ||
| 25 | 27 | ||
| 26 | OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, | 28 | OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, |
| 27 | const OfflineTtsMatchaModelConfig &matcha, | 29 | const OfflineTtsMatchaModelConfig &matcha, |
| 30 | + const OfflineTtsKokoroModelConfig &kokoro, | ||
| 28 | int32_t num_threads, bool debug, | 31 | int32_t num_threads, bool debug, |
| 29 | const std::string &provider) | 32 | const std::string &provider) |
| 30 | : vits(vits), | 33 | : vits(vits), |
| 31 | matcha(matcha), | 34 | matcha(matcha), |
| 35 | + kokoro(kokoro), | ||
| 32 | num_threads(num_threads), | 36 | num_threads(num_threads), |
| 33 | debug(debug), | 37 | debug(debug), |
| 34 | provider(provider) {} | 38 | provider(provider) {} |
| 1 | -// sherpa-onnx/csrc/offline-tts-vits-model-metadata.h | 1 | +// sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h |
| 2 | // | 2 | // |
| 3 | // Copyright (c) 2023 Xiaomi Corporation | 3 | // Copyright (c) 2023 Xiaomi Corporation |
| 4 | 4 | ||
| 5 | -#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_ | ||
| 6 | -#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_ | 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_ |
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_ | ||
| 7 | 7 | ||
| 8 | #include <cstdint> | 8 | #include <cstdint> |
| 9 | #include <string> | 9 | #include <string> |
| @@ -46,4 +46,4 @@ struct OfflineTtsVitsModelMetaData { | @@ -46,4 +46,4 @@ struct OfflineTtsVitsModelMetaData { | ||
| 46 | 46 | ||
| 47 | } // namespace sherpa_onnx | 47 | } // namespace sherpa_onnx |
| 48 | 48 | ||
| 49 | -#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_ | 49 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_ |
| @@ -10,7 +10,7 @@ | @@ -10,7 +10,7 @@ | ||
| 10 | 10 | ||
| 11 | #include "onnxruntime_cxx_api.h" // NOLINT | 11 | #include "onnxruntime_cxx_api.h" // NOLINT |
| 12 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" | 12 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" |
| 13 | -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | 13 | +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" |
| 14 | 14 | ||
| 15 | namespace sherpa_onnx { | 15 | namespace sherpa_onnx { |
| 16 | 16 |
| @@ -155,6 +155,36 @@ static std::vector<int64_t> PiperPhonemesToIdsMatcha( | @@ -155,6 +155,36 @@ static std::vector<int64_t> PiperPhonemesToIdsMatcha( | ||
| 155 | return ans; | 155 | return ans; |
| 156 | } | 156 | } |
| 157 | 157 | ||
| 158 | +static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro( | ||
| 159 | + const std::unordered_map<char32_t, int32_t> &token2id, | ||
| 160 | + const std::vector<piper::Phoneme> &phonemes, int32_t max_len) { | ||
| 161 | + std::vector<std::vector<int64_t>> ans; | ||
| 162 | + | ||
| 163 | + std::vector<int64_t> current; | ||
| 164 | + current.reserve(phonemes.size()); | ||
| 165 | + | ||
| 166 | + for (auto p : phonemes) { | ||
| 167 | + if (token2id.count(p)) { | ||
| 168 | + if (current.size() > max_len - 1) { | ||
| 169 | + current.push_back(0); | ||
| 170 | + ans.push_back(std::move(current)); | ||
| 171 | + | ||
| 172 | + current.reserve(phonemes.size()); | ||
| 173 | + current.push_back(0); | ||
| 174 | + } | ||
| 175 | + | ||
| 176 | + current.push_back(token2id.at(p)); | ||
| 177 | + } else { | ||
| 178 | + SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.", | ||
| 179 | + static_cast<uint32_t>(p)); | ||
| 180 | + } | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + current.push_back(0); | ||
| 184 | + ans.push_back(std::move(current)); | ||
| 185 | + return ans; | ||
| 186 | +} | ||
| 187 | + | ||
| 158 | static std::vector<int64_t> CoquiPhonemesToIds( | 188 | static std::vector<int64_t> CoquiPhonemesToIds( |
| 159 | const std::unordered_map<char32_t, int32_t> &token2id, | 189 | const std::unordered_map<char32_t, int32_t> &token2id, |
| 160 | const std::vector<piper::Phoneme> &phonemes, | 190 | const std::vector<piper::Phoneme> &phonemes, |
| @@ -269,6 +299,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | @@ -269,6 +299,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 269 | InitEspeak(data_dir); | 299 | InitEspeak(data_dir); |
| 270 | } | 300 | } |
| 271 | 301 | ||
| 302 | +PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 303 | + const std::string &tokens, const std::string &data_dir, | ||
| 304 | + const OfflineTtsKokoroModelMetaData &kokoro_meta_data) | ||
| 305 | + : kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) { | ||
| 306 | + { | ||
| 307 | + std::ifstream is(tokens); | ||
| 308 | + token2id_ = ReadTokens(is); | ||
| 309 | + } | ||
| 310 | + | ||
| 311 | + InitEspeak(data_dir); | ||
| 312 | +} | ||
| 313 | + | ||
| 272 | template <typename Manager> | 314 | template <typename Manager> |
| 273 | PiperPhonemizeLexicon::PiperPhonemizeLexicon( | 315 | PiperPhonemizeLexicon::PiperPhonemizeLexicon( |
| 274 | Manager *mgr, const std::string &tokens, const std::string &data_dir, | 316 | Manager *mgr, const std::string &tokens, const std::string &data_dir, |
| @@ -286,10 +328,29 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | @@ -286,10 +328,29 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 286 | InitEspeak(data_dir); | 328 | InitEspeak(data_dir); |
| 287 | } | 329 | } |
| 288 | 330 | ||
| 331 | +template <typename Manager> | ||
| 332 | +PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 333 | + Manager *mgr, const std::string &tokens, const std::string &data_dir, | ||
| 334 | + const OfflineTtsKokoroModelMetaData &kokoro_meta_data) | ||
| 335 | + : kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) { | ||
| 336 | + { | ||
| 337 | + auto buf = ReadFile(mgr, tokens); | ||
| 338 | + std::istrstream is(buf.data(), buf.size()); | ||
| 339 | + token2id_ = ReadTokens(is); | ||
| 340 | + } | ||
| 341 | + | ||
| 342 | + // We should copy the directory of espeak-ng-data from the asset to | ||
| 343 | + // some internal or external storage and then pass the directory to | ||
| 344 | + // data_dir. | ||
| 345 | + InitEspeak(data_dir); | ||
| 346 | +} | ||
| 347 | + | ||
| 289 | std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( | 348 | std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( |
| 290 | const std::string &text, const std::string &voice /*= ""*/) const { | 349 | const std::string &text, const std::string &voice /*= ""*/) const { |
| 291 | if (is_matcha_) { | 350 | if (is_matcha_) { |
| 292 | return ConvertTextToTokenIdsMatcha(text, voice); | 351 | return ConvertTextToTokenIdsMatcha(text, voice); |
| 352 | + } else if (is_kokoro_) { | ||
| 353 | + return ConvertTextToTokenIdsKokoro(text, voice); | ||
| 293 | } else { | 354 | } else { |
| 294 | return ConvertTextToTokenIdsVits(text, voice); | 355 | return ConvertTextToTokenIdsVits(text, voice); |
| 295 | } | 356 | } |
| @@ -320,6 +381,32 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha( | @@ -320,6 +381,32 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha( | ||
| 320 | return ans; | 381 | return ans; |
| 321 | } | 382 | } |
| 322 | 383 | ||
| 384 | +std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro( | ||
| 385 | + const std::string &text, const std::string &voice /*= ""*/) const { | ||
| 386 | + piper::eSpeakPhonemeConfig config; | ||
| 387 | + | ||
| 388 | + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices | ||
| 389 | + // to list available voices | ||
| 390 | + config.voice = voice; // e.g., voice is en-us | ||
| 391 | + | ||
| 392 | + std::vector<std::vector<piper::Phoneme>> phonemes; | ||
| 393 | + | ||
| 394 | + CallPhonemizeEspeak(text, config, &phonemes); | ||
| 395 | + | ||
| 396 | + std::vector<TokenIDs> ans; | ||
| 397 | + | ||
| 398 | + for (const auto &p : phonemes) { | ||
| 399 | + auto phoneme_ids = | ||
| 400 | + PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len); | ||
| 401 | + | ||
| 402 | + for (auto &ids : phoneme_ids) { | ||
| 403 | + ans.emplace_back(std::move(ids)); | ||
| 404 | + } | ||
| 405 | + } | ||
| 406 | + | ||
| 407 | + return ans; | ||
| 408 | +} | ||
| 409 | + | ||
| 323 | std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits( | 410 | std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits( |
| 324 | const std::string &text, const std::string &voice /*= ""*/) const { | 411 | const std::string &text, const std::string &voice /*= ""*/) const { |
| 325 | piper::eSpeakPhonemeConfig config; | 412 | piper::eSpeakPhonemeConfig config; |
| @@ -363,6 +450,10 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( | @@ -363,6 +450,10 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 363 | template PiperPhonemizeLexicon::PiperPhonemizeLexicon( | 450 | template PiperPhonemizeLexicon::PiperPhonemizeLexicon( |
| 364 | AAssetManager *mgr, const std::string &tokens, const std::string &data_dir, | 451 | AAssetManager *mgr, const std::string &tokens, const std::string &data_dir, |
| 365 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); | 452 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); |
| 453 | + | ||
| 454 | +template PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 455 | + AAssetManager *mgr, const std::string &tokens, const std::string &data_dir, | ||
| 456 | + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); | ||
| 366 | #endif | 457 | #endif |
| 367 | 458 | ||
| 368 | #if __OHOS__ | 459 | #if __OHOS__ |
| @@ -375,6 +466,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( | @@ -375,6 +466,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 375 | NativeResourceManager *mgr, const std::string &tokens, | 466 | NativeResourceManager *mgr, const std::string &tokens, |
| 376 | const std::string &data_dir, | 467 | const std::string &data_dir, |
| 377 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); | 468 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); |
| 469 | + | ||
| 470 | +template PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 471 | + NativeResourceManager *mgr, const std::string &tokens, | ||
| 472 | + const std::string &data_dir, | ||
| 473 | + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); | ||
| 378 | #endif | 474 | #endif |
| 379 | 475 | ||
| 380 | } // namespace sherpa_onnx | 476 | } // namespace sherpa_onnx |
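ConvertTextToTokenIdsKokoro above differs from the VITS/Matcha paths in one respect: because the Kokoro model has a maximum token length (max_token_len, taken from style_dim[0]), PiperPhonemesToIdsKokoro splits a long phoneme sequence into several chunks, closing a full chunk with a trailing 0 and opening the next one with a leading 0. The Python sketch below is an illustration of that chunking logic only, not code from this commit.

```python
# Illustration of the chunking done by PiperPhonemesToIdsKokoro: unknown
# phonemes are skipped, a chunk is closed with a trailing 0 once it holds
# max_len entries, and the next chunk starts with a leading 0.
def phonemes_to_id_chunks(token2id, phonemes, max_len):
    chunks = []
    current = []
    for p in phonemes:
        if p not in token2id:
            continue  # the C++ code logs a warning for unknown phonemes
        if len(current) > max_len - 1:
            current.append(0)
            chunks.append(current)
            current = [0]
        current.append(token2id[p])
    current.append(0)
    chunks.append(current)
    return chunks


# Toy example with max_len=4:
print(phonemes_to_id_chunks({"a": 1, "b": 2, "c": 3}, "abcabc", 4))
# -> [[1, 2, 3, 1, 0], [0, 2, 3, 0]]
```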
| @@ -10,8 +10,9 @@ | @@ -10,8 +10,9 @@ | ||
| 10 | #include <vector> | 10 | #include <vector> |
| 11 | 11 | ||
| 12 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 12 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| 13 | -#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h" | ||
| 14 | -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | 13 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h" |
| 14 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h" | ||
| 15 | +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" | ||
| 15 | 16 | ||
| 16 | namespace sherpa_onnx { | 17 | namespace sherpa_onnx { |
| 17 | 18 | ||
| @@ -23,6 +24,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { | @@ -23,6 +24,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { | ||
| 23 | PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, | 24 | PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, |
| 24 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); | 25 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); |
| 25 | 26 | ||
| 27 | + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, | ||
| 28 | + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); | ||
| 29 | + | ||
| 26 | template <typename Manager> | 30 | template <typename Manager> |
| 27 | PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, | 31 | PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, |
| 28 | const std::string &data_dir, | 32 | const std::string &data_dir, |
| @@ -33,6 +37,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { | @@ -33,6 +37,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { | ||
| 33 | const std::string &data_dir, | 37 | const std::string &data_dir, |
| 34 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); | 38 | const OfflineTtsMatchaModelMetaData &matcha_meta_data); |
| 35 | 39 | ||
| 40 | + template <typename Manager> | ||
| 41 | + PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, | ||
| 42 | + const std::string &data_dir, | ||
| 43 | + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); | ||
| 44 | + | ||
| 36 | std::vector<TokenIDs> ConvertTextToTokenIds( | 45 | std::vector<TokenIDs> ConvertTextToTokenIds( |
| 37 | const std::string &text, const std::string &voice = "") const override; | 46 | const std::string &text, const std::string &voice = "") const override; |
| 38 | 47 | ||
| @@ -43,12 +52,17 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { | @@ -43,12 +52,17 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { | ||
| 43 | std::vector<TokenIDs> ConvertTextToTokenIdsMatcha( | 52 | std::vector<TokenIDs> ConvertTextToTokenIdsMatcha( |
| 44 | const std::string &text, const std::string &voice = "") const; | 53 | const std::string &text, const std::string &voice = "") const; |
| 45 | 54 | ||
| 55 | + std::vector<TokenIDs> ConvertTextToTokenIdsKokoro( | ||
| 56 | + const std::string &text, const std::string &voice = "") const; | ||
| 57 | + | ||
| 46 | private: | 58 | private: |
| 47 | // map unicode codepoint to an integer ID | 59 | // map unicode codepoint to an integer ID |
| 48 | std::unordered_map<char32_t, int32_t> token2id_; | 60 | std::unordered_map<char32_t, int32_t> token2id_; |
| 49 | OfflineTtsVitsModelMetaData vits_meta_data_; | 61 | OfflineTtsVitsModelMetaData vits_meta_data_; |
| 50 | OfflineTtsMatchaModelMetaData matcha_meta_data_; | 62 | OfflineTtsMatchaModelMetaData matcha_meta_data_; |
| 63 | + OfflineTtsKokoroModelMetaData kokoro_meta_data_; | ||
| 51 | bool is_matcha_ = false; | 64 | bool is_matcha_ = false; |
| 65 | + bool is_kokoro_ = false; | ||
| 52 | }; | 66 | }; |
| 53 | 67 | ||
| 54 | } // namespace sherpa_onnx | 68 | } // namespace sherpa_onnx |
| @@ -54,6 +54,7 @@ endif() | @@ -54,6 +54,7 @@ endif() | ||
| 54 | 54 | ||
| 55 | if(SHERPA_ONNX_ENABLE_TTS) | 55 | if(SHERPA_ONNX_ENABLE_TTS) |
| 56 | list(APPEND srcs | 56 | list(APPEND srcs |
| 57 | + offline-tts-kokoro-model-config.cc | ||
| 57 | offline-tts-matcha-model-config.cc | 58 | offline-tts-matcha-model-config.cc |
| 58 | offline-tts-model-config.cc | 59 | offline-tts-model-config.cc |
| 59 | offline-tts-vits-model-config.cc | 60 | offline-tts-vits-model-config.cc |
| 1 | +// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +void PybindOfflineTtsKokoroModelConfig(py::module *m) { | ||
| 14 | + using PyClass = OfflineTtsKokoroModelConfig; | ||
| 15 | + | ||
| 16 | + py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig") | ||
| 17 | + .def(py::init<>()) | ||
| 18 | + .def(py::init<const std::string &, const std::string &, | ||
| 19 | + const std::string &, const std::string &, float>(), | ||
| 20 | + py::arg("model"), py::arg("voices"), py::arg("tokens"), | ||
| 21 | + py::arg("data_dir"), py::arg("length_scale") = 1.0) | ||
| 22 | + .def_readwrite("model", &PyClass::model) | ||
| 23 | + .def_readwrite("voices", &PyClass::voices) | ||
| 24 | + .def_readwrite("tokens", &PyClass::tokens) | ||
| 25 | + .def_readwrite("data_dir", &PyClass::data_dir) | ||
| 26 | + .def_readwrite("length_scale", &PyClass::length_scale) | ||
| 27 | + .def("__str__", &PyClass::ToString) | ||
| 28 | + .def("validate", &PyClass::Validate); | ||
| 29 | +} | ||
| 30 | + | ||
| 31 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindOfflineTtsKokoroModelConfig(py::module *m); | ||
| 13 | + | ||
| 14 | +} // namespace sherpa_onnx | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ |
| @@ -7,6 +7,7 @@ | @@ -7,6 +7,7 @@ | ||
| 7 | #include <string> | 7 | #include <string> |
| 8 | 8 | ||
| 9 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" | 9 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" |
| 10 | +#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h" | ||
| 10 | #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" | 11 | #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" |
| 11 | #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" | 12 | #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" |
| 12 | 13 | ||
| @@ -15,20 +16,24 @@ namespace sherpa_onnx { | @@ -15,20 +16,24 @@ namespace sherpa_onnx { | ||
| 15 | void PybindOfflineTtsModelConfig(py::module *m) { | 16 | void PybindOfflineTtsModelConfig(py::module *m) { |
| 16 | PybindOfflineTtsVitsModelConfig(m); | 17 | PybindOfflineTtsVitsModelConfig(m); |
| 17 | PybindOfflineTtsMatchaModelConfig(m); | 18 | PybindOfflineTtsMatchaModelConfig(m); |
| 19 | + PybindOfflineTtsKokoroModelConfig(m); | ||
| 18 | 20 | ||
| 19 | using PyClass = OfflineTtsModelConfig; | 21 | using PyClass = OfflineTtsModelConfig; |
| 20 | 22 | ||
| 21 | py::class_<PyClass>(*m, "OfflineTtsModelConfig") | 23 | py::class_<PyClass>(*m, "OfflineTtsModelConfig") |
| 22 | .def(py::init<>()) | 24 | .def(py::init<>()) |
| 23 | .def(py::init<const OfflineTtsVitsModelConfig &, | 25 | .def(py::init<const OfflineTtsVitsModelConfig &, |
| 24 | - const OfflineTtsMatchaModelConfig &, int32_t, bool, | 26 | + const OfflineTtsMatchaModelConfig &, |
| 27 | + const OfflineTtsKokoroModelConfig &, int32_t, bool, | ||
| 25 | const std::string &>(), | 28 | const std::string &>(), |
| 26 | py::arg("vits") = OfflineTtsVitsModelConfig{}, | 29 | py::arg("vits") = OfflineTtsVitsModelConfig{}, |
| 27 | py::arg("matcha") = OfflineTtsMatchaModelConfig{}, | 30 | py::arg("matcha") = OfflineTtsMatchaModelConfig{}, |
| 31 | + py::arg("kokoro") = OfflineTtsKokoroModelConfig{}, | ||
| 28 | py::arg("num_threads") = 1, py::arg("debug") = false, | 32 | py::arg("num_threads") = 1, py::arg("debug") = false, |
| 29 | py::arg("provider") = "cpu") | 33 | py::arg("provider") = "cpu") |
| 30 | .def_readwrite("vits", &PyClass::vits) | 34 | .def_readwrite("vits", &PyClass::vits) |
| 31 | .def_readwrite("matcha", &PyClass::matcha) | 35 | .def_readwrite("matcha", &PyClass::matcha) |
| 36 | + .def_readwrite("kokoro", &PyClass::kokoro) | ||
| 32 | .def_readwrite("num_threads", &PyClass::num_threads) | 37 | .def_readwrite("num_threads", &PyClass::num_threads) |
| 33 | .def_readwrite("debug", &PyClass::debug) | 38 | .def_readwrite("debug", &PyClass::debug) |
| 34 | .def_readwrite("provider", &PyClass::provider) | 39 | .def_readwrite("provider", &PyClass::provider) |
| @@ -20,6 +20,7 @@ from _sherpa_onnx import ( | @@ -20,6 +20,7 @@ from _sherpa_onnx import ( | ||
| 20 | OfflineStream, | 20 | OfflineStream, |
| 21 | OfflineTts, | 21 | OfflineTts, |
| 22 | OfflineTtsConfig, | 22 | OfflineTtsConfig, |
| 23 | + OfflineTtsKokoroModelConfig, | ||
| 23 | OfflineTtsMatchaModelConfig, | 24 | OfflineTtsMatchaModelConfig, |
| 24 | OfflineTtsModelConfig, | 25 | OfflineTtsModelConfig, |
| 25 | OfflineTtsVitsModelConfig, | 26 | OfflineTtsVitsModelConfig, |
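With OfflineTtsKokoroModelConfig exported above, the Kokoro model can be driven from Python. The sketch below is a hedged usage example, not code from this commit: OfflineTtsConfig, OfflineTts, and generate() belong to the existing sherpa-onnx Python API (not shown in this diff), soundfile is an external dependency used only to write the wave file, and the paths assume the extracted kokoro-en-v0_19 release directory.

```python
# Hedged usage sketch for the new Kokoro Python binding.
import sherpa_onnx
import soundfile as sf  # external dependency, used only to save the result

config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
            model="./kokoro-en-v0_19/model.onnx",
            voices="./kokoro-en-v0_19/voices.bin",
            tokens="./kokoro-en-v0_19/tokens.txt",
            data_dir="./kokoro-en-v0_19/espeak-ng-data",
        ),
        num_threads=2,
        debug=True,
    ),
)

tts = sherpa_onnx.OfflineTts(config)

# sid selects the voice (0 .. num_speakers - 1); speed scales the speaking rate.
audio = tts.generate("This is a test of the Kokoro text to speech model.",
                     sid=0, speed=1.0)
sf.write("kokoro-0.wav", audio.samples, samplerate=audio.sample_rate)
```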