Committed by
GitHub
Add C++ and Python API for Kokoro 1.0 multilingual TTS model (#1795)
Showing
20 changed files
with
817 additions
and
37 deletions
| @@ -267,6 +267,27 @@ log "Offline TTS test" | @@ -267,6 +267,27 @@ log "Offline TTS test" | ||
| 267 | # test waves are saved in ./tts | 267 | # test waves are saved in ./tts |
| 268 | mkdir ./tts | 268 | mkdir ./tts |
| 269 | 269 | ||
| 270 | +log "kokoro-multi-lang-v1_0 test" | ||
| 271 | + | ||
| 272 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 | ||
| 273 | +tar xf kokoro-multi-lang-v1_0.tar.bz2 | ||
| 274 | +rm kokoro-multi-lang-v1_0.tar.bz2 | ||
| 275 | + | ||
| 276 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 277 | + --debug=1 \ | ||
| 278 | + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ | ||
| 279 | + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ | ||
| 280 | + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ | ||
| 281 | + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ | ||
| 282 | + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ | ||
| 283 | + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ | ||
| 284 | + --num-threads=2 \ | ||
| 285 | + --sid=18 \ | ||
| 286 | + --output-filename="./tts/kokoro-18-zh-en.wav" \ | ||
| 287 | + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?" | ||
| 288 | + | ||
| 289 | +rm -rf kokoro-multi-lang-v1_0 | ||
| 290 | + | ||
| 270 | log "kokoro-en-v0_19 test" | 291 | log "kokoro-en-v0_19 test" |
| 271 | 292 | ||
| 272 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | 293 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 |
| @@ -580,13 +601,10 @@ if [[ x$OS != x'windows-latest' ]]; then | @@ -580,13 +601,10 @@ if [[ x$OS != x'windows-latest' ]]; then | ||
| 580 | repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 | 601 | repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 |
| 581 | log "Start testing ${repo}" | 602 | log "Start testing ${repo}" |
| 582 | 603 | ||
| 583 | - pushd $dir | ||
| 584 | curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz | 604 | curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz |
| 585 | tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz | 605 | tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz |
| 586 | rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz | 606 | rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz |
| 587 | - popd | ||
| 588 | 607 | ||
| 589 | - repo=$dir/$repo | ||
| 590 | ls -lh $repo | 608 | ls -lh $repo |
| 591 | 609 | ||
| 592 | python3 ./python-api-examples/keyword-spotter.py | 610 | python3 ./python-api-examples/keyword-spotter.py |
| @@ -4,7 +4,6 @@ on: | @@ -4,7 +4,6 @@ on: | ||
| 4 | push: | 4 | push: |
| 5 | branches: | 5 | branches: |
| 6 | - export-kokoro | 6 | - export-kokoro |
| 7 | - - kokoro-1.0-2 | ||
| 8 | 7 | ||
| 9 | workflow_dispatch: | 8 | workflow_dispatch: |
| 10 | 9 | ||
| @@ -76,6 +75,14 @@ jobs: | @@ -76,6 +75,14 @@ jobs: | ||
| 76 | if: matrix.version == '1.0' | 75 | if: matrix.version == '1.0' |
| 77 | shell: bash | 76 | shell: bash |
| 78 | run: | | 77 | run: | |
| 78 | + curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2 | ||
| 79 | + tar xvf dict.tar.bz2 | ||
| 80 | + rm dict.tar.bz2 | ||
| 81 | + | ||
| 82 | + curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst | ||
| 83 | + curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst | ||
| 84 | + curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst | ||
| 85 | + | ||
| 79 | src=scripts/kokoro/v1.0 | 86 | src=scripts/kokoro/v1.0 |
| 80 | 87 | ||
| 81 | d=kokoro-multi-lang-v1_0 | 88 | d=kokoro-multi-lang-v1_0 |
| @@ -87,7 +94,12 @@ jobs: | @@ -87,7 +94,12 @@ jobs: | ||
| 87 | cp -v $src/tokens.txt $d/ | 94 | cp -v $src/tokens.txt $d/ |
| 88 | cp -v $src/lexicon*.txt $d/ | 95 | cp -v $src/lexicon*.txt $d/ |
| 89 | cp -v $src/README.md $d/README.md | 96 | cp -v $src/README.md $d/README.md |
| 97 | + cp -av dict $d/ | ||
| 98 | + cp -v ./*.fst $d/ | ||
| 90 | ls -lh $d/ | 99 | ls -lh $d/ |
| 100 | + echo "---" | ||
| 101 | + ls -lh $d/dict | ||
| 102 | + | ||
| 91 | tar cjfv $d.tar.bz2 $d | 103 | tar cjfv $d.tar.bz2 $d |
| 92 | rm -rf $d | 104 | rm -rf $d |
| 93 | 105 | ||
| @@ -180,6 +192,8 @@ jobs: | @@ -180,6 +192,8 @@ jobs: | ||
| 180 | cp -v ../scripts/kokoro/v1.0/lexicon*.txt . | 192 | cp -v ../scripts/kokoro/v1.0/lexicon*.txt . |
| 181 | cp -v ../scripts/kokoro/v1.0/README.md ./README.md | 193 | cp -v ../scripts/kokoro/v1.0/README.md ./README.md |
| 182 | cp -v ../LICENSE ./ | 194 | cp -v ../LICENSE ./ |
| 195 | + cp -av ../dict ./ | ||
| 196 | + cp -v ../*.fst ./ | ||
| 183 | 197 | ||
| 184 | git lfs track "*.onnx" | 198 | git lfs track "*.onnx" |
| 185 | git add . | 199 | git add . |
| @@ -25,27 +25,28 @@ int32_t main() { | @@ -25,27 +25,28 @@ int32_t main() { | ||
| 25 | 25 | ||
| 26 | memset(&config, 0, sizeof(config)); | 26 | memset(&config, 0, sizeof(config)); |
| 27 | config.model_config.transducer.encoder = | 27 | config.model_config.transducer.encoder = |
| 28 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" | 28 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 29 | "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; | 29 | "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; |
| 30 | 30 | ||
| 31 | config.model_config.transducer.decoder = | 31 | config.model_config.transducer.decoder = |
| 32 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" | 32 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 33 | "decoder-epoch-12-avg-2-chunk-16-left-64.onnx"; | 33 | "decoder-epoch-12-avg-2-chunk-16-left-64.onnx"; |
| 34 | 34 | ||
| 35 | config.model_config.transducer.joiner = | 35 | config.model_config.transducer.joiner = |
| 36 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" | 36 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 37 | "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; | 37 | "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; |
| 38 | 38 | ||
| 39 | config.model_config.tokens = | 39 | config.model_config.tokens = |
| 40 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt"; | 40 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 41 | + "tokens.txt"; | ||
| 41 | 42 | ||
| 42 | config.model_config.provider = "cpu"; | 43 | config.model_config.provider = "cpu"; |
| 43 | config.model_config.num_threads = 1; | 44 | config.model_config.num_threads = 1; |
| 44 | config.model_config.debug = 1; | 45 | config.model_config.debug = 1; |
| 45 | 46 | ||
| 46 | config.keywords_file = | 47 | config.keywords_file = |
| 47 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/" | ||
| 48 | - "test_keywords.txt"; | 48 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 49 | + "test_wavs/test_keywords.txt"; | ||
| 49 | 50 | ||
| 50 | const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config); | 51 | const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config); |
| 51 | if (!kws) { | 52 | if (!kws) { |
| @@ -24,27 +24,28 @@ int32_t main() { | @@ -24,27 +24,28 @@ int32_t main() { | ||
| 24 | 24 | ||
| 25 | KeywordSpotterConfig config; | 25 | KeywordSpotterConfig config; |
| 26 | config.model_config.transducer.encoder = | 26 | config.model_config.transducer.encoder = |
| 27 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" | 27 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 28 | "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; | 28 | "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; |
| 29 | 29 | ||
| 30 | config.model_config.transducer.decoder = | 30 | config.model_config.transducer.decoder = |
| 31 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" | 31 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 32 | "decoder-epoch-12-avg-2-chunk-16-left-64.onnx"; | 32 | "decoder-epoch-12-avg-2-chunk-16-left-64.onnx"; |
| 33 | 33 | ||
| 34 | config.model_config.transducer.joiner = | 34 | config.model_config.transducer.joiner = |
| 35 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" | 35 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 36 | "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; | 36 | "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; |
| 37 | 37 | ||
| 38 | config.model_config.tokens = | 38 | config.model_config.tokens = |
| 39 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt"; | 39 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 40 | + "tokens.txt"; | ||
| 40 | 41 | ||
| 41 | config.model_config.provider = "cpu"; | 42 | config.model_config.provider = "cpu"; |
| 42 | config.model_config.num_threads = 1; | 43 | config.model_config.num_threads = 1; |
| 43 | config.model_config.debug = 1; | 44 | config.model_config.debug = 1; |
| 44 | 45 | ||
| 45 | config.keywords_file = | 46 | config.keywords_file = |
| 46 | - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/" | ||
| 47 | - "test_keywords.txt"; | 47 | + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" |
| 48 | + "test_wavs/test_keywords.txt"; | ||
| 48 | 49 | ||
| 49 | KeywordSpotter kws = KeywordSpotter::Create(config); | 50 | KeywordSpotter kws = KeywordSpotter::Create(config); |
| 50 | if (!kws.Get()) { | 51 | if (!kws.Get()) { |
| @@ -11,7 +11,7 @@ while the model is still generating. | @@ -11,7 +11,7 @@ while the model is still generating. | ||
| 11 | 11 | ||
| 12 | Usage: | 12 | Usage: |
| 13 | 13 | ||
| 14 | -Example (1/6) | 14 | +Example (1/7) |
| 15 | 15 | ||
| 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 23 | --output-filename=./generated.wav \ | 23 | --output-filename=./generated.wav \ |
| 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 25 | 25 | ||
| 26 | -Example (2/6) | 26 | +Example (2/7) |
| 27 | 27 | ||
| 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 | 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 |
| 29 | tar xvf vits-zh-aishell3.tar.bz2 | 29 | tar xvf vits-zh-aishell3.tar.bz2 |
| @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 37 | --output-filename=./liubei-21.wav \ | 37 | --output-filename=./liubei-21.wav \ |
| 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 39 | 39 | ||
| 40 | -Example (3/6) | 40 | +Example (3/7) |
| 41 | 41 | ||
| 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 53 | --output-filename=./test-2.wav \ | 53 | --output-filename=./test-2.wav \ |
| 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 55 | 55 | ||
| 56 | -Example (4/6) | 56 | +Example (4/7) |
| 57 | 57 | ||
| 58 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 58 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 59 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 59 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| @@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 71 | --output-filename=./test-matcha.wav \ | 71 | --output-filename=./test-matcha.wav \ |
| 72 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | 72 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" |
| 73 | 73 | ||
| 74 | -Example (5/6) | 74 | +Example (5/7) |
| 75 | 75 | ||
| 76 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | 76 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 |
| 77 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | 77 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 |
| @@ -88,7 +88,9 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -88,7 +88,9 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 88 | --num-threads=2 \ | 88 | --num-threads=2 \ |
| 89 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 89 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 90 | 90 | ||
| 91 | -Example (6/6) | 91 | +Example (6/7) |
| 92 | + | ||
| 93 | +(This version of kokoro supports only English) | ||
| 92 | 94 | ||
| 93 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | 95 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 |
| 94 | tar xf kokoro-en-v0_19.tar.bz2 | 96 | tar xf kokoro-en-v0_19.tar.bz2 |
| @@ -105,6 +107,27 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -105,6 +107,27 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 105 | --output-filename="./kokoro-10.wav" \ | 107 | --output-filename="./kokoro-10.wav" \ |
| 106 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." | 108 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." |
| 107 | 109 | ||
| 110 | +Example (7/7) | ||
| 111 | + | ||
| 112 | +(This version of kokoro supports English, Chinese, etc.) | ||
| 113 | + | ||
| 114 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 | ||
| 115 | +tar xf kokoro-multi-lang-v1_0.tar.bz2 | ||
| 116 | +rm kokoro-multi-lang-v1_0.tar.bz2 | ||
| 117 | + | ||
| 118 | +python3 ./python-api-examples/offline-tts-play.py \ | ||
| 119 | + --debug=1 \ | ||
| 120 | + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ | ||
| 121 | + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ | ||
| 122 | + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ | ||
| 123 | + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ | ||
| 124 | + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ | ||
| 125 | + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ | ||
| 126 | + --num-threads=2 \ | ||
| 127 | + --sid=18 \ | ||
| 128 | + --output-filename="./kokoro-18-zh-en.wav" \ | ||
| 129 | + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?" | ||
| 130 | + | ||
| 108 | You can find more models at | 131 | You can find more models at |
| 109 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 132 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 110 | 133 | ||
| @@ -247,6 +270,20 @@ def add_kokoro_args(parser): | @@ -247,6 +270,20 @@ def add_kokoro_args(parser): | ||
| 247 | help="Path to the dict directory of espeak-ng.", | 270 | help="Path to the dict directory of espeak-ng.", |
| 248 | ) | 271 | ) |
| 249 | 272 | ||
| 273 | + parser.add_argument( | ||
| 274 | + "--kokoro-dict-dir", | ||
| 275 | + type=str, | ||
| 276 | + default="", | ||
| 277 | + help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro", | ||
| 278 | + ) | ||
| 279 | + | ||
| 280 | + parser.add_argument( | ||
| 281 | + "--kokoro-lexicon", | ||
| 282 | + type=str, | ||
| 283 | + default="", | ||
| 283 | + help="Comma-separated paths to lexicon files for kokoro. Needed only by multilingual kokoro", | ||
| 285 | + ) | ||
| 286 | + | ||
| 250 | 287 | ||
| 251 | def get_args(): | 288 | def get_args(): |
| 252 | parser = argparse.ArgumentParser( | 289 | parser = argparse.ArgumentParser( |
| @@ -459,6 +496,8 @@ def main(): | @@ -459,6 +496,8 @@ def main(): | ||
| 459 | voices=args.kokoro_voices, | 496 | voices=args.kokoro_voices, |
| 460 | tokens=args.kokoro_tokens, | 497 | tokens=args.kokoro_tokens, |
| 461 | data_dir=args.kokoro_data_dir, | 498 | data_dir=args.kokoro_data_dir, |
| 499 | + dict_dir=args.kokoro_dict_dir, | ||
| 500 | + lexicon=args.kokoro_lexicon, | ||
| 462 | ), | 501 | ), |
| 463 | provider=args.provider, | 502 | provider=args.provider, |
| 464 | debug=args.debug, | 503 | debug=args.debug, |
| @@ -12,7 +12,7 @@ generated audio. | @@ -12,7 +12,7 @@ generated audio. | ||
| 12 | 12 | ||
| 13 | Usage: | 13 | Usage: |
| 14 | 14 | ||
| 15 | -Example (1/6) | 15 | +Example (1/7) |
| 16 | 16 | ||
| 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 24 | --output-filename=./generated.wav \ | 24 | --output-filename=./generated.wav \ |
| 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 26 | 26 | ||
| 27 | -Example (2/6) | 27 | +Example (2/7) |
| 28 | 28 | ||
| 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 | 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 |
| @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 38 | --output-filename=./liubei-21.wav \ | 38 | --output-filename=./liubei-21.wav \ |
| 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 40 | 40 | ||
| 41 | -Example (3/6) | 41 | +Example (3/7) |
| 42 | 42 | ||
| 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 54 | --output-filename=./test-2.wav \ | 54 | --output-filename=./test-2.wav \ |
| 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 56 | 56 | ||
| 57 | -Example (4/6) | 57 | +Example (4/7) |
| 58 | 58 | ||
| 59 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 59 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 60 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 60 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| @@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 72 | --output-filename=./test-matcha.wav \ | 72 | --output-filename=./test-matcha.wav \ |
| 73 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | 73 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" |
| 74 | 74 | ||
| 75 | -Example (5/6) | 75 | +Example (5/7) |
| 76 | 76 | ||
| 77 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | 77 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 |
| 78 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | 78 | tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 |
| @@ -89,7 +89,9 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -89,7 +89,9 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 89 | --num-threads=2 \ | 89 | --num-threads=2 \ |
| 90 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 90 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 91 | 91 | ||
| 92 | -Example (6/6) | 92 | +Example (6/7) |
| 93 | + | ||
| 94 | +(This version of kokoro supports only English) | ||
| 93 | 95 | ||
| 94 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | 96 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 |
| 95 | tar xf kokoro-en-v0_19.tar.bz2 | 97 | tar xf kokoro-en-v0_19.tar.bz2 |
| @@ -106,6 +108,27 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -106,6 +108,27 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 106 | --output-filename="./kokoro-10.wav" \ | 108 | --output-filename="./kokoro-10.wav" \ |
| 107 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." | 109 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." |
| 108 | 110 | ||
| 111 | +Example (7/7) | ||
| 112 | + | ||
| 113 | +(This version of kokoro supports English, Chinese, etc.) | ||
| 114 | + | ||
| 115 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 | ||
| 116 | +tar xf kokoro-multi-lang-v1_0.tar.bz2 | ||
| 117 | +rm kokoro-multi-lang-v1_0.tar.bz2 | ||
| 118 | + | ||
| 119 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 120 | + --debug=1 \ | ||
| 121 | + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ | ||
| 122 | + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ | ||
| 123 | + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ | ||
| 124 | + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ | ||
| 125 | + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ | ||
| 126 | + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ | ||
| 127 | + --num-threads=2 \ | ||
| 128 | + --sid=18 \ | ||
| 129 | + --output-filename="./kokoro-18-zh-en.wav" \ | ||
| 130 | + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?" | ||
| 131 | + | ||
| 109 | You can find more models at | 132 | You can find more models at |
| 110 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 133 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 111 | 134 | ||
| @@ -234,6 +257,20 @@ def add_kokoro_args(parser): | @@ -234,6 +257,20 @@ def add_kokoro_args(parser): | ||
| 234 | help="Path to the dict directory of espeak-ng.", | 257 | help="Path to the dict directory of espeak-ng.", |
| 235 | ) | 258 | ) |
| 236 | 259 | ||
| 260 | + parser.add_argument( | ||
| 261 | + "--kokoro-dict-dir", | ||
| 262 | + type=str, | ||
| 263 | + default="", | ||
| 264 | + help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro", | ||
| 265 | + ) | ||
| 266 | + | ||
| 267 | + parser.add_argument( | ||
| 268 | + "--kokoro-lexicon", | ||
| 269 | + type=str, | ||
| 270 | + default="", | ||
| 270 | + help="Comma-separated paths to lexicon files for kokoro. Needed only by multilingual kokoro", | ||
| 272 | + ) | ||
| 273 | + | ||
| 237 | 274 | ||
| 238 | def get_args(): | 275 | def get_args(): |
| 239 | parser = argparse.ArgumentParser( | 276 | parser = argparse.ArgumentParser( |
| @@ -342,6 +379,8 @@ def main(): | @@ -342,6 +379,8 @@ def main(): | ||
| 342 | voices=args.kokoro_voices, | 379 | voices=args.kokoro_voices, |
| 343 | tokens=args.kokoro_tokens, | 380 | tokens=args.kokoro_tokens, |
| 344 | data_dir=args.kokoro_data_dir, | 381 | data_dir=args.kokoro_data_dir, |
| 382 | + dict_dir=args.kokoro_dict_dir, | ||
| 383 | + lexicon=args.kokoro_lexicon, | ||
| 345 | ), | 384 | ), |
| 346 | provider=args.provider, | 385 | provider=args.provider, |
| 347 | debug=args.debug, | 386 | debug=args.debug, |
| @@ -71,7 +71,7 @@ def main(): | @@ -71,7 +71,7 @@ def main(): | ||
| 71 | with open("voices.bin", "wb") as f: | 71 | with open("voices.bin", "wb") as f: |
| 72 | for _, speaker in id2speaker.items(): | 72 | for _, speaker in id2speaker.items(): |
| 73 | m = torch.load( | 73 | m = torch.load( |
| 74 | - f"{speaker}.pt", | 74 | + f"voices/{speaker}.pt", |
| 75 | weights_only=True, | 75 | weights_only=True, |
| 76 | map_location="cpu", | 76 | map_location="cpu", |
| 77 | ).numpy() | 77 | ).numpy() |
| @@ -153,6 +153,7 @@ if(SHERPA_ONNX_ENABLE_TTS) | @@ -153,6 +153,7 @@ if(SHERPA_ONNX_ENABLE_TTS) | ||
| 153 | list(APPEND sources | 153 | list(APPEND sources |
| 154 | hifigan-vocoder.cc | 154 | hifigan-vocoder.cc |
| 155 | jieba-lexicon.cc | 155 | jieba-lexicon.cc |
| 156 | + kokoro-multi-lang-lexicon.cc | ||
| 156 | lexicon.cc | 157 | lexicon.cc |
| 157 | melo-tts-lexicon.cc | 158 | melo-tts-lexicon.cc |
| 158 | offline-tts-character-frontend.cc | 159 | offline-tts-character-frontend.cc |
| 1 | +// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" | ||
| 6 | + | ||
| 7 | +#include <codecvt> | ||
| 8 | +#include <fstream> | ||
| 9 | +#include <locale> | ||
| 10 | +#include <regex> // NOLINT | ||
| 11 | +#include <sstream> | ||
| 12 | +#include <strstream> | ||
| 13 | +#include <unordered_map> | ||
| 14 | +#include <utility> | ||
| 15 | + | ||
| 16 | +#if __ANDROID_API__ >= 9 | ||
| 17 | +#include "android/asset_manager.h" | ||
| 18 | +#include "android/asset_manager_jni.h" | ||
| 19 | +#endif | ||
| 20 | + | ||
| 21 | +#if __OHOS__ | ||
| 22 | +#include "rawfile/raw_file_manager.h" | ||
| 23 | +#endif | ||
| 24 | + | ||
| 25 | +#include "cppjieba/Jieba.hpp" | ||
| 26 | +#include "espeak-ng/speak_lib.h" | ||
| 27 | +#include "phoneme_ids.hpp" | ||
| 28 | +#include "phonemize.hpp" | ||
| 29 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 30 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 31 | +#include "sherpa-onnx/csrc/symbol-table.h" | ||
| 32 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 33 | + | ||
| 34 | +namespace sherpa_onnx { | ||
| 35 | + | ||
| 36 | +void CallPhonemizeEspeak(const std::string &text, | ||
| 37 | + piper::eSpeakPhonemeConfig &config, // NOLINT | ||
| 38 | + std::vector<std::vector<piper::Phoneme>> *phonemes); | ||
| 39 | + | ||
| 40 | +static std::wstring ToWideString(const std::string &s) { | ||
| 41 | + // see | ||
| 42 | + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t | ||
| 43 | + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; | ||
| 44 | + return converter.from_bytes(s); | ||
| 45 | +} | ||
| 46 | + | ||
| 47 | +static std::string ToString(const std::wstring &s) { | ||
| 48 | + // see | ||
| 49 | + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t | ||
| 50 | + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; | ||
| 51 | + return converter.to_bytes(s); | ||
| 52 | +} | ||
| 53 | + | ||
// Pimpl backend for KokoroMultiLangLexicon.
//
// Responsibilities:
//  - split mixed Chinese/English text into runs by script and punctuation
//  - segment Chinese runs with cppjieba and look the words up in the lexicon
//  - look up English words in the lexicon, falling back to espeak-ng
//    phonemization for out-of-vocabulary words
//  - map phoneme tokens to integer IDs from tokens.txt, chunking the result
//    so no sequence exceeds the model's max_token_len
class KokoroMultiLangLexicon::Impl {
 public:
  // Load everything from files on disk.
  //
  // @param tokens    Path to tokens.txt (phoneme token -> integer ID).
  // @param lexicon   Comma-separated list of lexicon files
  //                  (word -> phoneme tokens), e.g. English + Chinese.
  // @param dict_dir  Directory containing the jieba dictionary files.
  // @param data_dir  Directory containing espeak-ng-data.
  // @param meta_data Model metadata; max_token_len bounds sentence length.
  // @param debug     If true, log intermediate results.
  Impl(const std::string &tokens, const std::string &lexicon,
       const std::string &dict_dir, const std::string &data_dir,
       const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
      : meta_data_(meta_data), debug_(debug) {
    InitTokens(tokens);

    InitLexicon(lexicon);

    InitJieba(dict_dir);

    InitEspeak(data_dir);  // See ./piper-phonemize-lexicon.cc
  }

  // Same as above, except that tokens and lexicon are read through an
  // asset/resource manager (Android AAssetManager or OHOS
  // NativeResourceManager).
  template <typename Manager>
  Impl(Manager *mgr, const std::string &tokens, const std::string &lexicon,
       const std::string &dict_dir, const std::string &data_dir,
       const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
      : meta_data_(meta_data), debug_(debug) {
    InitTokens(mgr, tokens);

    InitLexicon(mgr, lexicon);

    // we assume you have copied dict_dir and data_dir from assets to some path
    InitJieba(dict_dir);

    InitEspeak(data_dir);  // See ./piper-phonemize-lexicon.cc
  }

  // Convert a (possibly mixed Chinese/English) text into one or more
  // token-ID sequences, each bounded by meta_data_.max_token_len.
  std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
    std::string text = ToLowerCase(_text);
    if (debug_) {
      SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
    }

    // Normalize full-width (Chinese) punctuation to ASCII equivalents and
    // collapse runs of whitespace into a single space.
    std::vector<std::pair<std::string, std::string>> replace_str_pairs = {
        {",", ","}, {":", ","}, {"、", ","}, {";", ";"}, {":", ":"},
        {"。", "."}, {"?", "?"}, {"!", "!"}, {"\\s+", " "},
    };
    for (const auto &p : replace_str_pairs) {
      std::regex re(p.first);
      text = std::regex_replace(text, re, p.second);
    }

    if (debug_) {
      SHERPA_ONNX_LOGE("After replacing punctuations and merging spaces:\n%s",
                       text.c_str());
    }

    // Split the text into three kinds of runs: punctuation, CJK ideographs
    // ([\u4e00-\u9fff]), and ASCII ([\u0000-\u007f]). Matching is done on
    // wide strings so the \uXXXX ranges apply to code points, not UTF-8
    // bytes.
    // https://en.cppreference.com/w/cpp/regex
    // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
    std::string expr =
        "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";

    auto ws = ToWideString(text);
    std::wstring wexpr = ToWideString(expr);
    std::wregex we(wexpr);

    auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
    auto end = std::wsregex_iterator();

    std::vector<TokenIDs> ans;

    for (std::wsregex_iterator i = begin; i != end; ++i) {
      std::wsmatch match = *i;
      std::wstring match_str = match.str();
      auto ms = ToString(match_str);
      // The first UTF-8 byte decides the script of the whole run:
      // < 0x80 means the run is ASCII (English/punctuation).
      uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];

      std::vector<std::vector<int32_t>> ids_vec;

      if (c < 0x80) {
        if (debug_) {
          SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
        }
        ids_vec = ConvertEnglishToTokenIDs(ms);
      } else {
        if (debug_) {
          SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
        }
        ids_vec = ConvertChineseToTokenIDs(ms);
      }

      for (const auto &ids : ids_vec) {
        if (ids.size() > 4) {
          ans.emplace_back(ids);
        } else {
          // A very short sequence (e.g., a lone punctuation mark plus the
          // pad/boundary token 0 at both ends) is merged into the previous
          // sentence instead of becoming a sentence of its own.
          if (ans.empty()) {
            ans.emplace_back(ids);
          } else {
            // NOTE(review): assumes ids.size() >= 2 here (a leading and a
            // trailing 0 are always appended by the converters) — confirm no
            // shorter sequence can reach this branch.
            ans.back().tokens.back() = ids[1];
            ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
                                     ids.end());
          }
        }
      }
    }

    if (debug_) {
      // Dump the final token-ID sequences.
      for (const auto &v : ans) {
        std::ostringstream os;
        os << "\n";
        std::string sep;
        for (auto i : v.tokens) {
          os << sep << i;
          sep = " ";
        }
        os << "\n";
        SHERPA_ONNX_LOGE("%s", os.str().c_str());
      }
    }

    return ans;
  }

 private:
  // Return true if `text` is a single punctuation mark (ASCII or the
  // full-width variants listed below).
  bool IsPunctuation(const std::string &text) const {
    if (text == ";" || text == ":" || text == "," || text == "." ||
        text == "!" || text == "?" || text == "—" || text == "…" ||
        text == "\"" || text == "(" || text == ")" || text == "“" ||
        text == "”") {
      return true;
    }

    return false;
  }

  // Look up a word in the lexicon. If the whole word is absent, fall back to
  // looking up each of its UTF-8 "characters" (as produced by SplitUtf8)
  // individually; characters still missing are skipped with a log message.
  std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
    std::vector<int32_t> ans;
    if (word2ids_.count(w)) {
      ans = word2ids_.at(w);
      return ans;
    }

    std::vector<std::string> words = SplitUtf8(w);
    for (const auto &word : words) {
      if (word2ids_.count(word)) {
        auto ids = ConvertWordToIds(word);
        ans.insert(ans.end(), ids.begin(), ids.end());
      } else {
        SHERPA_ONNX_LOGE("Skip OOV: '%s'", word.c_str());
      }
    }

    return ans;
  }

  // Segment a Chinese run with jieba and convert each word to token IDs.
  // The output is split into chunks so that each chunk (including a 0 pad
  // token at both ends) stays within meta_data_.max_token_len.
  std::vector<std::vector<int32_t>> ConvertChineseToTokenIDs(
      const std::string &text) const {
    bool is_hmm = true;  // enable jieba's HMM model for unseen words

    std::vector<std::string> words;
    jieba_->Cut(text, words, is_hmm);
    if (debug_) {
      std::ostringstream os;
      os << "After jieba processing:\n";

      std::string sep;
      for (const auto &w : words) {
        os << sep << w;
        sep = "_";
      }
      SHERPA_ONNX_LOGE("%s", os.str().c_str());
    }

    std::vector<std::vector<int32_t>> ans;
    std::vector<int32_t> this_sentence;
    int32_t max_len = meta_data_.max_token_len;

    this_sentence.push_back(0);  // leading pad/boundary token
    for (const auto &w : words) {
      auto ids = ConvertWordToIds(w);
      // NOTE(review): size_t vs int32_t comparison; `max_len - 2` would wrap
      // if max_len < 2 — presumably max_len is always much larger; confirm.
      if (this_sentence.size() + ids.size() > max_len - 2) {
        this_sentence.push_back(0);
        ans.push_back(std::move(this_sentence));

        // NOTE(review): this_sentence is moved-from here and reused; common
        // implementations leave it empty, but the standard only guarantees a
        // valid unspecified state.
        this_sentence.push_back(0);
      }

      this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
    }

    // Flush the last (non-empty) chunk; size 1 means only the pad token.
    if (this_sentence.size() > 1) {
      this_sentence.push_back(0);
      ans.push_back(std::move(this_sentence));
    }

    if (debug_) {
      for (const auto &v : ans) {
        std::ostringstream os;
        os << "\n";
        std::string sep;
        for (auto i : v) {
          os << sep << i;
          sep = " ";
        }
        os << "\n";
        SHERPA_ONNX_LOGE("%s", os.str().c_str());
      }
    }

    return ans;
  }

  // Convert an English/ASCII run to token IDs. In-vocabulary words come from
  // the lexicon; OOV words are phonemized with espeak-ng. Sentences are
  // flushed at .!?; and whenever the chunk would exceed max_token_len.
  std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
      const std::string &text) const {
    std::vector<std::string> words = SplitUtf8(text);
    if (debug_) {
      std::ostringstream os;
      os << "After splitting to words: ";
      std::string sep;
      for (const auto &w : words) {
        os << sep << w;
        sep = "_";
      }
      SHERPA_ONNX_LOGE("%s", os.str().c_str());
    }

    std::vector<std::vector<int32_t>> ans;
    int32_t max_len = meta_data_.max_token_len;
    std::vector<int32_t> this_sentence;

    // ID of the inter-word separator token.
    int32_t space_id = token2id_.at(" ");

    this_sentence.push_back(0);  // leading pad/boundary token

    for (const auto &word : words) {
      if (IsPunctuation(word)) {
        this_sentence.push_back(token2id_.at(word));

        if (this_sentence.size() > max_len - 2) {
          // this sentence is too long, split it
          this_sentence.push_back(0);
          ans.push_back(std::move(this_sentence));

          this_sentence.push_back(0);
          continue;
        }

        if (word == "." || word == "!" || word == "?" || word == ";") {
          // Note: You can add more punctuations here to split the text
          // into sentences. We just use four here: .!?;
          this_sentence.push_back(0);
          ans.push_back(std::move(this_sentence));

          this_sentence.push_back(0);
        }
      } else if (word2ids_.count(word)) {
        // In-vocabulary word: take its IDs directly from the lexicon.
        const auto &ids = word2ids_.at(word);
        if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
          this_sentence.push_back(0);
          ans.push_back(std::move(this_sentence));

          this_sentence.push_back(0);
        }

        this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
        this_sentence.push_back(space_id);
      } else {
        // OOV word: phonemize with espeak-ng (US English voice).
        SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());

        piper::eSpeakPhonemeConfig config;

        config.voice = "en-us";

        std::vector<std::vector<piper::Phoneme>> phonemes;

        CallPhonemizeEspeak(word, config, &phonemes);
        // Note phonemes[i] contains a vector of unicode codepoints;
        // we need to convert them to utf8
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;

        std::vector<int32_t> ids;
        for (const auto &v : phonemes) {
          for (const auto p : v) {
            auto token = conv.to_bytes(p);
            if (token2id_.count(token)) {
              ids.push_back(token2id_.at(token));
            } else {
              SHERPA_ONNX_LOGE("Skip OOV token '%s' from '%s'", token.c_str(),
                               word.c_str());
            }
          }
        }

        if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
          this_sentence.push_back(0);
          ans.push_back(std::move(this_sentence));

          this_sentence.push_back(0);
        }

        this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
        this_sentence.push_back(space_id);
      }
    }

    // Flush the last (non-empty) chunk; size 1 means only the pad token.
    if (this_sentence.size() > 1) {
      this_sentence.push_back(0);
      ans.push_back(std::move(this_sentence));
    }

    if (debug_) {
      for (const auto &v : ans) {
        std::ostringstream os;
        os << "\n";
        std::string sep;
        for (auto i : v) {
          os << sep << i;
          sep = " ";
        }
        os << "\n";
        SHERPA_ONNX_LOGE("%s", os.str().c_str());
      }
    }

    return ans;
  }

  // Load tokens.txt from a file path.
  void InitTokens(const std::string &tokens) {
    std::ifstream is(tokens);
    InitTokens(is);
  }

  // Load tokens.txt via an asset/resource manager.
  template <typename Manager>
  void InitTokens(Manager *mgr, const std::string &tokens) {
    auto buf = ReadFile(mgr, tokens);

    std::istrstream is(buf.data(), buf.size());
    InitTokens(is);
  }

  void InitTokens(std::istream &is) {
    token2id_ = ReadTokens(is);  // defined in ./symbol-table.cc
  }

  // `lexicon` is a comma-separated list of lexicon file paths.
  void InitLexicon(const std::string &lexicon) {
    std::vector<std::string> files;
    SplitStringToVector(lexicon, ",", false, &files);
    for (const auto &f : files) {
      std::ifstream is(f);
      InitLexicon(is);
    }
  }

  // Same as above, but files are read via an asset/resource manager.
  template <typename Manager>
  void InitLexicon(Manager *mgr, const std::string &lexicon) {
    std::vector<std::string> files;
    SplitStringToVector(lexicon, ",", false, &files);
    for (const auto &f : files) {
      auto buf = ReadFile(mgr, f);

      std::istrstream is(buf.data(), buf.size());
      InitLexicon(is);
    }
  }

  // Parse one lexicon stream. Each line is: word token1 token2 ...
  // Words are lowercased; duplicates and unmappable pronunciations are
  // skipped with a warning (at most ~10 duplicate warnings are printed).
  void InitLexicon(std::istream &is) {
    std::string word;
    std::vector<std::string> token_list;
    std::string token;

    std::string line;
    int32_t line_num = 0;
    int32_t num_warn = 0;
    while (std::getline(is, line)) {
      ++line_num;
      std::istringstream iss(line);

      token_list.clear();
      iss >> word;
      ToLowerCase(&word);

      if (word2ids_.count(word)) {
        // First definition wins; later duplicates are ignored.
        num_warn += 1;
        if (num_warn < 10) {
          SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
                           word.c_str(), line_num, line.c_str());
        }
        continue;
      }

      while (iss >> token) {
        token_list.push_back(std::move(token));
      }

      std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);

      if (ids.empty()) {
        SHERPA_ONNX_LOGE(
            "Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
            word.c_str(), line_num, line.c_str());
        continue;
      }

      word2ids_.insert({std::move(word), std::move(ids)});
    }
  }

  // Initialize cppjieba. All five dictionary files must exist inside
  // dict_dir; otherwise AssertFileExists() aborts with a message.
  void InitJieba(const std::string &dict_dir) {
    std::string dict = dict_dir + "/jieba.dict.utf8";
    std::string hmm = dict_dir + "/hmm_model.utf8";
    std::string user_dict = dict_dir + "/user.dict.utf8";
    std::string idf = dict_dir + "/idf.utf8";
    std::string stop_word = dict_dir + "/stop_words.utf8";

    AssertFileExists(dict);
    AssertFileExists(hmm);
    AssertFileExists(user_dict);
    AssertFileExists(idf);
    AssertFileExists(stop_word);

    jieba_ =
        std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
  }

 private:
  OfflineTtsKokoroModelMetaData meta_data_;

  // word to token IDs
  std::unordered_map<std::string, std::vector<int32_t>> word2ids_;

  // tokens.txt is saved in token2id_
  std::unordered_map<std::string, int32_t> token2id_;

  std::unique_ptr<cppjieba::Jieba> jieba_;
  bool debug_ = false;
};
| 484 | + | ||
// Out-of-line so that std::unique_ptr<Impl> sees the complete Impl type
// when its deleter is instantiated.
KokoroMultiLangLexicon::~KokoroMultiLangLexicon() = default;
| 486 | + | ||
// Construct the multilingual lexicon from files on disk.
// All heavy lifting (tokens.txt, lexicon files, jieba, espeak-ng) is
// delegated to Impl.
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    const std::string &tokens, const std::string &lexicon,
    const std::string &dict_dir, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
    : impl_(std::make_unique<Impl>(tokens, lexicon, dict_dir, data_dir,
                                   meta_data, debug)) {}
| 493 | + | ||
// Construct the multilingual lexicon with tokens/lexicon read through an
// asset/resource manager (Android/OHOS). dict_dir and data_dir must still be
// ordinary filesystem paths.
template <typename Manager>
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    Manager *mgr, const std::string &tokens, const std::string &lexicon,
    const std::string &dict_dir, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
    : impl_(std::make_unique<Impl>(mgr, tokens, lexicon, dict_dir, data_dir,
                                   meta_data, debug)) {}
| 501 | + | ||
// Forward to Impl. The `voice` argument required by the OfflineTtsFrontend
// interface is unused here: for Kokoro the voice is selected elsewhere via
// the speaker ID, not by the frontend.
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
    const std::string &text, const std::string & /*unused_voice = ""*/) const {
  return impl_->ConvertTextToTokenIds(text);
}
| 506 | + | ||
// Explicit instantiation for Android: tokens/lexicon are loaded via
// AAssetManager from the APK assets.
#if __ANDROID_API__ >= 9
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    AAssetManager *mgr, const std::string &tokens, const std::string &lexicon,
    const std::string &dict_dir, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &meta_data, bool debug);
#endif

// Explicit instantiation for HarmonyOS (OHOS) resource manager.
#if __OHOS__
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    NativeResourceManager *mgr, const std::string &tokens,
    const std::string &lexicon, const std::string &dict_dir,
    const std::string &data_dir, const OfflineTtsKokoroModelMetaData &meta_data,
    bool debug);
#endif
| 521 | + | ||
| 522 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_ | ||
| 7 | + | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | +#include <vector> | ||
| 11 | + | ||
| 12 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 13 | +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h" | ||
| 14 | + | ||
| 15 | +namespace sherpa_onnx { | ||
| 16 | + | ||
// Text frontend for multilingual Kokoro models (Kokoro >= v1.0).
//
// It converts mixed Chinese/English text into token-ID sequences: Chinese is
// segmented with jieba and looked up in the provided lexicon files; English
// words are looked up in the lexicon with espeak-ng as the OOV fallback.
class KokoroMultiLangLexicon : public OfflineTtsFrontend {
 public:
  ~KokoroMultiLangLexicon() override;

  // @param tokens    Path to tokens.txt (phoneme token -> integer ID).
  // @param lexicon   Comma-separated lexicon files, e.g.
  //                  "./lexicon-us-en.txt,./lexicon-zh.txt".
  // @param dict_dir  Directory containing the jieba dictionary files.
  // @param data_dir  Directory containing espeak-ng-data.
  // @param meta_data Kokoro model metadata.
  // @param debug     If true, log intermediate processing results.
  KokoroMultiLangLexicon(const std::string &tokens, const std::string &lexicon,
                         const std::string &dict_dir,
                         const std::string &data_dir,
                         const OfflineTtsKokoroModelMetaData &meta_data,
                         bool debug);

  // Same as above, but tokens/lexicon are read via an asset/resource manager
  // (Android AAssetManager or OHOS NativeResourceManager).
  template <typename Manager>
  KokoroMultiLangLexicon(Manager *mgr, const std::string &tokens,
                         const std::string &lexicon,
                         const std::string &dict_dir,
                         const std::string &data_dir,
                         const OfflineTtsKokoroModelMetaData &meta_data,
                         bool debug);

  // Convert `text` to token-ID sequences. `voice` is unused for Kokoro.
  std::vector<TokenIDs> ConvertTextToTokenIds(
      const std::string &text, const std::string &voice = "") const override;

 private:
  class Impl;  // pimpl: hides jieba/espeak/lexicon details from the header
  std::unique_ptr<Impl> impl_;
};
| 42 | + | ||
| 43 | +} // namespace sherpa_onnx | ||
| 44 | + | ||
| 45 | +#endif // SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_ |
| @@ -6,7 +6,9 @@ | @@ -6,7 +6,9 @@ | ||
| 6 | 6 | ||
| 7 | #include <fstream> | 7 | #include <fstream> |
| 8 | #include <regex> // NOLINT | 8 | #include <regex> // NOLINT |
| 9 | +#include <sstream> | ||
| 9 | #include <strstream> | 10 | #include <strstream> |
| 11 | +#include <unordered_map> | ||
| 10 | #include <utility> | 12 | #include <utility> |
| 11 | #if __ANDROID_API__ >= 9 | 13 | #if __ANDROID_API__ >= 9 |
| 12 | #include "android/asset_manager.h" | 14 | #include "android/asset_manager.h" |
| @@ -7,7 +7,6 @@ | @@ -7,7 +7,6 @@ | ||
| 7 | 7 | ||
| 8 | #include <memory> | 8 | #include <memory> |
| 9 | #include <string> | 9 | #include <string> |
| 10 | -#include <unordered_map> | ||
| 11 | #include <vector> | 10 | #include <vector> |
| 12 | 11 | ||
| 13 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 12 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| @@ -19,6 +19,9 @@ struct TokenIDs { | @@ -19,6 +19,9 @@ struct TokenIDs { | ||
| 19 | /*implicit*/ TokenIDs(std::vector<int64_t> tokens) // NOLINT | 19 | /*implicit*/ TokenIDs(std::vector<int64_t> tokens) // NOLINT |
| 20 | : tokens{std::move(tokens)} {} | 20 | : tokens{std::move(tokens)} {} |
| 21 | 21 | ||
  // Converting constructor: widens int32 token IDs into the int64 storage
  // used by `tokens`. Intentionally implicit (NOLINT) so callers can return
  // a std::vector<int32_t> where a TokenIDs is expected.
  /*implicit*/ TokenIDs(const std::vector<int32_t> &tokens)  // NOLINT
      : tokens{tokens.begin(), tokens.end()} {}
| 24 | + | ||
| 22 | TokenIDs(std::vector<int64_t> tokens, // NOLINT | 25 | TokenIDs(std::vector<int64_t> tokens, // NOLINT |
| 23 | std::vector<int64_t> tones) // NOLINT | 26 | std::vector<int64_t> tones) // NOLINT |
| 24 | : tokens{std::move(tokens)}, tones{std::move(tones)} {} | 27 | : tokens{std::move(tokens)}, tones{std::move(tones)} {} |
| @@ -51,6 +54,9 @@ class OfflineTtsFrontend { | @@ -51,6 +54,9 @@ class OfflineTtsFrontend { | ||
| 51 | const std::string &text, const std::string &voice = "") const = 0; | 54 | const std::string &text, const std::string &voice = "") const = 0; |
| 52 | }; | 55 | }; |
| 53 | 56 | ||
// Initialize espeak-ng with the given espeak-ng-data directory. Declared
// here so multiple TTS frontends can share one initialization.
// implementation is in ./piper-phonemize-lexicon.cc
void InitEspeak(const std::string &data_dir);
| 59 | + | ||
| 54 | } // namespace sherpa_onnx | 60 | } // namespace sherpa_onnx |
| 55 | 61 | ||
| 56 | #endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ | 62 | #endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ |
| @@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
| 13 | #include "fst/extensions/far/far.h" | 13 | #include "fst/extensions/far/far.h" |
| 14 | #include "kaldifst/csrc/kaldi-fst-io.h" | 14 | #include "kaldifst/csrc/kaldi-fst-io.h" |
| 15 | #include "kaldifst/csrc/text-normalizer.h" | 15 | #include "kaldifst/csrc/text-normalizer.h" |
| 16 | +#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" | ||
| 16 | #include "sherpa-onnx/csrc/lexicon.h" | 17 | #include "sherpa-onnx/csrc/lexicon.h" |
| 17 | #include "sherpa-onnx/csrc/macros.h" | 18 | #include "sherpa-onnx/csrc/macros.h" |
| 18 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 19 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| @@ -314,6 +315,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | @@ -314,6 +315,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | ||
| 314 | template <typename Manager> | 315 | template <typename Manager> |
| 315 | void InitFrontend(Manager *mgr) { | 316 | void InitFrontend(Manager *mgr) { |
| 316 | const auto &meta_data = model_->GetMetaData(); | 317 | const auto &meta_data = model_->GetMetaData(); |
| 318 | + | ||
| 319 | + if (meta_data.version >= 2) { | ||
| 320 | + // this is a multi-lingual model, we require that you pass lexicon | ||
| 321 | + // and dict_dir | ||
| 322 | + if (config_.model.kokoro.lexicon.empty() || | ||
| 323 | + config_.model.kokoro.dict_dir.empty()) { | ||
| 324 | + SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); | ||
| 325 | + SHERPA_ONNX_LOGE( | ||
| 326 | + "You are using a multi-lingual Kokoro model (e.g., Kokoro >= " | ||
| 327 | + "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir"); | ||
| 328 | + SHERPA_ONNX_EXIT(-1); | ||
| 329 | + } | ||
| 330 | + | ||
| 331 | + frontend_ = std::make_unique<KokoroMultiLangLexicon>( | ||
| 332 | + mgr, config_.model.kokoro.tokens, config_.model.kokoro.lexicon, | ||
| 333 | + config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir, | ||
| 334 | + meta_data, config_.model.debug); | ||
| 335 | + | ||
| 336 | + return; | ||
| 337 | + } | ||
| 338 | + | ||
| 317 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( | 339 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( |
| 318 | mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir, | 340 | mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir, |
| 319 | meta_data); | 341 | meta_data); |
| @@ -321,7 +343,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | @@ -321,7 +343,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | ||
| 321 | 343 | ||
| 322 | void InitFrontend() { | 344 | void InitFrontend() { |
| 323 | const auto &meta_data = model_->GetMetaData(); | 345 | const auto &meta_data = model_->GetMetaData(); |
| 346 | + if (meta_data.version >= 2) { | ||
| 347 | + // this is a multi-lingual model, we require that you pass lexicon | ||
| 348 | + // and dict_dir | ||
| 349 | + if (config_.model.kokoro.lexicon.empty() || | ||
| 350 | + config_.model.kokoro.dict_dir.empty()) { | ||
| 351 | + SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); | ||
| 352 | + SHERPA_ONNX_LOGE( | ||
| 353 | + "You are using a multi-lingual Kokoro model (e.g., Kokoro >= " | ||
| 354 | + "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir"); | ||
| 355 | + SHERPA_ONNX_EXIT(-1); | ||
| 356 | + } | ||
| 357 | + | ||
| 358 | + frontend_ = std::make_unique<KokoroMultiLangLexicon>( | ||
| 359 | + config_.model.kokoro.tokens, config_.model.kokoro.lexicon, | ||
| 360 | + config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir, | ||
| 361 | + meta_data, config_.model.debug); | ||
| 362 | + | ||
| 363 | + return; | ||
| 364 | + } | ||
| 324 | 365 | ||
| 366 | + // this is for kokoro v0.19, which supports only English | ||
| 325 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( | 367 | frontend_ = std::make_unique<PiperPhonemizeLexicon>( |
| 326 | config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data); | 368 | config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data); |
| 327 | } | 369 | } |
| @@ -8,6 +8,7 @@ | @@ -8,6 +8,7 @@ | ||
| 8 | 8 | ||
| 9 | #include "sherpa-onnx/csrc/file-utils.h" | 9 | #include "sherpa-onnx/csrc/file-utils.h" |
| 10 | #include "sherpa-onnx/csrc/macros.h" | 10 | #include "sherpa-onnx/csrc/macros.h" |
| 11 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 11 | 12 | ||
| 12 | namespace sherpa_onnx { | 13 | namespace sherpa_onnx { |
| 13 | 14 | ||
| @@ -17,8 +18,16 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) { | @@ -17,8 +18,16 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) { | ||
| 17 | "Path to voices.bin for Kokoro models"); | 18 | "Path to voices.bin for Kokoro models"); |
| 18 | po->Register("kokoro-tokens", &tokens, | 19 | po->Register("kokoro-tokens", &tokens, |
| 19 | "Path to tokens.txt for Kokoro models"); | 20 | "Path to tokens.txt for Kokoro models"); |
| 21 | + po->Register( | ||
| 22 | + "kokoro-lexicon", &lexicon, | ||
| 23 | + "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0" | ||
| 24 | + "You can pass multiple files, separated by ','. Example: " | ||
| 25 | + "./lexicon-us-en.txt,./lexicon-zh.txt"); | ||
| 20 | po->Register("kokoro-data-dir", &data_dir, | 26 | po->Register("kokoro-data-dir", &data_dir, |
| 21 | "Path to the directory containing dict for espeak-ng."); | 27 | "Path to the directory containing dict for espeak-ng."); |
| 28 | + po->Register("kokoro-dict-dir", &dict_dir, | ||
| 29 | + "Path to the directory containing dict for jieba. " | ||
| 30 | + "Used only for Kokoro >= v1.0"); | ||
| 22 | po->Register("kokoro-length-scale", &length_scale, | 31 | po->Register("kokoro-length-scale", &length_scale, |
| 23 | "Speech speed. Larger->Slower; Smaller->faster."); | 32 | "Speech speed. Larger->Slower; Smaller->faster."); |
| 24 | } | 33 | } |
| @@ -44,6 +53,19 @@ bool OfflineTtsKokoroModelConfig::Validate() const { | @@ -44,6 +53,19 @@ bool OfflineTtsKokoroModelConfig::Validate() const { | ||
| 44 | return false; | 53 | return false; |
| 45 | } | 54 | } |
| 46 | 55 | ||
| 56 | + if (!lexicon.empty()) { | ||
| 57 | + std::vector<std::string> files; | ||
| 58 | + SplitStringToVector(lexicon, ",", false, &files); | ||
| 59 | + for (const auto &f : files) { | ||
| 60 | + if (!FileExists(f)) { | ||
| 61 | + SHERPA_ONNX_LOGE( | ||
| 62 | + "lexicon '%s' does not exist. Please re-check --kokoro-lexicon", | ||
| 63 | + f.c_str()); | ||
| 64 | + return false; | ||
| 65 | + } | ||
| 66 | + } | ||
| 67 | + } | ||
| 68 | + | ||
| 47 | if (data_dir.empty()) { | 69 | if (data_dir.empty()) { |
| 48 | SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir"); | 70 | SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir"); |
| 49 | return false; | 71 | return false; |
| @@ -77,6 +99,21 @@ bool OfflineTtsKokoroModelConfig::Validate() const { | @@ -77,6 +99,21 @@ bool OfflineTtsKokoroModelConfig::Validate() const { | ||
| 77 | return false; | 99 | return false; |
| 78 | } | 100 | } |
| 79 | 101 | ||
| 102 | + if (!dict_dir.empty()) { | ||
| 103 | + std::vector<std::string> required_files = { | ||
| 104 | + "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8", | ||
| 105 | + "idf.utf8", "stop_words.utf8", | ||
| 106 | + }; | ||
| 107 | + | ||
| 108 | + for (const auto &f : required_files) { | ||
| 109 | + if (!FileExists(dict_dir + "/" + f)) { | ||
| 110 | + SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check kokoro-dict-dir", | ||
| 111 | + dict_dir.c_str(), f.c_str()); | ||
| 112 | + return false; | ||
| 113 | + } | ||
| 114 | + } | ||
| 115 | + } | ||
| 116 | + | ||
| 80 | return true; | 117 | return true; |
| 81 | } | 118 | } |
| 82 | 119 | ||
| @@ -87,7 +124,9 @@ std::string OfflineTtsKokoroModelConfig::ToString() const { | @@ -87,7 +124,9 @@ std::string OfflineTtsKokoroModelConfig::ToString() const { | ||
| 87 | os << "model=\"" << model << "\", "; | 124 | os << "model=\"" << model << "\", "; |
| 88 | os << "voices=\"" << voices << "\", "; | 125 | os << "voices=\"" << voices << "\", "; |
| 89 | os << "tokens=\"" << tokens << "\", "; | 126 | os << "tokens=\"" << tokens << "\", "; |
| 127 | + os << "lexicon=\"" << lexicon << "\", "; | ||
| 90 | os << "data_dir=\"" << data_dir << "\", "; | 128 | os << "data_dir=\"" << data_dir << "\", "; |
| 129 | + os << "dict_dir=\"" << dict_dir << "\", "; | ||
| 91 | os << "length_scale=" << length_scale << ")"; | 130 | os << "length_scale=" << length_scale << ")"; |
| 92 | 131 | ||
| 93 | return os.str(); | 132 | return os.str(); |
| @@ -16,8 +16,14 @@ struct OfflineTtsKokoroModelConfig { | @@ -16,8 +16,14 @@ struct OfflineTtsKokoroModelConfig { | ||
| 16 | std::string voices; | 16 | std::string voices; |
| 17 | std::string tokens; | 17 | std::string tokens; |
| 18 | 18 | ||
| 19 | + // Note: You can pass multiple files, separated by ",", to lexicon | ||
| 20 | + // Example: lexicon = "./lexicon-gb-en.txt,./lexicon-zh.txt"; | ||
| 21 | + std::string lexicon; | ||
| 22 | + | ||
| 19 | std::string data_dir; | 23 | std::string data_dir; |
| 20 | 24 | ||
| 25 | + std::string dict_dir; | ||
| 26 | + | ||
| 21 | // speed = 1 / length_scale | 27 | // speed = 1 / length_scale |
| 22 | float length_scale = 1.0; | 28 | float length_scale = 1.0; |
| 23 | 29 | ||
| @@ -26,11 +32,15 @@ struct OfflineTtsKokoroModelConfig { | @@ -26,11 +32,15 @@ struct OfflineTtsKokoroModelConfig { | ||
  // `lexicon` may list multiple comma-separated files; `lexicon` and
  // `dict_dir` are used only by multilingual Kokoro models (>= v1.0) and may
  // be empty for the English-only v0.19 model.
  OfflineTtsKokoroModelConfig(const std::string &model,
                              const std::string &voices,
                              const std::string &tokens,
                              const std::string &lexicon,
                              const std::string &data_dir,
                              const std::string &dict_dir, float length_scale)
      : model(model),
        voices(voices),
        tokens(tokens),
        lexicon(lexicon),
        data_dir(data_dir),
        dict_dir(dict_dir),
        length_scale(length_scale) {}
| 35 | 45 | ||
| 36 | void Register(ParseOptions *po); | 46 | void Register(ParseOptions *po); |
| @@ -32,8 +32,7 @@ | @@ -32,8 +32,7 @@ | ||
| 32 | 32 | ||
| 33 | namespace sherpa_onnx { | 33 | namespace sherpa_onnx { |
| 34 | 34 | ||
| 35 | -static void CallPhonemizeEspeak( | ||
| 36 | - const std::string &text, | 35 | +void CallPhonemizeEspeak(const std::string &text, |
| 37 | piper::eSpeakPhonemeConfig &config, // NOLINT | 36 | piper::eSpeakPhonemeConfig &config, // NOLINT |
| 38 | std::vector<std::vector<piper::Phoneme>> *phonemes) { | 37 | std::vector<std::vector<piper::Phoneme>> *phonemes) { |
| 39 | static std::mutex espeak_mutex; | 38 | static std::mutex espeak_mutex; |
| @@ -245,7 +244,7 @@ static std::vector<int64_t> CoquiPhonemesToIds( | @@ -245,7 +244,7 @@ static std::vector<int64_t> CoquiPhonemesToIds( | ||
| 245 | return ans; | 244 | return ans; |
| 246 | } | 245 | } |
| 247 | 246 | ||
| 248 | -static void InitEspeak(const std::string &data_dir) { | 247 | +void InitEspeak(const std::string &data_dir) { |
| 249 | static std::once_flag init_flag; | 248 | static std::once_flag init_flag; |
| 250 | std::call_once(init_flag, [data_dir]() { | 249 | std::call_once(init_flag, [data_dir]() { |
| 251 | int32_t result = | 250 | int32_t result = |
| @@ -241,7 +241,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/, | @@ -241,7 +241,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/, | ||
| 241 | jlong ptr, jstring text, | 241 | jlong ptr, jstring text, |
| 242 | jint sid, jfloat speed) { | 242 | jint sid, jfloat speed) { |
| 243 | const char *p_text = env->GetStringUTFChars(text, nullptr); | 243 | const char *p_text = env->GetStringUTFChars(text, nullptr); |
| 244 | - SHERPA_ONNX_LOGE("string is: %s", p_text); | ||
| 245 | 244 | ||
| 246 | auto audio = reinterpret_cast<sherpa_onnx::OfflineTts *>(ptr)->Generate( | 245 | auto audio = reinterpret_cast<sherpa_onnx::OfflineTts *>(ptr)->Generate( |
| 247 | p_text, sid, speed); | 246 | p_text, sid, speed); |
| @@ -267,7 +266,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl( | @@ -267,7 +266,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl( | ||
| 267 | JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid, | 266 | JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid, |
| 268 | jfloat speed, jobject callback) { | 267 | jfloat speed, jobject callback) { |
| 269 | const char *p_text = env->GetStringUTFChars(text, nullptr); | 268 | const char *p_text = env->GetStringUTFChars(text, nullptr); |
| 270 | - SHERPA_ONNX_LOGE("string is: %s", p_text); | ||
| 271 | 269 | ||
| 272 | std::function<int32_t(const float *, int32_t, float)> callback_wrapper = | 270 | std::function<int32_t(const float *, int32_t, float)> callback_wrapper = |
| 273 | [env, callback](const float *samples, int32_t n, | 271 | [env, callback](const float *samples, int32_t n, |
| @@ -16,13 +16,17 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) { | @@ -16,13 +16,17 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) { | ||
| 16 | py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig") | 16 | py::class_<PyClass>(*m, "OfflineTtsKokoroModelConfig") |
| 17 | .def(py::init<>()) | 17 | .def(py::init<>()) |
| 18 | .def(py::init<const std::string &, const std::string &, | 18 | .def(py::init<const std::string &, const std::string &, |
| 19 | + const std::string &, const std::string &, | ||
| 19 | const std::string &, const std::string &, float>(), | 20 | const std::string &, const std::string &, float>(), |
| 20 | py::arg("model"), py::arg("voices"), py::arg("tokens"), | 21 | py::arg("model"), py::arg("voices"), py::arg("tokens"), |
| 21 | - py::arg("data_dir"), py::arg("length_scale") = 1.0) | 22 | + py::arg("lexicon") = "", py::arg("data_dir"), |
| 23 | + py::arg("dict_dir") = "", py::arg("length_scale") = 1.0) | ||
| 22 | .def_readwrite("model", &PyClass::model) | 24 | .def_readwrite("model", &PyClass::model) |
| 23 | .def_readwrite("voices", &PyClass::voices) | 25 | .def_readwrite("voices", &PyClass::voices) |
| 24 | .def_readwrite("tokens", &PyClass::tokens) | 26 | .def_readwrite("tokens", &PyClass::tokens) |
| 27 | + .def_readwrite("lexicon", &PyClass::lexicon) | ||
| 25 | .def_readwrite("data_dir", &PyClass::data_dir) | 28 | .def_readwrite("data_dir", &PyClass::data_dir) |
| 29 | + .def_readwrite("dict_dir", &PyClass::dict_dir) | ||
| 26 | .def_readwrite("length_scale", &PyClass::length_scale) | 30 | .def_readwrite("length_scale", &PyClass::length_scale) |
| 27 | .def("__str__", &PyClass::ToString) | 31 | .def("__str__", &PyClass::ToString) |
| 28 | .def("validate", &PyClass::Validate); | 32 | .def("validate", &PyClass::Validate); |
-
请 注册 或 登录 后发表评论