Committed by
GitHub
Support Matcha-TTS models using espeak-ng (#1672)
正在显示
10 个修改的文件
包含
288 行增加
和
57 行删除
| @@ -19,6 +19,28 @@ which $EXE | @@ -19,6 +19,28 @@ which $EXE | ||
| 19 | mkdir ./tts | 19 | mkdir ./tts |
| 20 | 20 | ||
| 21 | log "------------------------------------------------------------" | 21 | log "------------------------------------------------------------" |
| 22 | +log "matcha-icefall-en_US-ljspeech" | ||
| 23 | +log "------------------------------------------------------------" | ||
| 24 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 25 | +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 26 | +rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 27 | + | ||
| 28 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 29 | + | ||
| 30 | +$EXE \ | ||
| 31 | + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ | ||
| 32 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 33 | + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ | ||
| 34 | + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ | ||
| 35 | + --num-threads=2 \ | ||
| 36 | + --output-filename=./tts/matcha-ljspeech-1.wav \ | ||
| 37 | + --debug=1 \ | ||
| 38 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 39 | + | ||
| 40 | +rm hifigan_v2.onnx | ||
| 41 | +rm -rf matcha-icefall-en_US-ljspeech | ||
| 42 | + | ||
| 43 | +log "------------------------------------------------------------" | ||
| 22 | log "matcha-icefall-zh-baker" | 44 | log "matcha-icefall-zh-baker" |
| 23 | log "------------------------------------------------------------" | 45 | log "------------------------------------------------------------" |
| 24 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 46 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| @@ -267,7 +267,27 @@ log "Offline TTS test" | @@ -267,7 +267,27 @@ log "Offline TTS test" | ||
| 267 | # test waves are saved in ./tts | 267 | # test waves are saved in ./tts |
| 268 | mkdir ./tts | 268 | mkdir ./tts |
| 269 | 269 | ||
| 270 | -log "vits-ljs test" | 270 | +log "matcha-ljspeech-en test" |
| 271 | + | ||
| 272 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 273 | +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 274 | +rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 275 | + | ||
| 276 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 277 | + | ||
| 278 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 279 | + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ | ||
| 280 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 281 | + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ | ||
| 282 | + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ | ||
| 283 | + --output-filename=./tts/test-matcha-ljspeech-en.wav \ | ||
| 284 | + --num-threads=2 \ | ||
| 285 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 286 | + | ||
| 287 | +rm hifigan_v2.onnx | ||
| 288 | +rm -rf matcha-icefall-en_US-ljspeech | ||
| 289 | + | ||
| 290 | +log "matcha-baker-zh test" | ||
| 271 | 291 | ||
| 272 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 292 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 273 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 293 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| @@ -282,12 +302,13 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -282,12 +302,13 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 282 | --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ | 302 | --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ |
| 283 | --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ | 303 | --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ |
| 284 | --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ | 304 | --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ |
| 285 | - --output-filename=./tts/test-matcha.wav \ | 305 | + --output-filename=./tts/test-matcha-baker-zh.wav \ |
| 286 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | 306 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" |
| 287 | 307 | ||
| 288 | rm -rf matcha-icefall-zh-baker | 308 | rm -rf matcha-icefall-zh-baker |
| 289 | rm hifigan_v2.onnx | 309 | rm hifigan_v2.onnx |
| 290 | 310 | ||
| 311 | +log "vits-ljs test" | ||
| 291 | 312 | ||
| 292 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx | 313 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx |
| 293 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt | 314 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt |
| @@ -11,7 +11,7 @@ while the model is still generating. | @@ -11,7 +11,7 @@ while the model is still generating. | ||
| 11 | 11 | ||
| 12 | Usage: | 12 | Usage: |
| 13 | 13 | ||
| 14 | -Example (1/4) | 14 | +Example (1/5) |
| 15 | 15 | ||
| 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 23 | --output-filename=./generated.wav \ | 23 | --output-filename=./generated.wav \ |
| 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 25 | 25 | ||
| 26 | -Example (2/4) | 26 | +Example (2/5) |
| 27 | 27 | ||
| 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 | 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 |
| 29 | tar xvf vits-zh-aishell3.tar.bz2 | 29 | tar xvf vits-zh-aishell3.tar.bz2 |
| @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 37 | --output-filename=./liubei-21.wav \ | 37 | --output-filename=./liubei-21.wav \ |
| 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 39 | 39 | ||
| 40 | -Example (3/4) | 40 | +Example (3/5) |
| 41 | 41 | ||
| 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 53 | --output-filename=./test-2.wav \ | 53 | --output-filename=./test-2.wav \ |
| 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 55 | 55 | ||
| 56 | -Example (4/4) | 56 | +Example (4/5) |
| 57 | 57 | ||
| 58 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 58 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 59 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 59 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| @@ -71,6 +71,23 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -71,6 +71,23 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 71 | --output-filename=./test-matcha.wav \ | 71 | --output-filename=./test-matcha.wav \ |
| 72 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | 72 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" |
| 73 | 73 | ||
| 74 | +Example (5/5) | ||
| 75 | + | ||
| 76 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 77 | +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 78 | +rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 79 | + | ||
| 80 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 81 | + | ||
| 82 | +python3 ./python-api-examples/offline-tts-play.py \ | ||
| 83 | + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ | ||
| 84 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 85 | + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ | ||
| 86 | + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ | ||
| 87 | + --output-filename=./test-matcha-ljspeech-en.wav \ | ||
| 88 | + --num-threads=2 \ | ||
| 89 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 90 | + | ||
| 74 | 91 | ||
| 75 | You can find more models at | 92 | You can find more models at |
| 76 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 93 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| @@ -12,7 +12,7 @@ generated audio. | @@ -12,7 +12,7 @@ generated audio. | ||
| 12 | 12 | ||
| 13 | Usage: | 13 | Usage: |
| 14 | 14 | ||
| 15 | -Example (1/4) | 15 | +Example (1/5) |
| 16 | 16 | ||
| 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 24 | --output-filename=./generated.wav \ | 24 | --output-filename=./generated.wav \ |
| 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 26 | 26 | ||
| 27 | -Example (2/4) | 27 | +Example (2/5) |
| 28 | 28 | ||
| 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 | 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 |
| @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 38 | --output-filename=./liubei-21.wav \ | 38 | --output-filename=./liubei-21.wav \ |
| 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 40 | 40 | ||
| 41 | -Example (3/4) | 41 | +Example (3/5) |
| 42 | 42 | ||
| 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 54 | --output-filename=./test-2.wav \ | 54 | --output-filename=./test-2.wav \ |
| 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 56 | 56 | ||
| 57 | -Example (4/4) | 57 | +Example (4/5) |
| 58 | 58 | ||
| 59 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 59 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 60 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 60 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| @@ -72,6 +72,23 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -72,6 +72,23 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 72 | --output-filename=./test-matcha.wav \ | 72 | --output-filename=./test-matcha.wav \ |
| 73 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | 73 | "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" |
| 74 | 74 | ||
| 75 | +Example (5/5) | ||
| 76 | + | ||
| 77 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 78 | +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 79 | +rm matcha-icefall-en_US-ljspeech.tar.bz2 | ||
| 80 | + | ||
| 81 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 82 | + | ||
| 83 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 84 | + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ | ||
| 85 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 86 | + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ | ||
| 87 | + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ | ||
| 88 | + --output-filename=./test-matcha-ljspeech-en.wav \ | ||
| 89 | + --num-threads=2 \ | ||
| 90 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 91 | + | ||
| 75 | You can find more models at | 92 | You can find more models at |
| 76 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 93 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 77 | 94 |
| @@ -49,19 +49,21 @@ | @@ -49,19 +49,21 @@ | ||
| 49 | } while (0) | 49 | } while (0) |
| 50 | #endif | 50 | #endif |
| 51 | 51 | ||
| 52 | +#define SHERPA_ONNX_EXIT(code) exit(code) | ||
| 53 | + | ||
| 52 | // Read an integer | 54 | // Read an integer |
| 53 | #define SHERPA_ONNX_READ_META_DATA(dst, src_key) \ | 55 | #define SHERPA_ONNX_READ_META_DATA(dst, src_key) \ |
| 54 | do { \ | 56 | do { \ |
| 55 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ | 57 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ |
| 56 | if (value.empty()) { \ | 58 | if (value.empty()) { \ |
| 57 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ | 59 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ |
| 58 | - exit(-1); \ | 60 | + SHERPA_ONNX_EXIT(-1); \ |
| 59 | } \ | 61 | } \ |
| 60 | \ | 62 | \ |
| 61 | dst = atoi(value.c_str()); \ | 63 | dst = atoi(value.c_str()); \ |
| 62 | if (dst < 0) { \ | 64 | if (dst < 0) { \ |
| 63 | SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \ | 65 | SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \ |
| 64 | - exit(-1); \ | 66 | + SHERPA_ONNX_EXIT(-1); \ |
| 65 | } \ | 67 | } \ |
| 66 | } while (0) | 68 | } while (0) |
| 67 | 69 | ||
| @@ -74,7 +76,7 @@ | @@ -74,7 +76,7 @@ | ||
| 74 | dst = atoi(value.c_str()); \ | 76 | dst = atoi(value.c_str()); \ |
| 75 | if (dst < 0) { \ | 77 | if (dst < 0) { \ |
| 76 | SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \ | 78 | SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \ |
| 77 | - exit(-1); \ | 79 | + SHERPA_ONNX_EXIT(-1); \ |
| 78 | } \ | 80 | } \ |
| 79 | } \ | 81 | } \ |
| 80 | } while (0) | 82 | } while (0) |
| @@ -85,13 +87,13 @@ | @@ -85,13 +87,13 @@ | ||
| 85 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ | 87 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ |
| 86 | if (value.empty()) { \ | 88 | if (value.empty()) { \ |
| 87 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ | 89 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ |
| 88 | - exit(-1); \ | 90 | + SHERPA_ONNX_EXIT(-1); \ |
| 89 | } \ | 91 | } \ |
| 90 | \ | 92 | \ |
| 91 | bool ret = SplitStringToIntegers(value.c_str(), ",", true, &dst); \ | 93 | bool ret = SplitStringToIntegers(value.c_str(), ",", true, &dst); \ |
| 92 | if (!ret) { \ | 94 | if (!ret) { \ |
| 93 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \ | 95 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \ |
| 94 | - exit(-1); \ | 96 | + SHERPA_ONNX_EXIT(-1); \ |
| 95 | } \ | 97 | } \ |
| 96 | } while (0) | 98 | } while (0) |
| 97 | 99 | ||
| @@ -101,13 +103,13 @@ | @@ -101,13 +103,13 @@ | ||
| 101 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ | 103 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ |
| 102 | if (value.empty()) { \ | 104 | if (value.empty()) { \ |
| 103 | SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key); \ | 105 | SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key); \ |
| 104 | - exit(-1); \ | 106 | + SHERPA_ONNX_EXIT(-1); \ |
| 105 | } \ | 107 | } \ |
| 106 | \ | 108 | \ |
| 107 | bool ret = SplitStringToFloats(value.c_str(), ",", true, &dst); \ | 109 | bool ret = SplitStringToFloats(value.c_str(), ",", true, &dst); \ |
| 108 | if (!ret) { \ | 110 | if (!ret) { \ |
| 109 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \ | 111 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \ |
| 110 | - exit(-1); \ | 112 | + SHERPA_ONNX_EXIT(-1); \ |
| 111 | } \ | 113 | } \ |
| 112 | } while (0) | 114 | } while (0) |
| 113 | 115 | ||
| @@ -117,14 +119,14 @@ | @@ -117,14 +119,14 @@ | ||
| 117 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ | 119 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ |
| 118 | if (value.empty()) { \ | 120 | if (value.empty()) { \ |
| 119 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ | 121 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ |
| 120 | - exit(-1); \ | 122 | + SHERPA_ONNX_EXIT(-1); \ |
| 121 | } \ | 123 | } \ |
| 122 | SplitStringToVector(value.c_str(), ",", false, &dst); \ | 124 | SplitStringToVector(value.c_str(), ",", false, &dst); \ |
| 123 | \ | 125 | \ |
| 124 | if (dst.empty()) { \ | 126 | if (dst.empty()) { \ |
| 125 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \ | 127 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \ |
| 126 | value.c_str(), src_key); \ | 128 | value.c_str(), src_key); \ |
| 127 | - exit(-1); \ | 129 | + SHERPA_ONNX_EXIT(-1); \ |
| 128 | } \ | 130 | } \ |
| 129 | } while (0) | 131 | } while (0) |
| 130 | 132 | ||
| @@ -134,14 +136,14 @@ | @@ -134,14 +136,14 @@ | ||
| 134 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ | 136 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ |
| 135 | if (value.empty()) { \ | 137 | if (value.empty()) { \ |
| 136 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ | 138 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ |
| 137 | - exit(-1); \ | 139 | + SHERPA_ONNX_EXIT(-1); \ |
| 138 | } \ | 140 | } \ |
| 139 | SplitStringToVector(value.c_str(), sep, false, &dst); \ | 141 | SplitStringToVector(value.c_str(), sep, false, &dst); \ |
| 140 | \ | 142 | \ |
| 141 | if (dst.empty()) { \ | 143 | if (dst.empty()) { \ |
| 142 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \ | 144 | SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \ |
| 143 | value.c_str(), src_key); \ | 145 | value.c_str(), src_key); \ |
| 144 | - exit(-1); \ | 146 | + SHERPA_ONNX_EXIT(-1); \ |
| 145 | } \ | 147 | } \ |
| 146 | } while (0) | 148 | } while (0) |
| 147 | 149 | ||
| @@ -151,13 +153,13 @@ | @@ -151,13 +153,13 @@ | ||
| 151 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ | 153 | auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \ |
| 152 | if (value.empty()) { \ | 154 | if (value.empty()) { \ |
| 153 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ | 155 | SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \ |
| 154 | - exit(-1); \ | 156 | + SHERPA_ONNX_EXIT(-1); \ |
| 155 | } \ | 157 | } \ |
| 156 | \ | 158 | \ |
| 157 | dst = std::move(value); \ | 159 | dst = std::move(value); \ |
| 158 | if (dst.empty()) { \ | 160 | if (dst.empty()) { \ |
| 159 | SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \ | 161 | SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \ |
| 160 | - exit(-1); \ | 162 | + SHERPA_ONNX_EXIT(-1); \ |
| 161 | } \ | 163 | } \ |
| 162 | } while (0) | 164 | } while (0) |
| 163 | 165 | ||
| @@ -178,11 +180,9 @@ | @@ -178,11 +180,9 @@ | ||
| 178 | dst = std::move(value); \ | 180 | dst = std::move(value); \ |
| 179 | if (dst.empty()) { \ | 181 | if (dst.empty()) { \ |
| 180 | SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \ | 182 | SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \ |
| 181 | - exit(-1); \ | 183 | + SHERPA_ONNX_EXIT(-1); \ |
| 182 | } \ | 184 | } \ |
| 183 | } \ | 185 | } \ |
| 184 | } while (0) | 186 | } while (0) |
| 185 | 187 | ||
| 186 | -#define SHERPA_ONNX_EXIT(code) exit(code) | ||
| 187 | - | ||
| 188 | #endif // SHERPA_ONNX_CSRC_MACROS_H_ | 188 | #endif // SHERPA_ONNX_CSRC_MACROS_H_ |
| @@ -321,12 +321,45 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | @@ -321,12 +321,45 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | ||
| 321 | 321 | ||
| 322 | private: | 322 | private: |
| 323 | template <typename Manager> | 323 | template <typename Manager> |
| 324 | - void InitFrontend(Manager *mgr) {} | 324 | + void InitFrontend(Manager *mgr) { |
| 325 | + // for piper phonemizer | ||
| 326 | + // we require that you copy espeak_ng_data | ||
| 327 | + // from assets to disk | ||
| 328 | + // | ||
| 329 | + // for jieba | ||
| 330 | + // we require that you copy tokens.txt, lexicon.txt and dict | ||
| 331 | + // from assets to disk | ||
| 332 | + const auto &meta_data = model_->GetMetaData(); | ||
| 333 | + | ||
| 334 | + if (meta_data.jieba && !meta_data.has_espeak) { | ||
| 335 | + frontend_ = std::make_unique<JiebaLexicon>( | ||
| 336 | + config_.model.matcha.lexicon, config_.model.matcha.tokens, | ||
| 337 | + config_.model.matcha.dict_dir, config_.model.debug); | ||
| 338 | + } else if (meta_data.has_espeak && !meta_data.jieba) { | ||
| 339 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 340 | + mgr, config_.model.matcha.tokens, config_.model.matcha.data_dir, | ||
| 341 | + meta_data); | ||
| 342 | + } else { | ||
| 343 | + SHERPA_ONNX_LOGE("jieba + espeaker-ng is not supported yet"); | ||
| 344 | + SHERPA_ONNX_EXIT(-1); | ||
| 345 | + } | ||
| 346 | + } | ||
| 325 | 347 | ||
| 326 | void InitFrontend() { | 348 | void InitFrontend() { |
| 327 | - frontend_ = std::make_unique<JiebaLexicon>( | ||
| 328 | - config_.model.matcha.lexicon, config_.model.matcha.tokens, | ||
| 329 | - config_.model.matcha.dict_dir, config_.model.debug); | 349 | + const auto &meta_data = model_->GetMetaData(); |
| 350 | + | ||
| 351 | + if (meta_data.jieba && !meta_data.has_espeak) { | ||
| 352 | + frontend_ = std::make_unique<JiebaLexicon>( | ||
| 353 | + config_.model.matcha.lexicon, config_.model.matcha.tokens, | ||
| 354 | + config_.model.matcha.dict_dir, config_.model.debug); | ||
| 355 | + } else if (meta_data.has_espeak && !meta_data.jieba) { | ||
| 356 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 357 | + config_.model.matcha.tokens, config_.model.matcha.data_dir, | ||
| 358 | + meta_data); | ||
| 359 | + } else { | ||
| 360 | + SHERPA_ONNX_LOGE("jieba + espeaker-ng is not supported yet"); | ||
| 361 | + SHERPA_ONNX_EXIT(-1); | ||
| 362 | + } | ||
| 330 | } | 363 | } |
| 331 | 364 | ||
| 332 | GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, | 365 | GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, |
| @@ -18,7 +18,7 @@ struct OfflineTtsMatchaModelMetaData { | @@ -18,7 +18,7 @@ struct OfflineTtsMatchaModelMetaData { | ||
| 18 | int32_t num_speakers = 0; | 18 | int32_t num_speakers = 0; |
| 19 | int32_t version = 1; | 19 | int32_t version = 1; |
| 20 | int32_t jieba = 0; | 20 | int32_t jieba = 0; |
| 21 | - int32_t espeak = 0; | 21 | + int32_t has_espeak = 0; |
| 22 | int32_t use_eos_bos = 0; | 22 | int32_t use_eos_bos = 0; |
| 23 | int32_t pad_id = 0; | 23 | int32_t pad_id = 0; |
| 24 | }; | 24 | }; |
| @@ -142,7 +142,7 @@ class OfflineTtsMatchaModel::Impl { | @@ -142,7 +142,7 @@ class OfflineTtsMatchaModel::Impl { | ||
| 142 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); | 142 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); |
| 143 | SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); | 143 | SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); |
| 144 | SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba"); | 144 | SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba"); |
| 145 | - SHERPA_ONNX_READ_META_DATA(meta_data_.espeak, "has_espeak"); | 145 | + SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); |
| 146 | SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); | 146 | SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); |
| 147 | SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); | 147 | SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); |
| 148 | } | 148 | } |
| @@ -32,6 +32,18 @@ | @@ -32,6 +32,18 @@ | ||
| 32 | 32 | ||
| 33 | namespace sherpa_onnx { | 33 | namespace sherpa_onnx { |
| 34 | 34 | ||
| 35 | +static void CallPhonemizeEspeak( | ||
| 36 | + const std::string &text, | ||
| 37 | + piper::eSpeakPhonemeConfig &config, // NOLINT | ||
| 38 | + std::vector<std::vector<piper::Phoneme>> *phonemes) { | ||
| 39 | + static std::mutex espeak_mutex; | ||
| 40 | + | ||
| 41 | + std::lock_guard<std::mutex> lock(espeak_mutex); | ||
| 42 | + | ||
| 43 | + // keep multi threads from calling into piper::phonemize_eSpeak | ||
| 44 | + piper::phonemize_eSpeak(text, config, *phonemes); | ||
| 45 | +} | ||
| 46 | + | ||
| 35 | static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) { | 47 | static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) { |
| 36 | std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; | 48 | std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; |
| 37 | std::unordered_map<char32_t, int32_t> token2id; | 49 | std::unordered_map<char32_t, int32_t> token2id; |
| @@ -87,7 +99,7 @@ static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) { | @@ -87,7 +99,7 @@ static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) { | ||
| 87 | 99 | ||
| 88 | // see the function "phonemes_to_ids" from | 100 | // see the function "phonemes_to_ids" from |
| 89 | // https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb | 101 | // https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb |
| 90 | -static std::vector<int64_t> PiperPhonemesToIds( | 102 | +static std::vector<int64_t> PiperPhonemesToIdsVits( |
| 91 | const std::unordered_map<char32_t, int32_t> &token2id, | 103 | const std::unordered_map<char32_t, int32_t> &token2id, |
| 92 | const std::vector<piper::Phoneme> &phonemes) { | 104 | const std::vector<piper::Phoneme> &phonemes) { |
| 93 | // see | 105 | // see |
| @@ -114,17 +126,46 @@ static std::vector<int64_t> PiperPhonemesToIds( | @@ -114,17 +126,46 @@ static std::vector<int64_t> PiperPhonemesToIds( | ||
| 114 | return ans; | 126 | return ans; |
| 115 | } | 127 | } |
| 116 | 128 | ||
| 129 | +static std::vector<int64_t> PiperPhonemesToIdsMatcha( | ||
| 130 | + const std::unordered_map<char32_t, int32_t> &token2id, | ||
| 131 | + const std::vector<piper::Phoneme> &phonemes, bool use_eos_bos) { | ||
| 132 | + std::vector<int64_t> ans; | ||
| 133 | + ans.reserve(phonemes.size()); | ||
| 134 | + | ||
| 135 | + int32_t bos = token2id.at(U'^'); | ||
| 136 | + int32_t eos = token2id.at(U'$'); | ||
| 137 | + | ||
| 138 | + if (use_eos_bos) { | ||
| 139 | + ans.push_back(bos); | ||
| 140 | + } | ||
| 141 | + | ||
| 142 | + for (auto p : phonemes) { | ||
| 143 | + if (token2id.count(p)) { | ||
| 144 | + ans.push_back(token2id.at(p)); | ||
| 145 | + } else { | ||
| 146 | + SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.", | ||
| 147 | + static_cast<uint32_t>(p)); | ||
| 148 | + } | ||
| 149 | + } | ||
| 150 | + | ||
| 151 | + if (use_eos_bos) { | ||
| 152 | + ans.push_back(eos); | ||
| 153 | + } | ||
| 154 | + | ||
| 155 | + return ans; | ||
| 156 | +} | ||
| 157 | + | ||
| 117 | static std::vector<int64_t> CoquiPhonemesToIds( | 158 | static std::vector<int64_t> CoquiPhonemesToIds( |
| 118 | const std::unordered_map<char32_t, int32_t> &token2id, | 159 | const std::unordered_map<char32_t, int32_t> &token2id, |
| 119 | const std::vector<piper::Phoneme> &phonemes, | 160 | const std::vector<piper::Phoneme> &phonemes, |
| 120 | - const OfflineTtsVitsModelMetaData &meta_data) { | 161 | + const OfflineTtsVitsModelMetaData &vits_meta_data) { |
| 121 | // see | 162 | // see |
| 122 | // https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py#L87 | 163 | // https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py#L87 |
| 123 | - int32_t use_eos_bos = meta_data.use_eos_bos; | ||
| 124 | - int32_t bos_id = meta_data.bos_id; | ||
| 125 | - int32_t eos_id = meta_data.eos_id; | ||
| 126 | - int32_t blank_id = meta_data.blank_id; | ||
| 127 | - int32_t add_blank = meta_data.add_blank; | 164 | + int32_t use_eos_bos = vits_meta_data.use_eos_bos; |
| 165 | + int32_t bos_id = vits_meta_data.bos_id; | ||
| 166 | + int32_t eos_id = vits_meta_data.eos_id; | ||
| 167 | + int32_t blank_id = vits_meta_data.blank_id; | ||
| 168 | + int32_t add_blank = vits_meta_data.add_blank; | ||
| 128 | int32_t comma_id = token2id.at(','); | 169 | int32_t comma_id = token2id.at(','); |
| 129 | 170 | ||
| 130 | std::vector<int64_t> ans; | 171 | std::vector<int64_t> ans; |
| @@ -189,8 +230,37 @@ static void InitEspeak(const std::string &data_dir) { | @@ -189,8 +230,37 @@ static void InitEspeak(const std::string &data_dir) { | ||
| 189 | 230 | ||
| 190 | PiperPhonemizeLexicon::PiperPhonemizeLexicon( | 231 | PiperPhonemizeLexicon::PiperPhonemizeLexicon( |
| 191 | const std::string &tokens, const std::string &data_dir, | 232 | const std::string &tokens, const std::string &data_dir, |
| 192 | - const OfflineTtsVitsModelMetaData &meta_data) | ||
| 193 | - : meta_data_(meta_data) { | 233 | + const OfflineTtsVitsModelMetaData &vits_meta_data) |
| 234 | + : vits_meta_data_(vits_meta_data) { | ||
| 235 | + { | ||
| 236 | + std::ifstream is(tokens); | ||
| 237 | + token2id_ = ReadTokens(is); | ||
| 238 | + } | ||
| 239 | + | ||
| 240 | + InitEspeak(data_dir); | ||
| 241 | +} | ||
| 242 | + | ||
| 243 | +template <typename Manager> | ||
| 244 | +PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 245 | + Manager *mgr, const std::string &tokens, const std::string &data_dir, | ||
| 246 | + const OfflineTtsVitsModelMetaData &vits_meta_data) | ||
| 247 | + : vits_meta_data_(vits_meta_data) { | ||
| 248 | + { | ||
| 249 | + auto buf = ReadFile(mgr, tokens); | ||
| 250 | + std::istrstream is(buf.data(), buf.size()); | ||
| 251 | + token2id_ = ReadTokens(is); | ||
| 252 | + } | ||
| 253 | + | ||
| 254 | + // We should copy the directory of espeak-ng-data from the asset to | ||
| 255 | + // some internal or external storage and then pass the directory to | ||
| 256 | + // data_dir. | ||
| 257 | + InitEspeak(data_dir); | ||
| 258 | +} | ||
| 259 | + | ||
| 260 | +PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 261 | + const std::string &tokens, const std::string &data_dir, | ||
| 262 | + const OfflineTtsMatchaModelMetaData &matcha_meta_data) | ||
| 263 | + : matcha_meta_data_(matcha_meta_data), is_matcha_(true) { | ||
| 194 | { | 264 | { |
| 195 | std::ifstream is(tokens); | 265 | std::ifstream is(tokens); |
| 196 | token2id_ = ReadTokens(is); | 266 | token2id_ = ReadTokens(is); |
| @@ -202,8 +272,8 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | @@ -202,8 +272,8 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 202 | template <typename Manager> | 272 | template <typename Manager> |
| 203 | PiperPhonemizeLexicon::PiperPhonemizeLexicon( | 273 | PiperPhonemizeLexicon::PiperPhonemizeLexicon( |
| 204 | Manager *mgr, const std::string &tokens, const std::string &data_dir, | 274 | Manager *mgr, const std::string &tokens, const std::string &data_dir, |
| 205 | - const OfflineTtsVitsModelMetaData &meta_data) | ||
| 206 | - : meta_data_(meta_data) { | 275 | + const OfflineTtsMatchaModelMetaData &matcha_meta_data) |
| 276 | + : matcha_meta_data_(matcha_meta_data), is_matcha_(true) { | ||
| 207 | { | 277 | { |
| 208 | auto buf = ReadFile(mgr, tokens); | 278 | auto buf = ReadFile(mgr, tokens); |
| 209 | std::istrstream is(buf.data(), buf.size()); | 279 | std::istrstream is(buf.data(), buf.size()); |
| @@ -218,6 +288,15 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | @@ -218,6 +288,15 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( | ||
| 218 | 288 | ||
| 219 | std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( | 289 | std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( |
| 220 | const std::string &text, const std::string &voice /*= ""*/) const { | 290 | const std::string &text, const std::string &voice /*= ""*/) const { |
| 291 | + if (is_matcha_) { | ||
| 292 | + return ConvertTextToTokenIdsMatcha(text, voice); | ||
| 293 | + } else { | ||
| 294 | + return ConvertTextToTokenIdsVits(text, voice); | ||
| 295 | + } | ||
| 296 | +} | ||
| 297 | + | ||
| 298 | +std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha( | ||
| 299 | + const std::string &text, const std::string &voice /*= ""*/) const { | ||
| 221 | piper::eSpeakPhonemeConfig config; | 300 | piper::eSpeakPhonemeConfig config; |
| 222 | 301 | ||
| 223 | // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices | 302 | // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices |
| @@ -226,26 +305,45 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( | @@ -226,26 +305,45 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( | ||
| 226 | 305 | ||
| 227 | std::vector<std::vector<piper::Phoneme>> phonemes; | 306 | std::vector<std::vector<piper::Phoneme>> phonemes; |
| 228 | 307 | ||
| 229 | - static std::mutex espeak_mutex; | ||
| 230 | - { | ||
| 231 | - std::lock_guard<std::mutex> lock(espeak_mutex); | 308 | + CallPhonemizeEspeak(text, config, &phonemes); |
| 232 | 309 | ||
| 233 | - // keep multi threads from calling into piper::phonemize_eSpeak | ||
| 234 | - piper::phonemize_eSpeak(text, config, phonemes); | 310 | + std::vector<TokenIDs> ans; |
| 311 | + | ||
| 312 | + std::vector<int64_t> phoneme_ids; | ||
| 313 | + | ||
| 314 | + for (const auto &p : phonemes) { | ||
| 315 | + phoneme_ids = | ||
| 316 | + PiperPhonemesToIdsMatcha(token2id_, p, matcha_meta_data_.use_eos_bos); | ||
| 317 | + ans.emplace_back(std::move(phoneme_ids)); | ||
| 235 | } | 318 | } |
| 236 | 319 | ||
| 320 | + return ans; | ||
| 321 | +} | ||
| 322 | + | ||
| 323 | +std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits( | ||
| 324 | + const std::string &text, const std::string &voice /*= ""*/) const { | ||
| 325 | + piper::eSpeakPhonemeConfig config; | ||
| 326 | + | ||
| 327 | + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices | ||
| 328 | + // to list available voices | ||
| 329 | + config.voice = voice; // e.g., voice is en-us | ||
| 330 | + | ||
| 331 | + std::vector<std::vector<piper::Phoneme>> phonemes; | ||
| 332 | + | ||
| 333 | + CallPhonemizeEspeak(text, config, &phonemes); | ||
| 334 | + | ||
| 237 | std::vector<TokenIDs> ans; | 335 | std::vector<TokenIDs> ans; |
| 238 | 336 | ||
| 239 | std::vector<int64_t> phoneme_ids; | 337 | std::vector<int64_t> phoneme_ids; |
| 240 | 338 | ||
| 241 | - if (meta_data_.is_piper || meta_data_.is_icefall) { | 339 | + if (vits_meta_data_.is_piper || vits_meta_data_.is_icefall) { |
| 242 | for (const auto &p : phonemes) { | 340 | for (const auto &p : phonemes) { |
| 243 | - phoneme_ids = PiperPhonemesToIds(token2id_, p); | 341 | + phoneme_ids = PiperPhonemesToIdsVits(token2id_, p); |
| 244 | ans.emplace_back(std::move(phoneme_ids)); | 342 | ans.emplace_back(std::move(phoneme_ids)); |
| 245 | } | 343 | } |
| 246 | - } else if (meta_data_.is_coqui) { | 344 | + } else if (vits_meta_data_.is_coqui) { |
| 247 | for (const auto &p : phonemes) { | 345 | for (const auto &p : phonemes) { |
| 248 | - phoneme_ids = CoquiPhonemesToIds(token2id_, p, meta_data_); | 346 | + phoneme_ids = CoquiPhonemesToIds(token2id_, p, vits_meta_data_); |
| 249 | ans.emplace_back(std::move(phoneme_ids)); | 347 | ans.emplace_back(std::move(phoneme_ids)); |
| 250 | } | 348 | } |
| 251 | 349 | ||
| @@ -260,13 +358,18 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( | @@ -260,13 +358,18 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds( | ||
#if __ANDROID_API__ >= 9
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
    const OfflineTtsVitsModelMetaData &vits_meta_data);

template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
    const OfflineTtsMatchaModelMetaData &matcha_meta_data);
#endif

#if __OHOS__
// Keep this section in sync with the Android one above. The diff replaced
// the VITS instantiation with the Matcha one instead of adding it, which
// would make OHOS builds constructing the lexicon with
// OfflineTtsVitsModelMetaData fail at link time (the template definition
// lives in this .cc file, so both instantiations must be explicit).
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    NativeResourceManager *mgr, const std::string &tokens,
    const std::string &data_dir,
    const OfflineTtsVitsModelMetaData &vits_meta_data);

template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    NativeResourceManager *mgr, const std::string &tokens,
    const std::string &data_dir,
    const OfflineTtsMatchaModelMetaData &matcha_meta_data);
#endif
| 271 | 374 | ||
| 272 | } // namespace sherpa_onnx | 375 | } // namespace sherpa_onnx |
| @@ -10,6 +10,7 @@ | @@ -10,6 +10,7 @@ | ||
| 10 | #include <vector> | 10 | #include <vector> |
| 11 | 11 | ||
| 12 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 12 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| 13 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h" | ||
| 13 | #include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | 14 | #include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" |
| 14 | 15 | ||
| 15 | namespace sherpa_onnx { | 16 | namespace sherpa_onnx { |
| @@ -17,20 +18,37 @@ namespace sherpa_onnx { | @@ -17,20 +18,37 @@ namespace sherpa_onnx { | ||
| 17 | class PiperPhonemizeLexicon : public OfflineTtsFrontend { | 18 | class PiperPhonemizeLexicon : public OfflineTtsFrontend { |
| 18 | public: | 19 | public: |
| 19 | PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, | 20 | PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, |
| 20 | - const OfflineTtsVitsModelMetaData &meta_data); | 21 | + const OfflineTtsVitsModelMetaData &vits_meta_data); |
| 22 | + | ||
| 23 | + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, | ||
| 24 | + const OfflineTtsMatchaModelMetaData &matcha_meta_data); | ||
| 21 | 25 | ||
| 22 | template <typename Manager> | 26 | template <typename Manager> |
| 23 | PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, | 27 | PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, |
| 24 | const std::string &data_dir, | 28 | const std::string &data_dir, |
| 25 | - const OfflineTtsVitsModelMetaData &meta_data); | 29 | + const OfflineTtsVitsModelMetaData &vits_meta_data); |
| 30 | + | ||
| 31 | + template <typename Manager> | ||
| 32 | + PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, | ||
| 33 | + const std::string &data_dir, | ||
| 34 | + const OfflineTtsMatchaModelMetaData &matcha_meta_data); | ||
| 26 | 35 | ||
| 27 | std::vector<TokenIDs> ConvertTextToTokenIds( | 36 | std::vector<TokenIDs> ConvertTextToTokenIds( |
| 28 | const std::string &text, const std::string &voice = "") const override; | 37 | const std::string &text, const std::string &voice = "") const override; |
| 29 | 38 | ||
| 30 | private: | 39 | private: |
| 40 | + std::vector<TokenIDs> ConvertTextToTokenIdsVits( | ||
| 41 | + const std::string &text, const std::string &voice = "") const; | ||
| 42 | + | ||
| 43 | + std::vector<TokenIDs> ConvertTextToTokenIdsMatcha( | ||
| 44 | + const std::string &text, const std::string &voice = "") const; | ||
| 45 | + | ||
| 46 | + private: | ||
| 31 | // map unicode codepoint to an integer ID | 47 | // map unicode codepoint to an integer ID |
| 32 | std::unordered_map<char32_t, int32_t> token2id_; | 48 | std::unordered_map<char32_t, int32_t> token2id_; |
| 33 | - OfflineTtsVitsModelMetaData meta_data_; | 49 | + OfflineTtsVitsModelMetaData vits_meta_data_; |
| 50 | + OfflineTtsMatchaModelMetaData matcha_meta_data_; | ||
| 51 | + bool is_matcha_ = false; | ||
| 34 | }; | 52 | }; |
| 35 | 53 | ||
| 36 | } // namespace sherpa_onnx | 54 | } // namespace sherpa_onnx |
-
请 注册 或 登录 后发表评论