Showing 33 changed files with 1397 additions and 86 deletions
| @@ -19,6 +19,40 @@ which $EXE | @@ -19,6 +19,40 @@ which $EXE | ||
| 19 | mkdir ./tts | 19 | mkdir ./tts |
| 20 | 20 | ||
| 21 | log "------------------------------------------------------------" | 21 | log "------------------------------------------------------------" |
| 22 | +log "matcha-icefall-zh-baker" | ||
| 23 | +log "------------------------------------------------------------" | ||
| 24 | +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 25 | +tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 26 | +rm matcha-icefall-zh-baker.tar.bz2 | ||
| 27 | + | ||
| 28 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 29 | + | ||
| 30 | +$EXE \ | ||
| 31 | + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ | ||
| 32 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 33 | + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ | ||
| 34 | + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ | ||
| 35 | + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ | ||
| 36 | + --num-threads=2 \ | ||
| 37 | + --debug=1 \ | ||
| 38 | + --output-filename=./tts/matcha-baker-zh-1.wav \ | ||
| 39 | + '小米的使命是,始终坚持做"感动人心、价格厚道"的好产品,让全球每个人都能享受科技带来的美好生活' | ||
| 40 | + | ||
| 41 | +$EXE \ | ||
| 42 | + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ | ||
| 43 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 44 | + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ | ||
| 45 | + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ | ||
| 46 | + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ | ||
| 47 | + --num-threads=2 \ | ||
| 48 | + --debug=1 \ | ||
| 49 | + --output-filename=./tts/matcha-baker-zh-2.wav \ | ||
| 50 | + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" | ||
| 51 | + | ||
| 52 | +rm hifigan_v2.onnx | ||
| 53 | +rm -rf matcha-icefall-zh-baker | ||
| 54 | + | ||
| 55 | +log "------------------------------------------------------------" | ||
| 22 | log "vits-piper-en_US-amy-low" | 56 | log "vits-piper-en_US-amy-low" |
| 23 | log "------------------------------------------------------------" | 57 | log "------------------------------------------------------------" |
| 24 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 58 | curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| @@ -269,6 +269,26 @@ mkdir ./tts | @@ -269,6 +269,26 @@ mkdir ./tts | ||
| 269 | 269 | ||
| 270 | log "vits-ljs test" | 270 | log "vits-ljs test" |
| 271 | 271 | ||
| 272 | +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 273 | +tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 274 | +rm matcha-icefall-zh-baker.tar.bz2 | ||
| 275 | + | ||
| 276 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 277 | + | ||
| 278 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 279 | + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ | ||
| 280 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 281 | + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ | ||
| 282 | + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ | ||
| 283 | + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ | ||
| 284 | + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ | ||
| 285 | + --output-filename=./tts/test-matcha.wav \ | ||
| 286 | + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | ||
| 287 | + | ||
| 288 | +rm -rf matcha-icefall-zh-baker | ||
| 289 | +rm hifigan_v2.onnx | ||
| 290 | + | ||
| 291 | + | ||
| 272 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx | 292 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx |
| 273 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt | 293 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt |
| 274 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt | 294 | curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt |
| @@ -149,6 +149,23 @@ jobs: | @@ -149,6 +149,23 @@ jobs: | ||
| 149 | name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} | 149 | name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} |
| 150 | path: install/* | 150 | path: install/* |
| 151 | 151 | ||
| 152 | + - name: Test offline TTS | ||
| 153 | + if: matrix.with_tts == 'ON' | ||
| 154 | + shell: bash | ||
| 155 | + run: | | ||
| 156 | + du -h -d1 . | ||
| 157 | + export PATH=$PWD/build/bin:$PATH | ||
| 158 | + export EXE=sherpa-onnx-offline-tts | ||
| 159 | + | ||
| 160 | + .github/scripts/test-offline-tts.sh | ||
| 161 | + du -h -d1 . | ||
| 162 | + | ||
| 163 | + - uses: actions/upload-artifact@v4 | ||
| 164 | + if: matrix.with_tts == 'ON' | ||
| 165 | + with: | ||
| 166 | + name: tts-generated-test-files-${{ matrix.build_type }}-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} | ||
| 167 | + path: tts | ||
| 168 | + | ||
| 152 | - name: Test offline Moonshine | 169 | - name: Test offline Moonshine |
| 153 | if: matrix.build_type != 'Debug' | 170 | if: matrix.build_type != 'Debug' |
| 154 | shell: bash | 171 | shell: bash |
| @@ -309,16 +326,7 @@ jobs: | @@ -309,16 +326,7 @@ jobs: | ||
| 309 | .github/scripts/test-offline-whisper.sh | 326 | .github/scripts/test-offline-whisper.sh |
| 310 | du -h -d1 . | 327 | du -h -d1 . |
| 311 | 328 | ||
| 312 | - - name: Test offline TTS | ||
| 313 | - if: matrix.with_tts == 'ON' | ||
| 314 | - shell: bash | ||
| 315 | - run: | | ||
| 316 | - du -h -d1 . | ||
| 317 | - export PATH=$PWD/build/bin:$PATH | ||
| 318 | - export EXE=sherpa-onnx-offline-tts | ||
| 319 | 329 | ||
| 320 | - .github/scripts/test-offline-tts.sh | ||
| 321 | - du -h -d1 . | ||
| 322 | 330 | ||
| 323 | - name: Test online paraformer | 331 | - name: Test online paraformer |
| 324 | shell: bash | 332 | shell: bash |
| @@ -367,8 +375,4 @@ jobs: | @@ -367,8 +375,4 @@ jobs: | ||
| 367 | overwrite: true | 375 | overwrite: true |
| 368 | file: sherpa-onnx-*.tar.bz2 | 376 | file: sherpa-onnx-*.tar.bz2 |
| 369 | 377 | ||
| 370 | - - uses: actions/upload-artifact@v4 | ||
| 371 | - with: | ||
| 372 | - name: tts-generated-test-files-${{ matrix.build_type }}-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} | ||
| 373 | - path: tts | ||
| 374 | 378 |
| @@ -121,6 +121,15 @@ jobs: | @@ -121,6 +121,15 @@ jobs: | ||
| 121 | otool -L build/bin/sherpa-onnx | 121 | otool -L build/bin/sherpa-onnx |
| 122 | otool -l build/bin/sherpa-onnx | 122 | otool -l build/bin/sherpa-onnx |
| 123 | 123 | ||
| 124 | + - name: Test offline TTS | ||
| 125 | + if: matrix.with_tts == 'ON' | ||
| 126 | + shell: bash | ||
| 127 | + run: | | ||
| 128 | + export PATH=$PWD/build/bin:$PATH | ||
| 129 | + export EXE=sherpa-onnx-offline-tts | ||
| 130 | + | ||
| 131 | + .github/scripts/test-offline-tts.sh | ||
| 132 | + | ||
| 124 | - name: Test offline Moonshine | 133 | - name: Test offline Moonshine |
| 125 | if: matrix.build_type != 'Debug' | 134 | if: matrix.build_type != 'Debug' |
| 126 | shell: bash | 135 | shell: bash |
| @@ -226,15 +235,6 @@ jobs: | @@ -226,15 +235,6 @@ jobs: | ||
| 226 | 235 | ||
| 227 | .github/scripts/test-kws.sh | 236 | .github/scripts/test-kws.sh |
| 228 | 237 | ||
| 229 | - - name: Test offline TTS | ||
| 230 | - if: matrix.with_tts == 'ON' | ||
| 231 | - shell: bash | ||
| 232 | - run: | | ||
| 233 | - export PATH=$PWD/build/bin:$PATH | ||
| 234 | - export EXE=sherpa-onnx-offline-tts | ||
| 235 | - | ||
| 236 | - .github/scripts/test-offline-tts.sh | ||
| 237 | - | ||
| 238 | - name: Test online paraformer | 238 | - name: Test online paraformer |
| 239 | shell: bash | 239 | shell: bash |
| 240 | run: | | 240 | run: | |
| @@ -11,7 +11,7 @@ while the model is still generating. | @@ -11,7 +11,7 @@ while the model is still generating. | ||
| 11 | 11 | ||
| 12 | Usage: | 12 | Usage: |
| 13 | 13 | ||
| 14 | -Example (1/3) | 14 | +Example (1/4) |
| 15 | 15 | ||
| 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 16 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 17 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 23 | --output-filename=./generated.wav \ | 23 | --output-filename=./generated.wav \ |
| 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 24 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 25 | 25 | ||
| 26 | -Example (2/3) | 26 | +Example (2/4) |
| 27 | 27 | ||
| 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 | 28 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 |
| 29 | tar xvf vits-zh-aishell3.tar.bz2 | 29 | tar xvf vits-zh-aishell3.tar.bz2 |
| @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 37 | --output-filename=./liubei-21.wav \ | 37 | --output-filename=./liubei-21.wav \ |
| 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 38 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 39 | 39 | ||
| 40 | -Example (3/3) | 40 | +Example (3/4) |
| 41 | 41 | ||
| 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 42 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -53,6 +53,24 @@ python3 ./python-api-examples/offline-tts-play.py \ | @@ -53,6 +53,24 @@ python3 ./python-api-examples/offline-tts-play.py \ | ||
| 53 | --output-filename=./test-2.wav \ | 53 | --output-filename=./test-2.wav \ |
| 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 54 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 55 | 55 | ||
| 56 | +Example (4/4) | ||
| 57 | + | ||
| 58 | +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 59 | +tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 60 | +rm matcha-icefall-zh-baker.tar.bz2 | ||
| 61 | + | ||
| 62 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 63 | + | ||
| 64 | +python3 ./python-api-examples/offline-tts-play.py \ | ||
| 65 | + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ | ||
| 66 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 67 | + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ | ||
| 68 | + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ | ||
| 69 | + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ | ||
| 70 | + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ | ||
| 71 | + --output-filename=./test-matcha.wav \ | ||
| 72 | + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | ||
| 73 | + | ||
| 56 | 74 | ||
| 57 | You can find more models at | 75 | You can find more models at |
| 58 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 76 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| @@ -84,14 +102,11 @@ except ImportError: | @@ -84,14 +102,11 @@ except ImportError: | ||
| 84 | sys.exit(-1) | 102 | sys.exit(-1) |
| 85 | 103 | ||
| 86 | 104 | ||
| 87 | -def get_args(): | ||
| 88 | - parser = argparse.ArgumentParser( | ||
| 89 | - formatter_class=argparse.ArgumentDefaultsHelpFormatter | ||
| 90 | - ) | ||
| 91 | - | 105 | +def add_vits_args(parser): |
| 92 | parser.add_argument( | 106 | parser.add_argument( |
| 93 | "--vits-model", | 107 | "--vits-model", |
| 94 | type=str, | 108 | type=str, |
| 109 | + default="", | ||
| 95 | help="Path to vits model.onnx", | 110 | help="Path to vits model.onnx", |
| 96 | ) | 111 | ) |
| 97 | 112 | ||
| @@ -124,6 +139,60 @@ def get_args(): | @@ -124,6 +139,60 @@ def get_args(): | ||
| 124 | help="Path to the dict directory for models using jieba", | 139 | help="Path to the dict directory for models using jieba", |
| 125 | ) | 140 | ) |
| 126 | 141 | ||
| 142 | + | ||
| 143 | +def add_matcha_args(parser): | ||
| 144 | + parser.add_argument( | ||
| 145 | + "--matcha-acoustic-model", | ||
| 146 | + type=str, | ||
| 147 | + default="", | ||
| 148 | + help="Path to model.onnx for matcha", | ||
| 149 | + ) | ||
| 150 | + | ||
| 151 | + parser.add_argument( | ||
| 152 | + "--matcha-vocoder", | ||
| 153 | + type=str, | ||
| 154 | + default="", | ||
| 155 | + help="Path to vocoder for matcha", | ||
| 156 | + ) | ||
| 157 | + | ||
| 158 | + parser.add_argument( | ||
| 159 | + "--matcha-lexicon", | ||
| 160 | + type=str, | ||
| 161 | + default="", | ||
| 162 | + help="Path to lexicon.txt for matcha", | ||
| 163 | + ) | ||
| 164 | + | ||
| 165 | + parser.add_argument( | ||
| 166 | + "--matcha-tokens", | ||
| 167 | + type=str, | ||
| 168 | + default="", | ||
| 169 | + help="Path to tokens.txt for matcha", | ||
| 170 | + ) | ||
| 171 | + | ||
| 172 | + parser.add_argument( | ||
| 173 | + "--matcha-data-dir", | ||
| 174 | + type=str, | ||
| 175 | + default="", | ||
| 176 | + help="""Path to the dict directory of espeak-ng. If it is specified, | ||
| 177 | + --matcha-lexicon and --matcha-tokens are ignored""", | ||
| 178 | + ) | ||
| 179 | + | ||
| 180 | + parser.add_argument( | ||
| 181 | + "--matcha-dict-dir", | ||
| 182 | + type=str, | ||
| 183 | + default="", | ||
| 184 | + help="Path to the dict directory for models using jieba", | ||
| 185 | + ) | ||
| 186 | + | ||
| 187 | + | ||
| 188 | +def get_args(): | ||
| 189 | + parser = argparse.ArgumentParser( | ||
| 190 | + formatter_class=argparse.ArgumentDefaultsHelpFormatter | ||
| 191 | + ) | ||
| 192 | + | ||
| 193 | + add_vits_args(parser) | ||
| 194 | + add_matcha_args(parser) | ||
| 195 | + | ||
| 127 | parser.add_argument( | 196 | parser.add_argument( |
| 128 | "--tts-rule-fsts", | 197 | "--tts-rule-fsts", |
| 129 | type=str, | 198 | type=str, |
| @@ -313,6 +382,14 @@ def main(): | @@ -313,6 +382,14 @@ def main(): | ||
| 313 | dict_dir=args.vits_dict_dir, | 382 | dict_dir=args.vits_dict_dir, |
| 314 | tokens=args.vits_tokens, | 383 | tokens=args.vits_tokens, |
| 315 | ), | 384 | ), |
| 385 | + matcha=sherpa_onnx.OfflineTtsMatchaModelConfig( | ||
| 386 | + acoustic_model=args.matcha_acoustic_model, | ||
| 387 | + vocoder=args.matcha_vocoder, | ||
| 388 | + lexicon=args.matcha_lexicon, | ||
| 389 | + tokens=args.matcha_tokens, | ||
| 390 | + data_dir=args.matcha_data_dir, | ||
| 391 | + dict_dir=args.matcha_dict_dir, | ||
| 392 | + ), | ||
| 316 | provider=args.provider, | 393 | provider=args.provider, |
| 317 | debug=args.debug, | 394 | debug=args.debug, |
| 318 | num_threads=args.num_threads, | 395 | num_threads=args.num_threads, |
| @@ -12,7 +12,7 @@ generated audio. | @@ -12,7 +12,7 @@ generated audio. | ||
| 12 | 12 | ||
| 13 | Usage: | 13 | Usage: |
| 14 | 14 | ||
| 15 | -Example (1/3) | 15 | +Example (1/4) |
| 16 | 16 | ||
| 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 17 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 18 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 24 | --output-filename=./generated.wav \ | 24 | --output-filename=./generated.wav \ |
| 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | 25 | "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." |
| 26 | 26 | ||
| 27 | -Example (2/3) | 27 | +Example (2/4) |
| 28 | 28 | ||
| 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 29 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 | 30 | tar xvf vits-icefall-zh-aishell3.tar.bz2 |
| @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 38 | --output-filename=./liubei-21.wav \ | 38 | --output-filename=./liubei-21.wav \ |
| 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" | 39 | "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" |
| 40 | 40 | ||
| 41 | -Example (3/3) | 41 | +Example (3/4) |
| 42 | 42 | ||
| 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 | 43 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 |
| 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 | 44 | tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 |
| @@ -54,6 +54,23 @@ python3 ./python-api-examples/offline-tts.py \ | @@ -54,6 +54,23 @@ python3 ./python-api-examples/offline-tts.py \ | ||
| 54 | --output-filename=./test-2.wav \ | 54 | --output-filename=./test-2.wav \ |
| 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" | 55 | "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" |
| 56 | 56 | ||
| 57 | +Example (4/4) | ||
| 58 | + | ||
| 59 | +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | ||
| 60 | +tar xvf matcha-icefall-zh-baker.tar.bz2 | ||
| 61 | +rm matcha-icefall-zh-baker.tar.bz2 | ||
| 62 | + | ||
| 63 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx | ||
| 64 | + | ||
| 65 | +python3 ./python-api-examples/offline-tts.py \ | ||
| 66 | + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ | ||
| 67 | + --matcha-vocoder=./hifigan_v2.onnx \ | ||
| 68 | + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ | ||
| 69 | + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ | ||
| 70 | + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ | ||
| 71 | + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ | ||
| 72 | + --output-filename=./test-matcha.wav \ | ||
| 73 | + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" | ||
| 57 | 74 | ||
| 58 | You can find more models at | 75 | You can find more models at |
| 59 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 76 | https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| @@ -71,14 +88,11 @@ import sherpa_onnx | @@ -71,14 +88,11 @@ import sherpa_onnx | ||
| 71 | import soundfile as sf | 88 | import soundfile as sf |
| 72 | 89 | ||
| 73 | 90 | ||
| 74 | -def get_args(): | ||
| 75 | - parser = argparse.ArgumentParser( | ||
| 76 | - formatter_class=argparse.ArgumentDefaultsHelpFormatter | ||
| 77 | - ) | ||
| 78 | - | 91 | +def add_vits_args(parser): |
| 79 | parser.add_argument( | 92 | parser.add_argument( |
| 80 | "--vits-model", | 93 | "--vits-model", |
| 81 | type=str, | 94 | type=str, |
| 95 | + default="", | ||
| 82 | help="Path to vits model.onnx", | 96 | help="Path to vits model.onnx", |
| 83 | ) | 97 | ) |
| 84 | 98 | ||
| @@ -111,6 +125,60 @@ def get_args(): | @@ -111,6 +125,60 @@ def get_args(): | ||
| 111 | help="Path to the dict directory for models using jieba", | 125 | help="Path to the dict directory for models using jieba", |
| 112 | ) | 126 | ) |
| 113 | 127 | ||
| 128 | + | ||
| 129 | +def add_matcha_args(parser): | ||
| 130 | + parser.add_argument( | ||
| 131 | + "--matcha-acoustic-model", | ||
| 132 | + type=str, | ||
| 133 | + default="", | ||
| 134 | + help="Path to model.onnx for matcha", | ||
| 135 | + ) | ||
| 136 | + | ||
| 137 | + parser.add_argument( | ||
| 138 | + "--matcha-vocoder", | ||
| 139 | + type=str, | ||
| 140 | + default="", | ||
| 141 | + help="Path to vocoder for matcha", | ||
| 142 | + ) | ||
| 143 | + | ||
| 144 | + parser.add_argument( | ||
| 145 | + "--matcha-lexicon", | ||
| 146 | + type=str, | ||
| 147 | + default="", | ||
| 148 | + help="Path to lexicon.txt for matcha", | ||
| 149 | + ) | ||
| 150 | + | ||
| 151 | + parser.add_argument( | ||
| 152 | + "--matcha-tokens", | ||
| 153 | + type=str, | ||
| 154 | + default="", | ||
| 155 | + help="Path to tokens.txt for matcha", | ||
| 156 | + ) | ||
| 157 | + | ||
| 158 | + parser.add_argument( | ||
| 159 | + "--matcha-data-dir", | ||
| 160 | + type=str, | ||
| 161 | + default="", | ||
| 162 | + help="""Path to the dict directory of espeak-ng. If it is specified, | ||
| 163 | + --matcha-lexicon and --matcha-tokens are ignored""", | ||
| 164 | + ) | ||
| 165 | + | ||
| 166 | + parser.add_argument( | ||
| 167 | + "--matcha-dict-dir", | ||
| 168 | + type=str, | ||
| 169 | + default="", | ||
| 170 | + help="Path to the dict directory for models using jieba", | ||
| 171 | + ) | ||
| 172 | + | ||
| 173 | + | ||
| 174 | +def get_args(): | ||
| 175 | + parser = argparse.ArgumentParser( | ||
| 176 | + formatter_class=argparse.ArgumentDefaultsHelpFormatter | ||
| 177 | + ) | ||
| 178 | + | ||
| 179 | + add_vits_args(parser) | ||
| 180 | + add_matcha_args(parser) | ||
| 181 | + | ||
| 114 | parser.add_argument( | 182 | parser.add_argument( |
| 115 | "--tts-rule-fsts", | 183 | "--tts-rule-fsts", |
| 116 | type=str, | 184 | type=str, |
| @@ -196,6 +264,14 @@ def main(): | @@ -196,6 +264,14 @@ def main(): | ||
| 196 | dict_dir=args.vits_dict_dir, | 264 | dict_dir=args.vits_dict_dir, |
| 197 | tokens=args.vits_tokens, | 265 | tokens=args.vits_tokens, |
| 198 | ), | 266 | ), |
| 267 | + matcha=sherpa_onnx.OfflineTtsMatchaModelConfig( | ||
| 268 | + acoustic_model=args.matcha_acoustic_model, | ||
| 269 | + vocoder=args.matcha_vocoder, | ||
| 270 | + lexicon=args.matcha_lexicon, | ||
| 271 | + tokens=args.matcha_tokens, | ||
| 272 | + data_dir=args.matcha_data_dir, | ||
| 273 | + dict_dir=args.matcha_dict_dir, | ||
| 274 | + ), | ||
| 199 | provider=args.provider, | 275 | provider=args.provider, |
| 200 | debug=args.debug, | 276 | debug=args.debug, |
| 201 | num_threads=args.num_threads, | 277 | num_threads=args.num_threads, |
| @@ -151,12 +151,15 @@ list(APPEND sources | @@ -151,12 +151,15 @@ list(APPEND sources | ||
| 151 | 151 | ||
| 152 | if(SHERPA_ONNX_ENABLE_TTS) | 152 | if(SHERPA_ONNX_ENABLE_TTS) |
| 153 | list(APPEND sources | 153 | list(APPEND sources |
| 154 | + hifigan-vocoder.cc | ||
| 154 | jieba-lexicon.cc | 155 | jieba-lexicon.cc |
| 155 | lexicon.cc | 156 | lexicon.cc |
| 156 | melo-tts-lexicon.cc | 157 | melo-tts-lexicon.cc |
| 157 | offline-tts-character-frontend.cc | 158 | offline-tts-character-frontend.cc |
| 158 | offline-tts-frontend.cc | 159 | offline-tts-frontend.cc |
| 159 | offline-tts-impl.cc | 160 | offline-tts-impl.cc |
| 161 | + offline-tts-matcha-model-config.cc | ||
| 162 | + offline-tts-matcha-model.cc | ||
| 160 | offline-tts-model-config.cc | 163 | offline-tts-model-config.cc |
| 161 | offline-tts-vits-model-config.cc | 164 | offline-tts-vits-model-config.cc |
| 162 | offline-tts-vits-model.cc | 165 | offline-tts-vits-model.cc |
sherpa-onnx/csrc/hifigan-vocoder.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/hifigan-vocoder.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/hifigan-vocoder.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | +#include <utility> | ||
| 9 | +#include <vector> | ||
| 10 | + | ||
| 11 | +#if __ANDROID_API__ >= 9 | ||
| 12 | +#include "android/asset_manager.h" | ||
| 13 | +#include "android/asset_manager_jni.h" | ||
| 14 | +#endif | ||
| 15 | + | ||
| 16 | +#if __OHOS__ | ||
| 17 | +#include "rawfile/raw_file_manager.h" | ||
| 18 | +#endif | ||
| 19 | + | ||
| 20 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 21 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 22 | +#include "sherpa-onnx/csrc/session.h" | ||
| 23 | + | ||
| 24 | +namespace sherpa_onnx { | ||
| 25 | + | ||
| 26 | +class HifiganVocoder::Impl { | ||
| 27 | + public: | ||
| 28 | + explicit Impl(int32_t num_threads, const std::string &provider, | ||
| 29 | + const std::string &model) | ||
| 30 | + : env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 31 | + sess_opts_(GetSessionOptions(num_threads, provider)), | ||
| 32 | + allocator_{} { | ||
| 33 | + auto buf = ReadFile(model); | ||
| 34 | + Init(buf.data(), buf.size()); | ||
| 35 | + } | ||
| 36 | + | ||
| 37 | + template <typename Manager> | ||
| 38 | + explicit Impl(Manager *mgr, int32_t num_threads, const std::string &provider, | ||
| 39 | + const std::string &model) | ||
| 40 | + : env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 41 | + sess_opts_(GetSessionOptions(num_threads, provider)), | ||
| 42 | + allocator_{} { | ||
| 43 | + auto buf = ReadFile(mgr, model); | ||
| 44 | + Init(buf.data(), buf.size()); | ||
| 45 | + } | ||
| 46 | + | ||
| 47 | + Ort::Value Run(Ort::Value mel) const { | ||
| 48 | + auto out = sess_->Run({}, input_names_ptr_.data(), &mel, 1, | ||
| 49 | + output_names_ptr_.data(), output_names_ptr_.size()); | ||
| 50 | + | ||
| 51 | + return std::move(out[0]); | ||
| 52 | + } | ||
| 53 | + | ||
| 54 | + private: | ||
| 55 | + void Init(void *model_data, size_t model_data_length) { | ||
| 56 | + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | ||
| 57 | + sess_opts_); | ||
| 58 | + | ||
| 59 | + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); | ||
| 60 | + | ||
| 61 | + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); | ||
| 62 | + } | ||
| 63 | + | ||
| 64 | + private: | ||
| 65 | + Ort::Env env_; | ||
| 66 | + Ort::SessionOptions sess_opts_; | ||
| 67 | + Ort::AllocatorWithDefaultOptions allocator_; | ||
| 68 | + | ||
| 69 | + std::unique_ptr<Ort::Session> sess_; | ||
| 70 | + | ||
| 71 | + std::vector<std::string> input_names_; | ||
| 72 | + std::vector<const char *> input_names_ptr_; | ||
| 73 | + | ||
| 74 | + std::vector<std::string> output_names_; | ||
| 75 | + std::vector<const char *> output_names_ptr_; | ||
| 76 | +}; | ||
| 77 | + | ||
| 78 | +HifiganVocoder::HifiganVocoder(int32_t num_threads, const std::string &provider, | ||
| 79 | + const std::string &model) | ||
| 80 | + : impl_(std::make_unique<Impl>(num_threads, provider, model)) {} | ||
| 81 | + | ||
| 82 | +template <typename Manager> | ||
| 83 | +HifiganVocoder::HifiganVocoder(Manager *mgr, int32_t num_threads, | ||
| 84 | + const std::string &provider, | ||
| 85 | + const std::string &model) | ||
| 86 | + : impl_(std::make_unique<Impl>(mgr, num_threads, provider, model)) {} | ||
| 87 | + | ||
| 88 | +HifiganVocoder::~HifiganVocoder() = default; | ||
| 89 | + | ||
| 90 | +Ort::Value HifiganVocoder::Run(Ort::Value mel) const { | ||
| 91 | + return impl_->Run(std::move(mel)); | ||
| 92 | +} | ||
| 93 | + | ||
| 94 | +#if __ANDROID_API__ >= 9 | ||
| 95 | +template HifiganVocoder::HifiganVocoder(AAssetManager *mgr, int32_t num_threads, | ||
| 96 | + const std::string &provider, | ||
| 97 | + const std::string &model); | ||
| 98 | +#endif | ||
| 99 | + | ||
| 100 | +#if __OHOS__ | ||
| 101 | +template HifiganVocoder::HifiganVocoder(NativeResourceManager *mgr, | ||
| 102 | + int32_t num_threads, | ||
| 103 | + const std::string &provider, | ||
| 104 | + const std::string &model); | ||
| 105 | +#endif | ||
| 106 | + | ||
| 107 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/hifigan-vocoder.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/hifigan-vocoder.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_HIFIGAN_VOCODER_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_HIFIGAN_VOCODER_H_ | ||
| 7 | + | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +#include "onnxruntime_cxx_api.h" // NOLINT | ||
| 12 | + | ||
| 13 | +namespace sherpa_onnx { | ||
| 14 | + | ||
| 15 | +class HifiganVocoder { | ||
| 16 | + public: | ||
| 17 | + ~HifiganVocoder(); | ||
| 18 | + | ||
| 19 | + HifiganVocoder(int32_t num_threads, const std::string &provider, | ||
| 20 | + const std::string &model); | ||
| 21 | + | ||
| 22 | + template <typename Manager> | ||
| 23 | + HifiganVocoder(Manager *mgr, int32_t num_threads, const std::string &provider, | ||
| 24 | + const std::string &model); | ||
| 25 | + | ||
| 26 | + /** @param mel A float32 tensor of shape (batch_size, feat_dim, num_frames). | ||
| 27 | + * @return Return a float32 tensor of shape (batch_size, num_samples). | ||
| 28 | + */ | ||
| 29 | + Ort::Value Run(Ort::Value mel) const; | ||
| 30 | + | ||
| 31 | + private: | ||
| 32 | + class Impl; | ||
| 33 | + std::unique_ptr<Impl> impl_; | ||
| 34 | +}; | ||
| 35 | + | ||
| 36 | +} // namespace sherpa_onnx | ||
| 37 | + | ||
| 38 | +#endif // SHERPA_ONNX_CSRC_HIFIGAN_VOCODER_H_ |
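The Run() comment above pins down the vocoder's tensor contract: a float32 mel of shape (batch_size, feat_dim, num_frames) in, a float32 waveform of shape (batch_size, num_samples) out. As a rough illustration, here is a minimal C++ sketch of driving the class directly. The 80-bin, 200-frame mel and the ./hifigan_v2.onnx path are assumptions made up for the example (the real feat_dim is dictated by the acoustic model), and the Ort helper calls mirror the ones used in OfflineTtsMatchaImpl::Process() later in this diff.

#include <array>
#include <cstdint>
#include <utility>
#include <vector>

#include "sherpa-onnx/csrc/hifigan-vocoder.h"

int main() {
  // Load the vocoder on CPU with 2 threads (values are illustrative).
  sherpa_onnx::HifiganVocoder vocoder(/*num_threads=*/2, /*provider=*/"cpu",
                                      "./hifigan_v2.onnx");

  // A dummy mel of shape (batch_size=1, feat_dim=80, num_frames=200).
  std::vector<float> mel(1 * 80 * 200, 0.0f);
  std::array<int64_t, 3> shape = {1, 80, 200};

  auto memory_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  Ort::Value mel_tensor = Ort::Value::CreateTensor(
      memory_info, mel.data(), mel.size(), shape.data(), shape.size());

  // Returns a float32 tensor of shape (batch_size, num_samples).
  Ort::Value audio = vocoder.Run(std::move(mel_tensor));
  const float *samples = audio.GetTensorData<float>();
  (void)samples;  // e.g., write the samples to a wav file here
  return 0;
}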
| @@ -19,9 +19,8 @@ namespace sherpa_onnx { | @@ -19,9 +19,8 @@ namespace sherpa_onnx { | ||
| 19 | class JiebaLexicon::Impl { | 19 | class JiebaLexicon::Impl { |
| 20 | public: | 20 | public: |
| 21 | Impl(const std::string &lexicon, const std::string &tokens, | 21 | Impl(const std::string &lexicon, const std::string &tokens, |
| 22 | - const std::string &dict_dir, | ||
| 23 | - const OfflineTtsVitsModelMetaData &meta_data, bool debug) | ||
| 24 | - : meta_data_(meta_data), debug_(debug) { | 22 | + const std::string &dict_dir, bool debug) |
| 23 | + : debug_(debug) { | ||
| 25 | std::string dict = dict_dir + "/jieba.dict.utf8"; | 24 | std::string dict = dict_dir + "/jieba.dict.utf8"; |
| 26 | std::string hmm = dict_dir + "/hmm_model.utf8"; | 25 | std::string hmm = dict_dir + "/hmm_model.utf8"; |
| 27 | std::string user_dict = dict_dir + "/user.dict.utf8"; | 26 | std::string user_dict = dict_dir + "/user.dict.utf8"; |
| @@ -84,7 +83,6 @@ class JiebaLexicon::Impl { | @@ -84,7 +83,6 @@ class JiebaLexicon::Impl { | ||
| 84 | std::vector<TokenIDs> ans; | 83 | std::vector<TokenIDs> ans; |
| 85 | std::vector<int64_t> this_sentence; | 84 | std::vector<int64_t> this_sentence; |
| 86 | 85 | ||
| 87 | - int32_t blank = token2id_.at(" "); | ||
| 88 | for (const auto &w : words) { | 86 | for (const auto &w : words) { |
| 89 | auto ids = ConvertWordToIds(w); | 87 | auto ids = ConvertWordToIds(w); |
| 90 | if (ids.empty()) { | 88 | if (ids.empty()) { |
| @@ -93,7 +91,6 @@ class JiebaLexicon::Impl { | @@ -93,7 +91,6 @@ class JiebaLexicon::Impl { | ||
| 93 | } | 91 | } |
| 94 | 92 | ||
| 95 | this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); | 93 | this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); |
| 96 | - this_sentence.push_back(blank); | ||
| 97 | 94 | ||
| 98 | if (w == "。" || w == "!" || w == "?" || w == ",") { | 95 | if (w == "。" || w == "!" || w == "?" || w == ",") { |
| 99 | ans.emplace_back(std::move(this_sentence)); | 96 | ans.emplace_back(std::move(this_sentence)); |
| @@ -135,7 +132,9 @@ class JiebaLexicon::Impl { | @@ -135,7 +132,9 @@ class JiebaLexicon::Impl { | ||
| 135 | token2id_ = ReadTokens(is); | 132 | token2id_ = ReadTokens(is); |
| 136 | 133 | ||
| 137 | std::vector<std::pair<std::string, std::string>> puncts = { | 134 | std::vector<std::pair<std::string, std::string>> puncts = { |
| 138 | - {",", ","}, {".", "。"}, {"!", "!"}, {"?", "?"}}; | 135 | + {",", ","}, {".", "。"}, {"!", "!"}, {"?", "?"}, {":", ":"}, |
| 136 | + {"\"", "“"}, {"\"", "”"}, {"'", "‘"}, {"'", "’"}, {";", ";"}, | ||
| 137 | + }; | ||
| 139 | 138 | ||
| 140 | for (const auto &p : puncts) { | 139 | for (const auto &p : puncts) { |
| 141 | if (token2id_.count(p.first) && !token2id_.count(p.second)) { | 140 | if (token2id_.count(p.first) && !token2id_.count(p.second)) { |
| @@ -150,6 +149,10 @@ class JiebaLexicon::Impl { | @@ -150,6 +149,10 @@ class JiebaLexicon::Impl { | ||
| 150 | if (!token2id_.count("、") && token2id_.count(",")) { | 149 | if (!token2id_.count("、") && token2id_.count(",")) { |
| 151 | token2id_["、"] = token2id_[","]; | 150 | token2id_["、"] = token2id_[","]; |
| 152 | } | 151 | } |
| 152 | + | ||
| 153 | + if (!token2id_.count(";") && token2id_.count(",")) { | ||
| 154 | + token2id_[";"] = token2id_[","]; | ||
| 155 | + } | ||
| 153 | } | 156 | } |
| 154 | 157 | ||
| 155 | void InitLexicon(std::istream &is) { | 158 | void InitLexicon(std::istream &is) { |
| @@ -195,8 +198,6 @@ class JiebaLexicon::Impl { | @@ -195,8 +198,6 @@ class JiebaLexicon::Impl { | ||
| 195 | // tokens.txt is saved in token2id_ | 198 | // tokens.txt is saved in token2id_ |
| 196 | std::unordered_map<std::string, int32_t> token2id_; | 199 | std::unordered_map<std::string, int32_t> token2id_; |
| 197 | 200 | ||
| 198 | - OfflineTtsVitsModelMetaData meta_data_; | ||
| 199 | - | ||
| 200 | std::unique_ptr<cppjieba::Jieba> jieba_; | 201 | std::unique_ptr<cppjieba::Jieba> jieba_; |
| 201 | bool debug_ = false; | 202 | bool debug_ = false; |
| 202 | }; | 203 | }; |
| @@ -205,11 +206,8 @@ JiebaLexicon::~JiebaLexicon() = default; | @@ -205,11 +206,8 @@ JiebaLexicon::~JiebaLexicon() = default; | ||
| 205 | 206 | ||
| 206 | JiebaLexicon::JiebaLexicon(const std::string &lexicon, | 207 | JiebaLexicon::JiebaLexicon(const std::string &lexicon, |
| 207 | const std::string &tokens, | 208 | const std::string &tokens, |
| 208 | - const std::string &dict_dir, | ||
| 209 | - const OfflineTtsVitsModelMetaData &meta_data, | ||
| 210 | - bool debug) | ||
| 211 | - : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, meta_data, | ||
| 212 | - debug)) {} | 209 | + const std::string &dict_dir, bool debug) |
| 210 | + : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, debug)) {} | ||
| 213 | 211 | ||
| 214 | std::vector<TokenIDs> JiebaLexicon::ConvertTextToTokenIds( | 212 | std::vector<TokenIDs> JiebaLexicon::ConvertTextToTokenIds( |
| 215 | const std::string &text, const std::string & /*unused_voice = ""*/) const { | 213 | const std::string &text, const std::string & /*unused_voice = ""*/) const { |
| @@ -11,7 +11,6 @@ | @@ -11,7 +11,6 @@ | ||
| 11 | #include <vector> | 11 | #include <vector> |
| 12 | 12 | ||
| 13 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" | 13 | #include "sherpa-onnx/csrc/offline-tts-frontend.h" |
| 14 | -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" | ||
| 15 | 14 | ||
| 16 | namespace sherpa_onnx { | 15 | namespace sherpa_onnx { |
| 17 | 16 | ||
| @@ -19,8 +18,7 @@ class JiebaLexicon : public OfflineTtsFrontend { | @@ -19,8 +18,7 @@ class JiebaLexicon : public OfflineTtsFrontend { | ||
| 19 | public: | 18 | public: |
| 20 | ~JiebaLexicon() override; | 19 | ~JiebaLexicon() override; |
| 21 | JiebaLexicon(const std::string &lexicon, const std::string &tokens, | 20 | JiebaLexicon(const std::string &lexicon, const std::string &tokens, |
| 22 | - const std::string &dict_dir, | ||
| 23 | - const OfflineTtsVitsModelMetaData &meta_data, bool debug); | 21 | + const std::string &dict_dir, bool debug); |
| 24 | 22 | ||
| 25 | std::vector<TokenIDs> ConvertTextToTokenIds( | 23 | std::vector<TokenIDs> ConvertTextToTokenIds( |
| 26 | const std::string &text, | 24 | const std::string &text, |
| @@ -5,6 +5,7 @@ | @@ -5,6 +5,7 @@ | ||
| 5 | #include "sherpa-onnx/csrc/offline-tts-impl.h" | 5 | #include "sherpa-onnx/csrc/offline-tts-impl.h" |
| 6 | 6 | ||
| 7 | #include <memory> | 7 | #include <memory> |
| 8 | +#include <vector> | ||
| 8 | 9 | ||
| 9 | #if __ANDROID_API__ >= 9 | 10 | #if __ANDROID_API__ >= 9 |
| 10 | #include "android/asset_manager.h" | 11 | #include "android/asset_manager.h" |
| @@ -15,21 +16,39 @@ | @@ -15,21 +16,39 @@ | ||
| 15 | #include "rawfile/raw_file_manager.h" | 16 | #include "rawfile/raw_file_manager.h" |
| 16 | #endif | 17 | #endif |
| 17 | 18 | ||
| 19 | +#include "sherpa-onnx/csrc/offline-tts-matcha-impl.h" | ||
| 18 | #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" | 20 | #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" |
| 19 | 21 | ||
| 20 | namespace sherpa_onnx { | 22 | namespace sherpa_onnx { |
| 21 | 23 | ||
| 24 | +std::vector<int64_t> OfflineTtsImpl::AddBlank(const std::vector<int64_t> &x, | ||
| 25 | + int32_t blank_id /*= 0*/) const { | ||
| 26 | + // we assume the blank ID is 0 | ||
| 27 | + std::vector<int64_t> buffer(x.size() * 2 + 1, blank_id); | ||
| 28 | + int32_t i = 1; | ||
| 29 | + for (auto k : x) { | ||
| 30 | + buffer[i] = k; | ||
| 31 | + i += 2; | ||
| 32 | + } | ||
| 33 | + return buffer; | ||
| 34 | +} | ||
| 35 | + | ||
| 22 | std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | 36 | std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( |
| 23 | const OfflineTtsConfig &config) { | 37 | const OfflineTtsConfig &config) { |
| 24 | - // TODO(fangjun): Support other types | ||
| 25 | - return std::make_unique<OfflineTtsVitsImpl>(config); | 38 | + if (!config.model.vits.model.empty()) { |
| 39 | + return std::make_unique<OfflineTtsVitsImpl>(config); | ||
| 40 | + } | ||
| 41 | + return std::make_unique<OfflineTtsMatchaImpl>(config); | ||
| 26 | } | 42 | } |
| 27 | 43 | ||
| 28 | template <typename Manager> | 44 | template <typename Manager> |
| 29 | std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | 45 | std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( |
| 30 | Manager *mgr, const OfflineTtsConfig &config) { | 46 | Manager *mgr, const OfflineTtsConfig &config) { |
| 31 | - // TODO(fangjun): Support other types | ||
| 32 | - return std::make_unique<OfflineTtsVitsImpl>(mgr, config); | 47 | + if (!config.model.vits.model.empty()) { |
| 48 | + return std::make_unique<OfflineTtsVitsImpl>(mgr, config); | ||
| 49 | + } | ||
| 50 | + | ||
| 51 | + return std::make_unique<OfflineTtsMatchaImpl>(mgr, config); | ||
| 33 | } | 52 | } |
| 34 | 53 | ||
| 35 | #if __ANDROID_API__ >= 9 | 54 | #if __ANDROID_API__ >= 9 |
| @@ -7,6 +7,7 @@ | @@ -7,6 +7,7 @@ | ||
| 7 | 7 | ||
| 8 | #include <memory> | 8 | #include <memory> |
| 9 | #include <string> | 9 | #include <string> |
| 10 | +#include <vector> | ||
| 10 | 11 | ||
| 11 | #include "sherpa-onnx/csrc/offline-tts.h" | 12 | #include "sherpa-onnx/csrc/offline-tts.h" |
| 12 | 13 | ||
| @@ -32,6 +33,9 @@ class OfflineTtsImpl { | @@ -32,6 +33,9 @@ class OfflineTtsImpl { | ||
| 32 | // Number of supported speakers. | 33 | // Number of supported speakers. |
| 33 | // If it supports only a single speaker, then it return 0 or 1. | 34 | // If it supports only a single speaker, then it return 0 or 1. |
| 34 | virtual int32_t NumSpeakers() const = 0; | 35 | virtual int32_t NumSpeakers() const = 0; |
| 36 | + | ||
| 37 | + std::vector<int64_t> AddBlank(const std::vector<int64_t> &x, | ||
| 38 | + int32_t blank_id = 0) const; | ||
| 35 | }; | 39 | }; |
| 36 | 40 | ||
| 37 | } // namespace sherpa_onnx | 41 | } // namespace sherpa_onnx |
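The AddBlank() helper declared here (and defined in offline-tts-impl.cc above) interleaves a blank/pad ID around every token, producing a vector of length 2 * x.size() + 1 with the blank at every even index. A tiny worked example, assuming an OfflineTtsImpl instance `impl` and made-up token IDs:

// Hypothetical token IDs, for illustration only.
std::vector<int64_t> tokens = {23, 57, 102};

// With the default blank_id of 0:
std::vector<int64_t> padded = impl->AddBlank(tokens);
// padded == {0, 23, 0, 57, 0, 102, 0}; padded.size() == 2 * 3 + 1 == 7.

// OfflineTtsMatchaImpl::Generate() below calls AddBlank(k, meta_data.pad_id)
// on every sentence before running the acoustic model.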
sherpa-onnx/csrc/offline-tts-matcha-impl.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-matcha-impl.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_IMPL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_IMPL_H_ | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | +#include <string> | ||
| 9 | +#include <strstream> | ||
| 10 | +#include <utility> | ||
| 11 | +#include <vector> | ||
| 12 | + | ||
| 13 | +#include "fst/extensions/far/far.h" | ||
| 14 | +#include "kaldifst/csrc/kaldi-fst-io.h" | ||
| 15 | +#include "kaldifst/csrc/text-normalizer.h" | ||
| 16 | +#include "sherpa-onnx/csrc/hifigan-vocoder.h" | ||
| 17 | +#include "sherpa-onnx/csrc/jieba-lexicon.h" | ||
| 18 | +#include "sherpa-onnx/csrc/lexicon.h" | ||
| 19 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 20 | +#include "sherpa-onnx/csrc/melo-tts-lexicon.h" | ||
| 21 | +#include "sherpa-onnx/csrc/offline-tts-character-frontend.h" | ||
| 22 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 23 | +#include "sherpa-onnx/csrc/offline-tts-impl.h" | ||
| 24 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model.h" | ||
| 25 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 26 | +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" | ||
| 27 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 28 | + | ||
| 29 | +namespace sherpa_onnx { | ||
| 30 | + | ||
| 31 | +class OfflineTtsMatchaImpl : public OfflineTtsImpl { | ||
| 32 | + public: | ||
| 33 | + explicit OfflineTtsMatchaImpl(const OfflineTtsConfig &config) | ||
| 34 | + : config_(config), | ||
| 35 | + model_(std::make_unique<OfflineTtsMatchaModel>(config.model)), | ||
| 36 | + vocoder_(std::make_unique<HifiganVocoder>( | ||
| 37 | + config.model.num_threads, config.model.provider, | ||
| 38 | + config.model.matcha.vocoder)) { | ||
| 39 | + InitFrontend(); | ||
| 40 | + | ||
| 41 | + if (!config.rule_fsts.empty()) { | ||
| 42 | + std::vector<std::string> files; | ||
| 43 | + SplitStringToVector(config.rule_fsts, ",", false, &files); | ||
| 44 | + tn_list_.reserve(files.size()); | ||
| 45 | + for (const auto &f : files) { | ||
| 46 | + if (config.model.debug) { | ||
| 47 | +#if __OHOS__ | ||
| 48 | + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); | ||
| 49 | +#else | ||
| 50 | + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); | ||
| 51 | +#endif | ||
| 52 | + } | ||
| 53 | + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f)); | ||
| 54 | + } | ||
| 55 | + } | ||
| 56 | + | ||
| 57 | + if (!config.rule_fars.empty()) { | ||
| 58 | + if (config.model.debug) { | ||
| 59 | + SHERPA_ONNX_LOGE("Loading FST archives"); | ||
| 60 | + } | ||
| 61 | + std::vector<std::string> files; | ||
| 62 | + SplitStringToVector(config.rule_fars, ",", false, &files); | ||
| 63 | + | ||
| 64 | + tn_list_.reserve(files.size() + tn_list_.size()); | ||
| 65 | + | ||
| 66 | + for (const auto &f : files) { | ||
| 67 | + if (config.model.debug) { | ||
| 68 | +#if __OHOS__ | ||
| 69 | + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); | ||
| 70 | +#else | ||
| 71 | + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); | ||
| 72 | +#endif | ||
| 73 | + } | ||
| 74 | + std::unique_ptr<fst::FarReader<fst::StdArc>> reader( | ||
| 75 | + fst::FarReader<fst::StdArc>::Open(f)); | ||
| 76 | + for (; !reader->Done(); reader->Next()) { | ||
| 77 | + std::unique_ptr<fst::StdConstFst> r( | ||
| 78 | + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); | ||
| 79 | + | ||
| 80 | + tn_list_.push_back( | ||
| 81 | + std::make_unique<kaldifst::TextNormalizer>(std::move(r))); | ||
| 82 | + } | ||
| 83 | + } | ||
| 84 | + | ||
| 85 | + if (config.model.debug) { | ||
| 86 | + SHERPA_ONNX_LOGE("FST archives loaded!"); | ||
| 87 | + } | ||
| 88 | + } | ||
| 89 | + } | ||
| 90 | + | ||
| 91 | + template <typename Manager> | ||
| 92 | + OfflineTtsMatchaImpl(Manager *mgr, const OfflineTtsConfig &config) | ||
| 93 | + : config_(config), | ||
| 94 | + model_(std::make_unique<OfflineTtsMatchaModel>(mgr, config.model)), | ||
| 95 | + vocoder_(std::make_unique<HifiganVocoder>( | ||
| 96 | + mgr, config.model.num_threads, config.model.provider, | ||
| 97 | + config.model.matcha.vocoder)) { | ||
| 98 | + InitFrontend(mgr); | ||
| 99 | + | ||
| 100 | + if (!config.rule_fsts.empty()) { | ||
| 101 | + std::vector<std::string> files; | ||
| 102 | + SplitStringToVector(config.rule_fsts, ",", false, &files); | ||
| 103 | + tn_list_.reserve(files.size()); | ||
| 104 | + for (const auto &f : files) { | ||
| 105 | + if (config.model.debug) { | ||
| 106 | +#if __OHOS__ | ||
| 107 | + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); | ||
| 108 | +#else | ||
| 109 | + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); | ||
| 110 | +#endif | ||
| 111 | + } | ||
| 112 | + auto buf = ReadFile(mgr, f); | ||
| 113 | + std::istrstream is(buf.data(), buf.size()); | ||
| 114 | + tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is)); | ||
| 115 | + } | ||
| 116 | + } | ||
| 117 | + | ||
| 118 | + if (!config.rule_fars.empty()) { | ||
| 119 | + std::vector<std::string> files; | ||
| 120 | + SplitStringToVector(config.rule_fars, ",", false, &files); | ||
| 121 | + tn_list_.reserve(files.size() + tn_list_.size()); | ||
| 122 | + | ||
| 123 | + for (const auto &f : files) { | ||
| 124 | + if (config.model.debug) { | ||
| 125 | +#if __OHOS__ | ||
| 126 | + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); | ||
| 127 | +#else | ||
| 128 | + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); | ||
| 129 | +#endif | ||
| 130 | + } | ||
| 131 | + | ||
| 132 | + auto buf = ReadFile(mgr, f); | ||
| 133 | + | ||
| 134 | + std::unique_ptr<std::istream> s( | ||
| 135 | + new std::istrstream(buf.data(), buf.size())); | ||
| 136 | + | ||
| 137 | + std::unique_ptr<fst::FarReader<fst::StdArc>> reader( | ||
| 138 | + fst::FarReader<fst::StdArc>::Open(std::move(s))); | ||
| 139 | + | ||
| 140 | + for (; !reader->Done(); reader->Next()) { | ||
| 141 | + std::unique_ptr<fst::StdConstFst> r( | ||
| 142 | + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); | ||
| 143 | + | ||
| 144 | + tn_list_.push_back( | ||
| 145 | + std::make_unique<kaldifst::TextNormalizer>(std::move(r))); | ||
| 146 | + } // for (; !reader->Done(); reader->Next()) | ||
| 147 | + } // for (const auto &f : files) | ||
| 148 | + } // if (!config.rule_fars.empty()) | ||
| 149 | + } | ||
| 150 | + | ||
| 151 | + int32_t SampleRate() const override { | ||
| 152 | + return model_->GetMetaData().sample_rate; | ||
| 153 | + } | ||
| 154 | + | ||
| 155 | + int32_t NumSpeakers() const override { | ||
| 156 | + return model_->GetMetaData().num_speakers; | ||
| 157 | + } | ||
| 158 | + | ||
| 159 | + GeneratedAudio Generate( | ||
| 160 | + const std::string &_text, int64_t sid = 0, float speed = 1.0, | ||
| 161 | + GeneratedAudioCallback callback = nullptr) const override { | ||
| 162 | + const auto &meta_data = model_->GetMetaData(); | ||
| 163 | + int32_t num_speakers = meta_data.num_speakers; | ||
| 164 | + | ||
| 165 | + if (num_speakers == 0 && sid != 0) { | ||
| 166 | +#if __OHOS__ | ||
| 167 | + SHERPA_ONNX_LOGE( | ||
| 168 | + "This is a single-speaker model and supports only sid 0. Given sid: " | ||
| 169 | + "%{public}d. sid is ignored", | ||
| 170 | + static_cast<int32_t>(sid)); | ||
| 171 | +#else | ||
| 172 | + SHERPA_ONNX_LOGE( | ||
| 173 | + "This is a single-speaker model and supports only sid 0. Given sid: " | ||
| 174 | + "%d. sid is ignored", | ||
| 175 | + static_cast<int32_t>(sid)); | ||
| 176 | +#endif | ||
| 177 | + } | ||
| 178 | + | ||
| 179 | + if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) { | ||
| 180 | +#if __OHOS__ | ||
| 181 | + SHERPA_ONNX_LOGE( | ||
| 182 | + "This model contains only %{public}d speakers. sid should be in the " | ||
| 183 | + "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0", | ||
| 184 | + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | ||
| 185 | +#else | ||
| 186 | + SHERPA_ONNX_LOGE( | ||
| 187 | + "This model contains only %d speakers. sid should be in the range " | ||
| 188 | + "[%d, %d]. Given: %d. Use sid=0", | ||
| 189 | + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | ||
| 190 | +#endif | ||
| 191 | + sid = 0; | ||
| 192 | + } | ||
| 193 | + | ||
| 194 | + std::string text = _text; | ||
| 195 | + if (config_.model.debug) { | ||
| 196 | +#if __OHOS__ | ||
| 197 | + SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str()); | ||
| 198 | +#else | ||
| 199 | + SHERPA_ONNX_LOGE("Raw text: %s", text.c_str()); | ||
| 200 | +#endif | ||
| 201 | + } | ||
| 202 | + | ||
| 203 | + if (!tn_list_.empty()) { | ||
| 204 | + for (const auto &tn : tn_list_) { | ||
| 205 | + text = tn->Normalize(text); | ||
| 206 | + if (config_.model.debug) { | ||
| 207 | +#if __OHOS__ | ||
| 208 | + SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str()); | ||
| 209 | +#else | ||
| 210 | + SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str()); | ||
| 211 | +#endif | ||
| 212 | + } | ||
| 213 | + } | ||
| 214 | + } | ||
| 215 | + | ||
| 216 | + std::vector<TokenIDs> token_ids = | ||
| 217 | + frontend_->ConvertTextToTokenIds(text, "en-US"); | ||
| 218 | + | ||
| 219 | + if (token_ids.empty() || | ||
| 220 | + (token_ids.size() == 1 && token_ids[0].tokens.empty())) { | ||
| 221 | +#if __OHOS__ | ||
| 222 | + SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs", | ||
| 223 | + text.c_str()); | ||
| 224 | +#else | ||
| 225 | + SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str()); | ||
| 226 | +#endif | ||
| 227 | + return {}; | ||
| 228 | + } | ||
| 229 | + | ||
| 230 | + std::vector<std::vector<int64_t>> x; | ||
| 231 | + | ||
| 232 | + x.reserve(token_ids.size()); | ||
| 233 | + | ||
| 234 | + for (auto &i : token_ids) { | ||
| 235 | + x.push_back(std::move(i.tokens)); | ||
| 236 | + } | ||
| 237 | + | ||
| 238 | + for (auto &k : x) { | ||
| 239 | + k = AddBlank(k, meta_data.pad_id); | ||
| 240 | + } | ||
| 241 | + | ||
| 242 | + int32_t x_size = static_cast<int32_t>(x.size()); | ||
| 243 | + | ||
| 244 | + if (config_.max_num_sentences <= 0 || x_size <= config_.max_num_sentences) { | ||
| 245 | + auto ans = Process(x, sid, speed); | ||
| 246 | + if (callback) { | ||
| 247 | + callback(ans.samples.data(), ans.samples.size(), 1.0); | ||
| 248 | + } | ||
| 249 | + return ans; | ||
| 250 | + } | ||
| 251 | + | ||
| 252 | + // the input text is too long, we process sentences within it in batches | ||
| 253 | + // to avoid OOM. Batch size is config_.max_num_sentences | ||
| 254 | + std::vector<std::vector<int64_t>> batch_x; | ||
| 255 | + | ||
| 256 | + int32_t batch_size = config_.max_num_sentences; | ||
| 257 | + batch_x.reserve(config_.max_num_sentences); | ||
| 258 | + int32_t num_batches = x_size / batch_size; | ||
| 259 | + | ||
| 260 | + if (config_.model.debug) { | ||
| 261 | +#if __OHOS__ | ||
| 262 | + SHERPA_ONNX_LOGE( | ||
| 263 | + "Text is too long. Split it into %{public}d batches. batch size: " | ||
| 264 | + "%{public}d. Number of sentences: %{public}d", | ||
| 265 | + num_batches, batch_size, x_size); | ||
| 266 | +#else | ||
| 267 | + SHERPA_ONNX_LOGE( | ||
| 268 | + "Text is too long. Split it into %d batches. batch size: %d. Number " | ||
| 269 | + "of sentences: %d", | ||
| 270 | + num_batches, batch_size, x_size); | ||
| 271 | +#endif | ||
| 272 | + } | ||
| 273 | + | ||
| 274 | + GeneratedAudio ans; | ||
| 275 | + | ||
| 276 | + int32_t should_continue = 1; | ||
| 277 | + | ||
| 278 | + int32_t k = 0; | ||
| 279 | + | ||
| 280 | + for (int32_t b = 0; b != num_batches && should_continue; ++b) { | ||
| 281 | + batch_x.clear(); | ||
| 282 | + for (int32_t i = 0; i != batch_size; ++i, ++k) { | ||
| 283 | + batch_x.push_back(std::move(x[k])); | ||
| 284 | + } | ||
| 285 | + | ||
| 286 | + auto audio = Process(batch_x, sid, speed); | ||
| 287 | + ans.sample_rate = audio.sample_rate; | ||
| 288 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 289 | + audio.samples.end()); | ||
| 290 | + if (callback) { | ||
| 291 | + should_continue = callback(audio.samples.data(), audio.samples.size(), | ||
| 292 | + (b + 1) * 1.0 / num_batches); | ||
| 293 | + // Caution(fangjun): audio is freed when the callback returns, so users | ||
| 294 | + // should copy the data if they want to access the data after | ||
| 295 | + // the callback returns to avoid segmentation fault. | ||
| 296 | + } | ||
| 297 | + } | ||
| 298 | + | ||
| 299 | + batch_x.clear(); | ||
| 300 | + while (k < static_cast<int32_t>(x.size()) && should_continue) { | ||
| 301 | + batch_x.push_back(std::move(x[k])); | ||
| 302 | + | ||
| 303 | + ++k; | ||
| 304 | + } | ||
| 305 | + | ||
| 306 | + if (!batch_x.empty()) { | ||
| 307 | + auto audio = Process(batch_x, sid, speed); | ||
| 308 | + ans.sample_rate = audio.sample_rate; | ||
| 309 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 310 | + audio.samples.end()); | ||
| 311 | + if (callback) { | ||
| 312 | + callback(audio.samples.data(), audio.samples.size(), 1.0); | ||
| 313 | + // Caution(fangjun): audio is freed when the callback returns, so users | ||
| 314 | + // should copy the data if they want to access the data after | ||
| 315 | + // the callback returns to avoid segmentation fault. | ||
| 316 | + } | ||
| 317 | + } | ||
| 318 | + | ||
| 319 | + return ans; | ||
| 320 | + } | ||
| 321 | + | ||
| 322 | + private: | ||
| 323 | + template <typename Manager> | ||
| 324 | + void InitFrontend(Manager *mgr) {} | ||
| 325 | + | ||
| 326 | + void InitFrontend() { | ||
| 327 | + frontend_ = std::make_unique<JiebaLexicon>( | ||
| 328 | + config_.model.matcha.lexicon, config_.model.matcha.tokens, | ||
| 329 | + config_.model.matcha.dict_dir, config_.model.debug); | ||
| 330 | + } | ||
| 331 | + | ||
| 332 | + GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, | ||
| 333 | + int32_t sid, float speed) const { | ||
| 334 | + int32_t num_tokens = 0; | ||
| 335 | + for (const auto &k : tokens) { | ||
| 336 | + num_tokens += k.size(); | ||
| 337 | + } | ||
| 338 | + | ||
| 339 | + std::vector<int64_t> x; | ||
| 340 | + x.reserve(num_tokens); | ||
| 341 | + for (const auto &k : tokens) { | ||
| 342 | + x.insert(x.end(), k.begin(), k.end()); | ||
| 343 | + } | ||
| 344 | + | ||
| 345 | + auto memory_info = | ||
| 346 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 347 | + | ||
| 348 | + std::array<int64_t, 2> x_shape = {1, static_cast<int32_t>(x.size())}; | ||
| 349 | + Ort::Value x_tensor = Ort::Value::CreateTensor( | ||
| 350 | + memory_info, x.data(), x.size(), x_shape.data(), x_shape.size()); | ||
| 351 | + | ||
| 352 | + Ort::Value mel = model_->Run(std::move(x_tensor), sid, speed); | ||
| 353 | + Ort::Value audio = vocoder_->Run(std::move(mel)); | ||
| 354 | + | ||
| 355 | + std::vector<int64_t> audio_shape = | ||
| 356 | + audio.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 357 | + | ||
| 358 | + int64_t total = 1; | ||
| 359 | + // The output shape may be (1, 1, total) or (1, total) or (total,) | ||
| 360 | + for (auto i : audio_shape) { | ||
| 361 | + total *= i; | ||
| 362 | + } | ||
| 363 | + | ||
| 364 | + const float *p = audio.GetTensorData<float>(); | ||
| 365 | + | ||
| 366 | + GeneratedAudio ans; | ||
| 367 | + ans.sample_rate = model_->GetMetaData().sample_rate; | ||
| 368 | + ans.samples = std::vector<float>(p, p + total); | ||
| 369 | + return ans; | ||
| 370 | + } | ||
| 371 | + | ||
| 372 | + private: | ||
| 373 | + OfflineTtsConfig config_; | ||
| 374 | + std::unique_ptr<OfflineTtsMatchaModel> model_; | ||
| 375 | + std::unique_ptr<HifiganVocoder> vocoder_; | ||
| 376 | + std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_; | ||
| 377 | + std::unique_ptr<OfflineTtsFrontend> frontend_; | ||
| 378 | +}; | ||
| 379 | + | ||
| 380 | +} // namespace sherpa_onnx | ||
| 381 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_IMPL_H_ |
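Tying the pieces together: OfflineTtsImpl::Create() in offline-tts-impl.cc above now returns this Matcha implementation whenever config.model.vits.model is left empty, so selecting Matcha from C++ is purely a matter of filling in the matcha config fields. A minimal, non-authoritative sketch, assuming the matcha-icefall-zh-baker files downloaded by the test scripts at the top of this diff; the field names follow the command-line flags registered in offline-tts-matcha-model-config.cc below.

#include "sherpa-onnx/csrc/offline-tts-impl.h"

void Demo() {
  sherpa_onnx::OfflineTtsConfig config;
  config.model.matcha.acoustic_model =
      "./matcha-icefall-zh-baker/model-steps-3.onnx";
  config.model.matcha.vocoder = "./hifigan_v2.onnx";
  config.model.matcha.lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
  config.model.matcha.tokens = "./matcha-icefall-zh-baker/tokens.txt";
  config.model.matcha.dict_dir = "./matcha-icefall-zh-baker/dict";
  config.model.num_threads = 2;

  // config.model.vits.model is empty, so Create() picks OfflineTtsMatchaImpl.
  auto tts = sherpa_onnx::OfflineTtsImpl::Create(config);

  auto audio = tts->Generate("你好", /*sid=*/0, /*speed=*/1.0);
  // audio.samples holds float32 PCM at audio.sample_rate.
}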
sherpa-onnx/csrc/offline-tts-matcha-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-matcha-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 10 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +void OfflineTtsMatchaModelConfig::Register(ParseOptions *po) { | ||
| 15 | + po->Register("matcha-acoustic-model", &acoustic_model, | ||
| 16 | + "Path to matcha acoustic model"); | ||
| 17 | + po->Register("matcha-vocoder", &vocoder, "Path to matcha vocoder"); | ||
| 18 | + po->Register("matcha-lexicon", &lexicon, | ||
| 19 | + "Path to lexicon.txt for Matcha models"); | ||
| 20 | + po->Register("matcha-tokens", &tokens, | ||
| 21 | + "Path to tokens.txt for Matcha models"); | ||
| 22 | + po->Register("matcha-data-dir", &data_dir, | ||
| 23 | + "Path to the directory containing dict for espeak-ng. If it is " | ||
| 24 | + "given, --matcha-lexicon is ignored."); | ||
| 25 | + po->Register("matcha-dict-dir", &dict_dir, | ||
| 26 | + "Path to the directory containing dict for jieba. Used only for " | ||
| 27 | + "Chinese TTS models using jieba"); | ||
| 28 | + po->Register("matcha-noise-scale", &noise_scale, | ||
| 29 | + "noise_scale for Matcha models"); | ||
| 30 | + po->Register("matcha-length-scale", &length_scale, | ||
| 31 | + "Speech speed. Larger->Slower; Smaller->faster."); | ||
| 32 | +} | ||
| 33 | + | ||
| 34 | +bool OfflineTtsMatchaModelConfig::Validate() const { | ||
| 35 | + if (acoustic_model.empty()) { | ||
| 36 | + SHERPA_ONNX_LOGE("Please provide --matcha-acoustic-model"); | ||
| 37 | + return false; | ||
| 38 | + } | ||
| 39 | + | ||
| 40 | + if (!FileExists(acoustic_model)) { | ||
| 41 | + SHERPA_ONNX_LOGE("--matcha-acoustic-model: '%s' does not exist", | ||
| 42 | + acoustic_model.c_str()); | ||
| 43 | + return false; | ||
| 44 | + } | ||
| 45 | + | ||
| 46 | + if (vocoder.empty()) { | ||
| 47 | + SHERPA_ONNX_LOGE("Please provide --matcha-vocoder"); | ||
| 48 | + return false; | ||
| 49 | + } | ||
| 50 | + | ||
| 51 | + if (!FileExists(vocoder)) { | ||
| 52 | + SHERPA_ONNX_LOGE("--matcha-vocoder: '%s' does not exist", vocoder.c_str()); | ||
| 53 | + return false; | ||
| 54 | + } | ||
| 55 | + | ||
| 56 | + if (tokens.empty()) { | ||
| 57 | + SHERPA_ONNX_LOGE("Please provide --matcha-tokens"); | ||
| 58 | + return false; | ||
| 59 | + } | ||
| 60 | + | ||
| 61 | + if (!FileExists(tokens)) { | ||
| 62 | + SHERPA_ONNX_LOGE("--matcha-tokens: '%s' does not exist", tokens.c_str()); | ||
| 63 | + return false; | ||
| 64 | + } | ||
| 65 | + | ||
| 66 | + if (!data_dir.empty()) { | ||
| 67 | + if (!FileExists(data_dir + "/phontab")) { | ||
| 68 | + SHERPA_ONNX_LOGE( | ||
| 69 | + "'%s/phontab' does not exist. Please check --matcha-data-dir", | ||
| 70 | + data_dir.c_str()); | ||
| 71 | + return false; | ||
| 72 | + } | ||
| 73 | + | ||
| 74 | + if (!FileExists(data_dir + "/phonindex")) { | ||
| 75 | + SHERPA_ONNX_LOGE( | ||
| 76 | + "'%s/phonindex' does not exist. Please check --matcha-data-dir", | ||
| 77 | + data_dir.c_str()); | ||
| 78 | + return false; | ||
| 79 | + } | ||
| 80 | + | ||
| 81 | + if (!FileExists(data_dir + "/phondata")) { | ||
| 82 | + SHERPA_ONNX_LOGE( | ||
| 83 | + "'%s/phondata' does not exist. Please check --matcha-data-dir", | ||
| 84 | + data_dir.c_str()); | ||
| 85 | + return false; | ||
| 86 | + } | ||
| 87 | + | ||
| 88 | + if (!FileExists(data_dir + "/intonations")) { | ||
| 89 | + SHERPA_ONNX_LOGE( | ||
| 90 | + "'%s/intonations' does not exist. Please check --matcha-data-dir", | ||
| 91 | + data_dir.c_str()); | ||
| 92 | + return false; | ||
| 93 | + } | ||
| 94 | + } | ||
| 95 | + | ||
| 96 | + if (!dict_dir.empty()) { | ||
| 97 | + std::vector<std::string> required_files = { | ||
| 98 | + "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8", | ||
| 99 | + "idf.utf8", "stop_words.utf8", | ||
| 100 | + }; | ||
| 101 | + | ||
| 102 | + for (const auto &f : required_files) { | ||
| 103 | + if (!FileExists(dict_dir + "/" + f)) { | ||
| 104 | + SHERPA_ONNX_LOGE( | ||
| 105 | + "'%s/%s' does not exist. Please check --matcha-dict-dir", | ||
| 106 | + dict_dir.c_str(), f.c_str()); | ||
| 107 | + return false; | ||
| 108 | + } | ||
| 109 | + } | ||
| 110 | + | ||
| 111 | + // when --matcha-dict-dir is given, --matcha-lexicon must not be empty | ||
| 112 | + if (lexicon.empty()) { | ||
| 113 | + SHERPA_ONNX_LOGE("Please provide --matcha-lexicon"); | ||
| 114 | + return false; | ||
| 115 | + } | ||
| 116 | + | ||
| 117 | + if (!FileExists(lexicon)) { | ||
| 118 | + SHERPA_ONNX_LOGE("--matcha-lexicon: '%s' does not exist", | ||
| 119 | + lexicon.c_str()); | ||
| 120 | + return false; | ||
| 121 | + } | ||
| 122 | + } | ||
| 123 | + | ||
| 124 | + return true; | ||
| 125 | +} | ||
| 126 | + | ||
| 127 | +std::string OfflineTtsMatchaModelConfig::ToString() const { | ||
| 128 | + std::ostringstream os; | ||
| 129 | + | ||
| 130 | + os << "OfflineTtsMatchaModelConfig("; | ||
| 131 | + os << "acoustic_model=\"" << acoustic_model << "\", "; | ||
| 132 | + os << "vocoder=\"" << vocoder << "\", "; | ||
| 133 | + os << "lexicon=\"" << lexicon << "\", "; | ||
| 134 | + os << "tokens=\"" << tokens << "\", "; | ||
| 135 | + os << "data_dir=\"" << data_dir << "\", "; | ||
| 136 | + os << "dict_dir=\"" << dict_dir << "\", "; | ||
| 137 | + os << "noise_scale=" << noise_scale << ", "; | ||
| 138 | + os << "length_scale=" << length_scale << ")"; | ||
| 139 | + | ||
| 140 | + return os.str(); | ||
| 141 | +} | ||
| 142 | + | ||
| 143 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/csrc/offline-tts-matcha-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include <string> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +struct OfflineTtsMatchaModelConfig { | ||
| 15 | + std::string acoustic_model; | ||
| 16 | + std::string vocoder; | ||
| 17 | + std::string lexicon; | ||
| 18 | + std::string tokens; | ||
| 19 | + | ||
| 20 | + // If data_dir is given, lexicon is ignored | ||
| 21 | + // data_dir is for piper-phonemizer, which uses espeak-ng | ||
| 22 | + std::string data_dir; | ||
| 23 | + | ||
| 24 | + // Used for Chinese TTS models using jieba | ||
| 25 | + std::string dict_dir; | ||
| 26 | + | ||
| 27 | + float noise_scale = 1; | ||
| 28 | + float length_scale = 1; | ||
| 29 | + | ||
| 30 | + OfflineTtsMatchaModelConfig() = default; | ||
| 31 | + | ||
| 32 | + OfflineTtsMatchaModelConfig(const std::string &acoustic_model, | ||
| 33 | + const std::string &vocoder, | ||
| 34 | + const std::string &lexicon, | ||
| 35 | + const std::string &tokens, | ||
| 36 | + const std::string &data_dir, | ||
| 37 | + const std::string &dict_dir, | ||
| 38 | + float noise_scale = 1.0, float length_scale = 1) | ||
| 39 | + : acoustic_model(acoustic_model), | ||
| 40 | + vocoder(vocoder), | ||
| 41 | + lexicon(lexicon), | ||
| 42 | + tokens(tokens), | ||
| 43 | + data_dir(data_dir), | ||
| 44 | + dict_dir(dict_dir), | ||
| 45 | + noise_scale(noise_scale), | ||
| 46 | + length_scale(length_scale) {} | ||
| 47 | + | ||
| 48 | + void Register(ParseOptions *po); | ||
| 49 | + bool Validate() const; | ||
| 50 | + | ||
| 51 | + std::string ToString() const; | ||
| 52 | +}; | ||
| 53 | + | ||
| 54 | +} // namespace sherpa_onnx | ||
| 55 | + | ||
| 56 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_ |
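Note (reviewer sketch): the struct above is consumed through `OfflineTtsModelConfig`, whose `Validate()` falls back to `matcha.Validate()` when no VITS model is given (see the change to `offline-tts-model-config.cc` below). The snippet is a minimal, illustrative sketch of how a caller might fill it in; the file paths are placeholders and the snippet itself is not part of this change.

```cpp
#include "sherpa-onnx/csrc/offline-tts-model-config.h"

int main() {
  sherpa_onnx::OfflineTtsModelConfig model_config;

  // Leave model_config.vits empty so that Validate() takes the Matcha branch.
  model_config.matcha.acoustic_model = "/path/to/model-steps-3.onnx";  // placeholder
  model_config.matcha.vocoder = "/path/to/hifigan.onnx";               // placeholder
  model_config.matcha.lexicon = "/path/to/lexicon.txt";                // placeholder
  model_config.matcha.tokens = "/path/to/tokens.txt";                  // placeholder
  model_config.matcha.dict_dir = "/path/to/dict";                      // jieba data dir, placeholder
  model_config.num_threads = 2;
  model_config.debug = true;

  // Missing files are reported via SHERPA_ONNX_LOGE and Validate() returns false.
  return model_config.Validate() ? 0 : 1;
}
```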
| 1 | +// sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ | ||
| 7 | + | ||
| 8 | +#include <cstdint> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +// If you are not sure what each field means, please | ||
| 14 | +// have a look at the Python file in the model directory that | ||
| 15 | +// you have downloaded. | ||
| 16 | +struct OfflineTtsMatchaModelMetaData { | ||
| 17 | + int32_t sample_rate = 0; | ||
| 18 | + int32_t num_speakers = 0; | ||
| 19 | + int32_t version = 1; | ||
| 20 | + int32_t jieba = 0; | ||
| 21 | + int32_t espeak = 0; | ||
| 22 | + int32_t use_eos_bos = 0; | ||
| 23 | + int32_t pad_id = 0; | ||
| 24 | +}; | ||
| 25 | + | ||
| 26 | +} // namespace sherpa_onnx | ||
| 27 | + | ||
| 28 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ |
sherpa-onnx/csrc/offline-tts-matcha-model.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-matcha-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model.h" | ||
| 6 | + | ||
| 7 | +#include <algorithm> | ||
| 8 | +#include <string> | ||
| 9 | +#include <utility> | ||
| 10 | +#include <vector> | ||
| 11 | + | ||
| 12 | +#if __ANDROID_API__ >= 9 | ||
| 13 | +#include "android/asset_manager.h" | ||
| 14 | +#include "android/asset_manager_jni.h" | ||
| 15 | +#endif | ||
| 16 | + | ||
| 17 | +#if __OHOS__ | ||
| 18 | +#include "rawfile/raw_file_manager.h" | ||
| 19 | +#endif | ||
| 20 | + | ||
| 21 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 22 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 23 | +#include "sherpa-onnx/csrc/session.h" | ||
| 24 | + | ||
| 25 | +namespace sherpa_onnx { | ||
| 26 | + | ||
| 27 | +class OfflineTtsMatchaModel::Impl { | ||
| 28 | + public: | ||
| 29 | + explicit Impl(const OfflineTtsModelConfig &config) | ||
| 30 | + : config_(config), | ||
| 31 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 32 | + sess_opts_(GetSessionOptions(config)), | ||
| 33 | + allocator_{} { | ||
| 34 | + auto buf = ReadFile(config.matcha.acoustic_model); | ||
| 35 | + Init(buf.data(), buf.size()); | ||
| 36 | + } | ||
| 37 | + | ||
| 38 | + template <typename Manager> | ||
| 39 | + Impl(Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 40 | + : config_(config), | ||
| 41 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 42 | + sess_opts_(GetSessionOptions(config)), | ||
| 43 | + allocator_{} { | ||
| 44 | + auto buf = ReadFile(mgr, config.matcha.acoustic_model); | ||
| 45 | + Init(buf.data(), buf.size()); | ||
| 46 | + } | ||
| 47 | + | ||
| 48 | + const OfflineTtsMatchaModelMetaData &GetMetaData() const { | ||
| 49 | + return meta_data_; | ||
| 50 | + } | ||
| 51 | + | ||
| 52 | + Ort::Value Run(Ort::Value x, int64_t sid, float speed) { | ||
| 53 | + auto memory_info = | ||
| 54 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 55 | + | ||
| 56 | + std::vector<int64_t> x_shape = x.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 57 | + if (x_shape[0] != 1) { | ||
| 58 | + SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d", | ||
| 59 | + static_cast<int32_t>(x_shape[0])); | ||
| 60 | + exit(-1); | ||
| 61 | + } | ||
| 62 | + | ||
| 63 | + int64_t len = x_shape[1]; | ||
| 64 | + int64_t len_shape = 1; | ||
| 65 | + | ||
| 66 | + Ort::Value x_length = | ||
| 67 | + Ort::Value::CreateTensor(memory_info, &len, 1, &len_shape, 1); | ||
| 68 | + | ||
| 69 | + int64_t scale_shape = 1; | ||
| 70 | + float noise_scale = config_.matcha.noise_scale; | ||
| 71 | + float length_scale = config_.matcha.length_scale; | ||
| 72 | + | ||
| 73 | + if (speed != 1 && speed > 0) { | ||
| 74 | + length_scale = 1. / speed; | ||
| 75 | + } | ||
| 76 | + | ||
| 77 | + Ort::Value noise_scale_tensor = | ||
| 78 | + Ort::Value::CreateTensor(memory_info, &noise_scale, 1, &scale_shape, 1); | ||
| 79 | + | ||
| 80 | + Ort::Value length_scale_tensor = Ort::Value::CreateTensor( | ||
| 81 | + memory_info, &length_scale, 1, &scale_shape, 1); | ||
| 82 | + | ||
| 83 | + Ort::Value sid_tensor = | ||
| 84 | + Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1); | ||
| 85 | + | ||
| 86 | + std::vector<Ort::Value> inputs; | ||
| 87 | + inputs.reserve(5); | ||
| 88 | + inputs.push_back(std::move(x)); | ||
| 89 | + inputs.push_back(std::move(x_length)); | ||
| 90 | + inputs.push_back(std::move(noise_scale_tensor)); | ||
| 91 | + inputs.push_back(std::move(length_scale_tensor)); | ||
| 92 | + | ||
| 93 | + if (input_names_.size() == 5 && input_names_.back() == "sid") { | ||
| 94 | + inputs.push_back(std::move(sid_tensor)); | ||
| 95 | + } | ||
| 96 | + | ||
| 97 | + auto out = | ||
| 98 | + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), | ||
| 99 | + output_names_ptr_.data(), output_names_ptr_.size()); | ||
| 100 | + | ||
| 101 | + return std::move(out[0]); | ||
| 102 | + } | ||
| 103 | + | ||
| 104 | + private: | ||
| 105 | + void Init(void *model_data, size_t model_data_length) { | ||
| 106 | + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | ||
| 107 | + sess_opts_); | ||
| 108 | + | ||
| 109 | + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); | ||
| 110 | + | ||
| 111 | + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); | ||
| 112 | + | ||
| 113 | + // get meta data | ||
| 114 | + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); | ||
| 115 | + if (config_.debug) { | ||
| 116 | + std::ostringstream os; | ||
| 117 | + os << "---matcha model---\n"; | ||
| 118 | + PrintModelMetadata(os, meta_data); | ||
| 119 | + | ||
| 120 | + os << "----------input names----------\n"; | ||
| 121 | + int32_t i = 0; | ||
| 122 | + for (const auto &s : input_names_) { | ||
| 123 | + os << i << " " << s << "\n"; | ||
| 124 | + ++i; | ||
| 125 | + } | ||
| 126 | + os << "----------output names----------\n"; | ||
| 127 | + i = 0; | ||
| 128 | + for (const auto &s : output_names_) { | ||
| 129 | + os << i << " " << s << "\n"; | ||
| 130 | + ++i; | ||
| 131 | + } | ||
| 132 | + | ||
| 133 | +#if __OHOS__ | ||
| 134 | + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); | ||
| 135 | +#else | ||
| 136 | + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); | ||
| 137 | +#endif | ||
| 138 | + } | ||
| 139 | + | ||
| 140 | + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below | ||
| 141 | + SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate"); | ||
| 142 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); | ||
| 143 | + SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); | ||
| 144 | + SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba"); | ||
| 145 | + SHERPA_ONNX_READ_META_DATA(meta_data_.espeak, "has_espeak"); | ||
| 146 | + SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); | ||
| 147 | + SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); | ||
| 148 | + } | ||
| 149 | + | ||
| 150 | + private: | ||
| 151 | + OfflineTtsModelConfig config_; | ||
| 152 | + Ort::Env env_; | ||
| 153 | + Ort::SessionOptions sess_opts_; | ||
| 154 | + Ort::AllocatorWithDefaultOptions allocator_; | ||
| 155 | + | ||
| 156 | + std::unique_ptr<Ort::Session> sess_; | ||
| 157 | + | ||
| 158 | + std::vector<std::string> input_names_; | ||
| 159 | + std::vector<const char *> input_names_ptr_; | ||
| 160 | + | ||
| 161 | + std::vector<std::string> output_names_; | ||
| 162 | + std::vector<const char *> output_names_ptr_; | ||
| 163 | + | ||
| 164 | + OfflineTtsMatchaModelMetaData meta_data_; | ||
| 165 | +}; | ||
| 166 | + | ||
| 167 | +OfflineTtsMatchaModel::OfflineTtsMatchaModel( | ||
| 168 | + const OfflineTtsModelConfig &config) | ||
| 169 | + : impl_(std::make_unique<Impl>(config)) {} | ||
| 170 | + | ||
| 171 | +template <typename Manager> | ||
| 172 | +OfflineTtsMatchaModel::OfflineTtsMatchaModel( | ||
| 173 | + Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 174 | + : impl_(std::make_unique<Impl>(mgr, config)) {} | ||
| 175 | + | ||
| 176 | +OfflineTtsMatchaModel::~OfflineTtsMatchaModel() = default; | ||
| 177 | + | ||
| 178 | +const OfflineTtsMatchaModelMetaData &OfflineTtsMatchaModel::GetMetaData() | ||
| 179 | + const { | ||
| 180 | + return impl_->GetMetaData(); | ||
| 181 | +} | ||
| 182 | + | ||
| 183 | +Ort::Value OfflineTtsMatchaModel::Run(Ort::Value x, int64_t sid /*= 0*/, | ||
| 184 | + float speed /*= 1.0*/) const { | ||
| 185 | + return impl_->Run(std::move(x), sid, speed); | ||
| 186 | +} | ||
| 187 | + | ||
| 188 | +#if __ANDROID_API__ >= 9 | ||
| 189 | +template OfflineTtsMatchaModel::OfflineTtsMatchaModel( | ||
| 190 | + AAssetManager *mgr, const OfflineTtsModelConfig &config); | ||
| 191 | +#endif | ||
| 192 | + | ||
| 193 | +#if __OHOS__ | ||
| 194 | +template OfflineTtsMatchaModel::OfflineTtsMatchaModel( | ||
| 195 | + NativeResourceManager *mgr, const OfflineTtsModelConfig &config); | ||
| 196 | +#endif | ||
| 197 | + | ||
| 198 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/offline-tts-matcha-model.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-matcha-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_H_ | ||
| 7 | + | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +#include "onnxruntime_cxx_api.h" // NOLINT | ||
| 12 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h" | ||
| 13 | +#include "sherpa-onnx/csrc/offline-tts-model-config.h" | ||
| 14 | + | ||
| 15 | +namespace sherpa_onnx { | ||
| 16 | + | ||
| 17 | +class OfflineTtsMatchaModel { | ||
| 18 | + public: | ||
| 19 | + ~OfflineTtsMatchaModel(); | ||
| 20 | + | ||
| 21 | + explicit OfflineTtsMatchaModel(const OfflineTtsModelConfig &config); | ||
| 22 | + | ||
| 23 | + template <typename Manager> | ||
| 24 | + OfflineTtsMatchaModel(Manager *mgr, const OfflineTtsModelConfig &config); | ||
| 25 | + | ||
| 26 | + // Return a float32 tensor containing the mel | ||
| 27 | + // of shape (batch_size, mel_dim, num_frames) | ||
| 28 | + Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const; | ||
| 29 | + | ||
| 30 | + const OfflineTtsMatchaModelMetaData &GetMetaData() const; | ||
| 31 | + | ||
| 32 | + private: | ||
| 33 | + class Impl; | ||
| 34 | + std::unique_ptr<Impl> impl_; | ||
| 35 | +}; | ||
| 36 | + | ||
| 37 | +} // namespace sherpa_onnx | ||
| 38 | + | ||
| 39 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_H_ |
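Note (reviewer sketch): the two-stage flow (acoustic model -> mel -> HiFi-GAN vocoder -> waveform) is what `Process()` in `offline-tts-matcha-impl.h` above implements; the helper below condenses it for illustration only. The free function, its name, the vocoder header path, and the non-const vocoder pointer are assumptions; `OfflineTtsMatchaModel::Run()` and `HifiganVocoder::Run()` are used exactly as in the impl above.

```cpp
#include <array>
#include <cstdint>
#include <utility>
#include <vector>

#include "onnxruntime_cxx_api.h"  // NOLINT
#include "sherpa-onnx/csrc/offline-tts-matcha-model.h"
// Assumed header path for HifiganVocoder; the class itself is referenced by the impl above.
#include "sherpa-onnx/csrc/hifigan-vocoder.h"

// Convert a flat list of token IDs into waveform samples (illustrative helper).
std::vector<float> TokensToSamples(
    const sherpa_onnx::OfflineTtsMatchaModel &model,
    sherpa_onnx::HifiganVocoder *vocoder, std::vector<int64_t> token_ids,
    int64_t sid = 0, float speed = 1.0f) {
  auto memory_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

  std::array<int64_t, 2> x_shape = {1, static_cast<int64_t>(token_ids.size())};
  Ort::Value x =
      Ort::Value::CreateTensor(memory_info, token_ids.data(), token_ids.size(),
                               x_shape.data(), x_shape.size());

  // Acoustic model: token IDs -> float32 mel of shape (1, mel_dim, num_frames).
  Ort::Value mel = model.Run(std::move(x), sid, speed);

  // Vocoder: mel -> waveform; the output may be (1, 1, n), (1, n), or (n,).
  Ort::Value audio = vocoder->Run(std::move(mel));

  int64_t total = 1;
  for (auto d : audio.GetTensorTypeAndShapeInfo().GetShape()) {
    total *= d;
  }

  const float *p = audio.GetTensorData<float>();
  return {p, p + total};
}
```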
| @@ -10,6 +10,7 @@ namespace sherpa_onnx { | @@ -10,6 +10,7 @@ namespace sherpa_onnx { | ||
| 10 | 10 | ||
| 11 | void OfflineTtsModelConfig::Register(ParseOptions *po) { | 11 | void OfflineTtsModelConfig::Register(ParseOptions *po) { |
| 12 | vits.Register(po); | 12 | vits.Register(po); |
| 13 | + matcha.Register(po); | ||
| 13 | 14 | ||
| 14 | po->Register("num-threads", &num_threads, | 15 | po->Register("num-threads", &num_threads, |
| 15 | "Number of threads to run the neural network"); | 16 | "Number of threads to run the neural network"); |
| @@ -27,7 +28,11 @@ bool OfflineTtsModelConfig::Validate() const { | @@ -27,7 +28,11 @@ bool OfflineTtsModelConfig::Validate() const { | ||
| 27 | return false; | 28 | return false; |
| 28 | } | 29 | } |
| 29 | 30 | ||
| 30 | - return vits.Validate(); | 31 | + if (!vits.model.empty()) { |
| 32 | + return vits.Validate(); | ||
| 33 | + } | ||
| 34 | + | ||
| 35 | + return matcha.Validate(); | ||
| 31 | } | 36 | } |
| 32 | 37 | ||
| 33 | std::string OfflineTtsModelConfig::ToString() const { | 38 | std::string OfflineTtsModelConfig::ToString() const { |
| @@ -35,6 +40,7 @@ std::string OfflineTtsModelConfig::ToString() const { | @@ -35,6 +40,7 @@ std::string OfflineTtsModelConfig::ToString() const { | ||
| 35 | 40 | ||
| 36 | os << "OfflineTtsModelConfig("; | 41 | os << "OfflineTtsModelConfig("; |
| 37 | os << "vits=" << vits.ToString() << ", "; | 42 | os << "vits=" << vits.ToString() << ", "; |
| 43 | + os << "matcha=" << matcha.ToString() << ", "; | ||
| 38 | os << "num_threads=" << num_threads << ", "; | 44 | os << "num_threads=" << num_threads << ", "; |
| 39 | os << "debug=" << (debug ? "True" : "False") << ", "; | 45 | os << "debug=" << (debug ? "True" : "False") << ", "; |
| 40 | os << "provider=\"" << provider << "\")"; | 46 | os << "provider=\"" << provider << "\")"; |
| @@ -7,6 +7,7 @@ | @@ -7,6 +7,7 @@ | ||
| 7 | 7 | ||
| 8 | #include <string> | 8 | #include <string> |
| 9 | 9 | ||
| 10 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" | ||
| 10 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" | 11 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" |
| 11 | #include "sherpa-onnx/csrc/parse-options.h" | 12 | #include "sherpa-onnx/csrc/parse-options.h" |
| 12 | 13 | ||
| @@ -14,6 +15,7 @@ namespace sherpa_onnx { | @@ -14,6 +15,7 @@ namespace sherpa_onnx { | ||
| 14 | 15 | ||
| 15 | struct OfflineTtsModelConfig { | 16 | struct OfflineTtsModelConfig { |
| 16 | OfflineTtsVitsModelConfig vits; | 17 | OfflineTtsVitsModelConfig vits; |
| 18 | + OfflineTtsMatchaModelConfig matcha; | ||
| 17 | 19 | ||
| 18 | int32_t num_threads = 1; | 20 | int32_t num_threads = 1; |
| 19 | bool debug = false; | 21 | bool debug = false; |
| @@ -22,9 +24,11 @@ struct OfflineTtsModelConfig { | @@ -22,9 +24,11 @@ struct OfflineTtsModelConfig { | ||
| 22 | OfflineTtsModelConfig() = default; | 24 | OfflineTtsModelConfig() = default; |
| 23 | 25 | ||
| 24 | OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, | 26 | OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, |
| 27 | + const OfflineTtsMatchaModelConfig &matcha, | ||
| 25 | int32_t num_threads, bool debug, | 28 | int32_t num_threads, bool debug, |
| 26 | const std::string &provider) | 29 | const std::string &provider) |
| 27 | : vits(vits), | 30 | : vits(vits), |
| 31 | + matcha(matcha), | ||
| 28 | num_threads(num_threads), | 32 | num_threads(num_threads), |
| 29 | debug(debug), | 33 | debug(debug), |
| 30 | provider(provider) {} | 34 | provider(provider) {} |
| @@ -156,17 +156,31 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -156,17 +156,31 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 156 | int32_t num_speakers = meta_data.num_speakers; | 156 | int32_t num_speakers = meta_data.num_speakers; |
| 157 | 157 | ||
| 158 | if (num_speakers == 0 && sid != 0) { | 158 | if (num_speakers == 0 && sid != 0) { |
| 159 | +#if __OHOS__ | ||
| 160 | + SHERPA_ONNX_LOGE( | ||
| 161 | + "This is a single-speaker model and supports only sid 0. Given sid: " | ||
| 162 | + "%{public}d. sid is ignored", | ||
| 163 | + static_cast<int32_t>(sid)); | ||
| 164 | +#else | ||
| 159 | SHERPA_ONNX_LOGE( | 165 | SHERPA_ONNX_LOGE( |
| 160 | "This is a single-speaker model and supports only sid 0. Given sid: " | 166 | "This is a single-speaker model and supports only sid 0. Given sid: " |
| 161 | "%d. sid is ignored", | 167 | "%d. sid is ignored", |
| 162 | static_cast<int32_t>(sid)); | 168 | static_cast<int32_t>(sid)); |
| 169 | +#endif | ||
| 163 | } | 170 | } |
| 164 | 171 | ||
| 165 | if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) { | 172 | if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) { |
| 173 | +#if __OHOS__ | ||
| 174 | + SHERPA_ONNX_LOGE( | ||
| 175 | + "This model contains only %{public}d speakers. sid should be in the " | ||
| 176 | + "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0", | ||
| 177 | + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | ||
| 178 | +#else | ||
| 166 | SHERPA_ONNX_LOGE( | 179 | SHERPA_ONNX_LOGE( |
| 167 | "This model contains only %d speakers. sid should be in the range " | 180 | "This model contains only %d speakers. sid should be in the range " |
| 168 | "[%d, %d]. Given: %d. Use sid=0", | 181 | "[%d, %d]. Given: %d. Use sid=0", |
| 169 | num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); | 182 | num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); |
| 183 | +#endif | ||
| 170 | sid = 0; | 184 | sid = 0; |
| 171 | } | 185 | } |
| 172 | 186 | ||
| @@ -389,8 +403,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -389,8 +403,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 389 | } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) { | 403 | } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) { |
| 390 | frontend_ = std::make_unique<JiebaLexicon>( | 404 | frontend_ = std::make_unique<JiebaLexicon>( |
| 391 | config_.model.vits.lexicon, config_.model.vits.tokens, | 405 | config_.model.vits.lexicon, config_.model.vits.tokens, |
| 392 | - config_.model.vits.dict_dir, model_->GetMetaData(), | ||
| 393 | - config_.model.debug); | 406 | + config_.model.vits.dict_dir, config_.model.debug); |
| 394 | } else if ((meta_data.is_piper || meta_data.is_coqui || | 407 | } else if ((meta_data.is_piper || meta_data.is_coqui || |
| 395 | meta_data.is_icefall) && | 408 | meta_data.is_icefall) && |
| 396 | !config_.model.vits.data_dir.empty()) { | 409 | !config_.model.vits.data_dir.empty()) { |
| @@ -410,17 +423,6 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -410,17 +423,6 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 410 | } | 423 | } |
| 411 | } | 424 | } |
| 412 | 425 | ||
| 413 | - std::vector<int64_t> AddBlank(const std::vector<int64_t> &x) const { | ||
| 414 | - // we assume the blank ID is 0 | ||
| 415 | - std::vector<int64_t> buffer(x.size() * 2 + 1); | ||
| 416 | - int32_t i = 1; | ||
| 417 | - for (auto k : x) { | ||
| 418 | - buffer[i] = k; | ||
| 419 | - i += 2; | ||
| 420 | - } | ||
| 421 | - return buffer; | ||
| 422 | - } | ||
| 423 | - | ||
| 424 | GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, | 426 | GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, |
| 425 | const std::vector<std::vector<int64_t>> &tones, | 427 | const std::vector<std::vector<int64_t>> &tones, |
| 426 | int32_t sid, float speed) const { | 428 | int32_t sid, float speed) const { |
| @@ -51,25 +51,30 @@ bool OfflineTtsVitsModelConfig::Validate() const { | @@ -51,25 +51,30 @@ bool OfflineTtsVitsModelConfig::Validate() const { | ||
| 51 | 51 | ||
| 52 | if (!data_dir.empty()) { | 52 | if (!data_dir.empty()) { |
| 53 | if (!FileExists(data_dir + "/phontab")) { | 53 | if (!FileExists(data_dir + "/phontab")) { |
| 54 | - SHERPA_ONNX_LOGE("'%s/phontab' does not exist. Skipping test", | ||
| 55 | - data_dir.c_str()); | 54 | + SHERPA_ONNX_LOGE( |
| 55 | + "'%s/phontab' does not exist. Please check --vits-data-dir", | ||
| 56 | + data_dir.c_str()); | ||
| 56 | return false; | 57 | return false; |
| 57 | } | 58 | } |
| 58 | 59 | ||
| 59 | if (!FileExists(data_dir + "/phonindex")) { | 60 | if (!FileExists(data_dir + "/phonindex")) { |
| 60 | - SHERPA_ONNX_LOGE("'%s/phonindex' does not exist. Skipping test", | ||
| 61 | - data_dir.c_str()); | 61 | + SHERPA_ONNX_LOGE( |
| 62 | + "'%s/phonindex' does not exist. Please check --vits-data-dir", | ||
| 63 | + data_dir.c_str()); | ||
| 62 | return false; | 64 | return false; |
| 63 | } | 65 | } |
| 64 | 66 | ||
| 65 | if (!FileExists(data_dir + "/phondata")) { | 67 | if (!FileExists(data_dir + "/phondata")) { |
| 66 | - SHERPA_ONNX_LOGE("'%s/phondata' does not exist. Skipping test", | ||
| 67 | - data_dir.c_str()); | 68 | + SHERPA_ONNX_LOGE( |
| 69 | + "'%s/phondata' does not exist. Please check --vits-data-dir", | ||
| 70 | + data_dir.c_str()); | ||
| 68 | return false; | 71 | return false; |
| 69 | } | 72 | } |
| 70 | 73 | ||
| 71 | if (!FileExists(data_dir + "/intonations")) { | 74 | if (!FileExists(data_dir + "/intonations")) { |
| 72 | - SHERPA_ONNX_LOGE("'%s/intonations' does not exist.", data_dir.c_str()); | 75 | + SHERPA_ONNX_LOGE( |
| 76 | + "'%s/intonations' does not exist. Please check --vits-data-dir", | ||
| 77 | + data_dir.c_str()); | ||
| 73 | return false; | 78 | return false; |
| 74 | } | 79 | } |
| 75 | } | 80 | } |
| @@ -82,8 +87,8 @@ bool OfflineTtsVitsModelConfig::Validate() const { | @@ -82,8 +87,8 @@ bool OfflineTtsVitsModelConfig::Validate() const { | ||
| 82 | 87 | ||
| 83 | for (const auto &f : required_files) { | 88 | for (const auto &f : required_files) { |
| 84 | if (!FileExists(dict_dir + "/" + f)) { | 89 | if (!FileExists(dict_dir + "/" + f)) { |
| 85 | - SHERPA_ONNX_LOGE("'%s/%s' does not exist.", dict_dir.c_str(), | ||
| 86 | - f.c_str()); | 90 | + SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check --vits-dict-dir", |
| 91 | + dict_dir.c_str(), f.c_str()); | ||
| 87 | return false; | 92 | return false; |
| 88 | } | 93 | } |
| 89 | } | 94 | } |
| @@ -174,7 +174,7 @@ class OfflineTtsVitsModel::Impl { | @@ -174,7 +174,7 @@ class OfflineTtsVitsModel::Impl { | ||
| 174 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); | 174 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); |
| 175 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); | 175 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); |
| 176 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_eos_bos, | 176 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_eos_bos, |
| 177 | - "use_eos_bos", 0); | 177 | + "use_eos_bos", 1); |
| 178 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.pad_id, "pad_id", 0); | 178 | SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.pad_id, "pad_id", 0); |
| 179 | 179 | ||
| 180 | std::string comment; | 180 | std::string comment; |
| @@ -362,7 +362,7 @@ Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/, | @@ -362,7 +362,7 @@ Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/, | ||
| 362 | 362 | ||
| 363 | Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, Ort::Value tones, | 363 | Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, Ort::Value tones, |
| 364 | int64_t sid /*= 0*/, | 364 | int64_t sid /*= 0*/, |
| 365 | - float speed /*= 1.0*/) { | 365 | + float speed /*= 1.0*/) const { |
| 366 | return impl_->Run(std::move(x), std::move(tones), sid, speed); | 366 | return impl_->Run(std::move(x), std::move(tones), sid, speed); |
| 367 | } | 367 | } |
| 368 | 368 |
| @@ -37,7 +37,7 @@ class OfflineTtsVitsModel { | @@ -37,7 +37,7 @@ class OfflineTtsVitsModel { | ||
| 37 | 37 | ||
| 38 | // This is for MeloTTS | 38 | // This is for MeloTTS |
| 39 | Ort::Value Run(Ort::Value x, Ort::Value tones, int64_t sid = 0, | 39 | Ort::Value Run(Ort::Value x, Ort::Value tones, int64_t sid = 0, |
| 40 | - float speed = 1.0); | 40 | + float speed = 1.0) const; |
| 41 | 41 | ||
| 42 | const OfflineTtsVitsModelMetaData &GetMetaData() const; | 42 | const OfflineTtsVitsModelMetaData &GetMetaData() const; |
| 43 | 43 |
| @@ -273,4 +273,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) { | @@ -273,4 +273,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) { | ||
| 273 | return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider); | 273 | return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider); |
| 274 | } | 274 | } |
| 275 | 275 | ||
| 276 | +Ort::SessionOptions GetSessionOptions(int32_t num_threads, | ||
| 277 | + const std::string &provider_str) { | ||
| 278 | + return GetSessionOptionsImpl(num_threads, provider_str); | ||
| 279 | +} | ||
| 280 | + | ||
| 276 | } // namespace sherpa_onnx | 281 | } // namespace sherpa_onnx |
| @@ -26,6 +26,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config); | @@ -26,6 +26,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config); | ||
| 26 | Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config, | 26 | Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config, |
| 27 | const std::string &model_type); | 27 | const std::string &model_type); |
| 28 | 28 | ||
| 29 | +Ort::SessionOptions GetSessionOptions(int32_t num_threads, | ||
| 30 | + const std::string &provider_str); | ||
| 31 | + | ||
| 29 | template <typename T> | 32 | template <typename T> |
| 30 | Ort::SessionOptions GetSessionOptions(const T &config) { | 33 | Ort::SessionOptions GetSessionOptions(const T &config) { |
| 31 | return GetSessionOptionsImpl(config.num_threads, config.provider); | 34 | return GetSessionOptionsImpl(config.num_threads, config.provider); |
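Note (reviewer sketch): the new `GetSessionOptions(num_threads, provider_str)` overload lets a component that carries only a thread count and provider string (presumably the standalone vocoder) build session options without a full model config. A hypothetical call, with made-up values:

```cpp
Ort::SessionOptions sess_opts =
    sherpa_onnx::GetSessionOptions(/*num_threads=*/2, /*provider_str=*/"cpu");
```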
| @@ -72,6 +72,10 @@ or details. | @@ -72,6 +72,10 @@ or details. | ||
| 72 | exit(EXIT_FAILURE); | 72 | exit(EXIT_FAILURE); |
| 73 | } | 73 | } |
| 74 | 74 | ||
| 75 | + if (config.model.debug) { | ||
| 76 | + fprintf(stderr, "%s\n", config.model.ToString().c_str()); | ||
| 77 | + } | ||
| 78 | + | ||
| 75 | if (!config.Validate()) { | 79 | if (!config.Validate()) { |
| 76 | fprintf(stderr, "Errors in config!\n"); | 80 | fprintf(stderr, "Errors in config!\n"); |
| 77 | exit(EXIT_FAILURE); | 81 | exit(EXIT_FAILURE); |
| @@ -54,6 +54,7 @@ endif() | @@ -54,6 +54,7 @@ endif() | ||
| 54 | 54 | ||
| 55 | if(SHERPA_ONNX_ENABLE_TTS) | 55 | if(SHERPA_ONNX_ENABLE_TTS) |
| 56 | list(APPEND srcs | 56 | list(APPEND srcs |
| 57 | + offline-tts-matcha-model-config.cc | ||
| 57 | offline-tts-model-config.cc | 58 | offline-tts-model-config.cc |
| 58 | offline-tts-vits-model-config.cc | 59 | offline-tts-vits-model-config.cc |
| 59 | offline-tts.cc | 60 | offline-tts.cc |
| 1 | +// sherpa-onnx/python/csrc/offline-tts-matcha-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +void PybindOfflineTtsMatchaModelConfig(py::module *m) { | ||
| 14 | + using PyClass = OfflineTtsMatchaModelConfig; | ||
| 15 | + | ||
| 16 | + py::class_<PyClass>(*m, "OfflineTtsMatchaModelConfig") | ||
| 17 | + .def(py::init<>()) | ||
| 18 | + .def(py::init<const std::string &, const std::string &, | ||
| 19 | + const std::string &, const std::string &, | ||
| 20 | + const std::string &, const std::string &, float, float>(), | ||
| 21 | + py::arg("acoustic_model"), py::arg("vocoder"), py::arg("lexicon"), | ||
| 22 | + py::arg("tokens"), py::arg("data_dir") = "", | ||
| 23 | + py::arg("dict_dir") = "", py::arg("noise_scale") = 1.0, | ||
| 24 | + py::arg("length_scale") = 1.0) | ||
| 25 | + .def_readwrite("acoustic_model", &PyClass::acoustic_model) | ||
| 26 | + .def_readwrite("vocoder", &PyClass::vocoder) | ||
| 27 | + .def_readwrite("lexicon", &PyClass::lexicon) | ||
| 28 | + .def_readwrite("tokens", &PyClass::tokens) | ||
| 29 | + .def_readwrite("data_dir", &PyClass::data_dir) | ||
| 30 | + .def_readwrite("dict_dir", &PyClass::dict_dir) | ||
| 31 | + .def_readwrite("noise_scale", &PyClass::noise_scale) | ||
| 32 | + .def_readwrite("length_scale", &PyClass::length_scale) | ||
| 33 | + .def("__str__", &PyClass::ToString) | ||
| 34 | + .def("validate", &PyClass::Validate); | ||
| 35 | +} | ||
| 36 | + | ||
| 37 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindOfflineTtsMatchaModelConfig(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_ |
| @@ -7,22 +7,26 @@ | @@ -7,22 +7,26 @@ | ||
| 7 | #include <string> | 7 | #include <string> |
| 8 | 8 | ||
| 9 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" | 9 | #include "sherpa-onnx/csrc/offline-tts-model-config.h" |
| 10 | +#include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" | ||
| 10 | #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" | 11 | #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" |
| 11 | 12 | ||
| 12 | namespace sherpa_onnx { | 13 | namespace sherpa_onnx { |
| 13 | 14 | ||
| 14 | void PybindOfflineTtsModelConfig(py::module *m) { | 15 | void PybindOfflineTtsModelConfig(py::module *m) { |
| 15 | PybindOfflineTtsVitsModelConfig(m); | 16 | PybindOfflineTtsVitsModelConfig(m); |
| 17 | + PybindOfflineTtsMatchaModelConfig(m); | ||
| 16 | 18 | ||
| 17 | using PyClass = OfflineTtsModelConfig; | 19 | using PyClass = OfflineTtsModelConfig; |
| 18 | 20 | ||
| 19 | py::class_<PyClass>(*m, "OfflineTtsModelConfig") | 21 | py::class_<PyClass>(*m, "OfflineTtsModelConfig") |
| 20 | .def(py::init<>()) | 22 | .def(py::init<>()) |
| 21 | - .def(py::init<const OfflineTtsVitsModelConfig &, int32_t, bool, | 23 | + .def(py::init<const OfflineTtsVitsModelConfig &, |
| 24 | + const OfflineTtsMatchaModelConfig &, int32_t, bool, | ||
| 22 | const std::string &>(), | 25 | const std::string &>(), |
| 23 | - py::arg("vits"), py::arg("num_threads") = 1, | 26 | + py::arg("vits"), py::arg("matcha"), py::arg("num_threads") = 1, |
| 24 | py::arg("debug") = false, py::arg("provider") = "cpu") | 27 | py::arg("debug") = false, py::arg("provider") = "cpu") |
| 25 | .def_readwrite("vits", &PyClass::vits) | 28 | .def_readwrite("vits", &PyClass::vits) |
| 29 | + .def_readwrite("matcha", &PyClass::matcha) | ||
| 26 | .def_readwrite("num_threads", &PyClass::num_threads) | 30 | .def_readwrite("num_threads", &PyClass::num_threads) |
| 27 | .def_readwrite("debug", &PyClass::debug) | 31 | .def_readwrite("debug", &PyClass::debug) |
| 28 | .def_readwrite("provider", &PyClass::provider) | 32 | .def_readwrite("provider", &PyClass::provider) |
| @@ -20,6 +20,7 @@ from _sherpa_onnx import ( | @@ -20,6 +20,7 @@ from _sherpa_onnx import ( | ||
| 20 | OfflineStream, | 20 | OfflineStream, |
| 21 | OfflineTts, | 21 | OfflineTts, |
| 22 | OfflineTtsConfig, | 22 | OfflineTtsConfig, |
| 23 | + OfflineTtsMatchaModelConfig, | ||
| 23 | OfflineTtsModelConfig, | 24 | OfflineTtsModelConfig, |
| 24 | OfflineTtsVitsModelConfig, | 25 | OfflineTtsVitsModelConfig, |
| 25 | OfflineZipformerAudioTaggingModelConfig, | 26 | OfflineZipformerAudioTaggingModelConfig, |