Fangjun Kuang
Committed by GitHub

Add C++ runtime for Matcha-TTS (#1627)

@@ -19,6 +19,40 @@ which $EXE @@ -19,6 +19,40 @@ which $EXE
19 mkdir ./tts 19 mkdir ./tts
20 20
21 log "------------------------------------------------------------" 21 log "------------------------------------------------------------"
  22 +log "matcha-icefall-zh-baker"
  23 +log "------------------------------------------------------------"
  24 +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  25 +tar xvf matcha-icefall-zh-baker.tar.bz2
  26 +rm matcha-icefall-zh-baker.tar.bz2
  27 +
  28 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  29 +
  30 +$EXE \
  31 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  32 + --matcha-vocoder=./hifigan_v2.onnx \
  33 + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  34 + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  35 + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  36 + --num-threads=2 \
  37 + --debug=1 \
  38 + --output-filename=./tts/matcha-baker-zh-1.wav \
  39 + '小米的使命是,始终坚持做"感动人心、价格厚道"的好产品,让全球每个人都能享受科技带来的美好生活'
  40 +
  41 +$EXE \
  42 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  43 + --matcha-vocoder=./hifigan_v2.onnx \
  44 + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  45 + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  46 + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  47 + --num-threads=2 \
  48 + --debug=1 \
  49 + --output-filename=./tts/matcha-baker-zh-2.wav \
  50 + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。"
  51 +
  52 +rm hifigan_v2.onnx
  53 +rm -rf matcha-icefall-zh-baker
  54 +
  55 +log "------------------------------------------------------------"
22 log "vits-piper-en_US-amy-low" 56 log "vits-piper-en_US-amy-low"
23 log "------------------------------------------------------------" 57 log "------------------------------------------------------------"
24 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 58 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
@@ -269,6 +269,26 @@ mkdir ./tts @@ -269,6 +269,26 @@ mkdir ./tts
269 269
270 log "vits-ljs test" 270 log "vits-ljs test"
271 271
  272 +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  273 +tar xvf matcha-icefall-zh-baker.tar.bz2
  274 +rm matcha-icefall-zh-baker.tar.bz2
  275 +
  276 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  277 +
  278 +python3 ./python-api-examples/offline-tts.py \
  279 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  280 + --matcha-vocoder=./hifigan_v2.onnx \
  281 + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  282 + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  283 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  284 + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  285 + --output-filename=./tts/test-matcha.wav \
  286 + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
  287 +
  288 +rm -rf matcha-icefall-zh-baker
  289 +rm hifigan_v2.onnx
  290 +
  291 +
272 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx 292 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
273 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt 293 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
274 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt 294 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt
@@ -149,6 +149,23 @@ jobs: @@ -149,6 +149,23 @@ jobs:
149 name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} 149 name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
150 path: install/* 150 path: install/*
151 151
  152 + - name: Test offline TTS
  153 + if: matrix.with_tts == 'ON'
  154 + shell: bash
  155 + run: |
  156 + du -h -d1 .
  157 + export PATH=$PWD/build/bin:$PATH
  158 + export EXE=sherpa-onnx-offline-tts
  159 +
  160 + .github/scripts/test-offline-tts.sh
  161 + du -h -d1 .
  162 +
  163 + - uses: actions/upload-artifact@v4
  164 + if: matrix.with_tts == 'ON'
  165 + with:
  166 + name: tts-generated-test-files-${{ matrix.build_type }}-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
  167 + path: tts
  168 +
152 - name: Test offline Moonshine 169 - name: Test offline Moonshine
153 if: matrix.build_type != 'Debug' 170 if: matrix.build_type != 'Debug'
154 shell: bash 171 shell: bash
@@ -309,16 +326,7 @@ jobs: @@ -309,16 +326,7 @@ jobs:
309 .github/scripts/test-offline-whisper.sh 326 .github/scripts/test-offline-whisper.sh
310 du -h -d1 . 327 du -h -d1 .
311 328
312 - - name: Test offline TTS  
313 - if: matrix.with_tts == 'ON'  
314 - shell: bash  
315 - run: |  
316 - du -h -d1 .  
317 - export PATH=$PWD/build/bin:$PATH  
318 - export EXE=sherpa-onnx-offline-tts  
319 329
320 - .github/scripts/test-offline-tts.sh  
321 - du -h -d1 .  
322 330
323 - name: Test online paraformer 331 - name: Test online paraformer
324 shell: bash 332 shell: bash
@@ -367,8 +375,4 @@ jobs: @@ -367,8 +375,4 @@ jobs:
367 overwrite: true 375 overwrite: true
368 file: sherpa-onnx-*.tar.bz2 376 file: sherpa-onnx-*.tar.bz2
369 377
370 - - uses: actions/upload-artifact@v4  
371 - with:  
372 - name: tts-generated-test-files-${{ matrix.build_type }}-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}  
373 - path: tts  
374 378
@@ -121,6 +121,15 @@ jobs: @@ -121,6 +121,15 @@ jobs:
121 otool -L build/bin/sherpa-onnx 121 otool -L build/bin/sherpa-onnx
122 otool -l build/bin/sherpa-onnx 122 otool -l build/bin/sherpa-onnx
123 123
  124 + - name: Test offline TTS
  125 + if: matrix.with_tts == 'ON'
  126 + shell: bash
  127 + run: |
  128 + export PATH=$PWD/build/bin:$PATH
  129 + export EXE=sherpa-onnx-offline-tts
  130 +
  131 + .github/scripts/test-offline-tts.sh
  132 +
124 - name: Test offline Moonshine 133 - name: Test offline Moonshine
125 if: matrix.build_type != 'Debug' 134 if: matrix.build_type != 'Debug'
126 shell: bash 135 shell: bash
@@ -226,15 +235,6 @@ jobs: @@ -226,15 +235,6 @@ jobs:
226 235
227 .github/scripts/test-kws.sh 236 .github/scripts/test-kws.sh
228 237
229 - - name: Test offline TTS  
230 - if: matrix.with_tts == 'ON'  
231 - shell: bash  
232 - run: |  
233 - export PATH=$PWD/build/bin:$PATH  
234 - export EXE=sherpa-onnx-offline-tts  
235 -  
236 - .github/scripts/test-offline-tts.sh  
237 -  
238 - name: Test online paraformer 238 - name: Test online paraformer
239 shell: bash 239 shell: bash
240 run: | 240 run: |
@@ -11,7 +11,7 @@ while the model is still generating. @@ -11,7 +11,7 @@ while the model is still generating.
11 11
12 Usage: 12 Usage:
13 13
14 -Example (1/3) 14 +Example (1/4)
15 15
16 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 16 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
17 tar xf vits-piper-en_US-amy-low.tar.bz2 17 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \
23 --output-filename=./generated.wav \ 23 --output-filename=./generated.wav \
24 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." 24 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
25 25
26 -Example (2/3) 26 +Example (2/4)
27 27
28 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 28 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
29 tar xvf vits-zh-aishell3.tar.bz2 29 tar xvf vits-zh-aishell3.tar.bz2
@@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \
37 --output-filename=./liubei-21.wav \ 37 --output-filename=./liubei-21.wav \
38 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" 38 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
39 39
40 -Example (3/3) 40 +Example (3/4)
41 41
42 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 42 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
43 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 43 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -53,6 +53,24 @@ python3 ./python-api-examples/offline-tts-play.py \ @@ -53,6 +53,24 @@ python3 ./python-api-examples/offline-tts-play.py \
53 --output-filename=./test-2.wav \ 53 --output-filename=./test-2.wav \
54 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" 54 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
55 55
  56 +Example (4/4)
  57 +
  58 +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  59 +tar xvf matcha-icefall-zh-baker.tar.bz2
  60 +rm matcha-icefall-zh-baker.tar.bz2
  61 +
  62 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  63 +
  64 +python3 ./python-api-examples/offline-tts-play.py \
  65 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  66 + --matcha-vocoder=./hifigan_v2.onnx \
  67 + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  68 + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  69 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  70 + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  71 + --output-filename=./test-matcha.wav \
  72 + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
  73 +
56 74
57 You can find more models at 75 You can find more models at
58 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models 76 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
@@ -84,14 +102,11 @@ except ImportError: @@ -84,14 +102,11 @@ except ImportError:
84 sys.exit(-1) 102 sys.exit(-1)
85 103
86 104
87 -def get_args():  
88 - parser = argparse.ArgumentParser(  
89 - formatter_class=argparse.ArgumentDefaultsHelpFormatter  
90 - )  
91 - 105 +def add_vits_args(parser):
92 parser.add_argument( 106 parser.add_argument(
93 "--vits-model", 107 "--vits-model",
94 type=str, 108 type=str,
  109 + default="",
95 help="Path to vits model.onnx", 110 help="Path to vits model.onnx",
96 ) 111 )
97 112
@@ -124,6 +139,60 @@ def get_args(): @@ -124,6 +139,60 @@ def get_args():
124 help="Path to the dict directory for models using jieba", 139 help="Path to the dict directory for models using jieba",
125 ) 140 )
126 141
  142 +
  143 +def add_matcha_args(parser):
  144 + parser.add_argument(
  145 + "--matcha-acoustic-model",
  146 + type=str,
  147 + default="",
  148 + help="Path to model.onnx for matcha",
  149 + )
  150 +
  151 + parser.add_argument(
  152 + "--matcha-vocoder",
  153 + type=str,
  154 + default="",
  155 + help="Path to vocoder for matcha",
  156 + )
  157 +
  158 + parser.add_argument(
  159 + "--matcha-lexicon",
  160 + type=str,
  161 + default="",
  162 + help="Path to lexicon.txt for matcha",
  163 + )
  164 +
  165 + parser.add_argument(
  166 + "--matcha-tokens",
  167 + type=str,
  168 + default="",
  169 + help="Path to tokens.txt for matcha",
  170 + )
  171 +
  172 + parser.add_argument(
  173 + "--matcha-data-dir",
  174 + type=str,
  175 + default="",
  176 + help="""Path to the dict directory of espeak-ng. If it is specified,
  177 + --matcha-lexicon and --matcha-tokens are ignored""",
  178 + )
  179 +
  180 + parser.add_argument(
  181 + "--matcha-dict-dir",
  182 + type=str,
  183 + default="",
  184 + help="Path to the dict directory for models using jieba",
  185 + )
  186 +
  187 +
  188 +def get_args():
  189 + parser = argparse.ArgumentParser(
  190 + formatter_class=argparse.ArgumentDefaultsHelpFormatter
  191 + )
  192 +
  193 + add_vits_args(parser)
  194 + add_matcha_args(parser)
  195 +
127 parser.add_argument( 196 parser.add_argument(
128 "--tts-rule-fsts", 197 "--tts-rule-fsts",
129 type=str, 198 type=str,
@@ -313,6 +382,14 @@ def main(): @@ -313,6 +382,14 @@ def main():
313 dict_dir=args.vits_dict_dir, 382 dict_dir=args.vits_dict_dir,
314 tokens=args.vits_tokens, 383 tokens=args.vits_tokens,
315 ), 384 ),
  385 + matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
  386 + acoustic_model=args.matcha_acoustic_model,
  387 + vocoder=args.matcha_vocoder,
  388 + lexicon=args.matcha_lexicon,
  389 + tokens=args.matcha_tokens,
  390 + data_dir=args.matcha_data_dir,
  391 + dict_dir=args.matcha_dict_dir,
  392 + ),
316 provider=args.provider, 393 provider=args.provider,
317 debug=args.debug, 394 debug=args.debug,
318 num_threads=args.num_threads, 395 num_threads=args.num_threads,
@@ -12,7 +12,7 @@ generated audio. @@ -12,7 +12,7 @@ generated audio.
12 12
13 Usage: 13 Usage:
14 14
15 -Example (1/3) 15 +Example (1/4)
16 16
17 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 17 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
18 tar xf vits-piper-en_US-amy-low.tar.bz2 18 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \
24 --output-filename=./generated.wav \ 24 --output-filename=./generated.wav \
25 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." 25 "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
26 26
27 -Example (2/3) 27 +Example (2/4)
28 28
29 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 29 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
30 tar xvf vits-icefall-zh-aishell3.tar.bz2 30 tar xvf vits-icefall-zh-aishell3.tar.bz2
@@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \
38 --output-filename=./liubei-21.wav \ 38 --output-filename=./liubei-21.wav \
39 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" 39 "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
40 40
41 -Example (3/3) 41 +Example (3/4)
42 42
43 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 43 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
44 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 44 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -54,6 +54,23 @@ python3 ./python-api-examples/offline-tts.py \ @@ -54,6 +54,23 @@ python3 ./python-api-examples/offline-tts.py \
54 --output-filename=./test-2.wav \ 54 --output-filename=./test-2.wav \
55 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" 55 "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
56 56
  57 +Example (4/4)
  58 +
  59 +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  60 +tar xvf matcha-icefall-zh-baker.tar.bz2
  61 +rm matcha-icefall-zh-baker.tar.bz2
  62 +
  63 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  64 +
  65 +python3 ./python-api-examples/offline-tts.py \
  66 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  67 + --matcha-vocoder=./hifigan_v2.onnx \
  68 + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  69 + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  70 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  71 + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  72 + --output-filename=./test-matcha.wav \
  73 + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
57 74
58 You can find more models at 75 You can find more models at
59 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models 76 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
@@ -71,14 +88,11 @@ import sherpa_onnx @@ -71,14 +88,11 @@ import sherpa_onnx
71 import soundfile as sf 88 import soundfile as sf
72 89
73 90
74 -def get_args():  
75 - parser = argparse.ArgumentParser(  
76 - formatter_class=argparse.ArgumentDefaultsHelpFormatter  
77 - )  
78 - 91 +def add_vits_args(parser):
79 parser.add_argument( 92 parser.add_argument(
80 "--vits-model", 93 "--vits-model",
81 type=str, 94 type=str,
  95 + default="",
82 help="Path to vits model.onnx", 96 help="Path to vits model.onnx",
83 ) 97 )
84 98
@@ -111,6 +125,60 @@ def get_args(): @@ -111,6 +125,60 @@ def get_args():
111 help="Path to the dict directory for models using jieba", 125 help="Path to the dict directory for models using jieba",
112 ) 126 )
113 127
  128 +
  129 +def add_matcha_args(parser):
  130 + parser.add_argument(
  131 + "--matcha-acoustic-model",
  132 + type=str,
  133 + default="",
  134 + help="Path to model.onnx for matcha",
  135 + )
  136 +
  137 + parser.add_argument(
  138 + "--matcha-vocoder",
  139 + type=str,
  140 + default="",
  141 + help="Path to vocoder for matcha",
  142 + )
  143 +
  144 + parser.add_argument(
  145 + "--matcha-lexicon",
  146 + type=str,
  147 + default="",
  148 + help="Path to lexicon.txt for matcha",
  149 + )
  150 +
  151 + parser.add_argument(
  152 + "--matcha-tokens",
  153 + type=str,
  154 + default="",
  155 + help="Path to tokens.txt for matcha",
  156 + )
  157 +
  158 + parser.add_argument(
  159 + "--matcha-data-dir",
  160 + type=str,
  161 + default="",
  162 + help="""Path to the dict directory of espeak-ng. If it is specified,
  163 + --matcha-lexicon and --matcha-tokens are ignored""",
  164 + )
  165 +
  166 + parser.add_argument(
  167 + "--matcha-dict-dir",
  168 + type=str,
  169 + default="",
  170 + help="Path to the dict directory for models using jieba",
  171 + )
  172 +
  173 +
  174 +def get_args():
  175 + parser = argparse.ArgumentParser(
  176 + formatter_class=argparse.ArgumentDefaultsHelpFormatter
  177 + )
  178 +
  179 + add_vits_args(parser)
  180 + add_matcha_args(parser)
  181 +
114 parser.add_argument( 182 parser.add_argument(
115 "--tts-rule-fsts", 183 "--tts-rule-fsts",
116 type=str, 184 type=str,
@@ -196,6 +264,14 @@ def main(): @@ -196,6 +264,14 @@ def main():
196 dict_dir=args.vits_dict_dir, 264 dict_dir=args.vits_dict_dir,
197 tokens=args.vits_tokens, 265 tokens=args.vits_tokens,
198 ), 266 ),
  267 + matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
  268 + acoustic_model=args.matcha_acoustic_model,
  269 + vocoder=args.matcha_vocoder,
  270 + lexicon=args.matcha_lexicon,
  271 + tokens=args.matcha_tokens,
  272 + data_dir=args.matcha_data_dir,
  273 + dict_dir=args.matcha_dict_dir,
  274 + ),
199 provider=args.provider, 275 provider=args.provider,
200 debug=args.debug, 276 debug=args.debug,
201 num_threads=args.num_threads, 277 num_threads=args.num_threads,
@@ -151,12 +151,15 @@ list(APPEND sources @@ -151,12 +151,15 @@ list(APPEND sources
151 151
152 if(SHERPA_ONNX_ENABLE_TTS) 152 if(SHERPA_ONNX_ENABLE_TTS)
153 list(APPEND sources 153 list(APPEND sources
  154 + hifigan-vocoder.cc
154 jieba-lexicon.cc 155 jieba-lexicon.cc
155 lexicon.cc 156 lexicon.cc
156 melo-tts-lexicon.cc 157 melo-tts-lexicon.cc
157 offline-tts-character-frontend.cc 158 offline-tts-character-frontend.cc
158 offline-tts-frontend.cc 159 offline-tts-frontend.cc
159 offline-tts-impl.cc 160 offline-tts-impl.cc
  161 + offline-tts-matcha-model-config.cc
  162 + offline-tts-matcha-model.cc
160 offline-tts-model-config.cc 163 offline-tts-model-config.cc
161 offline-tts-vits-model-config.cc 164 offline-tts-vits-model-config.cc
162 offline-tts-vits-model.cc 165 offline-tts-vits-model.cc
  1 +// sherpa-onnx/csrc/hifigan-vocoder.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/hifigan-vocoder.h"
  6 +
  7 +#include <string>
  8 +#include <utility>
  9 +#include <vector>
  10 +
  11 +#if __ANDROID_API__ >= 9
  12 +#include "android/asset_manager.h"
  13 +#include "android/asset_manager_jni.h"
  14 +#endif
  15 +
  16 +#if __OHOS__
  17 +#include "rawfile/raw_file_manager.h"
  18 +#endif
  19 +
  20 +#include "sherpa-onnx/csrc/macros.h"
  21 +#include "sherpa-onnx/csrc/onnx-utils.h"
  22 +#include "sherpa-onnx/csrc/session.h"
  23 +
  24 +namespace sherpa_onnx {
  25 +
  26 +class HifiganVocoder::Impl {
  27 + public:
  28 + explicit Impl(int32_t num_threads, const std::string &provider,
  29 + const std::string &model)
  30 + : env_(ORT_LOGGING_LEVEL_ERROR),
  31 + sess_opts_(GetSessionOptions(num_threads, provider)),
  32 + allocator_{} {
  33 + auto buf = ReadFile(model);
  34 + Init(buf.data(), buf.size());
  35 + }
  36 +
  37 + template <typename Manager>
  38 + explicit Impl(Manager *mgr, int32_t num_threads, const std::string &provider,
  39 + const std::string &model)
  40 + : env_(ORT_LOGGING_LEVEL_ERROR),
  41 + sess_opts_(GetSessionOptions(num_threads, provider)),
  42 + allocator_{} {
  43 + auto buf = ReadFile(mgr, model);
  44 + Init(buf.data(), buf.size());
  45 + }
  46 +
  47 + Ort::Value Run(Ort::Value mel) const {
  48 + auto out = sess_->Run({}, input_names_ptr_.data(), &mel, 1,
  49 + output_names_ptr_.data(), output_names_ptr_.size());
  50 +
  51 + return std::move(out[0]);
  52 + }
  53 +
  54 + private:
  55 + void Init(void *model_data, size_t model_data_length) {
  56 + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
  57 + sess_opts_);
  58 +
  59 + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
  60 +
  61 + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
  62 + }
  63 +
  64 + private:
  65 + Ort::Env env_;
  66 + Ort::SessionOptions sess_opts_;
  67 + Ort::AllocatorWithDefaultOptions allocator_;
  68 +
  69 + std::unique_ptr<Ort::Session> sess_;
  70 +
  71 + std::vector<std::string> input_names_;
  72 + std::vector<const char *> input_names_ptr_;
  73 +
  74 + std::vector<std::string> output_names_;
  75 + std::vector<const char *> output_names_ptr_;
  76 +};
  77 +
  78 +HifiganVocoder::HifiganVocoder(int32_t num_threads, const std::string &provider,
  79 + const std::string &model)
  80 + : impl_(std::make_unique<Impl>(num_threads, provider, model)) {}
  81 +
  82 +template <typename Manager>
  83 +HifiganVocoder::HifiganVocoder(Manager *mgr, int32_t num_threads,
  84 + const std::string &provider,
  85 + const std::string &model)
  86 + : impl_(std::make_unique<Impl>(mgr, num_threads, provider, model)) {}
  87 +
  88 +HifiganVocoder::~HifiganVocoder() = default;
  89 +
  90 +Ort::Value HifiganVocoder::Run(Ort::Value mel) const {
  91 + return impl_->Run(std::move(mel));
  92 +}
  93 +
  94 +#if __ANDROID_API__ >= 9
  95 +template HifiganVocoder::HifiganVocoder(AAssetManager *mgr, int32_t num_threads,
  96 + const std::string &provider,
  97 + const std::string &model);
  98 +#endif
  99 +
  100 +#if __OHOS__
  101 +template HifiganVocoder::HifiganVocoder(NativeResourceManager *mgr,
  102 + int32_t num_threads,
  103 + const std::string &provider,
  104 + const std::string &model);
  105 +#endif
  106 +
  107 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/hifigan-vocoder.h
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_HIFIGAN_VOCODER_H_
  6 +#define SHERPA_ONNX_CSRC_HIFIGAN_VOCODER_H_
  7 +
  8 +#include <memory>
  9 +#include <string>
  10 +
  11 +#include "onnxruntime_cxx_api.h" // NOLINT
  12 +
  13 +namespace sherpa_onnx {
  14 +
  15 +class HifiganVocoder {
  16 + public:
  17 + ~HifiganVocoder();
  18 +
  19 + HifiganVocoder(int32_t num_threads, const std::string &provider,
  20 + const std::string &model);
  21 +
  22 + template <typename Manager>
  23 + HifiganVocoder(Manager *mgr, int32_t num_threads, const std::string &provider,
  24 + const std::string &model);
  25 +
  26 + /** @param mel A float32 tensor of shape (batch_size, feat_dim, num_frames).
  27 + * @return Return a float32 tensor of shape (batch_size, num_samples).
  28 + */
  29 + Ort::Value Run(Ort::Value mel) const;
  30 +
  31 + private:
  32 + class Impl;
  33 + std::unique_ptr<Impl> impl_;
  34 +};
  35 +
  36 +} // namespace sherpa_onnx
  37 +
  38 +#endif // SHERPA_ONNX_CSRC_HIFIGAN_VOCODER_H_
@@ -19,9 +19,8 @@ namespace sherpa_onnx { @@ -19,9 +19,8 @@ namespace sherpa_onnx {
19 class JiebaLexicon::Impl { 19 class JiebaLexicon::Impl {
20 public: 20 public:
21 Impl(const std::string &lexicon, const std::string &tokens, 21 Impl(const std::string &lexicon, const std::string &tokens,
22 - const std::string &dict_dir,  
23 - const OfflineTtsVitsModelMetaData &meta_data, bool debug)  
24 - : meta_data_(meta_data), debug_(debug) { 22 + const std::string &dict_dir, bool debug)
  23 + : debug_(debug) {
25 std::string dict = dict_dir + "/jieba.dict.utf8"; 24 std::string dict = dict_dir + "/jieba.dict.utf8";
26 std::string hmm = dict_dir + "/hmm_model.utf8"; 25 std::string hmm = dict_dir + "/hmm_model.utf8";
27 std::string user_dict = dict_dir + "/user.dict.utf8"; 26 std::string user_dict = dict_dir + "/user.dict.utf8";
@@ -84,7 +83,6 @@ class JiebaLexicon::Impl { @@ -84,7 +83,6 @@ class JiebaLexicon::Impl {
84 std::vector<TokenIDs> ans; 83 std::vector<TokenIDs> ans;
85 std::vector<int64_t> this_sentence; 84 std::vector<int64_t> this_sentence;
86 85
87 - int32_t blank = token2id_.at(" ");  
88 for (const auto &w : words) { 86 for (const auto &w : words) {
89 auto ids = ConvertWordToIds(w); 87 auto ids = ConvertWordToIds(w);
90 if (ids.empty()) { 88 if (ids.empty()) {
@@ -93,7 +91,6 @@ class JiebaLexicon::Impl { @@ -93,7 +91,6 @@ class JiebaLexicon::Impl {
93 } 91 }
94 92
95 this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); 93 this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
96 - this_sentence.push_back(blank);  
97 94
98 if (w == "。" || w == "!" || w == "?" || w == ",") { 95 if (w == "。" || w == "!" || w == "?" || w == ",") {
99 ans.emplace_back(std::move(this_sentence)); 96 ans.emplace_back(std::move(this_sentence));
@@ -135,7 +132,9 @@ class JiebaLexicon::Impl { @@ -135,7 +132,9 @@ class JiebaLexicon::Impl {
135 token2id_ = ReadTokens(is); 132 token2id_ = ReadTokens(is);
136 133
137 std::vector<std::pair<std::string, std::string>> puncts = { 134 std::vector<std::pair<std::string, std::string>> puncts = {
138 - {",", ","}, {".", "。"}, {"!", "!"}, {"?", "?"}}; 135 + {",", ","}, {".", "。"}, {"!", "!"}, {"?", "?"}, {":", ":"},
  136 + {"\"", "“"}, {"\"", "”"}, {"'", "‘"}, {"'", "’"}, {";", ";"},
  137 + };
139 138
140 for (const auto &p : puncts) { 139 for (const auto &p : puncts) {
141 if (token2id_.count(p.first) && !token2id_.count(p.second)) { 140 if (token2id_.count(p.first) && !token2id_.count(p.second)) {
@@ -150,6 +149,10 @@ class JiebaLexicon::Impl { @@ -150,6 +149,10 @@ class JiebaLexicon::Impl {
150 if (!token2id_.count("、") && token2id_.count(",")) { 149 if (!token2id_.count("、") && token2id_.count(",")) {
151 token2id_["、"] = token2id_[","]; 150 token2id_["、"] = token2id_[","];
152 } 151 }
  152 +
  153 + if (!token2id_.count(";") && token2id_.count(",")) {
  154 + token2id_[";"] = token2id_[","];
  155 + }
153 } 156 }
154 157
155 void InitLexicon(std::istream &is) { 158 void InitLexicon(std::istream &is) {
@@ -195,8 +198,6 @@ class JiebaLexicon::Impl { @@ -195,8 +198,6 @@ class JiebaLexicon::Impl {
195 // tokens.txt is saved in token2id_ 198 // tokens.txt is saved in token2id_
196 std::unordered_map<std::string, int32_t> token2id_; 199 std::unordered_map<std::string, int32_t> token2id_;
197 200
198 - OfflineTtsVitsModelMetaData meta_data_;  
199 -  
200 std::unique_ptr<cppjieba::Jieba> jieba_; 201 std::unique_ptr<cppjieba::Jieba> jieba_;
201 bool debug_ = false; 202 bool debug_ = false;
202 }; 203 };
@@ -205,11 +206,8 @@ JiebaLexicon::~JiebaLexicon() = default; @@ -205,11 +206,8 @@ JiebaLexicon::~JiebaLexicon() = default;
205 206
206 JiebaLexicon::JiebaLexicon(const std::string &lexicon, 207 JiebaLexicon::JiebaLexicon(const std::string &lexicon,
207 const std::string &tokens, 208 const std::string &tokens,
208 - const std::string &dict_dir,  
209 - const OfflineTtsVitsModelMetaData &meta_data,  
210 - bool debug)  
211 - : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, meta_data,  
212 - debug)) {} 209 + const std::string &dict_dir, bool debug)
  210 + : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, debug)) {}
213 211
214 std::vector<TokenIDs> JiebaLexicon::ConvertTextToTokenIds( 212 std::vector<TokenIDs> JiebaLexicon::ConvertTextToTokenIds(
215 const std::string &text, const std::string & /*unused_voice = ""*/) const { 213 const std::string &text, const std::string & /*unused_voice = ""*/) const {
@@ -11,7 +11,6 @@ @@ -11,7 +11,6 @@
11 #include <vector> 11 #include <vector>
12 12
13 #include "sherpa-onnx/csrc/offline-tts-frontend.h" 13 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
14 -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"  
15 14
16 namespace sherpa_onnx { 15 namespace sherpa_onnx {
17 16
@@ -19,8 +18,7 @@ class JiebaLexicon : public OfflineTtsFrontend { @@ -19,8 +18,7 @@ class JiebaLexicon : public OfflineTtsFrontend {
19 public: 18 public:
20 ~JiebaLexicon() override; 19 ~JiebaLexicon() override;
21 JiebaLexicon(const std::string &lexicon, const std::string &tokens, 20 JiebaLexicon(const std::string &lexicon, const std::string &tokens,
22 - const std::string &dict_dir,  
23 - const OfflineTtsVitsModelMetaData &meta_data, bool debug); 21 + const std::string &dict_dir, bool debug);
24 22
25 std::vector<TokenIDs> ConvertTextToTokenIds( 23 std::vector<TokenIDs> ConvertTextToTokenIds(
26 const std::string &text, 24 const std::string &text,
@@ -5,6 +5,7 @@ @@ -5,6 +5,7 @@
5 #include "sherpa-onnx/csrc/offline-tts-impl.h" 5 #include "sherpa-onnx/csrc/offline-tts-impl.h"
6 6
7 #include <memory> 7 #include <memory>
  8 +#include <vector>
8 9
9 #if __ANDROID_API__ >= 9 10 #if __ANDROID_API__ >= 9
10 #include "android/asset_manager.h" 11 #include "android/asset_manager.h"
@@ -15,21 +16,39 @@ @@ -15,21 +16,39 @@
15 #include "rawfile/raw_file_manager.h" 16 #include "rawfile/raw_file_manager.h"
16 #endif 17 #endif
17 18
  19 +#include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
18 #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" 20 #include "sherpa-onnx/csrc/offline-tts-vits-impl.h"
19 21
20 namespace sherpa_onnx { 22 namespace sherpa_onnx {
21 23
  24 +std::vector<int64_t> OfflineTtsImpl::AddBlank(const std::vector<int64_t> &x,
  25 + int32_t blank_id /*= 0*/) const {
  26 + // we assume the blank ID is 0
  27 + std::vector<int64_t> buffer(x.size() * 2 + 1, blank_id);
  28 + int32_t i = 1;
  29 + for (auto k : x) {
  30 + buffer[i] = k;
  31 + i += 2;
  32 + }
  33 + return buffer;
  34 +}
  35 +
22 std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( 36 std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
23 const OfflineTtsConfig &config) { 37 const OfflineTtsConfig &config) {
24 - // TODO(fangjun): Support other types 38 + if (!config.model.vits.model.empty()) {
25 return std::make_unique<OfflineTtsVitsImpl>(config); 39 return std::make_unique<OfflineTtsVitsImpl>(config);
  40 + }
  41 + return std::make_unique<OfflineTtsMatchaImpl>(config);
26 } 42 }
27 43
28 template <typename Manager> 44 template <typename Manager>
29 std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( 45 std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
30 Manager *mgr, const OfflineTtsConfig &config) { 46 Manager *mgr, const OfflineTtsConfig &config) {
31 - // TODO(fangjun): Support other types 47 + if (!config.model.vits.model.empty()) {
32 return std::make_unique<OfflineTtsVitsImpl>(mgr, config); 48 return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
  49 + }
  50 +
  51 + return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
33 } 52 }
34 53
35 #if __ANDROID_API__ >= 9 54 #if __ANDROID_API__ >= 9
@@ -7,6 +7,7 @@ @@ -7,6 +7,7 @@
7 7
8 #include <memory> 8 #include <memory>
9 #include <string> 9 #include <string>
  10 +#include <vector>
10 11
11 #include "sherpa-onnx/csrc/offline-tts.h" 12 #include "sherpa-onnx/csrc/offline-tts.h"
12 13
@@ -32,6 +33,9 @@ class OfflineTtsImpl { @@ -32,6 +33,9 @@ class OfflineTtsImpl {
32 // Number of supported speakers. 33 // Number of supported speakers.
33 // If it supports only a single speaker, then it return 0 or 1. 34 // If it supports only a single speaker, then it return 0 or 1.
34 virtual int32_t NumSpeakers() const = 0; 35 virtual int32_t NumSpeakers() const = 0;
  36 +
  37 + std::vector<int64_t> AddBlank(const std::vector<int64_t> &x,
  38 + int32_t blank_id = 0) const;
35 }; 39 };
36 40
37 } // namespace sherpa_onnx 41 } // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/offline-tts-matcha-impl.h
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_IMPL_H_
  5 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_IMPL_H_
  6 +
  7 +#include <memory>
  8 +#include <string>
  9 +#include <strstream>
  10 +#include <utility>
  11 +#include <vector>
  12 +
  13 +#include "fst/extensions/far/far.h"
  14 +#include "kaldifst/csrc/kaldi-fst-io.h"
  15 +#include "kaldifst/csrc/text-normalizer.h"
  16 +#include "sherpa-onnx/csrc/hifigan-vocoder.h"
  17 +#include "sherpa-onnx/csrc/jieba-lexicon.h"
  18 +#include "sherpa-onnx/csrc/lexicon.h"
  19 +#include "sherpa-onnx/csrc/macros.h"
  20 +#include "sherpa-onnx/csrc/melo-tts-lexicon.h"
  21 +#include "sherpa-onnx/csrc/offline-tts-character-frontend.h"
  22 +#include "sherpa-onnx/csrc/offline-tts-frontend.h"
  23 +#include "sherpa-onnx/csrc/offline-tts-impl.h"
  24 +#include "sherpa-onnx/csrc/offline-tts-matcha-model.h"
  25 +#include "sherpa-onnx/csrc/onnx-utils.h"
  26 +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
  27 +#include "sherpa-onnx/csrc/text-utils.h"
  28 +
  29 +namespace sherpa_onnx {
  30 +
// Matcha-TTS implementation of the offline TTS interface.
//
// Pipeline: text -> (optional rule-fst/rule-far normalization) ->
// frontend (text -> token IDs) -> acoustic model (token IDs -> mel) ->
// HiFi-GAN vocoder (mel -> waveform samples).
class OfflineTtsMatchaImpl : public OfflineTtsImpl {
 public:
  explicit OfflineTtsMatchaImpl(const OfflineTtsConfig &config)
      : config_(config),
        model_(std::make_unique<OfflineTtsMatchaModel>(config.model)),
        vocoder_(std::make_unique<HifiganVocoder>(
            config.model.num_threads, config.model.provider,
            config.model.matcha.vocoder)) {
    InitFrontend();

    // config.rule_fsts is a comma-separated list of FST files; each one
    // becomes a text normalizer applied in order inside Generate().
    if (!config.rule_fsts.empty()) {
      std::vector<std::string> files;
      SplitStringToVector(config.rule_fsts, ",", false, &files);
      tn_list_.reserve(files.size());
      for (const auto &f : files) {
        if (config.model.debug) {
#if __OHOS__
          SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
#else
          SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
#endif
        }
        tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
      }
    }

    // config.rule_fars is a comma-separated list of FST archives; every
    // FST inside each archive is appended to the normalizer list.
    if (!config.rule_fars.empty()) {
      if (config.model.debug) {
        SHERPA_ONNX_LOGE("Loading FST archives");
      }
      std::vector<std::string> files;
      SplitStringToVector(config.rule_fars, ",", false, &files);

      tn_list_.reserve(files.size() + tn_list_.size());

      for (const auto &f : files) {
        if (config.model.debug) {
#if __OHOS__
          SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
#else
          SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
#endif
        }
        std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
            fst::FarReader<fst::StdArc>::Open(f));
        for (; !reader->Done(); reader->Next()) {
          std::unique_ptr<fst::StdConstFst> r(
              fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));

          tn_list_.push_back(
              std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
        }
      }

      if (config.model.debug) {
        SHERPA_ONNX_LOGE("FST archives loaded!");
      }
    }
  }

  // Constructor reading all files through a platform asset manager
  // (Android AAssetManager / OHOS NativeResourceManager).
  template <typename Manager>
  OfflineTtsMatchaImpl(Manager *mgr, const OfflineTtsConfig &config)
      : config_(config),
        model_(std::make_unique<OfflineTtsMatchaModel>(mgr, config.model)),
        vocoder_(std::make_unique<HifiganVocoder>(
            mgr, config.model.num_threads, config.model.provider,
            config.model.matcha.vocoder)) {
    InitFrontend(mgr);

    if (!config.rule_fsts.empty()) {
      std::vector<std::string> files;
      SplitStringToVector(config.rule_fsts, ",", false, &files);
      tn_list_.reserve(files.size());
      for (const auto &f : files) {
        if (config.model.debug) {
#if __OHOS__
          SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
#else
          SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
#endif
        }
        auto buf = ReadFile(mgr, f);
        // NOTE(review): std::istrstream is deprecated but matches the
        // in-memory-buffer pattern used elsewhere in this code base.
        std::istrstream is(buf.data(), buf.size());
        tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is));
      }
    }

    if (!config.rule_fars.empty()) {
      std::vector<std::string> files;
      SplitStringToVector(config.rule_fars, ",", false, &files);
      tn_list_.reserve(files.size() + tn_list_.size());

      for (const auto &f : files) {
        if (config.model.debug) {
#if __OHOS__
          SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
#else
          SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
#endif
        }

        auto buf = ReadFile(mgr, f);

        std::unique_ptr<std::istream> s(
            new std::istrstream(buf.data(), buf.size()));

        std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
            fst::FarReader<fst::StdArc>::Open(std::move(s)));

        for (; !reader->Done(); reader->Next()) {
          std::unique_ptr<fst::StdConstFst> r(
              fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));

          tn_list_.push_back(
              std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
        }  // for (; !reader->Done(); reader->Next())
      }  // for (const auto &f : files)
    }  // if (!config.rule_fars.empty())
  }

  int32_t SampleRate() const override {
    return model_->GetMetaData().sample_rate;
  }

  int32_t NumSpeakers() const override {
    return model_->GetMetaData().num_speakers;
  }

  // Synthesize `_text`. If the text splits into more sentences than
  // config_.max_num_sentences, synthesis runs batch by batch and
  // `callback` is invoked after each batch; returning 0 from the callback
  // aborts synthesis early.
  GeneratedAudio Generate(
      const std::string &_text, int64_t sid = 0, float speed = 1.0,
      GeneratedAudioCallback callback = nullptr) const override {
    const auto &meta_data = model_->GetMetaData();
    int32_t num_speakers = meta_data.num_speakers;

    // Out-of-range speaker IDs are not fatal: warn and fall back to 0.
    if (num_speakers == 0 && sid != 0) {
#if __OHOS__
      SHERPA_ONNX_LOGE(
          "This is a single-speaker model and supports only sid 0. Given sid: "
          "%{public}d. sid is ignored",
          static_cast<int32_t>(sid));
#else
      SHERPA_ONNX_LOGE(
          "This is a single-speaker model and supports only sid 0. Given sid: "
          "%d. sid is ignored",
          static_cast<int32_t>(sid));
#endif
    }

    if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) {
#if __OHOS__
      SHERPA_ONNX_LOGE(
          "This model contains only %{public}d speakers. sid should be in the "
          "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0",
          num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
#else
      SHERPA_ONNX_LOGE(
          "This model contains only %d speakers. sid should be in the range "
          "[%d, %d]. Given: %d. Use sid=0",
          num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
#endif
      sid = 0;
    }

    std::string text = _text;
    if (config_.model.debug) {
#if __OHOS__
      SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str());
#else
      SHERPA_ONNX_LOGE("Raw text: %s", text.c_str());
#endif
    }

    // Apply every loaded rule FST/FAR normalizer in registration order.
    if (!tn_list_.empty()) {
      for (const auto &tn : tn_list_) {
        text = tn->Normalize(text);
        if (config_.model.debug) {
#if __OHOS__
          SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str());
#else
          SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str());
#endif
        }
      }
    }

    // One TokenIDs entry per sentence. The voice argument is ignored by
    // JiebaLexicon (its second parameter is documented as unused).
    std::vector<TokenIDs> token_ids =
        frontend_->ConvertTextToTokenIds(text, "en-US");

    if (token_ids.empty() ||
        (token_ids.size() == 1 && token_ids[0].tokens.empty())) {
#if __OHOS__
      SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs",
                       text.c_str());
#else
      SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str());
#endif
      return {};
    }

    std::vector<std::vector<int64_t>> x;

    x.reserve(token_ids.size());

    for (auto &i : token_ids) {
      x.push_back(std::move(i.tokens));
    }

    // Interleave the model's pad token between tokens (see
    // OfflineTtsImpl::AddBlank).
    for (auto &k : x) {
      k = AddBlank(k, meta_data.pad_id);
    }

    int32_t x_size = static_cast<int32_t>(x.size());

    // Fast path: everything fits into a single model invocation.
    if (config_.max_num_sentences <= 0 || x_size <= config_.max_num_sentences) {
      auto ans = Process(x, sid, speed);
      if (callback) {
        callback(ans.samples.data(), ans.samples.size(), 1.0);
      }
      return ans;
    }

    // the input text is too long, we process sentences within it in batches
    // to avoid OOM. Batch size is config_.max_num_sentences
    std::vector<std::vector<int64_t>> batch_x;

    int32_t batch_size = config_.max_num_sentences;
    batch_x.reserve(config_.max_num_sentences);
    int32_t num_batches = x_size / batch_size;

    if (config_.model.debug) {
#if __OHOS__
      SHERPA_ONNX_LOGE(
          "Text is too long. Split it into %{public}d batches. batch size: "
          "%{public}d. Number of sentences: %{public}d",
          num_batches, batch_size, x_size);
#else
      SHERPA_ONNX_LOGE(
          "Text is too long. Split it into %d batches. batch size: %d. Number "
          "of sentences: %d",
          num_batches, batch_size, x_size);
#endif
    }

    GeneratedAudio ans;

    int32_t should_continue = 1;

    // k indexes the next unconsumed sentence across all batches.
    int32_t k = 0;

    for (int32_t b = 0; b != num_batches && should_continue; ++b) {
      batch_x.clear();
      for (int32_t i = 0; i != batch_size; ++i, ++k) {
        batch_x.push_back(std::move(x[k]));
      }

      auto audio = Process(batch_x, sid, speed);
      ans.sample_rate = audio.sample_rate;
      ans.samples.insert(ans.samples.end(), audio.samples.begin(),
                         audio.samples.end());
      if (callback) {
        should_continue = callback(audio.samples.data(), audio.samples.size(),
                                   (b + 1) * 1.0 / num_batches);
        // Caution(fangjun): audio is freed when the callback returns, so users
        // should copy the data if they want to access the data after
        // the callback returns to avoid segmentation fault.
      }
    }

    // Handle the tail batch (x_size is not necessarily a multiple of
    // batch_size).
    batch_x.clear();
    while (k < static_cast<int32_t>(x.size()) && should_continue) {
      batch_x.push_back(std::move(x[k]));

      ++k;
    }

    if (!batch_x.empty()) {
      auto audio = Process(batch_x, sid, speed);
      ans.sample_rate = audio.sample_rate;
      ans.samples.insert(ans.samples.end(), audio.samples.begin(),
                         audio.samples.end());
      if (callback) {
        callback(audio.samples.data(), audio.samples.size(), 1.0);
        // Caution(fangjun): audio is freed when the callback returns, so users
        // should copy the data if they want to access the data after
        // the callback returns to avoid segmentation fault.
      }
    }

    return ans;
  }

 private:
  // NOTE(review): this overload is a stub, so frontend_ stays null when the
  // asset-manager constructor is used (Android/OHOS) and Generate() would
  // dereference a null pointer there — confirm a Manager-aware JiebaLexicon
  // constructor is planned before shipping those builds.
  template <typename Manager>
  void InitFrontend(Manager *mgr) {}

  // Only the jieba frontend is wired up so far; models that need espeak-ng
  // data (--matcha-data-dir) are presumably not handled yet — TODO confirm.
  void InitFrontend() {
    frontend_ = std::make_unique<JiebaLexicon>(
        config_.model.matcha.lexicon, config_.model.matcha.tokens,
        config_.model.matcha.dict_dir, config_.model.debug);
  }

  // Flatten `tokens` into one (1, num_tokens) int64 tensor, run the
  // acoustic model to get a mel, then the vocoder to get samples.
  GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
                         int32_t sid, float speed) const {
    int32_t num_tokens = 0;
    for (const auto &k : tokens) {
      num_tokens += k.size();
    }

    std::vector<int64_t> x;
    x.reserve(num_tokens);
    for (const auto &k : tokens) {
      x.insert(x.end(), k.begin(), k.end());
    }

    auto memory_info =
        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

    // NOTE(review): the size is narrowed through int32_t before landing in
    // the int64_t shape; harmless for realistic inputs but truncates beyond
    // 2^31 tokens. Also, std::array is used without a direct <array>
    // include — presumably pulled in transitively.
    std::array<int64_t, 2> x_shape = {1, static_cast<int32_t>(x.size())};
    Ort::Value x_tensor = Ort::Value::CreateTensor(
        memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());

    Ort::Value mel = model_->Run(std::move(x_tensor), sid, speed);
    Ort::Value audio = vocoder_->Run(std::move(mel));

    std::vector<int64_t> audio_shape =
        audio.GetTensorTypeAndShapeInfo().GetShape();

    int64_t total = 1;
    // The output shape may be (1, 1, total) or (1, total) or (total,)
    for (auto i : audio_shape) {
      total *= i;
    }

    const float *p = audio.GetTensorData<float>();

    GeneratedAudio ans;
    ans.sample_rate = model_->GetMetaData().sample_rate;
    ans.samples = std::vector<float>(p, p + total);
    return ans;
  }

 private:
  OfflineTtsConfig config_;
  std::unique_ptr<OfflineTtsMatchaModel> model_;
  std::unique_ptr<HifiganVocoder> vocoder_;
  // Text normalizers from rule_fsts/rule_fars, applied in order.
  std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_;
  std::unique_ptr<OfflineTtsFrontend> frontend_;
};
  379 +
  380 +} // namespace sherpa_onnx
  381 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_IMPL_H_
  1 +// sherpa-onnx/csrc/offline-tts-matcha-model-config.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h"
  6 +
  7 +#include <vector>
  8 +
  9 +#include "sherpa-onnx/csrc/file-utils.h"
  10 +#include "sherpa-onnx/csrc/macros.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
// Registers all --matcha-* command-line flags with the option parser.
// Each flag writes directly into the corresponding member when parsed.
void OfflineTtsMatchaModelConfig::Register(ParseOptions *po) {
  po->Register("matcha-acoustic-model", &acoustic_model,
               "Path to matcha acoustic model");
  po->Register("matcha-vocoder", &vocoder, "Path to matcha vocoder");
  po->Register("matcha-lexicon", &lexicon,
               "Path to lexicon.txt for Matcha models");
  po->Register("matcha-tokens", &tokens,
               "Path to tokens.txt for Matcha models");
  po->Register("matcha-data-dir", &data_dir,
               "Path to the directory containing dict for espeak-ng. If it is "
               "given, --matcha-lexicon is ignored.");
  po->Register("matcha-dict-dir", &dict_dir,
               "Path to the directory containing dict for jieba. Used only for "
               "Chinese TTS models using jieba");
  po->Register("matcha-noise-scale", &noise_scale,
               "noise_scale for Matcha models");
  po->Register("matcha-length-scale", &length_scale,
               "Speech speed. Larger->Slower; Smaller->faster.");
}
  33 +
  34 +bool OfflineTtsMatchaModelConfig::Validate() const {
  35 + if (acoustic_model.empty()) {
  36 + SHERPA_ONNX_LOGE("Please provide --matcha-acoustic-model");
  37 + return false;
  38 + }
  39 +
  40 + if (!FileExists(acoustic_model)) {
  41 + SHERPA_ONNX_LOGE("--matcha-acoustic-model: '%s' does not exist",
  42 + acoustic_model.c_str());
  43 + return false;
  44 + }
  45 +
  46 + if (vocoder.empty()) {
  47 + SHERPA_ONNX_LOGE("Please provide --matcha-vocoder");
  48 + return false;
  49 + }
  50 +
  51 + if (!FileExists(vocoder)) {
  52 + SHERPA_ONNX_LOGE("--matcha-vocoder: '%s' does not exist", vocoder.c_str());
  53 + return false;
  54 + }
  55 +
  56 + if (tokens.empty()) {
  57 + SHERPA_ONNX_LOGE("Please provide --matcha-tokens");
  58 + return false;
  59 + }
  60 +
  61 + if (!FileExists(tokens)) {
  62 + SHERPA_ONNX_LOGE("--matcha-tokens: '%s' does not exist", tokens.c_str());
  63 + return false;
  64 + }
  65 +
  66 + if (!data_dir.empty()) {
  67 + if (!FileExists(data_dir + "/phontab")) {
  68 + SHERPA_ONNX_LOGE(
  69 + "'%s/phontab' does not exist. Please check --matcha-data-dir",
  70 + data_dir.c_str());
  71 + return false;
  72 + }
  73 +
  74 + if (!FileExists(data_dir + "/phonindex")) {
  75 + SHERPA_ONNX_LOGE(
  76 + "'%s/phonindex' does not exist. Please check --matcha-data-dir",
  77 + data_dir.c_str());
  78 + return false;
  79 + }
  80 +
  81 + if (!FileExists(data_dir + "/phondata")) {
  82 + SHERPA_ONNX_LOGE(
  83 + "'%s/phondata' does not exist. Please check --matcha-data-dir",
  84 + data_dir.c_str());
  85 + return false;
  86 + }
  87 +
  88 + if (!FileExists(data_dir + "/intonations")) {
  89 + SHERPA_ONNX_LOGE(
  90 + "'%s/intonations' does not exist. Please check --matcha-data-dir",
  91 + data_dir.c_str());
  92 + return false;
  93 + }
  94 + }
  95 +
  96 + if (!dict_dir.empty()) {
  97 + std::vector<std::string> required_files = {
  98 + "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8",
  99 + "idf.utf8", "stop_words.utf8",
  100 + };
  101 +
  102 + for (const auto &f : required_files) {
  103 + if (!FileExists(dict_dir + "/" + f)) {
  104 + SHERPA_ONNX_LOGE(
  105 + "'%s/%s' does not exist. Please check --matcha-dict-dir",
  106 + dict_dir.c_str(), f.c_str());
  107 + return false;
  108 + }
  109 + }
  110 +
  111 + // we require that --matcha-lexicon is not empty
  112 + if (lexicon.empty()) {
  113 + SHERPA_ONNX_LOGE("Please provide --matcha-lexicon");
  114 + return false;
  115 + }
  116 +
  117 + if (!FileExists(lexicon)) {
  118 + SHERPA_ONNX_LOGE("--matcha-lexicon: '%s' does not exist",
  119 + lexicon.c_str());
  120 + return false;
  121 + }
  122 + }
  123 +
  124 + return true;
  125 +}
  126 +
  127 +std::string OfflineTtsMatchaModelConfig::ToString() const {
  128 + std::ostringstream os;
  129 +
  130 + os << "OfflineTtsMatchaModelConfig(";
  131 + os << "acoustic_model=\"" << acoustic_model << "\", ";
  132 + os << "vocoder=\"" << vocoder << "\", ";
  133 + os << "lexicon=\"" << lexicon << "\", ";
  134 + os << "tokens=\"" << tokens << "\", ";
  135 + os << "data_dir=\"" << data_dir << "\", ";
  136 + os << "dict_dir=\"" << dict_dir << "\", ";
  137 + os << "noise_scale=" << noise_scale << ", ";
  138 + os << "length_scale=" << length_scale << ")";
  139 +
  140 + return os.str();
  141 +}
  142 +
  143 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/offline-tts-matcha-model-config.h
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_
  7 +
  8 +#include <string>
  9 +
  10 +#include "sherpa-onnx/csrc/parse-options.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
  14 +struct OfflineTtsMatchaModelConfig {
  15 + std::string acoustic_model;
  16 + std::string vocoder;
  17 + std::string lexicon;
  18 + std::string tokens;
  19 +
  20 + // If data_dir is given, lexicon is ignored
  21 + // data_dir is for piper-phonemizer, which uses espeak-ng
  22 + std::string data_dir;
  23 +
  24 + // Used for Chinese TTS models using jieba
  25 + std::string dict_dir;
  26 +
  27 + float noise_scale = 1;
  28 + float length_scale = 1;
  29 +
  30 + OfflineTtsMatchaModelConfig() = default;
  31 +
  32 + OfflineTtsMatchaModelConfig(const std::string &acoustic_model,
  33 + const std::string &vocoder,
  34 + const std::string &lexicon,
  35 + const std::string &tokens,
  36 + const std::string &data_dir,
  37 + const std::string &dict_dir,
  38 + float noise_scale = 1.0, float length_scale = 1)
  39 + : acoustic_model(acoustic_model),
  40 + vocoder(vocoder),
  41 + lexicon(lexicon),
  42 + tokens(tokens),
  43 + data_dir(data_dir),
  44 + dict_dir(dict_dir),
  45 + noise_scale(noise_scale),
  46 + length_scale(length_scale) {}
  47 +
  48 + void Register(ParseOptions *po);
  49 + bool Validate() const;
  50 +
  51 + std::string ToString() const;
  52 +};
  53 +
  54 +} // namespace sherpa_onnx
  55 +
  56 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_
  1 +// sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
  7 +
  8 +#include <cstdint>
  9 +#include <string>
  10 +
  11 +namespace sherpa_onnx {
  12 +
// Metadata read from the acoustic model's ONNX metadata section in
// offline-tts-matcha-model.cc. If you are not sure what a field means,
// have a look at the Python file in the model directory that you have
// downloaded.
struct OfflineTtsMatchaModelMetaData {
  int32_t sample_rate = 0;   // sample rate of the generated waveform
  int32_t num_speakers = 0;  // 0 or 1 means a single-speaker model
  int32_t version = 1;       // read from "version"; defaults to 1 if absent
  int32_t jieba = 0;         // read from "jieba"; presumably nonzero for
                             // jieba-based Chinese models — TODO confirm
  int32_t espeak = 0;        // read from "has_espeak"; presumably nonzero for
                             // espeak-ng phoneme models — TODO confirm
  int32_t use_eos_bos = 0;   // read from "use_eos_bos" — semantics not
                             // exercised in this commit; verify in frontend
  int32_t pad_id = 0;        // token ID interleaved between tokens (AddBlank)
};
  25 +
  26 +} // namespace sherpa_onnx
  27 +
  28 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
  1 +// sherpa-onnx/csrc/offline-tts-matcha-model.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/offline-tts-matcha-model.h"
  6 +
  7 +#include <algorithm>
  8 +#include <string>
  9 +#include <utility>
  10 +#include <vector>
  11 +
  12 +#if __ANDROID_API__ >= 9
  13 +#include "android/asset_manager.h"
  14 +#include "android/asset_manager_jni.h"
  15 +#endif
  16 +
  17 +#if __OHOS__
  18 +#include "rawfile/raw_file_manager.h"
  19 +#endif
  20 +
  21 +#include "sherpa-onnx/csrc/macros.h"
  22 +#include "sherpa-onnx/csrc/onnx-utils.h"
  23 +#include "sherpa-onnx/csrc/session.h"
  24 +
  25 +namespace sherpa_onnx {
  26 +
// Pimpl: owns the onnxruntime session for the Matcha acoustic model and
// the metadata parsed from the model file.
class OfflineTtsMatchaModel::Impl {
 public:
  explicit Impl(const OfflineTtsModelConfig &config)
      : config_(config),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{} {
    auto buf = ReadFile(config.matcha.acoustic_model);
    Init(buf.data(), buf.size());
  }

  // Same as above, but reads the model through a platform asset manager
  // (Android AAssetManager / OHOS NativeResourceManager).
  template <typename Manager>
  Impl(Manager *mgr, const OfflineTtsModelConfig &config)
      : config_(config),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{} {
    auto buf = ReadFile(mgr, config.matcha.acoustic_model);
    Init(buf.data(), buf.size());
  }

  const OfflineTtsMatchaModelMetaData &GetMetaData() const {
    return meta_data_;
  }

  // Run the acoustic model on a (1, num_tokens) int64 tensor of token IDs
  // and return the first output (the mel). Only batch_size == 1 is
  // supported; any other batch size aborts the process.
  Ort::Value Run(Ort::Value x, int64_t sid, float speed) {
    auto memory_info =
        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

    std::vector<int64_t> x_shape = x.GetTensorTypeAndShapeInfo().GetShape();
    if (x_shape[0] != 1) {
      SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d",
                       static_cast<int32_t>(x_shape[0]));
      exit(-1);
    }

    int64_t len = x_shape[1];
    int64_t len_shape = 1;

    Ort::Value x_length =
        Ort::Value::CreateTensor(memory_info, &len, 1, &len_shape, 1);

    int64_t scale_shape = 1;
    float noise_scale = config_.matcha.noise_scale;
    float length_scale = config_.matcha.length_scale;

    // A caller-provided speed overrides the configured length_scale;
    // speed > 1 shortens the output (length_scale = 1 / speed).
    // Non-positive speeds are ignored.
    if (speed != 1 && speed > 0) {
      length_scale = 1. / speed;
    }

    Ort::Value noise_scale_tensor =
        Ort::Value::CreateTensor(memory_info, &noise_scale, 1, &scale_shape, 1);

    Ort::Value length_scale_tensor = Ort::Value::CreateTensor(
        memory_info, &length_scale, 1, &scale_shape, 1);

    Ort::Value sid_tensor =
        Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1);

    // Inputs in model order: x, x_length, noise_scale, length_scale and,
    // only for multi-speaker models exporting a 5th "sid" input, sid.
    std::vector<Ort::Value> inputs;
    inputs.reserve(5);
    inputs.push_back(std::move(x));
    inputs.push_back(std::move(x_length));
    inputs.push_back(std::move(noise_scale_tensor));
    inputs.push_back(std::move(length_scale_tensor));

    if (input_names_.size() == 5 && input_names_.back() == "sid") {
      inputs.push_back(std::move(sid_tensor));
    }

    auto out =
        sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
                   output_names_ptr_.data(), output_names_ptr_.size());

    return std::move(out[0]);
  }

 private:
  // Build the session from an in-memory model buffer, cache input/output
  // names, optionally dump them in debug mode, then parse the metadata.
  void Init(void *model_data, size_t model_data_length) {
    sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
                                           sess_opts_);

    GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);

    GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);

    // get meta data
    Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
    if (config_.debug) {
      std::ostringstream os;
      os << "---matcha model---\n";
      PrintModelMetadata(os, meta_data);

      os << "----------input names----------\n";
      int32_t i = 0;
      for (const auto &s : input_names_) {
        os << i << " " << s << "\n";
        ++i;
      }
      os << "----------output names----------\n";
      i = 0;
      for (const auto &s : output_names_) {
        os << i << " " << s << "\n";
        ++i;
      }

#if __OHOS__
      SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
#else
      SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
#endif
    }

    Ort::AllocatorWithDefaultOptions allocator;  // used in the macro below
    // All keys except "version" are required; the macros exit on a
    // missing key.
    SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate");
    SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
    SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
    SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba");
    SHERPA_ONNX_READ_META_DATA(meta_data_.espeak, "has_espeak");
    SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
    SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
  }

 private:
  OfflineTtsModelConfig config_;
  Ort::Env env_;
  Ort::SessionOptions sess_opts_;
  Ort::AllocatorWithDefaultOptions allocator_;

  std::unique_ptr<Ort::Session> sess_;

  // Names plus parallel const char* views as required by Session::Run.
  std::vector<std::string> input_names_;
  std::vector<const char *> input_names_ptr_;

  std::vector<std::string> output_names_;
  std::vector<const char *> output_names_ptr_;

  OfflineTtsMatchaModelMetaData meta_data_;
};
  166 +
OfflineTtsMatchaModel::OfflineTtsMatchaModel(
    const OfflineTtsModelConfig &config)
    : impl_(std::make_unique<Impl>(config)) {}

// Constructor reading the model through a platform asset manager
// (Android AAssetManager / OHOS NativeResourceManager).
template <typename Manager>
OfflineTtsMatchaModel::OfflineTtsMatchaModel(
    Manager *mgr, const OfflineTtsModelConfig &config)
    : impl_(std::make_unique<Impl>(mgr, config)) {}

OfflineTtsMatchaModel::~OfflineTtsMatchaModel() = default;

const OfflineTtsMatchaModelMetaData &OfflineTtsMatchaModel::GetMetaData()
    const {
  return impl_->GetMetaData();
}

Ort::Value OfflineTtsMatchaModel::Run(Ort::Value x, int64_t sid /*= 0*/,
                                      float speed /*= 1.0*/) const {
  return impl_->Run(std::move(x), sid, speed);
}

// Explicit instantiations of the template constructor for the platforms
// that provide an asset manager.
#if __ANDROID_API__ >= 9
template OfflineTtsMatchaModel::OfflineTtsMatchaModel(
    AAssetManager *mgr, const OfflineTtsModelConfig &config);
#endif

#if __OHOS__
template OfflineTtsMatchaModel::OfflineTtsMatchaModel(
    NativeResourceManager *mgr, const OfflineTtsModelConfig &config);
#endif
  197 +
  198 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/offline-tts-matcha-model.h
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_H_
  7 +
  8 +#include <memory>
  9 +#include <string>
  10 +
  11 +#include "onnxruntime_cxx_api.h" // NOLINT
  12 +#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h"
  13 +#include "sherpa-onnx/csrc/offline-tts-model-config.h"
  14 +
  15 +namespace sherpa_onnx {
  16 +
// Wrapper around the Matcha acoustic model (token IDs -> mel spectrogram).
// The waveform itself is produced by a separate vocoder; see
// HifiganVocoder. Uses the pimpl idiom to keep onnxruntime details out of
// dependent headers.
class OfflineTtsMatchaModel {
 public:
  ~OfflineTtsMatchaModel();

  explicit OfflineTtsMatchaModel(const OfflineTtsModelConfig &config);

  // Load the model via a platform asset manager (Android/OHOS builds).
  template <typename Manager>
  OfflineTtsMatchaModel(Manager *mgr, const OfflineTtsModelConfig &config);

  // Return a float32 tensor containing the mel
  // of shape (batch_size, mel_dim, num_frames)
  //
  // @param x     int64 token-ID tensor of shape (1, num_tokens); only
  //              batch_size == 1 is supported (larger batches abort).
  // @param sid   speaker ID; only forwarded if the model has a "sid" input.
  // @param speed speech speed; values > 0 and != 1 are mapped to
  //              length_scale = 1 / speed internally.
  Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const;

  const OfflineTtsMatchaModelMetaData &GetMetaData() const;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};
  36 +
  37 +} // namespace sherpa_onnx
  38 +
  39 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_H_
@@ -10,6 +10,7 @@ namespace sherpa_onnx { @@ -10,6 +10,7 @@ namespace sherpa_onnx {
10 10
11 void OfflineTtsModelConfig::Register(ParseOptions *po) { 11 void OfflineTtsModelConfig::Register(ParseOptions *po) {
12 vits.Register(po); 12 vits.Register(po);
  13 + matcha.Register(po);
13 14
14 po->Register("num-threads", &num_threads, 15 po->Register("num-threads", &num_threads,
15 "Number of threads to run the neural network"); 16 "Number of threads to run the neural network");
@@ -27,7 +28,11 @@ bool OfflineTtsModelConfig::Validate() const { @@ -27,7 +28,11 @@ bool OfflineTtsModelConfig::Validate() const {
27 return false; 28 return false;
28 } 29 }
29 30
  31 + if (!vits.model.empty()) {
30 return vits.Validate(); 32 return vits.Validate();
  33 + }
  34 +
  35 + return matcha.Validate();
31 } 36 }
32 37
33 std::string OfflineTtsModelConfig::ToString() const { 38 std::string OfflineTtsModelConfig::ToString() const {
@@ -35,6 +40,7 @@ std::string OfflineTtsModelConfig::ToString() const { @@ -35,6 +40,7 @@ std::string OfflineTtsModelConfig::ToString() const {
35 40
36 os << "OfflineTtsModelConfig("; 41 os << "OfflineTtsModelConfig(";
37 os << "vits=" << vits.ToString() << ", "; 42 os << "vits=" << vits.ToString() << ", ";
  43 + os << "matcha=" << matcha.ToString() << ", ";
38 os << "num_threads=" << num_threads << ", "; 44 os << "num_threads=" << num_threads << ", ";
39 os << "debug=" << (debug ? "True" : "False") << ", "; 45 os << "debug=" << (debug ? "True" : "False") << ", ";
40 os << "provider=\"" << provider << "\")"; 46 os << "provider=\"" << provider << "\")";
@@ -7,6 +7,7 @@ @@ -7,6 +7,7 @@
7 7
8 #include <string> 8 #include <string>
9 9
  10 +#include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h"
10 #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" 11 #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"
11 #include "sherpa-onnx/csrc/parse-options.h" 12 #include "sherpa-onnx/csrc/parse-options.h"
12 13
@@ -14,6 +15,7 @@ namespace sherpa_onnx { @@ -14,6 +15,7 @@ namespace sherpa_onnx {
14 15
15 struct OfflineTtsModelConfig { 16 struct OfflineTtsModelConfig {
16 OfflineTtsVitsModelConfig vits; 17 OfflineTtsVitsModelConfig vits;
  18 + OfflineTtsMatchaModelConfig matcha;
17 19
18 int32_t num_threads = 1; 20 int32_t num_threads = 1;
19 bool debug = false; 21 bool debug = false;
@@ -22,9 +24,11 @@ struct OfflineTtsModelConfig { @@ -22,9 +24,11 @@ struct OfflineTtsModelConfig {
22 OfflineTtsModelConfig() = default; 24 OfflineTtsModelConfig() = default;
23 25
24 OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, 26 OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits,
  27 + const OfflineTtsMatchaModelConfig &matcha,
25 int32_t num_threads, bool debug, 28 int32_t num_threads, bool debug,
26 const std::string &provider) 29 const std::string &provider)
27 : vits(vits), 30 : vits(vits),
  31 + matcha(matcha),
28 num_threads(num_threads), 32 num_threads(num_threads),
29 debug(debug), 33 debug(debug),
30 provider(provider) {} 34 provider(provider) {}
@@ -156,17 +156,31 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -156,17 +156,31 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
156 int32_t num_speakers = meta_data.num_speakers; 156 int32_t num_speakers = meta_data.num_speakers;
157 157
158 if (num_speakers == 0 && sid != 0) { 158 if (num_speakers == 0 && sid != 0) {
  159 +#if __OHOS__
  160 + SHERPA_ONNX_LOGE(
  161 + "This is a single-speaker model and supports only sid 0. Given sid: "
  162 + "%{public}d. sid is ignored",
  163 + static_cast<int32_t>(sid));
  164 +#else
159 SHERPA_ONNX_LOGE( 165 SHERPA_ONNX_LOGE(
160 "This is a single-speaker model and supports only sid 0. Given sid: " 166 "This is a single-speaker model and supports only sid 0. Given sid: "
161 "%d. sid is ignored", 167 "%d. sid is ignored",
162 static_cast<int32_t>(sid)); 168 static_cast<int32_t>(sid));
  169 +#endif
163 } 170 }
164 171
165 if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) { 172 if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) {
  173 +#if __OHOS__
  174 + SHERPA_ONNX_LOGE(
  175 + "This model contains only %{public}d speakers. sid should be in the "
  176 + "range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0",
  177 + num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
  178 +#else
166 SHERPA_ONNX_LOGE( 179 SHERPA_ONNX_LOGE(
167 "This model contains only %d speakers. sid should be in the range " 180 "This model contains only %d speakers. sid should be in the range "
168 "[%d, %d]. Given: %d. Use sid=0", 181 "[%d, %d]. Given: %d. Use sid=0",
169 num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid)); 182 num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
  183 +#endif
170 sid = 0; 184 sid = 0;
171 } 185 }
172 186
@@ -389,8 +403,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -389,8 +403,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
389 } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) { 403 } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) {
390 frontend_ = std::make_unique<JiebaLexicon>( 404 frontend_ = std::make_unique<JiebaLexicon>(
391 config_.model.vits.lexicon, config_.model.vits.tokens, 405 config_.model.vits.lexicon, config_.model.vits.tokens,
392 - config_.model.vits.dict_dir, model_->GetMetaData(),  
393 - config_.model.debug); 406 + config_.model.vits.dict_dir, config_.model.debug);
394 } else if ((meta_data.is_piper || meta_data.is_coqui || 407 } else if ((meta_data.is_piper || meta_data.is_coqui ||
395 meta_data.is_icefall) && 408 meta_data.is_icefall) &&
396 !config_.model.vits.data_dir.empty()) { 409 !config_.model.vits.data_dir.empty()) {
@@ -410,17 +423,6 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -410,17 +423,6 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
410 } 423 }
411 } 424 }
412 425
413 - std::vector<int64_t> AddBlank(const std::vector<int64_t> &x) const {  
414 - // we assume the blank ID is 0  
415 - std::vector<int64_t> buffer(x.size() * 2 + 1);  
416 - int32_t i = 1;  
417 - for (auto k : x) {  
418 - buffer[i] = k;  
419 - i += 2;  
420 - }  
421 - return buffer;  
422 - }  
423 -  
424 GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, 426 GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
425 const std::vector<std::vector<int64_t>> &tones, 427 const std::vector<std::vector<int64_t>> &tones,
426 int32_t sid, float speed) const { 428 int32_t sid, float speed) const {
@@ -51,25 +51,30 @@ bool OfflineTtsVitsModelConfig::Validate() const { @@ -51,25 +51,30 @@ bool OfflineTtsVitsModelConfig::Validate() const {
51 51
52 if (!data_dir.empty()) { 52 if (!data_dir.empty()) {
53 if (!FileExists(data_dir + "/phontab")) { 53 if (!FileExists(data_dir + "/phontab")) {
54 - SHERPA_ONNX_LOGE("'%s/phontab' does not exist. Skipping test", 54 + SHERPA_ONNX_LOGE(
  55 + "'%s/phontab' does not exist. Please check --vits-data-dir",
55 data_dir.c_str()); 56 data_dir.c_str());
56 return false; 57 return false;
57 } 58 }
58 59
59 if (!FileExists(data_dir + "/phonindex")) { 60 if (!FileExists(data_dir + "/phonindex")) {
60 - SHERPA_ONNX_LOGE("'%s/phonindex' does not exist. Skipping test", 61 + SHERPA_ONNX_LOGE(
  62 + "'%s/phonindex' does not exist. Please check --vits-data-dir",
61 data_dir.c_str()); 63 data_dir.c_str());
62 return false; 64 return false;
63 } 65 }
64 66
65 if (!FileExists(data_dir + "/phondata")) { 67 if (!FileExists(data_dir + "/phondata")) {
66 - SHERPA_ONNX_LOGE("'%s/phondata' does not exist. Skipping test", 68 + SHERPA_ONNX_LOGE(
  69 + "'%s/phondata' does not exist. Please check --vits-data-dir",
67 data_dir.c_str()); 70 data_dir.c_str());
68 return false; 71 return false;
69 } 72 }
70 73
71 if (!FileExists(data_dir + "/intonations")) { 74 if (!FileExists(data_dir + "/intonations")) {
72 - SHERPA_ONNX_LOGE("'%s/intonations' does not exist.", data_dir.c_str()); 75 + SHERPA_ONNX_LOGE(
  76 + "'%s/intonations' does not exist. Please check --vits-data-dir",
  77 + data_dir.c_str());
73 return false; 78 return false;
74 } 79 }
75 } 80 }
@@ -82,8 +87,8 @@ bool OfflineTtsVitsModelConfig::Validate() const { @@ -82,8 +87,8 @@ bool OfflineTtsVitsModelConfig::Validate() const {
82 87
83 for (const auto &f : required_files) { 88 for (const auto &f : required_files) {
84 if (!FileExists(dict_dir + "/" + f)) { 89 if (!FileExists(dict_dir + "/" + f)) {
85 - SHERPA_ONNX_LOGE("'%s/%s' does not exist.", dict_dir.c_str(),  
86 - f.c_str()); 90 + SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check vits-dict-dir",
  91 + dict_dir.c_str(), f.c_str());
87 return false; 92 return false;
88 } 93 }
89 } 94 }
@@ -174,7 +174,7 @@ class OfflineTtsVitsModel::Impl { @@ -174,7 +174,7 @@ class OfflineTtsVitsModel::Impl {
174 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); 174 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0);
175 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); 175 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0);
176 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_eos_bos, 176 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_eos_bos,
177 - "use_eos_bos", 0); 177 + "use_eos_bos", 1);
178 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.pad_id, "pad_id", 0); 178 SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.pad_id, "pad_id", 0);
179 179
180 std::string comment; 180 std::string comment;
@@ -362,7 +362,7 @@ Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/, @@ -362,7 +362,7 @@ Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/,
362 362
363 Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, Ort::Value tones, 363 Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, Ort::Value tones,
364 int64_t sid /*= 0*/, 364 int64_t sid /*= 0*/,
365 - float speed /*= 1.0*/) { 365 + float speed /*= 1.0*/) const {
366 return impl_->Run(std::move(x), std::move(tones), sid, speed); 366 return impl_->Run(std::move(x), std::move(tones), sid, speed);
367 } 367 }
368 368
@@ -37,7 +37,7 @@ class OfflineTtsVitsModel { @@ -37,7 +37,7 @@ class OfflineTtsVitsModel {
37 37
38 // This is for MeloTTS 38 // This is for MeloTTS
39 Ort::Value Run(Ort::Value x, Ort::Value tones, int64_t sid = 0, 39 Ort::Value Run(Ort::Value x, Ort::Value tones, int64_t sid = 0,
40 - float speed = 1.0); 40 + float speed = 1.0) const;
41 41
42 const OfflineTtsVitsModelMetaData &GetMetaData() const; 42 const OfflineTtsVitsModelMetaData &GetMetaData() const;
43 43
@@ -273,4 +273,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) { @@ -273,4 +273,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) {
273 return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider); 273 return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider);
274 } 274 }
275 275
// Build ONNX Runtime session options directly from a thread count and an
// execution-provider name, for callers whose config object does not expose
// the `num_threads`/`provider` members required by the templated overload.
Ort::SessionOptions GetSessionOptions(int32_t num_threads,
                                      const std::string &provider_str) {
  return GetSessionOptionsImpl(num_threads, provider_str);
}
  280 +
276 } // namespace sherpa_onnx 281 } // namespace sherpa_onnx
@@ -26,6 +26,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config); @@ -26,6 +26,9 @@ Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config);
26 Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config, 26 Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config,
27 const std::string &model_type); 27 const std::string &model_type);
28 28
  29 +Ort::SessionOptions GetSessionOptions(int32_t num_threads,
  30 + const std::string &provider_str);
  31 +
29 template <typename T> 32 template <typename T>
30 Ort::SessionOptions GetSessionOptions(const T &config) { 33 Ort::SessionOptions GetSessionOptions(const T &config) {
31 return GetSessionOptionsImpl(config.num_threads, config.provider); 34 return GetSessionOptionsImpl(config.num_threads, config.provider);
@@ -72,6 +72,10 @@ or details. @@ -72,6 +72,10 @@ or details.
72 exit(EXIT_FAILURE); 72 exit(EXIT_FAILURE);
73 } 73 }
74 74
  75 + if (config.model.debug) {
  76 + fprintf(stderr, "%s\n", config.model.ToString().c_str());
  77 + }
  78 +
75 if (!config.Validate()) { 79 if (!config.Validate()) {
76 fprintf(stderr, "Errors in config!\n"); 80 fprintf(stderr, "Errors in config!\n");
77 exit(EXIT_FAILURE); 81 exit(EXIT_FAILURE);
@@ -54,6 +54,7 @@ endif() @@ -54,6 +54,7 @@ endif()
54 54
55 if(SHERPA_ONNX_ENABLE_TTS) 55 if(SHERPA_ONNX_ENABLE_TTS)
56 list(APPEND srcs 56 list(APPEND srcs
  57 + offline-tts-matcha-model-config.cc
57 offline-tts-model-config.cc 58 offline-tts-model-config.cc
58 offline-tts-vits-model-config.cc 59 offline-tts-vits-model-config.cc
59 offline-tts.cc 60 offline-tts.cc
  1 +// sherpa-onnx/python/csrc/offline-tts-matcha-model-config.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h"
  6 +
  7 +#include <string>
  8 +
  9 +#include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
// Expose OfflineTtsMatchaModelConfig to Python on the given module.
void PybindOfflineTtsMatchaModelConfig(py::module *m) {
  using PyClass = OfflineTtsMatchaModelConfig;

  py::class_<PyClass>(*m, "OfflineTtsMatchaModelConfig")
      .def(py::init<>())
      // Keyword constructor. Only the trailing four arguments have
      // defaults, so acoustic_model, vocoder, lexicon, and tokens must
      // always be supplied by the caller.
      .def(py::init<const std::string &, const std::string &,
                    const std::string &, const std::string &,
                    const std::string &, const std::string &, float, float>(),
           py::arg("acoustic_model"), py::arg("vocoder"), py::arg("lexicon"),
           py::arg("tokens"), py::arg("data_dir") = "",
           py::arg("dict_dir") = "", py::arg("noise_scale") = 1.0,
           py::arg("length_scale") = 1.0)
      .def_readwrite("acoustic_model", &PyClass::acoustic_model)
      .def_readwrite("vocoder", &PyClass::vocoder)
      .def_readwrite("lexicon", &PyClass::lexicon)
      .def_readwrite("tokens", &PyClass::tokens)
      .def_readwrite("data_dir", &PyClass::data_dir)
      .def_readwrite("dict_dir", &PyClass::dict_dir)
      .def_readwrite("noise_scale", &PyClass::noise_scale)
      .def_readwrite("length_scale", &PyClass::length_scale)
      // str(config) and config.validate() delegate to the C++ methods.
      .def("__str__", &PyClass::ToString)
      .def("validate", &PyClass::Validate);
}
  36 +
  37 +} // namespace sherpa_onnx
// sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h
//
// Copyright (c)  2024  Xiaomi Corporation

#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_

#include "sherpa-onnx/python/csrc/sherpa-onnx.h"

namespace sherpa_onnx {

// Registers the OfflineTtsMatchaModelConfig Python binding on `m`.
void PybindOfflineTtsMatchaModelConfig(py::module *m);

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_MATCHA_MODEL_CONFIG_H_
@@ -7,22 +7,26 @@ @@ -7,22 +7,26 @@
7 #include <string> 7 #include <string>
8 8
9 #include "sherpa-onnx/csrc/offline-tts-model-config.h" 9 #include "sherpa-onnx/csrc/offline-tts-model-config.h"
  10 +#include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h"
10 #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" 11 #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h"
11 12
12 namespace sherpa_onnx { 13 namespace sherpa_onnx {
13 14
14 void PybindOfflineTtsModelConfig(py::module *m) { 15 void PybindOfflineTtsModelConfig(py::module *m) {
15 PybindOfflineTtsVitsModelConfig(m); 16 PybindOfflineTtsVitsModelConfig(m);
  17 + PybindOfflineTtsMatchaModelConfig(m);
16 18
17 using PyClass = OfflineTtsModelConfig; 19 using PyClass = OfflineTtsModelConfig;
18 20
19 py::class_<PyClass>(*m, "OfflineTtsModelConfig") 21 py::class_<PyClass>(*m, "OfflineTtsModelConfig")
20 .def(py::init<>()) 22 .def(py::init<>())
21 - .def(py::init<const OfflineTtsVitsModelConfig &, int32_t, bool, 23 + .def(py::init<const OfflineTtsVitsModelConfig &,
  24 + const OfflineTtsMatchaModelConfig &, int32_t, bool,
22 const std::string &>(), 25 const std::string &>(),
23 - py::arg("vits"), py::arg("num_threads") = 1, 26 + py::arg("vits"), py::arg("matcha"), py::arg("num_threads") = 1,
24 py::arg("debug") = false, py::arg("provider") = "cpu") 27 py::arg("debug") = false, py::arg("provider") = "cpu")
25 .def_readwrite("vits", &PyClass::vits) 28 .def_readwrite("vits", &PyClass::vits)
  29 + .def_readwrite("matcha", &PyClass::matcha)
26 .def_readwrite("num_threads", &PyClass::num_threads) 30 .def_readwrite("num_threads", &PyClass::num_threads)
27 .def_readwrite("debug", &PyClass::debug) 31 .def_readwrite("debug", &PyClass::debug)
28 .def_readwrite("provider", &PyClass::provider) 32 .def_readwrite("provider", &PyClass::provider)
@@ -20,6 +20,7 @@ from _sherpa_onnx import ( @@ -20,6 +20,7 @@ from _sherpa_onnx import (
20 OfflineStream, 20 OfflineStream,
21 OfflineTts, 21 OfflineTts,
22 OfflineTtsConfig, 22 OfflineTtsConfig,
  23 + OfflineTtsMatchaModelConfig,
23 OfflineTtsModelConfig, 24 OfflineTtsModelConfig,
24 OfflineTtsVitsModelConfig, 25 OfflineTtsVitsModelConfig,
25 OfflineZipformerAudioTaggingModelConfig, 26 OfflineZipformerAudioTaggingModelConfig,