Fangjun Kuang
Committed by GitHub

Add Go API for MatchaTTS models (#1685)

@@ -209,6 +209,15 @@ jobs: @@ -209,6 +209,15 @@ jobs:
209 go build 209 go build
210 ls -lh 210 ls -lh
211 211
  212 + echo "Test matcha zh"
  213 + ./run-matcha-zh.sh
  214 + rm -rf matcha-icefall-*
  215 +
  216 + echo "Test matcha en"
  217 + ./run-matcha-en.sh
  218 + rm -rf matcha-icefall-*
  219 + ls -lh *.wav
  220 +
212 echo "Test vits-ljs" 221 echo "Test vits-ljs"
213 ./run-vits-ljs.sh 222 ./run-vits-ljs.sh
214 rm -rf vits-ljs 223 rm -rf vits-ljs
@@ -246,6 +255,15 @@ jobs: @@ -246,6 +255,15 @@ jobs:
246 cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll . 255 cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll .
247 ls -lh 256 ls -lh
248 257
  258 + echo "Test matcha zh"
  259 + ./run-matcha-zh.sh
  260 + rm -rf matcha-icefall-*
  261 +
  262 + echo "Test matcha en"
  263 + ./run-matcha-en.sh
  264 + rm -rf matcha-icefall-*
  265 + ls -lh *.wav
  266 +
249 echo "Test vits-ljs" 267 echo "Test vits-ljs"
250 ./run-vits-ljs.sh 268 ./run-vits-ljs.sh
251 rm -rf vits-ljs 269 rm -rf vits-ljs
@@ -291,6 +309,15 @@ jobs: @@ -291,6 +309,15 @@ jobs:
291 cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll . 309 cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll .
292 ls -lh 310 ls -lh
293 311
  312 + echo "Test matcha zh"
  313 + ./run-matcha-zh.sh
  314 + rm -rf matcha-icefall-*
  315 +
  316 + echo "Test matcha en"
  317 + ./run-matcha-en.sh
  318 + rm -rf matcha-icefall-*
  319 + ls -lh *.wav
  320 +
294 echo "Test vits-ljs" 321 echo "Test vits-ljs"
295 ./run-vits-ljs.sh 322 ./run-vits-ljs.sh
296 rm -rf vits-ljs 323 rm -rf vits-ljs
@@ -226,6 +226,15 @@ jobs: @@ -226,6 +226,15 @@ jobs:
226 go build 226 go build
227 ls -lh 227 ls -lh
228 228
  229 + echo "Test matcha zh"
  230 + ./run-matcha-zh.sh
  231 + rm -rf matcha-icefall-*
  232 +
  233 + echo "Test matcha en"
  234 + ./run-matcha-en.sh
  235 + rm -rf matcha-icefall-*
  236 + ls -lh *.wav
  237 +
229 echo "Test vits-ljs" 238 echo "Test vits-ljs"
230 ./run-vits-ljs.sh 239 ./run-vits-ljs.sh
231 rm -rf vits-ljs 240 rm -rf vits-ljs
@@ -17,11 +17,22 @@ func main() { @@ -17,11 +17,22 @@ func main() {
17 flag.StringVar(&config.Model.Vits.Lexicon, "vits-lexicon", "", "Path to lexicon.txt") 17 flag.StringVar(&config.Model.Vits.Lexicon, "vits-lexicon", "", "Path to lexicon.txt")
18 flag.StringVar(&config.Model.Vits.Tokens, "vits-tokens", "", "Path to tokens.txt") 18 flag.StringVar(&config.Model.Vits.Tokens, "vits-tokens", "", "Path to tokens.txt")
19 flag.StringVar(&config.Model.Vits.DataDir, "vits-data-dir", "", "Path to espeak-ng-data") 19 flag.StringVar(&config.Model.Vits.DataDir, "vits-data-dir", "", "Path to espeak-ng-data")
  20 + flag.StringVar(&config.Model.Matcha.DictDir, "vits-dict-dir", "", "Path to dict for jieba")
20 21
21 flag.Float32Var(&config.Model.Vits.NoiseScale, "vits-noise-scale", 0.667, "noise_scale for VITS") 22 flag.Float32Var(&config.Model.Vits.NoiseScale, "vits-noise-scale", 0.667, "noise_scale for VITS")
22 flag.Float32Var(&config.Model.Vits.NoiseScaleW, "vits-noise-scale-w", 0.8, "noise_scale_w for VITS") 23 flag.Float32Var(&config.Model.Vits.NoiseScaleW, "vits-noise-scale-w", 0.8, "noise_scale_w for VITS")
23 flag.Float32Var(&config.Model.Vits.LengthScale, "vits-length-scale", 1.0, "length_scale for VITS. small -> faster in speech speed; large -> slower") 24 flag.Float32Var(&config.Model.Vits.LengthScale, "vits-length-scale", 1.0, "length_scale for VITS. small -> faster in speech speed; large -> slower")
24 25
  26 + flag.StringVar(&config.Model.Matcha.AcousticModel, "matcha-acoustic-model", "", "Path to the matcha acoustic model")
  27 + flag.StringVar(&config.Model.Matcha.Vocoder, "matcha-vocoder", "", "Path to the matcha vocoder model")
  28 + flag.StringVar(&config.Model.Matcha.Lexicon, "matcha-lexicon", "", "Path to lexicon.txt")
  29 + flag.StringVar(&config.Model.Matcha.Tokens, "matcha-tokens", "", "Path to tokens.txt")
  30 + flag.StringVar(&config.Model.Matcha.DataDir, "matcha-data-dir", "", "Path to espeak-ng-data")
  31 + flag.StringVar(&config.Model.Matcha.DictDir, "matcha-dict-dir", "", "Path to dict for jieba")
  32 +
  33 + flag.Float32Var(&config.Model.Matcha.NoiseScale, "matcha-noise-scale", 0.667, "noise_scale for Matcha")
  34 + flag.Float32Var(&config.Model.Matcha.LengthScale, "matcha-length-scale", 1.0, "length_scale for Matcha. small -> faster in speech speed; large -> slower")
  35 +
25 flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing") 36 flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing")
26 flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message") 37 flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
27 flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use") 38 flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +# please visit
  6 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  7 +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  8 +# to download more models
  9 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  10 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  11 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  12 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  13 +fi
  14 +
  15 +if [ ! -f ./hifigan_v2.onnx ]; then
  16 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  17 +fi
  18 +
  19 +go mod tidy
  20 +go build
  21 +
  22 +./non-streaming-tts \
  23 + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  24 + --matcha-vocoder=./hifigan_v2.onnx \
  25 + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  26 + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  27 + --debug=1 \
  28 + --output-filename=./test-matcha-en.wav \
  29 + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
  30 +
  31 +
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +# please visit
  6 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  7 +# to download more models
  8 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  9 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  10 + tar xvf matcha-icefall-zh-baker.tar.bz2
  11 + rm matcha-icefall-zh-baker.tar.bz2
  12 +fi
  13 +
  14 +if [ ! -f ./hifigan_v2.onnx ]; then
  15 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  16 +fi
  17 +
  18 +go mod tidy
  19 +go build
  20 +
  21 +./non-streaming-tts \
  22 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  23 + --matcha-vocoder=./hifigan_v2.onnx \
  24 + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  25 + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  26 + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  27 + --debug=1 \
  28 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  29 + --output-filename=./test-matcha-zh.wav \
  30 + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
  31 +
@@ -4,7 +4,7 @@ set -ex @@ -4,7 +4,7 @@ set -ex
4 4
5 if [ ! -d vits-piper-en_US-lessac-medium ]; then 5 if [ ! -d vits-piper-en_US-lessac-medium ]; then
6 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-lessac-medium.tar.bz2 6 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-lessac-medium.tar.bz2
7 - tar xvf vits-piper-en_US-lessac-medium.tar.bz2 7 + tar xf vits-piper-en_US-lessac-medium.tar.bz2
8 rm vits-piper-en_US-lessac-medium.tar.bz2 8 rm vits-piper-en_US-lessac-medium.tar.bz2
9 fi 9 fi
10 10
  1 +../../../../go-api-examples/non-streaming-tts/run-matcha-en.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-matcha-zh.sh
@@ -671,8 +671,20 @@ type OfflineTtsVitsModelConfig struct { @@ -671,8 +671,20 @@ type OfflineTtsVitsModelConfig struct {
671 DictDir string // Path to dict directory for jieba (used only in Chinese tts) 671 DictDir string // Path to dict directory for jieba (used only in Chinese tts)
672 } 672 }
673 673
  674 +type OfflineTtsMatchaModelConfig struct {
  675 + AcousticModel string // Path to the acoustic model for MatchaTTS
  676 + Vocoder string // Path to the vocoder model for MatchaTTS
  677 + Lexicon string // Path to lexicon.txt
  678 + Tokens string // Path to tokens.txt
  679 + DataDir string // Path to espeak-ng-data directory
  680 + NoiseScale float32 // noise scale for Matcha models. Please use 0.667 in general
  681 + LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
  682 + DictDir string // Path to dict directory for jieba (used only in Chinese tts)
  683 +}
  684 +
674 type OfflineTtsModelConfig struct { 685 type OfflineTtsModelConfig struct {
675 Vits OfflineTtsVitsModelConfig 686 Vits OfflineTtsVitsModelConfig
  687 + Matcha OfflineTtsMatchaModelConfig
676 688
677 // Number of threads to use for neural network computation 689 // Number of threads to use for neural network computation
678 NumThreads int 690 NumThreads int
@@ -722,6 +734,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { @@ -722,6 +734,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
722 734
723 c.max_num_sentences = C.int(config.MaxNumSentences) 735 c.max_num_sentences = C.int(config.MaxNumSentences)
724 736
  737 + // vits
725 c.model.vits.model = C.CString(config.Model.Vits.Model) 738 c.model.vits.model = C.CString(config.Model.Vits.Model)
726 defer C.free(unsafe.Pointer(c.model.vits.model)) 739 defer C.free(unsafe.Pointer(c.model.vits.model))
727 740
@@ -741,6 +754,28 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { @@ -741,6 +754,28 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
741 c.model.vits.dict_dir = C.CString(config.Model.Vits.DictDir) 754 c.model.vits.dict_dir = C.CString(config.Model.Vits.DictDir)
742 defer C.free(unsafe.Pointer(c.model.vits.dict_dir)) 755 defer C.free(unsafe.Pointer(c.model.vits.dict_dir))
743 756
  757 + // matcha
  758 + c.model.matcha.acoustic_model = C.CString(config.Model.Matcha.AcousticModel)
  759 + defer C.free(unsafe.Pointer(c.model.matcha.acoustic_model))
  760 +
  761 + c.model.matcha.vocoder = C.CString(config.Model.Matcha.Vocoder)
  762 + defer C.free(unsafe.Pointer(c.model.matcha.vocoder))
  763 +
  764 + c.model.matcha.lexicon = C.CString(config.Model.Matcha.Lexicon)
  765 + defer C.free(unsafe.Pointer(c.model.matcha.lexicon))
  766 +
  767 + c.model.matcha.tokens = C.CString(config.Model.Matcha.Tokens)
  768 + defer C.free(unsafe.Pointer(c.model.matcha.tokens))
  769 +
  770 + c.model.matcha.data_dir = C.CString(config.Model.Matcha.DataDir)
  771 + defer C.free(unsafe.Pointer(c.model.matcha.data_dir))
  772 +
  773 + c.model.matcha.noise_scale = C.float(config.Model.Matcha.NoiseScale)
  774 + c.model.matcha.length_scale = C.float(config.Model.Matcha.LengthScale)
  775 +
  776 + c.model.matcha.dict_dir = C.CString(config.Model.Matcha.DictDir)
  777 + defer C.free(unsafe.Pointer(c.model.matcha.dict_dir))
  778 +
744 c.model.num_threads = C.int(config.Model.NumThreads) 779 c.model.num_threads = C.int(config.Model.NumThreads)
745 c.model.debug = C.int(config.Model.Debug) 780 c.model.debug = C.int(config.Model.Debug)
746 781