愚者自愚
Committed by GitHub

Add Go implementation of the TTS generation callback (#2213)

module offline-tts-play
go 1.17
... ...
package main
import (
"log"
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
flag "github.com/spf13/pflag"
)
// main parses the command-line flags into a sherpa.OfflineTtsConfig, creates
// an offline TTS engine from it, and synthesizes the single positional
// argument as text. Generated samples are delivered through a callback, which
// here only logs how many samples each invocation carried.
//
// Exactly one positional argument (the text) is required; otherwise the
// program exits via log.Fatalf.
func main() {
	log.SetFlags(log.LstdFlags | log.Lmicroseconds)

	config := sherpa.OfflineTtsConfig{}
	sid := 0

	// VITS model options.
	flag.StringVar(&config.Model.Vits.Model, "vits-model", "", "Path to the vits ONNX model")
	flag.StringVar(&config.Model.Vits.Lexicon, "vits-lexicon", "", "Path to lexicon.txt")
	flag.StringVar(&config.Model.Vits.Tokens, "vits-tokens", "", "Path to tokens.txt")
	flag.StringVar(&config.Model.Vits.DataDir, "vits-data-dir", "", "Path to espeak-ng-data")
	flag.StringVar(&config.Model.Vits.DictDir, "vits-dict-dir", "", "Path to dict for jieba")
	flag.Float32Var(&config.Model.Vits.NoiseScale, "vits-noise-scale", 0.667, "noise_scale for VITS")
	flag.Float32Var(&config.Model.Vits.NoiseScaleW, "vits-noise-scale-w", 0.8, "noise_scale_w for VITS")
	flag.Float32Var(&config.Model.Vits.LengthScale, "vits-length-scale", 1.0, "length_scale for VITS. small -> faster in speech speed; large -> slower")

	// Matcha model options.
	flag.StringVar(&config.Model.Matcha.AcousticModel, "matcha-acoustic-model", "", "Path to the matcha acoustic model")
	flag.StringVar(&config.Model.Matcha.Vocoder, "matcha-vocoder", "", "Path to the matcha vocoder model")
	flag.StringVar(&config.Model.Matcha.Lexicon, "matcha-lexicon", "", "Path to lexicon.txt")
	flag.StringVar(&config.Model.Matcha.Tokens, "matcha-tokens", "", "Path to tokens.txt")
	flag.StringVar(&config.Model.Matcha.DataDir, "matcha-data-dir", "", "Path to espeak-ng-data")
	flag.StringVar(&config.Model.Matcha.DictDir, "matcha-dict-dir", "", "Path to dict for jieba")
	flag.Float32Var(&config.Model.Matcha.NoiseScale, "matcha-noise-scale", 0.667, "noise_scale for Matcha")
	flag.Float32Var(&config.Model.Matcha.LengthScale, "matcha-length-scale", 1.0, "length_scale for Matcha. small -> faster in speech speed; large -> slower")

	// Kokoro model options.
	flag.StringVar(&config.Model.Kokoro.Model, "kokoro-model", "", "Path to the Kokoro ONNX model")
	flag.StringVar(&config.Model.Kokoro.Voices, "kokoro-voices", "", "Path to voices.bin for Kokoro")
	flag.StringVar(&config.Model.Kokoro.Tokens, "kokoro-tokens", "", "Path to tokens.txt for Kokoro")
	flag.StringVar(&config.Model.Kokoro.DataDir, "kokoro-data-dir", "", "Path to espeak-ng-data for Kokoro")
	flag.StringVar(&config.Model.Kokoro.DictDir, "kokoro-dict-dir", "", "Path to dict for Kokoro")
	flag.StringVar(&config.Model.Kokoro.Lexicon, "kokoro-lexicon", "", "Path to lexicon files for Kokoro")
	flag.Float32Var(&config.Model.Kokoro.LengthScale, "kokoro-length-scale", 1.0, "length_scale for Kokoro. small -> faster in speech speed; large -> slower")

	// Options shared by all model types.
	flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing")
	flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
	flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")
	flag.StringVar(&config.RuleFsts, "tts-rule-fsts", "", "Path to rule.fst")
	flag.StringVar(&config.RuleFars, "tts-rule-fars", "", "Path to rule.far")
	flag.IntVar(&config.MaxNumSentences, "tts-max-num-sentences", 1, "Batch size")

	flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models")

	flag.Parse()

	if len(flag.Args()) != 1 {
		log.Fatalf("Please provide the text to generate audios")
	}

	text := flag.Arg(0)

	log.Println("Input text:", text)
	log.Println("Speaker ID:", sid)

	log.Println("Initializing model (may take several seconds)")

	tts := sherpa.NewOfflineTts(&config)
	// Guard before registering the deferred delete: DeleteOfflineTts reads
	// tts.impl, so a nil tts would turn a configuration error into a nil
	// pointer panic.
	// NOTE(review): assumes NewOfflineTts returns nil when the engine cannot
	// be created — confirm against the sherpa_onnx package.
	if tts == nil {
		log.Fatalf("Failed to create the offline TTS. Please check your model paths and flags")
	}
	defer sherpa.DeleteOfflineTts(tts)

	log.Println("Model created!")

	log.Println("Start generating!")

	// Speed is fixed at 1.0; each callback invocation only reports the
	// number of samples it received.
	tts.GenerateWithCallback(text, sid, 1.0, func(samples []float32) {
		log.Printf("data len(%d)", len(samples))
	})

	log.Println("")
	log.Println("Done!")
}
... ...
#!/usr/bin/env bash
# Builds the example and synthesizes one English sentence with the
# kokoro-en-v0_19 model, downloading the model first when it is missing.

set -ex

model_dir=./kokoro-en-v0_19
archive=kokoro-en-v0_19.tar.bz2

# Fetch and unpack the model only when it is not already on disk.
if [ ! -f "$model_dir/model.onnx" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$archive"
  tar xf "$archive"
  rm "$archive"
fi

go mod tidy
go build

tts_args=(
  --kokoro-model="$model_dir/model.onnx"
  --kokoro-voices="$model_dir/voices.bin"
  --kokoro-tokens="$model_dir/tokens.txt"
  --kokoro-data-dir="$model_dir/espeak-ng-data"
  --debug=1
)

./offline-tts-play "${tts_args[@]}" \
  "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
... ...
#!/usr/bin/env bash
# Builds the example and synthesizes a mixed Chinese/English sentence with the
# kokoro-multi-lang-v1_0 model, downloading the model first when it is missing.

set -ex

model_dir=./kokoro-multi-lang-v1_0
archive=kokoro-multi-lang-v1_0.tar.bz2

# Fetch and unpack the model only when it is not already on disk.
if [ ! -f "$model_dir/model.onnx" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$archive"
  tar xf "$archive"
  rm "$archive"
fi

go mod tidy
go build

# The lexicon flag takes a comma-separated list: English first, then Chinese.
lexicons="$model_dir/lexicon-us-en.txt,$model_dir/lexicon-zh.txt"

tts_args=(
  --kokoro-model="$model_dir/model.onnx"
  --kokoro-voices="$model_dir/voices.bin"
  --kokoro-tokens="$model_dir/tokens.txt"
  --kokoro-data-dir="$model_dir/espeak-ng-data"
  --kokoro-dict-dir="$model_dir/dict"
  --kokoro-lexicon="$lexicons"
  --debug=1
)

./offline-tts-play "${tts_args[@]}" \
  "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
... ...
#!/usr/bin/env bash
# Builds the example and synthesizes one English sentence with the
# matcha-icefall-en_US-ljspeech acoustic model and the vocos vocoder,
# downloading either file first when it is missing.
#
# More models are listed at
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker

set -ex

releases=https://github.com/k2-fsa/sherpa-onnx/releases/download
model_dir=./matcha-icefall-en_US-ljspeech
archive=matcha-icefall-en_US-ljspeech.tar.bz2
vocoder=./vocos-22khz-univ.onnx

# Acoustic model: fetch and unpack only when absent.
if [ ! -f "$model_dir/model-steps-3.onnx" ]; then
  curl -SL -O "$releases/tts-models/$archive"
  tar xf "$archive"
  rm "$archive"
fi

# Vocoder: a single ONNX file, fetched only when absent.
if [ ! -f "$vocoder" ]; then
  curl -SL -O "$releases/vocoder-models/vocos-22khz-univ.onnx"
fi

go mod tidy
go build

tts_args=(
  --matcha-acoustic-model="$model_dir/model-steps-3.onnx"
  --matcha-vocoder="$vocoder"
  --matcha-tokens="$model_dir/tokens.txt"
  --matcha-data-dir="$model_dir/espeak-ng-data"
  --debug=1
)

./offline-tts-play "${tts_args[@]}" \
  "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
... ...
#!/usr/bin/env bash
# Builds the example and synthesizes a Chinese sentence with the
# matcha-icefall-zh-baker acoustic model and the vocos vocoder,
# downloading either file first when it is missing.
#
# More models are listed at
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker

set -ex

releases=https://github.com/k2-fsa/sherpa-onnx/releases/download
model_dir=./matcha-icefall-zh-baker
archive=matcha-icefall-zh-baker.tar.bz2
vocoder=./vocos-22khz-univ.onnx

# Acoustic model: fetch and unpack only when absent.
if [ ! -f "$model_dir/model-steps-3.onnx" ]; then
  curl -SL -O "$releases/tts-models/$archive"
  tar xvf "$archive"
  rm "$archive"
fi

# Vocoder: a single ONNX file, fetched only when absent.
if [ ! -f "$vocoder" ]; then
  curl -SL -O "$releases/vocoder-models/vocos-22khz-univ.onnx"
fi

go mod tidy
go build

# Comma-separated rule FSTs passed to --tts-rule-fsts.
rule_fsts="$model_dir/phone.fst,$model_dir/date.fst,$model_dir/number.fst"

tts_args=(
  --matcha-acoustic-model="$model_dir/model-steps-3.onnx"
  --matcha-vocoder="$vocoder"
  --matcha-lexicon="$model_dir/lexicon.txt"
  --matcha-tokens="$model_dir/tokens.txt"
  --matcha-dict-dir="$model_dir/dict"
  --debug=1
  --tts-rule-fsts="$rule_fsts"
)

./offline-tts-play "${tts_args[@]}" \
  "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
... ...
#!/usr/bin/env bash
# Builds the example and synthesizes one English sentence with the vits-ljs
# model, downloading the model first when its directory is missing.

set -ex

model_name=vits-ljs
model_dir=./$model_name
archive=$model_name.tar.bz2

# Fetch and unpack the model only when its directory does not exist yet.
if [ ! -d "$model_name" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$archive"
  tar xvf "$archive"
  rm "$archive"
fi

go mod tidy
go build

tts_args=(
  --vits-model="$model_dir/vits-ljs.onnx"
  --vits-lexicon="$model_dir/lexicon.txt"
  --vits-tokens="$model_dir/tokens.txt"
  --sid=0
  --debug=1
)

./offline-tts-play "${tts_args[@]}" \
  "Liliana, the most beautiful and lovely assistant of our team!"
... ...
#!/usr/bin/env bash
# Builds the example and synthesizes one English sentence with the
# vits-piper-en_US-lessac-medium model, downloading the model first when its
# directory is missing.

set -ex

model_name=vits-piper-en_US-lessac-medium
model_dir=./$model_name
archive=$model_name.tar.bz2

# Fetch and unpack the model only when its directory does not exist yet.
if [ ! -d "$model_name" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$archive"
  tar xf "$archive"
  rm "$archive"
fi

go mod tidy
go build

tts_args=(
  --vits-model="$model_dir/en_US-lessac-medium.onnx"
  --vits-data-dir="$model_dir/espeak-ng-data"
  --vits-tokens="$model_dir/tokens.txt"
)

./offline-tts-play "${tts_args[@]}" \
  'liliana, the most beautiful and lovely assistant of our team!'
... ...
#!/usr/bin/env bash
# Builds the example and synthesizes the same English sentence once per
# speaker ID with the multi-speaker vits-vctk model, downloading the model
# first when its directory is missing.

set -ex

# Fetch and unpack the model only when its directory does not exist yet.
if [ ! -d vits-vctk ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2
  tar xvf vits-vctk.tar.bz2
  rm vits-vctk.tar.bz2
fi

go mod tidy
go build

# Pass the loop variable as the speaker ID; a hard-coded --sid=0 here would
# make every iteration synthesize with speaker 0.
for sid in 0 10 108; do
  ./offline-tts-play \
    --vits-model=./vits-vctk/vits-vctk.onnx \
    --vits-lexicon=./vits-vctk/lexicon.txt \
    --vits-tokens=./vits-vctk/tokens.txt \
    --sid="$sid" \
    --debug=1 \
    'Ask not what your country can do for you; ask what you can do for your country.'
done
... ...
#!/usr/bin/env bash
# Builds the example and, for each speaker ID, synthesizes three Chinese
# sentences with the vits-icefall-zh-aishell3 model: one plain, one with rule
# FSTs, and one with rule FSTs plus a rule FAR. The model is downloaded first
# when its directory is missing.

set -ex

model_name=vits-icefall-zh-aishell3
model_dir=./$model_name
archive=$model_name.tar.bz2

# Fetch and unpack the model only when its directory does not exist yet.
if [ ! -d "$model_name" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$archive"
  tar xvf "$archive"
  rm "$archive"
fi

go mod tidy
go build

# Flags shared by every invocation below.
model_args=(
  --vits-model="$model_dir/model.onnx"
  --vits-lexicon="$model_dir/lexicon.txt"
  --vits-tokens="$model_dir/tokens.txt"
)

# Comma-separated rule FSTs passed to --tts-rule-fsts.
rule_fsts="$model_dir/phone.fst,$model_dir/date.fst,$model_dir/number.fst"
rule_fars="$model_dir/rule.far"

for sid in 10 33 99; do
  # Plain text, no rewrite rules.
  ./offline-tts-play "${model_args[@]}" \
    --sid="$sid" \
    --debug=1 \
    "林美丽最美丽、最漂亮、最可爱!"

  # Text containing a number, with rule FSTs.
  ./offline-tts-play "${model_args[@]}" \
    --tts-rule-fsts="$rule_fsts" \
    --sid="$sid" \
    --debug=1 \
    "数字12345.6789怎么念"

  # Rule FSTs plus a rule FAR.
  ./offline-tts-play "${model_args[@]}" \
    --tts-rule-fsts="$rule_fsts" \
    --tts-rule-fars="$rule_fars" \
    --sid="$sid" \
    --debug=1 \
    "万古长存长沙长大长白山长孙长安街"
done
... ...
*.wav
vits-ljs
vits-vctk
vits-zh-aishell3
offline-tts-play
... ...
module offline-tts-play
go 1.17
replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
require (
github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx v0.0.0-00010101000000-000000000000
github.com/spf13/pflag v1.0.6
)
... ...
../../../../go-api-examples/non-streaming-tts/main.go
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-kokoro-en.sh
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-kokoro-zh-en.sh
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-matcha-en.sh
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-matcha-zh.sh
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-vits-ljs.sh
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-vits-piper-en_US-lessac-medium.sh
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-vits-vctk.sh
\ No newline at end of file
... ...
../../../../go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh
\ No newline at end of file
... ...
... ... @@ -41,8 +41,13 @@ package sherpa_onnx
// #include <stdlib.h>
// #include "c-api.h"
// extern int32_t _cgoGeneratedAudioCallback(float *samples,int32_t n,void *arg);
// extern int32_t _cgoGeneratedAudioProgressCallback(float *samples, int32_t n, float p, void *arg);
import "C"
import "unsafe"
import (
"runtime/cgo"
"unsafe"
)
// Configuration for online/streaming transducer models
//
... ... @@ -890,6 +895,36 @@ type OfflineTts struct {
impl *C.struct_SherpaOnnxOfflineTts
}
// sherpaOnnxGeneratedAudioCallbackWithArg is the Go-side callback signature
// accepted by GenerateWithCallback. Each invocation receives a Go slice
// holding a copy of the samples handed over by the C library.
type sherpaOnnxGeneratedAudioCallbackWithArg func(samples []float32)
// _cgoGeneratedAudioCallback is the exported trampoline that C invokes with
// generated samples. arg must point to a cgo.Handle whose value is a
// sherpaOnnxGeneratedAudioCallbackWithArg. The n samples are copied into a
// Go-allocated slice, which is then passed to that callback.
//
// It always returns 1.
// NOTE(review): 1 presumably tells the C side to keep generating — confirm
// against c-api.h.
//
//export _cgoGeneratedAudioCallback
func _cgoGeneratedAudioCallback(samples *C.float, n C.int32_t, arg unsafe.Pointer) C.int32_t {
	// Recover the Go callback that was wrapped in a handle by the caller.
	handle := *(*cgo.Handle)(arg)
	callback := handle.Value().(sherpaOnnxGeneratedAudioCallbackWithArg)

	// Copy the samples so the callback receives a Go-allocated []float32.
	copied := make([]float32, n)
	for i, v := range unsafe.Slice(samples, n) {
		copied[i] = float32(v)
	}

	callback(copied)
	return 1
}
// sherpaOnnxGeneratedAudioProgressCallbackWithArg is the Go-side callback
// signature accepted by GenerateWithProgressCallback. Each invocation
// receives a Go slice holding a copy of the samples handed over by the C
// library, plus the float value p forwarded from C.
// NOTE(review): p is presumably the generation progress — confirm its range
// against c-api.h.
type sherpaOnnxGeneratedAudioProgressCallbackWithArg func(samples []float32, p float32)
// _cgoGeneratedAudioProgressCallback is the exported trampoline that C
// invokes with generated samples and a float value p. arg must point to a
// cgo.Handle whose value is a sherpaOnnxGeneratedAudioProgressCallbackWithArg.
// The n samples are copied into a Go-allocated slice, which is then passed to
// that callback together with p converted to float32.
//
// It always returns 1.
// NOTE(review): 1 presumably tells the C side to keep generating — confirm
// against c-api.h.
//
//export _cgoGeneratedAudioProgressCallback
func _cgoGeneratedAudioProgressCallback(samples *C.float, n C.int32_t, p C.float, arg unsafe.Pointer) C.int32_t {
	// Recover the Go callback that was wrapped in a handle by the caller.
	handle := *(*cgo.Handle)(arg)
	callback := handle.Value().(sherpaOnnxGeneratedAudioProgressCallbackWithArg)

	// Copy the samples so the callback receives a Go-allocated []float32.
	copied := make([]float32, n)
	for i, v := range unsafe.Slice(samples, n) {
		copied[i] = float32(v)
	}

	callback(copied, float32(p))
	return 1
}
// Free the internal pointer inside the tts to avoid memory leak.
func DeleteOfflineTts(tts *OfflineTts) {
C.SherpaOnnxDestroyOfflineTts(tts.impl)
... ... @@ -1010,6 +1045,26 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
return ans
}
// GenerateWithCallback synthesizes text with the given speaker ID and speed,
// delivering generated samples to cb through the C library's callback
// mechanism. The audio object returned by the C call is destroyed before this
// method returns, so cb is the only way to observe the samples.
func (tts *OfflineTts) GenerateWithCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioCallbackWithArg) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))

	// The Go callback travels through C as a pointer to a cgo.Handle; the
	// handle is released once generation has finished.
	handle := cgo.NewHandle(cb)
	defer handle.Delete()

	result := C.SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(
		tts.impl,
		cText,
		C.int(sid),
		C.float(speed),
		C.SherpaOnnxGeneratedAudioCallbackWithArg(C._cgoGeneratedAudioCallback),
		unsafe.Pointer(&handle),
	)

	// Nothing is returned to the caller, so free the C-side audio right away.
	C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(result)
}
// GenerateWithProgressCallback synthesizes text with the given speaker ID and
// speed, delivering generated samples and the accompanying float value to cb
// through the C library's callback mechanism. The audio object returned by
// the C call is destroyed before this method returns, so cb is the only way
// to observe the samples.
func (tts *OfflineTts) GenerateWithProgressCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioProgressCallbackWithArg) {
	cText := C.CString(text)
	defer C.free(unsafe.Pointer(cText))

	// The Go callback travels through C as a pointer to a cgo.Handle; the
	// handle is released once generation has finished.
	handle := cgo.NewHandle(cb)
	defer handle.Delete()

	result := C.SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg(
		tts.impl,
		cText,
		C.int(sid),
		C.float(speed),
		C.SherpaOnnxGeneratedAudioProgressCallbackWithArg(C._cgoGeneratedAudioProgressCallback),
		unsafe.Pointer(&handle),
	)

	// Nothing is returned to the caller, so free the C-side audio right away.
	C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(result)
}
func (audio *GeneratedAudio) Save(filename string) bool {
s := C.CString(filename)
defer C.free(unsafe.Pointer(s))
... ...