愚者自愚
Committed by GitHub

Add Go implementation of the TTS generation callback (#2213)

  1 +module offline-tts-play
  2 +
  3 +go 1.17
  1 +package main
  2 +
  3 +import (
  4 + "log"
  5 +
  6 + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
  7 + flag "github.com/spf13/pflag"
  8 +)
  9 +
  10 +func main() {
  11 + log.SetFlags(log.LstdFlags | log.Lmicroseconds)
  12 +
  13 + config := sherpa.OfflineTtsConfig{}
  14 + sid := 0
  15 +
  16 + flag.StringVar(&config.Model.Vits.Model, "vits-model", "", "Path to the vits ONNX model")
  17 + flag.StringVar(&config.Model.Vits.Lexicon, "vits-lexicon", "", "Path to lexicon.txt")
  18 + flag.StringVar(&config.Model.Vits.Tokens, "vits-tokens", "", "Path to tokens.txt")
  19 + flag.StringVar(&config.Model.Vits.DataDir, "vits-data-dir", "", "Path to espeak-ng-data")
  20 + flag.StringVar(&config.Model.Vits.DictDir, "vits-dict-dir", "", "Path to dict for jieba")
  21 +
  22 + flag.Float32Var(&config.Model.Vits.NoiseScale, "vits-noise-scale", 0.667, "noise_scale for VITS")
  23 + flag.Float32Var(&config.Model.Vits.NoiseScaleW, "vits-noise-scale-w", 0.8, "noise_scale_w for VITS")
  24 + flag.Float32Var(&config.Model.Vits.LengthScale, "vits-length-scale", 1.0, "length_scale for VITS. small -> faster in speech speed; large -> slower")
  25 +
  26 + flag.StringVar(&config.Model.Matcha.AcousticModel, "matcha-acoustic-model", "", "Path to the matcha acoustic model")
  27 + flag.StringVar(&config.Model.Matcha.Vocoder, "matcha-vocoder", "", "Path to the matcha vocoder model")
  28 + flag.StringVar(&config.Model.Matcha.Lexicon, "matcha-lexicon", "", "Path to lexicon.txt")
  29 + flag.StringVar(&config.Model.Matcha.Tokens, "matcha-tokens", "", "Path to tokens.txt")
  30 + flag.StringVar(&config.Model.Matcha.DataDir, "matcha-data-dir", "", "Path to espeak-ng-data")
  31 + flag.StringVar(&config.Model.Matcha.DictDir, "matcha-dict-dir", "", "Path to dict for jieba")
  32 +
  33 + flag.Float32Var(&config.Model.Matcha.NoiseScale, "matcha-noise-scale", 0.667, "noise_scale for Matcha")
  34 + flag.Float32Var(&config.Model.Matcha.LengthScale, "matcha-length-scale", 1.0, "length_scale for Matcha. small -> faster in speech speed; large -> slower")
  35 +
  36 + flag.StringVar(&config.Model.Kokoro.Model, "kokoro-model", "", "Path to the Kokoro ONNX model")
  37 + flag.StringVar(&config.Model.Kokoro.Voices, "kokoro-voices", "", "Path to voices.bin for Kokoro")
  38 + flag.StringVar(&config.Model.Kokoro.Tokens, "kokoro-tokens", "", "Path to tokens.txt for Kokoro")
  39 + flag.StringVar(&config.Model.Kokoro.DataDir, "kokoro-data-dir", "", "Path to espeak-ng-data for Kokoro")
  40 + flag.StringVar(&config.Model.Kokoro.DictDir, "kokoro-dict-dir", "", "Path to dict for Kokoro")
  41 + flag.StringVar(&config.Model.Kokoro.Lexicon, "kokoro-lexicon", "", "Path to lexicon files for Kokoro")
  42 + flag.Float32Var(&config.Model.Kokoro.LengthScale, "kokoro-length-scale", 1.0, "length_scale for Kokoro. small -> faster in speech speed; large -> slower")
  43 +
  44 + flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing")
  45 + flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
  46 + flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")
  47 + flag.StringVar(&config.RuleFsts, "tts-rule-fsts", "", "Path to rule.fst")
  48 + flag.StringVar(&config.RuleFars, "tts-rule-fars", "", "Path to rule.far")
  49 + flag.IntVar(&config.MaxNumSentences, "tts-max-num-sentences", 1, "Batch size")
  50 +
  51 + flag.IntVar(&sid, "sid", 0, "Speaker ID. Used only for multi-speaker models")
  52 +
  53 + flag.Parse()
  54 +
  55 + if len(flag.Args()) != 1 {
  56 + log.Fatalf("Please provide the text to generate audios")
  57 + }
  58 +
  59 + text := flag.Arg(0)
  60 +
  61 + log.Println("Input text:", text)
  62 + log.Println("Speaker ID:", sid)
  63 +
  64 + log.Println("Initializing model (may take several seconds)")
  65 +
  66 + tts := sherpa.NewOfflineTts(&config)
  67 + defer sherpa.DeleteOfflineTts(tts)
  68 +
  69 + log.Println("Model created!")
  70 +
  71 + log.Println("Start generating!")
  72 +
  73 + tts.GenerateWithCallback(text, sid, 1.0, func(samples []float32) {
  74 + log.Printf("data len(%d)", len(samples))
  75 + })
  76 + log.Println("")
  77 +
  78 + log.Println("Done!")
  79 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  7 + tar xf kokoro-en-v0_19.tar.bz2
  8 + rm kokoro-en-v0_19.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +./offline-tts-play \
  15 + --kokoro-model=./kokoro-en-v0_19/model.onnx \
  16 + --kokoro-voices=./kokoro-en-v0_19/voices.bin \
  17 + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
  18 + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
  19 + --debug=1 \
  20 + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  7 + tar xf kokoro-multi-lang-v1_0.tar.bz2
  8 + rm kokoro-multi-lang-v1_0.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +./offline-tts-play \
  15 + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \
  16 + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \
  17 + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \
  18 + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \
  19 + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \
  20 + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \
  21 + --debug=1 \
  22 + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +# please visit
  6 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  7 +# to download more models
  9 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  10 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  11 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  12 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  13 +fi
  14 +
  15 +if [ ! -f ./vocos-22khz-univ.onnx ]; then
  16 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
  17 +fi
  18 +
  19 +go mod tidy
  20 +go build
  21 +
  22 +./offline-tts-play \
  23 + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  24 + --matcha-vocoder=./vocos-22khz-univ.onnx \
  25 + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  26 + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  27 + --debug=1 \
  28 + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
  29 +
  30 +
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +# please visit
  6 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  7 +# to download more models
  8 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  9 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  10 + tar xvf matcha-icefall-zh-baker.tar.bz2
  11 + rm matcha-icefall-zh-baker.tar.bz2
  12 +fi
  13 +
  14 +if [ ! -f ./vocos-22khz-univ.onnx ]; then
  15 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
  16 +fi
  17 +
  18 +go mod tidy
  19 +go build
  20 +
  21 +./offline-tts-play \
  22 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  23 + --matcha-vocoder=./vocos-22khz-univ.onnx \
  24 + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  25 + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
  26 + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
  27 + --debug=1 \
  28 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  29 + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
  30 +
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d vits-ljs ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-ljs.tar.bz2
  7 + tar xvf vits-ljs.tar.bz2
  8 + rm vits-ljs.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +./offline-tts-play \
  15 + --vits-model=./vits-ljs/vits-ljs.onnx \
  16 + --vits-lexicon=./vits-ljs/lexicon.txt \
  17 + --vits-tokens=./vits-ljs/tokens.txt \
  18 + --sid=0 \
  19 + --debug=1 \
  20 + "Liliana, the most beautiful and lovely assistant of our team!"
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d vits-piper-en_US-lessac-medium ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-lessac-medium.tar.bz2
  7 + tar xf vits-piper-en_US-lessac-medium.tar.bz2
  8 + rm vits-piper-en_US-lessac-medium.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +./offline-tts-play \
  15 + --vits-model=./vits-piper-en_US-lessac-medium/en_US-lessac-medium.onnx \
  16 + --vits-data-dir=./vits-piper-en_US-lessac-medium/espeak-ng-data \
  17 + --vits-tokens=./vits-piper-en_US-lessac-medium/tokens.txt \
  18 + 'liliana, the most beautiful and lovely assistant of our team!'
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d vits-vctk ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2
  7 + tar xvf vits-vctk.tar.bz2
  8 + rm vits-vctk.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +for sid in 0 10 108; do
  15 +./offline-tts-play \
  16 + --vits-model=./vits-vctk/vits-vctk.onnx \
  17 + --vits-lexicon=./vits-vctk/lexicon.txt \
  18 + --vits-tokens=./vits-vctk/tokens.txt \
  19 + --sid=0 \
  20 + --debug=1 \
  21 + 'Ask not what your country can do for you; ask what you can do for your country.'
  22 +done
  1 +#!/usr/bin/env bash
  2 +set -ex
  3 +
  4 +if [ ! -d vits-icefall-zh-aishell3 ]; then
  5 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
  6 + tar xvf vits-icefall-zh-aishell3.tar.bz2
  7 + rm vits-icefall-zh-aishell3.tar.bz2
  8 +fi
  9 +
  10 +go mod tidy
  11 +go build
  12 +
  13 +for sid in 10 33 99; do
  14 +./offline-tts-play \
  15 + --vits-model=./vits-icefall-zh-aishell3/model.onnx \
  16 + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
  17 + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
  18 + --sid=$sid \
  19 + --debug=1 \
  20 + "林美丽最美丽、最漂亮、最可爱!"
  21 +
  22 +./offline-tts-play \
  23 + --vits-model=./vits-icefall-zh-aishell3/model.onnx \
  24 + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
  25 + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
  26 + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
  27 + --sid=$sid \
  28 + --debug=1 \
  29 + "数字12345.6789怎么念"
  30 +
  31 +./offline-tts-play \
  32 + --vits-model=./vits-icefall-zh-aishell3/model.onnx \
  33 + --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
  34 + --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
  35 + --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
  36 + --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
  37 + --sid=$sid \
  38 + --debug=1 \
  39 + "万古长存长沙长大长白山长孙长安街"
  40 +done
  1 +*.wav
  2 +vits-ljs
  3 +vits-vctk
  4 +vits-zh-aishell3
  5 +offline-tts-play
  1 +module offline-tts-play
  2 +
  3 +go 1.17
  4 +
  5 +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
  6 +
  7 +require (
  8 + github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx v0.0.0-00010101000000-000000000000
  9 + github.com/spf13/pflag v1.0.6
  10 +)
  1 +../../../../go-api-examples/non-streaming-tts/main.go
  1 +../../../../go-api-examples/non-streaming-tts/run-kokoro-en.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-kokoro-zh-en.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-matcha-en.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-matcha-zh.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-vits-ljs.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-vits-piper-en_US-lessac-medium.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-vits-vctk.sh
  1 +../../../../go-api-examples/non-streaming-tts/run-vits-zh-aishell3.sh
@@ -41,8 +41,13 @@ package sherpa_onnx @@ -41,8 +41,13 @@ package sherpa_onnx
41 41
42 // #include <stdlib.h> 42 // #include <stdlib.h>
43 // #include "c-api.h" 43 // #include "c-api.h"
  44 +// extern int32_t _cgoGeneratedAudioCallback(float *samples,int32_t n,void *arg);
  45 +// extern int32_t _cgoGeneratedAudioProgressCallback(float *samples, int32_t n, float p, void *arg);
44 import "C" 46 import "C"
45 -import "unsafe" 47 +import (
  48 + "runtime/cgo"
  49 + "unsafe"
  50 +)
46 51
47 // Configuration for online/streaming transducer models 52 // Configuration for online/streaming transducer models
48 // 53 //
@@ -890,6 +895,36 @@ type OfflineTts struct { @@ -890,6 +895,36 @@ type OfflineTts struct {
890 impl *C.struct_SherpaOnnxOfflineTts 895 impl *C.struct_SherpaOnnxOfflineTts
891 } 896 }
892 897
  898 +type sherpaOnnxGeneratedAudioCallbackWithArg func(samples []float32)
  899 +
  900 +//export _cgoGeneratedAudioCallback
  901 +func _cgoGeneratedAudioCallback(samples *C.float, n C.int32_t, arg unsafe.Pointer) C.int32_t {
  902 + h := *(*cgo.Handle)(arg)
  903 + val := h.Value().(sherpaOnnxGeneratedAudioCallbackWithArg)
  904 + all := make([]float32, n)
  905 + arr := unsafe.Slice(samples, n)
  906 + for i := 0; i < int(n); i++ {
  907 + all[i] = float32(arr[i])
  908 + }
  909 + val(all)
  910 + return 1
  911 +}
  912 +
  913 +type sherpaOnnxGeneratedAudioProgressCallbackWithArg func(samples []float32, p float32)
  914 +
  915 +//export _cgoGeneratedAudioProgressCallback
  916 +func _cgoGeneratedAudioProgressCallback(samples *C.float, n C.int32_t, p C.float, arg unsafe.Pointer) C.int32_t {
  917 + h := *(*cgo.Handle)(arg)
  918 + val := h.Value().(sherpaOnnxGeneratedAudioProgressCallbackWithArg)
  919 + all := make([]float32, n)
  920 + arr := unsafe.Slice(samples, n)
  921 + for i := 0; i < int(n); i++ {
  922 + all[i] = float32(arr[i])
  923 + }
  924 + val(all, float32(p))
  925 + return 1
  926 +}
  927 +
893 // Free the internal pointer inside the tts to avoid memory leak. 928 // Free the internal pointer inside the tts to avoid memory leak.
894 func DeleteOfflineTts(tts *OfflineTts) { 929 func DeleteOfflineTts(tts *OfflineTts) {
895 C.SherpaOnnxDestroyOfflineTts(tts.impl) 930 C.SherpaOnnxDestroyOfflineTts(tts.impl)
@@ -1010,6 +1045,26 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA @@ -1010,6 +1045,26 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
1010 return ans 1045 return ans
1011 } 1046 }
1012 1047
  1048 +func (tts *OfflineTts) GenerateWithCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioCallbackWithArg) {
  1049 + s := C.CString(text)
  1050 + defer C.free(unsafe.Pointer(s))
  1051 +
  1052 + h := cgo.NewHandle(cb)
  1053 + defer h.Delete()
  1054 + audio := C.SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(tts.impl, s, C.int(sid), C.float(speed), C.SherpaOnnxGeneratedAudioCallbackWithArg(C._cgoGeneratedAudioCallback), unsafe.Pointer(&h))
  1055 + defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)
  1056 +}
  1057 +
  1058 +func (tts *OfflineTts) GenerateWithProgressCallback(text string, sid int, speed float32, cb sherpaOnnxGeneratedAudioProgressCallbackWithArg) {
  1059 + s := C.CString(text)
  1060 + defer C.free(unsafe.Pointer(s))
  1061 +
  1062 + h := cgo.NewHandle(cb)
  1063 + defer h.Delete()
  1064 + audio := C.SherpaOnnxOfflineTtsGenerateWithProgressCallbackWithArg(tts.impl, s, C.int(sid), C.float(speed), C.SherpaOnnxGeneratedAudioProgressCallbackWithArg(C._cgoGeneratedAudioProgressCallback), unsafe.Pointer(&h))
  1065 + defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)
  1066 +}
  1067 +
1013 func (audio *GeneratedAudio) Save(filename string) bool { 1068 func (audio *GeneratedAudio) Save(filename string) bool {
1014 s := C.CString(filename) 1069 s := C.CString(filename)
1015 defer C.free(unsafe.Pointer(s)) 1070 defer C.free(unsafe.Pointer(s))