Add Go API for Kokoro TTS models (#1722)

Fangjun Kuang · GitHub
Commit 2086f8c55bb7029dd61d97cc002d98eedbe3a443 2086f8c5 1 parent ad61ad6f
.github/workflows/test-go-package.yaml
.github/workflows/test-go.yaml
go-api-examples/non-streaming-tts/main.go
go-api-examples/non-streaming-tts/run-kokoro-en.sh
scripts/go/_internal/non-streaming-tts/run-kokoro-en.sh
scripts/go/sherpa_onnx.go
--- a/.github/workflows/test-go-package.yaml
查看文件 @2086f8c
+++ b/.github/workflows/test-go-package.yaml
查看文件 @2086f8c
@@ -209,6 +209,11 @@ jobs:
           go build
           ls -lh
 
+           echo "Test kokoro en"
+           ./run-kokoro-en.sh
+           rm -rf kokoro-en-*
+           ls -lh
+ 
           echo "Test matcha zh"
           ./run-matcha-zh.sh
           rm -rf matcha-icefall-*
--- a/.github/workflows/test-go.yaml
查看文件 @2086f8c
+++ b/.github/workflows/test-go.yaml
查看文件 @2086f8c
@@ -224,6 +224,11 @@ jobs:
           go build
           ls -lh
 
+           echo "Test kokoro en"
+           ./run-kokoro-en.sh
+           rm -rf kokoro-en-*
+           ls -lh
+ 
           echo "Test matcha zh"
           ./run-matcha-zh.sh
           rm -rf matcha-icefall-*
--- a/go-api-examples/non-streaming-tts/main.go
查看文件 @2086f8c
+++ b/go-api-examples/non-streaming-tts/main.go
查看文件 @2086f8c
@@ -33,6 +33,12 @@ func main() {
 	flag.Float32Var(&config.Model.Matcha.NoiseScale, "matcha-noise-scale", 0.667, "noise_scale for Matcha")
 	flag.Float32Var(&config.Model.Matcha.LengthScale, "matcha-length-scale", 1.0, "length_scale for Matcha. small -> faster in speech speed; large -> slower")
 
+ 	flag.StringVar(&config.Model.Kokoro.Model, "kokoro-model", "", "Path to the Kokoro ONNX model")
+ 	flag.StringVar(&config.Model.Kokoro.Voices, "kokoro-voices", "", "Path to voices.bin for Kokoro")
+ 	flag.StringVar(&config.Model.Kokoro.Tokens, "kokoro-tokens", "", "Path to tokens.txt for Kokoro")
+ 	flag.StringVar(&config.Model.Kokoro.DataDir, "kokoro-data-dir", "", "Path to espeak-ng-data for Kokoro")
+ 	flag.Float32Var(&config.Model.Kokoro.LengthScale, "kokoro-length-scale", 1.0, "length_scale for Kokoro. small -> faster in speech speed; large -> slower")
+ 
 	flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing")
 	flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
 	flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")
--- a/go-api-examples/non-streaming-tts/run-kokoro-en.sh 0 → 100755
查看文件 @2086f8c
+++ b/go-api-examples/non-streaming-tts/run-kokoro-en.sh 0 → 100755
查看文件 @2086f8c
+ #!/usr/bin/env bash
+ #
+ # Runs the non-streaming TTS Go example with the Kokoro English model.
+ 
+ # Abort on the first failing command (-e) and echo each command as it runs (-x).
+ set -ex
+ 
+ # Fetch and unpack the model archive only when the model file is not already
+ # present; the archive itself is removed after extraction.
+ if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
+   tar xf kokoro-en-v0_19.tar.bz2
+   rm kokoro-en-v0_19.tar.bz2
+ fi
+ 
+ # Resolve module dependencies and build the example in the current directory.
+ go mod tidy
+ go build
+ 
+ # Run the example on the sentence below, pointing it at the files from the
+ # unpacked model directory; the output path is given by --output-filename.
+ ./non-streaming-tts \
+   --kokoro-model=./kokoro-en-v0_19/model.onnx \
+   --kokoro-voices=./kokoro-en-v0_19/voices.bin \
+   --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
+   --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
+   --debug=1 \
+   --output-filename=./test-kokoro-en.wav \
+   "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
--- a/scripts/go/_internal/non-streaming-tts/run-kokoro-en.sh 0 → 120000
查看文件 @2086f8c
+++ b/scripts/go/_internal/non-streaming-tts/run-kokoro-en.sh 0 → 120000
查看文件 @2086f8c
+ ../../../../go-api-examples/non-streaming-tts/run-kokoro-en.sh
\ No newline at end of file
--- a/scripts/go/sherpa_onnx.go
查看文件 @2086f8c
+++ b/scripts/go/sherpa_onnx.go
查看文件 @2086f8c
@@ -682,9 +682,18 @@ type OfflineTtsMatchaModelConfig struct {
 	DictDir       string  // Path to dict directory for jieba (used only in Chinese tts)
 }
 
+ type OfflineTtsKokoroModelConfig struct {
+ 	Model       string  // Path to the Kokoro ONNX model file
+ 	Voices      string  // Path to the voices.bin file for Kokoro
+ 	Tokens      string  // Path to the tokens.txt file for Kokoro
+ 	DataDir     string  // Path to the espeak-ng-data directory
+ 	LengthScale float32 // Speech speed factor; use 1.0 in general. Smaller -> faster speech. Larger -> slower speech
+ }
+ 
 type OfflineTtsModelConfig struct {
 	Vits   OfflineTtsVitsModelConfig
 	Matcha OfflineTtsMatchaModelConfig
+ 	Kokoro OfflineTtsKokoroModelConfig
 
 	// Number of threads to use for neural network computation
 	NumThreads int
@@ -776,6 +785,21 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
 	c.model.matcha.dict_dir = C.CString(config.Model.Matcha.DictDir)
 	defer C.free(unsafe.Pointer(c.model.matcha.dict_dir))
 
+ 	// kokoro
+ 	c.model.kokoro.model = C.CString(config.Model.Kokoro.Model)
+ 	defer C.free(unsafe.Pointer(c.model.kokoro.model))
+ 
+ 	c.model.kokoro.voices = C.CString(config.Model.Kokoro.Voices)
+ 	defer C.free(unsafe.Pointer(c.model.kokoro.voices))
+ 
+ 	c.model.kokoro.tokens = C.CString(config.Model.Kokoro.Tokens)
+ 	defer C.free(unsafe.Pointer(c.model.kokoro.tokens))
+ 
+ 	c.model.kokoro.data_dir = C.CString(config.Model.Kokoro.DataDir)
+ 	defer C.free(unsafe.Pointer(c.model.kokoro.data_dir))
+ 
+ 	c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
+ 
 	c.model.num_threads = C.int(config.Model.NumThreads)
 	c.model.debug = C.int(config.Model.Debug)