Fangjun Kuang
Committed by GitHub

Add Go API for Kokoro TTS models (#1722)

@@ -209,6 +209,11 @@ jobs: @@ -209,6 +209,11 @@ jobs:
209 go build 209 go build
210 ls -lh 210 ls -lh
211 211
  212 + echo "Test kokoro en"
  213 + ./run-kokoro-en.sh
  214 + rm -rf kokoro-en-*
  215 + ls -lh
  216 +
212 echo "Test matcha zh" 217 echo "Test matcha zh"
213 ./run-matcha-zh.sh 218 ./run-matcha-zh.sh
214 rm -rf matcha-icefall-* 219 rm -rf matcha-icefall-*
@@ -224,6 +224,11 @@ jobs: @@ -224,6 +224,11 @@ jobs:
224 go build 224 go build
225 ls -lh 225 ls -lh
226 226
  227 + echo "Test kokoro en"
  228 + ./run-kokoro-en.sh
  229 + rm -rf kokoro-en-*
  230 + ls -lh
  231 +
227 echo "Test matcha zh" 232 echo "Test matcha zh"
228 ./run-matcha-zh.sh 233 ./run-matcha-zh.sh
229 rm -rf matcha-icefall-* 234 rm -rf matcha-icefall-*
@@ -33,6 +33,12 @@ func main() { @@ -33,6 +33,12 @@ func main() {
33 flag.Float32Var(&config.Model.Matcha.NoiseScale, "matcha-noise-scale", 0.667, "noise_scale for Matcha") 33 flag.Float32Var(&config.Model.Matcha.NoiseScale, "matcha-noise-scale", 0.667, "noise_scale for Matcha")
34 flag.Float32Var(&config.Model.Matcha.LengthScale, "matcha-length-scale", 1.0, "length_scale for Matcha. small -> faster in speech speed; large -> slower") 34 flag.Float32Var(&config.Model.Matcha.LengthScale, "matcha-length-scale", 1.0, "length_scale for Matcha. small -> faster in speech speed; large -> slower")
35 35
  36 + flag.StringVar(&config.Model.Kokoro.Model, "kokoro-model", "", "Path to the Kokoro ONNX model")
  37 + flag.StringVar(&config.Model.Kokoro.Voices, "kokoro-voices", "", "Path to voices.bin for Kokoro")
  38 + flag.StringVar(&config.Model.Kokoro.Tokens, "kokoro-tokens", "", "Path to tokens.txt for Kokoro")
  39 + flag.StringVar(&config.Model.Kokoro.DataDir, "kokoro-data-dir", "", "Path to espeak-ng-data for Kokoro")
  40 + flag.Float32Var(&config.Model.Kokoro.LengthScale, "kokoro-length-scale", 1.0, "length_scale for Kokoro. small -> faster in speech speed; large -> slower")
  41 +
36 flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing") 42 flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing")
37 flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message") 43 flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message")
38 flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use") 44 flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use")
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  7 + tar xf kokoro-en-v0_19.tar.bz2
  8 + rm kokoro-en-v0_19.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +./non-streaming-tts \
  15 + --kokoro-model=./kokoro-en-v0_19/model.onnx \
  16 + --kokoro-voices=./kokoro-en-v0_19/voices.bin \
  17 + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
  18 + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
  19 + --debug=1 \
  20 + --output-filename=./test-kokoro-en.wav \
  21 + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
  1 +../../../../go-api-examples/non-streaming-tts/run-kokoro-en.sh
@@ -682,9 +682,18 @@ type OfflineTtsMatchaModelConfig struct { @@ -682,9 +682,18 @@ type OfflineTtsMatchaModelConfig struct {
682 DictDir string // Path to dict directory for jieba (used only in Chinese tts) 682 DictDir string // Path to dict directory for jieba (used only in Chinese tts)
683 } 683 }
684 684
  685 +type OfflineTtsKokoroModelConfig struct {
  686 + Model string // Path to the model for kokoro
  687 + Voices string // Path to the voices.bin for kokoro
  688 + Tokens string // Path to tokens.txt for kokoro
  689 + DataDir string // Path to espeak-ng-data directory
  690 + LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
  691 +}
  692 +
685 type OfflineTtsModelConfig struct { 693 type OfflineTtsModelConfig struct {
686 Vits OfflineTtsVitsModelConfig 694 Vits OfflineTtsVitsModelConfig
687 Matcha OfflineTtsMatchaModelConfig 695 Matcha OfflineTtsMatchaModelConfig
  696 + Kokoro OfflineTtsKokoroModelConfig
688 697
689 // Number of threads to use for neural network computation 698 // Number of threads to use for neural network computation
690 NumThreads int 699 NumThreads int
@@ -776,6 +785,21 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { @@ -776,6 +785,21 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
776 c.model.matcha.dict_dir = C.CString(config.Model.Matcha.DictDir) 785 c.model.matcha.dict_dir = C.CString(config.Model.Matcha.DictDir)
777 defer C.free(unsafe.Pointer(c.model.matcha.dict_dir)) 786 defer C.free(unsafe.Pointer(c.model.matcha.dict_dir))
778 787
  788 + // kokoro
  789 + c.model.kokoro.model = C.CString(config.Model.Kokoro.Model)
  790 + defer C.free(unsafe.Pointer(c.model.kokoro.model))
  791 +
  792 + c.model.kokoro.voices = C.CString(config.Model.Kokoro.Voices)
  793 + defer C.free(unsafe.Pointer(c.model.kokoro.voices))
  794 +
  795 + c.model.kokoro.tokens = C.CString(config.Model.Kokoro.Tokens)
  796 + defer C.free(unsafe.Pointer(c.model.kokoro.tokens))
  797 +
  798 + c.model.kokoro.data_dir = C.CString(config.Model.Kokoro.DataDir)
  799 + defer C.free(unsafe.Pointer(c.model.kokoro.data_dir))
  800 +
  801 + c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
  802 +
779 c.model.num_threads = C.int(config.Model.NumThreads) 803 c.model.num_threads = C.int(config.Model.NumThreads)
780 c.model.debug = C.int(config.Model.Debug) 804 c.model.debug = C.int(config.Model.Debug)
781 805