Fangjun Kuang
Committed by GitHub

Add Swift API for Kokoro TTS models (#1721)

@@ -11,6 +11,10 @@ ls -lh @@ -11,6 +11,10 @@ ls -lh
11 ls -lh 11 ls -lh
12 rm -rf vits-piper-* 12 rm -rf vits-piper-*
13 13
  14 +./run-tts-kokoro-en.sh
  15 +ls -lh
  16 +rm -rf kokoro-en-*
  17 +
14 ./run-tts-matcha-zh.sh 18 ./run-tts-matcha-zh.sh
15 ls -lh 19 ls -lh
16 rm -rf matcha-icefall-* 20 rm -rf matcha-icefall-*
@@ -12,3 +12,4 @@ keyword-spotting-from-file @@ -12,3 +12,4 @@ keyword-spotting-from-file
12 add-punctuations 12 add-punctuations
13 tts-matcha-zh 13 tts-matcha-zh
14 tts-matcha-en 14 tts-matcha-en
  15 +tts-kokoro-en
@@ -736,7 +736,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig( @@ -736,7 +736,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
736 noise_scale: noiseScale, 736 noise_scale: noiseScale,
737 noise_scale_w: noiseScaleW, 737 noise_scale_w: noiseScaleW,
738 length_scale: lengthScale, 738 length_scale: lengthScale,
739 - dict_dir: toCPointer(dictDir)) 739 + dict_dir: toCPointer(dictDir)
  740 + )
740 } 741 }
741 742
742 func sherpaOnnxOfflineTtsMatchaModelConfig( 743 func sherpaOnnxOfflineTtsMatchaModelConfig(
@@ -757,12 +758,30 @@ func sherpaOnnxOfflineTtsMatchaModelConfig( @@ -757,12 +758,30 @@ func sherpaOnnxOfflineTtsMatchaModelConfig(
757 data_dir: toCPointer(dataDir), 758 data_dir: toCPointer(dataDir),
758 noise_scale: noiseScale, 759 noise_scale: noiseScale,
759 length_scale: lengthScale, 760 length_scale: lengthScale,
760 - dict_dir: toCPointer(dictDir)) 761 + dict_dir: toCPointer(dictDir)
  762 + )
  763 +}
  764 +
  765 +func sherpaOnnxOfflineTtsKokoroModelConfig(
  766 + model: String = "",
  767 + voices: String = "",
  768 + tokens: String = "",
  769 + dataDir: String = "",
  770 + lengthScale: Float = 1.0
  771 +) -> SherpaOnnxOfflineTtsKokoroModelConfig {
  772 + return SherpaOnnxOfflineTtsKokoroModelConfig(
  773 + model: toCPointer(model),
  774 + voices: toCPointer(voices),
  775 + tokens: toCPointer(tokens),
  776 + data_dir: toCPointer(dataDir),
  777 + length_scale: lengthScale
  778 + )
761 } 779 }
762 780
763 func sherpaOnnxOfflineTtsModelConfig( 781 func sherpaOnnxOfflineTtsModelConfig(
764 vits: SherpaOnnxOfflineTtsVitsModelConfig = sherpaOnnxOfflineTtsVitsModelConfig(), 782 vits: SherpaOnnxOfflineTtsVitsModelConfig = sherpaOnnxOfflineTtsVitsModelConfig(),
765 matcha: SherpaOnnxOfflineTtsMatchaModelConfig = sherpaOnnxOfflineTtsMatchaModelConfig(), 783 matcha: SherpaOnnxOfflineTtsMatchaModelConfig = sherpaOnnxOfflineTtsMatchaModelConfig(),
  784 + kokoro: SherpaOnnxOfflineTtsKokoroModelConfig = sherpaOnnxOfflineTtsKokoroModelConfig(),
766 numThreads: Int = 1, 785 numThreads: Int = 1,
767 debug: Int = 0, 786 debug: Int = 0,
768 provider: String = "cpu" 787 provider: String = "cpu"
@@ -772,7 +791,8 @@ func sherpaOnnxOfflineTtsModelConfig( @@ -772,7 +791,8 @@ func sherpaOnnxOfflineTtsModelConfig(
772 num_threads: Int32(numThreads), 791 num_threads: Int32(numThreads),
773 debug: Int32(debug), 792 debug: Int32(debug),
774 provider: toCPointer(provider), 793 provider: toCPointer(provider),
775 - matcha: matcha 794 + matcha: matcha,
  795 + kokoro: kokoro
776 ) 796 )
777 } 797 }
778 798
@@ -780,7 +800,7 @@ func sherpaOnnxOfflineTtsConfig( @@ -780,7 +800,7 @@ func sherpaOnnxOfflineTtsConfig(
780 model: SherpaOnnxOfflineTtsModelConfig, 800 model: SherpaOnnxOfflineTtsModelConfig,
781 ruleFsts: String = "", 801 ruleFsts: String = "",
782 ruleFars: String = "", 802 ruleFars: String = "",
783 - maxNumSentences: Int = 2 803 + maxNumSentences: Int = 1
784 ) -> SherpaOnnxOfflineTtsConfig { 804 ) -> SherpaOnnxOfflineTtsConfig {
785 return SherpaOnnxOfflineTtsConfig( 805 return SherpaOnnxOfflineTtsConfig(
786 model: model, 806 model: model,
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +# please visit
  11 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
  12 +# to download more models
  13 +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
  14 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  15 + tar xf kokoro-en-v0_19.tar.bz2
  16 + rm kokoro-en-v0_19.tar.bz2
  17 +fi
  18 +
  19 +if [ ! -e ./tts-kokoro-en ]; then
  20 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  21 + swiftc \
  22 + -lc++ \
  23 + -I ../build-swift-macos/install/include \
  24 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  25 + ./tts-kokoro-en.swift ./SherpaOnnx.swift \
  26 + -L ../build-swift-macos/install/lib/ \
  27 + -l sherpa-onnx \
  28 + -l onnxruntime \
  29 + -o tts-kokoro-en
  30 +
  31 + strip tts-kokoro-en
  32 +else
  33 + echo "./tts-kokoro-en exists - skip building"
  34 +fi
  35 +
  36 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  37 +./tts-kokoro-en
@@ -21,7 +21,7 @@ if [ ! -f ./hifigan_v2.onnx ]; then @@ -21,7 +21,7 @@ if [ ! -f ./hifigan_v2.onnx ]; then
21 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx 21 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
22 fi 22 fi
23 23
24 -if [ ! -e ./tts ]; then 24 +if [ ! -e ./tts-matcha-en ]; then
25 # Note: We use -lc++ to link against libc++ instead of libstdc++ 25 # Note: We use -lc++ to link against libc++ instead of libstdc++
26 swiftc \ 26 swiftc \
27 -lc++ \ 27 -lc++ \
@@ -20,7 +20,7 @@ if [ ! -f ./hifigan_v2.onnx ]; then @@ -20,7 +20,7 @@ if [ ! -f ./hifigan_v2.onnx ]; then
20 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx 20 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
21 fi 21 fi
22 22
23 -if [ ! -e ./tts ]; then 23 +if [ ! -e ./tts-matcha-zh ]; then
24 # Note: We use -lc++ to link against libc++ instead of libstdc++ 24 # Note: We use -lc++ to link against libc++ instead of libstdc++
25 swiftc \ 25 swiftc \
26 -lc++ \ 26 -lc++ \
@@ -15,7 +15,7 @@ if [ ! -d ./vits-piper-en_US-amy-low ]; then @@ -15,7 +15,7 @@ if [ ! -d ./vits-piper-en_US-amy-low ]; then
15 rm vits-piper-en_US-amy-low.tar.bz2 15 rm vits-piper-en_US-amy-low.tar.bz2
16 fi 16 fi
17 17
18 -if [ ! -e ./tts ]; then 18 +if [ ! -e ./tts-vits ]; then
19 # Note: We use -lc++ to link against libc++ instead of libstdc++ 19 # Note: We use -lc++ to link against libc++ instead of libstdc++
20 swiftc \ 20 swiftc \
21 -lc++ \ 21 -lc++ \
  1 +class MyClass {
  2 + func playSamples(samples: [Float]) {
  3 + print("Play \(samples.count) samples")
  4 + }
  5 +}
  6 +
  7 +func run() {
  8 + let model = "./kokoro-en-v0_19/model.onnx"
  9 + let voices = "./kokoro-en-v0_19/voices.bin"
  10 + let tokens = "./kokoro-en-v0_19/tokens.txt"
  11 + let dataDir = "./kokoro-en-v0_19/espeak-ng-data"
  12 + let kokoro = sherpaOnnxOfflineTtsKokoroModelConfig(
  13 + model: model,
  14 + voices: voices,
  15 + tokens: tokens,
  16 + dataDir: dataDir
  17 + )
  18 + let modelConfig = sherpaOnnxOfflineTtsModelConfig(kokoro: kokoro, debug: 0)
  19 + var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig)
  20 +
  21 + let myClass = MyClass()
  22 +
  23 + // We use Unretained here so myClass must be kept alive as the callback is invoked
  24 + //
  25 + // See also
  26 + // https://medium.com/codex/swift-c-callback-interoperability-6d57da6c8ee6
  27 + let arg = Unmanaged<MyClass>.passUnretained(myClass).toOpaque()
  28 +
  29 + let callback: TtsCallbackWithArg = { samples, n, arg in
  30 + let o = Unmanaged<MyClass>.fromOpaque(arg!).takeUnretainedValue()
  31 + var savedSamples: [Float] = []
  32 + for index in 0..<n {
  33 + savedSamples.append(samples![Int(index)])
  34 + }
  35 +
  36 + o.playSamples(samples: savedSamples)
  37 +
  38 + // return 1 so that it continues generating
  39 + return 1
  40 + }
  41 +
  42 + let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
  43 +
  44 + let text =
  45 + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
  46 + let sid = 0
  47 + let speed: Float = 1.0
  48 +
  49 + let audio = tts.generateWithCallbackWithArg(
  50 + text: text, callback: callback, arg: arg, sid: sid, speed: speed)
  51 + let filename = "test-kokoro-en.wav"
  52 + let ok = audio.save(filename: filename)
  53 + if ok == 1 {
  54 + print("\nSaved to:\(filename)")
  55 + } else {
  56 + print("Failed to save to \(filename)")
  57 + }
  58 +}
  59 +
  60 +@main
  61 +struct App {
  62 + static func main() {
  63 + run()
  64 + }
  65 +}