Fangjun Kuang
Committed by GitHub

Add Swift API for Kokoro TTS 1.0 (#1803)

@@ -11,6 +11,10 @@ ls -lh @@ -11,6 +11,10 @@ ls -lh
11 ls -lh 11 ls -lh
12 rm -rf vits-piper-* 12 rm -rf vits-piper-*
13 13
  14 +./run-tts-kokoro-zh-en.sh
  15 +ls -lh
  16 +rm -rf kokoro-multi-*
  17 +
14 ./run-tts-kokoro-en.sh 18 ./run-tts-kokoro-en.sh
15 ls -lh 19 ls -lh
16 rm -rf kokoro-en-* 20 rm -rf kokoro-en-*
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 // Copyright (c) 2025 Xiaomi Corporation 3 // Copyright (c) 2025 Xiaomi Corporation
4 4
5 // This file shows how to use sherpa-onnx CXX API 5 // This file shows how to use sherpa-onnx CXX API
6 -// for Chinese TTS with Kokoro. 6 +// for English TTS with Kokoro.
7 // 7 //
8 // clang-format off 8 // clang-format off
9 /* 9 /*
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 // Copyright (c) 2025 Xiaomi Corporation 3 // Copyright (c) 2025 Xiaomi Corporation
4 4
5 // This file shows how to use sherpa-onnx CXX API 5 // This file shows how to use sherpa-onnx CXX API
6 -// for Chinese TTS with Kokoro. 6 +// for Chinese + English TTS with Kokoro.
7 // 7 //
8 // clang-format off 8 // clang-format off
9 /* 9 /*
@@ -13,3 +13,4 @@ add-punctuations @@ -13,3 +13,4 @@ add-punctuations
13 tts-matcha-zh 13 tts-matcha-zh
14 tts-matcha-en 14 tts-matcha-en
15 tts-kokoro-en 15 tts-kokoro-en
  16 +tts-kokoro-zh-en
@@ -767,14 +767,18 @@ func sherpaOnnxOfflineTtsKokoroModelConfig( @@ -767,14 +767,18 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
767 voices: String = "", 767 voices: String = "",
768 tokens: String = "", 768 tokens: String = "",
769 dataDir: String = "", 769 dataDir: String = "",
770 - lengthScale: Float = 1.0 770 + lengthScale: Float = 1.0,
  771 + dictDir: String = "",
  772 + lexicon: String = ""
771 ) -> SherpaOnnxOfflineTtsKokoroModelConfig { 773 ) -> SherpaOnnxOfflineTtsKokoroModelConfig {
772 return SherpaOnnxOfflineTtsKokoroModelConfig( 774 return SherpaOnnxOfflineTtsKokoroModelConfig(
773 model: toCPointer(model), 775 model: toCPointer(model),
774 voices: toCPointer(voices), 776 voices: toCPointer(voices),
775 tokens: toCPointer(tokens), 777 tokens: toCPointer(tokens),
776 data_dir: toCPointer(dataDir), 778 data_dir: toCPointer(dataDir),
777 - length_scale: lengthScale 779 + length_scale: lengthScale,
  780 + dict_dir: toCPointer(dictDir),
  781 + lexicon: toCPointer(lexicon)
778 ) 782 )
779 } 783 }
780 784
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +# please visit
  11 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
  12 +# to download more models
  13 +if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then
  14 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  15 + tar xf kokoro-multi-lang-v1_0.tar.bz2
  16 + rm kokoro-multi-lang-v1_0.tar.bz2
  17 +fi
  18 +
  19 +if [ ! -e ./tts-kokoro-zh-en ]; then
  20 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  21 + swiftc \
  22 + -lc++ \
  23 + -I ../build-swift-macos/install/include \
  24 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  25 + ./tts-kokoro-zh-en.swift ./SherpaOnnx.swift \
  26 + -L ../build-swift-macos/install/lib/ \
  27 + -l sherpa-onnx \
  28 + -l onnxruntime \
  29 + -o tts-kokoro-zh-en
  30 +
  31 + strip tts-kokoro-zh-en
  32 +else
  33 + echo "./tts-kokoro-zh-en exists - skip building"
  34 +fi
  35 +
  36 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  37 +./tts-kokoro-zh-en
  1 +class MyClass {
  2 + func playSamples(samples: [Float]) {
  3 + print("Play \(samples.count) samples")
  4 + }
  5 +}
  6 +
  7 +func run() {
  8 + let model = "./kokoro-multi-lang-v1_0/model.onnx"
  9 + let voices = "./kokoro-multi-lang-v1_0/voices.bin"
  10 + let tokens = "./kokoro-multi-lang-v1_0/tokens.txt"
  11 + let dataDir = "./kokoro-multi-lang-v1_0/espeak-ng-data"
  12 + let dictDir = "./kokoro-multi-lang-v1_0/dict"
  13 + let lexicon = "./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt"
  14 + let kokoro = sherpaOnnxOfflineTtsKokoroModelConfig(
  15 + model: model,
  16 + voices: voices,
  17 + tokens: tokens,
  18 + dataDir: dataDir,
  19 + dictDir: dictDir,
  20 + lexicon: lexicon
  21 + )
  22 + let modelConfig = sherpaOnnxOfflineTtsModelConfig(kokoro: kokoro, debug: 0)
  23 + var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig)
  24 +
  25 + let myClass = MyClass()
  26 +
  27 + // We use Unretained here so myClass must be kept alive as the callback is invoked
  28 + //
  29 + // See also
  30 + // https://medium.com/codex/swift-c-callback-interoperability-6d57da6c8ee6
  31 + let arg = Unmanaged<MyClass>.passUnretained(myClass).toOpaque()
  32 +
  33 + let callback: TtsCallbackWithArg = { samples, n, arg in
  34 + let o = Unmanaged<MyClass>.fromOpaque(arg!).takeUnretainedValue()
  35 + var savedSamples: [Float] = []
  36 + for index in 0..<n {
  37 + savedSamples.append(samples![Int(index)])
  38 + }
  39 +
  40 + o.playSamples(samples: savedSamples)
  41 +
  42 + // return 1 so that it continues generating
  43 + return 1
  44 + }
  45 +
  46 + let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
  47 +
  48 + let text =
  49 + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?"
  50 + let sid = 0
  51 + let speed: Float = 1.0
  52 +
  53 + let audio = tts.generateWithCallbackWithArg(
  54 + text: text, callback: callback, arg: arg, sid: sid, speed: speed)
  55 + let filename = "test-kokoro-zh-en.wav"
  56 + let ok = audio.save(filename: filename)
  57 + if ok == 1 {
  58 + print("\nSaved to:\(filename)")
  59 + } else {
  60 + print("Failed to save to \(filename)")
  61 + }
  62 +}
  63 +
  64 +@main
  65 +struct App {
  66 + static func main() {
  67 + run()
  68 + }
  69 +}