Fangjun Kuang
Committed by GitHub

Add Swift API for MatchaTTS models. (#1684)

@@ -7,6 +7,18 @@ echo "pwd: $PWD" @@ -7,6 +7,18 @@ echo "pwd: $PWD"
7 cd swift-api-examples 7 cd swift-api-examples
8 ls -lh 8 ls -lh
9 9
  10 +./run-tts-vits.sh
  11 +ls -lh
  12 +rm -rf vits-piper-*
  13 +
  14 +./run-tts-matcha-zh.sh
  15 +ls -lh
  16 +rm -rf matcha-icefall-*
  17 +
  18 +./run-tts-matcha-en.sh
  19 +ls -lh
  20 +rm -rf matcha-icefall-*
  21 +
10 ./run-speaker-diarization.sh 22 ./run-speaker-diarization.sh
11 rm -rf *.onnx 23 rm -rf *.onnx
12 rm -rf sherpa-onnx-pyannote-segmentation-3-0 24 rm -rf sherpa-onnx-pyannote-segmentation-3-0
@@ -38,8 +50,9 @@ popd @@ -38,8 +50,9 @@ popd
38 ls -lh /Users/fangjun/Desktop 50 ls -lh /Users/fangjun/Desktop
39 cat /Users/fangjun/Desktop/Obama.srt 51 cat /Users/fangjun/Desktop/Obama.srt
40 52
41 -./run-tts.sh  
42 -ls -lh 53 +rm -rf sherpa-onnx-whisper*
  54 +rm -f *.onnx
  55 +rm /Users/fangjun/Desktop/Obama.wav
43 56
44 ./run-decode-file.sh 57 ./run-decode-file.sh
45 rm decode-file 58 rm decode-file
@@ -48,5 +61,4 @@ sed -i.bak '20d' ./decode-file.swift @@ -48,5 +61,4 @@ sed -i.bak '20d' ./decode-file.swift
48 61
49 ./run-decode-file-non-streaming.sh 62 ./run-decode-file-non-streaming.sh
50 63
51 -  
52 ls -lh 64 ls -lh
@@ -31,7 +31,7 @@ fi @@ -31,7 +31,7 @@ fi
31 # to download more models 31 # to download more models
32 if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then 32 if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
33 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 33 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
34 - tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 34 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
35 rm matcha-icefall-en_US-ljspeech.tar.bz2 35 rm matcha-icefall-en_US-ljspeech.tar.bz2
36 fi 36 fi
37 37
@@ -350,7 +350,7 @@ node ./test_vad_asr_non_streaming_sense_voice_microphone.js @@ -350,7 +350,7 @@ node ./test_vad_asr_non_streaming_sense_voice_microphone.js
350 ### Text-to-speech with MatchaTTS models (English TTS) 350 ### Text-to-speech with MatchaTTS models (English TTS)
351 ```bash 351 ```bash
352 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 352 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
353 -tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 353 +tar xf matcha-icefall-en_US-ljspeech.tar.bz2
354 rm matcha-icefall-en_US-ljspeech.tar.bz2 354 rm matcha-icefall-en_US-ljspeech.tar.bz2
355 355
356 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx 356 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
@@ -70,7 +70,7 @@ You can use the following command to run it: @@ -70,7 +70,7 @@ You can use the following command to run it:
70 70
71 ```bash 71 ```bash
72 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 72 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
73 -tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 73 +tar xf matcha-icefall-en_US-ljspeech.tar.bz2
74 rm matcha-icefall-en_US-ljspeech.tar.bz2 74 rm matcha-icefall-en_US-ljspeech.tar.bz2
75 75
76 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx 76 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
@@ -2,7 +2,7 @@ decode-file @@ -2,7 +2,7 @@ decode-file
2 decode-file-non-streaming 2 decode-file-non-streaming
3 generate-subtitles 3 generate-subtitles
4 spoken-language-identification 4 spoken-language-identification
5 -tts 5 +tts-vits
6 vits-vctk 6 vits-vctk
7 sherpa-onnx-paraformer-zh-2023-09-14 7 sherpa-onnx-paraformer-zh-2023-09-14
8 !*.sh 8 !*.sh
@@ -10,3 +10,5 @@ sherpa-onnx-paraformer-zh-2023-09-14 @@ -10,3 +10,5 @@ sherpa-onnx-paraformer-zh-2023-09-14
10 streaming-hlg-decode-file 10 streaming-hlg-decode-file
11 keyword-spotting-from-file 11 keyword-spotting-from-file
12 add-punctuations 12 add-punctuations
  13 +tts-matcha-zh
  14 +tts-matcha-en
@@ -719,9 +719,9 @@ class SherpaOnnxVoiceActivityDetectorWrapper { @@ -719,9 +719,9 @@ class SherpaOnnxVoiceActivityDetectorWrapper {
719 719
720 // offline tts 720 // offline tts
721 func sherpaOnnxOfflineTtsVitsModelConfig( 721 func sherpaOnnxOfflineTtsVitsModelConfig(
722 - model: String,  
723 - lexicon: String,  
724 - tokens: String, 722 + model: String = "",
  723 + lexicon: String = "",
  724 + tokens: String = "",
725 dataDir: String = "", 725 dataDir: String = "",
726 noiseScale: Float = 0.667, 726 noiseScale: Float = 0.667,
727 noiseScaleW: Float = 0.8, 727 noiseScaleW: Float = 0.8,
@@ -739,8 +739,30 @@ func sherpaOnnxOfflineTtsVitsModelConfig( @@ -739,8 +739,30 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
739 dict_dir: toCPointer(dictDir)) 739 dict_dir: toCPointer(dictDir))
740 } 740 }
741 741
  742 +func sherpaOnnxOfflineTtsMatchaModelConfig(
  743 + acousticModel: String = "",
  744 + vocoder: String = "",
  745 + lexicon: String = "",
  746 + tokens: String = "",
  747 + dataDir: String = "",
  748 + noiseScale: Float = 0.667,
  749 + lengthScale: Float = 1.0,
  750 + dictDir: String = ""
  751 +) -> SherpaOnnxOfflineTtsMatchaModelConfig {
  752 + return SherpaOnnxOfflineTtsMatchaModelConfig(
  753 + acoustic_model: toCPointer(acousticModel),
  754 + vocoder: toCPointer(vocoder),
  755 + lexicon: toCPointer(lexicon),
  756 + tokens: toCPointer(tokens),
  757 + data_dir: toCPointer(dataDir),
  758 + noise_scale: noiseScale,
  759 + length_scale: lengthScale,
  760 + dict_dir: toCPointer(dictDir))
  761 +}
  762 +
742 func sherpaOnnxOfflineTtsModelConfig( 763 func sherpaOnnxOfflineTtsModelConfig(
743 - vits: SherpaOnnxOfflineTtsVitsModelConfig, 764 + vits: SherpaOnnxOfflineTtsVitsModelConfig = sherpaOnnxOfflineTtsVitsModelConfig(),
  765 + matcha: SherpaOnnxOfflineTtsMatchaModelConfig = sherpaOnnxOfflineTtsMatchaModelConfig(),
744 numThreads: Int = 1, 766 numThreads: Int = 1,
745 debug: Int = 0, 767 debug: Int = 0,
746 provider: String = "cpu" 768 provider: String = "cpu"
@@ -749,7 +771,8 @@ func sherpaOnnxOfflineTtsModelConfig( @@ -749,7 +771,8 @@ func sherpaOnnxOfflineTtsModelConfig(
749 vits: vits, 771 vits: vits,
750 num_threads: Int32(numThreads), 772 num_threads: Int32(numThreads),
751 debug: Int32(debug), 773 debug: Int32(debug),
752 - provider: toCPointer(provider) 774 + provider: toCPointer(provider),
  775 + matcha: matcha
753 ) 776 )
754 } 777 }
755 778
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +# please visit
  11 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  12 +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  13 +# to download more models
  14 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  15 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  16 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  17 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  18 +fi
  19 +
  20 +if [ ! -f ./hifigan_v2.onnx ]; then
  21 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  22 +fi
  23 +
  24 +if [ ! -e ./tts ]; then
  25 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  26 + swiftc \
  27 + -lc++ \
  28 + -I ../build-swift-macos/install/include \
  29 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  30 + ./tts-matcha-en.swift ./SherpaOnnx.swift \
  31 + -L ../build-swift-macos/install/lib/ \
  32 + -l sherpa-onnx \
  33 + -l onnxruntime \
  34 + -o tts-matcha-en
  35 +
  36 + strip tts-matcha-en
  37 +else
  38 + echo "./tts-matcha-en exists - skip building"
  39 +fi
  40 +
  41 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  42 +./tts-matcha-en
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +# please visit
  11 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  12 +# to download more models
  13 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  14 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  15 + tar xvf matcha-icefall-zh-baker.tar.bz2
  16 + rm matcha-icefall-zh-baker.tar.bz2
  17 +fi
  18 +
  19 +if [ ! -f ./hifigan_v2.onnx ]; then
  20 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  21 +fi
  22 +
  23 +if [ ! -e ./tts ]; then
  24 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  25 + swiftc \
  26 + -lc++ \
  27 + -I ../build-swift-macos/install/include \
  28 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  29 + ./tts-matcha-zh.swift ./SherpaOnnx.swift \
  30 + -L ../build-swift-macos/install/lib/ \
  31 + -l sherpa-onnx \
  32 + -l onnxruntime \
  33 + -o tts-matcha-zh
  34 +
  35 + strip tts-matcha-zh
  36 +else
  37 + echo "./tts-matcha-zh exists - skip building"
  38 +fi
  39 +
  40 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  41 +./tts-matcha-zh
@@ -21,16 +21,16 @@ if [ ! -e ./tts ]; then @@ -21,16 +21,16 @@ if [ ! -e ./tts ]; then
21 -lc++ \ 21 -lc++ \
22 -I ../build-swift-macos/install/include \ 22 -I ../build-swift-macos/install/include \
23 -import-objc-header ./SherpaOnnx-Bridging-Header.h \ 23 -import-objc-header ./SherpaOnnx-Bridging-Header.h \
24 - ./tts.swift ./SherpaOnnx.swift \ 24 + ./tts-vits.swift ./SherpaOnnx.swift \
25 -L ../build-swift-macos/install/lib/ \ 25 -L ../build-swift-macos/install/lib/ \
26 -l sherpa-onnx \ 26 -l sherpa-onnx \
27 -l onnxruntime \ 27 -l onnxruntime \
28 - -o tts 28 + -o tts-vits
29 29
30 - strip tts 30 + strip tts-vits
31 else 31 else
32 - echo "./tts exists - skip building" 32 + echo "./tts-vits exists - skip building"
33 fi 33 fi
34 34
35 export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH 35 export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
36 -./tts 36 +./tts-vits
  1 +class MyClass {
  2 + func playSamples(samples: [Float]) {
  3 + print("Play \(samples.count) samples")
  4 + }
  5 +}
  6 +
  7 +func run() {
  8 + let acousticModel = "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx"
  9 + let vocoder = "./hifigan_v2.onnx"
  10 + let tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt"
  11 + let dataDir = "./matcha-icefall-en_US-ljspeech/espeak-ng-data"
  12 + let matcha = sherpaOnnxOfflineTtsMatchaModelConfig(
  13 + acousticModel: acousticModel,
  14 + vocoder: vocoder,
  15 + tokens: tokens,
  16 + dataDir: dataDir
  17 + )
  18 + let modelConfig = sherpaOnnxOfflineTtsModelConfig(matcha: matcha, debug: 0)
  19 + var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig)
  20 +
  21 + let myClass = MyClass()
  22 +
  23 + // We use Unretained here so myClass must be kept alive as the callback is invoked
  24 + //
  25 + // See also
  26 + // https://medium.com/codex/swift-c-callback-interoperability-6d57da6c8ee6
  27 + let arg = Unmanaged<MyClass>.passUnretained(myClass).toOpaque()
  28 +
  29 + let callback: TtsCallbackWithArg = { samples, n, arg in
  30 + let o = Unmanaged<MyClass>.fromOpaque(arg!).takeUnretainedValue()
  31 + var savedSamples: [Float] = []
  32 + for index in 0..<n {
  33 + savedSamples.append(samples![Int(index)])
  34 + }
  35 +
  36 + o.playSamples(samples: savedSamples)
  37 +
  38 + // return 1 so that it continues generating
  39 + return 1
  40 + }
  41 +
  42 + let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
  43 +
  44 + let text =
  45 + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
  46 + let sid = 0
  47 + let speed: Float = 1.0
  48 +
  49 + let audio = tts.generateWithCallbackWithArg(
  50 + text: text, callback: callback, arg: arg, sid: sid, speed: speed)
  51 + let filename = "test-matcha-en.wav"
  52 + let ok = audio.save(filename: filename)
  53 + if ok == 1 {
  54 + print("\nSaved to:\(filename)")
  55 + } else {
  56 + print("Failed to save to \(filename)")
  57 + }
  58 +}
  59 +
  60 +@main
  61 +struct App {
  62 + static func main() {
  63 + run()
  64 + }
  65 +}
  1 +class MyClass {
  2 + func playSamples(samples: [Float]) {
  3 + print("Play \(samples.count) samples")
  4 + }
  5 +}
  6 +
  7 +func run() {
  8 + let acousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx"
  9 + let vocoder = "./hifigan_v2.onnx"
  10 + let lexicon = "./matcha-icefall-zh-baker/lexicon.txt"
  11 + let tokens = "./matcha-icefall-zh-baker/tokens.txt"
  12 + let dictDir = "./matcha-icefall-zh-baker/dict"
  13 + let ruleFsts =
  14 + "./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst"
  15 + let matcha = sherpaOnnxOfflineTtsMatchaModelConfig(
  16 + acousticModel: acousticModel,
  17 + vocoder: vocoder,
  18 + lexicon: lexicon,
  19 + tokens: tokens,
  20 + dictDir: dictDir
  21 + )
  22 + let modelConfig = sherpaOnnxOfflineTtsModelConfig(matcha: matcha, debug: 0)
  23 + var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig, ruleFsts: ruleFsts)
  24 +
  25 + let myClass = MyClass()
  26 +
  27 + // We use Unretained here so myClass must be kept alive as the callback is invoked
  28 + //
  29 + // See also
  30 + // https://medium.com/codex/swift-c-callback-interoperability-6d57da6c8ee6
  31 + let arg = Unmanaged<MyClass>.passUnretained(myClass).toOpaque()
  32 +
  33 + let callback: TtsCallbackWithArg = { samples, n, arg in
  34 + let o = Unmanaged<MyClass>.fromOpaque(arg!).takeUnretainedValue()
  35 + var savedSamples: [Float] = []
  36 + for index in 0..<n {
  37 + savedSamples.append(samples![Int(index)])
  38 + }
  39 +
  40 + o.playSamples(samples: savedSamples)
  41 +
  42 + // return 1 so that it continues generating
  43 + return 1
  44 + }
  45 +
  46 + let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
  47 +
  48 + let text = "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
  49 + let sid = 0
  50 + let speed: Float = 1.0
  51 +
  52 + let audio = tts.generateWithCallbackWithArg(
  53 + text: text, callback: callback, arg: arg, sid: sid, speed: speed)
  54 + let filename = "test-matcha-zh.wav"
  55 + let ok = audio.save(filename: filename)
  56 + if ok == 1 {
  57 + print("\nSaved to:\(filename)")
  58 + } else {
  59 + print("Failed to save to \(filename)")
  60 + }
  61 +}
  62 +
  63 +@main
  64 +struct App {
  65 + static func main() {
  66 + run()
  67 + }
  68 +}
@@ -47,7 +47,7 @@ func run() { @@ -47,7 +47,7 @@ func run() {
47 47
48 let audio = tts.generateWithCallbackWithArg( 48 let audio = tts.generateWithCallbackWithArg(
49 text: text, callback: callback, arg: arg, sid: sid, speed: speed) 49 text: text, callback: callback, arg: arg, sid: sid, speed: speed)
50 - let filename = "test.wav" 50 + let filename = "test-vits-en.wav"
51 let ok = audio.save(filename: filename) 51 let ok = audio.save(filename: filename)
52 if ok == 1 { 52 if ok == 1 {
53 print("\nSaved to:\(filename)") 53 print("\nSaved to:\(filename)")