Fangjun Kuang
Committed by GitHub

Add Swift API for TTS (#439)

  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +echo "pwd: $PWD"
  6 +
  7 +cd swift-api-examples
  8 +ls -lh
  9 +
  10 +mkdir -p /Users/fangjun/Desktop
  11 +pushd /Users/fangjun/Desktop
  12 +wget -q https://huggingface.co/csukuangfj/test-data/resolve/main/Obama.wav
  13 +ls -lh
  14 +popd
  15 +
  16 +./run-generate-subtitles.sh
  17 +
  18 +ls -lh /Users/fangjun/Desktop
  19 +cat /Users/fangjun/Desktop/Obama.srt
  20 +
  21 +./run-tts.sh
  22 +ls -lh
  23 +
  24 +./run-decode-file.sh
  25 +
  26 +./run-decode-file-non-streaming.sh
  27 +
  28 +ls -lh
  1 +name: swift
  2 +
  3 +on:
  4 + push:
  5 + branches:
  6 + - master
  7 +
  8 + pull_request:
  9 + branches:
  10 + - master
  11 +
  12 + workflow_dispatch:
  13 +
  14 +concurrency:
  15 + group: swift-${{ github.ref }}
  16 + cancel-in-progress: true
  17 +
  18 +jobs:
  19 + swift:
  20 + runs-on: ${{ matrix.os }}
  21 + strategy:
  22 + fail-fast: false
  23 + matrix:
  24 + os: [macos-13]
  25 +
  26 + steps:
  27 + - uses: actions/checkout@v4
  28 + with:
  29 + fetch-depth: 0
  30 +
  31 + - name: ccache
  32 + uses: hendrikmuhs/ccache-action@v1.2
  33 + with:
  34 + key: ${{ matrix.os }}-swift
  35 +
  36 + - name: Build
  37 + shell: bash
  38 + run: |
  39 + sudo mkdir -p /Users/fangjun/Desktop
  40 + sudo chmod a=rwx /Users/fangjun/Desktop
  41 + ls -lhd /Users/fangjun/Desktop
  42 + ls -lh /Users/fangjun/Desktop
  43 +
  44 + export CMAKE_CXX_COMPILER_LAUNCHER=ccache
  45 + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
  46 + cmake --version
  47 +
  48 + ./build-swift-macos.sh
  49 +
  50 + - name: test
  51 + shell: bash
  52 + run: |
  53 + .github/scripts/test-swift.sh
1 decode-file 1 decode-file
2 decode-file-non-streaming 2 decode-file-non-streaming
3 generate-subtitles 3 generate-subtitles
  4 +tts
  5 +vits-vctk
  6 +sherpa-onnx-paraformer-zh-2023-09-14
  7 +!*.sh
@@ -572,3 +572,110 @@ class SherpaOnnxVoiceActivityDetectorWrapper { @@ -572,3 +572,110 @@ class SherpaOnnxVoiceActivityDetectorWrapper {
572 SherpaOnnxVoiceActivityDetectorReset(vad) 572 SherpaOnnxVoiceActivityDetectorReset(vad)
573 } 573 }
574 } 574 }
  575 +
  576 +// offline tts
  577 +func sherpaOnnxOfflineTtsVitsModelConfig(
  578 + model: String,
  579 + lexicon: String,
  580 + tokens: String,
  581 + noiseScale: Float = 0.667,
  582 + noiseScaleW: Float = 0.8,
  583 + lengthScale: Float = 1.0
  584 +) -> SherpaOnnxOfflineTtsVitsModelConfig {
  585 + return SherpaOnnxOfflineTtsVitsModelConfig(
  586 + model: toCPointer(model),
  587 + lexicon: toCPointer(lexicon),
  588 + tokens: toCPointer(tokens),
  589 + noise_scale: noiseScale,
  590 + noise_scale_w: noiseScaleW,
  591 + length_scale: lengthScale)
  592 +}
  593 +
  594 +func sherpaOnnxOfflineTtsModelConfig(
  595 + vits: SherpaOnnxOfflineTtsVitsModelConfig,
  596 + numThreads: Int = 1,
  597 + debug: Int = 0,
  598 + provider: String = "cpu"
  599 +) -> SherpaOnnxOfflineTtsModelConfig {
  600 + return SherpaOnnxOfflineTtsModelConfig(
  601 + vits: vits,
  602 + num_threads: Int32(numThreads),
  603 + debug: Int32(debug),
  604 + provider: toCPointer(provider)
  605 + )
  606 +}
  607 +
  608 +func sherpaOnnxOfflineTtsConfig(
  609 + model: SherpaOnnxOfflineTtsModelConfig,
  610 + ruleFsts: String = ""
  611 +) -> SherpaOnnxOfflineTtsConfig {
  612 + return SherpaOnnxOfflineTtsConfig(
  613 + model: model,
  614 + rule_fsts: toCPointer(ruleFsts)
  615 + )
  616 +}
  617 +
  618 +class SherpaOnnxGeneratedAudioWrapper {
  619 + /// A pointer to the underlying counterpart in C
  620 + let audio: UnsafePointer<SherpaOnnxGeneratedAudio>!
  621 +
  622 + init(audio: UnsafePointer<SherpaOnnxGeneratedAudio>!) {
  623 + self.audio = audio
  624 + }
  625 +
  626 + deinit {
  627 + if let audio {
  628 + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)
  629 + }
  630 + }
  631 +
  632 + var n: Int32 {
  633 + return audio.pointee.n
  634 + }
  635 +
  636 + var sampleRate: Int32 {
  637 + return audio.pointee.sample_rate
  638 + }
  639 +
  640 + var samples: [Float] {
  641 + if let p = audio.pointee.samples {
  642 + var samples: [Float] = []
  643 + for index in 0..<n {
  644 + samples.append(p[Int(index)])
  645 + }
  646 + return samples
  647 + } else {
  648 + let samples: [Float] = []
  649 + return samples
  650 + }
  651 + }
  652 +
  653 + func save(filename: String) {
  654 + SherpaOnnxWriteWave(audio.pointee.samples, n, sampleRate, toCPointer(filename))
  655 + }
  656 +}
  657 +
  658 +class SherpaOnnxOfflineTtsWrapper {
  659 + /// A pointer to the underlying counterpart in C
  660 + let tts: OpaquePointer!
  661 +
  662 + /// Constructor taking a model config
  663 + init(
  664 + config: UnsafePointer<SherpaOnnxOfflineTtsConfig>!
  665 + ) {
  666 + tts = SherpaOnnxCreateOfflineTts(config)
  667 + }
  668 +
  669 + deinit {
  670 + if let tts {
  671 + SherpaOnnxDestroyOfflineTts(tts)
  672 + }
  673 + }
  674 +
  675 + func generate(text: String, sid: Int = 0, speed: Float = 1.0) -> SherpaOnnxGeneratedAudioWrapper {
  676 + let audio: UnsafePointer<SherpaOnnxGeneratedAudio>? = SherpaOnnxOfflineTtsGenerate(
  677 + tts, toCPointer(text), Int32(sid), speed)
  678 +
  679 + return SherpaOnnxGeneratedAudioWrapper(audio: audio)
  680 + }
  681 +}
@@ -175,8 +175,8 @@ func run() { @@ -175,8 +175,8 @@ func run() {
175 var segments: [SpeechSegment] = [] 175 var segments: [SpeechSegment] = []
176 176
177 for offset in stride(from: 0, to: array.count, by: windowSize) { 177 for offset in stride(from: 0, to: array.count, by: windowSize) {
178 - let end = min(offset + windowSize, array.count)  
179 - vad.acceptWaveform(samples: [Float](array[offset ..< end])) 178 + let end = min(offset + windowSize, array.count)
  179 + vad.acceptWaveform(samples: [Float](array[offset..<end]))
180 } 180 }
181 181
182 var index: Int = 0 182 var index: Int = 0
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
  11 + echo "Please download the pre-trained model for testing."
  12 + echo "You can refer to"
  13 + echo ""
  14 + echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
  15 + echo ""
  16 + echo "for help"
  17 +
  18 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  19 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  20 + rm sherpa-onnx-whisper-tiny.en.tar.bz2
  21 +fi
  22 +
  23 +if [ ! -e ./decode-file-non-streaming ]; then
  24 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  25 + swiftc \
  26 + -lc++ \
  27 + -I ../build-swift-macos/install/include \
  28 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  29 + ./decode-file-non-streaming.swift ./SherpaOnnx.swift \
  30 + -L ../build-swift-macos/install/lib/ \
  31 + -l sherpa-onnx \
  32 + -l onnxruntime \
  33 + -o decode-file-non-streaming
  34 +
  35 + strip decode-file-non-streaming
  36 +else
  37 + echo "./decode-file-non-streaming exists - skip building"
  38 +fi
  39 +
  40 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  41 +./decode-file-non-streaming
@@ -14,7 +14,10 @@ if [ ! -d ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 ]; then @@ -14,7 +14,10 @@ if [ ! -d ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 ]; then
14 echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english" 14 echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english"
15 echo "" 15 echo ""
16 echo "for help" 16 echo "for help"
17 - exit 1 17 +
  18 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
  19 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
  20 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
18 fi 21 fi
19 22
20 if [ ! -e ./decode-file ]; then 23 if [ ! -e ./decode-file ]; then
@@ -28,6 +31,8 @@ if [ ! -e ./decode-file ]; then @@ -28,6 +31,8 @@ if [ ! -e ./decode-file ]; then
28 -l sherpa-onnx \ 31 -l sherpa-onnx \
29 -l onnxruntime \ 32 -l onnxruntime \
30 -o decode-file 33 -o decode-file
  34 +
  35 + strip decode-file
31 else 36 else
32 echo "./decode-file exists - skip building" 37 echo "./decode-file exists - skip building"
33 fi 38 fi
@@ -14,7 +14,15 @@ if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then @@ -14,7 +14,15 @@ if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
14 echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html" 14 echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
15 echo "" 15 echo ""
16 echo "for help" 16 echo "for help"
17 - exit 1 17 +
  18 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  19 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  20 + rm sherpa-onnx-whisper-tiny.en.tar.bz2
  21 + ls -lh sherpa-onnx-whisper-tiny.en
  22 +fi
  23 +if [ ! -f ./silero_vad.onnx ]; then
  24 + echo "downloading silero_vad"
  25 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
18 fi 26 fi
19 27
20 if [ ! -e ./generate-subtitles ]; then 28 if [ ! -e ./generate-subtitles ]; then
@@ -28,6 +36,8 @@ if [ ! -e ./generate-subtitles ]; then @@ -28,6 +36,8 @@ if [ ! -e ./generate-subtitles ]; then
28 -l sherpa-onnx \ 36 -l sherpa-onnx \
29 -l onnxruntime \ 37 -l onnxruntime \
30 -o generate-subtitles 38 -o generate-subtitles
  39 +
  40 + strip generate-subtitles
31 else 41 else
32 echo "./generate-subtitles exists - skip building" 42 echo "./generate-subtitles exists - skip building"
33 fi 43 fi
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +if [ ! -d ./vits-vctk ]; then
  11 + echo "Please download the pre-trained model for testing."
  12 + echo "You can refer to"
  13 + echo ""
  14 + echo "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers"
  15 + echo ""
  16 + echo "for help"
  17 +
  18 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2
  19 + tar xvf vits-vctk.tar.bz2
  20 + rm vits-vctk.tar.bz2
  21 +fi
  22 +
  23 +if [ ! -e ./tts ]; then
  24 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  25 + swiftc \
  26 + -lc++ \
  27 + -I ../build-swift-macos/install/include \
  28 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  29 + ./tts.swift ./SherpaOnnx.swift \
  30 + -L ../build-swift-macos/install/lib/ \
  31 + -l sherpa-onnx \
  32 + -l onnxruntime \
  33 + -o tts
  34 +
  35 + strip tts
  36 +else
  37 + echo "./tts exists - skip building"
  38 +fi
  39 +
  40 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  41 +./tts
  1 +func run() {
  2 + let model = "./vits-vctk/vits-vctk.onnx"
  3 + let lexicon = "./vits-vctk/lexicon.txt"
  4 + let tokens = "./vits-vctk/tokens.txt"
  5 + let vits = sherpaOnnxOfflineTtsVitsModelConfig(
  6 + model: model,
  7 + lexicon: lexicon,
  8 + tokens: tokens
  9 + )
  10 + let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits)
  11 + var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig)
  12 +
  13 + let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
  14 +
  15 + let text = "How are you doing? Fantastic!"
  16 + let sid = 99
  17 + let speed: Float = 1.0
  18 +
  19 + let audio = tts.generate(text: text, sid: sid, speed: speed)
  20 + let filename = "test.wav"
  21 + audio.save(filename: filename)
  22 +
  23 + print("\nSaved to:\n\(filename)")
  24 +}
  25 +
  26 +@main
  27 +struct App {
  28 + static func main() {
  29 + run()
  30 + }
  31 +}