Fangjun Kuang
Committed by GitHub

Add Swift API for ten-vad (#2387)

@@ -71,7 +71,11 @@ curl -SL -O https://huggingface.co/csukuangfj/test-data/resolve/main/Obama.wav @@ -71,7 +71,11 @@ curl -SL -O https://huggingface.co/csukuangfj/test-data/resolve/main/Obama.wav
71 ls -lh 71 ls -lh
72 popd 72 popd
73 73
  74 +./run-generate-subtitles-ten-vad.sh
  75 +rm -rf *.onnx
  76 +
74 ./run-generate-subtitles.sh 77 ./run-generate-subtitles.sh
  78 +rm -rf *.onnx
75 79
76 ls -lh /Users/fangjun/Desktop 80 ls -lh /Users/fangjun/Desktop
77 cat /Users/fangjun/Desktop/Obama.srt 81 cat /Users/fangjun/Desktop/Obama.srt
1 decode-file 1 decode-file
2 decode-file-non-streaming 2 decode-file-non-streaming
3 generate-subtitles 3 generate-subtitles
  4 +generate-subtitles-ten-vad
4 spoken-language-identification 5 spoken-language-identification
5 tts-vits 6 tts-vits
6 vits-vctk 7 vits-vctk
@@ -386,6 +386,22 @@ func sherpaOnnxOfflineWhisperModelConfig( @@ -386,6 +386,22 @@ func sherpaOnnxOfflineWhisperModelConfig(
386 ) 386 )
387 } 387 }
388 388
  /// Builds a `SherpaOnnxOfflineCanaryModelConfig` from Swift values.
  ///
  /// Every string argument is passed through `toCPointer` before being stored,
  /// and `usePnc` is stored as an integer flag (1 for `true`, 0 for `false`).
  ///
  /// - Parameters:
  ///   - encoder: Value for the `encoder` field; defaults to "".
  ///     (Presumably the path of the encoder model file -- TODO confirm.)
  ///   - decoder: Value for the `decoder` field; defaults to "".
  ///     (Presumably the path of the decoder model file -- TODO confirm.)
  ///   - srcLang: Value for the `src_lang` field; defaults to "en".
  ///   - tgtLang: Value for the `tgt_lang` field; defaults to "en".
  ///   - usePnc: Stored in `use_pnc` as 1 or 0; defaults to `true`.
  ///     (NOTE(review): "pnc" presumably means punctuation and capitalization -- confirm.)
  /// - Returns: The populated `SherpaOnnxOfflineCanaryModelConfig`.
  389 +func sherpaOnnxOfflineCanaryModelConfig(
  390 + encoder: String = "",
  391 + decoder: String = "",
  392 + srcLang: String = "en",
  393 + tgtLang: String = "en",
  394 + usePnc: Bool = true
  395 +) -> SherpaOnnxOfflineCanaryModelConfig {
  396 + return SherpaOnnxOfflineCanaryModelConfig(
  397 + encoder: toCPointer(encoder),
  398 + decoder: toCPointer(decoder),
  399 + src_lang: toCPointer(srcLang),
  400 + tgt_lang: toCPointer(tgtLang),
  401 + use_pnc: usePnc ? 1 : 0
  402 + )
  403 +}
  404 +
389 func sherpaOnnxOfflineFireRedAsrModelConfig( 405 func sherpaOnnxOfflineFireRedAsrModelConfig(
390 encoder: String = "", 406 encoder: String = "",
391 decoder: String = "" 407 decoder: String = ""
@@ -459,7 +475,8 @@ func sherpaOnnxOfflineModelConfig( @@ -459,7 +475,8 @@ func sherpaOnnxOfflineModelConfig(
459 fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(), 475 fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(),
460 dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(), 476 dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(),
461 zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig = 477 zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig =
462 - sherpaOnnxOfflineZipformerCtcModelConfig() 478 + sherpaOnnxOfflineZipformerCtcModelConfig(),
  479 + canary: SherpaOnnxOfflineCanaryModelConfig = sherpaOnnxOfflineCanaryModelConfig()
463 ) -> SherpaOnnxOfflineModelConfig { 480 ) -> SherpaOnnxOfflineModelConfig {
464 return SherpaOnnxOfflineModelConfig( 481 return SherpaOnnxOfflineModelConfig(
465 transducer: transducer, 482 transducer: transducer,
@@ -479,7 +496,8 @@ func sherpaOnnxOfflineModelConfig( @@ -479,7 +496,8 @@ func sherpaOnnxOfflineModelConfig(
479 moonshine: moonshine, 496 moonshine: moonshine,
480 fire_red_asr: fireRedAsr, 497 fire_red_asr: fireRedAsr,
481 dolphin: dolphin, 498 dolphin: dolphin,
482 - zipformer_ctc: zipformerCtc 499 + zipformer_ctc: zipformerCtc,
  500 + canary: canary
483 ) 501 )
484 } 502 }
485 503
@@ -607,10 +625,14 @@ class SherpaOnnxOfflineRecognizer { @@ -607,10 +625,14 @@ class SherpaOnnxOfflineRecognizer {
607 625
608 return SherpaOnnxOfflineRecongitionResult(result: result) 626 return SherpaOnnxOfflineRecongitionResult(result: result)
609 } 627 }
  628 +
  /// Forwards `config` to `SherpaOnnxOfflineRecognizerSetConfig`, together with
  /// this wrapper's `recognizer` handle.
  ///
  /// - Parameter config: Pointer to the recognizer configuration to hand over.
  ///   It is declared implicitly unwrapped and is passed through without any
  ///   check in this method.
  ///   (NOTE(review): presumably this replaces the configuration of the
  ///   already-created recognizer -- confirm against the C API.)
  629 + func setConfig(config: UnsafePointer<SherpaOnnxOfflineRecognizerConfig>!) {
  630 + SherpaOnnxOfflineRecognizerSetConfig(recognizer, config)
  631 + }
610 } 632 }
611 633
612 func sherpaOnnxSileroVadModelConfig( 634 func sherpaOnnxSileroVadModelConfig(
613 - model: String, 635 + model: String = "",
614 threshold: Float = 0.5, 636 threshold: Float = 0.5,
615 minSilenceDuration: Float = 0.25, 637 minSilenceDuration: Float = 0.25,
616 minSpeechDuration: Float = 0.5, 638 minSpeechDuration: Float = 0.5,
@@ -627,19 +649,39 @@ func sherpaOnnxSileroVadModelConfig( @@ -627,19 +649,39 @@ func sherpaOnnxSileroVadModelConfig(
627 ) 649 )
628 } 650 }
629 651
  /// Builds a `SherpaOnnxTenVadModelConfig` from Swift values.
  ///
  /// `model` is passed through `toCPointer` and `windowSize` is converted to
  /// `Int32`; the remaining arguments are stored unchanged.
  ///
  /// - Parameters:
  ///   - model: Value for the `model` field; defaults to "".
  ///     (Presumably the path of the ten-vad model file -- TODO confirm.)
  ///   - threshold: Value for the `threshold` field; defaults to 0.5.
  ///   - minSilenceDuration: Value for `min_silence_duration`; defaults to 0.25.
  ///     (Unit presumably seconds -- TODO confirm.)
  ///   - minSpeechDuration: Value for `min_speech_duration`; defaults to 0.5.
  ///     (Unit presumably seconds -- TODO confirm.)
  ///   - windowSize: Value for `window_size`, converted to `Int32`; defaults to 256.
  ///     (Presumably a number of samples -- TODO confirm.)
  ///   - maxSpeechDuration: Value for `max_speech_duration`; defaults to 5.0.
  ///     (Unit presumably seconds -- TODO confirm.)
  /// - Returns: The populated `SherpaOnnxTenVadModelConfig`.
  652 +func sherpaOnnxTenVadModelConfig(
  653 + model: String = "",
  654 + threshold: Float = 0.5,
  655 + minSilenceDuration: Float = 0.25,
  656 + minSpeechDuration: Float = 0.5,
  657 + windowSize: Int = 256,
  658 + maxSpeechDuration: Float = 5.0
  659 +) -> SherpaOnnxTenVadModelConfig {
  660 + return SherpaOnnxTenVadModelConfig(
  661 + model: toCPointer(model),
  662 + threshold: threshold,
  663 + min_silence_duration: minSilenceDuration,
  664 + min_speech_duration: minSpeechDuration,
  665 + window_size: Int32(windowSize),
  666 + max_speech_duration: maxSpeechDuration
  667 + )
  668 +}
  669 +
/// Builds a `SherpaOnnxVadModelConfig` that carries both a silero-vad and a
/// ten-vad sub-configuration.
///
/// As of this change every parameter has a default, so a caller may supply only
/// the sub-configuration it needs. `numThreads` and `debug` are converted to
/// `Int32`, and `provider` is passed through `toCPointer`.
///
/// - Parameters:
///   - sileroVad: Stored in `silero_vad`; now defaults to
///     `sherpaOnnxSileroVadModelConfig()`.
///   - sampleRate: Stored in `sample_rate`; defaults to 16000.
///   - numThreads: Stored in `num_threads`; defaults to 1.
///   - provider: Stored in `provider`; defaults to "cpu".
///   - debug: Stored in `debug`; defaults to 0.
///   - tenVad: Stored in `ten_vad`; defaults to `sherpaOnnxTenVadModelConfig()`.
///     It is declared last, after `debug`.
///     (NOTE(review): presumably this mirrors the field order of the C struct -- confirm.)
/// - Returns: The populated `SherpaOnnxVadModelConfig`.
630 func sherpaOnnxVadModelConfig( 670 func sherpaOnnxVadModelConfig(
631 - sileroVad: SherpaOnnxSileroVadModelConfig, 671 + sileroVad: SherpaOnnxSileroVadModelConfig = sherpaOnnxSileroVadModelConfig(),
632 sampleRate: Int32 = 16000, 672 sampleRate: Int32 = 16000,
633 numThreads: Int = 1, 673 numThreads: Int = 1,
634 provider: String = "cpu", 674 provider: String = "cpu",
635 - debug: Int = 0 675 + debug: Int = 0,
  676 + tenVad: SherpaOnnxTenVadModelConfig = sherpaOnnxTenVadModelConfig()
636 ) -> SherpaOnnxVadModelConfig { 677 ) -> SherpaOnnxVadModelConfig {
637 return SherpaOnnxVadModelConfig( 678 return SherpaOnnxVadModelConfig(
638 silero_vad: sileroVad, 679 silero_vad: sileroVad,
639 sample_rate: sampleRate, 680 sample_rate: sampleRate,
640 num_threads: Int32(numThreads), 681 num_threads: Int32(numThreads),
641 provider: toCPointer(provider), 682 provider: toCPointer(provider),
642 - debug: Int32(debug) 683 + debug: Int32(debug),
  684 + ten_vad: tenVad
643 ) 685 )
644 } 686 }
645 687
@@ -156,11 +156,35 @@ func run() { @@ -156,11 +156,35 @@ func run() {
156 assert(audioFormat.channelCount == 1) 156 assert(audioFormat.channelCount == 1)
157 assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32) 157 assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
158 158
159 - let sileroVadConfig = sherpaOnnxSileroVadModelConfig(  
160 - model: "./silero_vad.onnx" 159 + var sileroVadConfig = sherpaOnnxSileroVadModelConfig()
  160 + var tenVadConfig = sherpaOnnxTenVadModelConfig()
  161 +
  162 + var windowSize = 0
  163 +
  164 + if FileManager.default.fileExists(atPath: "./silero_vad.onnx") {
  165 + sileroVadConfig = sherpaOnnxSileroVadModelConfig(
  166 + model: "./silero_vad.onnx",
  167 + threshold: 0.25,
  168 + windowSize: 512
161 ) 169 )
  170 + windowSize = 512
  171 + print("Use silero-vad")
  172 + } else if FileManager.default.fileExists(atPath: "./ten-vad.onnx") {
  173 + tenVadConfig = sherpaOnnxTenVadModelConfig(
  174 + model: "./ten-vad.onnx",
  175 + threshold: 0.25,
  176 + windowSize: 256
  177 + )
  178 + windowSize = 256
  179 + print("Use ten-vad")
  180 + } else {
  181 + print("Please provide ./silero_vad.onnx or ./ten-vad.onnx")
  182 + return
  183 + }
  184 +
  185 + var vadModelConfig = sherpaOnnxVadModelConfig(
  186 + sileroVad: sileroVadConfig, tenVad: tenVadConfig)
162 187
163 - var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)  
164 let vad = SherpaOnnxVoiceActivityDetectorWrapper( 188 let vad = SherpaOnnxVoiceActivityDetectorWrapper(
165 config: &vadModelConfig, buffer_size_in_seconds: 120) 189 config: &vadModelConfig, buffer_size_in_seconds: 120)
166 190
@@ -170,8 +194,6 @@ func run() { @@ -170,8 +194,6 @@ func run() {
170 try! audioFile.read(into: audioFileBuffer!) 194 try! audioFile.read(into: audioFileBuffer!)
171 var array: [Float]! = audioFileBuffer?.array() 195 var array: [Float]! = audioFileBuffer?.array()
172 196
173 - let windowSize = Int(vadModelConfig.silero_vad.window_size)  
174 -  
175 var segments: [SpeechSegment] = [] 197 var segments: [SpeechSegment] = []
176 198
177 for offset in stride(from: 0, to: array.count, by: windowSize) { 199 for offset in stride(from: 0, to: array.count, by: windowSize) {
@@ -180,7 +202,6 @@ func run() { @@ -180,7 +202,6 @@ func run() {
180 } 202 }
181 203
182 vad.flush() 204 vad.flush()
183 - var index: Int = 0  
184 while !vad.isEmpty() { 205 while !vad.isEmpty() {
185 let s = vad.front() 206 let s = vad.front()
186 vad.pop() 207 vad.pop()
  1 +#!/usr/bin/env bash
  # Builds (if needed) and runs the generate-subtitles-ten-vad example.
  # Steps: check that ../build-swift-macos exists, fetch the whisper tiny.en
  # model and ten-vad.onnx when they are missing, compile the Swift sources,
  # then run the resulting binary.
  2 +
  # -e: stop at the first failing command; -x: echo each command as it runs.
  3 +set -ex
  4 +
  # The compile step below reads headers and libraries from
  # ../build-swift-macos/install, so that directory must already exist.
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  # When the model directory is missing, the help text is printed and the
  # archive is then downloaded, unpacked, and removed automatically.
  10 +if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
  11 + echo "Please download the pre-trained model for testing."
  12 + echo "You can refer to"
  13 + echo ""
  14 + echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
  15 + echo ""
  16 + echo "for help"
  17 +
  18 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  19 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  20 + rm sherpa-onnx-whisper-tiny.en.tar.bz2
  21 + ls -lh sherpa-onnx-whisper-tiny.en
  22 +fi
  # Fetch the ten-vad model only when it is not already in the working directory.
  # NOTE(review): the run() hunk in this commit prefers ./silero_vad.onnx when that
  # file exists and only falls back to ./ten-vad.onnx otherwise -- presumably that
  # hunk belongs to generate-subtitles.swift; confirm that no silero_vad.onnx is
  # present when ten-vad is meant to be exercised.
  23 +if [ ! -f ./ten-vad.onnx ]; then
  24 + echo "downloading ten-vad"
  25 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  26 +fi
  27 +
  # Compile only when the binary does not exist yet; delete
  # ./generate-subtitles-ten-vad to force a rebuild.
  28 +if [ ! -e ./generate-subtitles-ten-vad ]; then
  29 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  30 + swiftc \
  31 + -lc++ \
  32 + -I ../build-swift-macos/install/include \
  33 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  34 + ./generate-subtitles.swift ./SherpaOnnx.swift \
  35 + -L ../build-swift-macos/install/lib/ \
  36 + -l sherpa-onnx \
  37 + -l onnxruntime \
  38 + -o generate-subtitles-ten-vad
  39 +
  40 + strip generate-subtitles-ten-vad
  41 +else
  42 + echo "./generate-subtitles-ten-vad exists - skip building"
  43 +fi
  44 +
  # Prepend the install lib directory to DYLD_LIBRARY_PATH before launching
  # (presumably so the dynamic loader can locate the libraries linked above -- confirm).
  45 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  46 +./generate-subtitles-ten-vad