Fangjun Kuang

Add Swift example for generating subtitles (#318)

sherpa-onnx/c-api/c-api.cc
@@ -9,9 +9,11 @@
#include <utility>
#include <vector>
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/display.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
struct SherpaOnnxOnlineRecognizer {
std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
@@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
recognizer->impl->DecodeStreams(ss.data(), n);
}
-SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
+const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
sherpa_onnx::OnlineRecognizerResult result =
recognizer->impl->GetResult(stream->impl.get());
@@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer,
recognizer->impl->DecodeStreams(ss.data(), n);
}
-SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
+const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
SherpaOnnxOfflineStream *stream) {
const sherpa_onnx::OfflineRecognitionResult &result =
stream->impl->GetResult();
@@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult(
delete[] r->timestamps;
delete r;
}
// ============================================================
// For VAD
// ============================================================
//
struct SherpaOnnxCircularBuffer {
std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
};
SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
SherpaOnnxCircularBuffer *buffer = new SherpaOnnxCircularBuffer;
buffer->impl = std::make_unique<sherpa_onnx::CircularBuffer>(capacity);
return buffer;
}
void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
delete buffer;
}
void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
const float *p, int32_t n) {
buffer->impl->Push(p, n);
}
const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
int32_t start_index, int32_t n) {
std::vector<float> v = buffer->impl->Get(start_index, n);
float *p = new float[n];
std::copy(v.begin(), v.end(), p);
return p;
}
void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; }
void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
buffer->impl->Pop(n);
}
int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
return buffer->impl->Size();
}
void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
buffer->impl->Reset();
}
struct SherpaOnnxVoiceActivityDetector {
std::unique_ptr<sherpa_onnx::VoiceActivityDetector> impl;
};
SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
sherpa_onnx::VadModelConfig vad_config;
vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, "");
vad_config.silero_vad.threshold =
SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5);
vad_config.silero_vad.min_silence_duration =
SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5);
vad_config.silero_vad.min_speech_duration =
SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25);
vad_config.silero_vad.window_size =
SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
vad_config.debug = SHERPA_ONNX_OR(config->debug, false);
if (vad_config.debug) {
fprintf(stderr, "%s\n", vad_config.ToString().c_str());
}
SherpaOnnxVoiceActivityDetector *p = new SherpaOnnxVoiceActivityDetector;
p->impl = std::make_unique<sherpa_onnx::VoiceActivityDetector>(
vad_config, buffer_size_in_seconds);
return p;
}
void SherpaOnnxDestroyVoiceActivityDetector(
SherpaOnnxVoiceActivityDetector *p) {
delete p;
}
void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
p->impl->AcceptWaveform(samples, n);
}
int32_t SherpaOnnxVoiceActivityDetectorEmpty(
SherpaOnnxVoiceActivityDetector *p) {
return p->impl->Empty();
}
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
SherpaOnnxVoiceActivityDetector *p) {
p->impl->Pop();
}
SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
const sherpa_onnx::SpeechSegment &segment = p->impl->Front();
SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment;
ans->start = segment.start;
ans->samples = new float[segment.samples.size()];
std::copy(segment.samples.begin(), segment.samples.end(), ans->samples);
ans->n = segment.samples.size();
return ans;
}
void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
delete[] p->samples;
delete p;
}
void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Reset();
}
sherpa-onnx/c-api/c-api.h
@@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
/// @return A pointer to the result. The user has to invoke
/// DestroyOnlineRecognizerResult() to free the returned pointer to
/// avoid a memory leak.
-SHERPA_ONNX_API SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
+SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
/// Destroy the pointer returned by GetOnlineStreamResult().
@@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
/// @return Return a pointer to the result. The user has to invoke
/// DestroyOfflineRecognizerResult() to free the returned pointer to
/// avoid a memory leak.
-SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
+SHERPA_ONNX_API const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
SherpaOnnxOfflineStream *stream);
/// Destroy the pointer returned by GetOfflineStreamResult().
@@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
const SherpaOnnxOfflineRecognizerResult *r);
// ============================================================
// For VAD
// ============================================================
SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
// Path to the silero VAD model
const char *model;
// threshold to classify a segment as speech
//
// If the predicted probability of a segment is larger than this
// value, then it is classified as speech.
float threshold;
// in seconds
float min_silence_duration;
// in seconds
float min_speech_duration;
int window_size;
} SherpaOnnxSileroVadModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
SherpaOnnxSileroVadModelConfig silero_vad;
int32_t sample_rate;
int32_t num_threads;
const char *provider;
int32_t debug;
} SherpaOnnxVadModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
SherpaOnnxCircularBuffer;
// Return an instance of a circular buffer. The user has to use
// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
// a memory leak.
SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
int32_t capacity);
// Free the pointer returned by SherpaOnnxCreateCircularBuffer()
SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
SherpaOnnxCircularBuffer *buffer);
SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);
// Return a pointer to an array containing n samples starting at start_index.
// The user has to use SherpaOnnxCircularBufferFree() to free the returned
// pointer to avoid a memory leak.
SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);
// Free the pointer returned by SherpaOnnxCircularBufferGet().
SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);
// Remove n elements from the buffer
SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
SherpaOnnxCircularBuffer *buffer, int32_t n);
// Return the number of elements in the buffer.
SHERPA_ONNX_API int32_t
SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);
// Clear all elements in the buffer
SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
SherpaOnnxCircularBuffer *buffer);
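Taken together, these declarations define a create / push / get / free / pop /
destroy lifecycle. Below is a minimal sketch (not part of this commit) of that
call sequence, written in Swift on the assumption that the C symbols are
imported through the SherpaOnnx-Bridging-Header.h used by the build script at
the end of this commit; capacity and sample values are made up.
let buffer = SherpaOnnxCreateCircularBuffer(16000)  // capacity in samples
let chunk: [Float] = [0.1, 0.2, 0.3]
SherpaOnnxCircularBufferPush(buffer, chunk, Int32(chunk.count))
let n = SherpaOnnxCircularBufferSize(buffer)        // 3
let p = SherpaOnnxCircularBufferGet(buffer, 0, n)   // heap-allocated copy
SherpaOnnxCircularBufferFree(p)                     // free the copy
SherpaOnnxCircularBufferPop(buffer, n)              // discard consumed samples
SherpaOnnxDestroyCircularBuffer(buffer)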
SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
// The start index in samples of this segment
int32_t start;
// pointer to the array containing the samples
float *samples;
// number of samples in this segment
int32_t n;
} SherpaOnnxSpeechSegment;
typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;
// Return an instance of VoiceActivityDetector.
// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
// the returned pointer to avoid a memory leak.
SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
float buffer_size_in_seconds);
SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
SherpaOnnxVoiceActivityDetector *p);
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);
// Return 1 if there are no speech segments available.
// Return 0 if there are speech segments.
SHERPA_ONNX_API int32_t
SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);
// Remove the first speech segment from the detector.
// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
SherpaOnnxVoiceActivityDetector *p);
// Return the first speech segment.
// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
// pointer to avoid a memory leak.
SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p);
// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
const SherpaOnnxSpeechSegment *p);
// Re-initialize the voice activity detector.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
SherpaOnnxVoiceActivityDetector *p);
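The detector is driven as a feed-and-drain loop: push window_size samples at
a time, then pull out any completed speech segments. A hedged Swift sketch of
that loop (again assuming the bridging-header import; config and window are
placeholders, and the wrappers in SherpaOnnx.swift below package this same
pattern):
var config = SherpaOnnxVadModelConfig()  // placeholder; fill in silero_vad etc.
let vad = SherpaOnnxCreateVoiceActivityDetector(&config, 120)  // 120 s buffer
let window: [Float] = []                 // placeholder: one window_size chunk
SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, window, Int32(window.count))
while SherpaOnnxVoiceActivityDetectorEmpty(vad) == 0 {
  let segment = SherpaOnnxVoiceActivityDetectorFront(vad)
  // ... consume segment!.pointee.start / .samples / .n here ...
  SherpaOnnxDestroySpeechSegment(segment)
  SherpaOnnxVoiceActivityDetectorPop(vad)
}
SherpaOnnxDestroyVoiceActivityDetector(vad)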
#ifdef __cplusplus
} /* extern "C" */
#endif
sherpa-onnx/csrc/hypotheses.cc
@@ -18,9 +18,9 @@ void Hypotheses::Add(Hypothesis hyp) {
} else {
it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);
-if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0){
+if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0) {
it->second.lm_log_prob =
-LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
+LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
}
}
}
sherpa-onnx/csrc/silero-vad-model-config.h
@@ -15,7 +15,7 @@ struct SileroVadModelConfig {
// threshold to classify a segment as speech
//
-// The predicted probability of a segment is larger than this
+// If the predicted probability of a segment is larger than this
// value, then it is classified as speech.
float threshold = 0.5;
@@ -25,7 +25,7 @@ struct SileroVadModelConfig {
// 512, 1024, 1536 samples for 16000 Hz
// 256, 512, 768 samples for 8000 Hz
-int window_size = 512;  // in samples
+int32_t window_size = 512;  // in samples
SileroVadModelConfig() = default;
swift-api-examples/.gitignore
decode-file
decode-file-non-streaming
generate-subtitles
swift-api-examples/SherpaOnnx.swift
@@ -215,7 +215,7 @@ class SherpaOnnxRecognizer {
/// Get the decoding results so far
func getResult() -> SherpaOnnxOnlineRecongitionResult {
-let result: UnsafeMutablePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
+let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
recognizer, stream)
return SherpaOnnxOnlineRecongitionResult(result: result)
}
@@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer {
DecodeOfflineStream(recognizer, stream)
-let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
+let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
stream)
DestroyOfflineStream(stream)
@@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer {
return SherpaOnnxOfflineRecongitionResult(result: result)
}
}
func sherpaOnnxSileroVadModelConfig(
model: String,
threshold: Float = 0.5,
minSilenceDuration: Float = 0.25,
minSpeechDuration: Float = 0.5,
windowSize: Int = 512
) -> SherpaOnnxSileroVadModelConfig {
return SherpaOnnxSileroVadModelConfig(
model: toCPointer(model),
threshold: threshold,
min_silence_duration: minSilenceDuration,
min_speech_duration: minSpeechDuration,
window_size: Int32(windowSize)
)
}
func sherpaOnnxVadModelConfig(
sileroVad: SherpaOnnxSileroVadModelConfig,
sampleRate: Int32 = 16000,
numThreads: Int = 1,
provider: String = "cpu",
debug: Int = 0
) -> SherpaOnnxVadModelConfig {
return SherpaOnnxVadModelConfig(
silero_vad: sileroVad,
sample_rate: sampleRate,
num_threads: Int32(numThreads),
provider: toCPointer(provider),
debug: Int32(debug)
)
}
class SherpaOnnxCircularBufferWrapper {
let buffer: OpaquePointer!
init(capacity: Int) {
buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity))
}
deinit {
if let buffer {
SherpaOnnxDestroyCircularBuffer(buffer)
}
}
func push(samples: [Float]) {
SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count))
}
func get(startIndex: Int, n: Int) -> [Float] {
let p: UnsafePointer<Float>! = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n))
var samples: [Float] = []
for index in 0..<n {
samples.append(p[Int(index)])
}
SherpaOnnxCircularBufferFree(p)
return samples
}
func pop(n: Int) {
SherpaOnnxCircularBufferPop(buffer, Int32(n))
}
func size() -> Int {
return Int(SherpaOnnxCircularBufferSize(buffer))
}
func reset() {
SherpaOnnxCircularBufferReset(buffer)
}
}
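A short hedged usage sketch of this wrapper (sample values made up):
let buf = SherpaOnnxCircularBufferWrapper(capacity: 16000)
buf.push(samples: [0.1, 0.2, 0.3])
let samples = buf.get(startIndex: 0, n: buf.size())  // [0.1, 0.2, 0.3]
buf.pop(n: buf.size())
// the underlying C buffer is freed in deinit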
class SherpaOnnxSpeechSegmentWrapper {
let p: UnsafePointer<SherpaOnnxSpeechSegment>!
init(p: UnsafePointer<SherpaOnnxSpeechSegment>!) {
self.p = p
}
deinit {
if let p {
SherpaOnnxDestroySpeechSegment(p)
}
}
var start: Int {
return Int(p.pointee.start)
}
var n: Int {
return Int(p.pointee.n)
}
var samples: [Float] {
var samples: [Float] = []
for index in 0..<n {
samples.append(p.pointee.samples[Int(index)])
}
return samples
}
}
class SherpaOnnxVoiceActivityDetectorWrapper {
/// A pointer to the underlying counterpart in C
let vad: OpaquePointer!
init(config: UnsafePointer<SherpaOnnxVadModelConfig>!, buffer_size_in_seconds: Float) {
vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds)
}
deinit {
if let vad {
SherpaOnnxDestroyVoiceActivityDetector(vad)
}
}
func acceptWaveform(samples: [Float]) {
SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, Int32(samples.count))
}
func isEmpty() -> Bool {
return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1
}
func pop() {
SherpaOnnxVoiceActivityDetectorPop(vad)
}
func front() -> SherpaOnnxSpeechSegmentWrapper {
let p: UnsafePointer<SherpaOnnxSpeechSegment>? = SherpaOnnxVoiceActivityDetectorFront(vad)
return SherpaOnnxSpeechSegmentWrapper(p: p)
}
func reset() {
SherpaOnnxVoiceActivityDetectorReset(vad)
}
}
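generate-subtitles.swift below drives this wrapper in exactly this
feed-and-drain shape; in isolation the pattern is (a sketch, with window
standing in for one window_size chunk of audio):
var vadModelConfig = sherpaOnnxVadModelConfig(
  sileroVad: sherpaOnnxSileroVadModelConfig(model: "./silero_vad.onnx"))
let vad = SherpaOnnxVoiceActivityDetectorWrapper(
  config: &vadModelConfig, buffer_size_in_seconds: 30)
let window: [Float] = []  // placeholder: one window_size chunk
vad.acceptWaveform(samples: window)
while !vad.isEmpty() {
  let segment = vad.front()  // SherpaOnnxSpeechSegmentWrapper frees it in deinit
  vad.pop()
  // segment.start, segment.samples, segment.n are now plain Swift values
}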
swift-api-examples/decode-file-non-streaming.swift
@@ -13,7 +13,6 @@ extension AVAudioPCMBuffer {
}
func run() {
var recognizer: SherpaOnnxOfflineRecognizer
var modelConfig: SherpaOnnxOfflineModelConfig
var modelType = "whisper"
swift-api-examples/generate-subtitles.swift
/*
This file shows how to use the Swift API to generate subtitles.
You can use the files from
https://huggingface.co/csukuangfj/vad/tree/main
for testing.
For instance, to generate subtitles for Obama.mov, first use
ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav
to extract the audio track from the video.
This file supports processing only WAV sound files, so you have to
extract the audio from a video first.
Please see
./run-generate-subtitles.sh
for usage.
*/
import AVFoundation
extension AudioBuffer {
func array() -> [Float] {
return Array(UnsafeBufferPointer(self))
}
}
extension AVAudioPCMBuffer {
func array() -> [Float] {
return self.audioBufferList.pointee.mBuffers.array()
}
}
extension TimeInterval {
var hourMinuteSecondMS: String {
String(format: "%d:%02d:%02d,%03d", hour, minute, second, millisecond)
}
var hour: Int {
Int((self / 3600).truncatingRemainder(dividingBy: 3600))
}
var minute: Int {
Int((self / 60).truncatingRemainder(dividingBy: 60))
}
var second: Int {
Int(truncatingRemainder(dividingBy: 60))
}
var millisecond: Int {
Int((self * 1000).truncatingRemainder(dividingBy: 1000))
}
}
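As a worked example of this formatting (value made up): 3661.5 seconds is
1 h, 1 min, 1.5 s, so with the "%d:%02d:%02d,%03d" format above:
assert(TimeInterval(3661.5).hourMinuteSecondMS == "1:01:01,500")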
extension String {
var fileURL: URL {
return URL(fileURLWithPath: self)
}
var pathExtension: String {
return fileURL.pathExtension
}
var lastPathComponent: String {
return fileURL.lastPathComponent
}
var stringByDeletingPathExtension: String {
return fileURL.deletingPathExtension().path
}
}
class SpeechSegment: CustomStringConvertible {
let start: Float
let end: Float
let text: String
init(start: Float, duration: Float, text: String) {
self.start = start
self.end = start + duration
self.text = text
}
public var description: String {
var s: String
s = TimeInterval(self.start).hourMinuteSecondMS
s += " --> "
s += TimeInterval(self.end).hourMinuteSecondMS
s += "\n"
s += self.text
return s
}
}
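For illustration (a made-up segment), description renders one SRT cue body,
which run() below numbers and joins into the final .srt file:
let demo = SpeechSegment(start: 1.5, duration: 2.0, text: "hello world")
print(demo)
// prints:
// 0:00:01,500 --> 0:00:03,500
// hello world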
func run() {
var recognizer: SherpaOnnxOfflineRecognizer
var modelConfig: SherpaOnnxOfflineModelConfig
var modelType = "whisper"
// modelType = "paraformer"
var filePath = "/Users/fangjun/Desktop/Obama.wav" // English
// filePath = "/Users/fangjun/Desktop/lei-jun.wav" // Chinese
// please go to https://huggingface.co/csukuangfj/vad
// to download the above two files
if modelType == "whisper" {
// for English
let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
encoder: encoder,
decoder: decoder
)
modelConfig = sherpaOnnxOfflineModelConfig(
tokens: tokens,
whisper: whisperConfig,
debug: 0,
modelType: "whisper"
)
} else if modelType == "paraformer" {
// for Chinese
let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"
let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt"
let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig(
model: model
)
modelConfig = sherpaOnnxOfflineModelConfig(
tokens: tokens,
paraformer: paraformerConfig,
debug: 0,
modelType: "paraformer"
)
} else {
print("Please specify a supported modelType \(modelType)")
return
}
let sampleRate = 16000
let featConfig = sherpaOnnxFeatureConfig(
sampleRate: sampleRate,
featureDim: 80
)
var config = sherpaOnnxOfflineRecognizerConfig(
featConfig: featConfig,
modelConfig: modelConfig
)
recognizer = SherpaOnnxOfflineRecognizer(config: &config)
let audioFile = try! AVAudioFile(forReading: filePath.fileURL)
let audioFormat = audioFile.processingFormat
assert(audioFormat.sampleRate == Double(sampleRate))
assert(audioFormat.channelCount == 1)
assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
model: "./silero_vad.onnx"
)
var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
let vad = SherpaOnnxVoiceActivityDetectorWrapper(
config: &vadModelConfig, buffer_size_in_seconds: 120)
let audioFrameCount = UInt32(audioFile.length)
let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
try! audioFile.read(into: audioFileBuffer!)
var array: [Float]! = audioFileBuffer?.array()
let windowSize = Int(vadModelConfig.silero_vad.window_size)
var segments: [SpeechSegment] = []
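// Note: the loop below stops once at most windowSize samples remain, so a
// short tail of audio (up to windowSize samples) is never fed to the VAD.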
while array.count > windowSize {
// todo(fangjun): avoid extra copies here
vad.acceptWaveform(samples: [Float](array[0..<windowSize]))
array = [Float](array[windowSize..<array.count])
while !vad.isEmpty() {
let s = vad.front()
vad.pop()
let result = recognizer.decode(samples: s.samples)
segments.append(
SpeechSegment(
start: Float(s.start) / Float(sampleRate),
duration: Float(s.samples.count) / Float(sampleRate),
text: result.text))
print(segments.last!)
}
}
let srt = zip(segments.indices, segments).map { (index, element) in
return "\(index+1)\n\(element)"
}.joined(separator: "\n\n")
let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
do {
try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
} catch {
print("Error writing: \(error.localizedDescription)")
}
print("Saved to \(srtFilename)")
}
@main
struct App {
static func main() {
run()
}
}
swift-api-examples/run-generate-subtitles.sh
#!/usr/bin/env bash
set -ex
if [ ! -d ../build-swift-macos ]; then
echo "Please run ../build-swift-macos.sh first!"
exit 1
fi
if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
echo "Please download the pre-trained model for testing."
echo "You can refer to"
echo ""
echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
echo ""
echo "for help"
exit 1
fi
if [ ! -e ./generate-subtitles ]; then
# Note: We use -lc++ to link against libc++ instead of libstdc++
swiftc \
-lc++ \
-I ../build-swift-macos/install/include \
-import-objc-header ./SherpaOnnx-Bridging-Header.h \
./generate-subtitles.swift ./SherpaOnnx.swift \
-L ../build-swift-macos/install/lib/ \
-l sherpa-onnx \
-l onnxruntime \
-o generate-subtitles
else
echo "./generate-subtitles exists - skip building"
fi
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
./generate-subtitles