Fangjun Kuang
Committed by GitHub

Add Swift example for generating subtitles (#318)

@@ -9,9 +9,11 @@ @@ -9,9 +9,11 @@
9 #include <utility> 9 #include <utility>
10 #include <vector> 10 #include <vector>
11 11
  12 +#include "sherpa-onnx/csrc/circular-buffer.h"
12 #include "sherpa-onnx/csrc/display.h" 13 #include "sherpa-onnx/csrc/display.h"
13 #include "sherpa-onnx/csrc/offline-recognizer.h" 14 #include "sherpa-onnx/csrc/offline-recognizer.h"
14 #include "sherpa-onnx/csrc/online-recognizer.h" 15 #include "sherpa-onnx/csrc/online-recognizer.h"
  16 +#include "sherpa-onnx/csrc/voice-activity-detector.h"
15 17
16 struct SherpaOnnxOnlineRecognizer { 18 struct SherpaOnnxOnlineRecognizer {
17 std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl; 19 std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
@@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer, @@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
127 recognizer->impl->DecodeStreams(ss.data(), n); 129 recognizer->impl->DecodeStreams(ss.data(), n);
128 } 130 }
129 131
130 -SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( 132 +const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
131 SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) { 133 SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
132 sherpa_onnx::OnlineRecognizerResult result = 134 sherpa_onnx::OnlineRecognizerResult result =
133 recognizer->impl->GetResult(stream->impl.get()); 135 recognizer->impl->GetResult(stream->impl.get());
@@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer, @@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer,
340 recognizer->impl->DecodeStreams(ss.data(), n); 342 recognizer->impl->DecodeStreams(ss.data(), n);
341 } 343 }
342 344
343 -SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( 345 +const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
344 SherpaOnnxOfflineStream *stream) { 346 SherpaOnnxOfflineStream *stream) {
345 const sherpa_onnx::OfflineRecognitionResult &result = 347 const sherpa_onnx::OfflineRecognitionResult &result =
346 stream->impl->GetResult(); 348 stream->impl->GetResult();
@@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult( @@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult(
372 delete[] r->timestamps; 374 delete[] r->timestamps;
373 delete r; 375 delete r;
374 } 376 }
  377 +
  378 +// ============================================================
  379 +// For VAD
  380 +// ============================================================
  381 +//
  382 +struct SherpaOnnxCircularBuffer {
  383 + std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
  384 +};
  385 +
  386 +SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
  387 + SherpaOnnxCircularBuffer *buffer = new SherpaOnnxCircularBuffer;
  388 + buffer->impl = std::make_unique<sherpa_onnx::CircularBuffer>(capacity);
  389 + return buffer;
  390 +}
  391 +
  392 +void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
  393 + delete buffer;
  394 +}
  395 +
  396 +void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
  397 + const float *p, int32_t n) {
  398 + buffer->impl->Push(p, n);
  399 +}
  400 +
  401 +const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
  402 + int32_t start_index, int32_t n) {
  403 + std::vector<float> v = buffer->impl->Get(start_index, n);
  404 +
  405 + float *p = new float[n];
  406 + std::copy(v.begin(), v.end(), p);
  407 + return p;
  408 +}
  409 +
  410 +void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; }
  411 +
  412 +void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
  413 + buffer->impl->Pop(n);
  414 +}
  415 +
  416 +int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
  417 + return buffer->impl->Size();
  418 +}
  419 +
  420 +void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
  421 + buffer->impl->Reset();
  422 +}
  423 +
  424 +struct SherpaOnnxVoiceActivityDetector {
  425 + std::unique_ptr<sherpa_onnx::VoiceActivityDetector> impl;
  426 +};
  427 +
  428 +SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
  429 + const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
  430 + sherpa_onnx::VadModelConfig vad_config;
  431 +
  432 + vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, "");
  433 + vad_config.silero_vad.threshold =
  434 + SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5);
  435 +
  436 + vad_config.silero_vad.min_silence_duration =
  437 + SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5);
  438 +
  439 + vad_config.silero_vad.min_speech_duration =
  440 + SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25);
  441 +
  442 + vad_config.silero_vad.window_size =
  443 + SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
  444 +
  445 + vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
  446 + vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
  447 + vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
  448 + vad_config.debug = SHERPA_ONNX_OR(config->debug, false);
  449 +
  450 + if (vad_config.debug) {
  451 + fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  452 + }
  453 +
  454 + SherpaOnnxVoiceActivityDetector *p = new SherpaOnnxVoiceActivityDetector;
  455 + p->impl = std::make_unique<sherpa_onnx::VoiceActivityDetector>(
  456 + vad_config, buffer_size_in_seconds);
  457 +
  458 + return p;
  459 +}
  460 +
  461 +void SherpaOnnxDestroyVoiceActivityDetector(
  462 + SherpaOnnxVoiceActivityDetector *p) {
  463 + delete p;
  464 +}
  465 +
  466 +void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
  467 + SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
  468 + p->impl->AcceptWaveform(samples, n);
  469 +}
  470 +
  471 +int32_t SherpaOnnxVoiceActivityDetectorEmpty(
  472 + SherpaOnnxVoiceActivityDetector *p) {
  473 + return p->impl->Empty();
  474 +}
  475 +
  476 +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
  477 + SherpaOnnxVoiceActivityDetector *p) {
  478 + p->impl->Pop();
  479 +}
  480 +
  481 +SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
  482 +SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
  483 + const sherpa_onnx::SpeechSegment &segment = p->impl->Front();
  484 +
  485 + SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment;
  486 + ans->start = segment.start;
  487 + ans->samples = new float[segment.samples.size()];
  488 + std::copy(segment.samples.begin(), segment.samples.end(), ans->samples);
  489 + ans->n = segment.samples.size();
  490 +
  491 + return ans;
  492 +}
  493 +
  494 +void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
  495 + delete[] p->samples;
  496 + delete p;
  497 +}
  498 +
  499 +void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
  500 + p->impl->Reset();
  501 +}
@@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams( @@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
234 /// @return A pointer containing the result. The user has to invoke 234 /// @return A pointer containing the result. The user has to invoke
235 /// DestroyOnlineRecognizerResult() to free the returned pointer to 235 /// DestroyOnlineRecognizerResult() to free the returned pointer to
236 /// avoid memory leak. 236 /// avoid memory leak.
237 -SHERPA_ONNX_API SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( 237 +SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
238 SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream); 238 SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
239 239
240 /// Destroy the pointer returned by GetOnlineStreamResult(). 240 /// Destroy the pointer returned by GetOnlineStreamResult().
@@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult { @@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
429 /// @return Return a pointer to the result. The user has to invoke 429 /// @return Return a pointer to the result. The user has to invoke
430 /// DestroyOnlineRecognizerResult() to free the returned pointer to 430 /// DestroyOnlineRecognizerResult() to free the returned pointer to
431 /// avoid memory leak. 431 /// avoid memory leak.
432 -SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( 432 +SHERPA_ONNX_API const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
433 SherpaOnnxOfflineStream *stream); 433 SherpaOnnxOfflineStream *stream);
434 434
435 /// Destroy the pointer returned by GetOfflineStreamResult(). 435 /// Destroy the pointer returned by GetOfflineStreamResult().
@@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( @@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
438 SHERPA_ONNX_API void DestroyOfflineRecognizerResult( 438 SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
439 const SherpaOnnxOfflineRecognizerResult *r); 439 const SherpaOnnxOfflineRecognizerResult *r);
440 440
  441 +// ============================================================
  442 +// For VAD
  443 +// ============================================================
  444 +
  445 +SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
  446 + // Path to the silero VAD model
  447 + const char *model;
  448 +
  449 + // threshold to classify a segment as speech
  450 + //
  451 + // If the predicted probability of a segment is larger than this
  452 + // value, then it is classified as speech.
  453 + float threshold;
  454 +
  455 + // in seconds
  456 + float min_silence_duration;
  457 +
  458 + // in seconds
  459 + float min_speech_duration;
  460 +
  461 + int window_size;
  462 +} SherpaOnnxSileroVadModelConfig;
  463 +
  464 +SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
  465 + SherpaOnnxSileroVadModelConfig silero_vad;
  466 +
  467 + int32_t sample_rate;
  468 + int32_t num_threads;
  469 + const char *provider;
  470 + int32_t debug;
  471 +} SherpaOnnxVadModelConfig;
  472 +
  473 +SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
  474 + SherpaOnnxCircularBuffer;
  475 +
  476 +// Return an instance of circular buffer. The user has to use
  477 +// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
  478 +// memory leak.
  479 +SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
  480 + int32_t capacity);
  481 +
  482 +// Free the pointer returned by SherpaOnnxCreateCircularBuffer()
  483 +SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
  484 + SherpaOnnxCircularBuffer *buffer);
  485 +
  486 +SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
  487 + SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);
  488 +
  489 +// Return n samples starting at the given index.
  490 +//
  491 +// Return a pointer to an array containing n samples starting at start_index.
  492 +// The user has to use SherpaOnnxCircularBufferFree() to free the returned
  493 +// pointer to avoid memory leak.
  494 +SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
  495 + SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);
  496 +
  497 +// Free the pointer returned by SherpaOnnxCircularBufferGet().
  498 +SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);
  499 +
  500 +// Remove n elements from the buffer
  501 +SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
  502 + SherpaOnnxCircularBuffer *buffer, int32_t n);
  503 +
  504 +// Return number of elements in the buffer.
  505 +SHERPA_ONNX_API int32_t
  506 +SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);
  507 +
  508 +// Clear all elements in the buffer
  509 +SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
  510 + SherpaOnnxCircularBuffer *buffer);
  511 +
  512 +SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
  513 + // The start index in samples of this segment
  514 + int32_t start;
  515 +
  516 + // pointer to the array containing the samples
  517 + float *samples;
  518 +
  519 + // number of samples in this segment
  520 + int32_t n;
  521 +} SherpaOnnxSpeechSegment;
  522 +
  523 +typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;
  524 +
  525 +// Return an instance of VoiceActivityDetector.
  526 +// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
  527 +// the returned pointer to avoid memory leak.
  528 +SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
  529 +SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
  530 + float buffer_size_in_seconds);
  531 +
  532 +SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
  533 + SherpaOnnxVoiceActivityDetector *p);
  534 +
  535 +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
  536 + SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);
  537 +
  538 +// Return 1 if there are no speech segments available.
  539 +// Return 0 if there are speech segments.
  540 +SHERPA_ONNX_API int32_t
  541 +SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);
  542 +
  543 +// Remove the first speech segment.
  544 +// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
  545 +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
  546 + SherpaOnnxVoiceActivityDetector *p);
  547 +
  548 +// Return the first speech segment.
  549 +// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
  550 +// pointer to avoid memory leak.
  551 +SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
  552 +SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p);
  553 +
  554 +// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
  555 +SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
  556 + const SherpaOnnxSpeechSegment *p);
  557 +
  558 +// Re-initialize the voice activity detector.
  559 +SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
  560 + SherpaOnnxVoiceActivityDetector *p);
  561 +
441 #ifdef __cplusplus 562 #ifdef __cplusplus
442 } /* extern "C" */ 563 } /* extern "C" */
443 #endif 564 #endif
@@ -18,7 +18,7 @@ void Hypotheses::Add(Hypothesis hyp) { @@ -18,7 +18,7 @@ void Hypotheses::Add(Hypothesis hyp) {
18 } else { 18 } else {
19 it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob); 19 it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);
20 20
21 - if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0){ 21 + if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0) {
22 it->second.lm_log_prob = 22 it->second.lm_log_prob =
23 LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob); 23 LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
24 } 24 }
@@ -15,7 +15,7 @@ struct SileroVadModelConfig { @@ -15,7 +15,7 @@ struct SileroVadModelConfig {
15 15
16 // threshold to classify a segment as speech 16 // threshold to classify a segment as speech
17 // 17 //
18 - // The predicted probability of a segment is larger than this 18 + // If the predicted probability of a segment is larger than this
19 // value, then it is classified as speech. 19 // value, then it is classified as speech.
20 float threshold = 0.5; 20 float threshold = 0.5;
21 21
@@ -25,7 +25,7 @@ struct SileroVadModelConfig { @@ -25,7 +25,7 @@ struct SileroVadModelConfig {
25 25
26 // 512, 1024, 1536 samples for 16000 Hz 26 // 512, 1024, 1536 samples for 16000 Hz
27 // 256, 512, 768 samples for 8000 Hz 27 // 256, 512, 768 samples for 8000 Hz
28 - int window_size = 512; // in samples 28 + int32_t window_size = 512; // in samples
29 29
30 SileroVadModelConfig() = default; 30 SileroVadModelConfig() = default;
31 31
1 decode-file 1 decode-file
2 decode-file-non-streaming 2 decode-file-non-streaming
  3 +generate-subtitles
@@ -215,7 +215,7 @@ class SherpaOnnxRecognizer { @@ -215,7 +215,7 @@ class SherpaOnnxRecognizer {
215 215
216 /// Get the decoding results so far 216 /// Get the decoding results so far
217 func getResult() -> SherpaOnnxOnlineRecongitionResult { 217 func getResult() -> SherpaOnnxOnlineRecongitionResult {
218 - let result: UnsafeMutablePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult( 218 + let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
219 recognizer, stream) 219 recognizer, stream)
220 return SherpaOnnxOnlineRecongitionResult(result: result) 220 return SherpaOnnxOnlineRecongitionResult(result: result)
221 } 221 }
@@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer { @@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer {
406 406
407 DecodeOfflineStream(recognizer, stream) 407 DecodeOfflineStream(recognizer, stream)
408 408
409 - let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult( 409 + let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
410 stream) 410 stream)
411 411
412 DestroyOfflineStream(stream) 412 DestroyOfflineStream(stream)
@@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer { @@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer {
414 return SherpaOnnxOfflineRecongitionResult(result: result) 414 return SherpaOnnxOfflineRecongitionResult(result: result)
415 } 415 }
416 } 416 }
  417 +
  418 +func sherpaOnnxSileroVadModelConfig(
  419 + model: String,
  420 + threshold: Float = 0.5,
  421 + minSilenceDuration: Float = 0.25,
  422 + minSpeechDuration: Float = 0.5,
  423 + windowSize: Int = 512
  424 +) -> SherpaOnnxSileroVadModelConfig {
  425 + return SherpaOnnxSileroVadModelConfig(
  426 + model: toCPointer(model),
  427 + threshold: threshold,
  428 + min_silence_duration: minSilenceDuration,
  429 + min_speech_duration: minSpeechDuration,
  430 + window_size: Int32(windowSize)
  431 + )
  432 +}
  433 +
  434 +func sherpaOnnxVadModelConfig(
  435 + sileroVad: SherpaOnnxSileroVadModelConfig,
  436 + sampleRate: Int32 = 16000,
  437 + numThreads: Int = 1,
  438 + provider: String = "cpu",
  439 + debug: Int = 0
  440 +) -> SherpaOnnxVadModelConfig {
  441 + return SherpaOnnxVadModelConfig(
  442 + silero_vad: sileroVad,
  443 + sample_rate: sampleRate,
  444 + num_threads: Int32(numThreads),
  445 + provider: toCPointer(provider),
  446 + debug: Int32(debug)
  447 + )
  448 +}
  449 +
  450 +class SherpaOnnxCircularBufferWrapper {
  451 + let buffer: OpaquePointer!
  452 +
  453 + init(capacity: Int) {
  454 + buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity))
  455 + }
  456 +
  457 + deinit {
  458 + if let buffer {
  459 + SherpaOnnxDestroyCircularBuffer(buffer)
  460 + }
  461 + }
  462 +
  463 + func push(samples: [Float]) {
  464 + SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count))
  465 + }
  466 +
  467 + func get(startIndex: Int, n: Int) -> [Float] {
  468 + let p: UnsafePointer<Float>! = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n))
  469 +
  470 + var samples: [Float] = []
  471 +
  472 + for index in 0..<n {
  473 + samples.append(p[Int(index)])
  474 + }
  475 +
  476 + SherpaOnnxCircularBufferFree(p)
  477 +
  478 + return samples
  479 + }
  480 +
  481 + func pop(n: Int) {
  482 + SherpaOnnxCircularBufferPop(buffer, Int32(n))
  483 + }
  484 +
  485 + func size() -> Int {
  486 + return Int(SherpaOnnxCircularBufferSize(buffer))
  487 + }
  488 +
  489 + func reset() {
  490 + SherpaOnnxCircularBufferReset(buffer)
  491 + }
  492 +}
  493 +
  494 +class SherpaOnnxSpeechSegmentWrapper {
  495 + let p: UnsafePointer<SherpaOnnxSpeechSegment>!
  496 +
  497 + init(p: UnsafePointer<SherpaOnnxSpeechSegment>!) {
  498 + self.p = p
  499 + }
  500 +
  501 + deinit {
  502 + if let p {
  503 + SherpaOnnxDestroySpeechSegment(p)
  504 + }
  505 + }
  506 +
  507 + var start: Int {
  508 + return Int(p.pointee.start)
  509 + }
  510 +
  511 + var n: Int {
  512 + return Int(p.pointee.n)
  513 + }
  514 +
  515 + var samples: [Float] {
  516 + var samples: [Float] = []
  517 + for index in 0..<n {
  518 + samples.append(p.pointee.samples[Int(index)])
  519 + }
  520 + return samples
  521 + }
  522 +}
  523 +
  524 +class SherpaOnnxVoiceActivityDetectorWrapper {
  525 + /// A pointer to the underlying counterpart in C
  526 + let vad: OpaquePointer!
  527 +
  528 + init(config: UnsafePointer<SherpaOnnxVadModelConfig>!, buffer_size_in_seconds: Float) {
  529 + vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds)
  530 + }
  531 +
  532 + deinit {
  533 + if let vad {
  534 + SherpaOnnxDestroyVoiceActivityDetector(vad)
  535 + }
  536 + }
  537 +
  538 + func acceptWaveform(samples: [Float]) {
  539 + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, Int32(samples.count))
  540 + }
  541 +
  542 + func isEmpty() -> Bool {
  543 + return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1 ? true : false
  544 + }
  545 +
  546 + func pop() {
  547 + SherpaOnnxVoiceActivityDetectorPop(vad)
  548 + }
  549 +
  550 + func front() -> SherpaOnnxSpeechSegmentWrapper {
  551 + let p: UnsafePointer<SherpaOnnxSpeechSegment>? = SherpaOnnxVoiceActivityDetectorFront(vad)
  552 + return SherpaOnnxSpeechSegmentWrapper(p: p)
  553 + }
  554 +
  555 + func reset() {
  556 + SherpaOnnxVoiceActivityDetectorReset(vad)
  557 + }
  558 +}
@@ -13,7 +13,6 @@ extension AVAudioPCMBuffer { @@ -13,7 +13,6 @@ extension AVAudioPCMBuffer {
13 } 13 }
14 14
15 func run() { 15 func run() {
16 -  
17 var recognizer: SherpaOnnxOfflineRecognizer 16 var recognizer: SherpaOnnxOfflineRecognizer
18 var modelConfig: SherpaOnnxOfflineModelConfig 17 var modelConfig: SherpaOnnxOfflineModelConfig
19 var modelType = "whisper" 18 var modelType = "whisper"
  1 +/*
  2 +This file shows how to use Swift API to generate subtitles.
  3 +
  4 +You can use the files from
  5 +https://huggingface.co/csukuangfj/vad/tree/main
  6 +for testing.
  7 +
  8 +For instance, to generate subtitles for Obama.mov, please first
  9 +use
  10 +
  11 +ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav
  12 +
  13 +to extract the audio part from the video.
  14 +
  15 +This file supports only processing WAV sound files, so you have to first
  16 +extract audios from videos.
  17 +
  18 +Please see
  19 +./run-generate-subtitles.sh
  20 +for usages.
  21 +*/
  22 +
  23 +import AVFoundation
  24 +
  25 +extension AudioBuffer {
  26 + func array() -> [Float] {
  27 + return Array(UnsafeBufferPointer(self))
  28 + }
  29 +}
  30 +
  31 +extension AVAudioPCMBuffer {
  32 + func array() -> [Float] {
  33 + return self.audioBufferList.pointee.mBuffers.array()
  34 + }
  35 +}
  36 +
  37 +extension TimeInterval {
  38 + var hourMinuteSecondMS: String {
  39 + String(format: "%d:%02d:%02d,%03d", hour, minute, second, millisecond)
  40 + }
  41 +
  42 + var hour: Int {
  43 + Int((self / 3600).truncatingRemainder(dividingBy: 3600))
  44 + }
  45 + var minute: Int {
  46 + Int((self / 60).truncatingRemainder(dividingBy: 60))
  47 + }
  48 + var second: Int {
  49 + Int(truncatingRemainder(dividingBy: 60))
  50 + }
  51 + var millisecond: Int {
  52 + Int((self * 1000).truncatingRemainder(dividingBy: 1000))
  53 + }
  54 +}
  55 +
  56 +extension String {
  57 + var fileURL: URL {
  58 + return URL(fileURLWithPath: self)
  59 + }
  60 + var pathExtension: String {
  61 + return fileURL.pathExtension
  62 + }
  63 + var lastPathComponent: String {
  64 + return fileURL.lastPathComponent
  65 + }
  66 + var stringByDeletingPathExtension: String {
  67 + return fileURL.deletingPathExtension().path
  68 + }
  69 +}
  70 +
  71 +class SpeechSegment: CustomStringConvertible {
  72 +
  73 + let start: Float
  74 + let end: Float
  75 + let text: String
  76 +
  77 + init(start: Float, duration: Float, text: String) {
  78 + self.start = start
  79 + self.end = start + duration
  80 + self.text = text
  81 + }
  82 + public var description: String {
  83 + var s: String
  84 + s = TimeInterval(self.start).hourMinuteSecondMS
  85 + s += " --> "
  86 + s += TimeInterval(self.end).hourMinuteSecondMS
  87 + s += "\n"
  88 + s += self.text
  89 +
  90 + return s
  91 + }
  92 +}
  93 +
  94 +func run() {
  95 + var recognizer: SherpaOnnxOfflineRecognizer
  96 + var modelConfig: SherpaOnnxOfflineModelConfig
  97 + var modelType = "whisper"
  98 + // modelType = "paraformer"
  99 + var filePath = "/Users/fangjun/Desktop/Obama.wav" // English
  100 + // filePath = "/Users/fangjun/Desktop/lei-jun.wav" // Chinese
  101 + // please go to https://huggingface.co/csukuangfj/vad
  102 + // to download the above two files
  103 +
  104 + if modelType == "whisper" {
  105 + // for English
  106 + let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
  107 + let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
  108 + let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
  109 +
  110 + let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
  111 + encoder: encoder,
  112 + decoder: decoder
  113 + )
  114 +
  115 + modelConfig = sherpaOnnxOfflineModelConfig(
  116 + tokens: tokens,
  117 + whisper: whisperConfig,
  118 + debug: 0,
  119 + modelType: "whisper"
  120 + )
  121 + } else if modelType == "paraformer" {
  122 + // for Chinese
  123 + let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"
  124 + let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt"
  125 + let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig(
  126 + model: model
  127 + )
  128 +
  129 + modelConfig = sherpaOnnxOfflineModelConfig(
  130 + tokens: tokens,
  131 + paraformer: paraformerConfig,
  132 + debug: 0,
  133 + modelType: "paraformer"
  134 + )
  135 + } else {
  136 + print("Please specify a supported modelType \(modelType)")
  137 + return
  138 + }
  139 +
  140 + let sampleRate = 16000
  141 + let featConfig = sherpaOnnxFeatureConfig(
  142 + sampleRate: sampleRate,
  143 + featureDim: 80
  144 + )
  145 + var config = sherpaOnnxOfflineRecognizerConfig(
  146 + featConfig: featConfig,
  147 + modelConfig: modelConfig
  148 + )
  149 +
  150 + recognizer = SherpaOnnxOfflineRecognizer(config: &config)
  151 +
  152 + let audioFile = try! AVAudioFile(forReading: filePath.fileURL)
  153 +
  154 + let audioFormat = audioFile.processingFormat
  155 + assert(audioFormat.sampleRate == Double(sampleRate))
  156 + assert(audioFormat.channelCount == 1)
  157 + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
  158 +
  159 + let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
  160 + model: "./silero_vad.onnx"
  161 + )
  162 +
  163 + var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
  164 + let vad = SherpaOnnxVoiceActivityDetectorWrapper(
  165 + config: &vadModelConfig, buffer_size_in_seconds: 120)
  166 +
  167 + let audioFrameCount = UInt32(audioFile.length)
  168 + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  169 +
  170 + try! audioFile.read(into: audioFileBuffer!)
  171 + var array: [Float]! = audioFileBuffer?.array()
  172 +
  173 + let windowSize = Int(vadModelConfig.silero_vad.window_size)
  174 +
  175 + var segments: [SpeechSegment] = []
  176 +
  177 + while array.count > windowSize {
  178 + // todo(fangjun): avoid extra copies here
  179 + vad.acceptWaveform(samples: [Float](array[0..<windowSize]))
  180 + array = [Float](array[windowSize..<array.count])
  181 +
  182 + while !vad.isEmpty() {
  183 + let s = vad.front()
  184 + vad.pop()
  185 + let result = recognizer.decode(samples: s.samples)
  186 +
  187 + segments.append(
  188 + SpeechSegment(
  189 + start: Float(s.start) / Float(sampleRate),
  190 + duration: Float(s.samples.count) / Float(sampleRate),
  191 + text: result.text))
  192 +
  193 + print(segments.last!)
  194 +
  195 + }
  196 + }
  197 +
  198 + let srt = zip(segments.indices, segments).map { (index, element) in
  199 + return "\(index+1)\n\(element)"
  200 + }.joined(separator: "\n\n")
  201 +
  202 + let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
  203 + do {
  204 + try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
  205 + } catch {
  206 + print("Error writing: \(error.localizedDescription)")
  207 + }
  208 +
  209 + print("Saved to \(srtFilename)")
  210 +}
  211 +
  212 +@main
  213 +struct App {
  214 + static func main() {
  215 + run()
  216 + }
  217 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
  11 + echo "Please download the pre-trained model for testing."
  12 + echo "You can refer to"
  13 + echo ""
  14 + echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
  15 + echo ""
  16 + echo "for help"
  17 + exit 1
  18 +fi
  19 +
  20 +if [ ! -e ./generate-subtitles ]; then
  21 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  22 + swiftc \
  23 + -lc++ \
  24 + -I ../build-swift-macos/install/include \
  25 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  26 + ./generate-subtitles.swift ./SherpaOnnx.swift \
  27 + -L ../build-swift-macos/install/lib/ \
  28 + -l sherpa-onnx \
  29 + -l onnxruntime \
  30 + -o generate-subtitles
  31 +else
  32 + echo "./generate-subtitles exists - skip building"
  33 +fi
  34 +
  35 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  36 +./generate-subtitles