Add Swift example for generating subtitles (#318)

Showing 9 changed files with 653 additions and 10 deletions.
@@ -9,9 +9,11 @@
 #include <utility>
 #include <vector>
 
+#include "sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/offline-recognizer.h"
 #include "sherpa-onnx/csrc/online-recognizer.h"
+#include "sherpa-onnx/csrc/voice-activity-detector.h"
 
 struct SherpaOnnxOnlineRecognizer {
   std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
@@ -127,7 +129,7 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer,
   recognizer->impl->DecodeStreams(ss.data(), n);
 }
 
-SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
+const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
     SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) {
   sherpa_onnx::OnlineRecognizerResult result =
       recognizer->impl->GetResult(stream->impl.get());
@@ -340,7 +342,7 @@ void DecodeMultipleOfflineStreams(SherpaOnnxOfflineRecognizer *recognizer,
   recognizer->impl->DecodeStreams(ss.data(), n);
 }
 
-SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
+const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
     SherpaOnnxOfflineStream *stream) {
   const sherpa_onnx::OfflineRecognitionResult &result =
       stream->impl->GetResult();
@@ -372,3 +374,128 @@ void DestroyOfflineRecognizerResult(
   delete[] r->timestamps;
   delete r;
 }
+
+// ============================================================
+// For VAD
+// ============================================================
+//
+struct SherpaOnnxCircularBuffer {
+  std::unique_ptr<sherpa_onnx::CircularBuffer> impl;
+};
+
+SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(int32_t capacity) {
+  SherpaOnnxCircularBuffer *buffer = new SherpaOnnxCircularBuffer;
+  buffer->impl = std::make_unique<sherpa_onnx::CircularBuffer>(capacity);
+  return buffer;
+}
+
+void SherpaOnnxDestroyCircularBuffer(SherpaOnnxCircularBuffer *buffer) {
+  delete buffer;
+}
+
+void SherpaOnnxCircularBufferPush(SherpaOnnxCircularBuffer *buffer,
+                                  const float *p, int32_t n) {
+  buffer->impl->Push(p, n);
+}
+
+const float *SherpaOnnxCircularBufferGet(SherpaOnnxCircularBuffer *buffer,
+                                         int32_t start_index, int32_t n) {
+  std::vector<float> v = buffer->impl->Get(start_index, n);
+
+  float *p = new float[n];
+  std::copy(v.begin(), v.end(), p);
+  return p;
+}
+
+void SherpaOnnxCircularBufferFree(const float *p) { delete[] p; }
+
+void SherpaOnnxCircularBufferPop(SherpaOnnxCircularBuffer *buffer, int32_t n) {
+  buffer->impl->Pop(n);
+}
+
+int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) {
+  return buffer->impl->Size();
+}
+
+void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) {
+  buffer->impl->Reset();
+}
+
+struct SherpaOnnxVoiceActivityDetector {
+  std::unique_ptr<sherpa_onnx::VoiceActivityDetector> impl;
+};
+
+SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
+    const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds) {
+  sherpa_onnx::VadModelConfig vad_config;
+
+  vad_config.silero_vad.model = SHERPA_ONNX_OR(config->silero_vad.model, "");
+  vad_config.silero_vad.threshold =
+      SHERPA_ONNX_OR(config->silero_vad.threshold, 0.5);
+
+  vad_config.silero_vad.min_silence_duration =
+      SHERPA_ONNX_OR(config->silero_vad.min_silence_duration, 0.5);
+
+  vad_config.silero_vad.min_speech_duration =
+      SHERPA_ONNX_OR(config->silero_vad.min_speech_duration, 0.25);
+
+  vad_config.silero_vad.window_size =
+      SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
+
+  vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
+  vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
+  vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
+  vad_config.debug = SHERPA_ONNX_OR(config->debug, false);
+
+  if (vad_config.debug) {
+    fprintf(stderr, "%s\n", vad_config.ToString().c_str());
+  }
+
+  SherpaOnnxVoiceActivityDetector *p = new SherpaOnnxVoiceActivityDetector;
+  p->impl = std::make_unique<sherpa_onnx::VoiceActivityDetector>(
+      vad_config, buffer_size_in_seconds);
+
+  return p;
+}
+
+void SherpaOnnxDestroyVoiceActivityDetector(
+    SherpaOnnxVoiceActivityDetector *p) {
+  delete p;
+}
+
+void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
+    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n) {
+  p->impl->AcceptWaveform(samples, n);
+}
+
+int32_t SherpaOnnxVoiceActivityDetectorEmpty(
+    SherpaOnnxVoiceActivityDetector *p) {
+  return p->impl->Empty();
+}
+
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
+    SherpaOnnxVoiceActivityDetector *p) {
+  p->impl->Pop();
+}
+
+SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
+SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p) {
+  const sherpa_onnx::SpeechSegment &segment = p->impl->Front();
+
+  SherpaOnnxSpeechSegment *ans = new SherpaOnnxSpeechSegment;
+  ans->start = segment.start;
+  ans->samples = new float[segment.samples.size()];
+  std::copy(segment.samples.begin(), segment.samples.end(), ans->samples);
+  ans->n = segment.samples.size();
+
+  return ans;
+}
+
+void SherpaOnnxDestroySpeechSegment(const SherpaOnnxSpeechSegment *p) {
+  delete[] p->samples;
+  delete p;
+}
+
+void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
+  p->impl->Reset();
+}
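To make the ownership rules of the new circular-buffer C API concrete, here is a minimal usage sketch. It is not part of the commit; it assumes the matching declarations from this commit's C header (shown next) are in scope.

/* A minimal usage sketch for the circular-buffer C API added above.
   Illustrative only; assumes the C header from this commit is included. */
#include <stdio.h>

int main() {
  SherpaOnnxCircularBuffer *buffer = SherpaOnnxCreateCircularBuffer(16000);

  float chunk[512] = {0};  /* pretend these are audio samples */
  SherpaOnnxCircularBufferPush(buffer, chunk, 512);

  /* Get() returns a freshly allocated copy that the caller must
     release with SherpaOnnxCircularBufferFree(). */
  const float *p = SherpaOnnxCircularBufferGet(buffer, 0, 256);
  printf("first sample: %.3f, buffered: %d\n", p[0],
         SherpaOnnxCircularBufferSize(buffer));
  SherpaOnnxCircularBufferFree(p);

  SherpaOnnxCircularBufferPop(buffer, 256);  /* discard consumed samples */
  SherpaOnnxDestroyCircularBuffer(buffer);
  return 0;
}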
@@ -234,7 +234,7 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams(
 /// @return A pointer containing the result. The user has to invoke
 ///         DestroyOnlineRecognizerResult() to free the returned pointer to
 ///         avoid memory leak.
-SHERPA_ONNX_API SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
+SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult(
     SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream);
 
 /// Destroy the pointer returned by GetOnlineStreamResult().
@@ -429,7 +429,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
 /// @return Return a pointer to the result. The user has to invoke
 ///         DestroyOnlineRecognizerResult() to free the returned pointer to
 ///         avoid memory leak.
-SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
+SHERPA_ONNX_API const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
     SherpaOnnxOfflineStream *stream);
 
 /// Destroy the pointer returned by GetOfflineStreamResult().
@@ -438,6 +438,127 @@ SHERPA_ONNX_API SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
 SHERPA_ONNX_API void DestroyOfflineRecognizerResult(
     const SherpaOnnxOfflineRecognizerResult *r);
 
+// ============================================================
+// For VAD
+// ============================================================
+
+SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
+  // Path to the silero VAD model
+  const char *model;
+
+  // threshold to classify a segment as speech
+  //
+  // If the predicted probability of a segment is larger than this
+  // value, then it is classified as speech.
+  float threshold;
+
+  // in seconds
+  float min_silence_duration;
+
+  // in seconds
+  float min_speech_duration;
+
+  int window_size;
+} SherpaOnnxSileroVadModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
+  SherpaOnnxSileroVadModelConfig silero_vad;
+
+  int32_t sample_rate;
+  int32_t num_threads;
+  const char *provider;
+  int32_t debug;
+} SherpaOnnxVadModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
+    SherpaOnnxCircularBuffer;
+
+// Return an instance of circular buffer. The user has to use
+// SherpaOnnxDestroyCircularBuffer() to free the returned pointer to avoid
+// memory leak.
+SHERPA_ONNX_API SherpaOnnxCircularBuffer *SherpaOnnxCreateCircularBuffer(
+    int32_t capacity);
+
+// Free the pointer returned by SherpaOnnxCreateCircularBuffer()
+SHERPA_ONNX_API void SherpaOnnxDestroyCircularBuffer(
+    SherpaOnnxCircularBuffer *buffer);
+
+SHERPA_ONNX_API void SherpaOnnxCircularBufferPush(
+    SherpaOnnxCircularBuffer *buffer, const float *p, int32_t n);
+
+// Return n samples starting at the given index.
+//
+// Return a pointer to an array containing n samples starting at start_index.
+// The user has to use SherpaOnnxCircularBufferFree() to free the returned
+// pointer to avoid memory leak.
+SHERPA_ONNX_API const float *SherpaOnnxCircularBufferGet(
+    SherpaOnnxCircularBuffer *buffer, int32_t start_index, int32_t n);
+
+// Free the pointer returned by SherpaOnnxCircularBufferGet().
+SHERPA_ONNX_API void SherpaOnnxCircularBufferFree(const float *p);
+
+// Remove n elements from the buffer
+SHERPA_ONNX_API void SherpaOnnxCircularBufferPop(
+    SherpaOnnxCircularBuffer *buffer, int32_t n);
+
+// Return number of elements in the buffer.
+SHERPA_ONNX_API int32_t
+SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer);
+
+// Clear all elements in the buffer
+SHERPA_ONNX_API void SherpaOnnxCircularBufferReset(
+    SherpaOnnxCircularBuffer *buffer);
+
+SHERPA_ONNX_API typedef struct SherpaOnnxSpeechSegment {
+  // The start index in samples of this segment
+  int32_t start;
+
+  // pointer to the array containing the samples
+  float *samples;
+
+  // number of samples in this segment
+  int32_t n;
+} SherpaOnnxSpeechSegment;
+
+typedef struct SherpaOnnxVoiceActivityDetector SherpaOnnxVoiceActivityDetector;
+
+// Return an instance of VoiceActivityDetector.
+// The user has to use SherpaOnnxDestroyVoiceActivityDetector() to free
+// the returned pointer to avoid memory leak.
+SHERPA_ONNX_API SherpaOnnxVoiceActivityDetector *
+SherpaOnnxCreateVoiceActivityDetector(const SherpaOnnxVadModelConfig *config,
+                                      float buffer_size_in_seconds);
+
+SHERPA_ONNX_API void SherpaOnnxDestroyVoiceActivityDetector(
+    SherpaOnnxVoiceActivityDetector *p);
+
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorAcceptWaveform(
+    SherpaOnnxVoiceActivityDetector *p, const float *samples, int32_t n);
+
+// Return 1 if there are no speech segments available.
+// Return 0 if there are speech segments.
+SHERPA_ONNX_API int32_t
+SherpaOnnxVoiceActivityDetectorEmpty(SherpaOnnxVoiceActivityDetector *p);
+
+// Remove the first speech segment.
+// It throws if SherpaOnnxVoiceActivityDetectorEmpty() returns 1.
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorPop(
+    SherpaOnnxVoiceActivityDetector *p);
+
+// Return the first speech segment.
+// The user has to use SherpaOnnxDestroySpeechSegment() to free the returned
+// pointer to avoid memory leak.
+SHERPA_ONNX_API const SherpaOnnxSpeechSegment *
+SherpaOnnxVoiceActivityDetectorFront(SherpaOnnxVoiceActivityDetector *p);
+
+// Free the pointer returned by SherpaOnnxVoiceActivityDetectorFront().
+SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
+    const SherpaOnnxSpeechSegment *p);
+
+// Re-initialize the voice activity detector.
+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
+    SherpaOnnxVoiceActivityDetector *p);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
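Putting the declarations above together, a consumer typically drives the detector as a feed-and-drain loop. The following sketch is hypothetical and not part of the commit; the model path, window size, iteration count, and 30-second buffer are assumptions for illustration, and zeroed config fields fall back to the defaults applied in the implementation shown earlier.

/* Hypothetical end-to-end sketch of the VAD C API declared above. */
#include <stdio.h>
#include <string.h>

int main() {
  SherpaOnnxVadModelConfig config;
  memset(&config, 0, sizeof(config));  /* zeroed fields use the defaults */
  config.silero_vad.model = "./silero_vad.onnx";  /* assumed model path */
  config.sample_rate = 16000;

  SherpaOnnxVoiceActivityDetector *vad =
      SherpaOnnxCreateVoiceActivityDetector(&config, 30 /* seconds */);

  float window[512] = {0};  /* in practice, read from a file or microphone */
  for (int i = 0; i != 100; ++i) {
    SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, window, 512);

    /* Drain every segment that is ready so far. */
    while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {
      const SherpaOnnxSpeechSegment *segment =
          SherpaOnnxVoiceActivityDetectorFront(vad);
      printf("speech at sample %d, %d samples long\n", segment->start,
             segment->n);
      SherpaOnnxDestroySpeechSegment(segment);
      SherpaOnnxVoiceActivityDetectorPop(vad);
    }
  }

  SherpaOnnxDestroyVoiceActivityDetector(vad);
  return 0;
}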
@@ -18,7 +18,7 @@ void Hypotheses::Add(Hypothesis hyp) {
   } else {
     it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);
 
-    if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0){
+    if (it->second.lm_log_prob != 0 && hyp.lm_log_prob != 0) {
       it->second.lm_log_prob =
           LogAdd<double>()(it->second.lm_log_prob, hyp.lm_log_prob);
     }
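For context, LogAdd<double>() here is the standard log-sum-exp operation, log(exp(a) + exp(b)), so this branch merges the language-model scores of two copies of the same hypothesis in log space, and only when both copies carry a non-zero lm_log_prob.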
@@ -15,7 +15,7 @@ struct SileroVadModelConfig {
 
   // threshold to classify a segment as speech
   //
-  // The predicted probability of a segment is larger than this
+  // If the predicted probability of a segment is larger than this
   // value, then it is classified as speech.
   float threshold = 0.5;
 
@@ -25,7 +25,7 @@ struct SileroVadModelConfig {
 
   // 512, 1024, 1536 samples for 16000 Hz
   // 256, 512, 768 samples for 8000 Hz
-  int window_size = 512;  // in samples
+  int32_t window_size = 512;  // in samples
 
   SileroVadModelConfig() = default;
 
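As a quick sanity check on those window sizes: at 16000 Hz, 512, 1024, and 1536 samples correspond to 512/16000 = 32 ms, 64 ms, and 96 ms of audio, and the 8000 Hz options (256, 512, 768 samples) cover the same 32/64/96 ms durations.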
@@ -215,7 +215,7 @@ class SherpaOnnxRecognizer {
 
   /// Get the decoding results so far
   func getResult() -> SherpaOnnxOnlineRecongitionResult {
-    let result: UnsafeMutablePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
+    let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>? = GetOnlineStreamResult(
       recognizer, stream)
     return SherpaOnnxOnlineRecongitionResult(result: result)
   }
@@ -406,7 +406,7 @@ class SherpaOnnxOfflineRecognizer {
 
     DecodeOfflineStream(recognizer, stream)
 
-    let result: UnsafeMutablePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
+    let result: UnsafePointer<SherpaOnnxOfflineRecognizerResult>? = GetOfflineStreamResult(
       stream)
 
     DestroyOfflineStream(stream)
@@ -414,3 +414,145 @@ class SherpaOnnxOfflineRecognizer {
     return SherpaOnnxOfflineRecongitionResult(result: result)
   }
 }
+
+func sherpaOnnxSileroVadModelConfig(
+  model: String,
+  threshold: Float = 0.5,
+  minSilenceDuration: Float = 0.25,
+  minSpeechDuration: Float = 0.5,
+  windowSize: Int = 512
+) -> SherpaOnnxSileroVadModelConfig {
+  return SherpaOnnxSileroVadModelConfig(
+    model: toCPointer(model),
+    threshold: threshold,
+    min_silence_duration: minSilenceDuration,
+    min_speech_duration: minSpeechDuration,
+    window_size: Int32(windowSize)
+  )
+}
+
+func sherpaOnnxVadModelConfig(
+  sileroVad: SherpaOnnxSileroVadModelConfig,
+  sampleRate: Int32 = 16000,
+  numThreads: Int = 1,
+  provider: String = "cpu",
+  debug: Int = 0
+) -> SherpaOnnxVadModelConfig {
+  return SherpaOnnxVadModelConfig(
+    silero_vad: sileroVad,
+    sample_rate: sampleRate,
+    num_threads: Int32(numThreads),
+    provider: toCPointer(provider),
+    debug: Int32(debug)
+  )
+}
+
+class SherpaOnnxCircularBufferWrapper {
+  let buffer: OpaquePointer!
+
+  init(capacity: Int) {
+    buffer = SherpaOnnxCreateCircularBuffer(Int32(capacity))
+  }
+
+  deinit {
+    if let buffer {
+      SherpaOnnxDestroyCircularBuffer(buffer)
+    }
+  }
+
+  func push(samples: [Float]) {
+    SherpaOnnxCircularBufferPush(buffer, samples, Int32(samples.count))
+  }
+
+  func get(startIndex: Int, n: Int) -> [Float] {
+    let p: UnsafePointer<Float>! = SherpaOnnxCircularBufferGet(buffer, Int32(startIndex), Int32(n))
+
+    var samples: [Float] = []
+
+    for index in 0..<n {
+      samples.append(p[Int(index)])
+    }
+
+    SherpaOnnxCircularBufferFree(p)
+
+    return samples
+  }
+
+  func pop(n: Int) {
+    SherpaOnnxCircularBufferPop(buffer, Int32(n))
+  }
+
+  func size() -> Int {
+    return Int(SherpaOnnxCircularBufferSize(buffer))
+  }
+
+  func reset() {
+    SherpaOnnxCircularBufferReset(buffer)
+  }
+}
+
+class SherpaOnnxSpeechSegmentWrapper {
+  let p: UnsafePointer<SherpaOnnxSpeechSegment>!
+
+  init(p: UnsafePointer<SherpaOnnxSpeechSegment>!) {
+    self.p = p
+  }
+
+  deinit {
+    if let p {
+      SherpaOnnxDestroySpeechSegment(p)
+    }
+  }
+
+  var start: Int {
+    return Int(p.pointee.start)
+  }
+
+  var n: Int {
+    return Int(p.pointee.n)
+  }
+
+  var samples: [Float] {
+    var samples: [Float] = []
+    for index in 0..<n {
+      samples.append(p.pointee.samples[Int(index)])
+    }
+    return samples
+  }
+}
+
+class SherpaOnnxVoiceActivityDetectorWrapper {
+  /// A pointer to the underlying counterpart in C
+  let vad: OpaquePointer!
+
+  init(config: UnsafePointer<SherpaOnnxVadModelConfig>!, buffer_size_in_seconds: Float) {
+    vad = SherpaOnnxCreateVoiceActivityDetector(config, buffer_size_in_seconds)
+  }
+
+  deinit {
+    if let vad {
+      SherpaOnnxDestroyVoiceActivityDetector(vad)
+    }
+  }
+
+  func acceptWaveform(samples: [Float]) {
+    SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, Int32(samples.count))
+  }
+
+  func isEmpty() -> Bool {
+    return SherpaOnnxVoiceActivityDetectorEmpty(vad) == 1 ? true : false
+  }
+
+  func pop() {
+    SherpaOnnxVoiceActivityDetectorPop(vad)
+  }
+
+  func front() -> SherpaOnnxSpeechSegmentWrapper {
+    let p: UnsafePointer<SherpaOnnxSpeechSegment>? = SherpaOnnxVoiceActivityDetectorFront(vad)
+    return SherpaOnnxSpeechSegmentWrapper(p: p)
+  }
+
+  func reset() {
+    SherpaOnnxVoiceActivityDetectorReset(vad)
+  }
+}
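For an end-to-end consumer of these wrappers, see generate-subtitles.swift below: it feeds window_size-sample chunks to the detector via acceptWaveform() and runs the offline recognizer on each segment it pops.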
@@ -13,7 +13,6 @@ extension AVAudioPCMBuffer {
 }
 
 func run() {
-
   var recognizer: SherpaOnnxOfflineRecognizer
   var modelConfig: SherpaOnnxOfflineModelConfig
   var modelType = "whisper"
swift-api-examples/generate-subtitles.swift (new file, mode 100644)
+/*
+This file shows how to use the Swift API to generate subtitles.
+
+You can use the files from
+https://huggingface.co/csukuangfj/vad/tree/main
+for testing.
+
+For instance, to generate subtitles for Obama.mov, please first
+use
+
+ffmpeg -i ./Obama.mov -acodec pcm_s16le -ac 1 -ar 16000 Obama.wav
+
+to extract the audio part from the video.
+
+This file supports processing only WAV sound files, so you have to
+extract the audio from videos first.
+
+Please see
+./run-generate-subtitles.sh
+for usage.
+*/
+
+import AVFoundation
+
+extension AudioBuffer {
+  func array() -> [Float] {
+    return Array(UnsafeBufferPointer(self))
+  }
+}
+
+extension AVAudioPCMBuffer {
+  func array() -> [Float] {
+    return self.audioBufferList.pointee.mBuffers.array()
+  }
+}
+
+extension TimeInterval {
+  var hourMinuteSecondMS: String {
+    String(format: "%d:%02d:%02d,%03d", hour, minute, second, millisecond)
+  }
+
+  var hour: Int {
+    Int((self / 3600).truncatingRemainder(dividingBy: 3600))
+  }
+  var minute: Int {
+    Int((self / 60).truncatingRemainder(dividingBy: 60))
+  }
+  var second: Int {
+    Int(truncatingRemainder(dividingBy: 60))
+  }
+  var millisecond: Int {
+    Int((self * 1000).truncatingRemainder(dividingBy: 1000))
+  }
+}
+
+extension String {
+  var fileURL: URL {
+    return URL(fileURLWithPath: self)
+  }
+  var pathExtension: String {
+    return fileURL.pathExtension
+  }
+  var lastPathComponent: String {
+    return fileURL.lastPathComponent
+  }
+  var stringByDeletingPathExtension: String {
+    return fileURL.deletingPathExtension().path
+  }
+}
+
+class SpeechSegment: CustomStringConvertible {
+  let start: Float
+  let end: Float
+  let text: String
+
+  init(start: Float, duration: Float, text: String) {
+    self.start = start
+    self.end = start + duration
+    self.text = text
+  }
+
+  public var description: String {
+    var s: String
+    s = TimeInterval(self.start).hourMinuteSecondMS
+    s += " --> "
+    s += TimeInterval(self.end).hourMinuteSecondMS
+    s += "\n"
+    s += self.text
+
+    return s
+  }
+}
+
+func run() {
+  var recognizer: SherpaOnnxOfflineRecognizer
+  var modelConfig: SherpaOnnxOfflineModelConfig
+  var modelType = "whisper"
+  // modelType = "paraformer"
+  var filePath = "/Users/fangjun/Desktop/Obama.wav"  // English
+  // filePath = "/Users/fangjun/Desktop/lei-jun.wav"  // Chinese
+  // Please go to https://huggingface.co/csukuangfj/vad
+  // to download the above two files.
+
+  if modelType == "whisper" {
+    // for English
+    let encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
+    let decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
+    let tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
+
+    let whisperConfig = sherpaOnnxOfflineWhisperModelConfig(
+      encoder: encoder,
+      decoder: decoder
+    )
+
+    modelConfig = sherpaOnnxOfflineModelConfig(
+      tokens: tokens,
+      whisper: whisperConfig,
+      debug: 0,
+      modelType: "whisper"
+    )
+  } else if modelType == "paraformer" {
+    // for Chinese
+    let model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx"
+    let tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt"
+    let paraformerConfig = sherpaOnnxOfflineParaformerModelConfig(
+      model: model
+    )
+
+    modelConfig = sherpaOnnxOfflineModelConfig(
+      tokens: tokens,
+      paraformer: paraformerConfig,
+      debug: 0,
+      modelType: "paraformer"
+    )
+  } else {
+    print("Please specify a supported modelType \(modelType)")
+    return
+  }
+
+  let sampleRate = 16000
+  let featConfig = sherpaOnnxFeatureConfig(
+    sampleRate: sampleRate,
+    featureDim: 80
+  )
+  var config = sherpaOnnxOfflineRecognizerConfig(
+    featConfig: featConfig,
+    modelConfig: modelConfig
+  )
+
+  recognizer = SherpaOnnxOfflineRecognizer(config: &config)
+
+  let audioFile = try! AVAudioFile(forReading: filePath.fileURL)
+
+  let audioFormat = audioFile.processingFormat
+  assert(audioFormat.sampleRate == Double(sampleRate))
+  assert(audioFormat.channelCount == 1)
+  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
+
+  let sileroVadConfig = sherpaOnnxSileroVadModelConfig(
+    model: "./silero_vad.onnx"
+  )
+
+  var vadModelConfig = sherpaOnnxVadModelConfig(sileroVad: sileroVadConfig)
+  let vad = SherpaOnnxVoiceActivityDetectorWrapper(
+    config: &vadModelConfig, buffer_size_in_seconds: 120)
+
+  let audioFrameCount = UInt32(audioFile.length)
+  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
+
+  try! audioFile.read(into: audioFileBuffer!)
+  var array: [Float]! = audioFileBuffer?.array()
+
+  let windowSize = Int(vadModelConfig.silero_vad.window_size)
+
+  var segments: [SpeechSegment] = []
+
+  while array.count > windowSize {
+    // TODO(fangjun): avoid extra copies here
+    vad.acceptWaveform(samples: [Float](array[0..<windowSize]))
+    array = [Float](array[windowSize..<array.count])
+
+    while !vad.isEmpty() {
+      let s = vad.front()
+      vad.pop()
+      let result = recognizer.decode(samples: s.samples)
+
+      segments.append(
+        SpeechSegment(
+          start: Float(s.start) / Float(sampleRate),
+          duration: Float(s.samples.count) / Float(sampleRate),
+          text: result.text))
+
+      print(segments.last!)
+    }
+  }
+
+  let srt = zip(segments.indices, segments).map { (index, element) in
+    return "\(index+1)\n\(element)"
+  }.joined(separator: "\n\n")
+
+  let srtFilename = filePath.stringByDeletingPathExtension + ".srt"
+  do {
+    try srt.write(to: srtFilename.fileURL, atomically: true, encoding: .utf8)
+  } catch {
+    print("Error writing: \(error.localizedDescription)")
+  }
+
+  print("Saved to \(srtFilename)")
+}
+
+@main
+struct App {
+  static func main() {
+    run()
+  }
+}
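As a worked example of the hourMinuteSecondMS formatting in this file: a segment boundary at 3723.045 seconds breaks down as hour = 1, minute = 2, second = 3, millisecond = 45, printing as "1:02:03,045", which is the hours:minutes:seconds,milliseconds form that SRT subtitle files use.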
swift-api-examples/run-generate-subtitles.sh (new file, mode 100755)
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -d ../build-swift-macos ]; then
+  echo "Please run ../build-swift-macos.sh first!"
+  exit 1
+fi
+
+if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
+  echo "Please download the pre-trained model for testing."
+  echo "You can refer to"
+  echo ""
+  echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html"
+  echo ""
+  echo "for help"
+  exit 1
+fi
+
+if [ ! -e ./generate-subtitles ]; then
+  # Note: We use -lc++ to link against libc++ instead of libstdc++
+  swiftc \
+    -lc++ \
+    -I ../build-swift-macos/install/include \
+    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
+    ./generate-subtitles.swift ./SherpaOnnx.swift \
+    -L ../build-swift-macos/install/lib/ \
+    -l sherpa-onnx \
+    -l onnxruntime \
+    -o generate-subtitles
+else
+  echo "./generate-subtitles exists - skip building"
+fi
+
+export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
+./generate-subtitles