正在显示
10 个修改的文件
包含
461 行增加
和
36 行删除
| @@ -81,6 +81,45 @@ jobs: | @@ -81,6 +81,45 @@ jobs: | ||
| 81 | otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib | 81 | otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib |
| 82 | fi | 82 | fi |
| 83 | 83 | ||
| 84 | + - name: Test VAD | ||
| 85 | + shell: bash | ||
| 86 | + run: | | ||
| 87 | + name=vad-cxx-api | ||
| 88 | + g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \ | ||
| 89 | + -I ./build/install/include \ | ||
| 90 | + -L ./build/install/lib/ \ | ||
| 91 | + -l sherpa-onnx-cxx-api \ | ||
| 92 | + -l sherpa-onnx-c-api \ | ||
| 93 | + -l onnxruntime | ||
| 94 | + | ||
| 95 | + ls -lh $name | ||
| 96 | + | ||
| 97 | + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH | ||
| 98 | + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 99 | + | ||
| 100 | + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then | ||
| 101 | + ldd ./$name | ||
| 102 | + echo "----" | ||
| 103 | + readelf -d ./$name | ||
| 104 | + fi | ||
| 105 | + | ||
| 106 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav | ||
| 107 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 108 | + | ||
| 109 | + ./$name | ||
| 110 | + | ||
| 111 | + mkdir vad-test | ||
| 112 | + cp -v lei-jun-test*.wav vad-test | ||
| 113 | + | ||
| 114 | + ls -lh vad-test | ||
| 115 | + | ||
| 116 | + rm $name | ||
| 117 | + | ||
| 118 | + - uses: actions/upload-artifact@v4 | ||
| 119 | + with: | ||
| 120 | + name: vad-test-wavs-cxx-${{ matrix.os }} | ||
| 121 | + path: ./vad-test/*.wav | ||
| 122 | + | ||
| 84 | - name: Test Speech Enhancement (GTCRN) | 123 | - name: Test Speech Enhancement (GTCRN) |
| 85 | shell: bash | 124 | shell: bash |
| 86 | run: | | 125 | run: | |
| @@ -53,6 +53,7 @@ def get_binaries(): | @@ -53,6 +53,7 @@ def get_binaries(): | ||
| 53 | "sherpa-onnx-microphone-offline-speaker-identification", | 53 | "sherpa-onnx-microphone-offline-speaker-identification", |
| 54 | "sherpa-onnx-offline", | 54 | "sherpa-onnx-offline", |
| 55 | "sherpa-onnx-offline-audio-tagging", | 55 | "sherpa-onnx-offline-audio-tagging", |
| 56 | + "sherpa-onnx-offline-denoiser", | ||
| 56 | "sherpa-onnx-offline-language-identification", | 57 | "sherpa-onnx-offline-language-identification", |
| 57 | "sherpa-onnx-offline-punctuation", | 58 | "sherpa-onnx-offline-punctuation", |
| 58 | "sherpa-onnx-offline-speaker-diarization", | 59 | "sherpa-onnx-offline-speaker-diarization", |
| @@ -62,6 +63,7 @@ def get_binaries(): | @@ -62,6 +63,7 @@ def get_binaries(): | ||
| 62 | "sherpa-onnx-online-punctuation", | 63 | "sherpa-onnx-online-punctuation", |
| 63 | "sherpa-onnx-online-websocket-client", | 64 | "sherpa-onnx-online-websocket-client", |
| 64 | "sherpa-onnx-online-websocket-server", | 65 | "sherpa-onnx-online-websocket-server", |
| 66 | + "sherpa-onnx-vad", | ||
| 65 | "sherpa-onnx-vad-microphone", | 67 | "sherpa-onnx-vad-microphone", |
| 66 | "sherpa-onnx-vad-microphone-offline-asr", | 68 | "sherpa-onnx-vad-microphone-offline-asr", |
| 67 | "sherpa-onnx-vad-with-offline-asr", | 69 | "sherpa-onnx-vad-with-offline-asr", |
| @@ -24,6 +24,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) | @@ -24,6 +24,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) | ||
| 24 | add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) | 24 | add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) |
| 25 | target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) | 25 | target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) |
| 26 | 26 | ||
| 27 | +add_executable(vad-cxx-api ./vad-cxx-api.cc) | ||
| 28 | +target_link_libraries(vad-cxx-api sherpa-onnx-cxx-api) | ||
| 29 | + | ||
| 27 | if(SHERPA_ONNX_ENABLE_TTS) | 30 | if(SHERPA_ONNX_ENABLE_TTS) |
| 28 | add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc) | 31 | add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc) |
| 29 | target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api) | 32 | target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api) |
cxx-api-examples/vad-cxx-api.cc
0 → 100644
| 1 | +// cxx-api-examples/vad-cxx-api.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// | ||
| 6 | +// This file demonstrates how to use VAD to remove silences from a file | ||
| 7 | +// clang-format off | ||
| 8 | +// | ||
| 9 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | ||
| 10 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav | ||
| 11 | +// | ||
| 12 | +// clang-format on | ||
| 13 | +#include <iostream> | ||
| 14 | +#include <string> | ||
| 15 | + | ||
| 16 | +#include "sherpa-onnx/c-api/cxx-api.h" | ||
| 17 | + | ||
| 18 | +int32_t main() { | ||
| 19 | + using namespace sherpa_onnx::cxx; // NOLINT | ||
| 20 | + | ||
| 21 | + std::string wave_filename = "./lei-jun-test.wav"; | ||
| 22 | + std::string vad_filename = "./silero_vad.onnx"; | ||
| 23 | + | ||
| 24 | + VadModelConfig config; | ||
| 25 | + config.silero_vad.model = vad_filename; | ||
| 26 | + config.silero_vad.threshold = 0.1; | ||
| 27 | + config.silero_vad.min_silence_duration = 0.5; | ||
| 28 | + config.silero_vad.min_speech_duration = 0.25; | ||
| 29 | + config.silero_vad.max_speech_duration = 20; | ||
| 30 | + config.sample_rate = 16000; | ||
| 31 | + config.debug = true; | ||
| 32 | + | ||
| 33 | + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20); | ||
| 34 | + if (!vad.Get()) { | ||
| 35 | + std::cerr << "Failed to create VAD. Please check your config\n"; | ||
| 36 | + return -1; | ||
| 37 | + } | ||
| 38 | + | ||
| 39 | + Wave wave = ReadWave(wave_filename); | ||
| 40 | + if (wave.samples.empty()) { | ||
| 41 | + std::cerr << "Failed to read: '" << wave_filename << "'\n"; | ||
| 42 | + return -1; | ||
| 43 | + } | ||
| 44 | + bool is_eof = false; | ||
| 45 | + int32_t i = 0; | ||
| 46 | + int32_t window_size = config.silero_vad.window_size; | ||
| 47 | + | ||
| 48 | + int32_t sample_rate = config.sample_rate; | ||
| 49 | + | ||
| 50 | + std::vector<float> samples_without_silence; | ||
| 51 | + | ||
| 52 | + while (!is_eof) { | ||
| 53 | + if (i + window_size < wave.samples.size()) { | ||
| 54 | + vad.AcceptWaveform(wave.samples.data() + i, window_size); | ||
| 55 | + i += window_size; | ||
| 56 | + } else { | ||
| 57 | + is_eof = true; | ||
| 58 | + vad.Flush(); | ||
| 59 | + } | ||
| 60 | + | ||
| 61 | + while (!vad.IsEmpty()) { | ||
| 62 | + auto segment = vad.Front(); | ||
| 63 | + float start_time = segment.start / static_cast<float>(sample_rate); | ||
| 64 | + float end_time = | ||
| 65 | + start_time + segment.samples.size() / static_cast<float>(sample_rate); | ||
| 66 | + printf("%.3f -- %.3f\n", start_time, end_time); | ||
| 67 | + | ||
| 68 | + samples_without_silence.insert(samples_without_silence.end(), | ||
| 69 | + segment.samples.begin(), | ||
| 70 | + segment.samples.end()); | ||
| 71 | + | ||
| 72 | + vad.Pop(); | ||
| 73 | + } | ||
| 74 | + } | ||
| 75 | + | ||
| 76 | + bool ok = WriteWave("./lei-jun-test-no-silence.wav", | ||
| 77 | + {samples_without_silence, sample_rate}); | ||
| 78 | + if (ok) { | ||
| 79 | + std::cout << "Saved to ./lei-jun-test-no-silence.wav\n"; | ||
| 80 | + } else { | ||
| 81 | + std::cerr << "Failed to write ./lei-jun-test-no-silence.wav\n"; | ||
| 82 | + } | ||
| 83 | + | ||
| 84 | + return 0; | ||
| 85 | +} |
| @@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { | @@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { | ||
| 785 | // in seconds | 785 | // in seconds |
| 786 | float min_speech_duration; | 786 | float min_speech_duration; |
| 787 | 787 | ||
| 788 | - int window_size; | 788 | + int32_t window_size; |
| 789 | 789 | ||
| 790 | // If a speech segment is longer than this value, then we increase | 790 | // If a speech segment is longer than this value, then we increase |
| 791 | // the threshold to 0.9. After finishing detecting the segment, | 791 | // the threshold to 0.9. After finishing detecting the segment, |
| @@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const { | @@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const { | ||
| 558 | return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_); | 558 | return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_); |
| 559 | } | 559 | } |
| 560 | 560 | ||
| 561 | +CircularBuffer CircularBuffer::Create(int32_t capacity) { | ||
| 562 | + auto p = SherpaOnnxCreateCircularBuffer(capacity); | ||
| 563 | + return CircularBuffer(p); | ||
| 564 | +} | ||
| 565 | + | ||
| 566 | +CircularBuffer::CircularBuffer(const SherpaOnnxCircularBuffer *p) | ||
| 567 | + : MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer>(p) {} | ||
| 568 | + | ||
| 569 | +void CircularBuffer::Destroy(const SherpaOnnxCircularBuffer *p) const { | ||
| 570 | + SherpaOnnxDestroyCircularBuffer(p); | ||
| 571 | +} | ||
| 572 | + | ||
| 573 | +void CircularBuffer::Push(const float *samples, int32_t n) const { | ||
| 574 | + SherpaOnnxCircularBufferPush(p_, samples, n); | ||
| 575 | +} | ||
| 576 | + | ||
| 577 | +std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const { | ||
| 578 | + const float *samples = SherpaOnnxCircularBufferGet(p_, start_index, n); | ||
| 579 | + std::vector<float> ans(n); | ||
| 580 | + std::copy(samples, samples + n, ans.begin()); | ||
| 581 | + | ||
| 582 | + SherpaOnnxCircularBufferFree(samples); | ||
| 583 | + return ans; | ||
| 584 | +} | ||
| 585 | + | ||
| 586 | +void CircularBuffer::Pop(int32_t n) const { | ||
| 587 | + SherpaOnnxCircularBufferPop(p_, n); | ||
| 588 | +} | ||
| 589 | + | ||
| 590 | +int32_t CircularBuffer::Size() const { | ||
| 591 | + return SherpaOnnxCircularBufferSize(p_); | ||
| 592 | +} | ||
| 593 | + | ||
| 594 | +int32_t CircularBuffer::Head() const { | ||
| 595 | + return SherpaOnnxCircularBufferHead(p_); | ||
| 596 | +} | ||
| 597 | + | ||
| 598 | +void CircularBuffer::Reset() const { SherpaOnnxCircularBufferReset(p_); } | ||
| 599 | + | ||
| 600 | +VoiceActivityDetector VoiceActivityDetector::Create( | ||
| 601 | + const VadModelConfig &config, float buffer_size_in_seconds) { | ||
| 602 | + struct SherpaOnnxVadModelConfig c; | ||
| 603 | + memset(&c, 0, sizeof(c)); | ||
| 604 | + | ||
| 605 | + c.silero_vad.model = config.silero_vad.model.c_str(); | ||
| 606 | + c.silero_vad.threshold = config.silero_vad.threshold; | ||
| 607 | + c.silero_vad.min_silence_duration = config.silero_vad.min_silence_duration; | ||
| 608 | + c.silero_vad.min_speech_duration = config.silero_vad.min_speech_duration; | ||
| 609 | + c.silero_vad.window_size = config.silero_vad.window_size; | ||
| 610 | + c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration; | ||
| 611 | + | ||
| 612 | + c.sample_rate = config.sample_rate; | ||
| 613 | + c.num_threads = config.num_threads; | ||
| 614 | + c.provider = config.provider.c_str(); | ||
| 615 | + c.debug = config.debug; | ||
| 616 | + | ||
| 617 | + auto p = SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds); | ||
| 618 | + return VoiceActivityDetector(p); | ||
| 619 | +} | ||
| 620 | + | ||
| 621 | +VoiceActivityDetector::VoiceActivityDetector( | ||
| 622 | + const SherpaOnnxVoiceActivityDetector *p) | ||
| 623 | + : MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector>(p) {} | ||
| 624 | + | ||
| 625 | +void VoiceActivityDetector::Destroy( | ||
| 626 | + const SherpaOnnxVoiceActivityDetector *p) const { | ||
| 627 | + SherpaOnnxDestroyVoiceActivityDetector(p); | ||
| 628 | +} | ||
| 629 | + | ||
| 630 | +void VoiceActivityDetector::AcceptWaveform(const float *samples, | ||
| 631 | + int32_t n) const { | ||
| 632 | + SherpaOnnxVoiceActivityDetectorAcceptWaveform(p_, samples, n); | ||
| 633 | +} | ||
| 634 | + | ||
| 635 | +bool VoiceActivityDetector::IsEmpty() const { | ||
| 636 | + return SherpaOnnxVoiceActivityDetectorEmpty(p_); | ||
| 637 | +} | ||
| 638 | + | ||
| 639 | +bool VoiceActivityDetector ::IsDetected() const { | ||
| 640 | + return SherpaOnnxVoiceActivityDetectorDetected(p_); | ||
| 641 | +} | ||
| 642 | + | ||
| 643 | +void VoiceActivityDetector::Pop() const { | ||
| 644 | + SherpaOnnxVoiceActivityDetectorPop(p_); | ||
| 645 | +} | ||
| 646 | + | ||
| 647 | +void VoiceActivityDetector::Clear() const { | ||
| 648 | + SherpaOnnxVoiceActivityDetectorClear(p_); | ||
| 649 | +} | ||
| 650 | + | ||
| 651 | +SpeechSegment VoiceActivityDetector::Front() const { | ||
| 652 | + auto f = SherpaOnnxVoiceActivityDetectorFront(p_); | ||
| 653 | + | ||
| 654 | + SpeechSegment segment; | ||
| 655 | + segment.start = f->start; | ||
| 656 | + segment.samples = std::vector<float>{f->samples, f->samples + f->n}; | ||
| 657 | + | ||
| 658 | + SherpaOnnxDestroySpeechSegment(f); | ||
| 659 | + | ||
| 660 | + return segment; | ||
| 661 | +} | ||
| 662 | + | ||
| 663 | +void VoiceActivityDetector::Reset() const { | ||
| 664 | + SherpaOnnxVoiceActivityDetectorReset(p_); | ||
| 665 | +} | ||
| 666 | + | ||
| 667 | +void VoiceActivityDetector::Flush() const { | ||
| 668 | + SherpaOnnxVoiceActivityDetectorFlush(p_); | ||
| 669 | +} | ||
| 670 | + | ||
| 561 | } // namespace sherpa_onnx::cxx | 671 | } // namespace sherpa_onnx::cxx |
| @@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser | @@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser | ||
| 500 | explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p); | 500 | explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p); |
| 501 | }; | 501 | }; |
| 502 | 502 | ||
| 503 | +// ============================== | ||
| 504 | +// VAD | ||
| 505 | +// ============================== | ||
| 506 | + | ||
// Options of the silero VAD model. Each field is copied into the field of
// the same name of the C API config by VoiceActivityDetector::Create().
struct SileroVadModelConfig {
  // Path to the model file, e.g., silero_vad.onnx
  std::string model;

  float threshold = 0.5f;

  // NOTE(review): the C API header documents min_speech_duration as being in
  // seconds; the other durations presumably use the same unit -- confirm.
  float min_silence_duration = 0.5f;
  float min_speech_duration = 0.25f;

  // Number of samples passed to the detector per call in the examples.
  int32_t window_size = 512;

  float max_speech_duration = 20.0f;
};
| 515 | + | ||
| 516 | +struct VadModelConfig { | ||
| 517 | + SileroVadModelConfig silero_vad; | ||
| 518 | + | ||
| 519 | + int32_t sample_rate = 16000; | ||
| 520 | + int32_t num_threads = 1; | ||
| 521 | + std::string provider = "cpu"; | ||
| 522 | + bool debug = false; | ||
| 523 | +}; | ||
| 524 | + | ||
// A speech segment returned by VoiceActivityDetector::Front().
struct SpeechSegment {
  // Index of the first sample of this segment; the examples divide it by the
  // sample rate to obtain the start time in seconds. Defaults to 0 so that a
  // default-constructed segment never holds an indeterminate value.
  int32_t start = 0;

  // The samples of this segment.
  std::vector<float> samples;
};
| 529 | + | ||
| 530 | +class SHERPA_ONNX_API CircularBuffer | ||
| 531 | + : public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> { | ||
| 532 | + public: | ||
| 533 | + static CircularBuffer Create(int32_t capacity); | ||
| 534 | + | ||
| 535 | + void Destroy(const SherpaOnnxCircularBuffer *p) const; | ||
| 536 | + | ||
| 537 | + void Push(const float *p, int32_t n) const; | ||
| 538 | + | ||
| 539 | + std::vector<float> Get(int32_t start_index, int32_t n) const; | ||
| 540 | + | ||
| 541 | + void Pop(int32_t n) const; | ||
| 542 | + | ||
| 543 | + int32_t Size() const; | ||
| 544 | + | ||
| 545 | + int32_t Head() const; | ||
| 546 | + | ||
| 547 | + void Reset() const; | ||
| 548 | + | ||
| 549 | + private: | ||
| 550 | + explicit CircularBuffer(const SherpaOnnxCircularBuffer *p); | ||
| 551 | +}; | ||
| 552 | + | ||
| 553 | +class SHERPA_ONNX_API VoiceActivityDetector | ||
| 554 | + : public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> { | ||
| 555 | + public: | ||
| 556 | + static VoiceActivityDetector Create(const VadModelConfig &config, | ||
| 557 | + float buffer_size_in_seconds); | ||
| 558 | + | ||
| 559 | + void Destroy(const SherpaOnnxVoiceActivityDetector *p) const; | ||
| 560 | + | ||
| 561 | + void AcceptWaveform(const float *samples, int32_t n) const; | ||
| 562 | + | ||
| 563 | + bool IsEmpty() const; | ||
| 564 | + | ||
| 565 | + bool IsDetected() const; | ||
| 566 | + | ||
| 567 | + void Pop() const; | ||
| 568 | + | ||
| 569 | + void Clear() const; | ||
| 570 | + | ||
| 571 | + SpeechSegment Front() const; | ||
| 572 | + | ||
| 573 | + void Reset() const; | ||
| 574 | + | ||
| 575 | + void Flush() const; | ||
| 576 | + | ||
| 577 | + private: | ||
| 578 | + explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p); | ||
| 579 | +}; | ||
| 580 | + | ||
| 503 | } // namespace sherpa_onnx::cxx | 581 | } // namespace sherpa_onnx::cxx |
| 504 | 582 | ||
| 505 | #endif // SHERPA_ONNX_C_API_CXX_API_H_ | 583 | #endif // SHERPA_ONNX_C_API_CXX_API_H_ |
| @@ -317,11 +317,12 @@ if(SHERPA_ONNX_ENABLE_BINARY) | @@ -317,11 +317,12 @@ if(SHERPA_ONNX_ENABLE_BINARY) | ||
| 317 | add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc) | 317 | add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc) |
| 318 | add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc) | 318 | add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc) |
| 319 | add_executable(sherpa-onnx-offline-audio-tagging sherpa-onnx-offline-audio-tagging.cc) | 319 | add_executable(sherpa-onnx-offline-audio-tagging sherpa-onnx-offline-audio-tagging.cc) |
| 320 | + add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc) | ||
| 320 | add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc) | 321 | add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc) |
| 321 | add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc) | 322 | add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc) |
| 322 | add_executable(sherpa-onnx-offline-punctuation sherpa-onnx-offline-punctuation.cc) | 323 | add_executable(sherpa-onnx-offline-punctuation sherpa-onnx-offline-punctuation.cc) |
| 323 | add_executable(sherpa-onnx-online-punctuation sherpa-onnx-online-punctuation.cc) | 324 | add_executable(sherpa-onnx-online-punctuation sherpa-onnx-online-punctuation.cc) |
| 324 | - add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc) | 325 | + add_executable(sherpa-onnx-vad sherpa-onnx-vad.cc) |
| 325 | 326 | ||
| 326 | if(SHERPA_ONNX_ENABLE_TTS) | 327 | if(SHERPA_ONNX_ENABLE_TTS) |
| 327 | add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc) | 328 | add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc) |
| @@ -336,11 +337,12 @@ if(SHERPA_ONNX_ENABLE_BINARY) | @@ -336,11 +337,12 @@ if(SHERPA_ONNX_ENABLE_BINARY) | ||
| 336 | sherpa-onnx-keyword-spotter | 337 | sherpa-onnx-keyword-spotter |
| 337 | sherpa-onnx-offline | 338 | sherpa-onnx-offline |
| 338 | sherpa-onnx-offline-audio-tagging | 339 | sherpa-onnx-offline-audio-tagging |
| 340 | + sherpa-onnx-offline-denoiser | ||
| 339 | sherpa-onnx-offline-language-identification | 341 | sherpa-onnx-offline-language-identification |
| 340 | sherpa-onnx-offline-parallel | 342 | sherpa-onnx-offline-parallel |
| 341 | sherpa-onnx-offline-punctuation | 343 | sherpa-onnx-offline-punctuation |
| 342 | - sherpa-onnx-offline-denoiser | ||
| 343 | sherpa-onnx-online-punctuation | 344 | sherpa-onnx-online-punctuation |
| 345 | + sherpa-onnx-vad | ||
| 344 | ) | 346 | ) |
| 345 | if(SHERPA_ONNX_ENABLE_TTS) | 347 | if(SHERPA_ONNX_ENABLE_TTS) |
| 346 | list(APPEND main_exes | 348 | list(APPEND main_exes |
| @@ -7,9 +7,9 @@ | @@ -7,9 +7,9 @@ | ||
| 7 | #include <stdlib.h> | 7 | #include <stdlib.h> |
| 8 | 8 | ||
| 9 | #include <algorithm> | 9 | #include <algorithm> |
| 10 | +#include <iomanip> | ||
| 10 | 11 | ||
| 11 | #include "sherpa-onnx/csrc/alsa.h" | 12 | #include "sherpa-onnx/csrc/alsa.h" |
| 12 | -#include "sherpa-onnx/csrc/circular-buffer.h" | ||
| 13 | #include "sherpa-onnx/csrc/voice-activity-detector.h" | 13 | #include "sherpa-onnx/csrc/voice-activity-detector.h" |
| 14 | #include "sherpa-onnx/csrc/wave-writer.h" | 14 | #include "sherpa-onnx/csrc/wave-writer.h" |
| 15 | 15 | ||
| @@ -84,8 +84,6 @@ as the device_name. | @@ -84,8 +84,6 @@ as the device_name. | ||
| 84 | exit(-1); | 84 | exit(-1); |
| 85 | } | 85 | } |
| 86 | 86 | ||
| 87 | - int32_t chunk = 0.1 * alsa.GetActualSampleRate(); | ||
| 88 | - | ||
| 89 | auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config); | 87 | auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config); |
| 90 | 88 | ||
| 91 | fprintf(stderr, "Started. Please speak\n"); | 89 | fprintf(stderr, "Started. Please speak\n"); |
| @@ -95,36 +93,34 @@ as the device_name. | @@ -95,36 +93,34 @@ as the device_name. | ||
| 95 | 93 | ||
| 96 | int32_t k = 0; | 94 | int32_t k = 0; |
| 97 | while (!stop) { | 95 | while (!stop) { |
| 98 | - { | ||
| 99 | - const std::vector<float> &samples = alsa.Read(chunk); | ||
| 100 | - | ||
| 101 | - vad->AcceptWaveform(samples.data(), samples.size()); | ||
| 102 | - | ||
| 103 | - if (vad->IsSpeechDetected() && !printed) { | ||
| 104 | - printed = true; | ||
| 105 | - fprintf(stderr, "\nDetected speech!\n"); | ||
| 106 | - } | ||
| 107 | - if (!vad->IsSpeechDetected()) { | ||
| 108 | - printed = false; | ||
| 109 | - } | ||
| 110 | - | ||
| 111 | - while (!vad->Empty()) { | ||
| 112 | - const auto &segment = vad->Front(); | ||
| 113 | - float duration = | ||
| 114 | - segment.samples.size() / static_cast<float>(sample_rate); | ||
| 115 | - | ||
| 116 | - fprintf(stderr, "Duration: %.3f seconds\n", duration); | ||
| 117 | - | ||
| 118 | - char filename[128]; | ||
| 119 | - snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration); | ||
| 120 | - k += 1; | ||
| 121 | - sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(), | ||
| 122 | - segment.samples.size()); | ||
| 123 | - fprintf(stderr, "Saved to %s\n", filename); | ||
| 124 | - fprintf(stderr, "----------\n"); | ||
| 125 | - | ||
| 126 | - vad->Pop(); | ||
| 127 | - } | 96 | + const std::vector<float> &samples = alsa.Read(window_size); |
| 97 | + | ||
| 98 | + vad->AcceptWaveform(samples.data(), samples.size()); | ||
| 99 | + | ||
| 100 | + if (vad->IsSpeechDetected() && !printed) { | ||
| 101 | + printed = true; | ||
| 102 | + fprintf(stderr, "\nDetected speech!\n"); | ||
| 103 | + } | ||
| 104 | + if (!vad->IsSpeechDetected()) { | ||
| 105 | + printed = false; | ||
| 106 | + } | ||
| 107 | + | ||
| 108 | + while (!vad->Empty()) { | ||
| 109 | + const auto &segment = vad->Front(); | ||
| 110 | + float duration = segment.samples.size() / static_cast<float>(sample_rate); | ||
| 111 | + | ||
| 112 | + fprintf(stderr, "Duration: %.3f seconds\n", duration); | ||
| 113 | + | ||
| 114 | + std::ostringstream os; | ||
| 115 | + os << "seg-" << k << "-" << std::fixed << std::setprecision(3) << duration | ||
| 116 | + << "s.wav"; | ||
| 117 | + k += 1; | ||
| 118 | + sherpa_onnx::WriteWave(os.str(), 16000, segment.samples.data(), | ||
| 119 | + segment.samples.size()); | ||
| 120 | + fprintf(stderr, "Saved to %s\n", os.str().c_str()); | ||
| 121 | + fprintf(stderr, "----------\n"); | ||
| 122 | + | ||
| 123 | + vad->Pop(); | ||
| 128 | } | 124 | } |
| 129 | } | 125 | } |
| 130 | 126 |
sherpa-onnx/csrc/sherpa-onnx-vad.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-vad.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <stdio.h> | ||
| 6 | +#include <stdlib.h> | ||
| 7 | + | ||
| 8 | +#include <algorithm> | ||
| 9 | +#include <iomanip> | ||
| 10 | + | ||
| 11 | +#include "sherpa-onnx/csrc/voice-activity-detector.h" | ||
| 12 | +#include "sherpa-onnx/csrc/wave-reader.h" | ||
| 13 | +#include "sherpa-onnx/csrc/wave-writer.h" | ||
| 14 | + | ||
| 15 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 16 | + const char *kUsageMessage = R"usage( | ||
| 17 | +This program shows how to use VAD in sherpa-onnx | ||
| 18 | +to remove silences from a file. | ||
| 19 | + | ||
| 20 | + ./bin/sherpa-onnx-vad \ | ||
| 21 | + --silero-vad-model=/path/to/silero_vad.onnx \ | ||
| 22 | + /path/to/input.wav | ||
| 23 | + /path/to/output.wav | ||
| 24 | + | ||
| 25 | +Please download silero_vad.onnx from | ||
| 26 | +https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx | ||
| 27 | + | ||
| 28 | +For instance, use | ||
| 29 | +wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx | ||
| 30 | + | ||
| 31 | +input.wav should be 16kHz. | ||
| 32 | +)usage"; | ||
| 33 | + | ||
| 34 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 35 | + sherpa_onnx::VadModelConfig config; | ||
| 36 | + | ||
| 37 | + config.Register(&po); | ||
| 38 | + po.Read(argc, argv); | ||
| 39 | + if (po.NumArgs() != 2) { | ||
| 40 | + fprintf( | ||
| 41 | + stderr, | ||
| 42 | + "Please provide only 2 argument2: the input wav and the output wav\n"); | ||
| 43 | + po.PrintUsage(); | ||
| 44 | + exit(EXIT_FAILURE); | ||
| 45 | + } | ||
| 46 | + | ||
| 47 | + fprintf(stderr, "%s\n", config.ToString().c_str()); | ||
| 48 | + | ||
| 49 | + if (!config.Validate()) { | ||
| 50 | + fprintf(stderr, "Errors in config!\n"); | ||
| 51 | + return -1; | ||
| 52 | + } | ||
| 53 | + | ||
| 54 | + std::string wav_filename = po.GetArg(1); | ||
| 55 | + int32_t sampling_rate = -1; | ||
| 56 | + | ||
| 57 | + bool is_ok = false; | ||
| 58 | + std::vector<float> samples = | ||
| 59 | + sherpa_onnx::ReadWave(wav_filename, &sampling_rate, &is_ok); | ||
| 60 | + | ||
| 61 | + if (!is_ok) { | ||
| 62 | + fprintf(stderr, "Failed to read '%s'\n", wav_filename.c_str()); | ||
| 63 | + return -1; | ||
| 64 | + } | ||
| 65 | + | ||
| 66 | + if (sampling_rate != 16000) { | ||
| 67 | + fprintf(stderr, "Support only 16000Hz. Given: %d\n", sampling_rate); | ||
| 68 | + return -1; | ||
| 69 | + } | ||
| 70 | + | ||
| 71 | + auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config); | ||
| 72 | + | ||
| 73 | + int32_t window_size = config.silero_vad.window_size; | ||
| 74 | + | ||
| 75 | + int32_t i = 0; | ||
| 76 | + bool is_eof = false; | ||
| 77 | + | ||
| 78 | + std::vector<float> samples_without_silence; | ||
| 79 | + | ||
| 80 | + while (!is_eof) { | ||
| 81 | + if (i + window_size < samples.size()) { | ||
| 82 | + vad->AcceptWaveform(samples.data() + i, window_size); | ||
| 83 | + i += window_size; | ||
| 84 | + } else { | ||
| 85 | + vad->Flush(); | ||
| 86 | + is_eof = true; | ||
| 87 | + } | ||
| 88 | + | ||
| 89 | + while (!vad->Empty()) { | ||
| 90 | + const auto &segment = vad->Front(); | ||
| 91 | + float start_time = segment.start / static_cast<float>(sampling_rate); | ||
| 92 | + float end_time = start_time + segment.samples.size() / | ||
| 93 | + static_cast<float>(sampling_rate); | ||
| 94 | + | ||
| 95 | + fprintf(stderr, "%.3f -- %.3f\n", start_time, end_time); | ||
| 96 | + samples_without_silence.insert(samples_without_silence.end(), | ||
| 97 | + segment.samples.begin(), | ||
| 98 | + segment.samples.end()); | ||
| 99 | + vad->Pop(); | ||
| 100 | + } | ||
| 101 | + } | ||
| 102 | + | ||
| 103 | + sherpa_onnx::WriteWave(po.GetArg(2), sampling_rate, | ||
| 104 | + samples_without_silence.data(), | ||
| 105 | + samples_without_silence.size()); | ||
| 106 | + | ||
| 107 | + fprintf(stderr, "Saved to %s\n", po.GetArg(2).c_str()); | ||
| 108 | + | ||
| 109 | + return 0; | ||
| 110 | +} |
-
请 注册 或 登录 后发表评论