Pascal API for VAD (#1249)

Fangjun Kuang · GitHub
Commit 619279b162288487211e9ee41f0a4ced0436b428 619279b1 1 parent a7dc6c2c
.github/workflows/pascal.yaml
pascal-api-examples/README.md
pascal-api-examples/non-streaming-asr/nemo_ctc.pas
pascal-api-examples/non-streaming-asr/nemo_transducer.pas
pascal-api-examples/non-streaming-asr/paraformer.pas
pascal-api-examples/non-streaming-asr/paraformer_itn.pas
pascal-api-examples/non-streaming-asr/sense_voice.pas
pascal-api-examples/non-streaming-asr/telespeech_ctc.pas
pascal-api-examples/non-streaming-asr/whisper.pas
pascal-api-examples/non-streaming-asr/zipformer_transducer.pas
pascal-api-examples/vad-with-non-streaming-asr/.gitignore
pascal-api-examples/vad-with-non-streaming-asr/README.md
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh
pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas
pascal-api-examples/vad-with-non-streaming-asr/vad_with_whisper.pas
pascal-api-examples/vad/.gitignore
pascal-api-examples/vad/README.md
pascal-api-examples/vad/circular_buffer.pas
pascal-api-examples/vad/remove_silence.pas
--- a/.github/workflows/pascal.yaml
查看文件 @619279b
+++ b/.github/workflows/pascal.yaml
查看文件 @619279b
@@ -116,12 +116,54 @@ jobs:
             cp -v install/lib/*.dll ../pascal-api-examples/read-wav
             cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
             cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
+            cp -v install/lib/*.dll ../pascal-api-examples/vad
+            cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
             cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
+            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
+            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
           fi
+      # Builds and runs the two VAD + non-streaming ASR Pascal examples.
+      - name:  Run Pascal test (VAD + non-streaming ASR)
+        shell: bash
+        run: |
+          export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
+
+          cd ./pascal-api-examples
+
+          pushd vad-with-non-streaming-asr
+          time ./run-vad-with-whisper.sh
+          # Remove only the downloaded model directory. A plain `sherpa-onnx-*`
+          # glob would also match sherpa-onnx-c-api.dll, which an earlier step
+          # copies into this directory on Windows and the next example needs.
+          rm -rf sherpa-onnx-whisper-*
+          echo "---"
+
+          time ./run-vad-with-sense-voice.sh
+          rm -rf sherpa-onnx-sense-voice-*
+          echo "---"
+
+          ls -lh
+
+          popd
+
+      # Builds and runs the Pascal VAD examples (circular buffer and silence removal).
+      - name:  Run Pascal test (VAD test)
+        shell: bash
+        run: |
+          # Put the Free Pascal 3.2.2 win64 binaries first on PATH.
+          # NOTE(review): this is a Windows (Git Bash) style path; presumably it is
+          # simply ignored on other runners -- confirm.
+          export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
+
+          cd ./pascal-api-examples
+
+          pushd vad
+          ./run-circular-buffer.sh
+          echo "---"
+
+          time ./run-remove-silence.sh
+          echo "---"
+
+          # Show what is in the example directory after both runs.
+          ls -lh
+
+          popd
+
       - name:  Run Pascal test (Read wav test)
         shell: bash
         run: |
--- a/pascal-api-examples/README.md
查看文件 @619279b
+++ b/pascal-api-examples/README.md
查看文件 @619279b
@@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
 |[read-wav](./read-wav)|It shows how to read a wave file.|
 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
+|[vad](./vad)| It shows how to use the voice activity detection API.|
+|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
--- a/pascal-api-examples/non-streaming-asr/nemo_ctc.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/nemo_ctc.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';
--- a/pascal-api-examples/non-streaming-asr/nemo_transducer.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/nemo_transducer.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
   Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
   Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
--- a/pascal-api-examples/non-streaming-asr/paraformer.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/paraformer.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';
--- a/pascal-api-examples/non-streaming-asr/paraformer_itn.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/paraformer_itn.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';
--- a/pascal-api-examples/non-streaming-asr/sense_voice.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/sense_voice.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
   Config.ModelConfig.SenseVoice.Language := 'auto';
   Config.ModelConfig.SenseVoice.UseItn := False;
--- a/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
   Config.ModelConfig.Provider := 'cpu';
--- a/pascal-api-examples/non-streaming-asr/whisper.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/whisper.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
   Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
   Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
--- a/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas
查看文件 @619279b
+++ b/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas
查看文件 @619279b
@@ -33,6 +33,8 @@ var
   Duration: Single;
   RealTimeFactor: Single;
 begin
+  Initialize(Config);
+
   Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
   Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
   Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';
--- a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore 0 → 100644
查看文件 @619279b
+!run-*.sh
+vad_with_whisper
+vad_with_sense_voice
--- a/pascal-api-examples/vad-with-non-streaming-asr/README.md 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad-with-non-streaming-asr/README.md 0 → 100644
查看文件 @619279b
+# Introduction
+
+
+This directory contains examples that show how to use VAD (voice activity detection)
+together with non-streaming speech recognition models.
+
+|Script| Description|
+|---------|------------|
+|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
+|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
+
+Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
--- a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh 0 → 100755
查看文件 @619279b
+++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh 0 → 100755
查看文件 @619279b
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+fi
+
+fpc \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./vad_with_sense_voice.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./vad_with_sense_voice
--- a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh 0 → 100755
查看文件 @619279b
+++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh 0 → 100755
查看文件 @619279b
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./Obama.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+fi
+
+if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+
+  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+  rm sherpa-onnx-whisper-tiny.en.tar.bz2
+fi
+
+fpc \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./vad_with_whisper.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./vad_with_whisper
--- a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas 0 → 100644
查看文件 @619279b
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming SenseVoice model
+with silero VAD to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program vad_with_sense_voice;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  SysUtils;
+
+{ Creates a voice activity detector backed by ./silero_vad.onnx,
+  running on the CPU with one thread and debug output enabled. }
+function CreateVad(): TSherpaOnnxVoiceActivityDetector;
+const
+  {Please don't change these two values unless you know the details}
+  ExpectedSampleRate = 16000;
+  SamplesPerWindow = 512;
+var
+  VadConfig: TSherpaOnnxVadModelConfig;
+begin
+  Initialize(VadConfig);
+
+  { General settings }
+  VadConfig.SampleRate := ExpectedSampleRate;
+  VadConfig.Provider := 'cpu';
+  VadConfig.NumThreads := 1;
+  VadConfig.Debug := True;
+
+  { Silero VAD specific settings }
+  VadConfig.SileroVad.Model := './silero_vad.onnx';
+  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
+  VadConfig.SileroVad.Threshold := 0.5;
+  VadConfig.SileroVad.MinSpeechDuration := 0.5;
+  VadConfig.SileroVad.MinSilenceDuration := 0.5;
+
+  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
+end;
+
+{ Creates a non-streaming recognizer that uses the int8 SenseVoice model
+  with automatic language selection and inverse text normalization off. }
+function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
+var
+  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
+begin
+  Initialize(RecognizerConfig);
+
+  { Settings shared by all model types }
+  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt';
+  RecognizerConfig.ModelConfig.Provider := 'cpu';
+  RecognizerConfig.ModelConfig.NumThreads := 1;
+  RecognizerConfig.ModelConfig.Debug := False;
+
+  { SenseVoice specific settings }
+  RecognizerConfig.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
+  RecognizerConfig.ModelConfig.SenseVoice.Language := 'auto';
+  RecognizerConfig.ModelConfig.SenseVoice.UseItn := False;
+
+  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
+end;
+
+var
+  Wave: TSherpaOnnxWave;
+
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Vad: TSherpaOnnxVoiceActivityDetector;
+
+  Offset: Integer;
+  WindowSize: Integer;
+
+{ Pops every speech segment currently queued in Vad, decodes it with
+  Recognizer and prints one line per segment:
+  start time -- end time (in seconds), followed by the recognized text. }
+procedure DecodePendingSegments;
+var
+  SpeechSegment: TSherpaOnnxSpeechSegment;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+  Start: Single;
+  Duration: Single;
+begin
+  while not Vad.IsEmpty do
+    begin
+      SpeechSegment := Vad.Front();
+      Vad.Pop();
+
+      Stream := Recognizer.CreateStream();
+      try
+        Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
+        Recognizer.Decode(Stream);
+        RecognitionResult := Recognizer.GetResult(Stream);
+
+        { SpeechSegment.Start is a sample index; convert it to seconds }
+        Start := SpeechSegment.Start / Wave.SampleRate;
+        Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
+        WriteLn(Format('%.3f -- %.3f %s',
+          [Start, Start + Duration, RecognitionResult.Text]));
+      finally
+        { Free the stream even if decoding raises }
+        FreeAndNil(Stream);
+      end;
+    end;
+end;
+
+begin
+  Vad := CreateVad();
+  Recognizer := CreateOfflineRecognizer();
+
+  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
+  if Wave.SampleRate <> Vad.Config.SampleRate then
+    begin
+      WriteLn(Format('Expected sample rate: %d. Given: %d',
+        [Vad.Config.SampleRate, Wave.SampleRate]));
+
+      { Release the recognizer and the detector before leaving early }
+      FreeAndNil(Recognizer);
+      FreeAndNil(Vad);
+      Exit;
+    end;
+
+  { Feed the wave to the VAD one full window at a time. Samples after the
+    last full window are not passed to AcceptWaveform. }
+  WindowSize := Vad.Config.SileroVad.WindowSize;
+  Offset := 0;
+  while Offset + WindowSize <= Length(Wave.Samples) do
+    begin
+      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
+      Inc(Offset, WindowSize);
+
+      DecodePendingSegments;
+    end;
+
+  { Flush the detector, then decode whatever it queued as a result }
+  Vad.Flush;
+  DecodePendingSegments;
+
+  FreeAndNil(Recognizer);
+  FreeAndNil(Vad);
+end.
--- a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_whisper.pas 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_whisper.pas 0 → 100644
查看文件 @619279b
+{ Copyright (c)  2024  Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming Whisper model
+with silero VAD to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program vad_with_whisper;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  SysUtils;
+
+{ Creates a voice activity detector backed by ./silero_vad.onnx,
+  running on the CPU with one thread and debug output enabled. }
+function CreateVad(): TSherpaOnnxVoiceActivityDetector;
+const
+  {Please don't change these two values unless you know the details}
+  ExpectedSampleRate = 16000;
+  SamplesPerWindow = 512;
+var
+  VadConfig: TSherpaOnnxVadModelConfig;
+begin
+  Initialize(VadConfig);
+
+  { General settings }
+  VadConfig.SampleRate := ExpectedSampleRate;
+  VadConfig.Provider := 'cpu';
+  VadConfig.NumThreads := 1;
+  VadConfig.Debug := True;
+
+  { Silero VAD specific settings }
+  VadConfig.SileroVad.Model := './silero_vad.onnx';
+  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
+  VadConfig.SileroVad.Threshold := 0.5;
+  VadConfig.SileroVad.MinSpeechDuration := 0.5;
+  VadConfig.SileroVad.MinSilenceDuration := 0.5;
+
+  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
+end;
+
+{ Creates a non-streaming recognizer that uses the int8 Whisper tiny.en
+  encoder and decoder on the CPU with one thread. }
+function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
+var
+  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
+begin
+  Initialize(RecognizerConfig);
+
+  { Settings shared by all model types }
+  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
+  RecognizerConfig.ModelConfig.Provider := 'cpu';
+  RecognizerConfig.ModelConfig.NumThreads := 1;
+  RecognizerConfig.ModelConfig.Debug := False;
+
+  { Whisper specific settings }
+  RecognizerConfig.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
+  RecognizerConfig.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
+
+  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
+end;
+
+var
+  Wave: TSherpaOnnxWave;
+
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Vad: TSherpaOnnxVoiceActivityDetector;
+
+  Offset: Integer;
+  WindowSize: Integer;
+
+{ Pops every speech segment currently queued in Vad, decodes it with
+  Recognizer and prints one line per segment:
+  start time -- end time (in seconds), followed by the recognized text. }
+procedure DecodePendingSegments;
+var
+  SpeechSegment: TSherpaOnnxSpeechSegment;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+  Start: Single;
+  Duration: Single;
+begin
+  while not Vad.IsEmpty do
+    begin
+      SpeechSegment := Vad.Front();
+      Vad.Pop();
+
+      Stream := Recognizer.CreateStream();
+      try
+        Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
+        Recognizer.Decode(Stream);
+        RecognitionResult := Recognizer.GetResult(Stream);
+
+        { SpeechSegment.Start is a sample index; convert it to seconds }
+        Start := SpeechSegment.Start / Wave.SampleRate;
+        Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
+        WriteLn(Format('%.3f -- %.3f %s',
+          [Start, Start + Duration, RecognitionResult.Text]));
+      finally
+        { Free the stream even if decoding raises }
+        FreeAndNil(Stream);
+      end;
+    end;
+end;
+
+begin
+  Vad := CreateVad();
+  Recognizer := CreateOfflineRecognizer();
+
+  Wave := SherpaOnnxReadWave('./Obama.wav');
+  if Wave.SampleRate <> Vad.Config.SampleRate then
+    begin
+      WriteLn(Format('Expected sample rate: %d. Given: %d',
+        [Vad.Config.SampleRate, Wave.SampleRate]));
+
+      { Release the recognizer and the detector before leaving early }
+      FreeAndNil(Recognizer);
+      FreeAndNil(Vad);
+      Exit;
+    end;
+
+  { Feed the wave to the VAD one full window at a time. Samples after the
+    last full window are not passed to AcceptWaveform. }
+  WindowSize := Vad.Config.SileroVad.WindowSize;
+  Offset := 0;
+  while Offset + WindowSize <= Length(Wave.Samples) do
+    begin
+      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
+      Inc(Offset, WindowSize);
+
+      DecodePendingSegments;
+    end;
+
+  { Flush the detector, then decode whatever it queued as a result }
+  Vad.Flush;
+  DecodePendingSegments;
+
+  FreeAndNil(Recognizer);
+  FreeAndNil(Vad);
+end.
--- a/pascal-api-examples/vad/.gitignore 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad/.gitignore 0 → 100644
查看文件 @619279b
+!run*.sh
+circular_buffer
+remove_silence
--- a/pascal-api-examples/vad/README.md 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad/README.md 0 → 100644
查看文件 @619279b
+# Introduction
+
+
+This directory contains examples that show how to use the VAD (voice activity detection)
+APIs.
+
+|Script| Description|
+|---------|------------|
+|[run-circular-buffer.sh](./run-circular-buffer.sh)|It shows how to use the circular buffer API.|
+|[run-remove-silence.sh](./run-remove-silence.sh)|It shows how to use the VAD API to remove silences from a wave file.|
+
--- a/pascal-api-examples/vad/circular_buffer.pas 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad/circular_buffer.pas 0 → 100644
查看文件 @619279b
+{ Copyright (c)  2024  Xiaomi Corporation }
+program circular_buffer;
+{
+This file shows how to use the CircularBuffer API of sherpa-onnx
+}
+
+{$mode objfpc}
+{$ASSERTIONS ON}
+
+uses
+  sherpa_onnx;
+
+var
+  Buffer: TSherpaOnnxCircularBuffer;
+  Samples: TSherpaOnnxSamplesArray;
+begin
+  { Walks through Push/Get/Pop/Reset and asserts the expected Size and Head
+    after every step. Assertions are enabled by the ASSERTIONS directive
+    at the top of this file. }
+
+  {The initial capacity is 5. It will be resized automatically if needed.}
+  Buffer := TSherpaOnnxCircularBuffer.Create(5);
+  Assert(Buffer.Size = 0);
+  Assert(Buffer.Head = 0);
+  Buffer.Push([0, 10, 20]);
+
+  {Push() changes Size. Head is not changed.}
+  Assert(Buffer.Size = 3);
+  Assert(Buffer.Head = 0);
+
+  Samples := Buffer.Get(0, 1);
+  Assert(Length(Samples) = 1);
+  Assert(Samples[0] = 0);
+
+  { Get() does not change Size or Head}
+  Assert(Buffer.Size = 3);
+  Assert(Buffer.Head = 0);
+
+  Samples := Buffer.Get(0, 2);
+  Assert(Length(Samples) = 2);
+  Assert(Samples[0] = 0);
+  Assert(Samples[1] = 10);
+
+  { The buffer will be resized since its initial capacity is 5 but we have
+    pushed 7 elements into it.
+
+    No data is lost during the resize.
+  }
+  Buffer.Push([30, 40, 50, 60]);
+
+  Assert(Buffer.Size = 7); {There are now 7 elements}
+  Assert(Buffer.Head = 0);
+
+  {Remove the first 4 elements}
+  Buffer.Pop(4);
+
+  Assert(Buffer.Size = 3); {There are only 3 elements left}
+  Assert(Buffer.Head = 4);
+
+  Samples := Buffer.Get(Buffer.Head, 2);
+  Assert(Length(Samples) = 2);
+  Assert(Samples[0] = 40);
+  Assert(Samples[1] = 50);
+
+  Buffer.Pop(1);
+
+  Assert(Buffer.Size = 2); {There are only 2 elements left}
+  Assert(Buffer.Head = 5);
+
+  Samples := Buffer.Get(Buffer.Head, 2);
+  Assert(Length(Samples) = 2);
+  Assert(Samples[0] = 50);
+  Assert(Samples[1] = 60);
+
+  Buffer.Pop(2);
+  Assert(Buffer.Size = 0); {There are no elements left}
+  Assert(Buffer.Head = 7);
+
+  Buffer.Push([100, 200, 300, 400, 500]);
+  Assert(Buffer.Size = 5);
+  Assert(Buffer.Head = 7);
+
+  Buffer.Pop(4);
+  Assert(Buffer.Size = 1);
+
+  {Head can be larger than the Capacity!
+   This is what circular means. Head keeps growing; the slot it refers to
+   is Buffer.Head mod Capacity (a remainder, not a quotient).
+  }
+  Assert(Buffer.Head = 11);
+  Buffer.Push([600, 700]);
+
+  Assert(Buffer.Size = 3);
+  Assert(Buffer.Head = 11);
+
+  Samples := Buffer.Get(Buffer.Head, 3);
+  Assert(Length(Samples) = 3);
+  Assert(Samples[0] = 500);
+  Assert(Samples[1] = 600);
+  Assert(Samples[2] = 700);
+
+  Buffer.Pop(3);
+  Assert(Buffer.Size = 0);
+  Assert(Buffer.Head = 14);
+
+  {Reset() sets both Size and Head back to 0}
+  Buffer.Reset();
+
+  Assert(Buffer.Size = 0);
+  Assert(Buffer.Head = 0);
+
+  {Release the buffer object. Free is used because SysUtils (FreeAndNil)
+   is not in the uses clause of this program.}
+  Buffer.Free;
+end.
+
--- a/pascal-api-examples/vad/remove_silence.pas 0 → 100644
查看文件 @619279b
+++ b/pascal-api-examples/vad/remove_silence.pas 0 → 100644
查看文件 @619279b
+{ Copyright (c)  2024  Xiaomi Corporation }
+{
+This file shows how to use the VAD API from sherpa-onnx
+to remove silences from a wave file.
+}
+program main;
+
+{$mode delphi}
+
+uses
+  sherpa_onnx,
+  SysUtils;
+
+var
+  Wave: TSherpaOnnxWave;
+  Config: TSherpaOnnxVadModelConfig;
+  Vad: TSherpaOnnxVoiceActivityDetector;
+
+  SampleRate: Integer;
+  WindowSize: Integer;
+  Offset: Integer;
+
+  SpeechSegment: TSherpaOnnxSpeechSegment;
+  Start: Single;
+  Duration: Single;
+
+  AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
+  AllSamples: array of Single;
+  N: Integer;
+  I: Integer;
+  K: Integer;
+begin
+  {Please don't change these two values unless you know the details}
+  SampleRate := 16000;
+  WindowSize := 512;
+
+  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
+  if Wave.SampleRate <> SampleRate then
+    begin
+      WriteLn(Format('Expected sample rate: %d. Given: %d',
+        [SampleRate, Wave.SampleRate]));
+
+      Exit;
+    end;
+
+  Initialize(Config);
+
+  { General settings }
+  Config.SampleRate := SampleRate;
+  Config.Provider := 'cpu';
+  Config.NumThreads := 1;
+  Config.Debug := True;
+
+  { Silero VAD specific settings }
+  Config.SileroVad.Model := './silero_vad.onnx';
+  Config.SileroVad.WindowSize := WindowSize;
+  Config.SileroVad.Threshold := 0.5;
+  Config.SileroVad.MinSpeechDuration := 0.25;
+  Config.SileroVad.MinSilenceDuration := 0.5;
+
+  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
+
+  AllSpeechSegment := nil;
+  AllSamples := nil;
+
+  { Feed the wave one full window at a time and collect every detected
+    speech segment. }
+  Offset := 0;
+  while Length(Wave.Samples) - Offset >= WindowSize do
+    begin
+      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
+      Offset := Offset + WindowSize;
+
+      while not Vad.IsEmpty do
+        begin
+          SpeechSegment := Vad.Front();
+          Vad.Pop();
+
+          SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
+          AllSpeechSegment[High(AllSpeechSegment)] := SpeechSegment;
+
+          Start := SpeechSegment.Start / SampleRate;
+          Duration := Length(SpeechSegment.Samples) / SampleRate;
+          WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
+        end;
+    end;
+
+  { Flush the detector and collect the segments it queues as a result }
+  Vad.Flush;
+
+  while not Vad.IsEmpty do
+    begin
+      SpeechSegment := Vad.Front();
+      Vad.Pop();
+
+      SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
+      AllSpeechSegment[High(AllSpeechSegment)] := SpeechSegment;
+
+      Start := SpeechSegment.Start / SampleRate;
+      Duration := Length(SpeechSegment.Samples) / SampleRate;
+      WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
+    end;
+
+  { First pass: total number of speech samples }
+  N := 0;
+  for K := 0 to Length(AllSpeechSegment) - 1 do
+    N := N + Length(AllSpeechSegment[K].Samples);
+
+  SetLength(AllSamples, N);
+
+  { Second pass: concatenate the segments in detection order }
+  N := 0;
+  for K := 0 to Length(AllSpeechSegment) - 1 do
+    for I := 0 to Length(AllSpeechSegment[K].Samples) - 1 do
+      begin
+        AllSamples[N] := AllSpeechSegment[K].Samples[I];
+        N := N + 1;
+      end;
+
+  SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate);
+  WriteLn('Saved to ./lei-jun-test-no-silence.wav');
+
+  FreeAndNil(Vad);
+end.
--- a/pascal-api-examples/vad/run-circular-buffer.sh 0 → 100755
查看文件 @619279b
+++ b/pascal-api-examples/vad/run-circular-buffer.sh 0 → 100755
查看文件 @619279b
+#!/usr/bin/env bash
+#
+# Builds the sherpa-onnx C API shared library if it is not installed yet,
+# then compiles and runs circular_buffer.pas.
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
+LIB_DIR=$SHERPA_ONNX_DIR/build/install/lib
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+# The .pas source and the produced binary are addressed relative to this
+# script's directory, so switch to it instead of relying on the caller's
+# working directory.
+cd "$SCRIPT_DIR"
+
+# Build and install the shared library only when none of the platform
+# specific variants is present.
+if [[ ! -f $LIB_DIR/libsherpa-onnx-c-api.dylib && ! -f $LIB_DIR/libsherpa-onnx-c-api.so && ! -f $LIB_DIR/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p "$SHERPA_ONNX_DIR/build"
+  pushd "$SHERPA_ONNX_DIR/build"
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+fpc \
+  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
+  -Fl"$LIB_DIR" \
+  ./circular_buffer.pas
+
+# Append the previous value only when it is non-empty; a trailing ':' would
+# add the current directory to the library search path.
+export LD_LIBRARY_PATH=$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+export DYLD_LIBRARY_PATH=$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}
+
+./circular_buffer
--- a/pascal-api-examples/vad/run-remove-silence.sh 0 → 100755
查看文件 @619279b
+++ b/pascal-api-examples/vad/run-remove-silence.sh 0 → 100755
查看文件 @619279b
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+fpc \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./remove_silence.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./remove_silence
--- a/sherpa-onnx/csrc/circular-buffer.cc
查看文件 @619279b
+++ b/sherpa-onnx/csrc/circular-buffer.cc
查看文件 @619279b
@@ -95,6 +95,8 @@ void CircularBuffer::Push(const float *p, int32_t n) {
         "capacity to: %d",
         n, size, n + size, capacity, new_capacity);
     Resize(new_capacity);
+
+    capacity = new_capacity;
   }
   int32_t start = tail_ % capacity;
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
查看文件 @619279b
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
查看文件 @619279b
@@ -2,9 +2,11 @@
 unit sherpa_onnx;
-{$mode objfpc}
+{$IFDEF FPC}
+  {$mode objfpc}
+  {$modeSwitch advancedRecords} { to support records with methods }
+{$ENDIF}
-{$modeSwitch advancedRecords} { to support records with methods }
 (* {$LongStrings ON} *)
 interface
@@ -45,18 +47,21 @@ type
     ModelingUnit: AnsiString;
     BpeVocab: AnsiString;
     function ToString: AnsiString;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
   end;
   TSherpaOnnxFeatureConfig = record
     SampleRate: Integer;
     FeatureDim: Integer;
     function ToString: AnsiString;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
   end;
   TSherpaOnnxOnlineCtcFstDecoderConfig = record
     Graph: AnsiString;
     MaxActive: Integer;
     function ToString: AnsiString;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
   end;
   TSherpaOnnxOnlineRecognizerConfig = record
@@ -75,6 +80,7 @@ type
     RuleFars: AnsiString;
     BlankPenalty: Single;
     function ToString: AnsiString;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
   end;
   TSherpaOnnxOnlineRecognizerResult = record
@@ -97,6 +103,7 @@ type
   TSherpaOnnxOnlineRecognizer = class
   private
    Handle: Pointer;
+   _Config: TSherpaOnnxOnlineRecognizerConfig;
   public
     constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig);
     destructor Destroy; override;
@@ -108,6 +115,7 @@ type
     procedure Reset(Stream: TSherpaOnnxOnlineStream);
     function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
     function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
+    property Config: TSherpaOnnxOnlineRecognizerConfig Read _Config;
   end;
   TSherpaOnnxOfflineTransducerModelConfig = record
@@ -134,6 +142,7 @@ type
     Task: AnsiString;
     TailPaddings: Integer;
     function ToString: AnsiString;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
   end;
   TSherpaOnnxOfflineTdnnModelConfig = record
@@ -145,12 +154,14 @@ type
     Model: AnsiString;
     Scale: Single;
     function ToString: AnsiString;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
   end;
  TSherpaOnnxOfflineSenseVoiceModelConfig = record
     Model: AnsiString;
     Language: AnsiString;
     UseItn: Boolean;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig); { default: UseItn = True }
     function ToString: AnsiString;
   end;
@@ -169,6 +180,7 @@ type
     BpeVocab: AnsiString;
     TeleSpeechCtc: AnsiString;
     SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
     function ToString: AnsiString;
   end;
@@ -183,6 +195,7 @@ type
     RuleFsts: AnsiString;
     RuleFars: AnsiString;
     BlankPenalty: Single;
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
     function ToString: AnsiString;
   end;
@@ -205,18 +218,83 @@ type
  TSherpaOnnxOfflineRecognizer = class
   private
    Handle: Pointer;
+   _Config: TSherpaOnnxOfflineRecognizerConfig; { copy of the Config argument, stored by Create }
   public
     constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig);
     destructor Destroy; override;
     function CreateStream: TSherpaOnnxOfflineStream;
     procedure Decode(Stream: TSherpaOnnxOfflineStream);
     function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
+    property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config; { read-only view of _Config }
   end;
-{ It supports reading a single channel wave with 16-bit encoded samples.
-  Samples are normalized to the range [-1, 1].
-}
-function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
+  TSherpaOnnxSileroVadModelConfig = record { Pascal-side settings copied into the C-side SherpaOnnxSileroVadModelConfig }
+    Model: AnsiString; { passed to the native library as PAnsiChar; no default is assigned }
+    Threshold: Single; { default 0.5 }
+    MinSilenceDuration: Single; { default 0.5 - NOTE(review): presumably seconds, confirm }
+    MinSpeechDuration: Single; { default 0.25 - NOTE(review): presumably seconds, confirm }
+    WindowSize: Integer; { default 512 }
+    function ToString: AnsiString; { formats all five fields for display }
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); { assigns the defaults listed above }
+  end;
+
+  TSherpaOnnxVadModelConfig = record { Pascal-side settings copied into the C-side SherpaOnnxVadModelConfig }
+    SileroVad: TSherpaOnnxSileroVadModelConfig; { nested Silero settings; not assigned by this record's Initialize }
+    SampleRate: Integer; { default 16000 }
+    NumThreads: Integer; { default 1 }
+    Provider: AnsiString; { default 'cpu' }
+    Debug: Boolean; { default False; handed to the native side as Ord(Debug) }
+    function ToString: AnsiString; { formats all fields, including SileroVad.ToString }
+    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig); { assigns the defaults listed above }
+  end;
+
+  TSherpaOnnxSamplesArray = array of Single; { dynamic array of samples; result type of TSherpaOnnxCircularBuffer.Get }
+
+  TSherpaOnnxCircularBuffer = class { wrapper around the native circular buffer functions }
+  private
+    Handle: Pointer; { value returned by SherpaOnnxCreateCircularBuffer }
+  public
+    constructor Create(Capacity: Integer); { forwards Capacity to the native constructor }
+    destructor Destroy; override; { destroys the native buffer and clears Handle }
+    procedure Push(Samples: array of Single); { forwards all Length(Samples) elements to the native buffer }
+    function Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray; { returns a Pascal-owned copy of N samples; the native copy is freed before returning }
+    procedure Pop(N: Integer); { forwards to SherpaOnnxCircularBufferPop }
+    procedure Reset; { forwards to SherpaOnnxCircularBufferReset }
+    function Size: Integer; { value reported by SherpaOnnxCircularBufferSize }
+    function Head: Integer; { value reported by SherpaOnnxCircularBufferHead }
+  end;
+
+  TSherpaOnnxSpeechSegment = record { Pascal-owned copy of a native speech segment, produced by TSherpaOnnxVoiceActivityDetector.Front }
+    Samples: array of Single; { copied element by element from the native segment }
+    Start: Integer; { copied from the native Start field - NOTE(review): presumably a sample index, confirm }
+  end;
+
+  TSherpaOnnxVoiceActivityDetector = class { wrapper around the native voice activity detector functions }
+  private
+    Handle: Pointer; { value returned by SherpaOnnxCreateVoiceActivityDetector }
+    _Config: TSherpaOnnxVadModelConfig; { copy of the Config argument, stored by Create }
+  public
+    constructor Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single); { converts Config to the C-side record and creates the native detector }
+    destructor Destroy; override; { destroys the native detector and clears Handle }
+    procedure AcceptWaveform(Samples: array of Single); overload; { feeds every element of Samples }
+    procedure AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer); overload; { feeds N elements starting at Offset; prints a message and returns if Offset + N exceeds the array length }
+    function IsEmpty: Boolean; { True when the native Empty call returns 1 }
+    function IsDetected: Boolean; { True when the native Detected call returns 1 }
+    procedure Pop; { forwards to SherpaOnnxVoiceActivityDetectorPop }
+    procedure Clear; { forwards to SherpaOnnxVoiceActivityDetectorClear }
+    function Front: TSherpaOnnxSpeechSegment; { returns a Pascal-owned copy; the native segment is destroyed before returning }
+    procedure Reset; { forwards to SherpaOnnxVoiceActivityDetectorReset }
+    procedure Flush; { forwards to SherpaOnnxVoiceActivityDetectorFlush }
+    property Config: TSherpaOnnxVadModelConfig Read _Config; { read-only view of _Config }
+  end;
+
+  { It supports reading a single channel wave with 16-bit encoded samples.
+    Samples are normalized to the range [-1, 1].
+  }
+  function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
+
+  function SherpaOnnxWriteWave(Filename: AnsiString;
+    Samples: array of Single; SampleRate: Integer): Boolean; { True only when the native SherpaOnnxWriteWave call returns 1 }
 implementation
@@ -294,15 +372,15 @@ type
     DecodingMethod: PAnsiChar;
     MaxActivePaths: cint32;
     EnableEndpoint: cint32;
-    Rule1MinTrailingSilence: Single;
-    Rule2MinTrailingSilence: Single;
-    Rule3MinUtteranceLength: Single;
+    Rule1MinTrailingSilence: cfloat;
+    Rule2MinTrailingSilence: cfloat;
+    Rule3MinUtteranceLength: cfloat;
     HotwordsFile: PAnsiChar;
-    HotwordsScore: Single;
+    HotwordsScore: cfloat;
     CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig;
     RuleFsts: PAnsiChar;
     RuleFars: PAnsiChar;
-    BlankPenalty: Single;
+    BlankPenalty: cfloat;
   end;
   PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig;
@@ -330,7 +408,7 @@ type
   end;
  SherpaOnnxOfflineLMConfig = record
     Model: PAnsiChar;
-    Scale: Single;
+    Scale: cfloat; { same C float alias used by the other C-side records in this unit }
   end;
   SherpaOnnxOfflineSenseVoiceModelConfig = record
     Model: PAnsiChar;
@@ -361,14 +439,100 @@ type
     DecodingMethod: PAnsiChar;
     MaxActivePaths: cint32;
     HotwordsFile: PAnsiChar;
-    HotwordsScore: Single;
+    HotwordsScore: cfloat;
     RuleFsts: PAnsiChar;
     RuleFars: PAnsiChar;
-    BlankPenalty: Single;
+    BlankPenalty: cfloat;
   end;
   PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig;
+  SherpaOnnxSileroVadModelConfig = record { C-side record filled field by field in TSherpaOnnxVoiceActivityDetector.Create - NOTE(review): field order presumably mirrors the native struct, confirm before reordering }
+    Model: PAnsiChar; { set from PAnsiChar(Config.SileroVad.Model) }
+    Threshold: cfloat;
+    MinSilenceDuration: cfloat;
+    MinSpeechDuration: cfloat;
+    WindowSize: cint32;
+  end;
+  SherpaOnnxVadModelConfig = record { C-side record whose address is passed to SherpaOnnxCreateVoiceActivityDetector }
+    SileroVad: SherpaOnnxSileroVadModelConfig;
+    SampleRate: cint32;
+    NumThreads: cint32;
+    Provider: PAnsiChar; { set from PAnsiChar(Config.Provider) }
+    Debug: cint32; { set from Ord(Config.Debug) }
+  end;
+  PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig; { parameter type of SherpaOnnxCreateVoiceActivityDetector }
+
+  SherpaOnnxSpeechSegment = record { C-side segment returned by pointer from SherpaOnnxVoiceActivityDetectorFront - NOTE(review): field order presumably mirrors the native struct, confirm before reordering }
+    Start: cint32;
+    Samples: pcfloat; { indexed from 0 to N - 1 when copied in Front }
+    N: cint32; { number of elements copied from Samples }
+  end;
+
+  PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment; { released with SherpaOnnxDestroySpeechSegment }
+
+function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
+  BufferSizeInSeconds: cfloat): Pointer; cdecl;
+  external SherpaOnnxLibName; { result is stored as TSherpaOnnxVoiceActivityDetector.Handle }
+
+procedure SherpaOnnxDestroyVoiceActivityDetector(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName; { called from TSherpaOnnxVoiceActivityDetector.Destroy }
+
+procedure SherpaOnnxVoiceActivityDetectorAcceptWaveform(Vad: Pointer;
+  Samples: pcfloat; N: cint32); cdecl;
+  external SherpaOnnxLibName; { Samples points at the first of N values to feed }
+
+function SherpaOnnxVoiceActivityDetectorEmpty(Vad: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName; { the wrapper treats a result of 1 as True }
+
+function SherpaOnnxVoiceActivityDetectorDetected(Vad: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName; { the wrapper treats a result of 1 as True }
+
+procedure SherpaOnnxVoiceActivityDetectorPop(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxVoiceActivityDetectorClear(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+function SherpaOnnxVoiceActivityDetectorFront(Vad: Pointer): PSherpaOnnxSpeechSegment; cdecl;
+  external SherpaOnnxLibName; { the wrapper releases the result with SherpaOnnxDestroySpeechSegment }
+
+procedure SherpaOnnxDestroySpeechSegment(P: PSherpaOnnxSpeechSegment); cdecl;
+  external SherpaOnnxLibName; { releases a segment obtained from SherpaOnnxVoiceActivityDetectorFront }
+
+{ Binding for the native detector reset. The argument is the detector handle
+  (the wrapper passes TSherpaOnnxVoiceActivityDetector.Handle), so it is
+  declared like the other detector bindings rather than as a segment pointer. }
+procedure SherpaOnnxVoiceActivityDetectorReset(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+{ Binding for the native detector flush. The argument is the detector handle
+  (the wrapper passes TSherpaOnnxVoiceActivityDetector.Handle), so it is
+  declared like the other detector bindings rather than as a segment pointer. }
+procedure SherpaOnnxVoiceActivityDetectorFlush(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+function SherpaOnnxCreateCircularBuffer(Capacity: cint32): Pointer; cdecl;
+  external SherpaOnnxLibName; { result is stored as TSherpaOnnxCircularBuffer.Handle }
+
+procedure SherpaOnnxDestroyCircularBuffer(Buffer: Pointer) ; cdecl;
+  external SherpaOnnxLibName; { called from TSherpaOnnxCircularBuffer.Destroy }
+
+procedure SherpaOnnxCircularBufferPush(Buffer: Pointer; Samples: pcfloat; N: cint32); cdecl;
+  external SherpaOnnxLibName; { Samples points at the first of N values to push }
+
+function SherpaOnnxCircularBufferGet(Buffer: Pointer; StartIndex: cint32; N: cint32): pcfloat ; cdecl;
+  external SherpaOnnxLibName; { the wrapper releases the result with SherpaOnnxCircularBufferFree }
+
+procedure SherpaOnnxCircularBufferFree(P: pcfloat); cdecl;
+  external SherpaOnnxLibName; { releases a pointer obtained from SherpaOnnxCircularBufferGet }
+
+procedure SherpaOnnxCircularBufferPop(Buffer: Pointer; N: cint32); cdecl;
+  external SherpaOnnxLibName;
+
+function SherpaOnnxCircularBufferSize(Buffer: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName;
+
+function SherpaOnnxCircularBufferHead(Buffer: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxCircularBufferReset(Buffer: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
 function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
   external SherpaOnnxLibName;
@@ -437,9 +601,20 @@ procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl;
 function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
   external SherpaOnnxLibName name 'SherpaOnnxReadWave';
+function SherpaOnnxWriteWaveWrapper(Samples: pcfloat; N: cint32;
+  SampleRate: cint32; Filename: PAnsiChar): cint32; cdecl;
+  external SherpaOnnxLibName name 'SherpaOnnxWriteWave'; { bound under a different Pascal name so the public SherpaOnnxWriteWave can wrap it; that wrapper treats a result of 1 as success }
+
 procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
   external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
+function SherpaOnnxWriteWave(Filename: AnsiString;
+    Samples: array of Single; SampleRate: Integer): Boolean;
+{ Hands Samples, their count, SampleRate and Filename to the native
+  SherpaOnnxWriteWave entry point. The result is True only when the
+  native call returns 1. }
+var
+  Status: cint32;
+begin
+  Status := SherpaOnnxWriteWaveWrapper(pcfloat(Samples), Length(Samples),
+    SampleRate, PAnsiChar(Filename));
+  Result := (Status = 1);
+end;
+
 function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
 var
   PFilename: PAnsiChar;
@@ -611,6 +786,7 @@ begin
   C.BlankPenalty := Config.BlankPenalty;
   Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C);
+  Self._Config := Config;
 end;
 destructor TSherpaOnnxOnlineRecognizer.Destroy;
@@ -877,6 +1053,7 @@ begin
   C.BlankPenalty := Config.BlankPenalty;
   Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C);
+  Self._Config := Config;
 end;
 destructor TSherpaOnnxOfflineRecognizer.Destroy;
@@ -984,5 +1161,255 @@ begin
     [Self.Text, TokensStr, TimestampStr]);
 end;
+function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
+{ Renders the five fields of this record as one display string.
+  The floating point fields are printed with two decimals. }
+const
+  Layout = 'TSherpaOnnxSileroVadModelConfig(' +
+    'Model := %s, ' +
+    'Threshold := %.2f, ' +
+    'MinSilenceDuration := %.2f, ' +
+    'MinSpeechDuration := %.2f, ' +
+    'WindowSize := %d' +
+    ')';
+begin
+  Result := Format(Layout,
+    [Model, Threshold, MinSilenceDuration, MinSpeechDuration, WindowSize]);
+end;
+
+class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
+{ Management operator: assigns the default values of the numeric fields.
+  Model is not assigned here. }
+begin
+  Dest.WindowSize := 512;
+  Dest.MinSpeechDuration := 0.25;
+  Dest.MinSilenceDuration := 0.5;
+  Dest.Threshold := 0.5;
+end;
+
+function TSherpaOnnxVadModelConfig.ToString: AnsiString;
+{ Renders this record as one display string. The nested SileroVad record
+  and the Debug flag are rendered through their own ToString. }
+const
+  Layout = 'TSherpaOnnxVadModelConfig(' +
+    'SileroVad := %s, ' +
+    'SampleRate := %d, ' +
+    'NumThreads := %d, ' +
+    'Provider := %s, ' +
+    'Debug := %s' +
+    ')';
+begin
+  Result := Format(Layout,
+    [SileroVad.ToString, SampleRate, NumThreads, Provider, Debug.ToString]);
+end;
+
+class operator TSherpaOnnxVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
+{ Management operator: assigns the default values of the top-level fields.
+  The nested SileroVad record is not touched here. }
+begin
+  Dest.Debug := False;
+  Dest.Provider := 'cpu';
+  Dest.NumThreads := 1;
+  Dest.SampleRate := 16000;
+end;
+
+class operator TSherpaOnnxFeatureConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
+{ Management operator: assigns the default values of both fields. }
+begin
+  Dest.FeatureDim := 80;
+  Dest.SampleRate := 16000;
+end;
+
+class operator TSherpaOnnxOnlineCtcFstDecoderConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
+{ Management operator: assigns the default MaxActive. Graph is not assigned here. }
+const
+  DefaultMaxActive = 3000;
+begin
+  Dest.MaxActive := DefaultMaxActive;
+end;
+
+class operator TSherpaOnnxOnlineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
+{ Management operator: assigns the default decoding and endpointing values.
+  NOTE(review): MaxActivePaths is not assigned here, although the offline
+  recognizer config defaults it to 4 - confirm whether a default is needed. }
+begin
+  { Decoding }
+  Dest.DecodingMethod := 'greedy_search';
+  Dest.HotwordsScore := 1.5;
+  Dest.BlankPenalty := 0;
+
+  { Endpointing }
+  Dest.EnableEndpoint := False;
+  Dest.Rule1MinTrailingSilence := 2.4;
+  Dest.Rule2MinTrailingSilence := 1.2;
+  Dest.Rule3MinUtteranceLength := 20;
+end;
+
+class operator TSherpaOnnxOnlineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
+{ Management operator: assigns the default runtime settings
+  (one thread, 'cpu' provider, debug off). }
+begin
+  Dest.Debug := False;
+  Dest.Provider := 'cpu';
+  Dest.NumThreads := 1;
+end;
+
+class operator TSherpaOnnxOfflineWhisperModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
+{ Management operator: assigns the default Task and TailPaddings. }
+begin
+  Dest.TailPaddings := -1;
+  Dest.Task := 'transcribe';
+end;
+
+class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
+{ Management operator: assigns the default Scale. Model is not assigned here. }
+const
+  DefaultScale = 1.0;
+begin
+  Dest.Scale := DefaultScale;
+end;
+
+class operator TSherpaOnnxOfflineSenseVoiceModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
+{ Management operator: turns UseItn on by default.
+  Model and Language are not assigned here. }
+const
+  DefaultUseItn = True;
+begin
+  Dest.UseItn := DefaultUseItn;
+end;
+
+class operator TSherpaOnnxOfflineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
+{ Management operator: assigns the default runtime settings
+  (one thread, 'cpu' provider, debug off). }
+begin
+  Dest.Provider := 'cpu';
+  Dest.NumThreads := 1;
+  Dest.Debug := False;
+end;
+
+class operator TSherpaOnnxOfflineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
+{ Management operator: assigns the default decoding values. }
+begin
+  Dest.BlankPenalty := 0;
+  Dest.HotwordsScore := 1.5;
+  Dest.MaxActivePaths := 4;
+  Dest.DecodingMethod := 'greedy_search';
+end;
+
+constructor TSherpaOnnxCircularBuffer.Create(Capacity: Integer);
+{ Creates the native circular buffer with the requested Capacity and
+  keeps the returned pointer in Handle. }
+begin
+  Handle := SherpaOnnxCreateCircularBuffer(Capacity);
+end;
+
+destructor TSherpaOnnxCircularBuffer.Destroy;
+{ Releases the native circular buffer, then runs the inherited destructor.
+  The native call is skipped when Handle is nil, so destroying an instance
+  whose handle was never set or was already released makes no native call. }
+begin
+  if Self.Handle <> nil then
+  begin
+    SherpaOnnxDestroyCircularBuffer(Self.Handle);
+    Self.Handle := nil;
+  end;
+
+  inherited Destroy;
+end;
+
+procedure TSherpaOnnxCircularBuffer.Push(Samples: array of Single);
+{ Appends every element of Samples to the native circular buffer. }
+var
+  Count: Integer;
+begin
+  Count := Length(Samples);
+  SherpaOnnxCircularBufferPush(Handle, pcfloat(Samples), Count);
+end;
+
+function TSherpaOnnxCircularBuffer.Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
+{ Returns a Pascal-owned copy of N samples starting at StartIndex.
+
+  An empty array is returned when N <= 0 or when the native call yields nil.
+  The native buffer is always released, even if allocating the result fails. }
+var
+  P: pcfloat;
+begin
+  Result := nil;
+
+  { A non-positive count has nothing to copy; a negative one must never
+    reach SetLength or the native allocator. }
+  if N <= 0 then
+    Exit;
+
+  P := SherpaOnnxCircularBufferGet(Self.Handle, StartIndex, N);
+  if P = nil then
+    Exit;
+
+  try
+    SetLength(Result, N);
+    { One block copy instead of a per-element loop; both sides hold
+      single-precision values. }
+    Move(P^, Result[0], N * SizeOf(Single));
+  finally
+    SherpaOnnxCircularBufferFree(P);
+  end;
+end;
+
+procedure TSherpaOnnxCircularBuffer.Pop(N: Integer);
+{ Forwards the request to drop N samples to the native circular buffer. }
+begin
+  SherpaOnnxCircularBufferPop(Handle, N);
+end;
+
+procedure TSherpaOnnxCircularBuffer.Reset;
+{ Forwards to the native reset of the circular buffer. }
+begin
+  SherpaOnnxCircularBufferReset(Handle);
+end;
+
+function TSherpaOnnxCircularBuffer.Size: Integer;
+{ Returns the size reported by the native circular buffer. }
+var
+  Reported: cint32;
+begin
+  Reported := SherpaOnnxCircularBufferSize(Handle);
+  Result := Reported;
+end;
+
+function TSherpaOnnxCircularBuffer.Head: Integer;
+{ Returns the head value reported by the native circular buffer. }
+var
+  Reported: cint32;
+begin
+  Reported := SherpaOnnxCircularBufferHead(Handle);
+  Result := Reported;
+end;
+
+constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
+{ Converts Config to the C-side record, creates the native detector from it
+  and keeps a copy of Config for the Config property.
+
+  The PAnsiChar fields point into the strings owned by the Config argument. }
+var
+  Native: SherpaOnnxVadModelConfig;
+begin
+  Initialize(Native);
+
+  { Top-level settings }
+  Native.SampleRate := Config.SampleRate;
+  Native.NumThreads := Config.NumThreads;
+  Native.Provider := PAnsiChar(Config.Provider);
+  Native.Debug := Ord(Config.Debug);
+
+  { Silero VAD settings }
+  Native.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
+  Native.SileroVad.Threshold := Config.SileroVad.Threshold;
+  Native.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
+  Native.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
+  Native.SileroVad.WindowSize := Config.SileroVad.WindowSize;
+
+  Handle := SherpaOnnxCreateVoiceActivityDetector(@Native, BufferSizeInSeconds);
+  _Config := Config;
+end;
+
+destructor TSherpaOnnxVoiceActivityDetector.Destroy;
+{ Releases the native detector, then runs the inherited destructor.
+  The native call is skipped when Handle is nil, so destroying an instance
+  whose handle was never set or was already released makes no native call. }
+begin
+  if Self.Handle <> nil then
+  begin
+    SherpaOnnxDestroyVoiceActivityDetector(Self.Handle);
+    Self.Handle := nil;
+  end;
+
+  inherited Destroy;
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single);
+{ Feeds every element of Samples to the native detector. }
+var
+  Count: Integer;
+begin
+  Count := Length(Samples);
+  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Handle, pcfloat(Samples), Count);
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer);
+{ Feeds N elements of Samples, starting at index Offset, to the native detector.
+
+  The request is rejected with a printed message, and nothing is fed, when
+  Offset or N is negative or when the range [Offset, Offset + N) does not
+  fit inside Samples. }
+begin
+  { Negative values are checked first. The upper bound is then written as
+    Offset > Length - N so that the sum Offset + N is never computed and
+    cannot overflow. }
+  if (Offset < 0) or (N < 0) or (Offset > Length(Samples) - N) then
+    begin
+      WriteLn(Format('Invalid arguments!. Array length: %d, Offset: %d, N: %d',
+        [Length(Samples), Offset, N]
+      ));
+      Exit;
+    end;
+
+  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle,
+    pcfloat(Samples) + Offset, N);
+end;
+
+function TSherpaOnnxVoiceActivityDetector.IsEmpty: Boolean;
+{ True only when the native Empty query returns 1. }
+var
+  Flag: cint32;
+begin
+  Flag := SherpaOnnxVoiceActivityDetectorEmpty(Handle);
+  Result := (Flag = 1);
+end;
+
+function TSherpaOnnxVoiceActivityDetector.IsDetected: Boolean;
+{ True only when the native Detected query returns 1. }
+var
+  Flag: cint32;
+begin
+  Flag := SherpaOnnxVoiceActivityDetectorDetected(Handle);
+  Result := (Flag = 1);
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Pop;
+{ Forwards to the native Pop of the detector. }
+begin
+  SherpaOnnxVoiceActivityDetectorPop(Handle);
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Clear;
+{ Forwards to the native Clear of the detector. }
+begin
+  SherpaOnnxVoiceActivityDetectorClear(Handle);
+end;
+
+function TSherpaOnnxVoiceActivityDetector.Front: TSherpaOnnxSpeechSegment;
+{ Returns a Pascal-owned copy of the front speech segment.
+
+  A segment with Start = 0 and no samples is returned when the detector
+  reports itself empty or when the native call yields nil. The native
+  segment is always destroyed, even if allocating the copy fails. }
+var
+  P: PSherpaOnnxSpeechSegment;
+begin
+  Result.Start := 0;
+  Result.Samples := nil;
+
+  { NOTE(review): the native Front is presumably not valid on an empty
+    detector - confirm; asking first avoids calling it in that state. }
+  if Self.IsEmpty then
+    Exit;
+
+  P := SherpaOnnxVoiceActivityDetectorFront(Self.Handle);
+  if P = nil then
+    Exit;
+
+  try
+    Result.Start := P^.Start;
+
+    if (P^.N > 0) and (P^.Samples <> nil) then
+    begin
+      SetLength(Result.Samples, P^.N);
+      { One block copy instead of a per-element loop; both sides hold
+        single-precision values. }
+      Move(P^.Samples^, Result.Samples[0], P^.N * SizeOf(Single));
+    end;
+  finally
+    SherpaOnnxDestroySpeechSegment(P);
+  end;
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Reset;
+{ Forwards to the native Reset of the detector. }
+begin
+  SherpaOnnxVoiceActivityDetectorReset(Handle);
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Flush;
+{ Forwards to the native Flush of the detector. }
+begin
+  SherpaOnnxVoiceActivityDetectorFlush(Handle);
+end;
+
 end.