Fangjun Kuang
Committed by GitHub

Pascal API for VAD (#1249)

@@ -116,12 +116,54 @@ jobs: @@ -116,12 +116,54 @@ jobs:
116 cp -v install/lib/*.dll ../pascal-api-examples/read-wav 116 cp -v install/lib/*.dll ../pascal-api-examples/read-wav
117 cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr 117 cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
118 cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr 118 cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
  119 + cp -v install/lib/*.dll ../pascal-api-examples/vad
  120 + cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
119 121
120 cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav 122 cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
121 cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr 123 cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
122 cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr 124 cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
  125 + cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
  126 + cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
123 fi 127 fi
124 128
  129 + - name: Run Pascal test (VAD + non-streaming ASR)
  130 + shell: bash
  131 + run: |
  132 + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
  133 +
  134 + cd ./pascal-api-examples
  135 +
  136 + pushd vad-with-non-streaming-asr
  137 + time ./run-vad-with-whisper.sh
  138 + rm -rf sherpa-onnx-*
  139 + echo "---"
  140 +
  141 + time ./run-vad-with-sense-voice.sh
  142 + rm -rf sherpa-onnx-*
  143 + echo "---"
  144 +
  145 + ls -lh
  146 +
  147 + popd
  148 +
  149 + - name: Run Pascal test (VAD test)
  150 + shell: bash
  151 + run: |
  152 + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
  153 +
  154 + cd ./pascal-api-examples
  155 +
  156 + pushd vad
  157 + ./run-circular-buffer.sh
  158 + echo "---"
  159 +
  160 + time ./run-remove-silence.sh
  161 + echo "---"
  162 +
  163 + ls -lh
  164 +
  165 + popd
  166 +
125 - name: Run Pascal test (Read wav test) 167 - name: Run Pascal test (Read wav test)
126 shell: bash 168 shell: bash
127 run: | 169 run: |
@@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx). @@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
8 |[read-wav](./read-wav)|It shows how to read a wave file.| 8 |[read-wav](./read-wav)|It shows how to read a wave file.|
9 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.| 9 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
10 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| 10 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
  11 +|[vad](./vad)| It shows how to use the voice activity detection API.|
  12 +|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx'; 38 Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
37 Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt'; 39 Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
38 Config.ModelConfig.Provider := 'cpu'; 40 Config.ModelConfig.Provider := 'cpu';
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx'; 38 Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
37 Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx'; 39 Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
38 Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx'; 40 Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx'; 38 Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
37 Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt'; 39 Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
38 Config.ModelConfig.Provider := 'cpu'; 40 Config.ModelConfig.Provider := 'cpu';
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx'; 38 Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
37 Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt'; 39 Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
38 Config.ModelConfig.Provider := 'cpu'; 40 Config.ModelConfig.Provider := 'cpu';
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx'; 38 Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
37 Config.ModelConfig.SenseVoice.Language := 'auto'; 39 Config.ModelConfig.SenseVoice.Language := 'auto';
38 Config.ModelConfig.SenseVoice.UseItn := False; 40 Config.ModelConfig.SenseVoice.UseItn := False;
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx'; 38 Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
37 Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt'; 39 Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
38 Config.ModelConfig.Provider := 'cpu'; 40 Config.ModelConfig.Provider := 'cpu';
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; 38 Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
37 Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; 39 Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
38 Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; 40 Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
@@ -33,6 +33,8 @@ var @@ -33,6 +33,8 @@ var
33 Duration: Single; 33 Duration: Single;
34 RealTimeFactor: Single; 34 RealTimeFactor: Single;
35 begin 35 begin
  36 + Initialize(Config);
  37 +
36 Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx'; 38 Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
37 Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx'; 39 Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
38 Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx'; 40 Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';
  1 +!run-*.sh
  2 +vad_with_whisper
  3 +vad_with_sense_voice
  1 +# Introduction
  2 +
  3 +
  4 +This directory contains examples of how to use VAD (voice activity detection)
  5 +with non-streaming speech recognition models.
  6 +
  7 +|Script| Description|
  8 +|---------|------------|
  9 +|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
  10 +|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
  11 +
  12 +Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [[ ! -f ./silero_vad.onnx ]]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  28 +fi
  29 +
  30 +if [ ! -f ./lei-jun-test.wav ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  32 +fi
  33 +
  34 +if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  35 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  36 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  37 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  38 +fi
  39 +
  40 +fpc \
  41 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  42 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  43 + ./vad_with_sense_voice.pas
  44 +
  45 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  46 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  47 +
  48 +./vad_with_sense_voice
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [[ ! -f ./silero_vad.onnx ]]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  28 +fi
  29 +
  30 +if [ ! -f ./Obama.wav ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
  32 +fi
  33 +
  34 +if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
  35 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  36 +
  37 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  38 + rm sherpa-onnx-whisper-tiny.en.tar.bz2
  39 +fi
  40 +
  41 +fpc \
  42 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  43 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  44 + ./vad_with_whisper.pas
  45 +
  46 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  47 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  48 +
  49 +./vad_with_whisper
  1 +{ Copyright (c) 2024 Xiaomi Corporation }
  2 +
  3 +{
  4 +This file shows how to use a non-streaming SenseVoice model
  5 +with silero VAD to decode files.
  6 +
  7 +You can download the model files from
  8 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  9 +}
  10 +
  11 +program vad_with_whisper;
  12 +
  13 +{$mode objfpc}
  14 +
  15 +uses
  16 + sherpa_onnx,
  17 + SysUtils;
  18 +
  19 +function CreateVad(): TSherpaOnnxVoiceActivityDetector;
  20 +var
  21 + Config: TSherpaOnnxVadModelConfig;
  22 +
  23 + SampleRate: Integer;
  24 + WindowSize: Integer;
  25 +begin
  26 + Initialize(Config);
  27 +
  28 + SampleRate := 16000; {Please don't change it unless you know the details}
  29 + WindowSize := 512; {Please don't change it unless you know the details}
  30 +
  31 + Config.SileroVad.Model := './silero_vad.onnx';
  32 + Config.SileroVad.MinSpeechDuration := 0.5;
  33 + Config.SileroVad.MinSilenceDuration := 0.5;
  34 + Config.SileroVad.Threshold := 0.5;
  35 + Config.SileroVad.WindowSize := WindowSize;
  36 + Config.NumThreads:= 1;
  37 + Config.Debug:= True;
  38 + Config.Provider:= 'cpu';
  39 + Config.SampleRate := SampleRate;
  40 +
  41 + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
  42 +end;
  43 +
  44 +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
  45 +var
  46 + Config: TSherpaOnnxOfflineRecognizerConfig;
  47 +begin
  48 + Initialize(Config);
  49 +
  50 + Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
  51 + Config.ModelConfig.SenseVoice.Language := 'auto';
  52 + Config.ModelConfig.SenseVoice.UseItn := False;
  53 + Config.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt';
  54 + Config.ModelConfig.Provider := 'cpu';
  55 + Config.ModelConfig.NumThreads := 1;
  56 + Config.ModelConfig.Debug := False;
  57 +
  58 + Result := TSherpaOnnxOfflineRecognizer.Create(Config);
  59 +end;
  60 +
  61 +var
  62 + Wave: TSherpaOnnxWave;
  63 +
  64 + Recognizer: TSherpaOnnxOfflineRecognizer;
  65 + Vad: TSherpaOnnxVoiceActivityDetector;
  66 +
  67 + Offset: Integer;
  68 + WindowSize: Integer;
  69 + SpeechSegment: TSherpaOnnxSpeechSegment;
  70 +
  71 + Start: Single;
  72 + Duration: Single;
  73 +
  74 + Stream: TSherpaOnnxOfflineStream;
  75 + RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  76 +begin
  77 + Vad := CreateVad();
  78 + Recognizer := CreateOfflineRecognizer();
  79 +
  80 + Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  81 + if Wave.SampleRate <> Vad.Config.SampleRate then
  82 + begin
  83 + WriteLn(Format('Expected sample rate: %d. Given: %d',
  84 + [Vad.Config.SampleRate, Wave.SampleRate]));
  85 +
  86 + Exit;
  87 + end;
  88 +
  89 + WindowSize := Vad.Config.SileroVad.WindowSize;
  90 + Offset := 0;
  91 + while Offset + WindowSize <= Length(Wave.Samples) do
  92 + begin
  93 + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
  94 + Offset += WindowSize;
  95 +
  96 + while not Vad.IsEmpty do
  97 + begin
  98 + SpeechSegment := Vad.Front();
  99 + Vad.Pop();
  100 + Stream := Recognizer.CreateStream();
  101 +
  102 + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
  103 + Recognizer.Decode(Stream);
  104 + RecognitionResult := Recognizer.GetResult(Stream);
  105 +
  106 + Start := SpeechSegment.Start / Wave.SampleRate;
  107 + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
  108 + WriteLn(Format('%.3f -- %.3f %s',
  109 + [Start, Start + Duration, RecognitionResult.Text]));
  110 +
  111 + FreeAndNil(Stream);
  112 + end;
  113 + end;
  114 +
  115 + Vad.Flush;
  116 +
  117 + while not Vad.IsEmpty do
  118 + begin
  119 + SpeechSegment := Vad.Front();
  120 + Vad.Pop();
  121 + Stream := Recognizer.CreateStream();
  122 +
  123 + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
  124 + Recognizer.Decode(Stream);
  125 + RecognitionResult := Recognizer.GetResult(Stream);
  126 +
  127 + Start := SpeechSegment.Start / Wave.SampleRate;
  128 + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
  129 + WriteLn(Format('%.3f -- %.3f %s',
  130 + [Start, Start + Duration, RecognitionResult.Text]));
  131 +
  132 + FreeAndNil(Stream);
  133 + end;
  134 +
  135 + FreeAndNil(Recognizer);
  136 + FreeAndNil(Vad);
  137 +end.
  1 +{ Copyright (c) 2024 Xiaomi Corporation }
  2 +
  3 +{
  4 +This file shows how to use a non-streaming Whisper model
  5 +with silero VAD to decode files.
  6 +
  7 +You can download the model files from
  8 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  9 +}
  10 +
  11 +program vad_with_whisper;
  12 +
  13 +{$mode objfpc}
  14 +
  15 +uses
  16 + sherpa_onnx,
  17 + SysUtils;
  18 +
  19 +function CreateVad(): TSherpaOnnxVoiceActivityDetector;
  20 +var
  21 + Config: TSherpaOnnxVadModelConfig;
  22 +
  23 + SampleRate: Integer;
  24 + WindowSize: Integer;
  25 +begin
  26 + Initialize(Config);
  27 +
  28 + SampleRate := 16000; {Please don't change it unless you know the details}
  29 + WindowSize := 512; {Please don't change it unless you know the details}
  30 +
  31 + Config.SileroVad.Model := './silero_vad.onnx';
  32 + Config.SileroVad.MinSpeechDuration := 0.5;
  33 + Config.SileroVad.MinSilenceDuration := 0.5;
  34 + Config.SileroVad.Threshold := 0.5;
  35 + Config.SileroVad.WindowSize := WindowSize;
  36 + Config.NumThreads:= 1;
  37 + Config.Debug:= True;
  38 + Config.Provider:= 'cpu';
  39 + Config.SampleRate := SampleRate;
  40 +
  41 + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
  42 +end;
  43 +
  44 +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
  45 +var
  46 + Config: TSherpaOnnxOfflineRecognizerConfig;
  47 +begin
  48 + Initialize(Config);
  49 +
  50 + Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
  51 + Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
  52 + Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
  53 + Config.ModelConfig.Provider := 'cpu';
  54 + Config.ModelConfig.NumThreads := 1;
  55 + Config.ModelConfig.Debug := False;
  56 +
  57 + Result := TSherpaOnnxOfflineRecognizer.Create(Config);
  58 +end;
  59 +
  60 +var
  61 + Wave: TSherpaOnnxWave;
  62 +
  63 + Recognizer: TSherpaOnnxOfflineRecognizer;
  64 + Vad: TSherpaOnnxVoiceActivityDetector;
  65 +
  66 + Offset: Integer;
  67 + WindowSize: Integer;
  68 + SpeechSegment: TSherpaOnnxSpeechSegment;
  69 +
  70 + Start: Single;
  71 + Duration: Single;
  72 +
  73 + Stream: TSherpaOnnxOfflineStream;
  74 + RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  75 +begin
  76 + Vad := CreateVad();
  77 + Recognizer := CreateOfflineRecognizer();
  78 +
  79 + Wave := SherpaOnnxReadWave('./Obama.wav');
  80 + if Wave.SampleRate <> Vad.Config.SampleRate then
  81 + begin
  82 + WriteLn(Format('Expected sample rate: %d. Given: %d',
  83 + [Vad.Config.SampleRate, Wave.SampleRate]));
  84 +
  85 + Exit;
  86 + end;
  87 +
  88 + WindowSize := Vad.Config.SileroVad.WindowSize;
  89 + Offset := 0;
  90 + while Offset + WindowSize <= Length(Wave.Samples) do
  91 + begin
  92 + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
  93 + Offset += WindowSize;
  94 +
  95 + while not Vad.IsEmpty do
  96 + begin
  97 + SpeechSegment := Vad.Front();
  98 + Vad.Pop();
  99 + Stream := Recognizer.CreateStream();
  100 +
  101 + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
  102 + Recognizer.Decode(Stream);
  103 + RecognitionResult := Recognizer.GetResult(Stream);
  104 +
  105 + Start := SpeechSegment.Start / Wave.SampleRate;
  106 + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
  107 + WriteLn(Format('%.3f -- %.3f %s',
  108 + [Start, Start + Duration, RecognitionResult.Text]));
  109 +
  110 + FreeAndNil(Stream);
  111 + end;
  112 + end;
  113 +
  114 + Vad.Flush;
  115 +
  116 + while not Vad.IsEmpty do
  117 + begin
  118 + SpeechSegment := Vad.Front();
  119 + Vad.Pop();
  120 + Stream := Recognizer.CreateStream();
  121 +
  122 + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
  123 + Recognizer.Decode(Stream);
  124 + RecognitionResult := Recognizer.GetResult(Stream);
  125 +
  126 + Start := SpeechSegment.Start / Wave.SampleRate;
  127 + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
  128 + WriteLn(Format('%.3f -- %.3f %s',
  129 + [Start, Start + Duration, RecognitionResult.Text]));
  130 +
  131 + FreeAndNil(Stream);
  132 + end;
  133 +
  134 + FreeAndNil(Recognizer);
  135 + FreeAndNil(Vad);
  136 +end.
  1 +!run*.sh
  2 +circular_buffer
  3 +remove_silence
  1 +# Introduction
  2 +
  3 +
  4 +This directory contains examples of how to use the VAD (voice activity detection)
  5 +APIs.
  6 +
  7 +|Script| Description|
  8 +|---------|------------|
  9 +|[run-circular-buffer.sh](./run-circular-buffer.sh)|It shows how to use the circular buffer API.|
  10 +|[run-remove-silence.sh](./run-remove-silence.sh)|It shows how to use the VAD API to remove silences from a wave file.|
  11 +
  1 +{ Copyright (c) 2024 Xiaomi Corporation }
  2 +program circular_buffer;
  3 +{
  4 +This file shows how to use the CircularBuffer API of sherpa-onnx
  5 +}
  6 +
  7 +{$mode objfpc}
  8 +{$ASSERTIONS ON}
  9 +
  10 +uses
  11 + sherpa_onnx;
  12 +
  13 +var
  14 + Buffer: TSherpaOnnxCircularBuffer;
  15 + Samples: TSherpaOnnxSamplesArray;
  16 +begin
  17 + {The initial capacity is 5. It will be resized automatically if needed.}
  18 + Buffer := TSherpaOnnxCircularBuffer.Create(5);
  19 + Assert(Buffer.Size = 0);
  20 + Assert(Buffer.Head = 0);
  21 + Buffer.Push([0, 10, 20]);
  22 +
  23 + {Push() changes Size. Head is not changed.}
  24 + Assert(Buffer.Size = 3);
  25 + Assert(Buffer.Head = 0);
  26 +
  27 + Samples := Buffer.Get(0, 1);
  28 + Assert(Length(Samples) = 1);
  29 + Assert(Samples[0] = 0);
  30 +
  31 + { Get() does not change Size or Head}
  32 + Assert(Buffer.Size = 3);
  33 + Assert(Buffer.Head = 0);
  34 +
  35 + Samples := Buffer.Get(0, 2);
  36 + Assert(Length(Samples) = 2);
  37 + Assert(Samples[0] = 0);
  38 + Assert(Samples[1] = 10);
  39 +
  40 + { The buffer will be resized since its initial capacity is 5 but we have
  41 + pushed 7 elements into it.
  42 +
  43 + No data is lost during the resize.
  44 + }
  45 + Buffer.Push([30, 40, 50, 60]);
  46 +
  47 + Assert(Buffer.Size = 7); {There are now 7 elements}
  48 + Assert(Buffer.Head = 0);
  49 +
  50 + {Remove the first 4 elements}
  51 + Buffer.Pop(4);
  52 +
  53 + Assert(Buffer.Size = 3); {There are only 3 elements left}
  54 + Assert(Buffer.Head = 4);
  55 +
  56 + Samples := Buffer.Get(Buffer.Head, 2);
  57 + Assert(Length(Samples) = 2);
  58 + Assert(Samples[0] = 40);
  59 + Assert(Samples[1] = 50);
  60 +
  61 + Buffer.Pop(1);
  62 +
  63 + Assert(Buffer.Size = 2); {There are only 2 elements left}
  64 + Assert(Buffer.Head = 5);
  65 +
  66 + Samples := Buffer.Get(Buffer.Head, 2);
  67 + Assert(Length(Samples) = 2);
  68 + Assert(Samples[0] = 50);
  69 + Assert(Samples[1] = 60);
  70 +
  71 + Buffer.Pop(2);
  72 + Assert(Buffer.Size = 0); {There are no elements left}
  73 + Assert(Buffer.Head = 7);
  74 +
  75 + Buffer.Push([100, 200, 300, 400, 500]);
  76 + Assert(Buffer.Size = 5);
  77 + Assert(Buffer.Head = 7);
  78 +
  79 + Buffer.Pop(4);
  80 + Assert(Buffer.Size = 1);
  81 +
  82 + {Head can be larger than the Capacity!
  83 + This is what circular means. The element is stored at index Buffer.Head mod Capacity.
  84 + }
  85 + Assert(Buffer.Head = 11);
  86 + Buffer.Push([600, 700]);
  87 +
  88 + Assert(Buffer.Size = 3);
  89 + Assert(Buffer.Head = 11);
  90 +
  91 + Samples := Buffer.Get(Buffer.Head, 3);
  92 + Assert(Length(Samples) = 3);
  93 + Assert(Samples[0] = 500);
  94 + Assert(Samples[1] = 600);
  95 + Assert(Samples[2] = 700);
  96 +
  97 + Buffer.Pop(3);
  98 + Assert(Buffer.Size = 0);
  99 + Assert(Buffer.Head = 14);
  100 +
  101 + Buffer.Reset();
  102 +
  103 + Assert(Buffer.Size = 0);
  104 + Assert(Buffer.Head = 0);
  105 +end.
  106 +
  1 +{ Copyright (c) 2024 Xiaomi Corporation }
  2 +{
  3 +This file shows how to use the VAD API from sherpa-onnx
  4 +to remove silences from a wave file.
  5 +}
  6 +program main;
  7 +
  8 +{$mode delphi}
  9 +
  10 +uses
  11 + sherpa_onnx,
  12 + SysUtils;
  13 +
  14 +var
  15 + Wave: TSherpaOnnxWave;
  16 +
  17 + Config: TSherpaOnnxVadModelConfig;
  18 + Vad: TSherpaOnnxVoiceActivityDetector;
  19 + Offset: Integer;
  20 + WindowSize: Integer;
  21 + SpeechSegment: TSherpaOnnxSpeechSegment;
  22 +
  23 + Start: Single;
  24 + Duration: Single;
  25 + SampleRate: Integer;
  26 +
  27 + AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  28 + AllSamples: array of Single;
  29 + N: Integer;
  30 + I: Integer;
  31 +begin
  32 + SampleRate := 16000; {Please don't change it unless you know the details}
  33 +
  34 + Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  35 + if Wave.SampleRate <> SampleRate then
  36 + begin
  37 + WriteLn(Format('Expected sample rate: %d. Given: %d',
  38 + [SampleRate, Wave.SampleRate]));
  39 +
  40 + Exit;
  41 + end;
  42 +
  43 + WindowSize := 512; {Please don't change it unless you know the details}
  44 + Initialize(Config);
  45 +
  46 + Config.SileroVad.Model := './silero_vad.onnx';
  47 + Config.SileroVad.MinSpeechDuration := 0.25;
  48 + Config.SileroVad.MinSilenceDuration := 0.5;
  49 + Config.SileroVad.Threshold := 0.5;
  50 + Config.SileroVad.WindowSize := WindowSize;
  51 + Config.NumThreads:= 1;
  52 + Config.Debug:= True;
  53 + Config.Provider:= 'cpu';
  54 + Config.SampleRate := SampleRate;
  55 +
  56 + Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
  57 +
  58 + AllSpeechSegment := nil;
  59 + AllSamples := nil;
  60 + Offset := 0;
  61 + while Offset + WindowSize <= Length(Wave.Samples) do
  62 + begin
  63 + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
  64 + Inc(Offset, WindowSize);
  65 +
  66 + while not Vad.IsEmpty do
  67 + begin
  68 + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
  69 +
  70 + SpeechSegment := Vad.Front();
  71 + Vad.Pop();
  72 + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment;
  73 +
  74 + Start := SpeechSegment.Start / SampleRate;
  75 + Duration := Length(SpeechSegment.Samples) / SampleRate;
  76 + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  77 + end;
  78 + end;
  79 +
  80 + Vad.Flush;
  81 +
  82 + while not Vad.IsEmpty do
  83 + begin
  84 + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
  85 +
  86 + SpeechSegment := Vad.Front();
  87 + Vad.Pop();
  88 + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment;
  89 +
  90 + Start := SpeechSegment.Start / SampleRate;
  91 + Duration := Length(SpeechSegment.Samples) / SampleRate;
  92 + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  93 + end;
  94 +
  95 + N := 0;
  96 + for SpeechSegment in AllSpeechSegment do
  97 + Inc(N, Length(SpeechSegment.Samples));
  98 +
  99 + SetLength(AllSamples, N);
  100 +
  101 + N := 0;
  102 + for SpeechSegment in AllSpeechSegment do
  103 + begin
  104 + for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
  105 + begin
  106 + AllSamples[N] := SpeechSegment.Samples[I];
  107 + Inc(N);
  108 + end;
  109 + end;
  110 +
  111 + SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate);
  112 + WriteLn('Saved to ./lei-jun-test-no-silence.wav');
  113 +
  114 + FreeAndNil(Vad);
  115 +end.
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +fpc \
  27 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  28 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  29 + ./circular_buffer.pas
  30 +
  31 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  32 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  33 +
  34 +./circular_buffer
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [[ ! -f ./silero_vad.onnx ]]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  28 +fi
  29 +
  30 +if [ ! -f ./lei-jun-test.wav ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  32 +fi
  33 +
  34 +fpc \
  35 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  36 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  37 + ./remove_silence.pas
  38 +
  39 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  40 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  41 +
  42 +./remove_silence
@@ -95,6 +95,8 @@ void CircularBuffer::Push(const float *p, int32_t n) { @@ -95,6 +95,8 @@ void CircularBuffer::Push(const float *p, int32_t n) {
95 "capacity to: %d", 95 "capacity to: %d",
96 n, size, n + size, capacity, new_capacity); 96 n, size, n + size, capacity, new_capacity);
97 Resize(new_capacity); 97 Resize(new_capacity);
  98 +
  99 + capacity = new_capacity;
98 } 100 }
99 101
100 int32_t start = tail_ % capacity; 102 int32_t start = tail_ % capacity;
@@ -2,9 +2,11 @@ @@ -2,9 +2,11 @@
2 2
3 unit sherpa_onnx; 3 unit sherpa_onnx;
4 4
5 -{$mode objfpc} 5 +{$IFDEF FPC}
  6 + {$mode objfpc}
  7 + {$modeSwitch advancedRecords} { to support records with methods }
  8 +{$ENDIF}
6 9
7 -{$modeSwitch advancedRecords} { to support records with methods }  
8 (* {$LongStrings ON} *) 10 (* {$LongStrings ON} *)
9 11
10 interface 12 interface
@@ -45,18 +47,21 @@ type @@ -45,18 +47,21 @@ type
45 ModelingUnit: AnsiString; 47 ModelingUnit: AnsiString;
46 BpeVocab: AnsiString; 48 BpeVocab: AnsiString;
47 function ToString: AnsiString; 49 function ToString: AnsiString;
  50 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
48 end; 51 end;
49 52
50 TSherpaOnnxFeatureConfig = record 53 TSherpaOnnxFeatureConfig = record
51 SampleRate: Integer; 54 SampleRate: Integer;
52 FeatureDim: Integer; 55 FeatureDim: Integer;
53 function ToString: AnsiString; 56 function ToString: AnsiString;
  57 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
54 end; 58 end;
55 59
56 TSherpaOnnxOnlineCtcFstDecoderConfig = record 60 TSherpaOnnxOnlineCtcFstDecoderConfig = record
57 Graph: AnsiString; 61 Graph: AnsiString;
58 MaxActive: Integer; 62 MaxActive: Integer;
59 function ToString: AnsiString; 63 function ToString: AnsiString;
  64 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
60 end; 65 end;
61 66
62 TSherpaOnnxOnlineRecognizerConfig = record 67 TSherpaOnnxOnlineRecognizerConfig = record
@@ -75,6 +80,7 @@ type @@ -75,6 +80,7 @@ type
75 RuleFars: AnsiString; 80 RuleFars: AnsiString;
76 BlankPenalty: Single; 81 BlankPenalty: Single;
77 function ToString: AnsiString; 82 function ToString: AnsiString;
  83 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
78 end; 84 end;
79 85
80 TSherpaOnnxOnlineRecognizerResult = record 86 TSherpaOnnxOnlineRecognizerResult = record
@@ -97,6 +103,7 @@ type @@ -97,6 +103,7 @@ type
97 TSherpaOnnxOnlineRecognizer = class 103 TSherpaOnnxOnlineRecognizer = class
98 private 104 private
99 Handle: Pointer; 105 Handle: Pointer;
  106 + _Config: TSherpaOnnxOnlineRecognizerConfig;
100 public 107 public
101 constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig); 108 constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig);
102 destructor Destroy; override; 109 destructor Destroy; override;
@@ -108,6 +115,7 @@ type @@ -108,6 +115,7 @@ type
108 procedure Reset(Stream: TSherpaOnnxOnlineStream); 115 procedure Reset(Stream: TSherpaOnnxOnlineStream);
109 function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean; 116 function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
110 function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult; 117 function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
  118 + property Config: TSherpaOnnxOnlineRecognizerConfig Read _Config;
111 end; 119 end;
112 120
113 TSherpaOnnxOfflineTransducerModelConfig = record 121 TSherpaOnnxOfflineTransducerModelConfig = record
@@ -134,6 +142,7 @@ type @@ -134,6 +142,7 @@ type
134 Task: AnsiString; 142 Task: AnsiString;
135 TailPaddings: Integer; 143 TailPaddings: Integer;
136 function ToString: AnsiString; 144 function ToString: AnsiString;
  145 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
137 end; 146 end;
138 147
139 TSherpaOnnxOfflineTdnnModelConfig = record 148 TSherpaOnnxOfflineTdnnModelConfig = record
@@ -145,12 +154,14 @@ type @@ -145,12 +154,14 @@ type
145 Model: AnsiString; 154 Model: AnsiString;
146 Scale: Single; 155 Scale: Single;
147 function ToString: AnsiString; 156 function ToString: AnsiString;
  157 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
148 end; 158 end;
149 159
150 TSherpaOnnxOfflineSenseVoiceModelConfig = record 160 TSherpaOnnxOfflineSenseVoiceModelConfig = record
151 Model: AnsiString; 161 Model: AnsiString;
152 Language: AnsiString; 162 Language: AnsiString;
153 UseItn: Boolean; 163 UseItn: Boolean;
  164 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
154 function ToString: AnsiString; 165 function ToString: AnsiString;
155 end; 166 end;
156 167
@@ -169,6 +180,7 @@ type @@ -169,6 +180,7 @@ type
169 BpeVocab: AnsiString; 180 BpeVocab: AnsiString;
170 TeleSpeechCtc: AnsiString; 181 TeleSpeechCtc: AnsiString;
171 SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig; 182 SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
  183 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
172 function ToString: AnsiString; 184 function ToString: AnsiString;
173 end; 185 end;
174 186
@@ -183,6 +195,7 @@ type @@ -183,6 +195,7 @@ type
183 RuleFsts: AnsiString; 195 RuleFsts: AnsiString;
184 RuleFars: AnsiString; 196 RuleFars: AnsiString;
185 BlankPenalty: Single; 197 BlankPenalty: Single;
  198 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
186 function ToString: AnsiString; 199 function ToString: AnsiString;
187 end; 200 end;
188 201
@@ -205,18 +218,83 @@ type @@ -205,18 +218,83 @@ type
205 TSherpaOnnxOfflineRecognizer = class 218 TSherpaOnnxOfflineRecognizer = class
206 private 219 private
207 Handle: Pointer; 220 Handle: Pointer;
  221 + _Config: TSherpaOnnxOfflineRecognizerConfig;
208 public 222 public
209 constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig); 223 constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig);
210 destructor Destroy; override; 224 destructor Destroy; override;
211 function CreateStream: TSherpaOnnxOfflineStream; 225 function CreateStream: TSherpaOnnxOfflineStream;
212 procedure Decode(Stream: TSherpaOnnxOfflineStream); 226 procedure Decode(Stream: TSherpaOnnxOfflineStream);
213 function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult; 227 function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
  228 + property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config;
214 end; 229 end;
215 230
216 -{ It supports reading a single channel wave with 16-bit encoded samples.  
217 - Samples are normalized to the range [-1, 1].  
218 -}  
219 -function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave; 231 + TSherpaOnnxSileroVadModelConfig = record
  232 + Model: AnsiString;
  233 + Threshold: Single;
  234 + MinSilenceDuration: Single;
  235 + MinSpeechDuration: Single;
  236 + WindowSize: Integer;
  237 + function ToString: AnsiString;
  238 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
  239 + end;
  240 +
  241 + TSherpaOnnxVadModelConfig = record
  242 + SileroVad: TSherpaOnnxSileroVadModelConfig;
  243 + SampleRate: Integer;
  244 + NumThreads: Integer;
  245 + Provider: AnsiString;
  246 + Debug: Boolean;
  247 + function ToString: AnsiString;
  248 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
  249 + end;
  250 +
  251 + TSherpaOnnxSamplesArray = array of Single;
  252 +
  253 + TSherpaOnnxCircularBuffer = class
  254 + private
  255 + Handle: Pointer;
  256 + public
  257 + constructor Create(Capacity: Integer);
  258 + destructor Destroy; override;
  259 + procedure Push(Samples: array of Single);
  260 + function Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
  261 + procedure Pop(N: Integer);
  262 + procedure Reset;
  263 + function Size: Integer;
  264 + function Head: Integer;
  265 + end;
  266 +
  267 + TSherpaOnnxSpeechSegment = record
  268 + Samples: array of Single;
  269 + Start: Integer;
  270 + end;
  271 +
  272 + TSherpaOnnxVoiceActivityDetector = class
  273 + private
  274 + Handle: Pointer;
  275 + _Config: TSherpaOnnxVadModelConfig;
  276 + public
  277 + constructor Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
  278 + destructor Destroy; override;
  279 + procedure AcceptWaveform(Samples: array of Single); overload;
  280 + procedure AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer); overload;
  281 + function IsEmpty: Boolean;
  282 + function IsDetected: Boolean;
  283 + procedure Pop;
  284 + procedure Clear;
  285 + function Front: TSherpaOnnxSpeechSegment;
  286 + procedure Reset;
  287 + procedure Flush;
  288 + property Config: TSherpaOnnxVadModelConfig Read _Config;
  289 + end;
  290 +
  291 + { It supports reading a single channel wave with 16-bit encoded samples.
  292 + Samples are normalized to the range [-1, 1].
  293 + }
  294 + function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
  295 +
  296 + function SherpaOnnxWriteWave(Filename: AnsiString;
  297 + Samples: array of Single; SampleRate: Integer): Boolean;
220 298
221 implementation 299 implementation
222 300
@@ -294,15 +372,15 @@ type @@ -294,15 +372,15 @@ type
294 DecodingMethod: PAnsiChar; 372 DecodingMethod: PAnsiChar;
295 MaxActivePaths: cint32; 373 MaxActivePaths: cint32;
296 EnableEndpoint: cint32; 374 EnableEndpoint: cint32;
297 - Rule1MinTrailingSilence: Single;  
298 - Rule2MinTrailingSilence: Single;  
299 - Rule3MinUtteranceLength: Single; 375 + Rule1MinTrailingSilence: cfloat;
  376 + Rule2MinTrailingSilence: cfloat;
  377 + Rule3MinUtteranceLength: cfloat;
300 HotwordsFile: PAnsiChar; 378 HotwordsFile: PAnsiChar;
301 - HotwordsScore: Single; 379 + HotwordsScore: cfloat;
302 CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig; 380 CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig;
303 RuleFsts: PAnsiChar; 381 RuleFsts: PAnsiChar;
304 RuleFars: PAnsiChar; 382 RuleFars: PAnsiChar;
305 - BlankPenalty: Single; 383 + BlankPenalty: cfloat;
306 end; 384 end;
307 385
308 PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig; 386 PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig;
@@ -330,7 +408,7 @@ type @@ -330,7 +408,7 @@ type
330 end; 408 end;
331 SherpaOnnxOfflineLMConfig = record 409 SherpaOnnxOfflineLMConfig = record
332 Model: PAnsiChar; 410 Model: PAnsiChar;
333 - Scale: Single; 411 + Scale: cfloat;
334 end; 412 end;
335 SherpaOnnxOfflineSenseVoiceModelConfig = record 413 SherpaOnnxOfflineSenseVoiceModelConfig = record
336 Model: PAnsiChar; 414 Model: PAnsiChar;
@@ -361,14 +439,100 @@ type @@ -361,14 +439,100 @@ type
361 DecodingMethod: PAnsiChar; 439 DecodingMethod: PAnsiChar;
362 MaxActivePaths: cint32; 440 MaxActivePaths: cint32;
363 HotwordsFile: PAnsiChar; 441 HotwordsFile: PAnsiChar;
364 - HotwordsScore: Single; 442 + HotwordsScore: cfloat;
365 RuleFsts: PAnsiChar; 443 RuleFsts: PAnsiChar;
366 RuleFars: PAnsiChar; 444 RuleFars: PAnsiChar;
367 - BlankPenalty: Single; 445 + BlankPenalty: cfloat;
368 end; 446 end;
369 447
370 PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig; 448 PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig;
371 449
  450 + SherpaOnnxSileroVadModelConfig = record
  451 + Model: PAnsiChar;
  452 + Threshold: cfloat;
  453 + MinSilenceDuration: cfloat;
  454 + MinSpeechDuration: cfloat;
  455 + WindowSize: cint32;
  456 + end;
  457 + SherpaOnnxVadModelConfig = record
  458 + SileroVad: SherpaOnnxSileroVadModelConfig;
  459 + SampleRate: cint32;
  460 + NumThreads: cint32;
  461 + Provider: PAnsiChar;
  462 + Debug: cint32;
  463 + end;
  464 + PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
  465 +
  466 + SherpaOnnxSpeechSegment = record
  467 + Start: cint32;
  468 + Samples: pcfloat;
  469 + N: cint32;
  470 + end;
  471 +
  472 + PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
  473 +
  474 +function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
  475 + BufferSizeInSeconds: cfloat): Pointer; cdecl;
  476 + external SherpaOnnxLibName;
  477 +
  478 +procedure SherpaOnnxDestroyVoiceActivityDetector(Vad: Pointer); cdecl;
  479 + external SherpaOnnxLibName;
  480 +
  481 +procedure SherpaOnnxVoiceActivityDetectorAcceptWaveform(Vad: Pointer;
  482 + Samples: pcfloat; N: cint32); cdecl;
  483 + external SherpaOnnxLibName;
  484 +
  485 +function SherpaOnnxVoiceActivityDetectorEmpty(Vad: Pointer): cint32; cdecl;
  486 + external SherpaOnnxLibName;
  487 +
  488 +function SherpaOnnxVoiceActivityDetectorDetected(Vad: Pointer): cint32; cdecl;
  489 + external SherpaOnnxLibName;
  490 +
  491 +procedure SherpaOnnxVoiceActivityDetectorPop(Vad: Pointer); cdecl;
  492 + external SherpaOnnxLibName;
  493 +
  494 +procedure SherpaOnnxVoiceActivityDetectorClear(Vad: Pointer); cdecl;
  495 + external SherpaOnnxLibName;
  496 +
  497 +function SherpaOnnxVoiceActivityDetectorFront(Vad: Pointer): PSherpaOnnxSpeechSegment; cdecl;
  498 + external SherpaOnnxLibName;
  499 +
  500 +procedure SherpaOnnxDestroySpeechSegment(P: PSherpaOnnxSpeechSegment); cdecl;
  501 + external SherpaOnnxLibName;
  502 +
  503 +procedure SherpaOnnxVoiceActivityDetectorReset(P: PSherpaOnnxSpeechSegment); cdecl;
  504 + external SherpaOnnxLibName;
  505 +
  506 +procedure SherpaOnnxVoiceActivityDetectorFlush(P: PSherpaOnnxSpeechSegment); cdecl;
  507 + external SherpaOnnxLibName;
  508 +
  509 +function SherpaOnnxCreateCircularBuffer(Capacity: cint32): Pointer; cdecl;
  510 + external SherpaOnnxLibName;
  511 +
  512 +procedure SherpaOnnxDestroyCircularBuffer(Buffer: Pointer) ; cdecl;
  513 + external SherpaOnnxLibName;
  514 +
  515 +procedure SherpaOnnxCircularBufferPush(Buffer: Pointer; Samples: pcfloat; N: cint32); cdecl;
  516 + external SherpaOnnxLibName;
  517 +
  518 +function SherpaOnnxCircularBufferGet(Buffer: Pointer; StartIndex: cint32; N: cint32): pcfloat ; cdecl;
  519 + external SherpaOnnxLibName;
  520 +
  521 +procedure SherpaOnnxCircularBufferFree(P: pcfloat); cdecl;
  522 + external SherpaOnnxLibName;
  523 +
  524 +procedure SherpaOnnxCircularBufferPop(Buffer: Pointer; N: cint32); cdecl;
  525 + external SherpaOnnxLibName;
  526 +
  527 +function SherpaOnnxCircularBufferSize(Buffer: Pointer): cint32; cdecl;
  528 + external SherpaOnnxLibName;
  529 +
  530 +function SherpaOnnxCircularBufferHead(Buffer: Pointer): cint32; cdecl;
  531 + external SherpaOnnxLibName;
  532 +
  533 +procedure SherpaOnnxCircularBufferReset(Buffer: Pointer); cdecl;
  534 + external SherpaOnnxLibName;
  535 +
372 function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl; 536 function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
373 external SherpaOnnxLibName; 537 external SherpaOnnxLibName;
374 538
@@ -437,9 +601,20 @@ procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl; @@ -437,9 +601,20 @@ procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl;
437 function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl; 601 function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
438 external SherpaOnnxLibName name 'SherpaOnnxReadWave'; 602 external SherpaOnnxLibName name 'SherpaOnnxReadWave';
439 603
  604 +function SherpaOnnxWriteWaveWrapper(Samples: pcfloat; N: cint32;
  605 + SampleRate: cint32; Filename: PAnsiChar): cint32; cdecl;
  606 + external SherpaOnnxLibName name 'SherpaOnnxWriteWave';
  607 +
440 procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl; 608 procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
441 external SherpaOnnxLibName name 'SherpaOnnxFreeWave'; 609 external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
442 610
  611 +function SherpaOnnxWriteWave(Filename: AnsiString;
  612 + Samples: array of Single; SampleRate: Integer): Boolean;
  613 +begin
  614 + Result := SherpaOnnxWriteWaveWrapper(pcfloat(Samples), Length(Samples),
  615 + SampleRate, PAnsiChar(Filename)) = 1;
  616 +end;
  617 +
443 function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave; 618 function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
444 var 619 var
445 PFilename: PAnsiChar; 620 PFilename: PAnsiChar;
@@ -611,6 +786,7 @@ begin @@ -611,6 +786,7 @@ begin
611 C.BlankPenalty := Config.BlankPenalty; 786 C.BlankPenalty := Config.BlankPenalty;
612 787
613 Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C); 788 Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C);
  789 + Self._Config := Config;
614 end; 790 end;
615 791
616 destructor TSherpaOnnxOnlineRecognizer.Destroy; 792 destructor TSherpaOnnxOnlineRecognizer.Destroy;
@@ -877,6 +1053,7 @@ begin @@ -877,6 +1053,7 @@ begin
877 C.BlankPenalty := Config.BlankPenalty; 1053 C.BlankPenalty := Config.BlankPenalty;
878 1054
879 Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C); 1055 Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C);
  1056 + Self._Config := Config;
880 end; 1057 end;
881 1058
882 destructor TSherpaOnnxOfflineRecognizer.Destroy; 1059 destructor TSherpaOnnxOfflineRecognizer.Destroy;
@@ -984,5 +1161,255 @@ begin @@ -984,5 +1161,255 @@ begin
984 [Self.Text, TokensStr, TimestampStr]); 1161 [Self.Text, TokensStr, TimestampStr]);
985 end; 1162 end;
986 1163
  1164 +function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
  1165 +begin
  1166 + Result := Format('TSherpaOnnxSileroVadModelConfig(' +
  1167 + 'Model := %s, ' +
  1168 + 'Threshold := %.2f, ' +
  1169 + 'MinSilenceDuration := %.2f, ' +
  1170 + 'MinSpeechDuration := %.2f, ' +
  1171 + 'WindowSize := %d' +
  1172 + ')',
  1173 + [Self.Model, Self.Threshold, Self.MinSilenceDuration,
  1174 + Self.MinSpeechDuration, Self.WindowSize
  1175 + ]);
  1176 +end;
  1177 +
  1178 +class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
  1179 +begin
  1180 + Dest.Threshold := 0.5;
  1181 + Dest.MinSilenceDuration := 0.5;
  1182 + Dest.MinSpeechDuration := 0.25;
  1183 + Dest.WindowSize := 512;
  1184 +end;
  1185 +
  1186 +function TSherpaOnnxVadModelConfig.ToString: AnsiString;
  1187 +begin
  1188 + Result := Format('TSherpaOnnxVadModelConfig(' +
  1189 + 'SileroVad := %s, ' +
  1190 + 'SampleRate := %d, ' +
  1191 + 'NumThreads := %d, ' +
  1192 + 'Provider := %s, ' +
  1193 + 'Debug := %s' +
  1194 + ')',
  1195 + [Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider,
  1196 + Self.Debug.ToString
  1197 + ]);
  1198 +end;
  1199 +
  1200 +class operator TSherpaOnnxVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
  1201 +begin
  1202 + Dest.SampleRate := 16000;
  1203 + Dest.NumThreads := 1;
  1204 + Dest.Provider := 'cpu';
  1205 + Dest.Debug := False;
  1206 +end;
  1207 +
  1208 +class operator TSherpaOnnxFeatureConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
  1209 +begin
  1210 + Dest.SampleRate := 16000;
  1211 + Dest.FeatureDim := 80;
  1212 +end;
  1213 +
  1214 +class operator TSherpaOnnxOnlineCtcFstDecoderConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
  1215 +begin
  1216 + Dest.MaxActive := 3000;
  1217 +end;
  1218 +
  1219 +class operator TSherpaOnnxOnlineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
  1220 +begin
  1221 + Dest.DecodingMethod := 'greedy_search';
  1222 + Dest.EnableEndpoint := False;
  1223 + Dest.Rule1MinTrailingSilence := 2.4;
  1224 + Dest.Rule2MinTrailingSilence := 1.2;
  1225 + Dest.Rule3MinUtteranceLength := 20;
  1226 + Dest.HotwordsScore := 1.5;
  1227 + Dest.BlankPenalty := 0;
  1228 +end;
  1229 +
  1230 +class operator TSherpaOnnxOnlineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
  1231 +begin
  1232 + Dest.NumThreads := 1;
  1233 + Dest.Provider := 'cpu';
  1234 + Dest.Debug := False;
  1235 +end;
  1236 +
  1237 +class operator TSherpaOnnxOfflineWhisperModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
  1238 +begin
  1239 + Dest.Task := 'transcribe';
  1240 + Dest.TailPaddings := -1;
  1241 +end;
  1242 +
  1243 +class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
  1244 +begin
  1245 + Dest.Scale := 1.0;
  1246 +end;
  1247 +
  1248 +class operator TSherpaOnnxOfflineSenseVoiceModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
  1249 +begin
  1250 + Dest.UseItn := True;
  1251 +end;
  1252 +
  1253 +class operator TSherpaOnnxOfflineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
  1254 +begin
  1255 + Dest.NumThreads := 1;
  1256 + Dest.Debug := False;
  1257 + Dest.Provider := 'cpu';
  1258 +end;
  1259 +
  1260 +class operator TSherpaOnnxOfflineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
  1261 +begin
  1262 + Dest.DecodingMethod := 'greedy_search';
  1263 + Dest.MaxActivePaths := 4;
  1264 + Dest.HotwordsScore := 1.5;
  1265 + Dest.BlankPenalty := 0;
  1266 +end;
  1267 +
  1268 +constructor TSherpaOnnxCircularBuffer.Create(Capacity: Integer);
  1269 +begin
  1270 + Self.Handle := SherpaOnnxCreateCircularBuffer(Capacity);
  1271 +end;
  1272 +
  1273 +destructor TSherpaOnnxCircularBuffer.Destroy;
  1274 +begin
  1275 + SherpaOnnxDestroyCircularBuffer(Self.Handle);
  1276 + Self.Handle := nil;
  1277 +end;
  1278 +
  1279 +procedure TSherpaOnnxCircularBuffer.Push(Samples: array of Single);
  1280 +begin
  1281 + SherpaOnnxCircularBufferPush(Self.Handle, pcfloat(Samples), Length(Samples));
  1282 +end;
  1283 +
  1284 +function TSherpaOnnxCircularBuffer.Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
  1285 +var
  1286 + P: pcfloat;
  1287 + I: Integer;
  1288 +begin
  1289 + P := SherpaOnnxCircularBufferGet(Self.Handle, StartIndex, N);
  1290 +
  1291 + Result := nil;
  1292 +
  1293 + SetLength(Result, N);
  1294 +
  1295 + for I := Low(Result) to High(Result) do
  1296 + Result[I] := P[I];
  1297 +
  1298 + SherpaOnnxCircularBufferFree(P);
  1299 +end;
  1300 +
  1301 +procedure TSherpaOnnxCircularBuffer.Pop(N: Integer);
  1302 +begin
  1303 + SherpaOnnxCircularBufferPop(Self.Handle, N);
  1304 +end;
  1305 +
  1306 +procedure TSherpaOnnxCircularBuffer.Reset;
  1307 +begin
  1308 + SherpaOnnxCircularBufferReset(Self.Handle);
  1309 +end;
  1310 +
  1311 +function TSherpaOnnxCircularBuffer.Size: Integer;
  1312 +begin
  1313 + Result := SherpaOnnxCircularBufferSize(Self.Handle);
  1314 +end;
  1315 +
  1316 +function TSherpaOnnxCircularBuffer.Head: Integer;
  1317 +begin
  1318 + Result := SherpaOnnxCircularBufferHead(Self.Handle);
  1319 +end;
  1320 +
  1321 +constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
  1322 +var
  1323 + C: SherpaOnnxVadModelConfig;
  1324 +begin
  1325 + Self._Config := Config;
  1326 +
  1327 + Initialize(C);
  1328 +
  1329 + C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
  1330 + C.SileroVad.Threshold := Config.SileroVad.Threshold;
  1331 + C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
  1332 + C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
  1333 + C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
  1334 +
  1335 + C.SampleRate := Config.SampleRate;
  1336 + C.NumThreads := Config.NumThreads;
  1337 + C.Provider := PAnsiChar(Config.Provider);
  1338 + C.Debug := Ord(Config.Debug);
  1339 +
  1340 + Self.Handle := SherpaOnnxCreateVoiceActivityDetector(@C, BufferSizeInSeconds);
  1341 +end;
  1342 +
  1343 +destructor TSherpaOnnxVoiceActivityDetector.Destroy;
  1344 +begin
  1345 + SherpaOnnxDestroyVoiceActivityDetector(Self.Handle);
  1346 + Self.Handle := nil;
  1347 +end;
  1348 +
  1349 +procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single);
  1350 +begin
  1351 + SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle, pcfloat(Samples), Length(Samples));
  1352 +end;
  1353 +
  1354 +procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer);
  1355 +begin
  1356 + if Offset + N > Length(Samples) then
  1357 + begin
  1358 + WriteLn(Format('Invalid arguments!. Array length: %d, Offset: %d, N: %d',
  1359 + [Length(Samples), Offset, N]
  1360 + ));
  1361 + Exit;
  1362 + end;
  1363 +
  1364 + SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle,
  1365 + pcfloat(Samples) + Offset, N);
  1366 +end;
  1367 +
  1368 +function TSherpaOnnxVoiceActivityDetector.IsEmpty: Boolean;
  1369 +begin
  1370 + Result := SherpaOnnxVoiceActivityDetectorEmpty(Self.Handle) = 1;
  1371 +end;
  1372 +
  1373 +function TSherpaOnnxVoiceActivityDetector.IsDetected: Boolean;
  1374 +begin
  1375 + Result := SherpaOnnxVoiceActivityDetectorDetected(Self.Handle) = 1;
  1376 +end;
  1377 +
  1378 +procedure TSherpaOnnxVoiceActivityDetector.Pop;
  1379 +begin
  1380 + SherpaOnnxVoiceActivityDetectorPop(Self.Handle);
  1381 +end;
  1382 +
  1383 +procedure TSherpaOnnxVoiceActivityDetector.Clear;
  1384 +begin
  1385 + SherpaOnnxVoiceActivityDetectorClear(Self.Handle);
  1386 +end;
  1387 +
  1388 +function TSherpaOnnxVoiceActivityDetector.Front: TSherpaOnnxSpeechSegment;
  1389 +var
  1390 + P: PSherpaOnnxSpeechSegment;
  1391 + I: Integer;
  1392 +begin
  1393 + P := SherpaOnnxVoiceActivityDetectorFront(Self.Handle);
  1394 + Result.Start := P^.Start;
  1395 + Result.Samples := nil;
  1396 + SetLength(Result.Samples, P^.N);
  1397 +
  1398 + for I := Low(Result.Samples) to High(Result.Samples) do
  1399 + Result.Samples[I] := P^.Samples[I];
  1400 +
  1401 + SherpaOnnxDestroySpeechSegment(P);
  1402 +end;
  1403 +
  1404 +procedure TSherpaOnnxVoiceActivityDetector.Reset;
  1405 +begin
  1406 + SherpaOnnxVoiceActivityDetectorReset(Self.Handle);
  1407 +end;
  1408 +
  1409 +procedure TSherpaOnnxVoiceActivityDetector.Flush;
  1410 +begin
  1411 + SherpaOnnxVoiceActivityDetectorFlush(Self.Handle);
  1412 +end;
  1413 +
987 end. 1414 end.
988 1415