Fangjun Kuang
Committed by GitHub

Pascal API for VAD (#1249)

... ... @@ -116,12 +116,54 @@ jobs:
cp -v install/lib/*.dll ../pascal-api-examples/read-wav
cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
cp -v install/lib/*.dll ../pascal-api-examples/vad
cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
fi
# Runs the two VAD + non-streaming ASR Pascal examples (Whisper first, then
# SenseVoice) from pascal-api-examples/vad-with-non-streaming-asr.
# The Free Pascal 3.2.2 win64 bin directory is prepended to PATH first.
# After each example, everything matching sherpa-onnx-* in that directory is
# removed (presumably the downloaded model directories -- confirm).
- name: Run Pascal test (VAD + non-streaming ASR)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd vad-with-non-streaming-asr
time ./run-vad-with-whisper.sh
rm -rf sherpa-onnx-*
echo "---"
time ./run-vad-with-sense-voice.sh
rm -rf sherpa-onnx-*
echo "---"
ls -lh
popd
# Runs the plain VAD Pascal examples from pascal-api-examples/vad:
# the circular buffer demo and the remove-silence demo.
# The Free Pascal 3.2.2 win64 bin directory is prepended to PATH first.
- name: Run Pascal test (VAD test)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd vad
./run-circular-buffer.sh
echo "---"
time ./run-remove-silence.sh
echo "---"
ls -lh
popd
- name: Run Pascal test (Read wav test)
shell: bash
run: |
... ...
... ... @@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|[read-wav](./read-wav)|It shows how to read a wave file.|
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|[vad](./vad)| It shows how to use the voice activity detection API.|
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
Config.ModelConfig.SenseVoice.Language := 'auto';
Config.ModelConfig.SenseVoice.UseItn := False;
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
... ...
... ... @@ -33,6 +33,8 @@ var
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';
... ...
!run-*.sh
vad_with_whisper
vad_with_sense_voice
... ...
# Introduction
This directory contains examples for how to use the VAD (voice activity detection)
with non-streaming speech recognition models.
|File| Description|
|---------|------------|
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
... ...
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the silero VAD model, a test wave file and the SenseVoice model if they are
# missing, then compiles and runs ./vad_with_sense_voice.pas with Free Pascal.
#
# The script can be started from any directory: it changes into its own
# directory first, so the relative model/wave/source paths always resolve.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Downloads and the compiled binary live next to this script.
cd "$SCRIPT_DIR"

# Build and install the C API only when none of the platform-specific
# shared libraries is present.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

# -Fu: unit search path (sherpa_onnx.pas); -Fl: library search path.
fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_sense_voice.pas

# Let the dynamic loader find the shared library on Linux and macOS.
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./vad_with_sense_voice
... ...
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the silero VAD model, a test wave file and the Whisper tiny.en model if they
# are missing, then compiles and runs ./vad_with_whisper.pas with Free Pascal.
#
# The script can be started from any directory: it changes into its own
# directory first, so the relative model/wave/source paths always resolve.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Downloads and the compiled binary live next to this script.
cd "$SCRIPT_DIR"

# Build and install the C API only when none of the platform-specific
# shared libraries is present.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./Obama.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
fi

if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

# -Fu: unit search path (sherpa_onnx.pas); -Fl: library search path.
fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_whisper.pas

# Let the dynamic loader find the shared library on Linux and macOS.
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./vad_with_whisper
... ...
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use a non-streaming SenseVoice model
with silero VAD to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
{ The program name matches the source file (vad_with_sense_voice.pas) and the
  SenseVoice model used below; it was previously a copy of the Whisper
  example's name. }
program vad_with_sense_voice;
{$mode objfpc}
uses
  sherpa_onnx,
  SysUtils;
{ Builds a silero-VAD based voice activity detector that reads
  ./silero_vad.onnx and runs on the CPU with a single thread.
  The caller owns the returned object and must free it. }
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  { Please don't change these two unless you know the details }
  ExpectedSampleRate = 16000;
  SamplesPerWindow = 512;
  { Second argument of the detector constructor (BufferSizeInSeconds) }
  DetectorBufferSeconds = 30;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  Initialize(VadConfig);

  { General settings }
  VadConfig.SampleRate := ExpectedSampleRate;
  VadConfig.Provider := 'cpu';
  VadConfig.NumThreads := 1;
  VadConfig.Debug := True;

  { silero VAD specific settings }
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;

  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, DetectorBufferSeconds);
end;
{ Builds a non-streaming recognizer that uses the int8 SenseVoice model from
  ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 on the CPU with one
  thread. The caller owns the returned object and must free it. }
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Runtime settings }
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Debug := False;

  { Model files }
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt';
  RecognizerConfig.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
  RecognizerConfig.ModelConfig.SenseVoice.Language := 'auto';
  RecognizerConfig.ModelConfig.SenseVoice.UseItn := False;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
var
  Wave: TSherpaOnnxWave;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;

{ Pops every speech segment currently queued in Vad, decodes it with
  Recognizer and prints one line per segment:
    <start seconds> -- <end seconds> <recognized text>
  Uses the global Vad, Recognizer and Wave variables. }
procedure DecodePendingSegments;
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    Stream := Recognizer.CreateStream();
    Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
    Recognizer.Decode(Stream);
    RecognitionResult := Recognizer.GetResult(Stream);

    { SpeechSegment.Start is a sample index; convert to seconds }
    Start := SpeechSegment.Start / Wave.SampleRate;
    Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
    WriteLn(Format('%.3f -- %.3f %s',
      [Start, Start + Duration, RecognitionResult.Text]));

    FreeAndNil(Stream);
  end;
end;

begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');

  if Wave.SampleRate <> Vad.Config.SampleRate then
    { Nothing is decoded on a mismatch, but the objects created above are
      still released below. }
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [Vad.Config.SampleRate, Wave.SampleRate]))
  else
  begin
    WindowSize := Vad.Config.SileroVad.WindowSize;

    { Feed the wave to the VAD one full window at a time; a trailing partial
      window is not fed. }
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      { Inc() instead of "+=" so the program does not depend on the
        C-operators compiler switch. }
      Inc(Offset, WindowSize);
      DecodePendingSegments;
    end;

    { Flush, then decode whatever segments it leaves in the queue. }
    Vad.Flush;
    DecodePendingSegments;
  end;

  FreeAndNil(Recognizer);
  FreeAndNil(Vad);
end.
... ...
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use a non-streaming Whisper model
with silero VAD to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program vad_with_whisper;
{$mode objfpc}
uses
sherpa_onnx,
SysUtils;
{ Creates the voice activity detector used by this example: silero VAD loaded
  from ./silero_vad.onnx, CPU provider, one thread, debug output enabled.
  The caller is responsible for freeing the returned object. }
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  { Please don't change these two unless you know the details }
  VadSampleRate = 16000;
  VadWindowSize = 512;
  { Passed as BufferSizeInSeconds to the detector constructor }
  VadBufferSeconds = 30;
var
  Cfg: TSherpaOnnxVadModelConfig;
begin
  Initialize(Cfg);

  Cfg.SileroVad.Model := './silero_vad.onnx';
  Cfg.SileroVad.Threshold := 0.5;
  Cfg.SileroVad.MinSilenceDuration := 0.5;
  Cfg.SileroVad.MinSpeechDuration := 0.5;
  Cfg.SileroVad.WindowSize := VadWindowSize;

  Cfg.SampleRate := VadSampleRate;
  Cfg.NumThreads := 1;
  Cfg.Provider := 'cpu';
  Cfg.Debug := True;

  Result := TSherpaOnnxVoiceActivityDetector.Create(Cfg, VadBufferSeconds);
end;
{ Creates a non-streaming recognizer backed by the int8 Whisper tiny.en
  encoder/decoder found in ./sherpa-onnx-whisper-tiny.en (CPU, one thread).
  The caller is responsible for freeing the returned object. }
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  Cfg: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(Cfg);

  { Runtime settings }
  Cfg.ModelConfig.NumThreads := 1;
  Cfg.ModelConfig.Provider := 'cpu';
  Cfg.ModelConfig.Debug := False;

  { Model files }
  Cfg.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
  Cfg.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
  Cfg.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';

  Result := TSherpaOnnxOfflineRecognizer.Create(Cfg);
end;
var
  Wave: TSherpaOnnxWave;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;

{ Pops every speech segment currently queued in Vad, decodes it with
  Recognizer and prints one line per segment:
    <start seconds> -- <end seconds> <recognized text>
  Uses the global Vad, Recognizer and Wave variables. }
procedure DecodePendingSegments;
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    Stream := Recognizer.CreateStream();
    Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
    Recognizer.Decode(Stream);
    RecognitionResult := Recognizer.GetResult(Stream);

    { SpeechSegment.Start is a sample index; convert to seconds }
    Start := SpeechSegment.Start / Wave.SampleRate;
    Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
    WriteLn(Format('%.3f -- %.3f %s',
      [Start, Start + Duration, RecognitionResult.Text]));

    FreeAndNil(Stream);
  end;
end;

begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();

  Wave := SherpaOnnxReadWave('./Obama.wav');

  if Wave.SampleRate <> Vad.Config.SampleRate then
    { Nothing is decoded on a mismatch, but the objects created above are
      still released below. }
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [Vad.Config.SampleRate, Wave.SampleRate]))
  else
  begin
    WindowSize := Vad.Config.SileroVad.WindowSize;

    { Feed the wave to the VAD one full window at a time; a trailing partial
      window is not fed. }
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      { Inc() instead of "+=" so the program does not depend on the
        C-operators compiler switch. }
      Inc(Offset, WindowSize);
      DecodePendingSegments;
    end;

    { Flush, then decode whatever segments it leaves in the queue. }
    Vad.Flush;
    DecodePendingSegments;
  end;

  FreeAndNil(Recognizer);
  FreeAndNil(Vad);
end.
... ...
!run*.sh
circular_buffer
remove_silence
... ...
# Introduction
This directory contains examples for how to use the VAD (voice activity detection)
APIs.
|File| Description|
|---------|------------|
|[run-circular-buffer.sh](./run-circular-buffer.sh)|It shows how to use the circular buffer API.|
|[run-remove-silence.sh](./run-remove-silence.sh)|It shows how to use the VAD API to remove silences from a wave file.|
... ...
{ Copyright (c) 2024 Xiaomi Corporation }
program circular_buffer;
{
This file shows how to use the CircularBuffer API of sherpa-onnx
}
{$mode objfpc}
{$ASSERTIONS ON}
uses
sherpa_onnx;
var
  Buffer: TSherpaOnnxCircularBuffer;
  Samples: TSherpaOnnxSamplesArray;
begin
  { The initial capacity is 5. It will be resized automatically if needed. }
  Buffer := TSherpaOnnxCircularBuffer.Create(5);
  Assert(Buffer.Size = 0);
  Assert(Buffer.Head = 0);

  Buffer.Push([0, 10, 20]);
  { Push() changes Size. Head is not changed. }
  Assert(Buffer.Size = 3);
  Assert(Buffer.Head = 0);

  Samples := Buffer.Get(0, 1);
  Assert(Length(Samples) = 1);
  Assert(Samples[0] = 0);
  { Get() does not change Size or Head }
  Assert(Buffer.Size = 3);
  Assert(Buffer.Head = 0);

  Samples := Buffer.Get(0, 2);
  Assert(Length(Samples) = 2);
  Assert(Samples[0] = 0);
  Assert(Samples[1] = 10);

  { The buffer will be resized since its initial capacity is 5 but it now
    has to hold 7 elements.
    No data is lost during the resize.
  }
  Buffer.Push([30, 40, 50, 60]);
  Assert(Buffer.Size = 7); {There are now 7 elements}
  Assert(Buffer.Head = 0);

  { Remove the first 4 elements. Pop(N) advances Head by N. }
  Buffer.Pop(4);
  Assert(Buffer.Size = 3); {There are only 3 elements left}
  Assert(Buffer.Head = 4);

  { Get() takes an absolute start index, so pass Head to read the oldest
    remaining elements. }
  Samples := Buffer.Get(Buffer.Head, 2);
  Assert(Length(Samples) = 2);
  Assert(Samples[0] = 40);
  Assert(Samples[1] = 50);

  Buffer.Pop(1);
  Assert(Buffer.Size = 2); {There are only 2 elements left}
  Assert(Buffer.Head = 5);

  Samples := Buffer.Get(Buffer.Head, 2);
  Assert(Length(Samples) = 2);
  Assert(Samples[0] = 50);
  Assert(Samples[1] = 60);

  Buffer.Pop(2);
  Assert(Buffer.Size = 0); {There are no elements left}
  Assert(Buffer.Head = 7);

  { Head keeps its value when the buffer becomes empty and is refilled. }
  Buffer.Push([100, 200, 300, 400, 500]);
  Assert(Buffer.Size = 5);
  Assert(Buffer.Head = 7);

  Buffer.Pop(4);
  Assert(Buffer.Size = 1);
  { Head can be larger than the Capacity!
    This is what circular means: Head only ever grows, and the underlying
    slot is presumably Buffer.Head mod Capacity (the remainder, not the
    quotient).
  }
  Assert(Buffer.Head = 11);

  Buffer.Push([600, 700]);
  Assert(Buffer.Size = 3);
  Assert(Buffer.Head = 11);

  Samples := Buffer.Get(Buffer.Head, 3);
  Assert(Length(Samples) = 3);
  Assert(Samples[0] = 500);
  Assert(Samples[1] = 600);
  Assert(Samples[2] = 700);

  Buffer.Pop(3);
  Assert(Buffer.Size = 0);
  Assert(Buffer.Head = 14);

  { Reset() sets both Size and Head back to 0. }
  Buffer.Reset();
  Assert(Buffer.Size = 0);
  Assert(Buffer.Head = 0);

  { Release the buffer, as the other examples do for the objects they
    create. Free is used because this program does not use SysUtils. }
  Buffer.Free;
end.
... ...
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use the VAD API from sherpa-onnx
to remove silences from a wave file.
}
program main;
{$mode delphi}
uses
sherpa_onnx,
SysUtils;
var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxVadModelConfig;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;
  SampleRate: Integer;
  AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  AllSamples: array of Single;
  N: Integer;
  I: Integer;

{ Pops every speech segment currently queued in Vad, appends it to
  AllSpeechSegment and prints "<start seconds> -- <end seconds>" for it.
  Uses the global Vad, SampleRate and AllSpeechSegment variables. }
procedure CollectPendingSegments;
var
  Segment: TSherpaOnnxSpeechSegment;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    Segment := Vad.Front();
    Vad.Pop();

    SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
    AllSpeechSegment[High(AllSpeechSegment)] := Segment;

    { Segment.Start is a sample index; convert to seconds }
    Start := Segment.Start / SampleRate;
    Duration := Length(Segment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  end;
end;

begin
  SampleRate := 16000; {Please don't change it unless you know the details}

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [SampleRate, Wave.SampleRate]));
    Exit;
  end;

  WindowSize := 512; {Please don't change it unless you know the details}

  Initialize(Config);
  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.MinSpeechDuration := 0.25;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.WindowSize := WindowSize;
  Config.NumThreads:= 1;
  Config.Debug:= True;
  Config.Provider:= 'cpu';
  Config.SampleRate := SampleRate;

  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);

  AllSpeechSegment := nil;
  AllSamples := nil;

  { Feed the wave to the VAD one full window at a time; a trailing partial
    window is not fed. }
  Offset := 0;
  while Offset + WindowSize <= Length(Wave.Samples) do
  begin
    Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
    Inc(Offset, WindowSize);
    CollectPendingSegments;
  end;

  { Flush, then collect whatever segments it leaves in the queue. }
  Vad.Flush;
  CollectPendingSegments;

  { First pass: total number of speech samples, to size AllSamples once. }
  N := 0;
  for SpeechSegment in AllSpeechSegment do
    Inc(N, Length(SpeechSegment.Samples));

  SetLength(AllSamples, N);

  { Second pass: concatenate the segments in order. }
  N := 0;
  for SpeechSegment in AllSpeechSegment do
  begin
    for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
    begin
      AllSamples[N] := SpeechSegment.Samples[I];
      Inc(N);
    end;
  end;

  { SherpaOnnxWriteWave returns False on failure; only claim success when the
    file was actually written. }
  if SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate) then
    WriteLn('Saved to ./lei-jun-test-no-silence.wav')
  else
  begin
    WriteLn('Failed to save ./lei-jun-test-no-silence.wav');
    ExitCode := 1;
  end;

  FreeAndNil(Vad);
end.
... ...
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, then
# compiles and runs ./circular_buffer.pas with Free Pascal.
#
# The script can be started from any directory: it changes into its own
# directory first, so the relative source path always resolves.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The source file and the compiled binary live next to this script.
cd "$SCRIPT_DIR"

# Build and install the C API only when none of the platform-specific
# shared libraries is present.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# -Fu: unit search path (sherpa_onnx.pas); -Fl: library search path.
fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./circular_buffer.pas

# Let the dynamic loader find the shared library on Linux and macOS.
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./circular_buffer
... ...
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the silero VAD model and a test wave file if they are missing, then compiles
# and runs ./remove_silence.pas with Free Pascal.
#
# The script can be started from any directory: it changes into its own
# directory first, so the relative model/wave/source paths always resolve.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Downloads and the compiled binary live next to this script.
cd "$SCRIPT_DIR"

# Build and install the C API only when none of the platform-specific
# shared libraries is present.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

# -Fu: unit search path (sherpa_onnx.pas); -Fl: library search path.
fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./remove_silence.pas

# Let the dynamic loader find the shared library on Linux and macOS.
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./remove_silence
... ...
... ... @@ -95,6 +95,8 @@ void CircularBuffer::Push(const float *p, int32_t n) {
"capacity to: %d",
n, size, n + size, capacity, new_capacity);
Resize(new_capacity);
capacity = new_capacity;
}
int32_t start = tail_ % capacity;
... ...
... ... @@ -2,9 +2,11 @@
unit sherpa_onnx;
{$mode objfpc}
{$IFDEF FPC}
{$mode objfpc}
{$modeSwitch advancedRecords} { to support records with methods }
{$ENDIF}
{$modeSwitch advancedRecords} { to support records with methods }
(* {$LongStrings ON} *)
interface
... ... @@ -45,18 +47,21 @@ type
ModelingUnit: AnsiString;
BpeVocab: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
end;
{ Feature settings record: a sample rate and a feature dimension.
  ToString returns a textual form of the record.
  The Initialize management operator runs when a variable of this type is
  initialized (presumably to fill in default values -- see the
  implementation section to confirm). }
TSherpaOnnxFeatureConfig = record
SampleRate: Integer;
FeatureDim: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
end;
{ Settings record for the online CTC FST decoder: a graph string (presumably
  a file path -- confirm) and a MaxActive value.
  ToString returns a textual form of the record.
  The Initialize management operator runs when a variable of this type is
  initialized (presumably to fill in default values -- see the
  implementation section to confirm). }
TSherpaOnnxOnlineCtcFstDecoderConfig = record
Graph: AnsiString;
MaxActive: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
end;
TSherpaOnnxOnlineRecognizerConfig = record
... ... @@ -75,6 +80,7 @@ type
RuleFars: AnsiString;
BlankPenalty: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
end;
TSherpaOnnxOnlineRecognizerResult = record
... ... @@ -97,6 +103,7 @@ type
TSherpaOnnxOnlineRecognizer = class
private
Handle: Pointer;
_Config: TSherpaOnnxOnlineRecognizerConfig;
public
constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig);
destructor Destroy; override;
... ... @@ -108,6 +115,7 @@ type
procedure Reset(Stream: TSherpaOnnxOnlineStream);
function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
property Config: TSherpaOnnxOnlineRecognizerConfig Read _Config;
end;
TSherpaOnnxOfflineTransducerModelConfig = record
... ... @@ -134,6 +142,7 @@ type
Task: AnsiString;
TailPaddings: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
end;
TSherpaOnnxOfflineTdnnModelConfig = record
... ... @@ -145,12 +154,14 @@ type
Model: AnsiString;
Scale: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
end;
{ Settings for a non-streaming SenseVoice model.
  Model    - path to the model file (the examples use .../model.int8.onnx).
  Language - language selector; the examples pass 'auto'.
  UseItn   - Boolean switch; the examples pass False (presumably inverse
             text normalization -- confirm).
  The Initialize management operator runs when a variable of this type is
  initialized (presumably to fill in default values -- confirm). }
TSherpaOnnxOfflineSenseVoiceModelConfig = record
Model: AnsiString;
Language: AnsiString;
UseItn: Boolean;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
function ToString: AnsiString;
end;
... ... @@ -169,6 +180,7 @@ type
BpeVocab: AnsiString;
TeleSpeechCtc: AnsiString;
SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
function ToString: AnsiString;
end;
... ... @@ -183,6 +195,7 @@ type
RuleFsts: AnsiString;
RuleFars: AnsiString;
BlankPenalty: Single;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
function ToString: AnsiString;
end;
... ... @@ -205,18 +218,83 @@ type
{ Non-streaming (offline) speech recognizer.
  Typical use, as in the examples: CreateStream, feed samples to the stream,
  Decode(Stream), then GetResult(Stream). The examples free each stream
  themselves after reading the result. }
TSherpaOnnxOfflineRecognizer = class
private
{ Presumably the handle of the underlying native recognizer -- confirm in
  the implementation section. }
Handle: Pointer;
{ Backing field of the read-only Config property. }
_Config: TSherpaOnnxOfflineRecognizerConfig;
public
constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig);
destructor Destroy; override;
function CreateStream: TSherpaOnnxOfflineStream;
procedure Decode(Stream: TSherpaOnnxOfflineStream);
function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
{ Read-only access to the stored configuration. }
property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config;
end;
{ It supports reading a single channel wave with 16-bit encoded samples.
Samples are normalized to the range [-1, 1].
}
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
{ Settings for the silero VAD model.
  Model              - path to the model file (examples: './silero_vad.onnx').
  Threshold          - detection threshold; the examples use 0.5.
  MinSilenceDuration - examples use 0.5 (presumably seconds -- confirm).
  MinSpeechDuration  - examples use 0.25 or 0.5 (presumably seconds -- confirm).
  WindowSize         - number of samples per window; the examples use 512 and
                       feed the detector WindowSize samples at a time.
  The Initialize management operator runs when a variable of this type is
  initialized (presumably to fill in default values -- confirm). }
TSherpaOnnxSileroVadModelConfig = record
Model: AnsiString;
Threshold: Single;
MinSilenceDuration: Single;
MinSpeechDuration: Single;
WindowSize: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
end;
{ Top-level VAD configuration passed to TSherpaOnnxVoiceActivityDetector.Create.
  SileroVad  - settings of the silero VAD model.
  SampleRate - sample rate of the input audio; the examples use 16000 and
               reject wave files with a different rate.
  NumThreads - the examples use 1.
  Provider   - the examples use 'cpu'.
  Debug      - Boolean switch; the examples pass True.
  The Initialize management operator runs when a variable of this type is
  initialized (presumably to fill in default values -- confirm). }
TSherpaOnnxVadModelConfig = record
SileroVad: TSherpaOnnxSileroVadModelConfig;
SampleRate: Integer;
NumThreads: Integer;
Provider: AnsiString;
Debug: Boolean;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
end;
{ Dynamic array of samples, the result type of TSherpaOnnxCircularBuffer.Get. }
TSherpaOnnxSamplesArray = array of Single;
{ Circular buffer of Single samples.
  Behaviour as exercised by the circular_buffer example:
  - Create(Capacity) sets the initial capacity; the buffer grows when more
    samples are pushed than fit, without losing data.
  - Push appends samples and increases Size; Head is unchanged.
  - Get(StartIndex, N) returns N samples starting at the absolute index
    StartIndex; it changes neither Size nor Head.
  - Pop(N) removes the N oldest samples: Size decreases and Head increases
    by N. Head keeps growing and can exceed the capacity.
  - Reset sets both Size and Head to 0. }
TSherpaOnnxCircularBuffer = class
private
{ Presumably the handle of the underlying native buffer -- confirm in the
  implementation section. }
Handle: Pointer;
public
constructor Create(Capacity: Integer);
destructor Destroy; override;
procedure Push(Samples: array of Single);
function Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
procedure Pop(N: Integer);
procedure Reset;
function Size: Integer;
function Head: Integer;
end;
{ One speech segment returned by TSherpaOnnxVoiceActivityDetector.Front.
  Samples - the audio samples of the segment.
  Start   - start position of the segment as a sample index; the examples
            divide it by the sample rate to get seconds. }
TSherpaOnnxSpeechSegment = record
Samples: array of Single;
Start: Integer;
end;
{ Voice activity detector.
  Usage pattern in the examples: call AcceptWaveform with WindowSize samples
  at a time; while IsEmpty is False, read a segment with Front and remove it
  with Pop. After the last window call Flush and drain the queue once more. }
TSherpaOnnxVoiceActivityDetector = class
private
{ Presumably the handle of the underlying native detector -- confirm in the
  implementation section. }
Handle: Pointer;
{ Backing field of the read-only Config property. }
_Config: TSherpaOnnxVadModelConfig;
public
constructor Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
destructor Destroy; override;
{ Feeds all of Samples. }
procedure AcceptWaveform(Samples: array of Single); overload;
{ Feeds N samples of Samples starting at index Offset. }
procedure AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer); overload;
{ True when no speech segment is queued. }
function IsEmpty: Boolean;
function IsDetected: Boolean;
{ Removes the segment that Front returns. }
procedure Pop;
procedure Clear;
{ Returns the first queued speech segment without removing it. }
function Front: TSherpaOnnxSpeechSegment;
procedure Reset;
{ Called by the examples once after the last AcceptWaveform; afterwards they
  drain the queue again. }
procedure Flush;
{ Read-only access to the stored configuration. }
property Config: TSherpaOnnxVadModelConfig Read _Config;
end;
{ It supports reading a single channel wave with 16-bit encoded samples.
Samples are normalized to the range [-1, 1].
}
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
function SherpaOnnxWriteWave(Filename: AnsiString;
Samples: array of Single; SampleRate: Integer): Boolean;
implementation
... ... @@ -294,15 +372,15 @@ type
DecodingMethod: PAnsiChar;
MaxActivePaths: cint32;
EnableEndpoint: cint32;
Rule1MinTrailingSilence: Single;
Rule2MinTrailingSilence: Single;
Rule3MinUtteranceLength: Single;
Rule1MinTrailingSilence: cfloat;
Rule2MinTrailingSilence: cfloat;
Rule3MinUtteranceLength: cfloat;
HotwordsFile: PAnsiChar;
HotwordsScore: Single;
HotwordsScore: cfloat;
CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig;
RuleFsts: PAnsiChar;
RuleFars: PAnsiChar;
BlankPenalty: Single;
BlankPenalty: cfloat;
end;
PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig;
... ... @@ -330,7 +408,7 @@ type
end;
SherpaOnnxOfflineLMConfig = record
Model: PAnsiChar;
Scale: Single;
Scale: cfloat;
end;
SherpaOnnxOfflineSenseVoiceModelConfig = record
Model: PAnsiChar;
... ... @@ -361,14 +439,100 @@ type
DecodingMethod: PAnsiChar;
MaxActivePaths: cint32;
HotwordsFile: PAnsiChar;
HotwordsScore: Single;
HotwordsScore: cfloat;
RuleFsts: PAnsiChar;
RuleFars: PAnsiChar;
BlankPenalty: Single;
BlankPenalty: cfloat;
end;
PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig;
{ C-typed counterpart of TSherpaOnnxSileroVadModelConfig (PAnsiChar, cfloat,
  cint32 fields). It is embedded in SherpaOnnxVadModelConfig, which is passed
  by pointer to the native library, so field order and types presumably must
  match the C struct -- do not reorder. }
SherpaOnnxSileroVadModelConfig = record
Model: PAnsiChar;
Threshold: cfloat;
MinSilenceDuration: cfloat;
MinSpeechDuration: cfloat;
WindowSize: cint32;
end;
{ C-typed counterpart of TSherpaOnnxVadModelConfig. A pointer to it
  (PSherpaOnnxVadModelConfig) is the first argument of
  SherpaOnnxCreateVoiceActivityDetector, so field order and types presumably
  must match the C struct -- do not reorder. Debug is a cint32, not a
  Boolean. }
SherpaOnnxVadModelConfig = record
SileroVad: SherpaOnnxSileroVadModelConfig;
SampleRate: cint32;
NumThreads: cint32;
Provider: PAnsiChar;
Debug: cint32;
end;
PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
{ C-typed speech segment. SherpaOnnxVoiceActivityDetectorFront returns a
  pointer to one (PSherpaOnnxSpeechSegment) and
  SherpaOnnxDestroySpeechSegment takes such a pointer.
  Start   - start position (a sample index in the Pascal-level record).
  Samples - pointer to the first sample.
  N       - presumably the number of samples at Samples -- confirm. }
SherpaOnnxSpeechSegment = record
Start: cint32;
Samples: pcfloat;
N: cint32;
end;
PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
{ Imports of the voice activity detector functions from the native library
  (SherpaOnnxLibName), all using the cdecl calling convention.
  "Vad" is the untyped pointer returned by
  SherpaOnnxCreateVoiceActivityDetector. }
function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
BufferSizeInSeconds: cfloat): Pointer; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroyVoiceActivityDetector(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorAcceptWaveform(Vad: Pointer;
Samples: pcfloat; N: cint32); cdecl;
external SherpaOnnxLibName;
{ The two functions below return a cint32 rather than a Boolean. }
function SherpaOnnxVoiceActivityDetectorEmpty(Vad: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxVoiceActivityDetectorDetected(Vad: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorPop(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorClear(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
{ Returns a pointer to a segment; SherpaOnnxDestroySpeechSegment takes such a
  pointer (presumably the caller must release it that way -- confirm). }
function SherpaOnnxVoiceActivityDetectorFront(Vad: Pointer): PSherpaOnnxSpeechSegment; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxDestroySpeechSegment(P: PSherpaOnnxSpeechSegment); cdecl;
external SherpaOnnxLibName;
{ Reset and Flush operate on the detector itself, so they take the detector
  handle (the untyped pointer returned by
  SherpaOnnxCreateVoiceActivityDetector), like every other
  SherpaOnnxVoiceActivityDetector* import. The parameter was previously
  declared as a speech-segment pointer, a copy-paste slip from
  SherpaOnnxDestroySpeechSegment. }
procedure SherpaOnnxVoiceActivityDetectorReset(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;

procedure SherpaOnnxVoiceActivityDetectorFlush(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;
{ Imports of the native circular-buffer functions (cdecl, resolved from
  SherpaOnnxLibName). The buffer is represented by an untyped Pointer. }

{ Creates a buffer; the returned pointer is kept as the Handle of
  TSherpaOnnxCircularBuffer. NOTE(review): Capacity is presumably counted in
  samples - confirm. }
function SherpaOnnxCreateCircularBuffer(Capacity: cint32): Pointer; cdecl;
external SherpaOnnxLibName;
{ Releases a buffer obtained from SherpaOnnxCreateCircularBuffer. }
procedure SherpaOnnxDestroyCircularBuffer(Buffer: Pointer) ; cdecl;
external SherpaOnnxLibName;
{ Hands N float samples starting at Samples to the buffer. }
procedure SherpaOnnxCircularBufferPush(Buffer: Pointer; Samples: pcfloat; N: cint32); cdecl;
external SherpaOnnxLibName;
{ Returns a pointer from which the wrapper reads N floats; the wrapper
  releases it with SherpaOnnxCircularBufferFree afterwards. }
function SherpaOnnxCircularBufferGet(Buffer: Pointer; StartIndex: cint32; N: cint32): pcfloat ; cdecl;
external SherpaOnnxLibName;
{ Releases a pointer returned by SherpaOnnxCircularBufferGet. }
procedure SherpaOnnxCircularBufferFree(P: pcfloat); cdecl;
external SherpaOnnxLibName;
{ NOTE(review): presumably drops N samples from the buffer - confirm. }
procedure SherpaOnnxCircularBufferPop(Buffer: Pointer; N: cint32); cdecl;
external SherpaOnnxLibName;
{ Integer query exposed unchanged as TSherpaOnnxCircularBuffer.Size. }
function SherpaOnnxCircularBufferSize(Buffer: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
{ Integer query exposed unchanged as TSherpaOnnxCircularBuffer.Head. }
function SherpaOnnxCircularBufferHead(Buffer: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
{ Exposed as TSherpaOnnxCircularBuffer.Reset. }
procedure SherpaOnnxCircularBufferReset(Buffer: Pointer); cdecl;
external SherpaOnnxLibName;
{ Native constructor for the online recognizer (cdecl, resolved from
  SherpaOnnxLibName). Takes a pointer to the raw configuration record and
  returns an untyped pointer that the wrapper stores as its Handle. }
function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
external SherpaOnnxLibName;
... ... @@ -437,9 +601,20 @@ procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl;
{ Imports of the native wave-file helpers. The Pascal identifiers carry a
  Wrapper suffix and bind to the native symbols through an explicit name
  clause, which leaves the unsuffixed names SherpaOnnxReadWave and
  SherpaOnnxWriteWave free for the Pascal-friendly functions of this unit. }

{ Reads the file named by Filename and returns a pointer to a wave record. }
function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxReadWave';
{ Writes N float samples to Filename. SherpaOnnxWriteWave treats a return
  value of 1 as success. }
function SherpaOnnxWriteWaveWrapper(Samples: pcfloat; N: cint32;
SampleRate: cint32; Filename: PAnsiChar): cint32; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxWriteWave';
{ NOTE(review): presumably releases a record returned by
  SherpaOnnxReadWaveWrapper - confirm against the caller. }
procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
{ Writes Samples to the file Filename with the given SampleRate by calling
  the native wave writer. Returns True exactly when the native call
  reports 1. }
function SherpaOnnxWriteWave(Filename: AnsiString;
  Samples: array of Single; SampleRate: Integer): Boolean;
var
  Status: cint32;
begin
  Status := SherpaOnnxWriteWaveWrapper(pcfloat(Samples), Length(Samples),
    SampleRate, PAnsiChar(Filename));
  Result := (Status = 1);
end;
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
var
PFilename: PAnsiChar;
... ... @@ -611,6 +786,7 @@ begin
C.BlankPenalty := Config.BlankPenalty;
Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C);
Self._Config := Config;
end;
destructor TSherpaOnnxOnlineRecognizer.Destroy;
... ... @@ -877,6 +1053,7 @@ begin
C.BlankPenalty := Config.BlankPenalty;
Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C);
Self._Config := Config;
end;
destructor TSherpaOnnxOfflineRecognizer.Destroy;
... ... @@ -984,5 +1161,255 @@ begin
[Self.Text, TokensStr, TimestampStr]);
end;
{ Renders all fields of the record as one human-readable line; floating
  point fields are printed with two decimals. }
function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
const
  Layout = 'TSherpaOnnxSileroVadModelConfig(' +
    'Model := %s, Threshold := %.2f, ' +
    'MinSilenceDuration := %.2f, MinSpeechDuration := %.2f, ' +
    'WindowSize := %d)';
begin
  Result := Format(Layout, [Model, Threshold, MinSilenceDuration,
    MinSpeechDuration, WindowSize]);
end;
{ Management operator: gives every new TSherpaOnnxSileroVadModelConfig its
  default values. Model is not assigned here. }
class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
begin
  Dest.WindowSize := 512;
  Dest.MinSpeechDuration := 0.25;
  Dest.MinSilenceDuration := 0.5;
  Dest.Threshold := 0.5;
end;
{ Renders the configuration, including the nested Silero VAD options, as
  one human-readable line. }
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
const
  Layout = 'TSherpaOnnxVadModelConfig(' +
    'SileroVad := %s, SampleRate := %d, NumThreads := %d, ' +
    'Provider := %s, Debug := %s)';
var
  SileroText: AnsiString;
begin
  SileroText := Self.SileroVad.ToString;
  Result := Format(Layout, [SileroText, Self.SampleRate, Self.NumThreads,
    Self.Provider, Self.Debug.ToString]);
end;
{ Management operator: gives every new TSherpaOnnxVadModelConfig its
  default values. SileroVad is not assigned here. }
class operator TSherpaOnnxVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
begin
  Dest.Debug := False;
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
  Dest.SampleRate := 16000;
end;
{ Management operator: default feature settings are a sample rate of 16000
  and a feature dimension of 80. }
class operator TSherpaOnnxFeatureConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
begin
  Dest.FeatureDim := 80;
  Dest.SampleRate := 16000;
end;
{ Management operator: MaxActive defaults to 3000; no other field is
  assigned here. }
class operator TSherpaOnnxOnlineCtcFstDecoderConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
begin
  Dest.MaxActive := 3000;
end;
{ Management operator: default values of the online recognizer
  configuration. Fields that are not listed are not assigned here. }
class operator TSherpaOnnxOnlineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
begin
  { Decoding. }
  Dest.DecodingMethod := 'greedy_search';
  Dest.HotwordsScore := 1.5;
  Dest.BlankPenalty := 0;

  { Endpointing is off by default; the rule values below are the defaults
    of the three endpoint rules. }
  Dest.EnableEndpoint := False;
  Dest.Rule1MinTrailingSilence := 2.4;
  Dest.Rule2MinTrailingSilence := 1.2;
  Dest.Rule3MinUtteranceLength := 20;
end;
{ Management operator: a new online model configuration uses one thread,
  the 'cpu' provider and has Debug switched off. }
class operator TSherpaOnnxOnlineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
begin
  Dest.Debug := False;
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
end;
{ Management operator: Task defaults to 'transcribe' and TailPaddings
  to -1. }
class operator TSherpaOnnxOfflineWhisperModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
begin
  Dest.TailPaddings := -1;
  Dest.Task := 'transcribe';
end;
{ Management operator: Scale defaults to 1.0; no other field is assigned
  here. }
class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
begin
  Dest.Scale := 1.0;
end;
{ Management operator: UseItn is switched on by default; no other field is
  assigned here. }
class operator TSherpaOnnxOfflineSenseVoiceModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
begin
  Dest.UseItn := True;
end;
{ Management operator: a new offline model configuration uses one thread,
  the 'cpu' provider and has Debug switched off. }
class operator TSherpaOnnxOfflineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
{ Management operator: default values of the offline recognizer
  configuration. Fields that are not listed are not assigned here. }
class operator TSherpaOnnxOfflineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
begin
  Dest.BlankPenalty := 0;
  Dest.HotwordsScore := 1.5;
  Dest.MaxActivePaths := 4;
  Dest.DecodingMethod := 'greedy_search';
end;
{ Creates the native circular buffer with the requested Capacity and keeps
  the pointer it returns in Handle. }
constructor TSherpaOnnxCircularBuffer.Create(Capacity: Integer);
var
  NativeBuffer: Pointer;
begin
  NativeBuffer := SherpaOnnxCreateCircularBuffer(Capacity);
  Handle := NativeBuffer;
end;
{ Releases the native circular buffer and clears Handle so that the stale
  pointer cannot be reused afterwards. }
destructor TSherpaOnnxCircularBuffer.Destroy;
begin
  SherpaOnnxDestroyCircularBuffer(Handle);
  Handle := nil;
end;
{ Passes all elements of Samples to the native circular buffer. }
procedure TSherpaOnnxCircularBuffer.Push(Samples: array of Single);
var
  First: pcfloat;
begin
  First := pcfloat(Samples);
  SherpaOnnxCircularBufferPush(Handle, First, Length(Samples));
end;
{ Returns a Pascal-owned copy of N samples that the native buffer reports
  for StartIndex.

  StartIndex - position handed unchanged to the native call.
  N          - number of samples to copy.

  The result is empty when N is negative or when the native call returns
  nil. The native memory is always released, even if the copy raises. }
function TSherpaOnnxCircularBuffer.Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
var
  P: pcfloat;
  I: Integer;
begin
  Result := nil;

  { A negative count is never valid; do not forward it to native code or to
    SetLength. }
  if N < 0 then
    Exit;

  P := SherpaOnnxCircularBufferGet(Self.Handle, StartIndex, N);

  { Nothing to copy and nothing to free if the native side gave no buffer. }
  if P = nil then
    Exit;

  try
    SetLength(Result, N);
    for I := Low(Result) to High(Result) do
      Result[I] := P[I];
  finally
    SherpaOnnxCircularBufferFree(P);
  end;
end;
{ Forwards a pop request for N elements to the native circular buffer. }
procedure TSherpaOnnxCircularBuffer.Pop(N: Integer);
begin
  SherpaOnnxCircularBufferPop(Handle, N);
end;
{ Forwards a reset request to the native circular buffer. }
procedure TSherpaOnnxCircularBuffer.Reset;
begin
  SherpaOnnxCircularBufferReset(Handle);
end;
{ Returns the size value reported by the native circular buffer. }
function TSherpaOnnxCircularBuffer.Size: Integer;
var
  Reported: cint32;
begin
  Reported := SherpaOnnxCircularBufferSize(Handle);
  Result := Reported;
end;
{ Returns the head value reported by the native circular buffer. }
function TSherpaOnnxCircularBuffer.Head: Integer;
var
  Reported: cint32;
begin
  Reported := SherpaOnnxCircularBufferHead(Handle);
  Result := Reported;
end;
{ Creates the native detector.

  Config              - Pascal-side configuration; a copy is kept in _Config.
  BufferSizeInSeconds - passed unchanged to the native constructor.

  The Pascal configuration is translated field by field into the raw record
  expected by the native library: strings become PAnsiChar and the Debug
  flag becomes its ordinal value. }
constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
var
  C: SherpaOnnxVadModelConfig;
begin
  Self._Config := Config;

  Initialize(C);

  { General options. }
  C.SampleRate := Config.SampleRate;
  C.NumThreads := Config.NumThreads;
  C.Provider := PAnsiChar(Config.Provider);
  C.Debug := Ord(Config.Debug);

  { Silero VAD options. }
  C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
  C.SileroVad.Threshold := Config.SileroVad.Threshold;
  C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
  C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
  C.SileroVad.WindowSize := Config.SileroVad.WindowSize;

  Self.Handle := SherpaOnnxCreateVoiceActivityDetector(@C, BufferSizeInSeconds);
end;
{ Releases the native detector and clears Handle so that the stale pointer
  cannot be reused afterwards. }
destructor TSherpaOnnxVoiceActivityDetector.Destroy;
begin
  SherpaOnnxDestroyVoiceActivityDetector(Handle);
  Handle := nil;
end;
{ Passes all elements of Samples to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single);
var
  First: pcfloat;
begin
  First := pcfloat(Samples);
  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Handle, First, Length(Samples));
end;
{ Passes the N samples that start at index Offset of Samples to the native
  detector.

  Samples - source array.
  Offset  - zero-based index of the first sample to pass.
  N       - number of samples to pass.

  If the requested window does not lie completely inside Samples, a
  diagnostic line is written and nothing is passed on. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer);
begin
  { Negative values must be rejected as well: they would otherwise move the
    pointer in front of the array or hand a negative count to native code.
    The upper bound is written as N > Length - Offset so that the check
    itself cannot overflow the way Offset + N can. }
  if (Offset < 0) or (N < 0) or (Offset > Length(Samples)) or
    (N > Length(Samples) - Offset) then
  begin
    WriteLn(Format('Invalid arguments!. Array length: %d, Offset: %d, N: %d',
      [Length(Samples), Offset, N]
    ));
    Exit;
  end;

  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle,
    pcfloat(Samples) + Offset, N);
end;
{ True exactly when the native Empty query returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsEmpty: Boolean;
var
  Flag: cint32;
begin
  Flag := SherpaOnnxVoiceActivityDetectorEmpty(Handle);
  Result := (Flag = 1);
end;
{ True exactly when the native Detected query returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsDetected: Boolean;
var
  Flag: cint32;
begin
  Flag := SherpaOnnxVoiceActivityDetectorDetected(Handle);
  Result := (Flag = 1);
end;
{ Forwards a pop request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Pop;
begin
  SherpaOnnxVoiceActivityDetectorPop(Handle);
end;
{ Forwards a clear request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Clear;
begin
  SherpaOnnxVoiceActivityDetectorClear(Handle);
end;
{ Returns a Pascal-owned copy of the segment that the native detector
  reports as its front element.

  The native segment is always released before returning, even if the copy
  raises. If the native call returns nil, the result has Start = 0 and no
  samples instead of dereferencing the nil pointer. }
function TSherpaOnnxVoiceActivityDetector.Front: TSherpaOnnxSpeechSegment;
var
  P: PSherpaOnnxSpeechSegment;
  I: Integer;
begin
  Result.Start := 0;
  Result.Samples := nil;

  P := SherpaOnnxVoiceActivityDetectorFront(Self.Handle);
  if P = nil then
    Exit;

  try
    Result.Start := P^.Start;

    { Only read through the sample pointer when there is something to read. }
    if (P^.Samples <> nil) and (P^.N > 0) then
    begin
      SetLength(Result.Samples, P^.N);
      for I := Low(Result.Samples) to High(Result.Samples) do
        Result.Samples[I] := P^.Samples[I];
    end;
  finally
    SherpaOnnxDestroySpeechSegment(P);
  end;
end;
{ Forwards a reset request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Reset;
begin
  SherpaOnnxVoiceActivityDetectorReset(Handle);
end;
{ Forwards a flush request to the native detector. }
procedure TSherpaOnnxVoiceActivityDetector.Flush;
begin
  SherpaOnnxVoiceActivityDetectorFlush(Handle);
end;
end.
... ...