Fangjun Kuang
Committed by GitHub

Add Pascal API for ten-vad (#2388)

... ... @@ -136,6 +136,27 @@ jobs:
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
fi
# Runs the three example scripts in pascal-api-examples/vad: the circular-buffer
# demo, then the ten-vad and the plain remove-silence demos (both timed), and
# finally lists the directory contents. The Free Pascal 3.2.2 win64 bin
# directory is prepended to PATH first.
- name: Run Pascal test (VAD test)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd vad
./run-circular-buffer.sh
echo "---"
time ./run-remove-silence-ten-vad.sh
echo "---"
time ./run-remove-silence.sh
echo "---"
ls -lh
popd
- name: Run Speech Enhancement test (GTCRN)
shell: bash
run: |
... ... @@ -298,24 +319,6 @@ jobs:
popd
- name: Run Pascal test (VAD test)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd vad
./run-circular-buffer.sh
echo "---"
time ./run-remove-silence.sh
echo "---"
ls -lh
popd
- name: Run Pascal test (Read wav test)
shell: bash
run: |
... ...
!run*.sh
circular_buffer
remove_silence
remove_silence_ten_vad
... ...
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use the VAD API from sherpa-onnx
to remove silences from a wave file with silero-vad.
}
program main;
... ...
{ Copyright (c) 2025 Xiaomi Corporation }
{
This file shows how to use the VAD API from sherpa-onnx
to remove silences from a wave file with ten-vad.

It reads ./lei-jun-test.wav, feeds it to the detector one window at a time,
prints the time span of every detected speech segment, and writes the
concatenated speech samples to ./lei-jun-test-no-silence-ten-vad.wav.
}
program main;

{$mode delphi}

uses
  sherpa_onnx,
  SysUtils;

var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxVadModelConfig;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;
  SampleRate: Integer;
  AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  AllSamples: array of Single;
  N: Integer;
  I: Integer;

{ Pops every segment currently queued in Vad, appends it to AllSpeechSegment
  and prints its start and end time (sample index divided by SampleRate).
  Used both while streaming and after the final Flush. }
procedure CollectPendingSegments;
var
  Segment: TSherpaOnnxSpeechSegment;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);

    Segment := Vad.Front();
    Vad.Pop();
    AllSpeechSegment[High(AllSpeechSegment)] := Segment;

    Start := Segment.Start / SampleRate;
    Duration := Length(Segment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  end;
end;

begin
  SampleRate := 16000; {Please don't change it unless you know the details}

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [SampleRate, Wave.SampleRate]));
    Exit;
  end;

  WindowSize := 256; {Please don't change it unless you know the details}

  Initialize(Config);
  Config.TenVad.Model := './ten-vad.onnx';
  Config.TenVad.MinSpeechDuration := 0.25;
  Config.TenVad.MinSilenceDuration := 0.5;
  Config.TenVad.Threshold := 0.25;
  Config.TenVad.WindowSize := WindowSize;
  Config.NumThreads := 1;
  Config.Debug := True;
  Config.Provider := 'cpu';
  Config.SampleRate := SampleRate;

  { NOTE(review): the second argument is presumably the detector's internal
    buffer size in seconds -- confirm against the sherpa_onnx unit. }
  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
  try
    AllSpeechSegment := nil;
    AllSamples := nil;

    { Feed the wave in whole windows only; a trailing partial window
      (fewer than WindowSize samples) is not passed to the detector. }
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Inc(Offset, WindowSize);
      CollectPendingSegments;
    end;

    { Flush, then collect whatever segments it leaves in the queue. }
    Vad.Flush;
    CollectPendingSegments;

    { First pass: total number of speech samples, to size the output once. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
      Inc(N, Length(SpeechSegment.Samples));

    SetLength(AllSamples, N);

    { Second pass: concatenate the segments in detection order. }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
    begin
      for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
      begin
        AllSamples[N] := SpeechSegment.Samples[I];
        Inc(N);
      end;
    end;

    SherpaOnnxWriteWave('./lei-jun-test-no-silence-ten-vad.wav', AllSamples, SampleRate);
    WriteLn('Saved to ./lei-jun-test-no-silence-ten-vad.wav');
  finally
    { Release the detector even if processing or writing the file raised. }
    FreeAndNil(Vad);
  end;
end.
... ...
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the ten-vad model and a test wave file if missing, compiles
# remove_silence_ten_vad.pas with fpc, and runs the resulting program.
#
# Every path is resolved from this script's own directory, so the script can be
# started from any working directory.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The Pascal source, the model and the wave file are all referenced with
# ./-relative paths (here and inside the compiled program), so run from the
# script directory regardless of where the caller invoked us.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build only when none of the platform-specific shared libraries exists.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# -f makes curl fail on an HTTP error instead of saving the error page under
# the target name, which the existence checks would then accept on a re-run.
if [[ ! -f ./ten-vad.onnx ]]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
fi

if [[ ! -f ./lei-jun-test.wav ]]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$LIB_DIR" \
  ./remove_silence_ten_vad.pas

# Let the dynamic loader find the freshly built library (Linux / macOS).
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./remove_silence_ten_vad
... ...
... ... @@ -426,12 +426,24 @@ type
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
end;
{ Pascal-side settings for the ten-vad voice activity detection model.
  The Initialize operator assigns Threshold = 0.5, MinSilenceDuration = 0.5,
  MinSpeechDuration = 0.25, WindowSize = 256 and MaxSpeechDuration = 5.0;
  it does not assign Model. }
TSherpaOnnxTenVadModelConfig = record
{ Path of the model file, e.g. './ten-vad.onnx'. }
Model: AnsiString;
{ NOTE(review): presumably the speech-probability cut-off -- confirm. }
Threshold: Single;
{ NOTE(review): the three duration fields are presumably in seconds -- confirm. }
MinSilenceDuration: Single;
MinSpeechDuration: Single;
{ NOTE(review): presumably the number of samples per call to the model -- confirm. }
WindowSize: Integer;
MaxSpeechDuration: Single;
{ Returns all fields formatted as one line of text. }
function ToString: AnsiString;
{ Fills Dest with the default values listed above. }
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);
end;
{ Pascal-side configuration of a voice activity detector. It carries one
  settings record per supported model (SileroVad, TenVad) plus the settings
  shared by both.
  NOTE(review): which model is used when both Model paths are set cannot be
  determined from this declaration -- confirm in the underlying library. }
TSherpaOnnxVadModelConfig = record
{ Settings for the silero-vad model. }
SileroVad: TSherpaOnnxSileroVadModelConfig;
{ Sample rate of the input audio; the ten-vad example uses 16000. }
SampleRate: Integer;
NumThreads: Integer;
{ Execution provider name; the ten-vad example uses 'cpu'. }
Provider: AnsiString;
Debug: Boolean;
{ Settings for the ten-vad model. }
TenVad: TSherpaOnnxTenVadModelConfig;
{ Returns all fields, including both nested model records, as text. }
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
end;
... ... @@ -829,12 +841,23 @@ type
WindowSize: cint32;
MaxSpeechDuration: cfloat;
end;
{ C-typed counterpart of TSherpaOnnxTenVadModelConfig. Its fields are filled
  one by one from the Pascal record when the detector configuration is
  converted.
  NOTE(review): the field order and types presumably have to match the
  corresponding struct in the sherpa-onnx C API header -- confirm before
  reordering or inserting fields. }
SherpaOnnxTenVadModelConfig = record
{ Set by casting the Pascal AnsiString to PAnsiChar; presumably the source
  string must stay alive while the C side reads it -- confirm. }
Model: PAnsiChar;
Threshold: cfloat;
MinSilenceDuration: cfloat;
MinSpeechDuration: cfloat;
WindowSize: cint32;
MaxSpeechDuration: cfloat;
end;
{ C-typed counterpart of TSherpaOnnxVadModelConfig.
  NOTE(review): the field order (TenVad last, after Debug) presumably mirrors
  the C API struct layout -- confirm against the C header before changing it. }
SherpaOnnxVadModelConfig = record
SileroVad: SherpaOnnxSileroVadModelConfig;
SampleRate: cint32;
NumThreads: cint32;
{ Set by casting the Pascal AnsiString to PAnsiChar. }
Provider: PAnsiChar;
{ Integer here, Boolean in the Pascal-side record. }
Debug: cint32;
TenVad: SherpaOnnxTenVadModelConfig;
end;
PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
... ... @@ -1907,6 +1930,21 @@ begin
]);
end;
{ Renders every field of the record as a single line of the form
  TSherpaOnnxTenVadModelConfig(Name := value, ...); the Single fields are
  printed with two decimals. }
function TSherpaOnnxTenVadModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxTenVadModelConfig(' +
    'Model := %s, ' +
    'Threshold := %.2f, ' +
    'MinSilenceDuration := %.2f, ' +
    'MinSpeechDuration := %.2f, ' +
    'WindowSize := %d, ' +
    'MaxSpeechDuration := %.2f' +
    ')';
begin
  { Argument order must follow the placeholders in Layout. }
  Result := Format(Layout, [
    Model,
    Threshold,
    MinSilenceDuration,
    MinSpeechDuration,
    WindowSize,
    MaxSpeechDuration
  ]);
end;
class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
begin
Dest.Threshold := 0.5;
... ... @@ -1916,6 +1954,15 @@ begin
Dest.MaxSpeechDuration := 5.0;
end;
{ Management operator: gives a new TSherpaOnnxTenVadModelConfig its default
  numeric settings. Model is not assigned here. }
class operator TSherpaOnnxTenVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);
begin
  with Dest do
  begin
    Threshold := 0.5;
    MinSilenceDuration := 0.5;
    MinSpeechDuration := 0.25;
    WindowSize := 256;
    MaxSpeechDuration := 5.0;
  end;
end;
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
begin
Result := Format('TSherpaOnnxVadModelConfig(' +
... ... @@ -1923,10 +1970,11 @@ begin
'SampleRate := %d, ' +
'NumThreads := %d, ' +
'Provider := %s, ' +
'Debug := %s' +
'Debug := %s, ' +
'TenVad := %s' +
')',
[Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider,
Self.Debug.ToString
Self.Debug.ToString, Self.TenVad.ToString
]);
end;
... ... @@ -2077,6 +2125,13 @@ begin
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
C.TenVad.Model := PAnsiChar(Config.TenVad.Model);
C.TenVad.Threshold := Config.TenVad.Threshold;
C.TenVad.MinSilenceDuration := Config.TenVad.MinSilenceDuration;
C.TenVad.MinSpeechDuration := Config.TenVad.MinSpeechDuration;
C.TenVad.WindowSize := Config.TenVad.WindowSize;
C.TenVad.MaxSpeechDuration := Config.TenVad.MaxSpeechDuration;
C.SampleRate := Config.SampleRate;
C.NumThreads := Config.NumThreads;
C.Provider := PAnsiChar(Config.Provider);
... ...