Fangjun Kuang
Committed by GitHub

Add Pascal API for ten-vad (#2388)

@@ -136,6 +136,27 @@ jobs: @@ -136,6 +136,27 @@ jobs:
136 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr 136 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
137 fi 137 fi
138 138
  139 + - name: Run Pascal test (VAD test)
  140 + shell: bash
  141 + run: |
  142 + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
  143 +
  144 + cd ./pascal-api-examples
  145 +
  146 + pushd vad
  147 + ./run-circular-buffer.sh
  148 + echo "---"
  149 +
  150 + time ./run-remove-silence-ten-vad.sh
  151 + echo "---"
  152 +
  153 + time ./run-remove-silence.sh
  154 + echo "---"
  155 +
  156 + ls -lh
  157 +
  158 + popd
  159 +
139 - name: Run Speech Enhancement test (GTCRN) 160 - name: Run Speech Enhancement test (GTCRN)
140 shell: bash 161 shell: bash
141 run: | 162 run: |
@@ -298,24 +319,6 @@ jobs: @@ -298,24 +319,6 @@ jobs:
298 319
299 popd 320 popd
300 321
301 - - name: Run Pascal test (VAD test)  
302 - shell: bash  
303 - run: |  
304 - export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH  
305 -  
306 - cd ./pascal-api-examples  
307 -  
308 - pushd vad  
309 - ./run-circular-buffer.sh  
310 - echo "---"  
311 -  
312 - time ./run-remove-silence.sh  
313 - echo "---"  
314 -  
315 - ls -lh  
316 -  
317 - popd  
318 -  
319 - name: Run Pascal test (Read wav test) 322 - name: Run Pascal test (Read wav test)
320 shell: bash 323 shell: bash
321 run: | 324 run: |
1 !run*.sh 1 !run*.sh
2 circular_buffer 2 circular_buffer
3 remove_silence 3 remove_silence
  4 +remove_silence_ten_vad
1 { Copyright (c) 2024 Xiaomi Corporation } 1 { Copyright (c) 2024 Xiaomi Corporation }
2 { 2 {
3 This file shows how to use the VAD API from sherpa-onnx 3 This file shows how to use the VAD API from sherpa-onnx
4 -to remove silences from a wave file. 4 +to remove silences from a wave file with silero-vad.
5 } 5 }
6 program main; 6 program main;
7 7
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +{
  3 +This file shows how to use the VAD API from sherpa-onnx
  4 +to remove silences from a wave file with ten-vad.
  5 +}
  6 +program main;
  7 +
  8 +{$mode delphi}
  9 +
  10 +uses
  11 + sherpa_onnx,
  12 + SysUtils;
  13 +
  14 +var
  15 + Wave: TSherpaOnnxWave;
  16 +
  17 + Config: TSherpaOnnxVadModelConfig;
  18 + Vad: TSherpaOnnxVoiceActivityDetector;
  19 + Offset: Integer;
  20 + WindowSize: Integer;
  21 + SpeechSegment: TSherpaOnnxSpeechSegment;
  22 +
  23 + Start: Single;
  24 + Duration: Single;
  25 + SampleRate: Integer;
  26 +
  27 + AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  28 + AllSamples: array of Single;
  29 + N: Integer;
  30 + I: Integer;
  31 +begin
  32 + SampleRate := 16000; {The input wave must have this sample rate (checked right below). Please don't change it unless you know the details}
  33 +
  34 + Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  35 + if Wave.SampleRate <> SampleRate then
  36 + begin
  37 + WriteLn(Format('Expected sample rate: %d. Given: %d',
  38 + [SampleRate, Wave.SampleRate]));
  39 +
  40 + Exit;
  41 + end;
  42 +
  43 + WindowSize := 256; {Number of samples fed to the VAD per call; also used as Config.TenVad.WindowSize. Please don't change it unless you know the details}
  44 + Initialize(Config);
  45 +
  46 + Config.TenVad.Model := './ten-vad.onnx';
  47 + Config.TenVad.MinSpeechDuration := 0.25;
  48 + Config.TenVad.MinSilenceDuration := 0.5;
  49 + Config.TenVad.Threshold := 0.25;
  50 + Config.TenVad.WindowSize := WindowSize;
  51 + Config.NumThreads:= 1;
  52 + Config.Debug:= True;
  53 + Config.Provider:= 'cpu';
  54 + Config.SampleRate := SampleRate;
  55 +
  56 + Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
  57 +
  58 + AllSpeechSegment := nil;
  59 + AllSamples := nil;
  60 + Offset := 0;
  61 + while Offset + WindowSize <= Length(Wave.Samples) do
  62 + begin
  63 + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
  64 + Inc(Offset, WindowSize);
  65 +
  66 + while not Vad.IsEmpty do
  67 + begin
  68 + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
  69 +
  70 + SpeechSegment := Vad.Front();
  71 + Vad.Pop();
  72 + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment;
  73 +
  74 + Start := SpeechSegment.Start / SampleRate;
  75 + Duration := Length(SpeechSegment.Samples) / SampleRate;
  76 + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  77 + end;
  78 + end;
  79 +
  80 + Vad.Flush;
  81 +
  82 + while not Vad.IsEmpty do
  83 + begin
  84 + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
  85 +
  86 + SpeechSegment := Vad.Front();
  87 + Vad.Pop();
  88 + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment;
  89 +
  90 + Start := SpeechSegment.Start / SampleRate;
  91 + Duration := Length(SpeechSegment.Samples) / SampleRate;
  92 + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  93 + end;
  94 +
  95 + N := 0;
  96 + for SpeechSegment in AllSpeechSegment do
  97 + Inc(N, Length(SpeechSegment.Samples));
  98 +
  99 + SetLength(AllSamples, N);
  100 +
  101 + N := 0;
  102 + for SpeechSegment in AllSpeechSegment do
  103 + begin
  104 + for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
  105 + begin
  106 + AllSamples[N] := SpeechSegment.Samples[I];
  107 + Inc(N);
  108 + end;
  109 + end;
  110 +
  111 + SherpaOnnxWriteWave('./lei-jun-test-no-silence-ten-vad.wav', AllSamples, SampleRate);
  112 + WriteLn('Saved to ./lei-jun-test-no-silence-ten-vad.wav');
  113 +
  114 + FreeAndNil(Vad);
  115 +end.
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [[ ! -f ./ten-vad.onnx ]]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  28 +fi
  29 +
  30 +if [ ! -f ./lei-jun-test.wav ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  32 +fi
  33 +
  34 +fpc \
  35 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  36 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  37 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  38 + ./remove_silence_ten_vad.pas
  39 +
  40 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  41 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  42 +
  43 +./remove_silence_ten_vad
@@ -426,12 +426,24 @@ type @@ -426,12 +426,24 @@ type
426 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); 426 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
427 end; 427 end;
428 428
  429 + TSherpaOnnxTenVadModelConfig = record
  430 + Model: AnsiString;
  431 + Threshold: Single;
  432 + MinSilenceDuration: Single;
  433 + MinSpeechDuration: Single;
  434 + WindowSize: Integer;
  435 + MaxSpeechDuration: Single;
  436 + function ToString: AnsiString;
  437 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);
  438 + end;
  439 +
429 TSherpaOnnxVadModelConfig = record 440 TSherpaOnnxVadModelConfig = record
430 SileroVad: TSherpaOnnxSileroVadModelConfig; 441 SileroVad: TSherpaOnnxSileroVadModelConfig;
431 SampleRate: Integer; 442 SampleRate: Integer;
432 NumThreads: Integer; 443 NumThreads: Integer;
433 Provider: AnsiString; 444 Provider: AnsiString;
434 Debug: Boolean; 445 Debug: Boolean;
  446 + TenVad: TSherpaOnnxTenVadModelConfig;
435 function ToString: AnsiString; 447 function ToString: AnsiString;
436 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig); 448 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
437 end; 449 end;
@@ -829,12 +841,23 @@ type @@ -829,12 +841,23 @@ type
829 WindowSize: cint32; 841 WindowSize: cint32;
830 MaxSpeechDuration: cfloat; 842 MaxSpeechDuration: cfloat;
831 end; 843 end;
  844 +
  845 + SherpaOnnxTenVadModelConfig = record
  846 + Model: PAnsiChar;
  847 + Threshold: cfloat;
  848 + MinSilenceDuration: cfloat;
  849 + MinSpeechDuration: cfloat;
  850 + WindowSize: cint32;
  851 + MaxSpeechDuration: cfloat;
  852 + end;
  853 +
832 SherpaOnnxVadModelConfig = record 854 SherpaOnnxVadModelConfig = record
833 SileroVad: SherpaOnnxSileroVadModelConfig; 855 SileroVad: SherpaOnnxSileroVadModelConfig;
834 SampleRate: cint32; 856 SampleRate: cint32;
835 NumThreads: cint32; 857 NumThreads: cint32;
836 Provider: PAnsiChar; 858 Provider: PAnsiChar;
837 Debug: cint32; 859 Debug: cint32;
  860 + TenVad: SherpaOnnxTenVadModelConfig;
838 end; 861 end;
839 PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig; 862 PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
840 863
@@ -1907,6 +1930,21 @@ begin @@ -1907,6 +1930,21 @@ begin
1907 ]); 1930 ]);
1908 end; 1931 end;
1909 1932
  1933 +function TSherpaOnnxTenVadModelConfig.ToString: AnsiString;
  1934 +begin
  1935 + Result := Format('TSherpaOnnxTenVadModelConfig(' +
  1936 + 'Model := %s, ' +
  1937 + 'Threshold := %.2f, ' +
  1938 + 'MinSilenceDuration := %.2f, ' +
  1939 + 'MinSpeechDuration := %.2f, ' +
  1940 + 'WindowSize := %d, ' +
  1941 + 'MaxSpeechDuration := %.2f' +
  1942 + ')',
  1943 + [Self.Model, Self.Threshold, Self.MinSilenceDuration,
  1944 + Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
  1945 + ]);
  1946 +end;
  1947 +
1910 class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); 1948 class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
1911 begin 1949 begin
1912 Dest.Threshold := 0.5; 1950 Dest.Threshold := 0.5;
@@ -1916,6 +1954,15 @@ begin @@ -1916,6 +1954,15 @@ begin
1916 Dest.MaxSpeechDuration := 5.0; 1954 Dest.MaxSpeechDuration := 5.0;
1917 end; 1955 end;
1918 1956
  1957 +class operator TSherpaOnnxTenVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig);
  1958 +begin
  1959 + Dest.Threshold := 0.5;
  1960 + Dest.MinSilenceDuration := 0.5;
  1961 + Dest.MinSpeechDuration := 0.25;
  1962 + Dest.WindowSize := 256;
  1963 + Dest.MaxSpeechDuration := 5.0;
  1964 +end;
  1965 +
1919 function TSherpaOnnxVadModelConfig.ToString: AnsiString; 1966 function TSherpaOnnxVadModelConfig.ToString: AnsiString;
1920 begin 1967 begin
1921 Result := Format('TSherpaOnnxVadModelConfig(' + 1968 Result := Format('TSherpaOnnxVadModelConfig(' +
@@ -1923,10 +1970,11 @@ begin @@ -1923,10 +1970,11 @@ begin
1923 'SampleRate := %d, ' + 1970 'SampleRate := %d, ' +
1924 'NumThreads := %d, ' + 1971 'NumThreads := %d, ' +
1925 'Provider := %s, ' + 1972 'Provider := %s, ' +
1926 - 'Debug := %s' + 1973 + 'Debug := %s, ' +
  1974 + 'TenVad := %s' +
1927 ')', 1975 ')',
1928 [Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider, 1976 [Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider,
1929 - Self.Debug.ToString 1977 + Self.Debug.ToString, Self.TenVad.ToString
1930 ]); 1978 ]);
1931 end; 1979 end;
1932 1980
@@ -2077,6 +2125,13 @@ begin @@ -2077,6 +2125,13 @@ begin
2077 C.SileroVad.WindowSize := Config.SileroVad.WindowSize; 2125 C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
2078 C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration; 2126 C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
2079 2127
  2128 + C.TenVad.Model := PAnsiChar(Config.TenVad.Model);
  2129 + C.TenVad.Threshold := Config.TenVad.Threshold;
  2130 + C.TenVad.MinSilenceDuration := Config.TenVad.MinSilenceDuration;
  2131 + C.TenVad.MinSpeechDuration := Config.TenVad.MinSpeechDuration;
  2132 + C.TenVad.WindowSize := Config.TenVad.WindowSize;
  2133 + C.TenVad.MaxSpeechDuration := Config.TenVad.MaxSpeechDuration;
  2134 +
2080 C.SampleRate := Config.SampleRate; 2135 C.SampleRate := Config.SampleRate;
2081 C.NumThreads := Config.NumThreads; 2136 C.NumThreads := Config.NumThreads;
2082 C.Provider := PAnsiChar(Config.Provider); 2137 C.Provider := PAnsiChar(Config.Provider);