正在显示 6 个修改的文件，包含 238 行增加和 21 行删除
| @@ -136,6 +136,27 @@ jobs: | @@ -136,6 +136,27 @@ jobs: | ||
| 136 | cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr | 136 | cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr |
| 137 | fi | 137 | fi |
| 138 | 138 | ||
| 139 | + - name: Run Pascal test (VAD test) | ||
| 140 | + shell: bash | ||
| 141 | + run: | | ||
| 142 | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH | ||
| 143 | + | ||
| 144 | + cd ./pascal-api-examples | ||
| 145 | + | ||
| 146 | + pushd vad | ||
| 147 | + ./run-circular-buffer.sh | ||
| 148 | + echo "---" | ||
| 149 | + | ||
| 150 | + time ./run-remove-silence-ten-vad.sh | ||
| 151 | + echo "---" | ||
| 152 | + | ||
| 153 | + time ./run-remove-silence.sh | ||
| 154 | + echo "---" | ||
| 155 | + | ||
| 156 | + ls -lh | ||
| 157 | + | ||
| 158 | + popd | ||
| 159 | + | ||
| 139 | - name: Run Speech Enhancement test (GTCRN) | 160 | - name: Run Speech Enhancement test (GTCRN) |
| 140 | shell: bash | 161 | shell: bash |
| 141 | run: | | 162 | run: | |
| @@ -298,24 +319,6 @@ jobs: | @@ -298,24 +319,6 @@ jobs: | ||
| 298 | 319 | ||
| 299 | popd | 320 | popd |
| 300 | 321 | ||
| 301 | - - name: Run Pascal test (VAD test) | ||
| 302 | - shell: bash | ||
| 303 | - run: | | ||
| 304 | - export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH | ||
| 305 | - | ||
| 306 | - cd ./pascal-api-examples | ||
| 307 | - | ||
| 308 | - pushd vad | ||
| 309 | - ./run-circular-buffer.sh | ||
| 310 | - echo "---" | ||
| 311 | - | ||
| 312 | - time ./run-remove-silence.sh | ||
| 313 | - echo "---" | ||
| 314 | - | ||
| 315 | - ls -lh | ||
| 316 | - | ||
| 317 | - popd | ||
| 318 | - | ||
| 319 | - name: Run Pascal test (Read wav test) | 322 | - name: Run Pascal test (Read wav test) |
| 320 | shell: bash | 323 | shell: bash |
| 321 | run: | | 324 | run: | |
| 1 | { Copyright (c) 2024 Xiaomi Corporation } | 1 | { Copyright (c) 2024 Xiaomi Corporation } |
| 2 | { | 2 | { |
| 3 | This file shows how to use the VAD API from sherpa-onnx | 3 | This file shows how to use the VAD API from sherpa-onnx |
| 4 | -to remove silences from a wave file. | 4 | +to remove silences from a wave file with silero-vad. |
| 5 | } | 5 | } |
| 6 | program main; | 6 | program main; |
| 7 | 7 |
| 1 | +{ Copyright (c) 2025 Xiaomi Corporation } | ||
| 2 | +{ | ||
| 3 | +This file shows how to use the VAD API from sherpa-onnx | ||
| 4 | +to remove silences from a wave file with ten-vad. | ||
| 5 | +} | ||
| 6 | +program main; | ||
| 7 | + | ||
| 8 | +{$mode delphi} | ||
| 9 | + | ||
| 10 | +uses | ||
| 11 | + sherpa_onnx, | ||
| 12 | + SysUtils; | ||
| 13 | + | ||
| 14 | +var | ||
| 15 | + Wave: TSherpaOnnxWave; | ||
| 16 | + | ||
| 17 | + Config: TSherpaOnnxVadModelConfig; | ||
| 18 | + Vad: TSherpaOnnxVoiceActivityDetector; | ||
| 19 | + Offset: Integer; | ||
| 20 | + WindowSize: Integer; | ||
| 21 | + SpeechSegment: TSherpaOnnxSpeechSegment; | ||
| 22 | + | ||
| 23 | + Start: Single; | ||
| 24 | + Duration: Single; | ||
| 25 | + SampleRate: Integer; | ||
| 26 | + | ||
| 27 | + AllSpeechSegment: array of TSherpaOnnxSpeechSegment; | ||
| 28 | + AllSamples: array of Single; | ||
| 29 | + N: Integer; | ||
| 30 | + I: Integer; | ||
| 31 | +begin | ||
| 32 | + SampleRate := 16000; {Please don't change it unless you know the details} | ||
| 33 | + | ||
| 34 | + Wave := SherpaOnnxReadWave('./lei-jun-test.wav'); | ||
| 35 | + if Wave.SampleRate <> SampleRate then | ||
| 36 | + begin | ||
| 37 | + WriteLn(Format('Expected sample rate: %d. Given: %d', | ||
| 38 | + [SampleRate, Wave.SampleRate])); | ||
| 39 | + | ||
| 40 | + Exit; | ||
| 41 | + end; | ||
| 42 | + | ||
| 43 | + WindowSize := 256; {Please don't change it unless you know the details} | ||
| 44 | + Initialize(Config); | ||
| 45 | + | ||
| 46 | + Config.TenVad.Model := './ten-vad.onnx'; | ||
| 47 | + Config.TenVad.MinSpeechDuration := 0.25; | ||
| 48 | + Config.TenVad.MinSilenceDuration := 0.5; | ||
| 49 | + Config.TenVad.Threshold := 0.25; | ||
| 50 | + Config.TenVad.WindowSize := WindowSize; | ||
| 51 | + Config.NumThreads:= 1; | ||
| 52 | + Config.Debug:= True; | ||
| 53 | + Config.Provider:= 'cpu'; | ||
| 54 | + Config.SampleRate := SampleRate; | ||
| 55 | + | ||
| 56 | + Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20); | ||
| 57 | + | ||
| 58 | + AllSpeechSegment := nil; | ||
| 59 | + AllSamples := nil; | ||
| 60 | + Offset := 0; | ||
| 61 | + while Offset + WindowSize <= Length(Wave.Samples) do | ||
| 62 | + begin | ||
| 63 | + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize); | ||
| 64 | + Inc(Offset, WindowSize); | ||
| 65 | + | ||
| 66 | + while not Vad.IsEmpty do | ||
| 67 | + begin | ||
| 68 | + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1); | ||
| 69 | + | ||
| 70 | + SpeechSegment := Vad.Front(); | ||
| 71 | + Vad.Pop(); | ||
| 72 | + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment; | ||
| 73 | + | ||
| 74 | + Start := SpeechSegment.Start / SampleRate; | ||
| 75 | + Duration := Length(SpeechSegment.Samples) / SampleRate; | ||
| 76 | + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration])); | ||
| 77 | + end; | ||
| 78 | + end; | ||
| 79 | + | ||
| 80 | + Vad.Flush; | ||
| 81 | + | ||
| 82 | + while not Vad.IsEmpty do | ||
| 83 | + begin | ||
| 84 | + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1); | ||
| 85 | + | ||
| 86 | + SpeechSegment := Vad.Front(); | ||
| 87 | + Vad.Pop(); | ||
| 88 | + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment; | ||
| 89 | + | ||
| 90 | + Start := SpeechSegment.Start / SampleRate; | ||
| 91 | + Duration := Length(SpeechSegment.Samples) / SampleRate; | ||
| 92 | + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration])); | ||
| 93 | + end; | ||
| 94 | + | ||
| 95 | + N := 0; | ||
| 96 | + for SpeechSegment in AllSpeechSegment do | ||
| 97 | + Inc(N, Length(SpeechSegment.Samples)); | ||
| 98 | + | ||
| 99 | + SetLength(AllSamples, N); | ||
| 100 | + | ||
| 101 | + N := 0; | ||
| 102 | + for SpeechSegment in AllSpeechSegment do | ||
| 103 | + begin | ||
| 104 | + for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do | ||
| 105 | + begin | ||
| 106 | + AllSamples[N] := SpeechSegment.Samples[I]; | ||
| 107 | + Inc(N); | ||
| 108 | + end; | ||
| 109 | + end; | ||
| 110 | + | ||
| 111 | + SherpaOnnxWriteWave('./lei-jun-test-no-silence-ten-vad.wav', AllSamples, SampleRate); | ||
| 112 | + WriteLn('Saved to ./lei-jun-test-no-silence-ten-vad.wav'); | ||
| 113 | + | ||
| 114 | + FreeAndNil(Vad); | ||
| 115 | +end. |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) | ||
| 6 | +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) | ||
| 7 | + | ||
| 8 | +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" | ||
| 9 | + | ||
| 10 | +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then | ||
| 11 | + mkdir -p ../../build | ||
| 12 | + pushd ../../build | ||
| 13 | + cmake \ | ||
| 14 | + -DCMAKE_INSTALL_PREFIX=./install \ | ||
| 15 | + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ | ||
| 16 | + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ | ||
| 17 | + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ | ||
| 18 | + -DBUILD_SHARED_LIBS=ON \ | ||
| 19 | + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ | ||
| 20 | + .. | ||
| 21 | + | ||
| 22 | + cmake --build . --target install --config Release | ||
| 23 | + popd | ||
| 24 | +fi | ||
| 25 | + | ||
| 26 | +if [[ ! -f ./ten-vad.onnx ]]; then | ||
| 27 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 28 | +fi | ||
| 29 | + | ||
| 30 | +if [ ! -f ./lei-jun-test.wav ]; then | ||
| 31 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav | ||
| 32 | +fi | ||
| 33 | + | ||
| 34 | +fpc \ | ||
| 35 | + -dSHERPA_ONNX_USE_SHARED_LIBS \ | ||
| 36 | + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ | ||
| 37 | + -Fl$SHERPA_ONNX_DIR/build/install/lib \ | ||
| 38 | + ./remove_silence_ten_vad.pas | ||
| 39 | + | ||
| 40 | +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH | ||
| 41 | +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH | ||
| 42 | + | ||
| 43 | +./remove_silence_ten_vad |
| @@ -426,12 +426,24 @@ type | @@ -426,12 +426,24 @@ type | ||
| 426 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); | 426 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); |
| 427 | end; | 427 | end; |
| 428 | 428 | ||
| 429 | + TSherpaOnnxTenVadModelConfig = record | ||
| 430 | + Model: AnsiString; | ||
| 431 | + Threshold: Single; | ||
| 432 | + MinSilenceDuration: Single; | ||
| 433 | + MinSpeechDuration: Single; | ||
| 434 | + WindowSize: Integer; | ||
| 435 | + MaxSpeechDuration: Single; | ||
| 436 | + function ToString: AnsiString; | ||
| 437 | + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig); | ||
| 438 | + end; | ||
| 439 | + | ||
| 429 | TSherpaOnnxVadModelConfig = record | 440 | TSherpaOnnxVadModelConfig = record |
| 430 | SileroVad: TSherpaOnnxSileroVadModelConfig; | 441 | SileroVad: TSherpaOnnxSileroVadModelConfig; |
| 431 | SampleRate: Integer; | 442 | SampleRate: Integer; |
| 432 | NumThreads: Integer; | 443 | NumThreads: Integer; |
| 433 | Provider: AnsiString; | 444 | Provider: AnsiString; |
| 434 | Debug: Boolean; | 445 | Debug: Boolean; |
| 446 | + TenVad: TSherpaOnnxTenVadModelConfig; | ||
| 435 | function ToString: AnsiString; | 447 | function ToString: AnsiString; |
| 436 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig); | 448 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig); |
| 437 | end; | 449 | end; |
| @@ -829,12 +841,23 @@ type | @@ -829,12 +841,23 @@ type | ||
| 829 | WindowSize: cint32; | 841 | WindowSize: cint32; |
| 830 | MaxSpeechDuration: cfloat; | 842 | MaxSpeechDuration: cfloat; |
| 831 | end; | 843 | end; |
| 844 | + | ||
| 845 | + SherpaOnnxTenVadModelConfig = record | ||
| 846 | + Model: PAnsiChar; | ||
| 847 | + Threshold: cfloat; | ||
| 848 | + MinSilenceDuration: cfloat; | ||
| 849 | + MinSpeechDuration: cfloat; | ||
| 850 | + WindowSize: cint32; | ||
| 851 | + MaxSpeechDuration: cfloat; | ||
| 852 | + end; | ||
| 853 | + | ||
| 832 | SherpaOnnxVadModelConfig = record | 854 | SherpaOnnxVadModelConfig = record |
| 833 | SileroVad: SherpaOnnxSileroVadModelConfig; | 855 | SileroVad: SherpaOnnxSileroVadModelConfig; |
| 834 | SampleRate: cint32; | 856 | SampleRate: cint32; |
| 835 | NumThreads: cint32; | 857 | NumThreads: cint32; |
| 836 | Provider: PAnsiChar; | 858 | Provider: PAnsiChar; |
| 837 | Debug: cint32; | 859 | Debug: cint32; |
| 860 | + TenVad: SherpaOnnxTenVadModelConfig; | ||
| 838 | end; | 861 | end; |
| 839 | PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig; | 862 | PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig; |
| 840 | 863 | ||
| @@ -1907,6 +1930,21 @@ begin | @@ -1907,6 +1930,21 @@ begin | ||
| 1907 | ]); | 1930 | ]); |
| 1908 | end; | 1931 | end; |
| 1909 | 1932 | ||
| 1933 | +function TSherpaOnnxTenVadModelConfig.ToString: AnsiString; | ||
| 1934 | +begin | ||
| 1935 | + Result := Format('TSherpaOnnxTenVadModelConfig(' + | ||
| 1936 | + 'Model := %s, ' + | ||
| 1937 | + 'Threshold := %.2f, ' + | ||
| 1938 | + 'MinSilenceDuration := %.2f, ' + | ||
| 1939 | + 'MinSpeechDuration := %.2f, ' + | ||
| 1940 | + 'WindowSize := %d, ' + | ||
| 1941 | + 'MaxSpeechDuration := %.2f' + | ||
| 1942 | + ')', | ||
| 1943 | + [Self.Model, Self.Threshold, Self.MinSilenceDuration, | ||
| 1944 | + Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration | ||
| 1945 | + ]); | ||
| 1946 | +end; | ||
| 1947 | + | ||
| 1910 | class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); | 1948 | class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); |
| 1911 | begin | 1949 | begin |
| 1912 | Dest.Threshold := 0.5; | 1950 | Dest.Threshold := 0.5; |
| @@ -1916,6 +1954,15 @@ begin | @@ -1916,6 +1954,15 @@ begin | ||
| 1916 | Dest.MaxSpeechDuration := 5.0; | 1954 | Dest.MaxSpeechDuration := 5.0; |
| 1917 | end; | 1955 | end; |
| 1918 | 1956 | ||
| 1957 | +class operator TSherpaOnnxTenVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxTenVadModelConfig); | ||
| 1958 | +begin | ||
| 1959 | + Dest.Threshold := 0.5; | ||
| 1960 | + Dest.MinSilenceDuration := 0.5; | ||
| 1961 | + Dest.MinSpeechDuration := 0.25; | ||
| 1962 | + Dest.WindowSize := 256; | ||
| 1963 | + Dest.MaxSpeechDuration := 5.0; | ||
| 1964 | +end; | ||
| 1965 | + | ||
| 1919 | function TSherpaOnnxVadModelConfig.ToString: AnsiString; | 1966 | function TSherpaOnnxVadModelConfig.ToString: AnsiString; |
| 1920 | begin | 1967 | begin |
| 1921 | Result := Format('TSherpaOnnxVadModelConfig(' + | 1968 | Result := Format('TSherpaOnnxVadModelConfig(' + |
| @@ -1923,10 +1970,11 @@ begin | @@ -1923,10 +1970,11 @@ begin | ||
| 1923 | 'SampleRate := %d, ' + | 1970 | 'SampleRate := %d, ' + |
| 1924 | 'NumThreads := %d, ' + | 1971 | 'NumThreads := %d, ' + |
| 1925 | 'Provider := %s, ' + | 1972 | 'Provider := %s, ' + |
| 1926 | - 'Debug := %s' + | 1973 | + 'Debug := %s, ' + |
| 1974 | + 'TenVad := %s' + | ||
| 1927 | ')', | 1975 | ')', |
| 1928 | [Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider, | 1976 | [Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider, |
| 1929 | - Self.Debug.ToString | 1977 | + Self.Debug.ToString, Self.TenVad.ToString |
| 1930 | ]); | 1978 | ]); |
| 1931 | end; | 1979 | end; |
| 1932 | 1980 | ||
| @@ -2077,6 +2125,13 @@ begin | @@ -2077,6 +2125,13 @@ begin | ||
| 2077 | C.SileroVad.WindowSize := Config.SileroVad.WindowSize; | 2125 | C.SileroVad.WindowSize := Config.SileroVad.WindowSize; |
| 2078 | C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration; | 2126 | C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration; |
| 2079 | 2127 | ||
| 2128 | + C.TenVad.Model := PAnsiChar(Config.TenVad.Model); | ||
| 2129 | + C.TenVad.Threshold := Config.TenVad.Threshold; | ||
| 2130 | + C.TenVad.MinSilenceDuration := Config.TenVad.MinSilenceDuration; | ||
| 2131 | + C.TenVad.MinSpeechDuration := Config.TenVad.MinSpeechDuration; | ||
| 2132 | + C.TenVad.WindowSize := Config.TenVad.WindowSize; | ||
| 2133 | + C.TenVad.MaxSpeechDuration := Config.TenVad.MaxSpeechDuration; | ||
| 2134 | + | ||
| 2080 | C.SampleRate := Config.SampleRate; | 2135 | C.SampleRate := Config.SampleRate; |
| 2081 | C.NumThreads := Config.NumThreads; | 2136 | C.NumThreads := Config.NumThreads; |
| 2082 | C.Provider := PAnsiChar(Config.Provider); | 2137 | C.Provider := PAnsiChar(Config.Provider); |
-
请注册或登录后发表评论