Fangjun Kuang
Committed by GitHub

Add Pascal API for speech enhancement GTCRN models (#1992)

@@ -111,20 +111,36 @@ jobs: @@ -111,20 +111,36 @@ jobs:
111 ls -lh install/lib/ 111 ls -lh install/lib/
112 112
113 if [[ ${{ matrix.os }} == 'windows-latest' ]]; then 113 if [[ ${{ matrix.os }} == 'windows-latest' ]]; then
  114 + cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
114 cp -v install/lib/*.dll ../pascal-api-examples/read-wav 115 cp -v install/lib/*.dll ../pascal-api-examples/read-wav
  116 + cp -v install/lib/*.dll ../pascal-api-examples/speaker-diarization
  117 + cp -v install/lib/*.dll ../pascal-api-examples/speech-enhancement-gtcrn
115 cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr 118 cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
116 - cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr 119 + cp -v install/lib/*.dll ../pascal-api-examples/tts
117 cp -v install/lib/*.dll ../pascal-api-examples/vad 120 cp -v install/lib/*.dll ../pascal-api-examples/vad
118 cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr 121 cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
119 122
  123 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr
120 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav 124 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav
  125 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/speaker-diarization
  126 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/speech-enhancement-gtcrn
121 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr 127 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr
122 - cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr 128 + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
123 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad 129 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad
124 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr 130 cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
125 - cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts  
126 fi 131 fi
127 132
  133 + - name: Run Speech Enhancement test (GTCRN)
  134 + shell: bash
  135 + run: |
  136 + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
  137 +
  138 + cd ./pascal-api-examples
  139 +
  140 + pushd speech-enhancement-gtcrn
  141 + ./run-gtcrn.sh
  142 + ls -lh
  143 +
128 - name: Run Pascal test (Non Streaming ASR) 144 - name: Run Pascal test (Non Streaming ASR)
129 shell: bash 145 shell: bash
130 run: | 146 run: |
@@ -99,11 +99,21 @@ jobs: @@ -99,11 +99,21 @@ jobs:
99 cp -v ./install/lib/sherpa-onnx-c-api.dll ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/ 99 cp -v ./install/lib/sherpa-onnx-c-api.dll ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/
100 cp -v ./install/lib/onnxruntime.dll ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/ 100 cp -v ./install/lib/onnxruntime.dll ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/
101 ls -lh ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/ 101 ls -lh ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/
102 - cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/speaker-identification/  
103 - cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/streaming-hlg-decoding/  
104 - cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-tts/ 102 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/add-punctuation
  103 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/audio-tagging
  104 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/keyword-spotting-from-file/
105 cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-decode-files/ 105 cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-decode-files/
  106 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-speaker-diarization/
  107 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/non-streaming-tts/
  108 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/speaker-identification/
  109 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/speech-enhancement-gtcrn
106 cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/streaming-decode-files/ 110 cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/streaming-decode-files/
  111 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/streaming-hlg-decoding/
  112 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad
  113 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-asr-paraformer
  114 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-asr-whisper
  115 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-speaker-identification
  116 + cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll ../scripts/go/_internal/vad-spoken-language-identification
107 117
108 cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll $upload_dir 118 cp -v ../scripts/go/_internal/lib/x86_64-pc-windows-gnu/*.dll $upload_dir
109 else 119 else
  1 +{ Copyright (c) 2025 Xiaomi Corporation }
  2 +{
  3 +This file shows how to use the speech enhancement API from sherpa-onnx
  4 +
  5 +Please first download files used in this script before you run it.
  6 +
  7 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx
  8 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav
  9 +}
  10 +program main;
  11 +
  12 +{$mode delphi}
  13 +
  14 +uses
  15 + sherpa_onnx,
  16 + SysUtils;
  17 +
  18 +var
  19 + Wave: TSherpaOnnxWave;
  20 +
  21 + Config: TSherpaOnnxOfflineSpeechDenoiserConfig;
  22 + Sd: TSherpaOnnxOfflineSpeechDenoiser;
  23 + Audio: TSherpaOnnxDenoisedAudio;
  24 +begin
  25 + Wave := SherpaOnnxReadWave('./inp_16k.wav');
  26 +
  27 + Initialize(Config);
  28 +
  29 + Config.Model.Gtcrn.Model := './gtcrn_simple.onnx';
  30 + Config.Model.NumThreads:= 1;
  31 + Config.Model.Debug:= True;
  32 + Config.Model.Provider:= 'cpu';
  33 +
  34 + Sd := TSherpaOnnxOfflineSpeechDenoiser.Create(Config);
  35 +
  36 + Audio := Sd.Run(Wave.Samples, Wave.SampleRate);
  37 +
  38 + SherpaOnnxWriteWave('./enhanced-16k.wav', Audio.Samples, Audio.SampleRate);
  39 + WriteLn('Saved to ./enhanced-16k.wav');
  40 +
  41 + FreeAndNil(Sd);
  42 +end.
  43 +
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
  6 +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
  7 +
  8 +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
  9 +
  10 +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  11 + mkdir -p ../../build
  12 + pushd ../../build
  13 + cmake \
  14 + -DCMAKE_INSTALL_PREFIX=./install \
  15 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  16 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  17 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  18 + -DBUILD_SHARED_LIBS=ON \
  19 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  20 + ..
  21 +
  22 + cmake --build . --target install --config Release
  23 + popd
  24 +fi
  25 +
  26 +if [ ! -f ./gtcrn_simple.onnx ]; then
  27 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx
  28 +fi
  29 +
  30 +if [ ! -f ./inp_16k.wav ]; then
  31 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav
  32 +fi
  33 +
  34 +fpc \
  35 + -dSHERPA_ONNX_USE_SHARED_LIBS \
  36 + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
  37 + -Fl$SHERPA_ONNX_DIR/build/install/lib \
  38 + ./gtcrn.pas
  39 +
  40 +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
  41 +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
  42 +
  43 +./gtcrn
  44 +
@@ -515,6 +515,44 @@ type @@ -515,6 +515,44 @@ type
515 property GetSampleRate: Integer Read SampleRate; 515 property GetSampleRate: Integer Read SampleRate;
516 end; 516 end;
517 517
  518 + TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig = record
  519 + Model: AnsiString;
  520 + function ToString: AnsiString;
  521 + end;
  522 +
  523 + TSherpaOnnxOfflineSpeechDenoiserModelConfig = record
  524 + Gtcrn: TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig;
  525 + NumThreads: Integer;
  526 + Debug: Boolean;
  527 + Provider: AnsiString;
  528 + function ToString: AnsiString;
  529 + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeechDenoiserModelConfig);
  530 + end;
  531 +
  532 + TSherpaOnnxOfflineSpeechDenoiserConfig = record
  533 + Model: TSherpaOnnxOfflineSpeechDenoiserModelConfig;
  534 + function ToString: AnsiString;
  535 + end;
  536 +
  537 + TSherpaOnnxDenoisedAudio = record
  538 + Samples: array of Single;
  539 + SampleRate: Integer;
  540 + end;
  541 +
  542 + TSherpaOnnxOfflineSpeechDenoiser = class
  543 + private
  544 + Handle: Pointer;
  545 + SampleRate: Integer;
  546 + _Config: TSherpaOnnxOfflineSpeechDenoiserConfig;
  547 + public
  548 + constructor Create(Config: TSherpaOnnxOfflineSpeechDenoiserConfig);
  549 + destructor Destroy; override;
  550 +
  551 + function Run(Samples: array of Single; InputSampleRate: Integer): TSherpaOnnxDenoisedAudio;
  552 +
  553 + property GetHandle: Pointer Read Handle;
  554 + property GetSampleRate: Integer Read SampleRate;
  555 + end;
518 556
519 { It supports reading a single channel wave with 16-bit encoded samples. 557 { It supports reading a single channel wave with 16-bit encoded samples.
520 Samples are normalized to the range [-1, 1]. 558 Samples are normalized to the range [-1, 1].
@@ -851,6 +889,31 @@ type @@ -851,6 +889,31 @@ type
851 889
852 PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig; 890 PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig;
853 891
  892 + SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig = record
  893 + Model: PAnsiChar;
  894 + end;
  895 +
  896 + SherpaOnnxOfflineSpeechDenoiserModelConfig = record
  897 + Gtcrn: SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig;
  898 + NumThreads: cint32;
  899 + Debug: cint32;
  900 + Provider: PAnsiChar;
  901 + end;
  902 +
  903 + SherpaOnnxOfflineSpeechDenoiserConfig = record
  904 + Model: SherpaOnnxOfflineSpeechDenoiserModelConfig;
  905 + end;
  906 +
  907 + PSherpaOnnxOfflineSpeechDenoiserConfig = ^SherpaOnnxOfflineSpeechDenoiserConfig;
  908 +
  909 + SherpaOnnxDenoisedAudio = record
  910 + Samples: pcfloat;
  911 + N: cint32;
  912 + SampleRate: cint32;
  913 + end;
  914 +
  915 + PSherpaOnnxDenoisedAudio = ^SherpaOnnxDenoisedAudio;
  916 +
854 function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32; 917 function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
855 SampleRateOutHz: cint32; 918 SampleRateOutHz: cint32;
856 FilterCutoffHz: cfloat; 919 FilterCutoffHz: cfloat;
@@ -872,6 +935,22 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec @@ -872,6 +935,22 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec
872 procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl; 935 procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
873 external SherpaOnnxLibName; 936 external SherpaOnnxLibName;
874 937
  938 +function SherpaOnnxCreateOfflineSpeechDenoiser(Config: PSherpaOnnxOfflineSpeechDenoiserConfig): Pointer; cdecl;
  939 + external SherpaOnnxLibName;
  940 +
  941 +procedure SherpaOnnxDestroyOfflineSpeechDenoiser(P: Pointer); cdecl;
  942 + external SherpaOnnxLibName;
  943 +
  944 +function SherpaOnnxOfflineSpeechDenoiserGetSampleRate(P: Pointer): cint32; cdecl;
  945 + external SherpaOnnxLibName;
  946 +
  947 +function SherpaOnnxOfflineSpeechDenoiserRun(P: Pointer;
  948 + Samples: pcfloat; N: cint32;SampleRate: cint32):PSherpaOnnxDenoisedAudio; cdecl;
  949 + external SherpaOnnxLibName;
  950 +
  951 +procedure SherpaOnnxDestroyDenoisedAudio(Audio: Pointer); cdecl;
  952 + external SherpaOnnxLibName;
  953 +
875 function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl; 954 function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl;
876 external SherpaOnnxLibName; 955 external SherpaOnnxLibName;
877 956
@@ -2358,4 +2437,79 @@ begin @@ -2358,4 +2437,79 @@ begin
2358 SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R); 2437 SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R);
2359 end; 2438 end;
2360 2439
  2440 +function TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig.ToString: AnsiString;
  2441 +begin
  2442 + Result := Format('TSherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(' +
  2443 + 'Model := %s)', [Self.Model]);
  2444 +end;
  2445 +
  2446 +function TSherpaOnnxOfflineSpeechDenoiserModelConfig.ToString: AnsiString;
  2447 +begin
  2448 + Result := Format('TSherpaOnnxOfflineSpeechDenoiserModelConfig(' +
  2449 + 'Gtcrn := %s, '+
  2450 + 'NumThreads := %d, '+
  2451 + 'Debug := %s, '+
  2452 + 'Provider := %s)',
  2453 + [Self.Gtcrn.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider]);
  2454 +end;
  2455 +
  2456 +class operator TSherpaOnnxOfflineSpeechDenoiserModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeechDenoiserModelConfig);
  2457 +begin
  2458 + Dest.NumThreads := 1;
  2459 + Dest.Debug := False;
  2460 + Dest.Provider := 'cpu';
  2461 +end;
  2462 +
  2463 +function TSherpaOnnxOfflineSpeechDenoiserConfig.ToString: AnsiString;
  2464 +begin
  2465 + Result := Format('TSherpaOnnxOfflineSpeechDenoiserConfig(' +
  2466 + 'Model := %s)', [Self.Model.ToString]);
  2467 +end;
  2468 +
  2469 +constructor TSherpaOnnxOfflineSpeechDenoiser.Create(Config: TSherpaOnnxOfflineSpeechDenoiserConfig);
  2470 +var
  2471 + C: SherpaOnnxOfflineSpeechDenoiserConfig;
  2472 +begin
  2473 + C := Default(SherpaOnnxOfflineSpeechDenoiserConfig);
  2474 + C.Model.Gtcrn.Model := PAnsiChar(Config.Model.Gtcrn.Model);
  2475 + C.Model.NumThreads := Config.Model.NumThreads;
  2476 + C.Model.Debug := Ord(Config.Model.Debug);
  2477 + C.Model.Provider := PAnsiChar(Config.Model.Provider);
  2478 +
  2479 + Self.Handle := SherpaOnnxCreateOfflineSpeechDenoiser(@C);
  2480 + Self._Config := Config;
  2481 + Self.SampleRate := 0;
  2482 +
  2483 + if Self.Handle <> nil then
  2484 + begin
  2485 + Self.SampleRate := SherpaOnnxOfflineSpeechDenoiserGetSampleRate(Self.Handle);
  2486 + end;
  2487 +end;
  2488 +
  2489 +destructor TSherpaOnnxOfflineSpeechDenoiser.Destroy;
  2490 +begin
  2491 + SherpaOnnxDestroyOfflineSpeechDenoiser(Self.Handle);
  2492 + Self.Handle := nil;
  2493 +end;
  2494 +
  2495 +function TSherpaOnnxOfflineSpeechDenoiser.Run(Samples: array of Single; InputSampleRate: Integer): TSherpaOnnxDenoisedAudio;
  2496 +var
  2497 + Audio: PSherpaOnnxDenoisedAudio;
  2498 + I: Integer;
  2499 +begin
  2500 + Result := Default(TSherpaOnnxDenoisedAudio);
  2501 +
  2502 + Audio := SherpaOnnxOfflineSpeechDenoiserRun(Self.Handle, pcfloat(Samples), Length(Samples), InputSampleRate);
  2503 +
  2504 + SetLength(Result.Samples, Audio^.N);
  2505 + Result.SampleRate := Audio^.SampleRate;
  2506 +
  2507 + for I := Low(Result.Samples) to High(Result.Samples) do
  2508 + begin
  2509 + Result.Samples[I] := Audio^.Samples[I];
  2510 + end;
  2511 +
  2512 + SherpaOnnxDestroyDenoisedAudio(audio);
  2513 +end;
  2514 +
2361 end. 2515 end.