Fangjun Kuang
Committed by GitHub

Add Pascal API for Moonshine models (#1482)

... ... @@ -165,6 +165,10 @@ jobs:
cd ./pascal-api-examples
pushd vad-with-non-streaming-asr
time ./run-vad-with-moonshine.sh
rm -rf sherpa-onnx-*
echo "---"
time ./run-vad-with-whisper.sh
rm -rf sherpa-onnx-*
echo "---"
... ... @@ -220,6 +224,10 @@ jobs:
rm -rf sherpa-onnx-*
echo "---"
./run-moonshine.sh
rm -rf sherpa-onnx-*
echo "---"
./run-whisper.sh
rm -rf sherpa-onnx-*
echo "---"
... ...
... ... @@ -7,3 +7,4 @@ paraformer
paraformer_itn
sense_voice
telespeech_ctc
moonshine
... ...
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use a non-streaming Moonshine model
to decode files.

You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program moonshine;

{$mode objfpc}

uses
sherpa_onnx,
DateUtils,
SysUtils;

const
{Directory obtained by extracting sherpa-onnx-moonshine-tiny-en-int8.tar.bz2}
ModelDir = './sherpa-onnx-moonshine-tiny-en-int8';

var
Wave: TSherpaOnnxWave;
Config: TSherpaOnnxOfflineRecognizerConfig;
Recognizer: TSherpaOnnxOfflineRecognizer;
Stream: TSherpaOnnxOfflineStream;
RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
StartTime: TDateTime;
StopTime: TDateTime;
ElapsedSeconds: Single;
AudioSeconds: Single;
Rtf: Single;
begin
{Point the recognizer at the four Moonshine onnx files plus tokens.txt}
Initialize(Config);
Config.ModelConfig.Moonshine.Preprocessor := ModelDir + '/preprocess.onnx';
Config.ModelConfig.Moonshine.Encoder := ModelDir + '/encode.int8.onnx';
Config.ModelConfig.Moonshine.UncachedDecoder := ModelDir + '/uncached_decode.int8.onnx';
Config.ModelConfig.Moonshine.CachedDecoder := ModelDir + '/cached_decode.int8.onnx';
Config.ModelConfig.Tokens := ModelDir + '/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Config.ModelConfig.NumThreads := 1;
Config.ModelConfig.Debug := False;

Wave := SherpaOnnxReadWave(ModelDir + '/test_wavs/0.wav');

Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
Stream := Recognizer.CreateStream();

{Time only the decoding work, not model loading}
StartTime := Now;
Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
Recognizer.Decode(Stream);
RecognitionResult := Recognizer.GetResult(Stream);
StopTime := Now;

ElapsedSeconds := MilliSecondsBetween(StopTime, StartTime) / 1000;
AudioSeconds := Length(Wave.Samples) / Wave.SampleRate;
Rtf := ElapsedSeconds / AudioSeconds;

WriteLn(RecognitionResult.ToString);
WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
WriteLn(Format('Elapsed %.3f s', [ElapsedSeconds]));
WriteLn(Format('Wave duration %.3f s', [AudioSeconds]));
WriteLn(Format('RTF = %.3f/%.3f = %.3f',
  [ElapsedSeconds, AudioSeconds, Rtf]));

{Free resources to avoid memory leak.

Note: You don't need to invoke them for this simple script.
However, you have to invoke them in your own large/complex project.
}
FreeAndNil(Stream);
FreeAndNil(Recognizer);
end.
... ...
#!/usr/bin/env bash
# Build the sherpa-onnx shared C library if it is missing, download the
# Moonshine tiny int8 model if it is missing, then compile the Pascal
# example with fpc and run it.
set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quote expansions everywhere below so the script survives a checkout
# path containing spaces.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR"/../.. && pwd)
echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build

  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  ls -lh lib
  popd
fi

# Download the model archive once; tokens.txt is the last file extracted,
# so its presence implies a complete extraction.
model=sherpa-onnx-moonshine-tiny-en-int8
if [ ! -f "./$model/tokens.txt" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model.tar.bz2"
  tar xvf "$model.tar.bz2"
  rm "$model.tar.bz2"
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  ./moonshine.pas

# LD_LIBRARY_PATH for Linux, DYLD_LIBRARY_PATH for macOS.
export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH"

./moonshine
... ...
!run-*.sh
vad_with_whisper
vad_with_sense_voice
vad_with_moonshine
... ...
#!/usr/bin/env bash
# Build the sherpa-onnx shared C library if it is missing, download the
# silero VAD model, a test wave, and the Moonshine tiny int8 model, then
# compile the Pascal VAD+ASR example with fpc and run it.
set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quote expansions everywhere below so the script survives a checkout
# path containing spaces.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR"/../.. && pwd)
echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build

  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./Obama.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
fi

# Download the model archive once; tokens.txt is the last file extracted,
# so its presence implies a complete extraction.
model=sherpa-onnx-moonshine-tiny-en-int8
if [ ! -f "./$model/tokens.txt" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model.tar.bz2"
  tar xvf "$model.tar.bz2"
  rm "$model.tar.bz2"
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  ./vad_with_moonshine.pas

# LD_LIBRARY_PATH for Linux, DYLD_LIBRARY_PATH for macOS.
export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH"

./vad_with_moonshine
... ...
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use a non-streaming Moonshine model
with silero VAD to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program vad_with_moonshine;
{$mode objfpc}
uses
sherpa_onnx,
SysUtils;
{ Build a silero VAD detector configured for 16 kHz input. }
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
{Please don't change these unless you know the details}
VadSampleRate = 16000;
VadWindowSize = 512;
var
VadConfig: TSherpaOnnxVadModelConfig;
begin
Initialize(VadConfig);

VadConfig.SileroVad.Model := './silero_vad.onnx';
VadConfig.SileroVad.MinSpeechDuration := 0.5;
VadConfig.SileroVad.MinSilenceDuration := 0.5;
VadConfig.SileroVad.Threshold := 0.5;
VadConfig.SileroVad.WindowSize := VadWindowSize;
VadConfig.NumThreads := 1;
VadConfig.Debug := True;
VadConfig.Provider := 'cpu';
VadConfig.SampleRate := VadSampleRate;

Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
{ Build a non-streaming recognizer backed by the Moonshine tiny int8 model. }
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
const
{Directory obtained by extracting sherpa-onnx-moonshine-tiny-en-int8.tar.bz2}
ModelDir = './sherpa-onnx-moonshine-tiny-en-int8';
var
RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
Initialize(RecognizerConfig);

RecognizerConfig.ModelConfig.Moonshine.Preprocessor := ModelDir + '/preprocess.onnx';
RecognizerConfig.ModelConfig.Moonshine.Encoder := ModelDir + '/encode.int8.onnx';
RecognizerConfig.ModelConfig.Moonshine.UncachedDecoder := ModelDir + '/uncached_decode.int8.onnx';
RecognizerConfig.ModelConfig.Moonshine.CachedDecoder := ModelDir + '/cached_decode.int8.onnx';
RecognizerConfig.ModelConfig.Tokens := ModelDir + '/tokens.txt';
RecognizerConfig.ModelConfig.Provider := 'cpu';
RecognizerConfig.ModelConfig.NumThreads := 1;
RecognizerConfig.ModelConfig.Debug := False;

Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
var
Wave: TSherpaOnnxWave;
Recognizer: TSherpaOnnxOfflineRecognizer;
Vad: TSherpaOnnxVoiceActivityDetector;
Offset: Integer;
WindowSize: Integer;

{ Drain the VAD queue: decode every pending speech segment with the
  recognizer and print "start -- end text" for each one.
  Extracted because the original main body repeated this loop verbatim
  after Vad.Flush. }
procedure DecodeQueuedSegments;
var
SpeechSegment: TSherpaOnnxSpeechSegment;
SegStart: Single;
SegDuration: Single;
Stream: TSherpaOnnxOfflineStream;
RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
begin
while not Vad.IsEmpty do
begin
SpeechSegment := Vad.Front();
Vad.Pop();

Stream := Recognizer.CreateStream();
Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
Recognizer.Decode(Stream);
RecognitionResult := Recognizer.GetResult(Stream);

{Segment boundaries are in samples; convert to seconds for display}
SegStart := SpeechSegment.Start / Wave.SampleRate;
SegDuration := Length(SpeechSegment.Samples) / Wave.SampleRate;

WriteLn(Format('%.3f -- %.3f %s',
  [SegStart, SegStart + SegDuration, RecognitionResult.Text]));

FreeAndNil(Stream);
end;
end;

begin
Vad := CreateVad();
Recognizer := CreateOfflineRecognizer();
Wave := SherpaOnnxReadWave('./Obama.wav');

{The VAD model only works at its configured sample rate; bail out early
 instead of producing garbage segments}
if Wave.SampleRate <> Vad.Config.SampleRate then
begin
WriteLn(Format('Expected sample rate: %d. Given: %d',
  [Vad.Config.SampleRate, Wave.SampleRate]));

Exit;
end;

WindowSize := Vad.Config.SileroVad.WindowSize;

{Feed the wave to the VAD window by window, decoding segments as they
 become available}
Offset := 0;
while Offset + WindowSize <= Length(Wave.Samples) do
begin
Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
Offset += WindowSize;

DecodeQueuedSegments;
end;

{Flush forces out any trailing segment shorter than a full window}
Vad.Flush;
DecodeQueuedSegments;

FreeAndNil(Recognizer);
FreeAndNil(Vad);
end.
... ...
... ... @@ -250,6 +250,14 @@ type
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
end;
{ Paths to the four onnx files that make up a non-streaming Moonshine
  ASR model. The example programs fill these with preprocess.onnx,
  encode.int8.onnx, uncached_decode.int8.onnx and cached_decode.int8.onnx
  from an extracted model directory. }
TSherpaOnnxOfflineMoonshineModelConfig = record
Preprocessor: AnsiString; { path to the preprocessor onnx file }
Encoder: AnsiString; { path to the encoder onnx file }
UncachedDecoder: AnsiString; { path to the uncached-decoder onnx file }
CachedDecoder: AnsiString; { path to the cached-decoder onnx file }
function ToString: AnsiString; { human-readable dump of all four paths }
end;
TSherpaOnnxOfflineTdnnModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
... ... @@ -285,6 +293,7 @@ type
BpeVocab: AnsiString;
TeleSpeechCtc: AnsiString;
SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
function ToString: AnsiString;
end;
... ... @@ -617,6 +626,12 @@ type
Task: PAnsiChar;
TailPaddings: cint32;
end;
{ C-layout counterpart of TSherpaOnnxOfflineMoonshineModelConfig: the
  AnsiString fields are converted to PAnsiChar before being handed to the
  sherpa-onnx C API. Field order must match the C struct. }
SherpaOnnxOfflineMoonshineModelConfig = record
Preprocessor: PAnsiChar;
Encoder: PAnsiChar;
UncachedDecoder: PAnsiChar;
CachedDecoder: PAnsiChar;
end;
SherpaOnnxOfflineTdnnModelConfig = record
Model: PAnsiChar;
end;
... ... @@ -644,6 +659,7 @@ type
BpeVocab: PAnsiChar;
TeleSpeechCtc: PAnsiChar;
SenseVoice: SherpaOnnxOfflineSenseVoiceModelConfig;
Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
end;
SherpaOnnxOfflineRecognizerConfig = record
... ... @@ -1312,6 +1328,16 @@ begin
[Self.Encoder, Self.Decoder, Self.Language, Self.Task, Self.TailPaddings]);
end;
{ Render the config as a single diagnostic line listing all four paths. }
function TSherpaOnnxOfflineMoonshineModelConfig.ToString: AnsiString;
const
Fmt = 'TSherpaOnnxOfflineMoonshineModelConfig(' +
      'Preprocessor := %s, ' +
      'Encoder := %s, ' +
      'UncachedDecoder := %s, ' +
      'CachedDecoder := %s)';
begin
Result := Format(Fmt,
  [Self.Preprocessor, Self.Encoder, Self.UncachedDecoder, Self.CachedDecoder]);
end;
function TSherpaOnnxOfflineTdnnModelConfig.ToString: AnsiString;
begin
Result := Format('TSherpaOnnxOfflineTdnnModelConfig(Model := %s)',
... ... @@ -1353,13 +1379,14 @@ begin
'ModelingUnit := %s, ' +
'BpeVocab := %s, ' +
'TeleSpeechCtc := %s, ' +
'SenseVoice := %s' +
'SenseVoice := %s, ' +
'Moonshine := %s' +
')',
[Self.Transducer.ToString, Self.Paraformer.ToString,
Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
Self.TeleSpeechCtc, Self.SenseVoice.ToString
Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString
]);
end;
... ... @@ -1407,7 +1434,6 @@ begin
C.ModelConfig.Tdnn.Model := PAnsiChar(Config.ModelConfig.Tdnn.Model);
C.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens);
C.ModelConfig.NumThreads := Config.ModelConfig.NumThreads;
C.ModelConfig.Debug := Ord(Config.ModelConfig.Debug);
... ... @@ -1421,6 +1447,11 @@ begin
C.ModelConfig.SenseVoice.Language := PAnsiChar(Config.ModelConfig.SenseVoice.Language);
C.ModelConfig.SenseVoice.UseItn := Ord(Config.ModelConfig.SenseVoice.UseItn);
C.ModelConfig.Moonshine.Preprocessor := PAnsiChar(Config.ModelConfig.Moonshine.Preprocessor);
C.ModelConfig.Moonshine.Encoder := PAnsiChar(Config.ModelConfig.Moonshine.Encoder);
C.ModelConfig.Moonshine.UncachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.UncachedDecoder);
C.ModelConfig.Moonshine.CachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.CachedDecoder);
C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
C.LMConfig.Scale := Config.LMConfig.Scale;
... ...