{ File: zipformer_ctc.pas }
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use a streaming Zipformer CTC model
to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program zipformer_ctc;

{
  Feeds one wave file, followed by 0.5 seconds of tail padding, into a
  streaming (online) recognizer configured with a Zipformer2 CTC model,
  decodes it, and prints the recognition result together with timing
  statistics (elapsed time, wave duration and real-time factor).

  The program exits with code 1 if no audio could be loaded.
}

{$mode objfpc}

uses
  sherpa_onnx,
  DateUtils,
  SysUtils;

var
  Config: TSherpaOnnxOnlineRecognizerConfig;
  Recognizer: TSherpaOnnxOnlineRecognizer;
  Stream: TSherpaOnnxOnlineStream;
  RecognitionResult: TSherpaOnnxOnlineRecognizerResult;

  Wave: TSherpaOnnxWave;
  WaveFilename: AnsiString;
  TailPaddings: array of Single;

  Start: TDateTime;
  Stop: TDateTime;

  Elapsed: Single;        { decoding time in seconds }
  Duration: Single;       { length of the input audio in seconds }
  RealTimeFactor: Single; { Elapsed / Duration }

begin
  Initialize(Config);

  {Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  to download model files used in this file.}
  Config.ModelConfig.Zipformer2Ctc.Model := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx';
  Config.ModelConfig.Tokens := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  WaveFilename := './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav';
  Wave := SherpaOnnxReadWave(WaveFilename);

  { Stop early when no usable audio was loaded. Without this guard the
    duration computed below would be a division by zero (SampleRate = 0)
    or would make the real-time factor a division by zero (no samples). }
  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
  begin
    WriteLn(StdErr, Format('Failed to read audio samples from %s', [WaveFilename]));
    Halt(1);
  end;

  Recognizer := TSherpaOnnxOnlineRecognizer.Create(Config);
  try
    { The clock starts after the recognizer has been created, so the
      reported elapsed time does not include recognizer construction. }
    Start := Now;

    Stream := Recognizer.CreateStream();
    try
      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);

      {0.5 seconds of padding appended after the real audio}
      SetLength(TailPaddings, Round(Wave.SampleRate * 0.5));
      Stream.AcceptWaveform(TailPaddings, Wave.SampleRate);

      Stream.InputFinished();

      { Keep decoding for as long as the recognizer reports the stream ready. }
      while Recognizer.IsReady(Stream) do
        Recognizer.Decode(Stream);

      RecognitionResult := Recognizer.GetResult(Stream);

      Stop := Now;

      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
      { Duration is > 0 here thanks to the guard after reading the wave. }
      Duration := Length(Wave.Samples) / Wave.SampleRate;
      RealTimeFactor := Elapsed / Duration;

      WriteLn(RecognitionResult.ToString);
      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
      WriteLn(Format('Wave duration %.3f s', [Duration]));
      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
    finally
      { Free resources to avoid memory leaks. The try/finally blocks make
        sure this also happens when an exception is raised while decoding;
        follow the same pattern in your own large/complex project. }
      FreeAndNil(Stream);
    end;
  finally
    FreeAndNil(Recognizer);
  end;
end.