streaming-ctc-buffered-tokens-c-api.c
5.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
// c-api-examples/streaming-ctc-buffered-tokens-c-api.c
//
// Copyright (c) 2024 Xiaomi Corporation
// Copyright (c) 2024 Luo Xiao
//
// This file demonstrates how to use a streaming Zipformer2 CTC model with
// sherpa-onnx's C API, with the tokens passed to the API as an in-memory
// buffer instead of as the path of an external tokens file.
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
// tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
// rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
//
// clang-format on
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sherpa-onnx/c-api/c-api.h"
// Reads the entire content of a file into a newly allocated buffer.
//
// @param filename    Path of the file to read.
// @param buffer_out  Receives a pointer to the file content on success. The
//                    content is NOT NUL-terminated. The caller owns the
//                    buffer and must release it with free(). It is always
//                    set to NULL on failure, so free() on it is safe.
// @return The number of bytes read, or 0 on any failure (the file cannot be
//         opened, its size cannot be determined, it is empty, memory cannot
//         be allocated, or the read is short).
static size_t ReadFile(const char *filename, const char **buffer_out) {
  // Give the caller a well-defined pointer on every failure path.
  *buffer_out = NULL;

  // Binary mode, so that the size reported by ftell() matches the number of
  // bytes fread() delivers even on platforms that translate line endings.
  FILE *file = fopen(filename, "rb");
  if (file == NULL) {
    fprintf(stderr, "Failed to open %s\n", filename);
    return 0;
  }

  long size = -1;
  if (fseek(file, 0L, SEEK_END) == 0) {
    size = ftell(file);
  }

  if (size < 0 || fseek(file, 0L, SEEK_SET) != 0) {
    fprintf(stderr, "Failed to determine the size of %s\n", filename);
    fclose(file);
    return 0;
  }

  if (size == 0) {
    fprintf(stderr, "File %s is empty\n", filename);
    fclose(file);
    return 0;
  }

  char *data = malloc((size_t)size);
  if (data == NULL) {
    fclose(file);
    fprintf(stderr, "Memory error\n");
    return 0;
  }

  size_t read_bytes = fread(data, 1, (size_t)size, file);
  fclose(file);

  if (read_bytes != (size_t)size) {
    fprintf(stderr, "Errors occured in reading the file %s\n", filename);
    free(data);
    return 0;
  }

  *buffer_out = data;
  return read_bytes;
}
// Decodes a test wave file with a streaming Zipformer2 CTC model.
//
// The tokens file is first read into memory and handed to the recognizer
// through tokens_buf / tokens_buf_size. The wave is then fed to the stream in
// chunks of N samples to simulate streaming, and partial results are printed
// through a SherpaOnnxDisplay.
//
// @return 0 on success, -1 if the wave, the tokens or the recognizer cannot
//         be set up.
int32_t main() {
  const char *wav_filename =
      "sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/"
      "DEV_T0000000000.wav";
  const char *model_filename =
      "sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/"
      "ctc-epoch-20-avg-1-chunk-16-left-128.onnx";
  const char *tokens_filename =
      "sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt";
  const char *provider = "cpu";

  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  if (wave == NULL) {
    fprintf(stderr, "Failed to read %s\n", wav_filename);
    return -1;
  }

  // reading tokens to buffers
  //
  // tokens_buf starts as NULL so that the free() on the error path below is
  // well defined even if ReadFile() fails before storing a pointer.
  const char *tokens_buf = NULL;
  size_t token_buf_size = ReadFile(tokens_filename, &tokens_buf);
  // A failure may be reported either as 0 or as -1 converted to size_t.
  if (token_buf_size == 0 || token_buf_size == (size_t)-1) {
    fprintf(stderr, "Please check your tokens.txt!\n");
    free((void *)tokens_buf);
    SherpaOnnxFreeWave(wave);
    return -1;
  }

  // Zipformer2Ctc config
  SherpaOnnxOnlineZipformer2CtcModelConfig zipformer2_ctc_config;
  memset(&zipformer2_ctc_config, 0, sizeof(zipformer2_ctc_config));
  zipformer2_ctc_config.model = model_filename;

  // Online model config
  SherpaOnnxOnlineModelConfig online_model_config;
  memset(&online_model_config, 0, sizeof(online_model_config));
  online_model_config.debug = 1;
  online_model_config.num_threads = 1;
  online_model_config.provider = provider;
  online_model_config.tokens_buf = tokens_buf;
  online_model_config.tokens_buf_size = token_buf_size;
  online_model_config.zipformer2_ctc = zipformer2_ctc_config;

  // Recognizer config
  SherpaOnnxOnlineRecognizerConfig recognizer_config;
  memset(&recognizer_config, 0, sizeof(recognizer_config));
  recognizer_config.decoding_method = "greedy_search";
  recognizer_config.model_config = online_model_config;

  const SherpaOnnxOnlineRecognizer *recognizer =
      SherpaOnnxCreateOnlineRecognizer(&recognizer_config);

  // The tokens buffer is released right after the recognizer is created; this
  // example relies on the recognizer not referencing it afterwards.
  free((void *)tokens_buf);
  tokens_buf = NULL;

  if (recognizer == NULL) {
    fprintf(stderr, "Please check your config!\n");
    SherpaOnnxFreeWave(wave);
    return -1;
  }

  const SherpaOnnxOnlineStream *stream =
      SherpaOnnxCreateOnlineStream(recognizer);

  const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50);
  int32_t segment_id = 0;

// simulate streaming. You can choose an arbitrary N
#define N 3200

  fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
          wave->sample_rate, wave->num_samples,
          (float)wave->num_samples / wave->sample_rate);

  int32_t k = 0;
  while (k < wave->num_samples) {
    int32_t start = k;
    // The last chunk may hold fewer than N samples.
    int32_t end =
        (start + N > wave->num_samples) ? wave->num_samples : (start + N);
    k += N;

    SherpaOnnxOnlineStreamAcceptWaveform(stream, wave->sample_rate,
                                         wave->samples + start, end - start);

    while (SherpaOnnxIsOnlineStreamReady(recognizer, stream)) {
      SherpaOnnxDecodeOnlineStream(recognizer, stream);
    }

    const SherpaOnnxOnlineRecognizerResult *r =
        SherpaOnnxGetOnlineStreamResult(recognizer, stream);

    int has_text = strlen(r->text) > 0;
    if (has_text) {
      SherpaOnnxPrint(display, segment_id, r->text);
    }

    if (SherpaOnnxOnlineStreamIsEndpoint(recognizer, stream)) {
      // Only start a new segment if the finished one produced any text.
      if (has_text) {
        ++segment_id;
      }
      SherpaOnnxOnlineStreamReset(recognizer, stream);
    }

    SherpaOnnxDestroyOnlineRecognizerResult(r);
  }

  // add some tail padding
  enum { kNumTailPaddingSamples = 4800 };  // 0.3 seconds at 16 kHz sample rate
  float tail_paddings[kNumTailPaddingSamples] = {0};
  SherpaOnnxOnlineStreamAcceptWaveform(stream, wave->sample_rate, tail_paddings,
                                       kNumTailPaddingSamples);

  // The wave is not used after this point.
  SherpaOnnxFreeWave(wave);

  SherpaOnnxOnlineStreamInputFinished(stream);
  while (SherpaOnnxIsOnlineStreamReady(recognizer, stream)) {
    SherpaOnnxDecodeOnlineStream(recognizer, stream);
  }

  const SherpaOnnxOnlineRecognizerResult *r =
      SherpaOnnxGetOnlineStreamResult(recognizer, stream);

  if (strlen(r->text)) {
    SherpaOnnxPrint(display, segment_id, r->text);
  }

  SherpaOnnxDestroyOnlineRecognizerResult(r);

  SherpaOnnxDestroyDisplay(display);
  SherpaOnnxDestroyOnlineStream(stream);
  SherpaOnnxDestroyOnlineRecognizer(recognizer);

  fprintf(stderr, "\n");

  return 0;
}