sherpa-onnx-alsa.cc
4.0 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
// sherpa-onnx/csrc/sherpa-onnx-alsa.cc
//
// Copyright (c) 2022-2023 Xiaomi Corporation
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>  // write, STDERR_FILENO

#include <algorithm>
#include <cctype> // std::tolower
#include <cstdint>
#include <string>
#include <vector>

#include "sherpa-onnx/csrc/alsa.h"
#include "sherpa-onnx/csrc/display.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/parse-options.h"
// Shutdown request flag: set by the SIGINT handler, polled by the main loop.
// `volatile sig_atomic_t` is a type the C/C++ standards guarantee can be
// written from an asynchronous signal handler and read afterwards; a plain
// `bool` carries no such guarantee.
volatile sig_atomic_t stop = 0;

// SIGINT handler: requests that the main loop exit and tells the user so.
//
// Only async-signal-safe operations are used here. fprintf() is not on the
// POSIX async-signal-safe list (it may take the stream lock, which the
// interrupted code could already be holding), so the message is emitted
// with write(2) instead.
static void Handler(int /*sig*/) {
  stop = 1;

  static const char kMessage[] = "\nCaught Ctrl + C. Exiting...\n";
  // Best effort: there is nothing useful to do if the write fails.
  const ssize_t written = write(STDERR_FILENO, kMessage, sizeof(kMessage) - 1);
  (void)written;
}
// Streaming speech recognition from an ALSA capture device.
//
// Command line: recognizer options (registered by OnlineRecognizerConfig)
// followed by exactly one positional argument, the ALSA device name
// (e.g. plughw:3,0).
//
// Flow:
//   1. Install the SIGINT handler and parse/validate the configuration.
//   2. Create the recognizer and open the capture device.
//   3. Until `stop` is set: read a chunk of audio, feed it to the stream,
//      decode while the recognizer is ready, and print the text whenever
//      it changes. On an endpoint, advance the segment index (if any text
//      was produced) and reset the stream.
//
// Returns 0 on a normal (Ctrl + C) exit, EXIT_FAILURE on a wrong number of
// positional arguments, and -1 on an invalid config or a sample-rate
// mismatch. Error paths use `return` rather than exit() so that the
// destructors of objects already constructed in main still run.
int main(int32_t argc, char *argv[]) {
  signal(SIGINT, Handler);

  const char *kUsageMessage = R"usage(
Usage:
./bin/sherpa-onnx-alsa \
--tokens=/path/to/tokens.txt \
--encoder=/path/to/encoder.onnx \
--decoder=/path/to/decoder.onnx \
--joiner=/path/to/joiner.onnx \
--provider=cpu \
--num-threads=2 \
--decoding-method=greedy_search \
device_name
Please refer to
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
for a list of pre-trained models to download.
The device name specifies which microphone to use in case there are several
on your system. You can use
arecord -l
to find all available microphones on your computer. For instance, if it outputs
**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
Subdevices: 1/1
Subdevice #0: subdevice #0
and if you want to select card 3 and device 0 on that card, please use:
plughw:3,0
as the device_name.
)usage";

  sherpa_onnx::ParseOptions po(kUsageMessage);
  sherpa_onnx::OnlineRecognizerConfig config;

  config.Register(&po);
  po.Read(argc, argv);

  // The device name is the one and only positional argument.
  if (po.NumArgs() != 1) {
    fprintf(stderr, "Please provide only 1 argument: the device name\n");
    po.PrintUsage();
    return EXIT_FAILURE;
  }

  fprintf(stderr, "%s\n", config.ToString().c_str());

  if (!config.Validate()) {
    fprintf(stderr, "Errors in config!\n");
    return -1;
  }

  sherpa_onnx::OnlineRecognizer recognizer(config);

  const int32_t expected_sample_rate = config.feat_config.sampling_rate;

  const std::string device_name = po.GetArg(1);
  sherpa_onnx::Alsa alsa(device_name.c_str());
  fprintf(stderr, "Use recording device: %s\n", device_name.c_str());

  // The rate the ALSA wrapper reports as expected must match the rate the
  // feature extractor is configured for.
  if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
            expected_sample_rate);
    return -1;
  }

  // Number of samples requested per read: 0.1 s at the device's actual rate.
  const int32_t chunk =
      static_cast<int32_t>(0.1 * alsa.GetActualSampleRate());

  std::string last_text;  // most recently displayed (un-lowercased) result

  auto stream = recognizer.CreateStream();

  sherpa_onnx::Display display;

  int32_t segment_index = 0;

  while (!stop) {
    const std::vector<float> &samples = alsa.Read(chunk);

    stream->AcceptWaveform(expected_sample_rate, samples.data(),
                           samples.size());

    while (recognizer.IsReady(stream.get())) {
      recognizer.DecodeStream(stream.get());
    }

    auto text = recognizer.GetResult(stream.get()).text;

    const bool is_endpoint = recognizer.IsEndpoint(stream.get());

    if (is_endpoint && !config.model_config.paraformer.encoder.empty()) {
      // For streaming paraformer models, since it has a large right chunk size
      // we need to pad it on endpointing so that the last character
      // can be recognized
      //
      // The padding is expected_sample_rate zero-valued samples.
      std::vector<float> tail_paddings(
          static_cast<int>(1.0 * expected_sample_rate));
      stream->AcceptWaveform(expected_sample_rate, tail_paddings.data(),
                             tail_paddings.size());

      while (recognizer.IsReady(stream.get())) {
        recognizer.DecodeStream(stream.get());
      }

      text = recognizer.GetResult(stream.get()).text;
    }

    // Only redraw when the result actually changed.
    if (!text.empty() && last_text != text) {
      last_text = text;

      // std::tolower requires its argument to be representable as
      // unsigned char (or EOF). Passing a plain, possibly negative `char`
      // (e.g. a byte of a multi-byte UTF-8 sequence) is undefined behaviour,
      // so each byte is taken as unsigned char before the call.
      std::transform(text.begin(), text.end(), text.begin(),
                     [](unsigned char c) {
                       return static_cast<char>(std::tolower(c));
                     });

      display.Print(segment_index, text);
      fflush(stderr);
    }

    if (is_endpoint) {
      // Empty segments do not consume a segment index.
      if (!text.empty()) {
        ++segment_index;
      }

      recognizer.Reset(stream.get());
    }
  }

  return 0;
}