// sherpa-onnx/csrc/sherpa-onnx-vad-alsa-offline-asr.cc
//
// Copyright (c) 2022-2025 Xiaomi Corporation
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <mutex> // NOLINT
#include "sherpa-onnx/csrc/alsa.h"
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/resample.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
// Set to nonzero by the SIGINT handler to ask the capture loop in main()
// to exit. `volatile sig_atomic_t` (not a plain bool) is used because it
// is the only plain object type the C/C++ standards guarantee may be
// safely written from an asynchronous signal handler.
volatile sig_atomic_t stop = 0;

// SIGINT (Ctrl+C) handler: flags the main loop to stop.
// NOTE(review): fprintf is not on the async-signal-safe list; it is kept
// here only to preserve the original user-visible message.
static void Handler(int32_t /*sig*/) {
  stop = 1;
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
// Entry point: streaming VAD (silero-vad) + non-streaming (offline) ASR
// over audio captured from an ALSA device.
//
// Flow: read fixed-size sample windows from ALSA -> feed them to the VAD
// -> for each completed speech segment, decode it with the offline
// recognizer and print the result to stderr.
//
// Note: main must return `int`, not `int32_t` — the standard mandates the
// exact type `int` here (they usually coincide, but `int32_t main` is
// nonconforming).
int main(int argc, char *argv[]) {
  signal(SIGINT, Handler);  // let Ctrl+C break the capture loop below

  const char *kUsageMessage = R"usage(
This program shows how to use a streaming VAD with non-streaming ASR in
sherpa-onnx.
Please download silero_vad.onnx from
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
For instance, use
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
Please refer to ./sherpa-onnx-microphone-offline.cc
to download models for offline ASR.
(1) Transducer from icefall
./bin/sherpa-onnx-vad-microphone-offline-asr \
--silero-vad-model=/path/to/silero_vad.onnx \
--tokens=/path/to/tokens.txt \
--encoder=/path/to/encoder.onnx \
--decoder=/path/to/decoder.onnx \
--joiner=/path/to/joiner.onnx \
device_name
(2) Paraformer from FunASR
./bin/sherpa-onnx-vad-microphone-offline-asr \
--silero-vad-model=/path/to/silero_vad.onnx \
--tokens=/path/to/tokens.txt \
--paraformer=/path/to/model.onnx \
device_name
(3) Whisper models
./bin/sherpa-onnx-vad-microphone-offline-asr \
--silero-vad-model=/path/to/silero_vad.onnx \
--whisper-encoder=./sherpa-onnx-whisper-base.en/base.en-encoder.int8.onnx \
--whisper-decoder=./sherpa-onnx-whisper-base.en/base.en-decoder.int8.onnx \
--tokens=./sherpa-onnx-whisper-base.en/base.en-tokens.txt \
device_name
The device name specifies which microphone to use in case there are several
on your system. You can use
arecord -l
to find all available microphones on your computer. For instance, if it outputs
**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
Subdevices: 1/1
Subdevice #0: subdevice #0
and if you want to select card 3 and device 0 on that card, please use:
plughw:3,0
as the device_name.
)usage";

  // Register both config groups on one option parser so a single command
  // line configures the VAD model and the ASR model together.
  sherpa_onnx::ParseOptions po(kUsageMessage);
  sherpa_onnx::VadModelConfig vad_config;
  sherpa_onnx::OfflineRecognizerConfig asr_config;
  vad_config.Register(&po);
  asr_config.Register(&po);

  po.Read(argc, argv);
  // Exactly one positional argument is expected: the ALSA device name.
  if (po.NumArgs() != 1) {
    fprintf(stderr, "Please provide only 1 argument: the device name\n");
    po.PrintUsage();
    exit(EXIT_FAILURE);
  }

  // Echo the effective configuration before validation so the user can see
  // what was parsed even when validation fails.
  fprintf(stderr, "%s\n", vad_config.ToString().c_str());
  fprintf(stderr, "%s\n", asr_config.ToString().c_str());

  if (!vad_config.Validate()) {
    fprintf(stderr, "Errors in vad_config!\n");
    return -1;
  }

  if (!asr_config.Validate()) {
    fprintf(stderr, "Errors in asr_config!\n");
    return -1;
  }

  fprintf(stderr, "Creating recognizer ...\n");
  sherpa_onnx::OfflineRecognizer recognizer(asr_config);
  fprintf(stderr, "Recognizer created!\n");

  auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(vad_config);

  std::string device_name = po.GetArg(1);
  sherpa_onnx::Alsa alsa(device_name.c_str());
  fprintf(stderr, "Use recording device: %s\n", device_name.c_str());

  // This pipeline assumes 16 kHz input; refuse to run if the device does
  // not deliver that rate rather than feeding wrongly-sampled audio to the
  // models. (EXIT_FAILURE for consistency with the earlier exit; exit(-1)
  // yields an implementation-defined status, 255 on POSIX.)
  int32_t sample_rate = 16000;
  if (alsa.GetExpectedSampleRate() != sample_rate) {
    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
            sample_rate);
    exit(EXIT_FAILURE);
  }

  fprintf(stderr, "Started. Please speak\n");

  int32_t window_size = vad_config.silero_vad.window_size;
  int32_t index = 0;

  // Capture loop: runs until the SIGINT handler sets `stop`.
  while (!stop) {
    // Read one VAD window of samples from the sound card.
    const std::vector<float> &samples = alsa.Read(window_size);

    vad->AcceptWaveform(samples.data(), samples.size());

    // Drain all speech segments the VAD has finalized so far; each segment
    // is decoded with its own fresh offline stream.
    while (!vad->Empty()) {
      const auto &segment = vad->Front();
      auto s = recognizer.CreateStream();
      s->AcceptWaveform(sample_rate, segment.samples.data(),
                        segment.samples.size());
      recognizer.DecodeStream(s.get());

      const auto &result = s->GetResult();
      if (!result.text.empty()) {
        fprintf(stderr, "%2d: %s\n", index, result.text.c_str());
        ++index;
      }
      vad->Pop();
    }
  }

  return 0;
}