online-ctc-fst-decoder.cc
3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// sherpa-onnx/csrc/online-ctc-fst-decoder.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include "sherpa-onnx/csrc/online-ctc-fst-decoder.h"
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "fst/fstlib.h"
#include "kaldi-decoder/csrc/decodable-ctc.h"
#include "kaldifst/csrc/fstext-utils.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/online-stream.h"
namespace sherpa_onnx {
// defined in ./offline-ctc-fst-decoder.cc
fst::Fst<fst::StdArc> *ReadGraph(const std::string &filename);
OnlineCtcFstDecoder::OnlineCtcFstDecoder(
const OnlineCtcFstDecoderConfig &config, int32_t blank_id)
: config_(config), fst_(ReadGraph(config.graph)), blank_id_(blank_id) {
options_.max_active = config_.max_active;
}
std::unique_ptr<kaldi_decoder::FasterDecoder>
OnlineCtcFstDecoder::CreateFasterDecoder() const {
return std::make_unique<kaldi_decoder::FasterDecoder>(*fst_, options_);
}
static void DecodeOne(const float *log_probs, int32_t num_rows,
int32_t num_cols, OnlineCtcDecoderResult *result,
OnlineStream *s, int32_t blank_id) {
int32_t &processed_frames = s->GetFasterDecoderProcessedFrames();
kaldi_decoder::DecodableCtc decodable(log_probs, num_rows, num_cols,
processed_frames);
kaldi_decoder::FasterDecoder *decoder = s->GetFasterDecoder();
if (processed_frames == 0) {
decoder->InitDecoding();
}
decoder->AdvanceDecoding(&decodable);
if (decoder->ReachedFinal()) {
fst::VectorFst<fst::LatticeArc> fst_out;
bool ok = decoder->GetBestPath(&fst_out);
if (ok) {
std::vector<int32_t> isymbols_out;
std::vector<int32_t> osymbols_out_unused;
ok = fst::GetLinearSymbolSequence(fst_out, &isymbols_out,
&osymbols_out_unused, nullptr);
std::vector<int64_t> tokens;
tokens.reserve(isymbols_out.size());
std::vector<int32_t> timestamps;
timestamps.reserve(isymbols_out.size());
std::ostringstream os;
int32_t prev_id = -1;
int32_t num_trailing_blanks = 0;
int32_t f = 0; // frame number
for (auto i : isymbols_out) {
i -= 1;
if (i == blank_id) {
num_trailing_blanks += 1;
} else {
num_trailing_blanks = 0;
}
if (i != blank_id && i != prev_id) {
tokens.push_back(i);
timestamps.push_back(f);
}
prev_id = i;
f += 1;
}
result->tokens = std::move(tokens);
result->timestamps = std::move(timestamps);
// no need to set frame_offset
}
}
processed_frames += num_rows;
}
void OnlineCtcFstDecoder::Decode(Ort::Value log_probs,
std::vector<OnlineCtcDecoderResult> *results,
OnlineStream **ss, int32_t n) {
std::vector<int64_t> log_probs_shape =
log_probs.GetTensorTypeAndShapeInfo().GetShape();
if (log_probs_shape[0] != results->size()) {
SHERPA_ONNX_LOGE("Size mismatch! log_probs.size(0) %d, results.size(0): %d",
static_cast<int32_t>(log_probs_shape[0]),
static_cast<int32_t>(results->size()));
exit(-1);
}
if (log_probs_shape[0] != n) {
SHERPA_ONNX_LOGE("Size mismatch! log_probs.size(0) %d, n: %d",
static_cast<int32_t>(log_probs_shape[0]), n);
exit(-1);
}
int32_t batch_size = static_cast<int32_t>(log_probs_shape[0]);
int32_t num_frames = static_cast<int32_t>(log_probs_shape[1]);
int32_t vocab_size = static_cast<int32_t>(log_probs_shape[2]);
const float *p = log_probs.GetTensorData<float>();
for (int32_t i = 0; i != batch_size; ++i) {
DecodeOne(p + i * num_frames * vocab_size, num_frames, vocab_size,
&(*results)[i], ss[i], blank_id_);
}
}
} // namespace sherpa_onnx