// sherpa-onnx/csrc/online-transducer-decoder.h
//
// Copyright (c) 2023 Xiaomi Corporation

#ifndef SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_DECODER_H_
#define SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_DECODER_H_

#include <vector>

#include "onnxruntime_cxx_api.h"  // NOLINT
#include "sherpa-onnx/csrc/hypothesis.h"
#include "sherpa-onnx/csrc/macros.h"

namespace sherpa_onnx {

struct OnlineTransducerDecoderResult {
  /// Number of frames after subsampling we have decoded so far
  int32_t frame_offset = 0;

  /// The decoded token IDs so far
  std::vector<int64_t> tokens;

  /// Number of trailing blank frames decoded so far
  int32_t num_trailing_blanks = 0;

  /// timestamps[i] contains the output frame index where tokens[i] is decoded.
  std::vector<int32_t> timestamps;

  std::vector<float> ys_probs;
  std::vector<float> lm_probs;
  std::vector<float> context_scores;

  // Cache decoder_out for endpointing
  Ort::Value decoder_out;

  // Used only in modified beam search
  Hypotheses hyps;

  OnlineTransducerDecoderResult()
      : tokens{}, num_trailing_blanks(0), decoder_out{nullptr}, hyps{} {}

  OnlineTransducerDecoderResult(const OnlineTransducerDecoderResult &other);

  OnlineTransducerDecoderResult &operator=(
      const OnlineTransducerDecoderResult &other);

  OnlineTransducerDecoderResult(OnlineTransducerDecoderResult &&other) noexcept;

  OnlineTransducerDecoderResult &operator=(
      OnlineTransducerDecoderResult &&other) noexcept;
};
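
// Background: Ort::Value is move-only in the onnxruntime C++ API, so the
// copy constructor/assignment declared above cannot be defaulted; their
// out-of-line definitions presumably clone the cached `decoder_out` tensor.
// A minimal caller-side sketch (hypothetical `results` vector, not part of
// this header):
//
//   OnlineTransducerDecoderResult snapshot = results[0];          // copies
//   OnlineTransducerDecoderResult taken = std::move(results[0]);  // moves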

class OnlineStream;

class OnlineTransducerDecoder {
 public:
  virtual ~OnlineTransducerDecoder() = default;

  /* Return an empty result.
   *
   * To simplify the decoding code, we add `context_size` blanks
   * to the beginning of the decoding result, which will be
   * stripped by calling `StripLeadingBlanks()`.
   */
  virtual OnlineTransducerDecoderResult GetEmptyResult() const = 0;

  /** Strip blanks added by `GetEmptyResult()`.
   *
   * @param r It is changed in-place.
   */
  virtual void StripLeadingBlanks(OnlineTransducerDecoderResult * /*r*/) const {
  }
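
  // Sketch of how a concrete subclass typically pairs the two methods above
  // (hypothetical details such as `decoder` and `blank_id`; each subclass
  // defines the exact behavior):
  //
  //   OnlineTransducerDecoderResult r = decoder->GetEmptyResult();
  //   // r.tokens now holds `context_size` blanks, e.g.
  //   //   std::vector<int64_t>(context_size, blank_id)
  //   // ... decode some frames ...
  //   decoder->StripLeadingBlanks(&r);
  //   // the leading `context_size` blanks are removed before the tokens are
  //   // reported to the caller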

  /** Run transducer beam search given the output from the encoder model.
   *
   * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim)
   * @param result It is modified in-place.
   *
   * @note There is no need to pass encoder_out_length here since for the
   *       online decoding case, each utterance has the same number of frames
   *       and there are no paddings.
   */
  virtual void Decode(Ort::Value encoder_out,
                      std::vector<OnlineTransducerDecoderResult> *result) = 0;
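
  // Hypothetical caller-side sketch (names such as N, T, joiner_dim, data and
  // decoder are assumptions, not part of this header). It builds a CPU tensor
  // of shape (N, T, joiner_dim) and decodes a batch of N streams:
  //
  //   auto memory_info =
  //       Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  //   std::array<int64_t, 3> shape{N, T, joiner_dim};
  //   Ort::Value encoder_out = Ort::Value::CreateTensor<float>(
  //       memory_info, data.data(), data.size(), shape.data(), shape.size());
  //
  //   std::vector<OnlineTransducerDecoderResult> results(N);
  //   for (auto &r : results) r = decoder->GetEmptyResult();
  //   decoder->Decode(std::move(encoder_out), &results);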

  /** Run transducer beam search given the output from the encoder model.
   *
   * Note: Currently this interface is for the contextual-biasing feature,
   * which needs a ContextGraph owned by the OnlineStream.
   *
   * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim)
   * @param ss A list of OnlineStreams.
   * @param result It is modified in-place.
   *
   * @note There is no need to pass encoder_out_length here since for the
   *       online decoding case, each utterance has the same number of frames
   *       and there are no paddings.
   */
  virtual void Decode(Ort::Value /*encoder_out*/, OnlineStream ** /*ss*/,
                      std::vector<OnlineTransducerDecoderResult> * /*result*/) {
    SHERPA_ONNX_LOGE(
        "This interface is for OnlineTransducerModifiedBeamSearchDecoder.");
    exit(-1);
  }
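
  // Hypothetical caller-side sketch for the contextual-biasing overload
  // (`streams` is an assumed std::vector<OnlineStream *> of size N, one
  // stream per utterance in `encoder_out`; `results` as in the sketch above):
  //
  //   decoder->Decode(std::move(encoder_out), streams.data(), &results);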

  // used for endpointing. We need to keep decoder_out after reset
  virtual void UpdateDecoderOut(OnlineTransducerDecoderResult * /*result*/) {}
};

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_DECODER_H_