Fangjun Kuang
Committed by GitHub

Add online stream. (#28)

... ... @@ -3,6 +3,7 @@ include_directories(${CMAKE_SOURCE_DIR})
add_executable(sherpa-onnx
features.cc
online-lstm-transducer-model.cc
online-stream.cc
online-transducer-greedy-search-decoder.cc
online-transducer-model-config.cc
online-transducer-model.cc
... ...
... ... @@ -11,16 +11,12 @@
namespace sherpa_onnx {
struct FeatureExtractorConfig {
int32_t sampling_rate = 16000;
float sampling_rate = 16000;
int32_t feature_dim = 80;
};
class FeatureExtractor {
public:
/**
* @param sampling_rate Sampling rate of the data used to train the model.
* @param feature_dim Dimension of the features used to train the model.
*/
explicit FeatureExtractor(const FeatureExtractorConfig &config = {});
~FeatureExtractor();
... ... @@ -32,16 +28,19 @@ class FeatureExtractor {
*/
void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
// InputFinished() tells the class you won't be providing any
// more waveform. This will help flush out the last frame or two
// of features, in the case where snip-edges == false; it also
// affects the return value of IsLastFrame().
/**
* InputFinished() tells the class you won't be providing any
* more waveform. This will help flush out the last frame or two
* of features, in the case where snip-edges == false; it also
* affects the return value of IsLastFrame().
*/
void InputFinished();
int32_t NumFramesReady() const;
// Note: IsLastFrame() will only ever return true if you have called
// InputFinished() (and this frame is the last frame).
/** Note: IsLastFrame() will only ever return true if you have called
* InputFinished() (and this frame is the last frame).
*/
bool IsLastFrame(int32_t frame) const;
/** Get n frames starting from the given frame index.
... ...
// sherpa-onnx/csrc/online-stream.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/online-stream.h"
#include <memory>
#include <vector>
#include "sherpa-onnx/csrc/features.h"
namespace sherpa_onnx {

// Pimpl backing class for OnlineStream. It bundles everything that belongs
// to a single decoding stream: the feature extractor, the count of frames
// the decoder has consumed so far, and the current decoding result.
// Every OnlineStream method simply forwards here.
class OnlineStream::Impl {
 public:
  explicit Impl(const FeatureExtractorConfig &config) : extractor_(config) {}

  void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n) {
    extractor_.AcceptWaveform(sampling_rate, waveform, n);
  }

  void InputFinished() { extractor_.InputFinished(); }

  void Reset() { extractor_.Reset(); }

  int32_t NumFramesReady() const { return extractor_.NumFramesReady(); }

  bool IsLastFrame(int32_t frame) const {
    return extractor_.IsLastFrame(frame);
  }

  std::vector<float> GetFrames(int32_t frame_index, int32_t n) const {
    return extractor_.GetFrames(frame_index, n);
  }

  int32_t FeatureDim() const { return extractor_.FeatureDim(); }

  int32_t &GetNumProcessedFrames() { return num_processed_frames_; }

  void SetResult(const OnlineTransducerDecoderResult &r) { result_ = r; }

  const OnlineTransducerDecoderResult &GetResult() const { return result_; }

 private:
  FeatureExtractor extractor_;
  int32_t num_processed_frames_ = 0;  // counted before subsampling
  OnlineTransducerDecoderResult result_;
};

OnlineStream::OnlineStream(const FeatureExtractorConfig &config /*= {}*/)
    : impl_(std::make_unique<Impl>(config)) {}

// Defaulted here (not in the header) so that Impl is a complete type when
// std::unique_ptr<Impl>'s deleter is instantiated.
OnlineStream::~OnlineStream() = default;

void OnlineStream::AcceptWaveform(float sampling_rate, const float *waveform,
                                  int32_t n) {
  impl_->AcceptWaveform(sampling_rate, waveform, n);
}

void OnlineStream::InputFinished() { impl_->InputFinished(); }

void OnlineStream::Reset() { impl_->Reset(); }

int32_t OnlineStream::NumFramesReady() const { return impl_->NumFramesReady(); }

bool OnlineStream::IsLastFrame(int32_t frame) const {
  return impl_->IsLastFrame(frame);
}

std::vector<float> OnlineStream::GetFrames(int32_t frame_index,
                                           int32_t n) const {
  return impl_->GetFrames(frame_index, n);
}

int32_t OnlineStream::FeatureDim() const { return impl_->FeatureDim(); }

int32_t &OnlineStream::GetNumProcessedFrames() {
  return impl_->GetNumProcessedFrames();
}

void OnlineStream::SetResult(const OnlineTransducerDecoderResult &r) {
  impl_->SetResult(r);
}

const OnlineTransducerDecoderResult &OnlineStream::GetResult() const {
  return impl_->GetResult();
}

}  // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/online-stream.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
#define SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
#include <memory>
#include <vector>
#include "sherpa-onnx/csrc/features.h"
#include "sherpa-onnx/csrc/online-transducer-decoder.h"
namespace sherpa_onnx {
// OnlineStream represents one audio stream being decoded. It wraps a
// feature extractor together with per-stream decoding state (number of
// processed frames and the current decoder result) behind a pimpl.
class OnlineStream {
 public:
  /** Construct a stream whose internal feature extractor uses `config`. */
  explicit OnlineStream(const FeatureExtractorConfig &config = {});
  ~OnlineStream();

  /**
     @param sampling_rate The sampling_rate of the input waveform. Should match
                          the one expected by the feature extractor.
     @param waveform Pointer to a 1-D array of size n
     @param n Number of entries in waveform
   */
  void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);

  /**
   * InputFinished() tells the class you won't be providing any
   * more waveform. This will help flush out the last frame or two
   * of features, in the case where snip-edges == false; it also
   * affects the return value of IsLastFrame().
   */
  void InputFinished();

  /// Return the number of feature frames currently available for reading.
  int32_t NumFramesReady() const;

  /** Note: IsLastFrame() will only ever return true if you have called
   * InputFinished() (and this frame is the last frame).
   */
  bool IsLastFrame(int32_t frame) const;

  /** Get n frames starting from the given frame index.
   *
   * @param frame_index  The starting frame index
   * @param n  Number of frames to get.
   * @return Return a 2-D tensor of shape (n, feature_dim).
   *         which is flattened into a 1-D vector (flattened in row major)
   */
  std::vector<float> GetFrames(int32_t frame_index, int32_t n) const;

  /// Reset the stream's feature extractor, discarding buffered features.
  void Reset();

  /// Dimension of each feature frame (e.g., number of fbank bins).
  int32_t FeatureDim() const;

  // Return a reference to the number of processed frames so far.
  // Initially, it is 0. It is always less than NumFramesReady().
  //
  // The returned reference is valid as long as this object is alive.
  int32_t &GetNumProcessedFrames();

  /// Store the current decoding result for this stream (copied in).
  void SetResult(const OnlineTransducerDecoderResult &r);

  /// Get the decoding result most recently stored via SetResult().
  /// The returned reference is valid as long as this object is alive.
  const OnlineTransducerDecoderResult &GetResult() const;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
... ...
... ... @@ -8,8 +8,7 @@
#include <string>
#include <vector>
#include "kaldi-native-fbank/csrc/online-feature.h"
#include "sherpa-onnx/csrc/features.h"
#include "sherpa-onnx/csrc/online-stream.h"
#include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
#include "sherpa-onnx/csrc/online-transducer-model.h"
... ... @@ -64,7 +63,7 @@ for a list of pre-trained models to download.
std::vector<Ort::Value> states = model->GetEncoderInitStates();
int32_t expected_sampling_rate = 16000;
float expected_sampling_rate = 16000;
bool is_ok = false;
std::vector<float> samples =
... ... @@ -75,7 +74,7 @@ for a list of pre-trained models to download.
return -1;
}
float duration = samples.size() / static_cast<float>(expected_sampling_rate);
float duration = samples.size() / expected_sampling_rate;
fprintf(stderr, "wav filename: %s\n", wav_filename.c_str());
fprintf(stderr, "wav duration (s): %.3f\n", duration);
... ... @@ -83,32 +82,33 @@ for a list of pre-trained models to download.
auto begin = std::chrono::steady_clock::now();
fprintf(stderr, "Started\n");
sherpa_onnx::FeatureExtractor feat_extractor;
feat_extractor.AcceptWaveform(expected_sampling_rate, samples.data(),
samples.size());
sherpa_onnx::OnlineStream stream;
stream.AcceptWaveform(expected_sampling_rate, samples.data(), samples.size());
std::vector<float> tail_paddings(
static_cast<int>(0.2 * expected_sampling_rate));
feat_extractor.AcceptWaveform(expected_sampling_rate, tail_paddings.data(),
tail_paddings.size());
feat_extractor.InputFinished();
stream.AcceptWaveform(expected_sampling_rate, tail_paddings.data(),
tail_paddings.size());
stream.InputFinished();
int32_t num_frames = feat_extractor.NumFramesReady();
int32_t feature_dim = feat_extractor.FeatureDim();
int32_t num_frames = stream.NumFramesReady();
int32_t feature_dim = stream.FeatureDim();
std::array<int64_t, 3> x_shape{1, chunk_size, feature_dim};
sherpa_onnx::OnlineTransducerGreedySearchDecoder decoder(model.get());
std::vector<sherpa_onnx::OnlineTransducerDecoderResult> result = {
decoder.GetEmptyResult()};
for (int32_t start = 0; start + chunk_size < num_frames;
start += chunk_shift) {
std::vector<float> features = feat_extractor.GetFrames(start, chunk_size);
while (stream.NumFramesReady() - stream.GetNumProcessedFrames() >
chunk_size) {
std::vector<float> features =
stream.GetFrames(stream.GetNumProcessedFrames(), chunk_size);
stream.GetNumProcessedFrames() += chunk_shift;
Ort::Value x =
Ort::Value::CreateTensor(memory_info, features.data(), features.size(),
x_shape.data(), x_shape.size());
auto pair = model->RunEncoder(std::move(x), states);
states = std::move(pair.second);
decoder.Decode(std::move(pair.first), &result);
... ... @@ -116,8 +116,8 @@ for a list of pre-trained models to download.
decoder.StripLeadingBlanks(&result[0]);
const auto &hyp = result[0].tokens;
std::string text;
for (size_t i = model->ContextSize(); i != hyp.size(); ++i) {
text += sym[hyp[i]];
for (auto t : hyp) {
text += sym[t];
}
fprintf(stderr, "Done!\n");
... ...