Refactor offline recognizer. (#94)

* Refactor offline recognizer. The purpose is to make it easier to support different types of models.

Refactor offline recognizer. (#94)
* Refactor offline recognizer. The purpose is to make it easier to support different types of models.
Fangjun Kuang · GitHub
Commit dffb0fd43cf832793c9e5da2df07fe369cbe8fc0 dffb0fd4 1 parent 55722462
sherpa-onnx/csrc/CMakeLists.txt
sherpa-onnx/csrc/macros.h
sherpa-onnx/csrc/offline-recognizer-impl.cc
sherpa-onnx/csrc/offline-recognizer-impl.h
sherpa-onnx/csrc/offline-recognizer-transducer-impl.h
sherpa-onnx/csrc/offline-recognizer.cc
sherpa-onnx/csrc/offline-recognizer.h
sherpa-onnx/csrc/text-utils.cc
sherpa-onnx/csrc/text-utils.h
--- a/sherpa-onnx/csrc/CMakeLists.txt
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/CMakeLists.txt
查看文件 @dffb0fd
@@ -6,11 +6,12 @@ set(sources
   features.cc
   file-utils.cc
   hypothesis.cc
+  offline-recognizer-impl.cc
+  offline-recognizer.cc
   offline-stream.cc
   offline-transducer-greedy-search-decoder.cc
   offline-transducer-model-config.cc
   offline-transducer-model.cc
-  offline-recognizer.cc
   online-lstm-transducer-model.cc
   online-recognizer.cc
   online-stream.cc
--- a/sherpa-onnx/csrc/macros.h
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/macros.h
查看文件 @dffb0fd
@@ -23,36 +23,55 @@
   } while (0)
 #endif
+// Read a strictly positive integer from the model's custom metadata.
+//
+// The expansion site must provide `meta_data` and `allocator`; both are
+// referenced by name inside the macro. `src_key` is looked up and, when it
+// is missing, an error is logged and the process ends with exit(-1).
+// The value is parsed with atoi() and stored in `dst`. Any result <= 0 is
+// treated as invalid (this includes the 0 that atoi() returns for text that
+// is not a number) and also ends the process with exit(-1).
 #define SHERPA_ONNX_READ_META_DATA(dst, src_key)                        \
   do {                                                                  \
     auto value =                                                        \
         meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \
     if (!value) {                                                       \
-      fprintf(stderr, "%s does not exist in the metadata\n", src_key);  \
+      SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key);   \
       exit(-1);                                                         \
     }                                                                   \
                                                                         \
     dst = atoi(value.get());                                            \
     if (dst <= 0) {                                                     \
-      fprintf(stderr, "Invalid value %d for %s\n", dst, src_key);       \
+      SHERPA_ONNX_LOGE("Invalid value %d for %s", dst, src_key);        \
       exit(-1);                                                         \
     }                                                                   \
   } while (0)
-#define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key)                      \
-  do {                                                                    \
-    auto value =                                                          \
-        meta_data.LookupCustomMetadataMapAllocated(src_key, allocator);   \
-    if (!value) {                                                         \
-      fprintf(stderr, "%s does not exist in the metadata\n", src_key);    \
-      exit(-1);                                                           \
-    }                                                                     \
-                                                                          \
-    bool ret = SplitStringToIntegers(value.get(), ",", true, &dst);       \
-    if (!ret) {                                                           \
-      fprintf(stderr, "Invalid value %s for %s\n", value.get(), src_key); \
-      exit(-1);                                                           \
-    }                                                                     \
+// Read a comma-separated list of integers from the model's custom metadata.
+//
+// The expansion site must provide `meta_data` and `allocator`; both are
+// referenced by name inside the macro. When `src_key` is missing, an error
+// is logged and the process ends with exit(-1).
+// The value is handed to SplitStringToIntegers() with "," as the delimiter
+// and the result is written into `dst`. The third argument is `true`
+// (presumably omit_empty_strings; confirm against text-utils.h). When the
+// helper returns false the raw value is logged and the process ends with
+// exit(-1).
+#define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key)                     \
+  do {                                                                   \
+    auto value =                                                         \
+        meta_data.LookupCustomMetadataMapAllocated(src_key, allocator);  \
+    if (!value) {                                                        \
+      SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key);    \
+      exit(-1);                                                          \
+    }                                                                    \
+                                                                         \
+    bool ret = SplitStringToIntegers(value.get(), ",", true, &dst);      \
+    if (!ret) {                                                          \
+      SHERPA_ONNX_LOGE("Invalid value %s for %s", value.get(), src_key); \
+      exit(-1);                                                          \
+    }                                                                    \
+  } while (0)
+
+// Read a non-empty string from the model's custom metadata.
+//
+// The expansion site must provide `meta_data` and `allocator`; both are
+// referenced by name inside the macro. When `src_key` is missing, an error
+// is logged and the process ends with exit(-1).
+// The value is assigned to `dst`, which must offer empty() (e.g. a
+// std::string). An empty value is logged as invalid and also ends the
+// process with exit(-1).
+// The log messages carry no trailing "\n", matching the other
+// SHERPA_ONNX_READ_META_DATA* macros.
+#define SHERPA_ONNX_READ_META_DATA_STR(dst, src_key)                    \
+  do {                                                                  \
+    auto value =                                                        \
+        meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \
+    if (!value) {                                                       \
+      SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key);   \
+      exit(-1);                                                         \
+    }                                                                   \
+                                                                        \
+    dst = value.get();                                                  \
+    if (dst.empty()) {                                                  \
+      SHERPA_ONNX_LOGE("Invalid value for %s", src_key);                \
+      exit(-1);                                                         \
+    }                                                                   \
   } while (0)
 #endif  // SHERPA_ONNX_CSRC_MACROS_H_
--- a/sherpa-onnx/csrc/offline-recognizer-impl.cc 0 → 100644
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc 0 → 100644
查看文件 @dffb0fd
+// sherpa-onnx/csrc/offline-recognizer-impl.cc
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/csrc/offline-recognizer-impl.h"
+
+#include <string>
+
+#include "onnxruntime_cxx_api.h"  // NOLINT
+#include "sherpa-onnx/csrc/macros.h"
+#include "sherpa-onnx/csrc/offline-recognizer-transducer-impl.h"
+#include "sherpa-onnx/csrc/onnx-utils.h"
+#include "sherpa-onnx/csrc/text-utils.h"
+
+namespace sherpa_onnx {
+
+// Factory that picks the concrete recognizer from the encoder's metadata.
+//
+// Reads the file named by config.model_config.encoder_filename, opens it in
+// a local ONNX Runtime session and looks up the "model_type" entry of the
+// model's custom metadata.
+//
+// @param config  Recognizer configuration; forwarded unchanged to the
+//                selected implementation.
+// @return An OfflineRecognizerTransducerImpl when model_type is
+//         "conformer".
+//
+// A missing or empty "model_type", or any other value, is logged and the
+// process ends with exit(-1); no null pointer is ever returned.
+std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
+    const OfflineRecognizerConfig &config) {
+  Ort::Env env;
+
+  Ort::SessionOptions sess_opts;
+  auto buf = ReadFile(config.model_config.encoder_filename);
+
+  // This session exists only so that the metadata can be inspected; it is
+  // destroyed when the function returns.
+  auto encoder_sess =
+      std::make_unique<Ort::Session>(env, buf.data(), buf.size(), sess_opts);
+
+  Ort::ModelMetadata meta_data = encoder_sess->GetModelMetadata();
+
+  Ort::AllocatorWithDefaultOptions allocator;  // used in the macro below
+
+  std::string model_type;
+  SHERPA_ONNX_READ_META_DATA_STR(model_type, "model_type");
+
+  if (model_type == "conformer") {
+    return std::make_unique<OfflineRecognizerTransducerImpl>(config);
+  }
+
+  // No trailing "\n" here, consistent with the other SHERPA_ONNX_LOGE calls.
+  SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.c_str());
+
+  exit(-1);
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/offline-recognizer-impl.h 0 → 100644
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/offline-recognizer-impl.h 0 → 100644
查看文件 @dffb0fd
+// sherpa-onnx/csrc/offline-recognizer-impl.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_IMPL_H_
+#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_IMPL_H_
+
+#include <memory>
+
+#include "sherpa-onnx/csrc/offline-recognizer.h"
+#include "sherpa-onnx/csrc/offline-stream.h"
+
+namespace sherpa_onnx {
+
+// Abstract base for the implementation object held by OfflineRecognizer.
+// Concrete subclasses are obtained through Create().
+class OfflineRecognizerImpl {
+ public:
+  // Virtual so that subclasses are destroyed correctly through a
+  // std::unique_ptr<OfflineRecognizerImpl>.
+  virtual ~OfflineRecognizerImpl() = default;
+
+  // Builds the implementation that matches `config`.
+  static std::unique_ptr<OfflineRecognizerImpl> Create(
+      const OfflineRecognizerConfig &config);
+
+  // Returns a newly created stream owned by the caller.
+  virtual std::unique_ptr<OfflineStream> CreateStream() const = 0;
+
+  // Decodes the `n` streams referenced by the pointer array `ss`.
+  virtual void DecodeStreams(OfflineStream **ss, int32_t n) const = 0;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_IMPL_H_
--- a/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h 0 → 100644
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h 0 → 100644
查看文件 @dffb0fd
+// sherpa-onnx/csrc/offline-recognizer-transducer-impl.h
+//
+// Copyright (c)  2022  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
+#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
+
+#include <array>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "sherpa-onnx/csrc/macros.h"
+#include "sherpa-onnx/csrc/offline-recognizer-impl.h"
+#include "sherpa-onnx/csrc/offline-recognizer.h"
+#include "sherpa-onnx/csrc/offline-transducer-decoder.h"
+#include "sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.h"
+#include "sherpa-onnx/csrc/offline-transducer-model.h"
+#include "sherpa-onnx/csrc/pad-sequence.h"
+#include "sherpa-onnx/csrc/symbol-table.h"
+
+namespace sherpa_onnx {
+
+// Turns a raw transducer decoding result into an OfflineRecognitionResult.
+//
+// Each token id in `src.tokens` is mapped through `sym_table`; the symbols
+// are stored one by one in `tokens` and concatenated, in order, into `text`.
+// Each entry of `src.timestamps` is multiplied by
+// frame_shift_ms / 1000 * subsampling_factor and stored in `timestamps`.
+static OfflineRecognitionResult Convert(
+    const OfflineTransducerDecoderResult &src, const SymbolTable &sym_table,
+    int32_t frame_shift_ms, int32_t subsampling_factor) {
+  OfflineRecognitionResult result;
+  result.tokens.reserve(src.tokens.size());
+  result.timestamps.reserve(src.timestamps.size());
+
+  // Symbols: keep every piece and build the joined text alongside.
+  std::string joined;
+  for (auto token_id : src.tokens) {
+    auto piece = sym_table[token_id];
+    joined.append(piece);
+    result.tokens.push_back(std::move(piece));
+  }
+  result.text = std::move(joined);
+
+  // Timestamps: scale each stored index by the per-index duration.
+  float seconds_per_frame = frame_shift_ms / 1000. * subsampling_factor;
+  for (auto frame_index : src.timestamps) {
+    float seconds = seconds_per_frame * frame_index;
+    result.timestamps.push_back(seconds);
+  }
+
+  return result;
+}
+
+// Offline recognizer implementation for transducer models.
+//
+// Holds a copy of the configuration, the symbol table, the transducer model
+// and the decoder selected by config.decoding_method.
+class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl {
+ public:
+  // Builds the symbol table from config.model_config.tokens, loads the model
+  // from config.model_config and creates the decoder.
+  //
+  // Only "greedy_search" is accepted. "modified_beam_search" and every other
+  // value are logged and end the process with exit(-1).
+  explicit OfflineRecognizerTransducerImpl(
+      const OfflineRecognizerConfig &config)
+      : config_(config),
+        symbol_table_(config_.model_config.tokens),
+        model_(std::make_unique<OfflineTransducerModel>(config_.model_config)) {
+    if (config_.decoding_method == "greedy_search") {
+      decoder_ =
+          std::make_unique<OfflineTransducerGreedySearchDecoder>(model_.get());
+    } else if (config_.decoding_method == "modified_beam_search") {
+      SHERPA_ONNX_LOGE("TODO: modified_beam_search is to be implemented");
+      exit(-1);
+    } else {
+      SHERPA_ONNX_LOGE("Unsupported decoding method: %s",
+                       config_.decoding_method.c_str());
+      exit(-1);
+    }
+  }
+
+  // Returns a new stream configured with config.feat_config.
+  std::unique_ptr<OfflineStream> CreateStream() const override {
+    return std::make_unique<OfflineStream>(config_.feat_config);
+  }
+
+  // Decodes `n` streams as one batch and stores each result on its stream
+  // through SetResult().
+  //
+  // @param ss  Array of `n` stream pointers.
+  // @param n   Number of streams. The call is a no-op when `ss` is null or
+  //            `n` is not positive.
+  void DecodeStreams(OfflineStream **ss, int32_t n) const override {
+    // ss[0] is read below, so an empty batch must be rejected first.
+    if (ss == nullptr || n <= 0) {
+      return;
+    }
+
+    auto memory_info =
+        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
+
+    // NOTE(review): the feature dimension is taken from the first stream
+    // only; this assumes every stream in the batch shares it - confirm.
+    int32_t feat_dim = ss[0]->FeatureDim();
+
+    std::vector<Ort::Value> features;
+
+    // Reserved up front so that the element addresses collected in
+    // features_pointer below are not invalidated by push_back.
+    features.reserve(n);
+
+    // features_vec owns the frame buffers; each tensor below is created over
+    // features_vec[i].data(), so the buffers must outlive the tensors.
+    std::vector<std::vector<float>> features_vec(n);
+    std::vector<int64_t> features_length_vec(n);
+    for (int32_t i = 0; i != n; ++i) {
+      auto f = ss[i]->GetFrames();
+      int32_t num_frames = f.size() / feat_dim;
+
+      features_length_vec[i] = num_frames;
+      features_vec[i] = std::move(f);
+
+      std::array<int64_t, 2> shape = {num_frames, feat_dim};
+
+      Ort::Value x = Ort::Value::CreateTensor(
+          memory_info, features_vec[i].data(), features_vec[i].size(),
+          shape.data(), shape.size());
+      features.push_back(std::move(x));
+    }
+
+    std::vector<const Ort::Value *> features_pointer(n);
+    for (int32_t i = 0; i != n; ++i) {
+      features_pointer[i] = &features[i];
+    }
+
+    std::array<int64_t, 1> features_length_shape = {n};
+    Ort::Value x_length = Ort::Value::CreateTensor(
+        memory_info, features_length_vec.data(), n,
+        features_length_shape.data(), features_length_shape.size());
+
+    // The padding value is numerically log(1e-10); presumably the floor of
+    // the log-domain features - TODO confirm.
+    Ort::Value x = PadSequence(model_->Allocator(), features_pointer,
+                               -23.025850929940457f);
+
+    auto t = model_->RunEncoder(std::move(x), std::move(x_length));
+    auto results = decoder_->Decode(std::move(t.first), std::move(t.second));
+
+    // NOTE(review): the frame shift is hard-coded to 10 ms here rather than
+    // read from config_.feat_config - confirm they always agree.
+    int32_t frame_shift_ms = 10;
+    for (int32_t i = 0; i != n; ++i) {
+      auto r = Convert(results[i], symbol_table_, frame_shift_ms,
+                       model_->SubsamplingFactor());
+
+      ss[i]->SetResult(r);
+    }
+  }
+
+ private:
+  // Declaration order matters: the constructor initializes symbol_table_ and
+  // model_ from config_, and decoder_ keeps a raw pointer to *model_.
+  OfflineRecognizerConfig config_;
+  SymbolTable symbol_table_;
+  std::unique_ptr<OfflineTransducerModel> model_;
+  std::unique_ptr<OfflineTransducerDecoder> decoder_;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
--- a/sherpa-onnx/csrc/offline-recognizer.cc
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/offline-recognizer.cc
查看文件 @dffb0fd
@@ -5,42 +5,11 @@
 #include "sherpa-onnx/csrc/offline-recognizer.h"
 #include <memory>
-#include <utility>
-#include "sherpa-onnx/csrc/macros.h"
-#include "sherpa-onnx/csrc/offline-transducer-decoder.h"
-#include "sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.h"
-#include "sherpa-onnx/csrc/offline-transducer-model.h"
-#include "sherpa-onnx/csrc/pad-sequence.h"
-#include "sherpa-onnx/csrc/symbol-table.h"
+#include "sherpa-onnx/csrc/offline-recognizer-impl.h"
 namespace sherpa_onnx {
-static OfflineRecognitionResult Convert(
-    const OfflineTransducerDecoderResult &src, const SymbolTable &sym_table,
-    int32_t frame_shift_ms, int32_t subsampling_factor) {
-  OfflineRecognitionResult r;
-  r.tokens.reserve(src.tokens.size());
-  r.timestamps.reserve(src.timestamps.size());
-
-  std::string text;
-  for (auto i : src.tokens) {
-    auto sym = sym_table[i];
-    text.append(sym);
-
-    r.tokens.push_back(std::move(sym));
-  }
-  r.text = std::move(text);
-
-  float frame_shift_s = frame_shift_ms / 1000. * subsampling_factor;
-  for (auto t : src.timestamps) {
-    float time = frame_shift_s * t;
-    r.timestamps.push_back(time);
-  }
-
-  return r;
-}
-
 void OfflineRecognizerConfig::Register(ParseOptions *po) {
   feat_config.Register(po);
   model_config.Register(po);
@@ -65,90 +34,8 @@ std::string OfflineRecognizerConfig::ToString() const {
   return os.str();
 }
-class OfflineRecognizer::Impl {
- public:
-  explicit Impl(const OfflineRecognizerConfig &config)
-      : config_(config),
-        symbol_table_(config_.model_config.tokens),
-        model_(std::make_unique<OfflineTransducerModel>(config_.model_config)) {
-    if (config_.decoding_method == "greedy_search") {
-      decoder_ =
-          std::make_unique<OfflineTransducerGreedySearchDecoder>(model_.get());
-    } else if (config_.decoding_method == "modified_beam_search") {
-      SHERPA_ONNX_LOGE("TODO: modified_beam_search is to be implemented");
-      exit(-1);
-    } else {
-      SHERPA_ONNX_LOGE("Unsupported decoding method: %s",
-                       config_.decoding_method.c_str());
-      exit(-1);
-    }
-  }
-
-  std::unique_ptr<OfflineStream> CreateStream() const {
-    return std::make_unique<OfflineStream>(config_.feat_config);
-  }
-
-  void DecodeStreams(OfflineStream **ss, int32_t n) const {
-    auto memory_info =
-        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
-
-    int32_t feat_dim = ss[0]->FeatureDim();
-
-    std::vector<Ort::Value> features;
-
-    features.reserve(n);
-
-    std::vector<std::vector<float>> features_vec(n);
-    std::vector<int64_t> features_length_vec(n);
-    for (int32_t i = 0; i != n; ++i) {
-      auto f = ss[i]->GetFrames();
-      int32_t num_frames = f.size() / feat_dim;
-
-      features_length_vec[i] = num_frames;
-      features_vec[i] = std::move(f);
-
-      std::array<int64_t, 2> shape = {num_frames, feat_dim};
-
-      Ort::Value x = Ort::Value::CreateTensor(
-          memory_info, features_vec[i].data(), features_vec[i].size(),
-          shape.data(), shape.size());
-      features.push_back(std::move(x));
-    }
-
-    std::vector<const Ort::Value *> features_pointer(n);
-    for (int32_t i = 0; i != n; ++i) {
-      features_pointer[i] = &features[i];
-    }
-
-    std::array<int64_t, 1> features_length_shape = {n};
-    Ort::Value x_length = Ort::Value::CreateTensor(
-        memory_info, features_length_vec.data(), n,
-        features_length_shape.data(), features_length_shape.size());
-
-    Ort::Value x = PadSequence(model_->Allocator(), features_pointer,
-                               -23.025850929940457f);
-
-    auto t = model_->RunEncoder(std::move(x), std::move(x_length));
-    auto results = decoder_->Decode(std::move(t.first), std::move(t.second));
-
-    int32_t frame_shift_ms = 10;
-    for (int32_t i = 0; i != n; ++i) {
-      auto r = Convert(results[i], symbol_table_, frame_shift_ms,
-                       model_->SubsamplingFactor());
-
-      ss[i]->SetResult(r);
-    }
-  }
-
- private:
-  OfflineRecognizerConfig config_;
-  SymbolTable symbol_table_;
-  std::unique_ptr<OfflineTransducerModel> model_;
-  std::unique_ptr<OfflineTransducerDecoder> decoder_;
-};
-
 OfflineRecognizer::OfflineRecognizer(const OfflineRecognizerConfig &config)
-    : impl_(std::make_unique<Impl>(config)) {}
+    : impl_(OfflineRecognizerImpl::Create(config)) {}
 OfflineRecognizer::~OfflineRecognizer() = default;
--- a/sherpa-onnx/csrc/offline-recognizer.h
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/offline-recognizer.h
查看文件 @dffb0fd
@@ -52,6 +52,8 @@ struct OfflineRecognizerConfig {
   std::string ToString() const;
 };
+class OfflineRecognizerImpl;
+
 class OfflineRecognizer {
  public:
   ~OfflineRecognizer();
@@ -78,8 +80,7 @@ class OfflineRecognizer {
   void DecodeStreams(OfflineStream **ss, int32_t n) const;
  private:
-  class Impl;
-  std::unique_ptr<Impl> impl_;
+  std::unique_ptr<OfflineRecognizerImpl> impl_;
 };
 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/text-utils.cc
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/text-utils.cc
查看文件 @dffb0fd
@@ -5,6 +5,8 @@
 #include "sherpa-onnx/csrc/text-utils.h"
+#include <assert.h>
+
 #include <string>
 #include <vector>
@@ -27,4 +29,31 @@ void SplitStringToVector(const std::string &full, const char *delim,
   }
 }
+// Splits `full` with SplitStringToVector() and parses every field as a
+// floating-point number.
+//
+// @param full                The text to split.
+// @param delim               Delimiter(s), forwarded to SplitStringToVector.
+// @param omit_empty_strings  Forwarded to SplitStringToVector.
+// @param out                 Receives the parsed values; must not be null.
+// @return true when every field is a valid number. An empty `full` yields
+//         an empty `*out` and true. false when any field is empty or is not
+//         entirely a number (optional surrounding whitespace is allowed);
+//         in that case `*out` is left untouched.
+template <class F>
+bool SplitStringToFloats(const std::string &full, const char *delim,
+                         bool omit_empty_strings,  // typically false
+                         std::vector<F> *out) {
+  assert(out != nullptr);
+  if (*(full.c_str()) == '\0') {
+    out->clear();
+    return true;
+  }
+  std::vector<std::string> split;
+  SplitStringToVector(full, delim, omit_empty_strings, &split);
+
+  // Parse into a scratch vector so that *out is only replaced on success.
+  std::vector<F> parsed;
+  parsed.reserve(split.size());
+  for (const auto &field : split) {
+    const char *begin = field.c_str();
+    char *end = nullptr;
+    // strtod() reports how much it consumed, which atof() cannot, so bad
+    // input is detected instead of silently becoming 0.
+    const double value = strtod(begin, &end);
+    if (end == begin) {
+      return false;  // empty or non-numeric field
+    }
+    while (*end == ' ' || *end == '\t' || *end == '\n' || *end == '\r') {
+      ++end;
+    }
+    if (*end != '\0') {
+      return false;  // trailing garbage such as "1.5abc"
+    }
+    parsed.push_back(static_cast<F>(value));
+  }
+  out->swap(parsed);
+  return true;
+}
+
+// Explicit instantiations. The template is defined in this .cc file, so the
+// two supported element types are emitted here for other translation units
+// to link against.
+template bool SplitStringToFloats<float>(const std::string &full,
+                                         const char *delim,
+                                         bool omit_empty_strings,
+                                         std::vector<float> *out);
+template bool SplitStringToFloats<double>(const std::string &full,
+                                          const char *delim,
+                                          bool omit_empty_strings,
+                                          std::vector<double> *out);
+
 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/text-utils.h
查看文件 @dffb0fd
+++ b/sherpa-onnx/csrc/text-utils.h
查看文件 @dffb0fd
@@ -80,6 +80,12 @@ bool SplitStringToIntegers(const std::string &full, const char *delim,
   return true;
 }
+// Splits `full` using `delim` and stores the fields, converted to
+// floating-point values, in `*out`.
+//
+// Declaration only: the definition lives in text-utils.cc, which explicitly
+// instantiates it for F = float and F = double. Using any other F compiles
+// against this header but fails at link time.
+template <class F>
+bool SplitStringToFloats(const std::string &full, const char *delim,
+                         bool omit_empty_strings,  // typically false
+                         std::vector<F> *out);
+
 }  // namespace sherpa_onnx
 #endif  // SHERPA_ONNX_CSRC_TEXT_UTILS_H_