Reduce model initialization time for offline speech recognition (#213)

Fangjun Kuang · GitHub
Commit f3206c49dc8d4d8be3ff75530c96fa3a99195c06 f3206c49 1 parent 0abd7ce8
mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
scripts/dotnet/offline.cs
sherpa-onnx/c-api/c-api.cc
sherpa-onnx/c-api/c-api.h
sherpa-onnx/csrc/offline-model-config.cc
sherpa-onnx/csrc/offline-model-config.h
sherpa-onnx/csrc/offline-recognizer-impl.cc
sherpa-onnx/csrc/offline-transducer-model-config.cc
sherpa-onnx/python/csrc/offline-model-config.cc
sherpa-onnx/python/sherpa_onnx/offline_recognizer.py
--- a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
查看文件 @f3206c4
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
查看文件 @f3206c4
@@ -387,6 +387,7 @@ void CNonStreamingSpeechRecognitionDlg::InitParaformer() {
   config_.model_config.tokens = tokens.c_str();
   config_.model_config.num_threads = 1;
   config_.model_config.debug = 1;
+   config_.model_config.model_type = "paraformer";
 
   config_.decoding_method = "greedy_search";
   config_.max_active_paths = 4;
@@ -447,6 +448,7 @@ void CNonStreamingSpeechRecognitionDlg::InitRecognizer() {
   config_.model_config.tokens = tokens.c_str();
   config_.model_config.num_threads = 1;
   config_.model_config.debug = 0;
+   config_.model_config.model_type = "transducer";
 
   config_.decoding_method = "greedy_search";
   config_.max_active_paths = 4;
--- a/scripts/dotnet/offline.cs
查看文件 @f3206c4
+++ b/scripts/dotnet/offline.cs
查看文件 @f3206c4
@@ -76,6 +76,8 @@ namespace SherpaOnnx
       Tokens = "";
       NumThreads = 1;
       Debug = 0;
+       Provider = "cpu";
+       ModelType = "";
     }
     public OfflineTransducerModelConfig Transducer;
     public OfflineParaformerModelConfig Paraformer;
@@ -87,6 +89,12 @@ namespace SherpaOnnx
     public int NumThreads;
 
     public int Debug;
+ 
+     [MarshalAs(UnmanagedType.LPStr)]
+     public string Provider;
+ 
+     [MarshalAs(UnmanagedType.LPStr)]
+     public string ModelType;
   }
 
   [StructLayout(LayoutKind.Sequential)]
--- a/sherpa-onnx/c-api/c-api.cc
查看文件 @f3206c4
+++ b/sherpa-onnx/c-api/c-api.cc
查看文件 @f3206c4
@@ -33,23 +33,33 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
     const SherpaOnnxOnlineRecognizerConfig *config) {
   sherpa_onnx::OnlineRecognizerConfig recognizer_config;
 
-   recognizer_config.feat_config.sampling_rate = SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000);
-   recognizer_config.feat_config.feature_dim = SHERPA_ONNX_OR(config->feat_config.feature_dim, 80);
+   recognizer_config.feat_config.sampling_rate =
+       SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000);
+   recognizer_config.feat_config.feature_dim =
+       SHERPA_ONNX_OR(config->feat_config.feature_dim, 80);
 
   recognizer_config.model_config.encoder_filename =
       SHERPA_ONNX_OR(config->model_config.encoder, "");
   recognizer_config.model_config.decoder_filename =
       SHERPA_ONNX_OR(config->model_config.decoder, "");
-   recognizer_config.model_config.joiner_filename = SHERPA_ONNX_OR(config->model_config.joiner, "");
-   recognizer_config.model_config.tokens = SHERPA_ONNX_OR(config->model_config.tokens, "");
-   recognizer_config.model_config.num_threads = SHERPA_ONNX_OR(config->model_config.num_threads, 1);
-   recognizer_config.model_config.provider = SHERPA_ONNX_OR(config->model_config.provider, "cpu");
-   recognizer_config.model_config.debug = SHERPA_ONNX_OR(config->model_config.debug, 0);
- 
-   recognizer_config.decoding_method = SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
-   recognizer_config.max_active_paths = SHERPA_ONNX_OR(config->max_active_paths, 4);
- 
-   recognizer_config.enable_endpoint = SHERPA_ONNX_OR(config->enable_endpoint, 0);
+   recognizer_config.model_config.joiner_filename =
+       SHERPA_ONNX_OR(config->model_config.joiner, "");
+   recognizer_config.model_config.tokens =
+       SHERPA_ONNX_OR(config->model_config.tokens, "");
+   recognizer_config.model_config.num_threads =
+       SHERPA_ONNX_OR(config->model_config.num_threads, 1);
+   recognizer_config.model_config.provider =
+       SHERPA_ONNX_OR(config->model_config.provider, "cpu");
+   recognizer_config.model_config.debug =
+       SHERPA_ONNX_OR(config->model_config.debug, 0);
+ 
+   recognizer_config.decoding_method =
+       SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
+   recognizer_config.max_active_paths =
+       SHERPA_ONNX_OR(config->max_active_paths, 4);
+ 
+   recognizer_config.enable_endpoint =
+       SHERPA_ONNX_OR(config->enable_endpoint, 0);
 
   recognizer_config.endpoint_config.rule1.min_trailing_silence =
       SHERPA_ONNX_OR(config->rule1_min_trailing_silence, 2.4);
@@ -173,9 +183,11 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
     const SherpaOnnxOfflineRecognizerConfig *config) {
   sherpa_onnx::OfflineRecognizerConfig recognizer_config;
 
-   recognizer_config.feat_config.sampling_rate = SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000);
+   recognizer_config.feat_config.sampling_rate =
+       SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000);
 
-   recognizer_config.feat_config.feature_dim = SHERPA_ONNX_OR(config->feat_config.feature_dim, 80);
+   recognizer_config.feat_config.feature_dim =
+       SHERPA_ONNX_OR(config->feat_config.feature_dim, 80);
 
   recognizer_config.model_config.transducer.encoder_filename =
       SHERPA_ONNX_OR(config->model_config.transducer.encoder, "");
@@ -184,7 +196,7 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
       SHERPA_ONNX_OR(config->model_config.transducer.decoder, "");
 
   recognizer_config.model_config.transducer.joiner_filename =
-       SHERPA_ONNX_OR(config->model_config.transducer.joiner,"");
+       SHERPA_ONNX_OR(config->model_config.transducer.joiner, "");
 
   recognizer_config.model_config.paraformer.model =
       SHERPA_ONNX_OR(config->model_config.paraformer.model, "");
@@ -192,15 +204,26 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
   recognizer_config.model_config.nemo_ctc.model =
       SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, "");
 
-   recognizer_config.model_config.tokens = SHERPA_ONNX_OR(config->model_config.tokens, "");
-   recognizer_config.model_config.num_threads = SHERPA_ONNX_OR(config->model_config.num_threads, 1);
-   recognizer_config.model_config.debug = SHERPA_ONNX_OR(config->model_config.debug, 0);
- 
-   recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, "");
-   recognizer_config.lm_config.scale = SHERPA_ONNX_OR(config->lm_config.scale, 1.0);
- 
-   recognizer_config.decoding_method = SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
-   recognizer_config.max_active_paths = SHERPA_ONNX_OR(config->max_active_paths, 4);
+   recognizer_config.model_config.tokens =
+       SHERPA_ONNX_OR(config->model_config.tokens, "");
+   recognizer_config.model_config.num_threads =
+       SHERPA_ONNX_OR(config->model_config.num_threads, 1);
+   recognizer_config.model_config.debug =
+       SHERPA_ONNX_OR(config->model_config.debug, 0);
+   recognizer_config.model_config.provider =
+       SHERPA_ONNX_OR(config->model_config.provider, "cpu");
+   recognizer_config.model_config.model_type =
+       SHERPA_ONNX_OR(config->model_config.model_type, "");
+ 
+   recognizer_config.lm_config.model =
+       SHERPA_ONNX_OR(config->lm_config.model, "");
+   recognizer_config.lm_config.scale =
+       SHERPA_ONNX_OR(config->lm_config.scale, 1.0);
+ 
+   recognizer_config.decoding_method =
+       SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
+   recognizer_config.max_active_paths =
+       SHERPA_ONNX_OR(config->max_active_paths, 4);
 
   if (config->model_config.debug) {
     fprintf(stderr, "%s\n", recognizer_config.ToString().c_str());
--- a/sherpa-onnx/c-api/c-api.h
查看文件 @f3206c4
+++ b/sherpa-onnx/c-api/c-api.h
查看文件 @f3206c4
@@ -272,6 +272,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
   const char *tokens;
   int32_t num_threads;
   int32_t debug;
+   const char *provider;
+   const char *model_type;
 } SherpaOnnxOfflineModelConfig;
 
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
--- a/sherpa-onnx/csrc/offline-model-config.cc
查看文件 @f3206c4
+++ b/sherpa-onnx/csrc/offline-model-config.cc
查看文件 @f3206c4
@@ -25,6 +25,11 @@ void OfflineModelConfig::Register(ParseOptions *po) {
 
   po->Register("provider", &provider,
                "Specify a provider to use: cpu, cuda, coreml");
+ 
+   po->Register("model-type", &model_type,
+                "Specify it to reduce model initialization time. "
+                "Valid values are: transducer, paraformer, nemo_ctc. "
+                "All other values lead to loading the model twice.");
 }
 
 bool OfflineModelConfig::Validate() const {
@@ -34,7 +39,7 @@ bool OfflineModelConfig::Validate() const {
   }
 
   if (!FileExists(tokens)) {
-     SHERPA_ONNX_LOGE("%s does not exist", tokens.c_str());
+     SHERPA_ONNX_LOGE("tokens: %s does not exist", tokens.c_str());
     return false;
   }
 
@@ -59,7 +64,8 @@ std::string OfflineModelConfig::ToString() const {
   os << "tokens=\"" << tokens << "\", ";
   os << "num_threads=" << num_threads << ", ";
   os << "debug=" << (debug ? "True" : "False") << ", ";
-   os << "provider=\"" << provider << "\")";
+   os << "provider=\"" << provider << "\", ";
+   os << "model_type=\"" << model_type << "\")";
 
   return os.str();
 }
--- a/sherpa-onnx/csrc/offline-model-config.h
查看文件 @f3206c4
+++ b/sherpa-onnx/csrc/offline-model-config.h
查看文件 @f3206c4
@@ -22,19 +22,31 @@ struct OfflineModelConfig {
   bool debug = false;
   std::string provider = "cpu";
 
+   // With the help of this field, we only need to load the model once
+   // instead of twice; and therefore it reduces initialization time.
+   //
+   // Valid values:
+   //  - transducer. The given model is from icefall
+   //  - paraformer. It is a paraformer model
+   //  - nemo_ctc. It is a NeMo CTC model.
+   //
+   // All other values are invalid and lead to loading the model twice.
+   std::string model_type;
+ 
   OfflineModelConfig() = default;
   OfflineModelConfig(const OfflineTransducerModelConfig &transducer,
                      const OfflineParaformerModelConfig &paraformer,
                      const OfflineNemoEncDecCtcModelConfig &nemo_ctc,
                      const std::string &tokens, int32_t num_threads, bool debug,
-                      const std::string &provider)
+                      const std::string &provider, const std::string &model_type)
       : transducer(transducer),
         paraformer(paraformer),
         nemo_ctc(nemo_ctc),
         tokens(tokens),
         num_threads(num_threads),
         debug(debug),
-         provider(provider) {}
+         provider(provider),
+         model_type(model_type) {}
 
   void Register(ParseOptions *po);
   bool Validate() const;
--- a/sherpa-onnx/csrc/offline-recognizer-impl.cc
查看文件 @f3206c4
+++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc
查看文件 @f3206c4
@@ -18,6 +18,21 @@ namespace sherpa_onnx {
 
 std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
     const OfflineRecognizerConfig &config) {
+   if (!config.model_config.model_type.empty()) {
+     const auto &model_type = config.model_config.model_type;
+     if (model_type == "transducer") {
+       return std::make_unique<OfflineRecognizerTransducerImpl>(config);
+     } else if (model_type == "paraformer") {
+       return std::make_unique<OfflineRecognizerParaformerImpl>(config);
+     } else if (model_type == "nemo_ctc") {
+       return std::make_unique<OfflineRecognizerCtcImpl>(config);
+     } else {
+       SHERPA_ONNX_LOGE(
+           "Invalid model_type: %s. Trying to load the model to get its type",
+           model_type.c_str());
+     }
+   }
+ 
   Ort::Env env(ORT_LOGGING_LEVEL_ERROR);
 
   Ort::SessionOptions sess_opts;
--- a/sherpa-onnx/csrc/offline-transducer-model-config.cc
查看文件 @f3206c4
+++ b/sherpa-onnx/csrc/offline-transducer-model-config.cc
查看文件 @f3206c4
@@ -18,17 +18,17 @@ void OfflineTransducerModelConfig::Register(ParseOptions *po) {
 
 bool OfflineTransducerModelConfig::Validate() const {
   if (!FileExists(encoder_filename)) {
-     SHERPA_ONNX_LOGE("%s does not exist", encoder_filename.c_str());
+     SHERPA_ONNX_LOGE("encoder: %s does not exist", encoder_filename.c_str());
     return false;
   }
 
   if (!FileExists(decoder_filename)) {
-     SHERPA_ONNX_LOGE("%s does not exist", decoder_filename.c_str());
+     SHERPA_ONNX_LOGE("decoder: %s does not exist", decoder_filename.c_str());
     return false;
   }
 
   if (!FileExists(joiner_filename)) {
-     SHERPA_ONNX_LOGE("%s does not exist", joiner_filename.c_str());
+     SHERPA_ONNX_LOGE("joiner: %s does not exist", joiner_filename.c_str());
     return false;
   }
 
--- a/sherpa-onnx/python/csrc/offline-model-config.cc
查看文件 @f3206c4
+++ b/sherpa-onnx/python/csrc/offline-model-config.cc
查看文件 @f3206c4
@@ -21,15 +21,16 @@ void PybindOfflineModelConfig(py::module *m) {
 
   using PyClass = OfflineModelConfig;
   py::class_<PyClass>(*m, "OfflineModelConfig")
-       .def(py::init<const OfflineTransducerModelConfig &,
-                     const OfflineParaformerModelConfig &,
-                     const OfflineNemoEncDecCtcModelConfig &,
-                     const std::string &, int32_t, bool, const std::string &>(),
-            py::arg("transducer") = OfflineTransducerModelConfig(),
-            py::arg("paraformer") = OfflineParaformerModelConfig(),
-            py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(),
-            py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false,
-            py::arg("provider") = "cpu")
+       .def(
+           py::init<const OfflineTransducerModelConfig &,
+                    const OfflineParaformerModelConfig &,
+                    const OfflineNemoEncDecCtcModelConfig &, const std::string &,
+                    int32_t, bool, const std::string &, const std::string &>(),
+           py::arg("transducer") = OfflineTransducerModelConfig(),
+           py::arg("paraformer") = OfflineParaformerModelConfig(),
+           py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(),
+           py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false,
+           py::arg("provider") = "cpu", py::arg("model_type") = "")
       .def_readwrite("transducer", &PyClass::transducer)
       .def_readwrite("paraformer", &PyClass::paraformer)
       .def_readwrite("nemo_ctc", &PyClass::nemo_ctc)
@@ -37,6 +38,7 @@ void PybindOfflineModelConfig(py::module *m) {
       .def_readwrite("num_threads", &PyClass::num_threads)
       .def_readwrite("debug", &PyClass::debug)
       .def_readwrite("provider", &PyClass::provider)
+       .def_readwrite("model_type", &PyClass::model_type)
       .def("__str__", &PyClass::ToString);
 }
 
--- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py
查看文件 @f3206c4
+++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py
查看文件 @f3206c4
@@ -86,6 +86,7 @@ class OfflineRecognizer(object):
             num_threads=num_threads,
             debug=debug,
             provider=provider,
+             model_type="transducer",
         )
 
         feat_config = OfflineFeatureExtractorConfig(
@@ -149,6 +150,7 @@ class OfflineRecognizer(object):
             num_threads=num_threads,
             debug=debug,
             provider=provider,
+             model_type="paraformer",
         )
 
         feat_config = OfflineFeatureExtractorConfig(
@@ -211,6 +213,7 @@ class OfflineRecognizer(object):
             num_threads=num_threads,
             debug=debug,
             provider=provider,
+             model_type="nemo_ctc",
         )
 
         feat_config = OfflineFeatureExtractorConfig(