Implement max_symbols_per_frame for GigaAM2 accurate decoding since model uses char tokens instead of BPE. (#2423)

Implement max_symbols_per_frame for GigaAM2 accurate decoding since model uses char tokens instead of BPE. (#2423)
Nickolay V. Shmyrev · GitHub
Commit 10e845a8bad2acbc44ab7de5a52a6ef9e07bcf95 10e845a8 1 parent c1445749
sherpa-onnx/csrc/offline-transducer-greedy-search-nemo-decoder.cc
--- a/sherpa-onnx/csrc/offline-transducer-greedy-search-nemo-decoder.cc
查看文件 @10e845a
+++ b/sherpa-onnx/csrc/offline-transducer-greedy-search-nemo-decoder.cc
查看文件 @10e845a
@@ -45,6 +45,7 @@ static OfflineTransducerDecoderResult DecodeOne(
   int32_t vocab_size = model->VocabSize();
   int32_t blank_id = vocab_size - 1;
+  int32_t max_symbols_per_frame = 10;
   auto decoder_input_pair = BuildDecoderInput(blank_id, model->Allocator());
@@ -60,30 +61,34 @@ static OfflineTransducerDecoderResult DecodeOne(
         memory_info, const_cast<float *>(p) + t * num_cols, num_cols,
         encoder_shape.data(), encoder_shape.size());
-    Ort::Value logit = model->RunJoiner(std::move(cur_encoder_out),
-                                        View(&decoder_output_pair.first));
-
-    float *p_logit = logit.GetTensorMutableData<float>();
-    if (blank_penalty > 0) {
-      p_logit[blank_id] -= blank_penalty;
+    for (int32_t q = 0; q != max_symbols_per_frame; ++q) {
+        Ort::Value logit = model->RunJoiner(View(&cur_encoder_out),
+                                            View(&decoder_output_pair.first));
+
+        float *p_logit = logit.GetTensorMutableData<float>();
+        if (blank_penalty > 0) {
+          p_logit[blank_id] -= blank_penalty;
+        }
+
+        auto y = static_cast<int32_t>(std::distance(
+            static_cast<const float *>(p_logit),
+            std::max_element(static_cast<const float *>(p_logit),
+                             static_cast<const float *>(p_logit) + vocab_size)));
+
+        if (y != blank_id) {
+          ans.tokens.push_back(y);
+          ans.timestamps.push_back(t);
+
+          decoder_input_pair = BuildDecoderInput(y, model->Allocator());
+
+          decoder_output_pair =
+              model->RunDecoder(std::move(decoder_input_pair.first),
+                                std::move(decoder_input_pair.second),
+                                std::move(decoder_output_pair.second));
+        } else {
+           break;
+        } // if (y != blank_id)
     }
-
-    auto y = static_cast<int32_t>(std::distance(
-        static_cast<const float *>(p_logit),
-        std::max_element(static_cast<const float *>(p_logit),
-                         static_cast<const float *>(p_logit) + vocab_size)));
-
-    if (y != blank_id) {
-      ans.tokens.push_back(y);
-      ans.timestamps.push_back(t);
-
-      decoder_input_pair = BuildDecoderInput(y, model->Allocator());
-
-      decoder_output_pair =
-          model->RunDecoder(std::move(decoder_input_pair.first),
-                            std::move(decoder_input_pair.second),
-                            std::move(decoder_output_pair.second));
-    }  // if (y != blank_id)
   }    // for (int32_t i = 0; i != num_rows; ++i)
   return ans;
@@ -99,7 +104,7 @@ OfflineTransducerGreedySearchNeMoDecoder::Decode(
   int32_t dim1 = static_cast<int32_t>(shape[1]);
   int32_t dim2 = static_cast<int32_t>(shape[2]);
-  const int64_t *p_length = encoder_out_length.GetTensorData<int64_t>();
+  const int32_t *p_length = encoder_out_length.GetTensorData<int32_t>();
   const float *p = encoder_out.GetTensorData<float>();
   std::vector<OfflineTransducerDecoderResult> ans(batch_size);