Fangjun Kuang
Committed by GitHub

Add C++ and Python support for T-one streaming Russian ASR models (#2575)

This PR adds support for T-one streaming Russian ASR models to both the C++ and Python APIs. T-one is a CTC-based Russian speech recognition model with several model-specific characteristics: float16 model states, a 300 ms input frame length, and an 8 kHz sampling rate.

- Added new OnlineToneCtcModel implementation with specialized processing for T-one models
- Integrated T-one support into the existing CTC model pipeline and Python bindings
- Added a Python example and test scripts for the new functionality (a condensed usage sketch follows below)
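A condensed usage sketch of the new Python API, based on the example added in this PR (python-api-examples/online-t-one-ctc-decode-files.py); the paths assume the released sherpa-onnx-streaming-t-one-russian-2025-09-08 model package has been downloaded and unpacked:

import numpy as np
import sherpa_onnx
import soundfile as sf

recognizer = sherpa_onnx.OnlineRecognizer.from_t_one_ctc(
    model="./sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx",
    tokens="./sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt",
)

# audio must be a 1-D float32 array in [-1, 1]; the input sample rate
# does not need to be 8000 Hz
audio, sample_rate = sf.read(
    "./sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav",
    dtype="float32",
    always_2d=True,
)
audio = audio[:, 0]  # use only the first channel

stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, audio)
# append some tail padding so the final frames are flushed through the model
stream.accept_waveform(sample_rate, np.zeros(int(0.66 * sample_rate), dtype=np.float32))
stream.input_finished()

while recognizer.is_ready(stream):
    recognizer.decode_stream(stream)

print(recognizer.get_result_all(stream))

The full example in the diff below additionally prepends 0.3 s of left padding and enables debug logging.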
... ... @@ -8,6 +8,16 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "test T-one"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
python3 ./python-api-examples/online-t-one-ctc-decode-files.py
rm -rf sherpa-onnx-streaming-t-one-russian-2025-09-08
log "test nemo canary"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
... ...
... ... @@ -149,3 +149,4 @@ kitten-nano-en-v0_1-fp16
*.egg-info
*.jar
vocab.json
*.so
... ...
... ... @@ -2,7 +2,8 @@
// Copyright (c) 2025 Xiaomi Corporation
// To use punctuation model:
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
// wget
// https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
// tar xvf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
// rm sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2
... ... @@ -15,14 +16,17 @@ int32_t main() {
using namespace sherpa_onnx::cxx; // NOLINT
OfflinePunctuationConfig punctuation_config;
punctuation_config.model.ct_transformer = "./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx";
punctuation_config.model.ct_transformer =
"./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/"
"model.onnx";
punctuation_config.model.num_threads = 1;
punctuation_config.model.debug = false;
punctuation_config.model.provider = "cpu";
OfflinePunctuation punct = OfflinePunctuation::Create(punctuation_config);
if (!punct.Get()) {
std::cerr << "Failed to create punctuation model. Please check your config\n";
std::cerr
<< "Failed to create punctuation model. Please check your config\n";
return -1;
}
... ...
#!/usr/bin/env python3
"""
This file shows how to use a streaming CTC model from T-one
to decode files.
Please download model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
The example model is converted from
https://github.com/voicekit-team/T-one
using
https://github.com/k2-fsa/sherpa-onnx/tree/master/scripts/t-one
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
"""
from pathlib import Path
import numpy as np
import sherpa_onnx
import soundfile as sf
def create_recognizer():
model = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx"
tokens = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt"
test_wav = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav"
if not Path(model).is_file() or not Path(test_wav).is_file():
raise ValueError(
"""Please download model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
"""
)
return (
sherpa_onnx.OnlineRecognizer.from_t_one_ctc(
model=model,
tokens=tokens,
debug=True,
),
test_wav,
)
def main():
recognizer, wave_filename = create_recognizer()
audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
audio = audio[:, 0] # only use the first channel
# audio is a 1-D float32 numpy array normalized to the range [-1, 1]
# sample_rate does not need to be 8000 Hz
stream = recognizer.create_stream()
left_paddings = np.zeros(int(0.3 * sample_rate), dtype=np.float32)
stream.accept_waveform(sample_rate, left_paddings)
stream.accept_waveform(sample_rate, audio)
tail_paddings = np.zeros(int(0.66 * sample_rate), dtype=np.float32)
stream.accept_waveform(sample_rate, tail_paddings)
stream.input_finished()
while recognizer.is_ready(stream):
recognizer.decode_stream(stream)
print(wave_filename)
print(recognizer.get_result_all(stream))
if __name__ == "__main__":
main()
... ...
... ... @@ -147,14 +147,13 @@ def main():
sample_rate = model.sample_rate
# Pad 0.3 seconds (2400 samples at 8 kHz) at both the beginning and the end
samples = np.pad(samples, (0, 4000))
samples = np.pad(samples, (2400, 2400))
features = compute_feat(
samples=samples,
sample_rate=sample_rate,
frame_length_ms=model.frame_length_ms,
)
print(features.shape)
id2token = load_tokens(args.tokens)
... ...
... ... @@ -95,6 +95,8 @@ set(sources
online-recognizer.cc
online-rnn-lm.cc
online-stream.cc
online-t-one-ctc-model-config.cc
online-t-one-ctc-model.cc
online-transducer-decoder.cc
online-transducer-greedy-search-decoder.cc
online-transducer-greedy-search-nemo-decoder.cc
... ...
... ... @@ -7,8 +7,10 @@
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <utility>
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
namespace sherpa_onnx {
... ... @@ -27,10 +29,12 @@ static bool Compare(const std::vector<int64_t> &a,
}
static void PrintShape(const std::vector<int64_t> &a) {
std::ostringstream os;
for (auto i : a) {
fprintf(stderr, "%d ", static_cast<int32_t>(i));
os << i << " ";
}
fprintf(stderr, "\n");
os << "\n";
SHERPA_ONNX_LOGE("%s", os.str().c_str());
}
template <typename T /*=float*/>
... ... @@ -51,15 +55,15 @@ Ort::Value Cat(OrtAllocator *allocator,
bool ret = Compare(v0_shape, s, dim);
if (!ret) {
fprintf(stderr, "Incorrect shape in Cat !\n");
SHERPA_ONNX_LOGE("Incorrect shape in Cat !\n");
fprintf(stderr, "Shape for tensor 0: ");
SHERPA_ONNX_LOGE("Shape for tensor 0: ");
PrintShape(v0_shape);
fprintf(stderr, "Shape for tensor %d: ", i);
SHERPA_ONNX_LOGE("Shape for tensor %d: ", i);
PrintShape(s);
exit(-1);
SHERPA_ONNX_EXIT(-1);
}
}
... ... @@ -99,8 +103,77 @@ template Ort::Value Cat<float>(OrtAllocator *allocator,
const std::vector<const Ort::Value *> &values,
int32_t dim);
template Ort::Value Cat<uint16_t>(OrtAllocator *allocator,
const std::vector<const Ort::Value *> &values,
int32_t dim);
template Ort::Value Cat<int64_t>(OrtAllocator *allocator,
const std::vector<const Ort::Value *> &values,
int32_t dim);
Ort::Value CatFloat16(OrtAllocator *allocator,
const std::vector<const Ort::Value *> &values,
int32_t dim) {
if (values.size() == 1u) {
return Clone(allocator, values[0]);
}
std::vector<int64_t> v0_shape =
values[0]->GetTensorTypeAndShapeInfo().GetShape();
int64_t total_dim = v0_shape[dim];
for (int32_t i = 1; i != static_cast<int32_t>(values.size()); ++i) {
auto s = values[i]->GetTensorTypeAndShapeInfo().GetShape();
total_dim += s[dim];
bool ret = Compare(v0_shape, s, dim);
if (!ret) {
SHERPA_ONNX_LOGE("Incorrect shape in Cat !\n");
SHERPA_ONNX_LOGE("Shape for tensor 0: ");
PrintShape(v0_shape);
SHERPA_ONNX_LOGE("Shape for tensor %d: ", i);
PrintShape(s);
SHERPA_ONNX_EXIT(-1);
}
}
std::vector<int64_t> ans_shape;
ans_shape.reserve(v0_shape.size());
ans_shape.insert(ans_shape.end(), v0_shape.data(), v0_shape.data() + dim);
ans_shape.push_back(total_dim);
ans_shape.insert(ans_shape.end(), v0_shape.data() + dim + 1,
v0_shape.data() + v0_shape.size());
auto leading_size = static_cast<int32_t>(std::accumulate(
v0_shape.begin(), v0_shape.begin() + dim, 1, std::multiplies<int64_t>()));
auto trailing_size = static_cast<int32_t>(
std::accumulate(v0_shape.begin() + dim + 1, v0_shape.end(), 1,
std::multiplies<int64_t>()));
Ort::Value ans =
Ort::Value::CreateTensor(allocator, ans_shape.data(), ans_shape.size(),
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
using T = uint16_t;
T *dst = ans.GetTensorMutableData<T>();
for (int32_t i = 0; i != leading_size; ++i) {
for (auto value : values) {
auto this_dim = value->GetTensorTypeAndShapeInfo().GetShape()[dim];
const T *src = value->GetTensorData<T>();
src += i * this_dim * trailing_size;
std::copy(src, src + this_dim * trailing_size, dst);
dst += this_dim * trailing_size;
}
}
return ans;
}
} // namespace sherpa_onnx
... ...
... ... @@ -23,6 +23,10 @@ template <typename T = float>
Ort::Value Cat(OrtAllocator *allocator,
const std::vector<const Ort::Value *> &values, int32_t dim);
Ort::Value CatFloat16(OrtAllocator *allocator,
const std::vector<const Ort::Value *> &values,
int32_t dim);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_CAT_H_
... ...
... ... @@ -62,6 +62,8 @@ class FeatureExtractor::Impl {
InitMfcc();
} else if (config_.is_whisper) {
InitWhisper();
} else if (config_.is_t_one) {
InitRawAudioSamples();
} else {
InitFbank();
}
... ... @@ -135,6 +137,9 @@ class FeatureExtractor::Impl {
} else if (whisper_fbank_) {
whisper_fbank_->InputFinished();
return;
} else if (raw_audio_) {
raw_audio_->InputFinished();
return;
} else if (mfcc_) {
mfcc_->InputFinished();
return;
... ... @@ -149,6 +154,8 @@ class FeatureExtractor::Impl {
return fbank_->NumFramesReady();
} else if (whisper_fbank_) {
return whisper_fbank_->NumFramesReady();
} else if (raw_audio_) {
return raw_audio_->NumFramesReady();
} else if (mfcc_) {
return mfcc_->NumFramesReady();
}
... ... @@ -163,6 +170,8 @@ class FeatureExtractor::Impl {
return fbank_->IsLastFrame(frame);
} else if (whisper_fbank_) {
return whisper_fbank_->IsLastFrame(frame);
} else if (raw_audio_) {
return raw_audio_->IsLastFrame(frame);
} else if (mfcc_) {
return mfcc_->IsLastFrame(frame);
}
... ... @@ -209,6 +218,8 @@ class FeatureExtractor::Impl {
return opts_.mel_opts.num_bins;
} else if (mfcc_) {
return mfcc_opts_.num_ceps;
} else if (raw_audio_) {
return raw_audio_->Dim();
}
SHERPA_ONNX_LOGE("unreachable code");
... ... @@ -225,6 +236,9 @@ class FeatureExtractor::Impl {
} else if (whisper_fbank_) {
whisper_fbank_->AcceptWaveform(sampling_rate, waveform, n);
return;
} else if (raw_audio_) {
raw_audio_->AcceptWaveform(sampling_rate, waveform, n);
return;
} else if (mfcc_) {
mfcc_->AcceptWaveform(sampling_rate, waveform, n);
return;
... ... @@ -239,6 +253,8 @@ class FeatureExtractor::Impl {
return fbank_->GetFrame(frame_index);
} else if (whisper_fbank_) {
return whisper_fbank_->GetFrame(frame_index);
} else if (raw_audio_) {
return raw_audio_->GetFrame(frame_index);
} else if (mfcc_) {
return mfcc_->GetFrame(frame_index);
}
... ... @@ -255,6 +271,9 @@ class FeatureExtractor::Impl {
} else if (whisper_fbank_) {
whisper_fbank_->Pop(discard_num);
return;
} else if (raw_audio_) {
raw_audio_->Pop(discard_num);
return;
} else if (mfcc_) {
mfcc_->Pop(discard_num);
return;
... ... @@ -322,11 +341,21 @@ class FeatureExtractor::Impl {
config_.sampling_rate = opts_.frame_opts.samp_freq;
}
void InitRawAudioSamples() {
opts_raw_audio_.frame_opts.samp_freq = config_.sampling_rate;
opts_raw_audio_.frame_opts.frame_length_ms = config_.frame_length_ms;
opts_raw_audio_.frame_opts.frame_shift_ms = config_.frame_shift_ms;
raw_audio_ = std::make_unique<knf::OnlineRawAudioSamples>(opts_raw_audio_);
}
private:
std::unique_ptr<knf::OnlineFbank> fbank_;
std::unique_ptr<knf::OnlineMfcc> mfcc_;
std::unique_ptr<knf::OnlineWhisperFbank> whisper_fbank_;
std::unique_ptr<knf::OnlineRawAudioSamples> raw_audio_;
knf::FbankOptions opts_;
knf::RawAudioSamplesOptions opts_raw_audio_;
knf::MfccOptions mfcc_opts_;
FeatureExtractorConfig config_;
mutable std::mutex mutex_;
... ...
... ... @@ -81,6 +81,8 @@ struct FeatureExtractorConfig {
bool is_whisper = false;
bool is_t_one = false;
bool round_to_power_of_two = true;
std::string ToString() const;
... ...
... ... @@ -4,6 +4,7 @@
#include "sherpa-onnx/csrc/jieba-lexicon.h"
#include <algorithm>
#include <fstream>
#include <regex> // NOLINT
#include <strstream>
... ...
... ... @@ -38,7 +38,8 @@ struct OfflineRecognitionResult {
/// timestamps[i] records the time in seconds when tokens[i] is decoded.
std::vector<float> timestamps;
/// durations[i] contains the duration (in seconds) for tokens[i] (TDT models only)
/// durations[i] contains the duration (in seconds) for tokens[i] (TDT models
/// only)
std::vector<float> durations;
std::vector<int32_t> words;
... ...
... ... @@ -4,6 +4,7 @@
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_IMPL_H_
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_IMPL_H_
#include <algorithm>
#include <cmath>
#include <memory>
#include <string>
... ...
... ... @@ -104,7 +104,8 @@ class OfflineTtsZipvoiceModel::Impl {
int64_t feat_dim = meta_data_.feat_dim;
std::vector<float> x_data(batch_size * num_frames * feat_dim);
std::default_random_engine rng(std::random_device{}());
std::random_device rd;
std::default_random_engine rng(rd());
std::normal_distribution<float> norm(0, 1);
for (auto &v : x_data) v = norm(rng);
std::vector<int64_t> x_shape = {batch_size, num_frames, feat_dim};
... ...
... ... @@ -7,6 +7,7 @@
#include <cmath>
#include <string>
#include <utility>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
... ...
... ... @@ -28,6 +28,13 @@ void OnlineCtcGreedySearchDecoder::Decode(
auto &r = (*results)[b];
int32_t prev_id = -1;
if (!r.tokens.empty()) {
if (r.num_trailing_blanks > 0) {
prev_id = blank_id_;
} else {
prev_id = r.tokens.back();
}
}
for (int32_t t = 0; t != num_frames; ++t, p += vocab_size) {
int32_t y = static_cast<int32_t>(std::distance(
... ...
... ... @@ -20,6 +20,7 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/online-nemo-ctc-model.h"
#include "sherpa-onnx/csrc/online-t-one-ctc-model.h"
#include "sherpa-onnx/csrc/online-wenet-ctc-model.h"
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
... ... @@ -34,9 +35,11 @@ std::unique_ptr<OnlineCtcModel> OnlineCtcModel::Create(
return std::make_unique<OnlineZipformer2CtcModel>(config);
} else if (!config.nemo_ctc.model.empty()) {
return std::make_unique<OnlineNeMoCtcModel>(config);
} else if (!config.t_one_ctc.model.empty()) {
return std::make_unique<OnlineToneCtcModel>(config);
} else {
SHERPA_ONNX_LOGE("Please specify a CTC model");
exit(-1);
SHERPA_ONNX_EXIT(-1);
}
}
... ... @@ -49,9 +52,11 @@ std::unique_ptr<OnlineCtcModel> OnlineCtcModel::Create(
return std::make_unique<OnlineZipformer2CtcModel>(mgr, config);
} else if (!config.nemo_ctc.model.empty()) {
return std::make_unique<OnlineNeMoCtcModel>(mgr, config);
} else if (!config.t_one_ctc.model.empty()) {
return std::make_unique<OnlineToneCtcModel>(mgr, config);
} else {
SHERPA_ONNX_LOGE("Please specify a CTC model");
exit(-1);
SHERPA_ONNX_EXIT(-1);
}
}
... ...
... ... @@ -17,6 +17,7 @@ void OnlineModelConfig::Register(ParseOptions *po) {
wenet_ctc.Register(po);
zipformer2_ctc.Register(po);
nemo_ctc.Register(po);
t_one_ctc.Register(po);
provider_config.Register(po);
po->Register("tokens", &tokens, "Path to tokens.txt");
... ... @@ -149,6 +150,10 @@ bool OnlineModelConfig::Validate() const {
return nemo_ctc.Validate();
}
if (!t_one_ctc.model.empty()) {
return t_one_ctc.Validate();
}
if (!provider_config.Validate()) {
return false;
}
... ... @@ -165,6 +170,7 @@ std::string OnlineModelConfig::ToString() const {
os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
os << "zipformer2_ctc=" << zipformer2_ctc.ToString() << ", ";
os << "nemo_ctc=" << nemo_ctc.ToString() << ", ";
os << "t_one_ctc=" << t_one_ctc.ToString() << ", ";
os << "provider_config=" << provider_config.ToString() << ", ";
os << "tokens=\"" << tokens << "\", ";
os << "num_threads=" << num_threads << ", ";
... ...
... ... @@ -8,6 +8,7 @@
#include "sherpa-onnx/csrc/online-nemo-ctc-model-config.h"
#include "sherpa-onnx/csrc/online-paraformer-model-config.h"
#include "sherpa-onnx/csrc/online-t-one-ctc-model-config.h"
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
#include "sherpa-onnx/csrc/online-wenet-ctc-model-config.h"
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model-config.h"
... ... @@ -21,6 +22,7 @@ struct OnlineModelConfig {
OnlineWenetCtcModelConfig wenet_ctc;
OnlineZipformer2CtcModelConfig zipformer2_ctc;
OnlineNeMoCtcModelConfig nemo_ctc;
OnlineToneCtcModelConfig t_one_ctc;
ProviderConfig provider_config;
std::string tokens;
int32_t num_threads = 1;
... ... @@ -56,6 +58,7 @@ struct OnlineModelConfig {
const OnlineWenetCtcModelConfig &wenet_ctc,
const OnlineZipformer2CtcModelConfig &zipformer2_ctc,
const OnlineNeMoCtcModelConfig &nemo_ctc,
const OnlineToneCtcModelConfig &t_one_ctc,
const ProviderConfig &provider_config,
const std::string &tokens, int32_t num_threads,
int32_t warm_up, bool debug, const std::string &model_type,
... ... @@ -66,6 +69,7 @@ struct OnlineModelConfig {
wenet_ctc(wenet_ctc),
zipformer2_ctc(zipformer2_ctc),
nemo_ctc(nemo_ctc),
t_one_ctc(t_one_ctc),
provider_config(provider_config),
tokens(tokens),
num_threads(num_threads),
... ...
... ... @@ -6,6 +6,7 @@
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_
#include <algorithm>
#include <cassert>
#include <ios>
#include <memory>
#include <sstream>
... ... @@ -79,24 +80,7 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
config_(config),
model_(OnlineCtcModel::Create(config.model_config)),
endpoint_(config_.endpoint_config) {
if (!config.model_config.tokens_buf.empty()) {
sym_ = SymbolTable(config.model_config.tokens_buf, false);
} else {
/// assuming tokens_buf and tokens are guaranteed not being both empty
sym_ = SymbolTable(config.model_config.tokens, true);
}
if (!config.model_config.wenet_ctc.model.empty()) {
// WeNet CTC models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false
config_.feat_config.normalize_samples = false;
}
if (model_->UseWhisperFeature()) {
config_.feat_config.is_whisper = true;
}
InitDecoder();
PostInit();
}
template <typename Manager>
... ... @@ -107,17 +91,7 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
model_(OnlineCtcModel::Create(mgr, config.model_config)),
sym_(mgr, config.model_config.tokens),
endpoint_(config_.endpoint_config) {
if (!config.model_config.wenet_ctc.model.empty()) {
// WeNet CTC models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false
config_.feat_config.normalize_samples = false;
}
if (model_->UseWhisperFeature()) {
config_.feat_config.is_whisper = true;
}
InitDecoder();
PostInit();
}
std::unique_ptr<OnlineStream> CreateStream() const override {
... ... @@ -211,6 +185,14 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
// TODO(fangjun): Remember to change these constants if needed
int32_t frame_shift_ms = 10;
int32_t subsampling_factor = 4;
if (!config_.model_config.t_one_ctc.model.empty()) {
// Each input frame is 300 ms long and produces 10 output frames,
// so frame_shift_ms is 300 / 10 = 30 ms.
frame_shift_ms = 30;
subsampling_factor = 1;
}
auto r =
ConvertCtc(decoder_result, sym_, frame_shift_ms, subsampling_factor,
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
... ... @@ -258,6 +240,33 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
}
private:
void PostInit() {
if (!config_.model_config.tokens_buf.empty()) {
sym_ = SymbolTable(config_.model_config.tokens_buf, false);
} else {
/// assuming tokens_buf and tokens are guaranteed not to both be empty
sym_ = SymbolTable(config_.model_config.tokens, true);
}
if (!config_.model_config.wenet_ctc.model.empty()) {
// WeNet CTC models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false
config_.feat_config.normalize_samples = false;
}
if (!config_.model_config.t_one_ctc.model.empty()) {
config_.feat_config.is_t_one = true;
config_.feat_config.frame_length_ms = 300;
config_.feat_config.frame_shift_ms = 300;
config_.feat_config.sampling_rate = 8000;
}
if (model_->UseWhisperFeature()) {
config_.feat_config.is_whisper = true;
}
InitDecoder();
}
void InitDecoder() {
if (!sym_.Contains("<blk>") && !sym_.Contains("<eps>") &&
!sym_.Contains("<blank>")) {
... ...
... ... @@ -83,12 +83,13 @@ std::unique_ptr<OnlineRecognizerImpl> OnlineRecognizerImpl::Create(
if (!config.model_config.wenet_ctc.model.empty() ||
!config.model_config.zipformer2_ctc.model.empty() ||
!config.model_config.nemo_ctc.model.empty()) {
!config.model_config.nemo_ctc.model.empty() ||
!config.model_config.t_one_ctc.model.empty()) {
return std::make_unique<OnlineRecognizerCtcImpl>(config);
}
SHERPA_ONNX_LOGE("Please specify a model");
exit(-1);
SHERPA_ONNX_EXIT(-1);
}
template <typename Manager>
... ... @@ -142,12 +143,13 @@ std::unique_ptr<OnlineRecognizerImpl> OnlineRecognizerImpl::Create(
if (!config.model_config.wenet_ctc.model.empty() ||
!config.model_config.zipformer2_ctc.model.empty() ||
!config.model_config.nemo_ctc.model.empty()) {
!config.model_config.nemo_ctc.model.empty() ||
!config.model_config.t_one_ctc.model.empty()) {
return std::make_unique<OnlineRecognizerCtcImpl>(mgr, config);
}
SHERPA_ONNX_LOGE("Please specify a model");
exit(-1);
SHERPA_ONNX_EXIT(-1);
}
OnlineRecognizerImpl::OnlineRecognizerImpl(const OnlineRecognizerConfig &config)
... ...
// sherpa-onnx/csrc/online-t-one-ctc-model-config.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include "sherpa-onnx/csrc/online-t-one-ctc-model-config.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
void OnlineToneCtcModelConfig::Register(ParseOptions *po) {
po->Register("t-one-ctc-model", &model,
"Path to CTC model.onnx from T-one. Please see "
"https://github.com/k2-fsa/sherpa-onnx/pull/2571");
}
bool OnlineToneCtcModelConfig::Validate() const {
if (!FileExists(model)) {
SHERPA_ONNX_LOGE("T-one CTC model '%s' does not exist", model.c_str());
return false;
}
return true;
}
std::string OnlineToneCtcModelConfig::ToString() const {
std::ostringstream os;
os << "OnlineToneCtcModelConfig(";
os << "model=\"" << model << "\")";
return os.str();
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/online-t-one-ctc-model-config.h
//
// Copyright (c) 2025 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_ONLINE_T_ONE_CTC_MODEL_CONFIG_H_
#define SHERPA_ONNX_CSRC_ONLINE_T_ONE_CTC_MODEL_CONFIG_H_
#include <string>
#include "sherpa-onnx/csrc/parse-options.h"
namespace sherpa_onnx {
struct OnlineToneCtcModelConfig {
std::string model;
OnlineToneCtcModelConfig() = default;
explicit OnlineToneCtcModelConfig(const std::string &model) : model(model) {}
void Register(ParseOptions *po);
bool Validate() const;
std::string ToString() const;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_ONLINE_T_ONE_CTC_MODEL_CONFIG_H_
... ...
// sherpa-onnx/csrc/online-t-one-ctc-model.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include "sherpa-onnx/csrc/online-t-one-ctc-model.h"
#include <algorithm>
#include <cmath>
#include <string>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#if __OHOS__
#include "rawfile/raw_file_manager.h"
#endif
#include "sherpa-onnx/csrc/cat.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/session.h"
#include "sherpa-onnx/csrc/text-utils.h"
#include "sherpa-onnx/csrc/unbind.h"
namespace sherpa_onnx {
class OnlineToneCtcModel::Impl {
public:
explicit Impl(const OnlineModelConfig &config)
: config_(config),
env_(ORT_LOGGING_LEVEL_ERROR),
sess_opts_(GetSessionOptions(config)),
allocator_{} {
{
auto buf = ReadFile(config.t_one_ctc.model);
Init(buf.data(), buf.size());
}
}
template <typename Manager>
Impl(Manager *mgr, const OnlineModelConfig &config)
: config_(config),
env_(ORT_LOGGING_LEVEL_ERROR),
sess_opts_(GetSessionOptions(config)),
allocator_{} {
{
auto buf = ReadFile(mgr, config.t_one_ctc.model);
Init(buf.data(), buf.size());
}
}
std::vector<Ort::Value> Forward(Ort::Value x,
std::vector<Ort::Value> states) {
// shape0 is (batch_size, 1, num_samples)
auto shape0 = x.GetTensorTypeAndShapeInfo().GetShape();
std::array<int64_t, 3> shape = {shape0[0], shape0[2], shape0[1]};
std::vector<int32_t> samples(shape[0] * shape[1] * shape[2]);
const float *px = x.GetTensorData<float>();
for (int32_t i = 0; i < samples.size(); ++i) {
float f = px[i];
f = f > 1 ? 1 : f;
f = f < -1 ? -1 : f;
samples[i] = static_cast<int32_t>(f * 32767);
}
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
Ort::Value xx =
Ort::Value::CreateTensor(memory_info, samples.data(), samples.size(),
shape.data(), shape.size());
std::array<Ort::Value, 2> inputs = {std::move(xx), std::move(states[0])};
auto out =
sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
output_names_ptr_.data(), output_names_ptr_.size());
// out[0]: log_probs
// out[1]: next_states
return out;
}
int32_t VocabSize() const { return vocab_size_; }
int32_t ChunkLength() const { return 1; }
int32_t ChunkShift() const { return 1; }
OrtAllocator *Allocator() { return allocator_; }
// Return a vector containing 1 tensor
// - state_
std::vector<Ort::Value> GetInitStates() {
std::vector<Ort::Value> ans;
ans.push_back(View(&state_));
return ans;
}
std::vector<Ort::Value> StackStates(
std::vector<std::vector<Ort::Value>> states) {
int32_t batch_size = static_cast<int32_t>(states.size());
if (batch_size == 1) {
return std::move(states[0]);
}
std::vector<Ort::Value> ans;
ans.reserve(1);
std::vector<const Ort::Value *> buf;
buf.reserve(batch_size);
for (int32_t b = 0; b != batch_size; ++b) {
buf.push_back(&states[b][0]);
}
Ort::Value c{nullptr};
c = CatFloat16(allocator_, buf, 0);
ans.push_back(std::move(c));
return ans;
}
std::vector<std::vector<Ort::Value>> UnStackStates(
std::vector<Ort::Value> states) const {
auto allocator = const_cast<Impl *>(this)->allocator_;
std::vector<std::vector<Ort::Value>> ans;
auto shape = states[0].GetTensorTypeAndShapeInfo().GetShape();
int32_t batch_size = shape[0];
ans.resize(batch_size);
if (batch_size == 1) {
ans[0] = std::move(states);
return ans;
}
std::vector<Ort::Value> v;
v = UnbindFloat16(allocator, &states[0], 0);
for (int32_t b = 0; b != batch_size; ++b) {
ans[b].push_back(std::move(v[b]));
}
return ans;
}
private:
void Init(void *model_data, size_t model_data_length) {
sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
sess_opts_);
GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
// get meta data
Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
if (config_.debug) {
std::ostringstream os;
PrintModelMetadata(os, meta_data);
#if __OHOS__
SHERPA_ONNX_LOGE("%{public}s", os.str().c_str());
#else
SHERPA_ONNX_LOGE("%s", os.str().c_str());
#endif
}
Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
SHERPA_ONNX_READ_META_DATA(frame_length_ms_, "frame_length_ms");
SHERPA_ONNX_READ_META_DATA(state_dim_, "state_dim");
SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate");
InitStates();
vocab_size_ = sess_->GetOutputTypeInfo(0)
.GetTensorTypeAndShapeInfo()
.GetShape()
.back();
}
void InitStates() {
std::array<int64_t, 2> state_shape{1, state_dim_};
state_ = Ort::Value::CreateTensor(allocator_, state_shape.data(),
state_shape.size(),
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
auto p = state_.GetTensorMutableData<uint16_t>();
std::fill(p, p + state_dim_, 0);
}
private:
OnlineModelConfig config_;
Ort::Env env_;
Ort::SessionOptions sess_opts_;
Ort::AllocatorWithDefaultOptions allocator_;
std::unique_ptr<Ort::Session> sess_;
std::vector<std::string> input_names_;
std::vector<const char *> input_names_ptr_;
std::vector<std::string> output_names_;
std::vector<const char *> output_names_ptr_;
// One input frame is 300 ms long.
// Each input frame produces 10 output frames,
// so each output frame corresponds to 30 ms.
int32_t frame_length_ms_ = 0;
int32_t state_dim_ = 0;
int32_t sample_rate_ = 0;
int32_t vocab_size_ = 0;
Ort::Value state_{nullptr};
};
OnlineToneCtcModel::OnlineToneCtcModel(const OnlineModelConfig &config)
: impl_(std::make_unique<Impl>(config)) {}
template <typename Manager>
OnlineToneCtcModel::OnlineToneCtcModel(Manager *mgr,
const OnlineModelConfig &config)
: impl_(std::make_unique<Impl>(mgr, config)) {}
OnlineToneCtcModel::~OnlineToneCtcModel() = default;
std::vector<Ort::Value> OnlineToneCtcModel::Forward(
Ort::Value x, std::vector<Ort::Value> states) const {
return impl_->Forward(std::move(x), std::move(states));
}
int32_t OnlineToneCtcModel::VocabSize() const { return impl_->VocabSize(); }
int32_t OnlineToneCtcModel::ChunkLength() const { return impl_->ChunkLength(); }
int32_t OnlineToneCtcModel::ChunkShift() const { return impl_->ChunkShift(); }
OrtAllocator *OnlineToneCtcModel::Allocator() const {
return impl_->Allocator();
}
std::vector<Ort::Value> OnlineToneCtcModel::GetInitStates() const {
return impl_->GetInitStates();
}
std::vector<Ort::Value> OnlineToneCtcModel::StackStates(
std::vector<std::vector<Ort::Value>> states) const {
return impl_->StackStates(std::move(states));
}
std::vector<std::vector<Ort::Value>> OnlineToneCtcModel::UnStackStates(
std::vector<Ort::Value> states) const {
return impl_->UnStackStates(std::move(states));
}
#if __ANDROID_API__ >= 9
template OnlineToneCtcModel::OnlineToneCtcModel(
AAssetManager *mgr, const OnlineModelConfig &config);
#endif
#if __OHOS__
template OnlineToneCtcModel::OnlineToneCtcModel(
NativeResourceManager *mgr, const OnlineModelConfig &config);
#endif
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/online-t-one-ctc-model.h
//
// Copyright (c) 2025 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_ONLINE_T_ONE_CTC_MODEL_H_
#define SHERPA_ONNX_CSRC_ONLINE_T_ONE_CTC_MODEL_H_
#include <memory>
#include <utility>
#include <vector>
#include "onnxruntime_cxx_api.h" // NOLINT
#include "sherpa-onnx/csrc/online-ctc-model.h"
#include "sherpa-onnx/csrc/online-model-config.h"
namespace sherpa_onnx {
class OnlineToneCtcModel : public OnlineCtcModel {
public:
explicit OnlineToneCtcModel(const OnlineModelConfig &config);
template <typename Manager>
OnlineToneCtcModel(Manager *mgr, const OnlineModelConfig &config);
~OnlineToneCtcModel() override;
// A list of 1 tensor:
// - (batch_size, state_dim)
std::vector<Ort::Value> GetInitStates() const override;
std::vector<Ort::Value> StackStates(
std::vector<std::vector<Ort::Value>> states) const override;
std::vector<std::vector<Ort::Value>> UnStackStates(
std::vector<Ort::Value> states) const override;
/**
*
* @param x A 3-D tensor of shape (batch_size, 1, num_samples).
* @param states It is from GetInitStates() or returned from this method.
*
* @return Return a list of tensors
* - ans[0] contains log_probs, of shape (N, T, C)
* - ans[1:] contains next_states
*/
std::vector<Ort::Value> Forward(
Ort::Value x, std::vector<Ort::Value> states) const override;
/** Return the vocabulary size of the model
*/
int32_t VocabSize() const override;
/** Return an allocator for allocating memory
*/
OrtAllocator *Allocator() const override;
// The model accepts this number of frames before subsampling as input
int32_t ChunkLength() const override;
// Similar to frame_shift in feature extractor, after processing
// ChunkLength() frames, we advance by ChunkShift() frames
// before we process the next chunk.
int32_t ChunkShift() const override;
bool SupportBatchProcessing() const override { return true; }
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_ONLINE_T_ONE_CTC_MODEL_H_
... ...
... ... @@ -155,10 +155,30 @@ Ort::Value Clone(OrtAllocator *allocator, const Ort::Value *v) {
std::copy(start, end, dst);
return ans;
}
case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: {
Ort::Value ans =
Ort::Value::CreateTensor(allocator, shape.data(), shape.size(),
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
const auto *start = v->GetTensorData<uint16_t>();
const auto *end = start + type_and_shape.GetElementCount();
auto *dst = ans.GetTensorMutableData<uint16_t>();
std::copy(start, end, dst);
return ans;
}
case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: {
Ort::Value ans = Ort::Value::CreateTensor<uint16_t>(
allocator, shape.data(), shape.size());
const auto *start = v->GetTensorData<uint16_t>();
const auto *end = start + type_and_shape.GetElementCount();
auto *dst = ans.GetTensorMutableData<uint16_t>();
std::copy(start, end, dst);
return ans;
}
default:
fprintf(stderr, "Unsupported type: %d\n",
static_cast<int32_t>(type_and_shape.GetElementType()));
exit(-1);
SHERPA_ONNX_LOGE("Unsupported type: %d\n",
static_cast<int32_t>(type_and_shape.GetElementType()));
SHERPA_ONNX_EXIT(-1);
// unreachable code
return Ort::Value{nullptr};
}
... ... @@ -183,14 +203,23 @@ Ort::Value View(Ort::Value *v) {
return Ort::Value::CreateTensor(
memory_info, v->GetTensorMutableData<float>(),
type_and_shape.GetElementCount(), shape.data(), shape.size());
case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16:
return Ort::Value::CreateTensor(
memory_info, v->GetTensorMutableData<uint16_t>(),
type_and_shape.GetElementCount() * sizeof(uint16_t), shape.data(),
shape.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16:
return Ort::Value::CreateTensor(
memory_info, v->GetTensorMutableData<uint16_t>(),
type_and_shape.GetElementCount(), shape.data(), shape.size());
case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL:
return Ort::Value::CreateTensor(
memory_info, v->GetTensorMutableData<bool>(),
type_and_shape.GetElementCount(), shape.data(), shape.size());
default:
fprintf(stderr, "Unsupported type: %d\n",
static_cast<int32_t>(type_and_shape.GetElementType()));
exit(-1);
SHERPA_ONNX_LOGE("Unsupported type: %d\n",
static_cast<int32_t>(type_and_shape.GetElementType()));
SHERPA_ONNX_EXIT(-1);
// unreachable code
return Ort::Value{nullptr};
}
... ...
... ... @@ -11,6 +11,7 @@
#include <locale>
#endif
#include <algorithm>
#include <cassert>
#include <ostream>
#include <string>
... ...
... ... @@ -117,6 +117,11 @@ for a list of pre-trained models to download.
const float duration = samples.size() / static_cast<float>(sampling_rate);
auto s = recognizer.CreateStream();
std::vector<float> left_paddings(static_cast<int>(0.3 * sampling_rate));
s->AcceptWaveform(sampling_rate, left_paddings.data(),
left_paddings.size());
s->AcceptWaveform(sampling_rate, samples.data(), samples.size());
std::vector<float> tail_paddings(static_cast<int>(0.8 * sampling_rate));
... ...
... ... @@ -4,7 +4,7 @@
#include "sherpa-onnx/csrc/text-utils.h"
#include <regex>
#include <regex> // NOLINT
#include <sstream>
#include "gtest/gtest.h"
... ...
... ... @@ -68,4 +68,49 @@ template std::vector<Ort::Value> Unbind<int64_t>(OrtAllocator *allocator,
const Ort::Value *value,
int32_t dim);
std::vector<Ort::Value> UnbindFloat16(OrtAllocator *allocator,
const Ort::Value *value, int32_t dim) {
std::vector<int64_t> shape = value->GetTensorTypeAndShapeInfo().GetShape();
assert(dim >= 0);
assert(dim < static_cast<int32_t>(shape.size()));
int32_t n = static_cast<int32_t>(shape[dim]);
if (n == 1) {
std::vector<Ort::Value> ans;
ans.push_back(Clone(allocator, value));
return ans;
}
std::vector<int64_t> ans_shape = shape;
ans_shape[dim] = 1;  // Unlike torch, we keep this dim and set it to 1
// allocate tensors
std::vector<Ort::Value> ans;
ans.reserve(n);
for (int32_t i = 0; i != n; ++i) {
Ort::Value t =
Ort::Value::CreateTensor(allocator, ans_shape.data(), ans_shape.size(),
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
ans.push_back(std::move(t));
}
auto leading_size = static_cast<int32_t>(std::accumulate(
shape.begin(), shape.begin() + dim, 1, std::multiplies<int64_t>()));
auto trailing_size = static_cast<int32_t>(std::accumulate(
shape.begin() + dim + 1, shape.end(), 1, std::multiplies<int64_t>()));
using T = uint16_t;
const T *src = value->GetTensorData<T>();
for (int32_t i = 0; i != leading_size; ++i) {
for (int32_t k = 0; k != n; ++k) {
T *dst = ans[k].GetTensorMutableData<T>() + i * trailing_size;
std::copy(src, src + trailing_size, dst);
src += trailing_size;
}
}
return ans;
}
} // namespace sherpa_onnx
... ...
... ... @@ -23,6 +23,9 @@ template <typename T = float>
std::vector<Ort::Value> Unbind(OrtAllocator *allocator, const Ort::Value *value,
int32_t dim);
std::vector<Ort::Value> UnbindFloat16(OrtAllocator *allocator,
const Ort::Value *value, int32_t dim);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_UNBIND_H_
... ...
... ... @@ -42,6 +42,7 @@ set(srcs
online-punctuation.cc
online-recognizer.cc
online-stream.cc
online-t-one-ctc-model-config.cc
online-transducer-model-config.cc
online-wenet-ctc-model-config.cc
online-zipformer2-ctc-model-config.cc
... ...
... ... @@ -5,6 +5,7 @@
#include <algorithm>
#include <string>
#include <vector>
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/python/csrc/offline-tts-model-config.h"
... ...
... ... @@ -12,6 +12,7 @@
#include "sherpa-onnx/csrc/provider-config.h"
#include "sherpa-onnx/python/csrc/online-nemo-ctc-model-config.h"
#include "sherpa-onnx/python/csrc/online-paraformer-model-config.h"
#include "sherpa-onnx/python/csrc/online-t-one-ctc-model-config.h"
#include "sherpa-onnx/python/csrc/online-transducer-model-config.h"
#include "sherpa-onnx/python/csrc/online-wenet-ctc-model-config.h"
#include "sherpa-onnx/python/csrc/online-zipformer2-ctc-model-config.h"
... ... @@ -25,6 +26,7 @@ void PybindOnlineModelConfig(py::module *m) {
PybindOnlineWenetCtcModelConfig(m);
PybindOnlineZipformer2CtcModelConfig(m);
PybindOnlineNeMoCtcModelConfig(m);
PybindOnlineToneCtcModelConfig(m);
PybindProviderConfig(m);
using PyClass = OnlineModelConfig;
... ... @@ -34,17 +36,18 @@ void PybindOnlineModelConfig(py::module *m) {
const OnlineWenetCtcModelConfig &,
const OnlineZipformer2CtcModelConfig &,
const OnlineNeMoCtcModelConfig &,
const ProviderConfig &,
const std::string &, int32_t, int32_t,
bool, const std::string &, const std::string &,
const OnlineToneCtcModelConfig &, const ProviderConfig &,
const std::string &, int32_t, int32_t, bool,
const std::string &, const std::string &,
const std::string &>(),
py::arg("transducer") = OnlineTransducerModelConfig(),
py::arg("paraformer") = OnlineParaformerModelConfig(),
py::arg("wenet_ctc") = OnlineWenetCtcModelConfig(),
py::arg("zipformer2_ctc") = OnlineZipformer2CtcModelConfig(),
py::arg("nemo_ctc") = OnlineNeMoCtcModelConfig(),
py::arg("provider_config") = ProviderConfig(),
py::arg("tokens"), py::arg("num_threads"), py::arg("warm_up") = 0,
py::arg("t_one_ctc") = OnlineToneCtcModelConfig(),
py::arg("provider_config") = ProviderConfig(), py::arg("tokens"),
py::arg("num_threads"), py::arg("warm_up") = 0,
py::arg("debug") = false, py::arg("model_type") = "",
py::arg("modeling_unit") = "", py::arg("bpe_vocab") = "")
.def_readwrite("transducer", &PyClass::transducer)
... ... @@ -52,6 +55,7 @@ void PybindOnlineModelConfig(py::module *m) {
.def_readwrite("wenet_ctc", &PyClass::wenet_ctc)
.def_readwrite("zipformer2_ctc", &PyClass::zipformer2_ctc)
.def_readwrite("nemo_ctc", &PyClass::nemo_ctc)
.def_readwrite("t_one_ctc", &PyClass::t_one_ctc)
.def_readwrite("provider_config", &PyClass::provider_config)
.def_readwrite("tokens", &PyClass::tokens)
.def_readwrite("num_threads", &PyClass::num_threads)
... ...
// sherpa-onnx/python/csrc/online-t-one-ctc-model-config.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/online-t-one-ctc-model-config.h"
#include <string>
#include <vector>
#include "sherpa-onnx/csrc/online-t-one-ctc-model-config.h"
namespace sherpa_onnx {
void PybindOnlineToneCtcModelConfig(py::module *m) {
using PyClass = OnlineToneCtcModelConfig;
py::class_<PyClass>(*m, "OnlineToneCtcModelConfig")
.def(py::init<const std::string &>(), py::arg("model"))
.def_readwrite("model", &PyClass::model)
.def("__str__", &PyClass::ToString);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/python/csrc/online-t-one-ctc-model-config.h
//
// Copyright (c) 2025 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_ONLINE_T_ONE_CTC_MODEL_CONFIG_H_
#define SHERPA_ONNX_PYTHON_CSRC_ONLINE_T_ONE_CTC_MODEL_CONFIG_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
void PybindOnlineToneCtcModelConfig(py::module *m);
}  // namespace sherpa_onnx
#endif // SHERPA_ONNX_PYTHON_CSRC_ONLINE_T_ONE_CTC_MODEL_CONFIG_H_
... ...
... ... @@ -18,6 +18,7 @@ from sherpa_onnx.lib._sherpa_onnx import (
OnlineRecognizerConfig,
OnlineRecognizerResult,
OnlineStream,
OnlineToneCtcModelConfig,
OnlineTransducerModelConfig,
OnlineWenetCtcModelConfig,
OnlineZipformer2CtcModelConfig,
... ... @@ -603,6 +604,132 @@ class OnlineRecognizer(object):
return self
@classmethod
def from_t_one_ctc(
cls,
tokens: str,
model: str,
num_threads: int = 2,
sample_rate: float = 8000,
feature_dim: int = 80,
enable_endpoint_detection: bool = False,
rule1_min_trailing_silence: float = 2.4,
rule2_min_trailing_silence: float = 1.2,
rule3_min_utterance_length: float = 20.0,
decoding_method: str = "greedy_search",
provider: str = "cpu",
debug: bool = False,
rule_fsts: str = "",
rule_fars: str = "",
device: int = 0,
hr_dict_dir: str = "",
hr_rule_fsts: str = "",
hr_lexicon: str = "",
):
"""
Please refer to
`<https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models>`_
to download pre-trained models.
Args:
tokens:
Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
columns::
symbol integer_id
model:
Path to ``model.onnx``.
num_threads:
Number of threads for neural network computation.
sample_rate:
Sample rate of the training data used to train the model.
feature_dim:
Dimension of the feature used to train the model.
enable_endpoint_detection:
True to enable endpoint detection. False to disable endpoint
detection.
rule1_min_trailing_silence:
Used only when enable_endpoint_detection is True. If the duration
of trailing silence in seconds is larger than this value, we assume
an endpoint is detected.
rule2_min_trailing_silence:
Used only when enable_endpoint_detection is True. If we have decoded
something that is nonsilence and if the duration of trailing silence
in seconds is larger than this value, we assume an endpoint is
detected.
rule3_min_utterance_length:
Used only when enable_endpoint_detection is True. If the utterance
length in seconds is larger than this value, we assume an endpoint
is detected.
decoding_method:
The only valid value is greedy_search.
provider:
onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
debug:
True to show meta data in the model.
rule_fsts:
If not empty, it specifies fsts for inverse text normalization.
If there are multiple fsts, they are separated by a comma.
rule_fars:
If not empty, it specifies fst archives for inverse text normalization.
If there are multiple archives, they are separated by a comma.
device:
onnxruntime cuda device index.
"""
self = cls.__new__(cls)
_assert_file_exists(tokens)
_assert_file_exists(model)
assert num_threads > 0, num_threads
t_one_ctc_config = OnlineToneCtcModelConfig(
model=model,
)
provider_config = ProviderConfig(
provider=provider,
device=device,
)
model_config = OnlineModelConfig(
t_one_ctc=t_one_ctc_config,
tokens=tokens,
num_threads=num_threads,
provider_config=provider_config,
debug=debug,
)
feat_config = FeatureExtractorConfig(
sampling_rate=sample_rate,
feature_dim=feature_dim,
)
endpoint_config = EndpointConfig(
rule1_min_trailing_silence=rule1_min_trailing_silence,
rule2_min_trailing_silence=rule2_min_trailing_silence,
rule3_min_utterance_length=rule3_min_utterance_length,
)
recognizer_config = OnlineRecognizerConfig(
feat_config=feat_config,
model_config=model_config,
endpoint_config=endpoint_config,
enable_endpoint=enable_endpoint_detection,
decoding_method=decoding_method,
rule_fsts=rule_fsts,
rule_fars=rule_fars,
hr=HomophoneReplacerConfig(
dict_dir=hr_dict_dir,
lexicon=hr_lexicon,
rule_fsts=hr_rule_fsts,
),
)
self.recognizer = _Recognizer(recognizer_config)
self.config = recognizer_config
return self
@classmethod
def from_nemo_ctc(
cls,
tokens: str,
... ...