Committed by
GitHub
Add C++ and Python support for ten-vad (#2377)
This PR adds support for the TEN VAD model alongside the existing Silero VAD in both C++ and Python interfaces. - Introduces TenVadModelConfig with Python bindings and integrates it into VadModelConfig. - Implements TenVadModel in C++ and extends the factory (VadModel::Create) and detector logic to choose between Silero and TEN VAD. - Updates build files (CMake), fixes a spelling typo, and extends the Python example script to demonstrate --ten-vad-model.
正在显示 20 个修改的文件,包含 902 行增加和 49 行删除。
| 1 | function(download_kaldi_native_fbank) | 1 | function(download_kaldi_native_fbank) |
| 2 | include(FetchContent) | 2 | include(FetchContent) |
| 3 | 3 | ||
| 4 | - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.2.tar.gz") | ||
| 5 | - set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.2.tar.gz") | ||
| 6 | - set(kaldi_native_fbank_HASH "SHA256=f4bd7d53fe8aeaecc4eda9680c72696bb86bf74e86371d81aacacd6f4ca3914d") | 4 | + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.3.tar.gz") |
| 5 | + set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.3.tar.gz") | ||
| 6 | + set(kaldi_native_fbank_HASH "SHA256=d409eddae5a46dc796f0841880f489ff0728b96ae26218702cd438c28667c70e") | ||
| 7 | 7 | ||
| 8 | set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) | 8 | set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) |
| 9 | set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) | 9 | set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) |
| @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) | @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) | ||
| 12 | # If you don't have access to the Internet, | 12 | # If you don't have access to the Internet, |
| 13 | # please pre-download kaldi-native-fbank | 13 | # please pre-download kaldi-native-fbank |
| 14 | set(possible_file_locations | 14 | set(possible_file_locations |
| 15 | - $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.2.tar.gz | ||
| 16 | - ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.2.tar.gz | ||
| 17 | - ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.2.tar.gz | ||
| 18 | - /tmp/kaldi-native-fbank-1.21.2.tar.gz | ||
| 19 | - /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.2.tar.gz | 15 | + $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.3.tar.gz |
| 16 | + ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.3.tar.gz | ||
| 17 | + ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.3.tar.gz | ||
| 18 | + /tmp/kaldi-native-fbank-1.21.3.tar.gz | ||
| 19 | + /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.3.tar.gz | ||
| 20 | ) | 20 | ) |
| 21 | 21 | ||
| 22 | foreach(f IN LISTS possible_file_locations) | 22 | foreach(f IN LISTS possible_file_locations) |
| 1 | // cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc | 1 | // cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc |
| 2 | // Copyright (c) 2025 Xiaomi Corporation | 2 | // Copyright (c) 2025 Xiaomi Corporation |
| 3 | // | 3 | // |
| 4 | -// This file demonstrates how to use Zipformer transducer with sherpa-onnx's C++ API | ||
| 5 | -// for streaming speech recognition from a microphone. | 4 | +// This file demonstrates how to use Zipformer transducer with sherpa-onnx's C++ |
| 5 | +// API for streaming speech recognition from a microphone. | ||
| 6 | // | 6 | // |
| 7 | // clang-format off | 7 | // clang-format off |
| 8 | // | 8 | // |
| @@ -19,6 +19,12 @@ For instance, | @@ -19,6 +19,12 @@ For instance, | ||
| 19 | 19 | ||
| 20 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx | 20 | wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx |
| 21 | 21 | ||
| 22 | +or download ten-vad.onnx, for instance | ||
| 23 | + | ||
| 24 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx | ||
| 25 | + | ||
| 26 | +Please replace --silero-vad-model with --ten-vad-model below to use ten-vad. | ||
| 27 | + | ||
| 22 | (1) For paraformer | 28 | (1) For paraformer |
| 23 | 29 | ||
| 24 | ./python-api-examples/generate-subtitles.py \ | 30 | ./python-api-examples/generate-subtitles.py \ |
| @@ -124,8 +130,13 @@ def get_args(): | @@ -124,8 +130,13 @@ def get_args(): | ||
| 124 | parser.add_argument( | 130 | parser.add_argument( |
| 125 | "--silero-vad-model", | 131 | "--silero-vad-model", |
| 126 | type=str, | 132 | type=str, |
| 127 | - required=True, | ||
| 128 | - help="Path to silero_vad.onnx", | 133 | + help="Path to silero_vad.onnx.", |
| 134 | + ) | ||
| 135 | + | ||
| 136 | + parser.add_argument( | ||
| 137 | + "--ten-vad-model", | ||
| 138 | + type=str, | ||
| 139 | + help="Path to ten-vad.onnx", | ||
| 129 | ) | 140 | ) |
| 130 | 141 | ||
| 131 | parser.add_argument( | 142 | parser.add_argument( |
| @@ -499,7 +510,12 @@ class Segment: | @@ -499,7 +510,12 @@ class Segment: | ||
| 499 | def main(): | 510 | def main(): |
| 500 | args = get_args() | 511 | args = get_args() |
| 501 | assert_file_exists(args.tokens) | 512 | assert_file_exists(args.tokens) |
| 502 | - assert_file_exists(args.silero_vad_model) | 513 | + if args.silero_vad_model: |
| 514 | + assert_file_exists(args.silero_vad_model) | ||
| 515 | + elif args.ten_vad_model: | ||
| 516 | + assert_file_exists(args.ten_vad_model) | ||
| 517 | + else: | ||
| 518 | + raise ValueError("You need to supply one vad model") | ||
| 503 | 519 | ||
| 504 | assert args.num_threads > 0, args.num_threads | 520 | assert args.num_threads > 0, args.num_threads |
| 505 | 521 | ||
| @@ -536,18 +552,34 @@ def main(): | @@ -536,18 +552,34 @@ def main(): | ||
| 536 | stream = recognizer.create_stream() | 552 | stream = recognizer.create_stream() |
| 537 | 553 | ||
| 538 | config = sherpa_onnx.VadModelConfig() | 554 | config = sherpa_onnx.VadModelConfig() |
| 539 | - config.silero_vad.model = args.silero_vad_model | ||
| 540 | - config.silero_vad.threshold = 0.5 | ||
| 541 | - config.silero_vad.min_silence_duration = 0.25 # seconds | ||
| 542 | - config.silero_vad.min_speech_duration = 0.25 # seconds | ||
| 543 | - | ||
| 544 | - # If the current segment is larger than this value, then it increases | ||
| 545 | - # the threshold to 0.9 internally. After detecting this segment, | ||
| 546 | - # it resets the threshold to its original value. | ||
| 547 | - config.silero_vad.max_speech_duration = 5 # seconds | ||
| 548 | - config.sample_rate = args.sample_rate | ||
| 549 | - | ||
| 550 | - window_size = config.silero_vad.window_size | 555 | + if args.silero_vad_model: |
| 556 | + config.silero_vad.model = args.silero_vad_model | ||
| 557 | + config.silero_vad.threshold = 0.2 | ||
| 558 | + config.silero_vad.min_silence_duration = 0.25 # seconds | ||
| 559 | + config.silero_vad.min_speech_duration = 0.25 # seconds | ||
| 560 | + | ||
| 561 | + # If the current segment is larger than this value, then it increases | ||
| 562 | + # the threshold to 0.9 internally. After detecting this segment, | ||
| 563 | + # it resets the threshold to its original value. | ||
| 564 | + config.silero_vad.max_speech_duration = 5 # seconds | ||
| 565 | + config.sample_rate = args.sample_rate | ||
| 566 | + | ||
| 567 | + window_size = config.silero_vad.window_size | ||
| 568 | + print("use silero-vad") | ||
| 569 | + else: | ||
| 570 | + config.ten_vad.model = args.ten_vad_model | ||
| 571 | + config.ten_vad.threshold = 0.2 | ||
| 572 | + config.ten_vad.min_silence_duration = 0.25 # seconds | ||
| 573 | + config.ten_vad.min_speech_duration = 0.25 # seconds | ||
| 574 | + | ||
| 575 | + # If the current segment is larger than this value, then it increases | ||
| 576 | + # the threshold to 0.9 internally. After detecting this segment, | ||
| 577 | + # it resets the threshold to its original value. | ||
| 578 | + config.ten_vad.max_speech_duration = 5 # seconds | ||
| 579 | + config.sample_rate = args.sample_rate | ||
| 580 | + | ||
| 581 | + window_size = config.ten_vad.window_size | ||
| 582 | + print("use ten-vad") | ||
| 551 | 583 | ||
| 552 | buffer = [] | 584 | buffer = [] |
| 553 | vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100) | 585 | vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100) |
| @@ -123,6 +123,8 @@ set(sources | @@ -123,6 +123,8 @@ set(sources | ||
| 123 | spoken-language-identification.cc | 123 | spoken-language-identification.cc |
| 124 | stack.cc | 124 | stack.cc |
| 125 | symbol-table.cc | 125 | symbol-table.cc |
| 126 | + ten-vad-model-config.cc | ||
| 127 | + ten-vad-model.cc | ||
| 126 | text-utils.cc | 128 | text-utils.cc |
| 127 | transducer-keyword-decoder.cc | 129 | transducer-keyword-decoder.cc |
| 128 | transpose.cc | 130 | transpose.cc |
| @@ -40,7 +40,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) { | @@ -40,7 +40,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) { | ||
| 40 | "to the silero VAD model. WARNING! Silero VAD models were trained using " | 40 | "to the silero VAD model. WARNING! Silero VAD models were trained using " |
| 41 | "512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples " | 41 | "512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples " |
| 42 | "for 8000 sample rate. Values other than these may affect model " | 42 | "for 8000 sample rate. Values other than these may affect model " |
| 43 | - "perfomance!"); | 43 | + "performance!"); |
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | bool SileroVadModelConfig::Validate() const { | 46 | bool SileroVadModelConfig::Validate() const { |
| @@ -24,7 +24,6 @@ struct SileroVadModelConfig { | @@ -24,7 +24,6 @@ struct SileroVadModelConfig { | ||
| 24 | float min_speech_duration = 0.25; // in seconds | 24 | float min_speech_duration = 0.25; // in seconds |
| 25 | 25 | ||
| 26 | // 512, 1024, 1536 samples for 16000 Hz | 26 | // 512, 1024, 1536 samples for 16000 Hz |
| 27 | - // 256, 512, 768 samples for 800 Hz | ||
| 28 | int32_t window_size = 512; // in samples | 27 | int32_t window_size = 512; // in samples |
| 29 | 28 | ||
| 30 | // If a speech segment is longer than this value, then we increase | 29 | // If a speech segment is longer than this value, then we increase |
sherpa-onnx/csrc/ten-vad-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/ten-vad-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/ten-vad-model-config.h" | ||
| 6 | + | ||
| 7 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 8 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void TenVadModelConfig::Register(ParseOptions *po) { | ||
| 13 | + po->Register("ten-vad-model", &model, "Path to TEN VAD ONNX model."); | ||
| 14 | + | ||
| 15 | + po->Register("ten-vad-threshold", &threshold, | ||
| 16 | + "Speech threshold. TEN VAD outputs speech probabilities for " | ||
| 17 | + "each audio chunk, probabilities ABOVE this value are " | ||
| 18 | + "considered as SPEECH. It is better to tune this parameter for " | ||
| 19 | + "each dataset separately, but lazy " | ||
| 20 | + "0.5 is pretty good for most datasets."); | ||
| 21 | + | ||
| 22 | + po->Register("ten-vad-min-silence-duration", &min_silence_duration, | ||
| 23 | + "In seconds. In the end of each speech chunk wait for " | ||
| 24 | + "--ten-vad-min-silence-duration seconds before separating it"); | ||
| 25 | + | ||
| 26 | + po->Register("ten-vad-min-speech-duration", &min_speech_duration, | ||
| 27 | + "In seconds. In the end of each silence chunk wait for " | ||
| 28 | + "--ten-vad-min-speech-duration seconds before separating it"); | ||
| 29 | + | ||
| 30 | + po->Register( | ||
| 31 | + "ten-vad-max-speech-duration", &max_speech_duration, | ||
| 32 | + "In seconds. If a speech segment is longer than this value, then we " | ||
| 33 | + "increase the threshold to 0.9. After finishing detecting the segment, " | ||
| 34 | + "the threshold value is reset to its original value."); | ||
| 35 | + | ||
| 36 | + po->Register( | ||
| 37 | + "ten-vad-window-size", &window_size, | ||
| 38 | + "In samples. Audio chunks of --ten-vad-window-size samples are fed " | ||
| 39 | + "to the ten VAD model. WARNING! Please use 160 or 256 "); | ||
| 40 | +} | ||
| 41 | + | ||
| 42 | +bool TenVadModelConfig::Validate() const { | ||
| 43 | + if (model.empty()) { | ||
| 44 | + SHERPA_ONNX_LOGE("Please provide --ten-vad-model"); | ||
| 45 | + return false; | ||
| 46 | + } | ||
| 47 | + | ||
| 48 | + if (!FileExists(model)) { | ||
| 49 | + SHERPA_ONNX_LOGE("TEN vad model file '%s' does not exist", model.c_str()); | ||
| 50 | + return false; | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | + if (threshold < 0.01) { | ||
| 54 | + SHERPA_ONNX_LOGE( | ||
| 55 | + "Please use a larger value for --ten-vad-threshold. Given: %f", | ||
| 56 | + threshold); | ||
| 57 | + return false; | ||
| 58 | + } | ||
| 59 | + | ||
| 60 | + if (threshold >= 1) { | ||
| 61 | + SHERPA_ONNX_LOGE( | ||
| 62 | + "Please use a smaller value for --ten-vad-threshold. Given: %f", | ||
| 63 | + threshold); | ||
| 64 | + return false; | ||
| 65 | + } | ||
| 66 | + | ||
| 67 | + if (min_silence_duration <= 0) { | ||
| 68 | + SHERPA_ONNX_LOGE( | ||
| 69 | + "Please use a larger value for --ten-vad-min-silence-duration. " | ||
| 70 | + "Given: " | ||
| 71 | + "%f", | ||
| 72 | + min_silence_duration); | ||
| 73 | + return false; | ||
| 74 | + } | ||
| 75 | + | ||
| 76 | + if (min_speech_duration <= 0) { | ||
| 77 | + SHERPA_ONNX_LOGE( | ||
| 78 | + "Please use a larger value for --ten-vad-min-speech-duration. " | ||
| 79 | + "Given: " | ||
| 80 | + "%f", | ||
| 81 | + min_speech_duration); | ||
| 82 | + return false; | ||
| 83 | + } | ||
| 84 | + | ||
| 85 | + if (max_speech_duration <= 0) { | ||
| 86 | + SHERPA_ONNX_LOGE( | ||
| 87 | + "Please use a larger value for --ten-vad-max-speech-duration. " | ||
| 88 | + "Given: " | ||
| 89 | + "%f", | ||
| 90 | + max_speech_duration); | ||
| 91 | + return false; | ||
| 92 | + } | ||
| 93 | + | ||
| 94 | + return true; | ||
| 95 | +} | ||
| 96 | + | ||
| 97 | +std::string TenVadModelConfig::ToString() const { | ||
| 98 | + std::ostringstream os; | ||
| 99 | + | ||
| 100 | + os << "TenVadModelConfig("; | ||
| 101 | + os << "model=\"" << model << "\", "; | ||
| 102 | + os << "threshold=" << threshold << ", "; | ||
| 103 | + os << "min_silence_duration=" << min_silence_duration << ", "; | ||
| 104 | + os << "min_speech_duration=" << min_speech_duration << ", "; | ||
| 105 | + os << "max_speech_duration=" << max_speech_duration << ", "; | ||
| 106 | + os << "window_size=" << window_size << ")"; | ||
| 107 | + | ||
| 108 | + return os.str(); | ||
| 109 | +} | ||
| 110 | + | ||
| 111 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/ten-vad-model-config.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/ten-vad-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_ | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
// Configuration for the TEN VAD model.
//
// Mirrors SileroVadModelConfig but with TEN-VAD-specific defaults
// (window_size 256, min_silence_duration 0.5 s, max_speech_duration 20 s).
struct TenVadModelConfig {
  // Path to the TEN VAD ONNX model file.
  std::string model;

  // threshold to classify a segment as speech
  //
  // If the predicted probability of a segment is larger than this
  // value, then it is classified as speech.
  float threshold = 0.5;

  // Wait this long after speech ends before closing a segment.
  float min_silence_duration = 0.5;  // in seconds

  // A speech region must last at least this long to count as speech.
  float min_speech_duration = 0.25;  // in seconds

  // Number of samples fed to the model per chunk at 16 kHz.
  // 160 or 256
  int32_t window_size = 256;  // in samples

  // If a speech segment is longer than this value, then we increase
  // the threshold to 0.9. After finishing detecting the segment,
  // the threshold value is reset to its original value.
  float max_speech_duration = 20;  // in seconds

  TenVadModelConfig() = default;

  // Registers the --ten-vad-* command-line options.
  void Register(ParseOptions *po);

  // Returns true if the configuration is usable (model file exists,
  // thresholds/durations are in range).
  bool Validate() const;

  // Human-readable representation for logging.
  std::string ToString() const;
};
| 42 | + | ||
| 43 | +} // namespace sherpa_onnx | ||
| 44 | + | ||
| 45 | +#endif // SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_ |
sherpa-onnx/csrc/ten-vad-model.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/ten-vad-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/ten-vad-model.h" | ||
| 6 | + | ||
| 7 | +#include <algorithm> | ||
| 8 | +#include <cmath> | ||
| 9 | +#include <cstring> | ||
| 10 | +#include <memory> | ||
| 11 | +#include <string> | ||
| 12 | +#include <utility> | ||
| 13 | +#include <vector> | ||
| 14 | + | ||
| 15 | +#if __ANDROID_API__ >= 9 | ||
| 16 | +#include "android/asset_manager.h" | ||
| 17 | +#include "android/asset_manager_jni.h" | ||
| 18 | +#endif | ||
| 19 | + | ||
| 20 | +#if __OHOS__ | ||
| 21 | +#include "rawfile/raw_file_manager.h" | ||
| 22 | +#endif | ||
| 23 | + | ||
| 24 | +#include "kaldi-native-fbank/csrc/mel-computations.h" | ||
| 25 | +#include "kaldi-native-fbank/csrc/rfft.h" | ||
| 26 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 27 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 28 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 29 | +#include "sherpa-onnx/csrc/session.h" | ||
| 30 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 31 | + | ||
| 32 | +namespace sherpa_onnx { | ||
| 33 | + | ||
// Implementation (pimpl) of TenVadModel.
//
// Runs the TEN VAD ONNX model on 16 kHz audio. Each call to IsSpeech()
// receives exactly one window of samples, computes a 41-dim feature frame
// (40 log-mel bins + one pitch slot), feeds a sliding context of the last
// 3 frames to the model together with 4 recurrent state tensors, and then
// applies a speech/silence hysteresis state machine to the returned
// probability.
class TenVadModel::Impl {
 public:
  explicit Impl(const VadModelConfig &config)
      : config_(config),
        rfft_(1024),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{},
        sample_rate_(config.sample_rate) {
    auto buf = ReadFile(config.ten_vad.model);
    Init(buf.data(), buf.size());
  }

  // Same as the ctor above, but loads the model through a platform asset
  // manager (Android AAssetManager / OHOS NativeResourceManager).
  template <typename Manager>
  Impl(Manager *mgr, const VadModelConfig &config)
      : config_(config),
        rfft_(1024),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{},
        sample_rate_(config.sample_rate) {
    auto buf = ReadFile(mgr, config.ten_vad.model);
    Init(buf.data(), buf.size());
  }

  // Resets the detector to its initial state: clears the speech/silence
  // state machine, zeroes the 3-frame feature context and the recurrent
  // model states.
  void Reset() {
    triggered_ = false;
    current_sample_ = 0;
    temp_start_ = 0;
    temp_end_ = 0;

    last_sample_ = 0;

    // 3 stacked frames of 41 features each, all zeros.
    last_features_.resize(3 * 41);
    std::fill(last_features_.begin(), last_features_.end(), 0.0f);
    tmp_samples_.resize(1024);

    ResetStates();
  }

  // Classifies one window of samples as speech/non-speech.
  //
  // `n` must equal WindowSize(); anything else is a programming error and
  // aborts the process. The return value already includes the
  // min-speech/min-silence hysteresis, so a single noisy probability does
  // not flip the decision.
  bool IsSpeech(const float *samples, int32_t n) {
    if (n != WindowSize()) {
      SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize());
      SHERPA_ONNX_EXIT(-1);
    }

    float prob = Run(samples, n);

    float threshold = config_.ten_vad.threshold;

    current_sample_ += config_.ten_vad.window_size;

    // Speech resumed before the pending silence ran out; cancel it.
    if (prob > threshold && temp_end_ != 0) {
      temp_end_ = 0;
    }

    if (prob > threshold && temp_start_ == 0) {
      // start speaking, but we require that it must satisfy
      // min_speech_duration
      temp_start_ = current_sample_;
      return false;
    }

    if (prob > threshold && temp_start_ != 0 && !triggered_) {
      if (current_sample_ - temp_start_ < min_speech_samples_) {
        return false;
      }

      triggered_ = true;

      return true;
    }

    if ((prob < threshold) && !triggered_) {
      // silence
      temp_start_ = 0;
      temp_end_ = 0;
      return false;
    }

    // Hysteresis: once triggered, a slightly lower probability
    // (threshold - 0.15) still counts as continued speech.
    if ((prob > threshold - 0.15) && triggered_) {
      // speaking
      return true;
    }

    if ((prob > threshold) && !triggered_) {
      // start speaking
      triggered_ = true;

      return true;
    }

    if ((prob < threshold) && triggered_) {
      // stop to speak
      if (temp_end_ == 0) {
        temp_end_ = current_sample_;
      }

      if (current_sample_ - temp_end_ < min_silence_samples_) {
        // continue speaking
        return true;
      }
      // stopped speaking
      temp_start_ = 0;
      temp_end_ = 0;
      triggered_ = false;
      return false;
    }

    return false;
  }

  // TEN VAD uses non-overlapping windows: shift == size.
  int32_t WindowShift() const { return config_.ten_vad.window_size; }

  int32_t WindowSize() const { return config_.ten_vad.window_size; }

  int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }

  int32_t MinSpeechDurationSamples() const { return min_speech_samples_; }

  void SetMinSilenceDuration(float s) {
    min_silence_samples_ = sample_rate_ * s;
  }

  void SetThreshold(float threshold) { config_.ten_vad.threshold = threshold; }

 private:
  // Validates the config, creates the ORT session, loads the mel filter
  // bank, checks the model metadata, and resets all state.
  void Init(void *model_data, size_t model_data_length) {
    // The feature pipeline (1024-point FFT, mel bank layout) is hard-coded
    // for 16 kHz input.
    if (sample_rate_ != 16000) {
      SHERPA_ONNX_LOGE("Expected sample rate 16000. Given: %d",
                       config_.sample_rate);
      SHERPA_ONNX_EXIT(-1);
    }

    // window_ (read from model metadata) has 768 entries, so larger
    // windows cannot be processed.
    if (config_.ten_vad.window_size > 768) {
      SHERPA_ONNX_LOGE("Windows size %d for ten-vad is too large",
                       config_.ten_vad.window_size);
      SHERPA_ONNX_EXIT(-1);
    }

    min_silence_samples_ = sample_rate_ * config_.ten_vad.min_silence_duration;

    min_speech_samples_ = sample_rate_ * config_.ten_vad.min_speech_duration;

    sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
                                           sess_opts_);

    GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
    GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);

    InitMelBanks();

    Check();

    Reset();
  }

  // Allocates and zeroes the 4 recurrent state tensors, each of shape
  // (1, 64).
  void ResetStates() {
    std::array<int64_t, 2> shape{1, 64};

    states_.clear();
    states_.reserve(4);
    for (int32_t i = 0; i != 4; ++i) {
      Ort::Value s = Ort::Value::CreateTensor<float>(allocator_, shape.data(),
                                                     shape.size());

      Fill<float>(&s, 0);
      states_.push_back(std::move(s));
    }
  }

  // Builds a librosa-style 40-bin mel filter bank covering 0-8000 Hz for
  // a 1024-point FFT.
  void InitMelBanks() {
    knf::FrameExtractionOptions frame_opts;

    // 16 kHz, so num_fft is 16000*64/1000 = 1024
    frame_opts.frame_length_ms = 64;

    knf::MelBanksOptions mel_opts;
    mel_opts.is_librosa = true;
    mel_opts.norm = "";
    mel_opts.use_slaney_mel_scale = true;
    mel_opts.floor_to_int_bin = true;
    mel_opts.low_freq = 0;
    mel_opts.high_freq = 8000;
    mel_opts.num_bins = 40;

    mel_banks_ = std::make_unique<knf::MelBanks>(mel_opts, frame_opts, 1.0f);

    // 40 mel bins + 1 pitch slot (see ComputeFeatures()).
    features_.resize(41);
  }

  // Verifies the model metadata (must be a sherpa-onnx re-exported
  // ten-vad model) and loads the normalization stats and analysis window.
  void Check() {
    // get meta data
    Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
    if (config_.debug) {
      std::ostringstream os;
      os << "---ten-vad---\n";
      PrintModelMetadata(os, meta_data);
#if __OHOS__
      SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
#else
      SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
#endif
    }
    Ort::AllocatorWithDefaultOptions allocator;  // used in the macro below

    std::string model_type;
    SHERPA_ONNX_READ_META_DATA_STR_ALLOW_EMPTY(model_type, "model_type");

    if (model_type.empty()) {
      SHERPA_ONNX_LOGE(
          "Please download ten-vad.onnx or ten-vad.int8.onnx from\n"
          "https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models"
          "\nWe have added meta data to the original ten-vad.onnx from\n"
          "https://github.com/TEN-framework/ten-vad");
      SHERPA_ONNX_EXIT(-1);
    }

    if (model_type != "ten-vad") {
      SHERPA_ONNX_LOGE("Expect model type 'ten-vad', given '%s'",
                       model_type.c_str());
      SHERPA_ONNX_EXIT(-1);
    }

    SHERPA_ONNX_READ_META_DATA_VEC_FLOAT(mean_, "mean");
    SHERPA_ONNX_READ_META_DATA_VEC_FLOAT(inv_stddev_, "inv_stddev");
    SHERPA_ONNX_READ_META_DATA_VEC_FLOAT(window_, "window");

    if (mean_.size() != 41) {
      SHERPA_ONNX_LOGE(
          "Incorrect size of the mean vector. Given %d, expected 41",
          static_cast<int32_t>(mean_.size()));
      SHERPA_ONNX_EXIT(-1);
    }

    if (inv_stddev_.size() != 41) {
      SHERPA_ONNX_LOGE(
          "Incorrect size of the inv_stddev vector. Given %d, expected 41",
          static_cast<int32_t>(inv_stddev_.size()));
      SHERPA_ONNX_EXIT(-1);
    }

    if (window_.size() != 768) {
      SHERPA_ONNX_LOGE(
          "Incorrect size of the window vector. Given %d, expected 768",
          static_cast<int32_t>(window_.size()));
      SHERPA_ONNX_EXIT(-1);
    }
  }

  // Converts [-1, 1] float samples to 16-bit integer scale.
  static void Scale(const float *samples, int32_t n, float *out) {
    for (int32_t i = 0; i != n; ++i) {
      out[i] = samples[i] * 32768;
    }
  }

  // Applies a 0.97 pre-emphasis filter in place. last_sample_ carries the
  // final sample of the previous window across calls so the filter is
  // continuous at window boundaries.
  void Preemphasis(const float *samples, int32_t n, float *out) {
    // Save samples[n-1] before it may be overwritten (out can alias
    // samples).
    float t = samples[n - 1];

    for (int32_t i = n - 1; i > 0; --i) {
      out[i] = samples[i] - 0.97 * samples[i - 1];
    }

    out[0] = samples[0] - 0.97 * last_sample_;

    last_sample_ = t;
  }

  static void ApplyWindow(const float *samples, const float *window, int32_t n,
                          float *out) {
    for (int32_t i = 0; i != n; ++i) {
      out[i] = samples[i] * window[i];
    }
  }

  // Converts packed real-FFT output to a power spectrum.
  //
  // knf::Rfft packs the DC component at index 0 and the Nyquist component
  // at index 1; out[n-1] receives the Nyquist power. In-place use
  // (out == fft_bins) is safe because every read index stays ahead of the
  // corresponding write index.
  static void ComputePowerSpectrum(const float *fft_bins, int32_t n,
                                   float *out) {
    out[0] = fft_bins[0] * fft_bins[0];
    out[n - 1] = fft_bins[1] * fft_bins[1];

    for (int32_t i = 1; i < n / 2; ++i) {
      float real = fft_bins[2 * i];
      float imag = fft_bins[2 * i + 1];
      out[i] = real * real + imag * imag;
    }
  }

  // Takes the log of each mel energy and subtracts log(32768^2) to undo
  // the 16-bit scaling applied in Scale().
  // NOTE(review): std::logf is not declared by <cmath> on some older
  // toolchains; std::log (float overload) would be more portable — verify
  // against the project's minimum compiler versions.
  static void LogMel(const float *in, int32_t n, float *out) {
    for (int32_t i = 0; i != n; ++i) {
      // 20.79441541679836 is log(32768*32768)
      out[i] = std::logf(in[i] + 1e-10) - 20.79441541679836f;
    }
  }

  // Applies per-feature mean/inv-stddev normalization loaded from the
  // model metadata.
  void ApplyNormalization(const float *in, float *out) const {
    for (int32_t i = 0; i != static_cast<int32_t>(mean_.size()); ++i) {
      out[i] = (in[i] - mean_[i]) * inv_stddev_[i];
    }
  }

  // Computes one 41-dim feature frame from the current window and pushes
  // it into the 3-frame sliding context last_features_.
  //
  // Pipeline: zero-pad to 1024 -> int16 scale -> pre-emphasis -> window
  // -> real FFT -> power spectrum -> mel bank -> log -> normalization.
  void ComputeFeatures(const float *samples, int32_t n) {
    // Zero the tail so the FFT sees n real samples padded to 1024.
    std::fill(tmp_samples_.begin() + n, tmp_samples_.end(), 0.0f);

    Scale(samples, n, tmp_samples_.data());

    Preemphasis(tmp_samples_.data(), n, tmp_samples_.data());
    ApplyWindow(tmp_samples_.data(), window_.data(), n, tmp_samples_.data());

    rfft_.Compute(tmp_samples_.data());
    auto &power_spectrum = tmp_samples_;
    ComputePowerSpectrum(tmp_samples_.data(), tmp_samples_.size(),
                         power_spectrum.data());

    // note only the first half of power_spectrum is used inside Compute()
    mel_banks_->Compute(power_spectrum.data(), features_.data());
    LogMel(features_.data(), static_cast<int32_t>(features_.size()) - 1,
           features_.data());

    // Note(fangjun): The ten-vad model expects a pitch feature, but we set it
    // to 0 as a simplification. This may reduce performance as noted
    // in the PR #2377
    features_.back() = 0;

    ApplyNormalization(features_.data(), features_.data());

    // Shift frames 1 and 2 down to slots 0 and 1, then append the new
    // frame at slot 2 (last_features_ is (3, 41), row major).
    std::memmove(last_features_.data(),
                 last_features_.data() + features_.size(),
                 2 * features_.size() * sizeof(float));
    std::copy(features_.begin(), features_.end(),
              last_features_.begin() + 2 * features_.size());
  }

  // Runs one inference step and returns the speech probability.
  //
  // Inputs are the 3-frame feature context plus the 4 recurrent states;
  // output 0 is the probability and outputs 1..4 become the new states
  // for the next call.
  float Run(const float *samples, int32_t n) {
    ComputeFeatures(samples, n);

    auto memory_info =
        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

    std::array<int64_t, 3> x_shape = {1, 3, 41};

    Ort::Value x = Ort::Value::CreateTensor(memory_info, last_features_.data(),
                                            last_features_.size(),
                                            x_shape.data(), x_shape.size());

    std::vector<Ort::Value> inputs;
    inputs.reserve(input_names_.size());

    // states_ entries are moved into the input list and re-populated from
    // the outputs below.
    inputs.push_back(std::move(x));
    for (auto &s : states_) {
      inputs.push_back(std::move(s));
    }

    auto out =
        sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
                   output_names_ptr_.data(), output_names_ptr_.size());

    for (int32_t i = 1; i != static_cast<int32_t>(output_names_.size()); ++i) {
      states_[i - 1] = std::move(out[i]);
    }

    float prob = out[0].GetTensorData<float>()[0];

    return prob;
  }

 private:
  VadModelConfig config_;
  knf::Rfft rfft_;  // 1024-point real FFT
  std::unique_ptr<knf::MelBanks> mel_banks_;

  Ort::Env env_;
  Ort::SessionOptions sess_opts_;
  Ort::AllocatorWithDefaultOptions allocator_;

  std::unique_ptr<Ort::Session> sess_;

  std::vector<std::string> input_names_;
  std::vector<const char *> input_names_ptr_;

  std::vector<std::string> output_names_;
  std::vector<const char *> output_names_ptr_;

  // Recurrent model states, 4 tensors of shape (1, 64).
  std::vector<Ort::Value> states_;
  int64_t sample_rate_;
  int32_t min_silence_samples_;
  int32_t min_speech_samples_;

  // --- speech/silence state machine ---
  bool triggered_ = false;        // currently inside a speech segment
  int32_t current_sample_ = 0;    // running sample counter
  int32_t temp_start_ = 0;        // tentative speech start (sample index)
  int32_t temp_end_ = 0;          // tentative speech end (sample index)

  // Last sample of the previous window, for continuous pre-emphasis.
  float last_sample_ = 0;

  // Loaded from model metadata in Check().
  std::vector<float> mean_;        // (41,) feature means
  std::vector<float> inv_stddev_;  // (41,) inverse standard deviations
  std::vector<float> window_;      // (768,) analysis window

  std::vector<float> features_;       // (41,) current frame
  std::vector<float> last_features_;  // (3, 41), row major
  std::vector<float> tmp_samples_;    // (1024,)
};
| 436 | + | ||
| 437 | +TenVadModel::TenVadModel(const VadModelConfig &config) | ||
| 438 | + : impl_(std::make_unique<Impl>(config)) {} | ||
| 439 | + | ||
| 440 | +template <typename Manager> | ||
| 441 | +TenVadModel::TenVadModel(Manager *mgr, const VadModelConfig &config) | ||
| 442 | + : impl_(std::make_unique<Impl>(mgr, config)) {} | ||
| 443 | + | ||
| 444 | +TenVadModel::~TenVadModel() = default; | ||
| 445 | + | ||
| 446 | +void TenVadModel::Reset() { return impl_->Reset(); } | ||
| 447 | + | ||
| 448 | +bool TenVadModel::IsSpeech(const float *samples, int32_t n) { | ||
| 449 | + return impl_->IsSpeech(samples, n); | ||
| 450 | +} | ||
| 451 | + | ||
| 452 | +int32_t TenVadModel::WindowSize() const { return impl_->WindowSize(); } | ||
| 453 | + | ||
| 454 | +int32_t TenVadModel::WindowShift() const { return impl_->WindowShift(); } | ||
| 455 | + | ||
| 456 | +int32_t TenVadModel::MinSilenceDurationSamples() const { | ||
| 457 | + return impl_->MinSilenceDurationSamples(); | ||
| 458 | +} | ||
| 459 | + | ||
| 460 | +int32_t TenVadModel::MinSpeechDurationSamples() const { | ||
| 461 | + return impl_->MinSpeechDurationSamples(); | ||
| 462 | +} | ||
| 463 | + | ||
| 464 | +void TenVadModel::SetMinSilenceDuration(float s) { | ||
| 465 | + impl_->SetMinSilenceDuration(s); | ||
| 466 | +} | ||
| 467 | + | ||
| 468 | +void TenVadModel::SetThreshold(float threshold) { | ||
| 469 | + impl_->SetThreshold(threshold); | ||
| 470 | +} | ||
| 471 | + | ||
| 472 | +#if __ANDROID_API__ >= 9 | ||
| 473 | +template TenVadModel::TenVadModel(AAssetManager *mgr, | ||
| 474 | + const VadModelConfig &config); | ||
| 475 | +#endif | ||
| 476 | + | ||
| 477 | +#if __OHOS__ | ||
| 478 | +template TenVadModel::TenVadModel(NativeResourceManager *mgr, | ||
| 479 | + const VadModelConfig &config); | ||
| 480 | +#endif | ||
| 481 | + | ||
| 482 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/ten-vad-model.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/ten-vad-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_TEN_VAD_MODEL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_TEN_VAD_MODEL_H_ | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/vad-model.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +class TenVadModel : public VadModel { | ||
| 14 | + public: | ||
| 15 | + explicit TenVadModel(const VadModelConfig &config); | ||
| 16 | + | ||
| 17 | + template <typename Manager> | ||
| 18 | + TenVadModel(Manager *mgr, const VadModelConfig &config); | ||
| 19 | + | ||
| 20 | + ~TenVadModel() override; | ||
| 21 | + | ||
| 22 | + // reset the internal model states | ||
| 23 | + void Reset() override; | ||
| 24 | + | ||
| 25 | + /** | ||
| 26 | + * @param samples Pointer to a 1-d array containing audio samples. | ||
| 27 | + * Each sample should be normalized to the range [-1, 1]. | ||
| 28 | + * @param n Number of samples. | ||
| 29 | + * | ||
| 30 | + * @return Return true if speech is detected. Return false otherwise. | ||
| 31 | + */ | ||
| 32 | + bool IsSpeech(const float *samples, int32_t n) override; | ||
| 33 | + | ||
| 34 | + // 256 or 160 | ||
| 35 | + int32_t WindowSize() const override; | ||
| 36 | + | ||
| 37 | + // 256 or 128 | ||
| 38 | + int32_t WindowShift() const override; | ||
| 39 | + | ||
| 40 | + int32_t MinSilenceDurationSamples() const override; | ||
| 41 | + int32_t MinSpeechDurationSamples() const override; | ||
| 42 | + | ||
| 43 | + void SetMinSilenceDuration(float s) override; | ||
| 44 | + void SetThreshold(float threshold) override; | ||
| 45 | + | ||
| 46 | + private: | ||
| 47 | + class Impl; | ||
| 48 | + std::unique_ptr<Impl> impl_; | ||
| 49 | +}; | ||
| 50 | + | ||
| 51 | +} // namespace sherpa_onnx | ||
| 52 | + | ||
| 53 | +#endif // SHERPA_ONNX_CSRC_TEN_VAD_MODEL_H_ |
| @@ -10,9 +10,9 @@ namespace sherpa_onnx { | @@ -10,9 +10,9 @@ namespace sherpa_onnx { | ||
| 10 | /** Transpose a 3-D tensor from shape (B, T, C) to (T, B, C). | 10 | /** Transpose a 3-D tensor from shape (B, T, C) to (T, B, C). |
| 11 | * | 11 | * |
| 12 | * @param allocator | 12 | * @param allocator |
| 13 | - * @param v A 3-D tensor of shape (B, T, C). Its dataype is type. | 13 | + * @param v A 3-D tensor of shape (B, T, C). Its data type is type. |
| 14 | * | 14 | * |
| 15 | - * @return Return a 3-D tensor of shape (T, B, C). Its datatype is type. | 15 | + * @return Return a 3-D tensor of shape (T, B, C). Its data type is type. |
| 16 | */ | 16 | */ |
| 17 | template <typename type = float> | 17 | template <typename type = float> |
| 18 | Ort::Value Transpose01(OrtAllocator *allocator, const Ort::Value *v); | 18 | Ort::Value Transpose01(OrtAllocator *allocator, const Ort::Value *v); |
| @@ -20,9 +20,9 @@ Ort::Value Transpose01(OrtAllocator *allocator, const Ort::Value *v); | @@ -20,9 +20,9 @@ Ort::Value Transpose01(OrtAllocator *allocator, const Ort::Value *v); | ||
| 20 | /** Transpose a 3-D tensor from shape (B, T, C) to (B, C, T). | 20 | /** Transpose a 3-D tensor from shape (B, T, C) to (B, C, T). |
| 21 | * | 21 | * |
| 22 | * @param allocator | 22 | * @param allocator |
| 23 | - * @param v A 3-D tensor of shape (B, T, C). Its dataype is type. | 23 | + * @param v A 3-D tensor of shape (B, T, C). Its data type is type. |
| 24 | * | 24 | * |
| 25 | - * @return Return a 3-D tensor of shape (B, C, T). Its datatype is type. | 25 | + * @return Return a 3-D tensor of shape (B, C, T). Its data type is type. |
| 26 | */ | 26 | */ |
| 27 | template <typename type = float> | 27 | template <typename type = float> |
| 28 | Ort::Value Transpose12(OrtAllocator *allocator, const Ort::Value *v); | 28 | Ort::Value Transpose12(OrtAllocator *allocator, const Ort::Value *v); |
| @@ -14,6 +14,7 @@ namespace sherpa_onnx { | @@ -14,6 +14,7 @@ namespace sherpa_onnx { | ||
| 14 | 14 | ||
| 15 | void VadModelConfig::Register(ParseOptions *po) { | 15 | void VadModelConfig::Register(ParseOptions *po) { |
| 16 | silero_vad.Register(po); | 16 | silero_vad.Register(po); |
| 17 | + ten_vad.Register(po); | ||
| 17 | 18 | ||
| 18 | po->Register("vad-sample-rate", &sample_rate, | 19 | po->Register("vad-sample-rate", &sample_rate, |
| 19 | "Sample rate expected by the VAD model"); | 20 | "Sample rate expected by the VAD model"); |
| @@ -48,7 +49,17 @@ bool VadModelConfig::Validate() const { | @@ -48,7 +49,17 @@ bool VadModelConfig::Validate() const { | ||
| 48 | } | 49 | } |
| 49 | } | 50 | } |
| 50 | 51 | ||
| 51 | - return silero_vad.Validate(); | 52 | + if (!silero_vad.model.empty()) { |
| 53 | + return silero_vad.Validate(); | ||
| 54 | + } | ||
| 55 | + | ||
| 56 | + if (!ten_vad.model.empty()) { | ||
| 57 | + return ten_vad.Validate(); | ||
| 58 | + } | ||
| 59 | + | ||
| 60 | + SHERPA_ONNX_LOGE("Please provide one VAD model."); | ||
| 61 | + | ||
| 62 | + return false; | ||
| 52 | } | 63 | } |
| 53 | 64 | ||
| 54 | std::string VadModelConfig::ToString() const { | 65 | std::string VadModelConfig::ToString() const { |
| @@ -56,6 +67,7 @@ std::string VadModelConfig::ToString() const { | @@ -56,6 +67,7 @@ std::string VadModelConfig::ToString() const { | ||
| 56 | 67 | ||
| 57 | os << "VadModelConfig("; | 68 | os << "VadModelConfig("; |
| 58 | os << "silero_vad=" << silero_vad.ToString() << ", "; | 69 | os << "silero_vad=" << silero_vad.ToString() << ", "; |
| 70 | + os << "ten_vad=" << ten_vad.ToString() << ", "; | ||
| 59 | os << "sample_rate=" << sample_rate << ", "; | 71 | os << "sample_rate=" << sample_rate << ", "; |
| 60 | os << "num_threads=" << num_threads << ", "; | 72 | os << "num_threads=" << num_threads << ", "; |
| 61 | os << "provider=\"" << provider << "\", "; | 73 | os << "provider=\"" << provider << "\", "; |
| @@ -8,11 +8,13 @@ | @@ -8,11 +8,13 @@ | ||
| 8 | 8 | ||
| 9 | #include "sherpa-onnx/csrc/parse-options.h" | 9 | #include "sherpa-onnx/csrc/parse-options.h" |
| 10 | #include "sherpa-onnx/csrc/silero-vad-model-config.h" | 10 | #include "sherpa-onnx/csrc/silero-vad-model-config.h" |
| 11 | +#include "sherpa-onnx/csrc/ten-vad-model-config.h" | ||
| 11 | 12 | ||
| 12 | namespace sherpa_onnx { | 13 | namespace sherpa_onnx { |
| 13 | 14 | ||
| 14 | struct VadModelConfig { | 15 | struct VadModelConfig { |
| 15 | SileroVadModelConfig silero_vad; | 16 | SileroVadModelConfig silero_vad; |
| 17 | + TenVadModelConfig ten_vad; | ||
| 16 | 18 | ||
| 17 | int32_t sample_rate = 16000; | 19 | int32_t sample_rate = 16000; |
| 18 | int32_t num_threads = 1; | 20 | int32_t num_threads = 1; |
| @@ -23,9 +25,11 @@ struct VadModelConfig { | @@ -23,9 +25,11 @@ struct VadModelConfig { | ||
| 23 | 25 | ||
| 24 | VadModelConfig() = default; | 26 | VadModelConfig() = default; |
| 25 | 27 | ||
| 26 | - VadModelConfig(const SileroVadModelConfig &silero_vad, int32_t sample_rate, | 28 | + VadModelConfig(const SileroVadModelConfig &silero_vad, |
| 29 | + const TenVadModelConfig &ten_vad, int32_t sample_rate, | ||
| 27 | int32_t num_threads, const std::string &provider, bool debug) | 30 | int32_t num_threads, const std::string &provider, bool debug) |
| 28 | : silero_vad(silero_vad), | 31 | : silero_vad(silero_vad), |
| 32 | + ten_vad(ten_vad), | ||
| 29 | sample_rate(sample_rate), | 33 | sample_rate(sample_rate), |
| 30 | num_threads(num_threads), | 34 | num_threads(num_threads), |
| 31 | provider(provider), | 35 | provider(provider), |
| @@ -19,13 +19,19 @@ | @@ -19,13 +19,19 @@ | ||
| 19 | 19 | ||
| 20 | #include "sherpa-onnx/csrc/macros.h" | 20 | #include "sherpa-onnx/csrc/macros.h" |
| 21 | #include "sherpa-onnx/csrc/silero-vad-model.h" | 21 | #include "sherpa-onnx/csrc/silero-vad-model.h" |
| 22 | +#include "sherpa-onnx/csrc/ten-vad-model.h" | ||
| 22 | 23 | ||
| 23 | namespace sherpa_onnx { | 24 | namespace sherpa_onnx { |
| 24 | 25 | ||
| 25 | std::unique_ptr<VadModel> VadModel::Create(const VadModelConfig &config) { | 26 | std::unique_ptr<VadModel> VadModel::Create(const VadModelConfig &config) { |
| 26 | if (config.provider == "rknn") { | 27 | if (config.provider == "rknn") { |
| 27 | #if SHERPA_ONNX_ENABLE_RKNN | 28 | #if SHERPA_ONNX_ENABLE_RKNN |
| 28 | - return std::make_unique<SileroVadModelRknn>(config); | 29 | + if (!config.silero_vad.model.empty()) { |
| 30 | + return std::make_unique<SileroVadModelRknn>(config); | ||
| 31 | + } else { | ||
| 32 | + SHERPA_ONNX_LOGE("Only silero-vad is supported for RKNN at present"); | ||
| 33 | + SHERPA_ONNX_EXIT(-1); | ||
| 34 | + } | ||
| 29 | #else | 35 | #else |
| 30 | SHERPA_ONNX_LOGE( | 36 | SHERPA_ONNX_LOGE( |
| 31 | "Please rebuild sherpa-onnx with -DSHERPA_ONNX_ENABLE_RKNN=ON if you " | 37 | "Please rebuild sherpa-onnx with -DSHERPA_ONNX_ENABLE_RKNN=ON if you " |
| @@ -34,7 +40,17 @@ std::unique_ptr<VadModel> VadModel::Create(const VadModelConfig &config) { | @@ -34,7 +40,17 @@ std::unique_ptr<VadModel> VadModel::Create(const VadModelConfig &config) { | ||
| 34 | return nullptr; | 40 | return nullptr; |
| 35 | #endif | 41 | #endif |
| 36 | } | 42 | } |
| 37 | - return std::make_unique<SileroVadModel>(config); | 43 | + |
| 44 | + if (!config.silero_vad.model.empty()) { | ||
| 45 | + return std::make_unique<SileroVadModel>(config); | ||
| 46 | + } | ||
| 47 | + | ||
| 48 | + if (!config.ten_vad.model.empty()) { | ||
| 49 | + return std::make_unique<TenVadModel>(config); | ||
| 50 | + } | ||
| 51 | + | ||
| 52 | + SHERPA_ONNX_LOGE("Please provide a vad model"); | ||
| 53 | + return nullptr; | ||
| 38 | } | 54 | } |
| 39 | 55 | ||
| 40 | template <typename Manager> | 56 | template <typename Manager> |
| @@ -42,7 +58,12 @@ std::unique_ptr<VadModel> VadModel::Create(Manager *mgr, | @@ -42,7 +58,12 @@ std::unique_ptr<VadModel> VadModel::Create(Manager *mgr, | ||
| 42 | const VadModelConfig &config) { | 58 | const VadModelConfig &config) { |
| 43 | if (config.provider == "rknn") { | 59 | if (config.provider == "rknn") { |
| 44 | #if SHERPA_ONNX_ENABLE_RKNN | 60 | #if SHERPA_ONNX_ENABLE_RKNN |
| 45 | - return std::make_unique<SileroVadModelRknn>(mgr, config); | 61 | + if (!config.silero_vad.model.empty()) { |
| 62 | + return std::make_unique<SileroVadModelRknn>(mgr, config); | ||
| 63 | + } else { | ||
| 64 | + SHERPA_ONNX_LOGE("Only silero-vad is supported for RKNN at present"); | ||
| 65 | + SHERPA_ONNX_EXIT(-1); | ||
| 66 | + } | ||
| 46 | #else | 67 | #else |
| 47 | SHERPA_ONNX_LOGE( | 68 | SHERPA_ONNX_LOGE( |
| 48 | "Please rebuild sherpa-onnx with -DSHERPA_ONNX_ENABLE_RKNN=ON if you " | 69 | "Please rebuild sherpa-onnx with -DSHERPA_ONNX_ENABLE_RKNN=ON if you " |
| @@ -51,7 +72,16 @@ std::unique_ptr<VadModel> VadModel::Create(Manager *mgr, | @@ -51,7 +72,16 @@ std::unique_ptr<VadModel> VadModel::Create(Manager *mgr, | ||
| 51 | return nullptr; | 72 | return nullptr; |
| 52 | #endif | 73 | #endif |
| 53 | } | 74 | } |
| 54 | - return std::make_unique<SileroVadModel>(mgr, config); | 75 | + if (!config.silero_vad.model.empty()) { |
| 76 | + return std::make_unique<SileroVadModel>(mgr, config); | ||
| 77 | + } | ||
| 78 | + | ||
| 79 | + if (!config.ten_vad.model.empty()) { | ||
| 80 | + return std::make_unique<TenVadModel>(mgr, config); | ||
| 81 | + } | ||
| 82 | + | ||
| 83 | + SHERPA_ONNX_LOGE("Please provide a vad model"); | ||
| 84 | + return nullptr; | ||
| 55 | } | 85 | } |
| 56 | 86 | ||
| 57 | #if __ANDROID_API__ >= 9 | 87 | #if __ANDROID_API__ >= 9 |
| @@ -18,6 +18,7 @@ | @@ -18,6 +18,7 @@ | ||
| 18 | #endif | 18 | #endif |
| 19 | 19 | ||
| 20 | #include "sherpa-onnx/csrc/circular-buffer.h" | 20 | #include "sherpa-onnx/csrc/circular-buffer.h" |
| 21 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 21 | #include "sherpa-onnx/csrc/vad-model.h" | 22 | #include "sherpa-onnx/csrc/vad-model.h" |
| 22 | 23 | ||
| 23 | namespace sherpa_onnx { | 24 | namespace sherpa_onnx { |
| @@ -45,8 +46,16 @@ class VoiceActivityDetector::Impl { | @@ -45,8 +46,16 @@ class VoiceActivityDetector::Impl { | ||
| 45 | model_->SetMinSilenceDuration(new_min_silence_duration_s_); | 46 | model_->SetMinSilenceDuration(new_min_silence_duration_s_); |
| 46 | model_->SetThreshold(new_threshold_); | 47 | model_->SetThreshold(new_threshold_); |
| 47 | } else { | 48 | } else { |
| 48 | - model_->SetMinSilenceDuration(config_.silero_vad.min_silence_duration); | ||
| 49 | - model_->SetThreshold(config_.silero_vad.threshold); | 49 | + if (!config_.silero_vad.model.empty()) { |
| 50 | + model_->SetMinSilenceDuration(config_.silero_vad.min_silence_duration); | ||
| 51 | + model_->SetThreshold(config_.silero_vad.threshold); | ||
| 52 | + } else if (!config_.ten_vad.model.empty()) { | ||
| 53 | + model_->SetMinSilenceDuration(config_.ten_vad.min_silence_duration); | ||
| 54 | + model_->SetThreshold(config_.ten_vad.threshold); | ||
| 55 | + } else { | ||
| 56 | + SHERPA_ONNX_LOGE("Unknown vad model"); | ||
| 57 | + SHERPA_ONNX_EXIT(-1); | ||
| 58 | + } | ||
| 50 | } | 59 | } |
| 51 | 60 | ||
| 52 | int32_t window_size = model_->WindowSize(); | 61 | int32_t window_size = model_->WindowSize(); |
| @@ -160,11 +169,16 @@ class VoiceActivityDetector::Impl { | @@ -160,11 +169,16 @@ class VoiceActivityDetector::Impl { | ||
| 160 | 169 | ||
| 161 | private: | 170 | private: |
| 162 | void Init() { | 171 | void Init() { |
| 163 | - // TODO(fangjun): Currently, we support only one vad model. | ||
| 164 | - // If a new vad model is added, we need to change the place | ||
| 165 | - // where max_speech_duration is placed. | ||
| 166 | - max_utterance_length_ = | ||
| 167 | - config_.sample_rate * config_.silero_vad.max_speech_duration; | 172 | + if (!config_.silero_vad.model.empty()) { |
| 173 | + max_utterance_length_ = | ||
| 174 | + config_.sample_rate * config_.silero_vad.max_speech_duration; | ||
| 175 | + } else if (!config_.ten_vad.model.empty()) { | ||
| 176 | + max_utterance_length_ = | ||
| 177 | + config_.sample_rate * config_.ten_vad.max_speech_duration; | ||
| 178 | + } else { | ||
| 179 | + SHERPA_ONNX_LOGE("Unsupported VAD model"); | ||
| 180 | + SHERPA_ONNX_EXIT(-1); | ||
| 181 | + } | ||
| 168 | } | 182 | } |
| 169 | 183 | ||
| 170 | private: | 184 | private: |
| @@ -51,6 +51,7 @@ set(srcs | @@ -51,6 +51,7 @@ set(srcs | ||
| 51 | speaker-embedding-extractor.cc | 51 | speaker-embedding-extractor.cc |
| 52 | speaker-embedding-manager.cc | 52 | speaker-embedding-manager.cc |
| 53 | spoken-language-identification.cc | 53 | spoken-language-identification.cc |
| 54 | + ten-vad-model-config.cc | ||
| 54 | tensorrt-config.cc | 55 | tensorrt-config.cc |
| 55 | vad-model-config.cc | 56 | vad-model-config.cc |
| 56 | vad-model.cc | 57 | vad-model.cc |
| 1 | +// sherpa-onnx/python/csrc/ten-vad-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/ten-vad-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | +#include <string> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/ten-vad-model-config.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +void PybindTenVadModelConfig(py::module *m) { | ||
| 15 | + using PyClass = TenVadModelConfig; | ||
| 16 | + py::class_<PyClass>(*m, "TenVadModelConfig") | ||
| 17 | + .def(py::init<>()) | ||
| 18 | + .def(py::init([](const std::string &model, float threshold, | ||
| 19 | + float min_silence_duration, float min_speech_duration, | ||
| 20 | + int32_t window_size, | ||
| 21 | + float max_speech_duration) -> std::unique_ptr<PyClass> { | ||
| 22 | + auto ans = std::make_unique<PyClass>(); | ||
| 23 | + | ||
| 24 | + ans->model = model; | ||
| 25 | + ans->threshold = threshold; | ||
| 26 | + ans->min_silence_duration = min_silence_duration; | ||
| 27 | + ans->min_speech_duration = min_speech_duration; | ||
| 28 | + ans->window_size = window_size; | ||
| 29 | + ans->max_speech_duration = max_speech_duration; | ||
| 30 | + | ||
| 31 | + return ans; | ||
| 32 | + }), | ||
| 33 | + py::arg("model"), py::arg("threshold") = 0.5, | ||
| 34 | + py::arg("min_silence_duration") = 0.5, | ||
| 35 | + py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 256, | ||
| 36 | + py::arg("max_speech_duration") = 20) | ||
| 37 | + .def_readwrite("model", &PyClass::model) | ||
| 38 | + .def_readwrite("threshold", &PyClass::threshold) | ||
| 39 | + .def_readwrite("min_silence_duration", &PyClass::min_silence_duration) | ||
| 40 | + .def_readwrite("min_speech_duration", &PyClass::min_speech_duration) | ||
| 41 | + .def_readwrite("window_size", &PyClass::window_size) | ||
| 42 | + .def_readwrite("max_speech_duration", &PyClass::max_speech_duration) | ||
| 43 | + .def("__str__", &PyClass::ToString) | ||
| 44 | + .def("validate", &PyClass::Validate); | ||
| 45 | +} | ||
| 46 | + | ||
| 47 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/python/csrc/ten-vad-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_TEN_VAD_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_TEN_VAD_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindTenVadModelConfig(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_TEN_VAD_MODEL_CONFIG_H_ |
| @@ -8,21 +8,25 @@ | @@ -8,21 +8,25 @@ | ||
| 8 | 8 | ||
| 9 | #include "sherpa-onnx/csrc/vad-model-config.h" | 9 | #include "sherpa-onnx/csrc/vad-model-config.h" |
| 10 | #include "sherpa-onnx/python/csrc/silero-vad-model-config.h" | 10 | #include "sherpa-onnx/python/csrc/silero-vad-model-config.h" |
| 11 | +#include "sherpa-onnx/python/csrc/ten-vad-model-config.h" | ||
| 11 | 12 | ||
| 12 | namespace sherpa_onnx { | 13 | namespace sherpa_onnx { |
| 13 | 14 | ||
| 14 | void PybindVadModelConfig(py::module *m) { | 15 | void PybindVadModelConfig(py::module *m) { |
| 15 | PybindSileroVadModelConfig(m); | 16 | PybindSileroVadModelConfig(m); |
| 17 | + PybindTenVadModelConfig(m); | ||
| 16 | 18 | ||
| 17 | using PyClass = VadModelConfig; | 19 | using PyClass = VadModelConfig; |
| 18 | py::class_<PyClass>(*m, "VadModelConfig") | 20 | py::class_<PyClass>(*m, "VadModelConfig") |
| 19 | .def(py::init<>()) | 21 | .def(py::init<>()) |
| 20 | - .def(py::init<const SileroVadModelConfig &, int32_t, int32_t, | ||
| 21 | - const std::string &, bool>(), | ||
| 22 | - py::arg("silero_vad"), py::arg("sample_rate") = 16000, | ||
| 23 | - py::arg("num_threads") = 1, py::arg("provider") = "cpu", | ||
| 24 | - py::arg("debug") = false) | 22 | + .def(py::init<const SileroVadModelConfig &, const TenVadModelConfig &, |
| 23 | + int32_t, int32_t, const std::string &, bool>(), | ||
| 24 | + py::arg("silero_vad") = SileroVadModelConfig{}, | ||
| 25 | + py::arg("ten_vad") = TenVadModelConfig{}, | ||
| 26 | + py::arg("sample_rate") = 16000, py::arg("num_threads") = 1, | ||
| 27 | + py::arg("provider") = "cpu", py::arg("debug") = false) | ||
| 25 | .def_readwrite("silero_vad", &PyClass::silero_vad) | 28 | .def_readwrite("silero_vad", &PyClass::silero_vad) |
| 29 | + .def_readwrite("ten_vad", &PyClass::ten_vad) | ||
| 26 | .def_readwrite("sample_rate", &PyClass::sample_rate) | 30 | .def_readwrite("sample_rate", &PyClass::sample_rate) |
| 27 | .def_readwrite("num_threads", &PyClass::num_threads) | 31 | .def_readwrite("num_threads", &PyClass::num_threads) |
| 28 | .def_readwrite("provider", &PyClass::provider) | 32 | .def_readwrite("provider", &PyClass::provider) |
| @@ -64,6 +64,7 @@ from _sherpa_onnx import ( | @@ -64,6 +64,7 @@ from _sherpa_onnx import ( | ||
| 64 | SpokenLanguageIdentification, | 64 | SpokenLanguageIdentification, |
| 65 | SpokenLanguageIdentificationConfig, | 65 | SpokenLanguageIdentificationConfig, |
| 66 | SpokenLanguageIdentificationWhisperConfig, | 66 | SpokenLanguageIdentificationWhisperConfig, |
| 67 | + TenVadModelConfig, | ||
| 67 | VadModel, | 68 | VadModel, |
| 68 | VadModelConfig, | 69 | VadModelConfig, |
| 69 | VoiceActivityDetector, | 70 | VoiceActivityDetector, |
-
请注册或登录后发表评论