Ming-Hsuan-Tu
Committed by GitHub

Expose JNI to compute probability of chunk in VAD (#2433)

@@ -69,6 +69,14 @@ class SileroVadModel::Impl {
69 min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration; 69 min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
70 } 70 }
71 71
  72 + float Run(const float *samples, int32_t n) {
  73 + if (is_v5_) {
  74 + return RunV5(samples, n);
  75 + } else {
  76 + return RunV4(samples, n);
  77 + }
  78 + }
  79 +
72 void Reset() { 80 void Reset() {
73 if (is_v5_) { 81 if (is_v5_) {
74 ResetV5(); 82 ResetV5();
@@ -361,14 +369,6 @@ class SileroVadModel::Impl {
361 } 369 }
362 } 370 }
363 371
364 - float Run(const float *samples, int32_t n) {  
365 - if (is_v5_) {  
366 - return RunV5(samples, n);  
367 - } else {  
368 - return RunV4(samples, n);  
369 - }  
370 - }  
371 -  
372 float RunV5(const float *samples, int32_t n) { 372 float RunV5(const float *samples, int32_t n) {
373 auto memory_info = 373 auto memory_info =
374 Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); 374 Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
@@ -496,6 +496,10 @@ void SileroVadModel::SetThreshold(float threshold) {
496 impl_->SetThreshold(threshold); 496 impl_->SetThreshold(threshold);
497 } 497 }
498 498
  499 +float SileroVadModel::Compute(const float *samples, int32_t n) {
  500 + return impl_->Run(samples, n);
  501 +}
  502 +
499 #if __ANDROID_API__ >= 9 503 #if __ANDROID_API__ >= 9
500 template SileroVadModel::SileroVadModel(AAssetManager *mgr, 504 template SileroVadModel::SileroVadModel(AAssetManager *mgr,
501 const VadModelConfig &config); 505 const VadModelConfig &config);
@@ -31,6 +31,8 @@ class SileroVadModel : public VadModel {
31 */ 31 */
32 bool IsSpeech(const float *samples, int32_t n) override; 32 bool IsSpeech(const float *samples, int32_t n) override;
33 33
  34 + float Compute(const float *samples, int32_t n) override;
  35 +
34 // For silero vad V4, it is WindowShift(). 36 // For silero vad V4, it is WindowShift().
35 // For silero vad V5, it is WindowShift()+64 for 16kHz and 37 // For silero vad V5, it is WindowShift()+64 for 16kHz and
36 // WindowShift()+32 for 8kHz 38 // WindowShift()+32 for 8kHz
@@ -56,6 +56,38 @@ class TenVadModel::Impl {
56 Init(buf.data(), buf.size()); 56 Init(buf.data(), buf.size());
57 } 57 }
58 58
  59 + float Run(const float *samples, int32_t n) {
  60 + ComputeFeatures(samples, n);
  61 +
  62 + auto memory_info =
  63 + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  64 +
  65 + std::array<int64_t, 3> x_shape = {1, 3, 41};
  66 +
  67 + Ort::Value x = Ort::Value::CreateTensor(memory_info, last_features_.data(),
  68 + last_features_.size(),
  69 + x_shape.data(), x_shape.size());
  70 +
  71 + std::vector<Ort::Value> inputs;
  72 + inputs.reserve(input_names_.size());
  73 +
  74 + inputs.push_back(std::move(x));
  75 + for (auto &s : states_) {
  76 + inputs.push_back(std::move(s));
  77 + }
  78 +
  79 + auto out =
  80 + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
  81 + output_names_ptr_.data(), output_names_ptr_.size());
  82 +
  83 + for (int32_t i = 1; i != static_cast<int32_t>(output_names_.size()); ++i) {
  84 + states_[i - 1] = std::move(out[i]);
  85 + }
  86 +
  87 + float prob = out[0].GetTensorData<float>()[0];
  88 +
  89 + return prob;
  90 + }
59 void Reset() { 91 void Reset() {
60 triggered_ = false; 92 triggered_ = false;
61 current_sample_ = 0; 93 current_sample_ = 0;
@@ -363,39 +395,6 @@ class TenVadModel::Impl {
363 last_features_.begin() + 2 * features_.size()); 395 last_features_.begin() + 2 * features_.size());
364 } 396 }
365 397
366 - float Run(const float *samples, int32_t n) {  
367 - ComputeFeatures(samples, n);  
368 -  
369 - auto memory_info =  
370 - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);  
371 -  
372 - std::array<int64_t, 3> x_shape = {1, 3, 41};  
373 -  
374 - Ort::Value x = Ort::Value::CreateTensor(memory_info, last_features_.data(),  
375 - last_features_.size(),  
376 - x_shape.data(), x_shape.size());  
377 -  
378 - std::vector<Ort::Value> inputs;  
379 - inputs.reserve(input_names_.size());  
380 -  
381 - inputs.push_back(std::move(x));  
382 - for (auto &s : states_) {  
383 - inputs.push_back(std::move(s));  
384 - }  
385 -  
386 - auto out =  
387 - sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),  
388 - output_names_ptr_.data(), output_names_ptr_.size());  
389 -  
390 - for (int32_t i = 1; i != static_cast<int32_t>(output_names_.size()); ++i) {  
391 - states_[i - 1] = std::move(out[i]);  
392 - }  
393 -  
394 - float prob = out[0].GetTensorData<float>()[0];  
395 -  
396 - return prob;  
397 - }  
398 -  
399 private: 398 private:
400 VadModelConfig config_; 399 VadModelConfig config_;
401 knf::Rfft rfft_; 400 knf::Rfft rfft_;
@@ -469,6 +468,10 @@ void TenVadModel::SetThreshold(float threshold) {
469 impl_->SetThreshold(threshold); 468 impl_->SetThreshold(threshold);
470 } 469 }
471 470
  471 +float TenVadModel::Compute(const float *samples, int32_t n) {
  472 + return impl_->Run(samples, n);
  473 +}
  474 +
472 #if __ANDROID_API__ >= 9 475 #if __ANDROID_API__ >= 9
473 template TenVadModel::TenVadModel(AAssetManager *mgr, 476 template TenVadModel::TenVadModel(AAssetManager *mgr,
474 const VadModelConfig &config); 477 const VadModelConfig &config);
@@ -31,6 +31,8 @@ class TenVadModel : public VadModel {
31 */ 31 */
32 bool IsSpeech(const float *samples, int32_t n) override; 32 bool IsSpeech(const float *samples, int32_t n) override;
33 33
  34 + float Compute(const float *samples, int32_t n) override;
  35 +
34 // 256 or 160 36 // 256 or 160
35 int32_t WindowSize() const override; 37 int32_t WindowSize() const override;
36 38
@@ -32,6 +32,8 @@ class VadModel {
32 */ 32 */
33 virtual bool IsSpeech(const float *samples, int32_t n) = 0; 33 virtual bool IsSpeech(const float *samples, int32_t n) = 0;
34 34
  35 + virtual float Compute(const float *samples, int32_t n) = 0;
  36 +
35 virtual int32_t WindowSize() const = 0; 37 virtual int32_t WindowSize() const = 0;
36 38
37 virtual int32_t WindowShift() const = 0; 39 virtual int32_t WindowShift() const = 0;
@@ -41,6 +41,10 @@ class VoiceActivityDetector::Impl {
41 Init(); 41 Init();
42 } 42 }
43 43
  44 + float Compute(const float *samples, int32_t n) {
  45 + return model_->Compute(samples, n);
  46 + }
  47 +
44 void AcceptWaveform(const float *samples, int32_t n) { 48 void AcceptWaveform(const float *samples, int32_t n) {
45 if (buffer_.Size() > max_utterance_length_) { 49 if (buffer_.Size() > max_utterance_length_) {
46 model_->SetMinSilenceDuration(new_min_silence_duration_s_); 50 model_->SetMinSilenceDuration(new_min_silence_duration_s_);
@@ -256,6 +260,10 @@ const VadModelConfig &VoiceActivityDetector::GetConfig() const {
256 return impl_->GetConfig(); 260 return impl_->GetConfig();
257 } 261 }
258 262
  263 +float VoiceActivityDetector::Compute(const float *samples, int32_t n) {
  264 + return impl_->Compute(samples, n);
  265 +}
  266 +
259 #if __ANDROID_API__ >= 9 267 #if __ANDROID_API__ >= 9
260 template VoiceActivityDetector::VoiceActivityDetector( 268 template VoiceActivityDetector::VoiceActivityDetector(
261 AAssetManager *mgr, const VadModelConfig &config, 269 AAssetManager *mgr, const VadModelConfig &config,
@@ -28,6 +28,8 @@ class VoiceActivityDetector {
28 ~VoiceActivityDetector(); 28 ~VoiceActivityDetector();
29 29
30 void AcceptWaveform(const float *samples, int32_t n); 30 void AcceptWaveform(const float *samples, int32_t n);
  31 + float Compute(const float *samples, int32_t n);
  32 +
31 bool Empty() const; 33 bool Empty() const;
32 void Pop(); 34 void Pop();
33 void Clear(); 35 void Clear();
@@ -27,6 +27,10 @@ public class Vad {
27 acceptWaveform(this.ptr, samples); 27 acceptWaveform(this.ptr, samples);
28 } 28 }
29 29
  30 + public float compute(float[] samples) {
  31 + return compute(this.ptr, samples);
  32 + }
  33 +
30 public boolean empty() { 34 public boolean empty() {
31 return empty(this.ptr); 35 return empty(this.ptr);
32 } 36 }
@@ -65,6 +69,8 @@ public class Vad {
65 69
66 private native void acceptWaveform(long ptr, float[] samples); 70 private native void acceptWaveform(long ptr, float[] samples);
67 71
  72 + private native float compute(long ptr, float[] samples);
  73 +
68 private native boolean empty(long ptr); 74 private native boolean empty(long ptr);
69 75
70 private native void pop(long ptr); 76 private native void pop(long ptr);
@@ -227,3 +227,26 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/,
227 auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr); 227 auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
228 model->Flush(); 228 model->Flush();
229 } 229 }
  230 +
  231 +SHERPA_ONNX_EXTERN_C
  232 +JNIEXPORT jfloat JNICALL Java_com_k2fsa_sherpa_onnx_Vad_compute(
  233 + JNIEnv *env, jobject /*obj*/, jlong ptr, jfloatArray samples) {
  234 + return SafeJNI(
  235 + env, "Vad_compute",
  236 + [&]() -> jfloat {
  237 + if (!ValidatePointer(env, ptr, "Vad_compute",
  238 + "VoiceActivityDetector pointer is null.")) {
  239 + return -1.0f;
  240 + }
  241 + auto vad = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
  242 + jfloat *p = env->GetFloatArrayElements(samples, nullptr);
  243 + jsize n = env->GetArrayLength(samples);
  244 +
  245 + float score = vad->Compute(p, n);
  246 +
  247 + env->ReleaseFloatArrayElements(samples, p, JNI_ABORT);
  248 +
  249 + return static_cast<jfloat>(score);
  250 + },
  251 + -1.0f);
  252 +}
@@ -55,6 +55,9 @@ class Vad(
55 55
56 fun release() = finalize() 56 fun release() = finalize()
57 57
  58 + fun compute(samples: FloatArray): Float = compute(ptr, samples)
  59 +
  60 +
58 fun acceptWaveform(samples: FloatArray) = acceptWaveform(ptr, samples) 61 fun acceptWaveform(samples: FloatArray) = acceptWaveform(ptr, samples)
59 62
60 fun empty(): Boolean = empty(ptr) 63 fun empty(): Boolean = empty(ptr)
@@ -85,6 +88,8 @@ class Vad(
85 ): Long 88 ): Long
86 89
87 private external fun acceptWaveform(ptr: Long, samples: FloatArray) 90 private external fun acceptWaveform(ptr: Long, samples: FloatArray)
  91 + private external fun compute(ptr: Long, samples: FloatArray): Float
  92 +
88 private external fun empty(ptr: Long): Boolean 93 private external fun empty(ptr: Long): Boolean
89 private external fun pop(ptr: Long) 94 private external fun pop(ptr: Long)
90 private external fun clear(ptr: Long) 95 private external fun clear(ptr: Long)