Fangjun Kuang
Committed by GitHub

Add Silero VAD (#313)

1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.7.14") 4 +set(SHERPA_ONNX_VERSION "1.7.15")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -136,6 +136,7 @@ class BuildExtension(build_ext): @@ -136,6 +136,7 @@ class BuildExtension(build_ext):
136 binaries += ["sherpa-onnx-online-websocket-server"] 136 binaries += ["sherpa-onnx-online-websocket-server"]
137 binaries += ["sherpa-onnx-offline-websocket-server"] 137 binaries += ["sherpa-onnx-offline-websocket-server"]
138 binaries += ["sherpa-onnx-online-websocket-client"] 138 binaries += ["sherpa-onnx-online-websocket-client"]
  139 + binaries += ["sherpa-onnx-vad-microphone"]
139 140
140 if is_windows(): 141 if is_windows():
141 binaries += ["kaldi-native-fbank-core.dll"] 142 binaries += ["kaldi-native-fbank-core.dll"]
  1 +# File description
  2 +
  3 +- [./http_server.py](./http_server.py) It defines which files to serve.
  4 + Files are saved in [./web](./web).
  5 +- [non_streaming_server.py](./non_streaming_server.py) WebSocket server for
  6 + non-streaming models.
  7 +- [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses
  8 + [silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech
  9 + segments and concatenate all speech segments into a single one.
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +This file shows how to remove non-speech segments
  5 +and merge all speech segments into a large segment
  6 +and save it to a file.
  7 +
  8 +Usage
  9 +
  10 +python3 ./vad-remove-non-speech-segments.py \
  11 + --silero-vad-model silero_vad.onnx
  12 +
  13 +Please visit
  14 +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  15 +to download silero_vad.onnx
  16 +
  17 +For instance,
  18 +
  19 +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
  20 +"""
  21 +
  22 +import argparse
  23 +import sys
  24 +import time
  25 +from pathlib import Path
  26 +
  27 +import numpy as np
  28 +import sherpa_onnx
  29 +import soundfile as sf
  30 +
  31 +try:
  32 + import sounddevice as sd
  33 +except ImportError:
  34 + print("Please install sounddevice first. You can use")
  35 + print()
  36 + print(" pip install sounddevice")
  37 + print()
  38 + print("to install it")
  39 + sys.exit(-1)
  40 +
  41 +
def assert_file_exists(filename: str):
    """Abort with a helpful message if *filename* is not an existing file.

    Args:
      filename: Path of the file to check.

    Raises:
      AssertionError: If the path does not exist or is not a regular file.
    """
    # Bug fix: the original f-string had no placeholder, so the error
    # message never named the missing file. Include the path so the user
    # knows what to download.
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
  48 +
  49 +
def get_args():
    """Build the command-line parser and return the parsed arguments."""
    fmt = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=fmt)

    parser.add_argument(
        "--silero-vad-model",
        required=True,
        type=str,
        help="Path to silero_vad.onnx",
    )

    return parser.parse_args()
  63 +
  64 +
def main():
    """Record from the default microphone, run silero VAD on the stream,
    and on Ctrl+C save two wave files: one with only the detected speech
    segments concatenated, and one with the full recording."""
    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)
    # sd.default.device is an (input, output) pair; index 0 is the input.
    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    args = get_args()
    assert_file_exists(args.silero_vad_model)

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    # The VAD model consumes fixed-size windows; samples are accumulated
    # in `buffer` until at least one full window is available.
    window_size = config.silero_vad.window_size

    buffer = []
    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # Keeps every captured sample so the unfiltered recording can be saved too.
    all_samples = []

    print("Started! Please speak")

    try:
        with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
            while True:
                samples, _ = s.read(samples_per_read)  # a blocking read
                samples = samples.reshape(-1)
                buffer = np.concatenate([buffer, samples])

                all_samples = np.concatenate([all_samples, samples])

                # NOTE(review): a window is only fed once len(buffer) exceeds
                # window_size; the == case waits for more data — confirm intended.
                while len(buffer) > window_size:
                    vad.accept_waveform(buffer[:window_size])
                    buffer = buffer[window_size:]
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop recording; fall through to save.
        print("\nCaught Ctrl + C. Saving & Exiting")

    # Drain all speech segments the VAD has accumulated.
    speech_samples = []
    while not vad.empty():
        speech_samples.extend(vad.front.samples)
        vad.pop()

    speech_samples = np.array(speech_samples, dtype=np.float32)

    filename_for_speech = time.strftime("%Y%m%d-%H%M%S-speech.wav")
    sf.write(filename_for_speech, speech_samples, samplerate=sample_rate)

    filename_for_all = time.strftime("%Y%m%d-%H%M%S-all.wav")
    sf.write(filename_for_all, all_samples, samplerate=sample_rate)

    print(f"Saved to {filename_for_speech} and {filename_for_all}")


if __name__ == "__main__":
    main()
@@ -56,6 +56,7 @@ def get_binaries_to_install(): @@ -56,6 +56,7 @@ def get_binaries_to_install():
56 binaries += ["sherpa-onnx-online-websocket-server"] 56 binaries += ["sherpa-onnx-online-websocket-server"]
57 binaries += ["sherpa-onnx-offline-websocket-server"] 57 binaries += ["sherpa-onnx-offline-websocket-server"]
58 binaries += ["sherpa-onnx-online-websocket-client"] 58 binaries += ["sherpa-onnx-online-websocket-client"]
  59 + binaries += ["sherpa-onnx-vad-microphone"]
59 if is_windows(): 60 if is_windows():
60 binaries += ["kaldi-native-fbank-core.dll"] 61 binaries += ["kaldi-native-fbank-core.dll"]
61 binaries += ["sherpa-onnx-c-api.dll"] 62 binaries += ["sherpa-onnx-c-api.dll"]
@@ -95,8 +96,8 @@ setuptools.setup( @@ -95,8 +96,8 @@ setuptools.setup(
95 "Topic :: Scientific/Engineering :: Artificial Intelligence", 96 "Topic :: Scientific/Engineering :: Artificial Intelligence",
96 ], 97 ],
97 entry_points={ 98 entry_points={
98 - 'console_scripts': [  
99 - 'sherpa-onnx-cli=sherpa_onnx.cli:cli', 99 + "console_scripts": [
  100 + "sherpa-onnx-cli=sherpa_onnx.cli:cli",
100 ], 101 ],
101 }, 102 },
102 license="Apache licensed, as found in the LICENSE file", 103 license="Apache licensed, as found in the LICENSE file",
@@ -13,6 +13,7 @@ endif() @@ -13,6 +13,7 @@ endif()
13 set(sources 13 set(sources
14 base64-decode.cc 14 base64-decode.cc
15 cat.cc 15 cat.cc
  16 + circular-buffer.cc
16 context-graph.cc 17 context-graph.cc
17 endpoint.cc 18 endpoint.cc
18 features.cc 19 features.cc
@@ -66,6 +67,8 @@ set(sources @@ -66,6 +67,8 @@ set(sources
66 provider.cc 67 provider.cc
67 resample.cc 68 resample.cc
68 session.cc 69 session.cc
  70 + silero-vad-model-config.cc
  71 + silero-vad-model.cc
69 slice.cc 72 slice.cc
70 stack.cc 73 stack.cc
71 symbol-table.cc 74 symbol-table.cc
@@ -73,6 +76,9 @@ set(sources @@ -73,6 +76,9 @@ set(sources
73 transpose.cc 76 transpose.cc
74 unbind.cc 77 unbind.cc
75 utils.cc 78 utils.cc
  79 + vad-model-config.cc
  80 + vad-model.cc
  81 + voice-activity-detector.cc
76 wave-reader.cc 82 wave-reader.cc
77 ) 83 )
78 84
@@ -172,32 +178,42 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) @@ -172,32 +178,42 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
172 microphone.cc 178 microphone.cc
173 ) 179 )
174 180
  181 + add_executable(sherpa-onnx-vad-microphone
  182 + sherpa-onnx-vad-microphone.cc
  183 + microphone.cc
  184 + )
  185 +
175 if(BUILD_SHARED_LIBS) 186 if(BUILD_SHARED_LIBS)
176 set(PA_LIB portaudio) 187 set(PA_LIB portaudio)
177 else() 188 else()
178 set(PA_LIB portaudio_static) 189 set(PA_LIB portaudio_static)
179 endif() 190 endif()
180 191
181 - target_link_libraries(sherpa-onnx-microphone ${PA_LIB} sherpa-onnx-core)  
182 - target_link_libraries(sherpa-onnx-microphone-offline ${PA_LIB} sherpa-onnx-core) 192 + set(exes
  193 + sherpa-onnx-microphone
  194 + sherpa-onnx-microphone-offline
  195 + sherpa-onnx-vad-microphone
  196 + )
  197 + foreach(exe IN LISTS exes)
  198 + target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core)
  199 + endforeach()
183 200
184 if(NOT WIN32) 201 if(NOT WIN32)
185 - target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")  
186 - target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")  
187 -  
188 - target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")  
189 - target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib") 202 + foreach(exe IN LISTS exes)
  203 + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
  204 + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
  205 + endforeach()
190 206
191 if(SHERPA_ONNX_ENABLE_PYTHON) 207 if(SHERPA_ONNX_ENABLE_PYTHON)
192 - target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")  
193 - target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib") 208 +
  209 + foreach(exe IN LISTS exes)
  210 + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
  211 + endforeach()
194 endif() 212 endif()
195 endif() 213 endif()
196 214
197 install( 215 install(
198 - TARGETS  
199 - sherpa-onnx-microphone  
200 - sherpa-onnx-microphone-offline 216 + TARGETS ${exes}
201 DESTINATION 217 DESTINATION
202 bin 218 bin
203 ) 219 )
@@ -269,6 +285,7 @@ endif() @@ -269,6 +285,7 @@ endif()
269 if(SHERPA_ONNX_ENABLE_TESTS) 285 if(SHERPA_ONNX_ENABLE_TESTS)
270 set(sherpa_onnx_test_srcs 286 set(sherpa_onnx_test_srcs
271 cat-test.cc 287 cat-test.cc
  288 + circular-buffer-test.cc
272 context-graph-test.cc 289 context-graph-test.cc
273 packed-sequence-test.cc 290 packed-sequence-test.cc
274 pad-sequence-test.cc 291 pad-sequence-test.cc
  1 +# File descriptions
  2 +
  3 +- [./sherpa-onnx-alsa.cc](./sherpa-onnx-alsa.cc) For Linux only, especially for
  4 + embedded Linux, e.g., Raspberry Pi; it uses a streaming model for real-time
  5 + speech recognition with a microphone.
  6 +
  7 +- [./sherpa-onnx-microphone.cc](./sherpa-onnx-microphone.cc)
  8 + For Linux/Windows/macOS; it uses a streaming model for real-time speech
  9 + recognition with a microphone.
  10 +
  11 +- [./sherpa-onnx-microphone-offline.cc](./sherpa-onnx-microphone-offline.cc)
  12 + For Linux/Windows/macOS; it uses a non-streaming model for speech
  13 + recognition with a microphone.
  14 +
  15 +- [./sherpa-onnx.cc](./sherpa-onnx.cc)
  16 + It uses a streaming model to decode wave files
  17 +
  18 +- [./sherpa-onnx-offline.cc](./sherpa-onnx-offline.cc)
  19 + It uses a non-streaming model to decode wave files
  20 +
  21 +- [./online-websocket-server.cc](./online-websocket-server.cc)
  22 + WebSocket server for streaming models.
  23 +
  24 +- [./offline-websocket-server.cc](./offline-websocket-server.cc)
  25 + WebSocket server for non-streaming models.
  26 +
  27 +- [./sherpa-onnx-vad-microphone.cc](./sherpa-onnx-vad-microphone.cc)
  28 + Use silero VAD to detect speech with a microphone.
  29 +
  1 +// sherpa-onnx/csrc/circular-buffer-test.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/circular-buffer.h"
  6 +
  7 +#include <vector>
  8 +
  9 +#include "gtest/gtest.h"
  10 +#include "sherpa-onnx/csrc/macros.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
// Verifies Push()/Get() on a buffer that does not yet wrap around:
// Head()/Tail() are linear (ever-growing) indexes, and Get() takes a
// linear start index, not a physical offset.
TEST(CircularBuffer, Push) {
  CircularBuffer buffer(10);
  // A fresh buffer is empty and both linear indexes start at 0.
  EXPECT_EQ(buffer.Size(), 0);
  EXPECT_EQ(buffer.Head(), 0);
  EXPECT_EQ(buffer.Tail(), 0);

  std::vector<float> a = {0, 1, 2, 3, 4, 5};
  buffer.Push(a.data(), a.size());

  // Push advances only the tail; the head moves only on Pop().
  EXPECT_EQ(buffer.Size(), 6);
  EXPECT_EQ(buffer.Head(), 0);
  EXPECT_EQ(buffer.Tail(), 6);

  auto c = buffer.Get(0, a.size());
  EXPECT_EQ(a.size(), c.size());
  for (int32_t i = 0; i != a.size(); ++i) {
    EXPECT_EQ(a[i], c[i]);
  }

  std::vector<float> d = {-6, -7, -8, -9};
  buffer.Push(d.data(), d.size());

  // Get() is non-destructive and can start at any in-range linear index.
  c = buffer.Get(a.size(), d.size());
  EXPECT_EQ(d.size(), c.size());
  for (int32_t i = 0; i != d.size(); ++i) {
    EXPECT_EQ(d[i], c[i]);
  }
}
  42 +
// Exercises wrap-around: capacity 5 forces the physical storage to be
// reused quickly while Head()/Tail() keep growing linearly.
TEST(CircularBuffer, PushAndPop) {
  CircularBuffer buffer(5);
  std::vector<float> a = {0, 1, 2, 3};
  buffer.Push(a.data(), a.size());

  EXPECT_EQ(buffer.Size(), 4);
  EXPECT_EQ(buffer.Head(), 0);
  EXPECT_EQ(buffer.Tail(), 4);

  // Pop() advances only the head.
  buffer.Pop(2);

  EXPECT_EQ(buffer.Size(), 2);
  EXPECT_EQ(buffer.Head(), 2);
  EXPECT_EQ(buffer.Tail(), 4);

  auto c = buffer.Get(2, 2);
  EXPECT_EQ(c.size(), 2);
  EXPECT_EQ(c[0], 2);
  EXPECT_EQ(c[1], 3);

  // This push wraps past the physical end of the 5-element storage.
  a = {10, 20, 30};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 2);
  EXPECT_EQ(buffer.Tail(), 7);

  // Get() stitches data back together across the wrap point.
  c = buffer.Get(2, 5);
  EXPECT_EQ(c.size(), 5);
  EXPECT_EQ(c[0], 2);
  EXPECT_EQ(c[1], 3);
  EXPECT_EQ(c[2], 10);
  EXPECT_EQ(c[3], 20);
  EXPECT_EQ(c[4], 30);

  c = buffer.Get(3, 4);
  EXPECT_EQ(c.size(), 4);
  EXPECT_EQ(c[0], 3);
  EXPECT_EQ(c[1], 10);
  EXPECT_EQ(c[2], 20);
  EXPECT_EQ(c[3], 30);

  c = buffer.Get(4, 3);
  EXPECT_EQ(c.size(), 3);
  EXPECT_EQ(c[0], 10);
  EXPECT_EQ(c[1], 20);
  EXPECT_EQ(c[2], 30);

  buffer.Pop(4);
  EXPECT_EQ(buffer.Size(), 1);
  EXPECT_EQ(buffer.Head(), 6);
  EXPECT_EQ(buffer.Tail(), 7);

  c = buffer.Get(6, 1);
  EXPECT_EQ(c.size(), 1);
  EXPECT_EQ(c[0], 30);

  a = {100, 200, 300, 400};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);

  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 6);
  EXPECT_EQ(buffer.Tail(), 11);

  c = buffer.Get(6, 5);
  EXPECT_EQ(c.size(), 5);
  EXPECT_EQ(c[0], 30);
  EXPECT_EQ(c[1], 100);
  EXPECT_EQ(c[2], 200);
  EXPECT_EQ(c[3], 300);
  EXPECT_EQ(c[4], 400);

  buffer.Pop(3);
  EXPECT_EQ(buffer.Size(), 2);
  EXPECT_EQ(buffer.Head(), 9);
  EXPECT_EQ(buffer.Tail(), 11);

  // Reads may start anywhere inside [Head(), Tail()), not only at Head().
  c = buffer.Get(10, 1);
  EXPECT_EQ(c.size(), 1);
  EXPECT_EQ(c[0], 400);

  a = {1000, 2000, 3000};
  buffer.Push(a.data(), a.size());

  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 9);
  EXPECT_EQ(buffer.Tail(), 14);

  buffer.Pop(1);

  EXPECT_EQ(buffer.Size(), 4);
  EXPECT_EQ(buffer.Head(), 10);
  EXPECT_EQ(buffer.Tail(), 14);

  a = {4000};

  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 10);
  EXPECT_EQ(buffer.Tail(), 15);

  c = buffer.Get(13, 2);
  EXPECT_EQ(c.size(), 2);
  EXPECT_EQ(c[0], 3000);
  EXPECT_EQ(c[1], 4000);
}
  149 +
  150 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/circular-buffer.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/circular-buffer.h"
  6 +
  7 +#include <algorithm>
  8 +
  9 +#include "sherpa-onnx/csrc/macros.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +CircularBuffer::CircularBuffer(int32_t capacity) {
  14 + if (capacity <= 0) {
  15 + SHERPA_ONNX_LOGE("Please specify a positive capacity. Given: %d\n",
  16 + capacity);
  17 + exit(-1);
  18 + }
  19 + buffer_.resize(capacity);
  20 +}
  21 +
  22 +void CircularBuffer::Push(const float *p, int32_t n) {
  23 + int32_t capacity = buffer_.size();
  24 + int32_t size = Size();
  25 + if (n + size > capacity) {
  26 + SHERPA_ONNX_LOGE("Overflow! n: %d, size: %d, n+size: %d, capacity: %d", n,
  27 + size, n + size, capacity);
  28 + exit(-1);
  29 + }
  30 +
  31 + int32_t start = tail_ % capacity;
  32 +
  33 + tail_ += n;
  34 +
  35 + if (start + n < capacity) {
  36 + std::copy(p, p + n, buffer_.begin() + start);
  37 + return;
  38 + }
  39 +
  40 + int32_t part1_size = capacity - start;
  41 +
  42 + std::copy(p, p + part1_size, buffer_.begin() + start);
  43 +
  44 + std::copy(p + part1_size, p + n, buffer_.begin());
  45 +}
  46 +
  47 +std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
  48 + if (start_index < head_ || start_index >= tail_) {
  49 + SHERPA_ONNX_LOGE("Invalid start_index: %d. head_: %d, tail_: %d",
  50 + start_index, head_, tail_);
  51 + return {};
  52 + }
  53 +
  54 + int32_t size = Size();
  55 + if (n < 0 || n > size) {
  56 + SHERPA_ONNX_LOGE("Invalid n: %d. size: %d", n, size);
  57 + return {};
  58 + }
  59 +
  60 + int32_t capacity = buffer_.size();
  61 +
  62 + if (start_index - head_ + n > size) {
  63 + SHERPA_ONNX_LOGE("Invalid start_index: %d and n: %d. head_: %d, size: %d",
  64 + start_index, n, head_, size);
  65 + return {};
  66 + }
  67 +
  68 + int32_t start = start_index % capacity;
  69 +
  70 + if (start + n < capacity) {
  71 + return {buffer_.begin() + start, buffer_.begin() + start + n};
  72 + }
  73 +
  74 + std::vector<float> ans(n);
  75 +
  76 + std::copy(buffer_.begin() + start, buffer_.end(), ans.begin());
  77 +
  78 + int32_t part1_size = capacity - start;
  79 + int32_t part2_size = n - part1_size;
  80 + std::copy(buffer_.begin(), buffer_.begin() + part2_size,
  81 + ans.begin() + part1_size);
  82 +
  83 + return ans;
  84 +}
  85 +
  86 +void CircularBuffer::Pop(int32_t n) {
  87 + int32_t size = Size();
  88 + if (n < 0 || n > size) {
  89 + SHERPA_ONNX_LOGE("Invalid n: %d. size: %d", n, size);
  90 + return;
  91 + }
  92 +
  93 + head_ += n;
  94 +}
  95 +
  96 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/circular-buffer.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_
  5 +#define SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_
  6 +
  7 +#include <cstdint>
  8 +#include <vector>
  9 +
  10 +namespace sherpa_onnx {
  11 +
// A fixed-capacity FIFO of float samples addressed by *linear* indexes:
// head_ and tail_ only ever grow; they are mapped to physical slots with
// a modulo by capacity inside the implementation.
class CircularBuffer {
 public:
  // Capacity of this buffer. Should be large enough.
  // If it is full, we just print a message and exit the program.
  explicit CircularBuffer(int32_t capacity);

  // Push an array
  //
  // @param p Pointer to the start address of the array
  // @param n Number of elements in the array
  //
  // Note: If n + Size() > capacity, we print an error message and exit.
  void Push(const float *p, int32_t n);

  // Read without consuming (use Pop() to consume).
  //
  // @param start_index Should be in the range [head_, tail_)
  // @param n Number of elements to get
  // @return Return a vector of size n containing the requested elements;
  //         returns an empty vector on an invalid request.
  std::vector<float> Get(int32_t start_index, int32_t n) const;

  // Remove n elements from the front of the buffer
  //
  // @param n Should be in the range [0, Size()]; out-of-range values are
  //          logged and ignored.
  void Pop(int32_t n);

  // Number of elements currently in the buffer.
  int32_t Size() const { return tail_ - head_; }

  // Current position of the head (linear index of the oldest element)
  int32_t Head() const { return head_; }

  // Current position of the tail (linear index one past the newest element)
  int32_t Tail() const { return tail_; }

  // Discard everything and restart the linear indexes from 0.
  void Reset() {
    head_ = 0;
    tail_ = 0;
  }

 private:
  std::vector<float> buffer_;

  int32_t head_ = 0;  // linear index; always increasing; never wraps around
  int32_t tail_ = 0;  // linear index, always increasing; never wraps around.
};
  56 +
  57 +} // namespace sherpa_onnx
  58 +
  59 +#endif // SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_
@@ -76,4 +76,8 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) { @@ -76,4 +76,8 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) {
76 return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider); 76 return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider);
77 } 77 }
78 78
  79 +Ort::SessionOptions GetSessionOptions(const VadModelConfig &config) {
  80 + return GetSessionOptionsImpl(config.num_threads, config.provider);
  81 +}
  82 +
79 } // namespace sherpa_onnx 83 } // namespace sherpa_onnx
@@ -10,6 +10,7 @@ @@ -10,6 +10,7 @@
10 #include "sherpa-onnx/csrc/offline-model-config.h" 10 #include "sherpa-onnx/csrc/offline-model-config.h"
11 #include "sherpa-onnx/csrc/online-lm-config.h" 11 #include "sherpa-onnx/csrc/online-lm-config.h"
12 #include "sherpa-onnx/csrc/online-model-config.h" 12 #include "sherpa-onnx/csrc/online-model-config.h"
  13 +#include "sherpa-onnx/csrc/vad-model-config.h"
13 14
14 namespace sherpa_onnx { 15 namespace sherpa_onnx {
15 16
@@ -20,6 +21,8 @@ Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config); @@ -20,6 +21,8 @@ Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config);
20 Ort::SessionOptions GetSessionOptions(const OfflineLMConfig &config); 21 Ort::SessionOptions GetSessionOptions(const OfflineLMConfig &config);
21 22
22 Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config); 23 Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config);
  24 +
  25 +Ort::SessionOptions GetSessionOptions(const VadModelConfig &config);
23 } // namespace sherpa_onnx 26 } // namespace sherpa_onnx
24 27
25 #endif // SHERPA_ONNX_CSRC_SESSION_H_ 28 #endif // SHERPA_ONNX_CSRC_SESSION_H_
  1 +// sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc
  2 +//
  3 +// Copyright (c) 2022-2023 Xiaomi Corporation
  4 +
  5 +#include <signal.h>
  6 +#include <stdio.h>
  7 +#include <stdlib.h>
  8 +
  9 +#include <algorithm>
  10 +#include <mutex> // NOLINT
  11 +
  12 +#include "portaudio.h" // NOLINT
  13 +#include "sherpa-onnx/csrc/circular-buffer.h"
  14 +#include "sherpa-onnx/csrc/microphone.h"
  15 +#include "sherpa-onnx/csrc/voice-activity-detector.h"
  16 +
// Set from the SIGINT handler to request a clean shutdown.
bool stop = false;
// Guards `buffer`, which is shared between portaudio's callback thread
// and the main thread.
std::mutex mutex;
// Holds up to 60 seconds of audio at 16 kHz; overflowing it aborts.
sherpa_onnx::CircularBuffer buffer(16000 * 60);

// portaudio record callback: copies the captured frames into the shared
// circular buffer. Runs on portaudio's audio thread, so it must stay short.
static int32_t RecordCallback(const void *input_buffer,
                              void * /*output_buffer*/,
                              unsigned long frames_per_buffer,  // NOLINT
                              const PaStreamCallbackTimeInfo * /*time_info*/,
                              PaStreamCallbackFlags /*status_flags*/,
                              void *user_data) {
  std::lock_guard<std::mutex> lock(mutex);
  buffer.Push(reinterpret_cast<const float *>(input_buffer), frames_per_buffer);

  // NOTE(review): `stop` is read here without the lock and is not atomic;
  // works in practice for a one-way flag, but std::atomic<bool> would be
  // cleaner — confirm.
  return stop ? paComplete : paContinue;
}

// SIGINT (Ctrl+C) handler: asks both the callback and the main loop to stop.
static void Handler(int32_t sig) {
  stop = true;
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
  37 +
// Entry point: opens the default microphone via portaudio, feeds fixed-size
// windows to the silero VAD, and prints the duration of each detected
// speech segment until Ctrl+C.
int32_t main(int32_t argc, char *argv[]) {
  signal(SIGINT, Handler);

  const char *kUsageMessage = R"usage(
This program shows how to use VAD in sherpa-onnx.

  ./bin/sherpa-onnx-vad-microphone \
    --silero-vad-model=/path/to/silero_vad.onnx \
    --provider=cpu \
    --num-threads=1

Please download silero_vad.onnx from
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx

For instance, use
wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
)usage";

  sherpa_onnx::ParseOptions po(kUsageMessage);
  sherpa_onnx::VadModelConfig config;

  config.Register(&po);
  po.Read(argc, argv);
  // This program accepts only options, no positional arguments.
  if (po.NumArgs() != 0) {
    po.PrintUsage();
    exit(EXIT_FAILURE);
  }

  fprintf(stderr, "%s\n", config.ToString().c_str());

  if (!config.Validate()) {
    fprintf(stderr, "Errors in config!\n");
    return -1;
  }

  // Initializes/terminates the portaudio library via RAII.
  sherpa_onnx::Microphone mic;

  PaDeviceIndex num_devices = Pa_GetDeviceCount();
  fprintf(stderr, "Num devices: %d\n", num_devices);

  PaStreamParameters param;

  param.device = Pa_GetDefaultInputDevice();
  if (param.device == paNoDevice) {
    fprintf(stderr, "No default input device found\n");
    exit(EXIT_FAILURE);
  }
  fprintf(stderr, "Use default device: %d\n", param.device);

  const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
  fprintf(stderr, "  Name: %s\n", info->name);
  fprintf(stderr, "  Max input channels: %d\n", info->maxInputChannels);

  param.channelCount = 1;
  param.sampleFormat = paFloat32;

  param.suggestedLatency = info->defaultLowInputLatency;
  param.hostApiSpecificStreamInfo = nullptr;
  float sample_rate = 16000;

  PaStream *stream;
  // NOTE(review): the user_data argument (&config.silero_vad.window_size)
  // is never read by RecordCallback — presumably it could be nullptr;
  // confirm before relying on it.
  PaError err =
      Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
                    sample_rate,
                    0,          // frames per buffer
                    paClipOff,  // we won't output out of range samples
                                // so don't bother clipping them
                    RecordCallback, &config.silero_vad.window_size);
  if (err != paNoError) {
    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
    exit(EXIT_FAILURE);
  }

  err = Pa_StartStream(stream);

  auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config);

  fprintf(stderr, "Started\n");

  if (err != paNoError) {
    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
    exit(EXIT_FAILURE);
  }

  int32_t window_size = config.silero_vad.window_size;
  // Tracks whether "Detected speech!" has already been printed for the
  // current speech run, so it is shown once per utterance.
  bool printed = false;

  while (!stop) {
    {
      // Drain full windows from the shared buffer while holding the lock,
      // so the audio callback cannot push concurrently.
      std::lock_guard<std::mutex> lock(mutex);

      while (buffer.Size() >= window_size) {
        std::vector<float> samples = buffer.Get(buffer.Head(), window_size);
        buffer.Pop(window_size);
        vad->AcceptWaveform(samples.data(), samples.size());

        if (vad->IsSpeechDetected() && !printed) {
          printed = true;
          fprintf(stderr, "\nDetected speech!\n");
        }
        if (!vad->IsSpeechDetected()) {
          printed = false;
        }

        // Report each completed speech segment's duration.
        while (!vad->Empty()) {
          float duration = vad->Front().samples.size() / sample_rate;
          vad->Pop();
          fprintf(stderr, "Duration: %.3f seconds\n", duration);
        }
      }
    }
    Pa_Sleep(100);  // sleep for 100ms
  }

  err = Pa_CloseStream(stream);
  if (err != paNoError) {
    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
    exit(EXIT_FAILURE);
  }

  return 0;
}
  1 +// sherpa-onnx/csrc/silero-vad-model-config.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
#include "sherpa-onnx/csrc/silero-vad-model-config.h"

#include <sstream>

#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
  9 +
  10 +namespace sherpa_onnx {
  11 +
  12 +void SileroVadModelConfig::Register(ParseOptions *po) {
  13 + po->Register("silero-vad-model", &model, "Path to silero VAD ONNX model.");
  14 +
  15 + po->Register("silero-vad-threshold", &threshold,
  16 + "Speech threshold. Silero VAD outputs speech probabilities for "
  17 + "each audio chunk, probabilities ABOVE this value are "
  18 + "considered as SPEECH. It is better to tune this parameter for "
  19 + "each dataset separately, but lazy "
  20 + "0.5 is pretty good for most datasets.");
  21 +
  22 + po->Register(
  23 + "silero-vad-min-silence-duration", &min_silence_duration,
  24 + "In seconds. In the end of each speech chunk wait for "
  25 + "--silero-vad-min-silence-duration seconds before separating it");
  26 +
  27 + po->Register("silero-vad-min-speech-duration", &min_speech_duration,
  28 + "In seconds. In the end of each silence chunk wait for "
  29 + "--silero-vad-min-speech-duration seconds before separating it");
  30 +
  31 + po->Register(
  32 + "silero-vad-window-size", &window_size,
  33 + "In samples. Audio chunks of --silero-vad-window-size samples are fed "
  34 + "to the silero VAD model. WARNING! Silero VAD models were trained using "
  35 + "512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples "
  36 + "for 8000 sample rate. Values other than these may affect model "
  37 + "perfomance!");
  38 +}
  39 +
  40 +bool SileroVadModelConfig::Validate() const {
  41 + if (model.empty()) {
  42 + SHERPA_ONNX_LOGE("Please provide --silero-vad-model");
  43 + return false;
  44 + }
  45 +
  46 + if (!FileExists(model)) {
  47 + SHERPA_ONNX_LOGE("Silero vad model file %s does not exist", model.c_str());
  48 + return false;
  49 + }
  50 +
  51 + if (threshold < 0.01) {
  52 + SHERPA_ONNX_LOGE(
  53 + "Please use a larger value for --silero-vad-threshold. Given: %f",
  54 + threshold);
  55 + return false;
  56 + }
  57 +
  58 + if (threshold >= 1) {
  59 + SHERPA_ONNX_LOGE(
  60 + "Please use a smaller value for --silero-vad-threshold. Given: %f",
  61 + threshold);
  62 + return false;
  63 + }
  64 +
  65 + return true;
  66 +}
  67 +
  68 +std::string SileroVadModelConfig::ToString() const {
  69 + std::ostringstream os;
  70 +
  71 + os << "SilerVadModelConfig(";
  72 + os << "model=\"" << model << "\", ";
  73 + os << "threshold=" << threshold << ", ";
  74 + os << "min_silence_duration=" << min_silence_duration << ", ";
  75 + os << "min_speech_duration=" << min_speech_duration << ", ";
  76 + os << "window_size=" << window_size << ")";
  77 +
  78 + return os.str();
  79 +}
  80 +
  81 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/silero-vad-model-config.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_
  5 +#define SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_
  6 +
  7 +#include <string>
  8 +
  9 +#include "sherpa-onnx/csrc/parse-options.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
struct SileroVadModelConfig {
  // Path to the silero VAD ONNX model file.
  std::string model;

  // threshold to classify a segment as speech
  //
  // If the predicted probability of a segment is larger than this
  // value, then it is classified as speech.
  float threshold = 0.5;

  float min_silence_duration = 0.5;  // in seconds

  float min_speech_duration = 0.25;  // in seconds

  // 512, 1024, 1536 samples for 16000 Hz
  // 256, 512, 768 samples for 8000 Hz
  int window_size = 512;  // in samples

  SileroVadModelConfig() = default;

  // Register the --silero-vad-* command-line options.
  void Register(ParseOptions *po);

  // Return true if the config is usable (model file exists, sane threshold).
  bool Validate() const;

  // Human-readable representation for logging.
  std::string ToString() const;
};
  38 +
  39 +} // namespace sherpa_onnx
  40 +
  41 +#endif // SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_
  1 +// sherpa-onnx/csrc/silero-vad-model.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/silero-vad-model.h"
  6 +
  7 +#include <string>
  8 +#include <utility>
  9 +#include <vector>
  10 +
  11 +#include "sherpa-onnx/csrc/macros.h"
  12 +#include "sherpa-onnx/csrc/onnx-utils.h"
  13 +#include "sherpa-onnx/csrc/session.h"
  14 +
  15 +namespace sherpa_onnx {
  16 +
  17 +class SileroVadModel::Impl {
  18 + public:
  // Loads the ONNX model and converts the configured durations (seconds)
  // into sample counts at the model's fixed 16 kHz rate.
  explicit Impl(const VadModelConfig &config)
      : config_(config),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{} {
    auto buf = ReadFile(config.silero_vad.model);
    Init(buf.data(), buf.size());

    // This implementation only supports 16 kHz input; abort otherwise.
    sample_rate_ = config.sample_rate;
    if (sample_rate_ != 16000) {
      SHERPA_ONNX_LOGE("Expected sample rate 16000. Given: %d",
                       config.sample_rate);
      exit(-1);
    }

    min_silence_samples_ =
        sample_rate_ * config_.silero_vad.min_silence_duration;

    min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
  }
  39 +
  40 + void Reset() {
  41 + // 2 - number of LSTM layer
  42 + // 1 - batch size
  43 + // 64 - hidden dim
  44 + std::array<int64_t, 3> shape{2, 1, 64};
  45 +
  46 + Ort::Value h =
  47 + Ort::Value::CreateTensor<float>(allocator_, shape.data(), shape.size());
  48 +
  49 + Ort::Value c =
  50 + Ort::Value::CreateTensor<float>(allocator_, shape.data(), shape.size());
  51 +
  52 + Fill<float>(&h, 0);
  53 + Fill<float>(&c, 0);
  54 +
  55 + states_.clear();
  56 +
  57 + states_.reserve(2);
  58 + states_.push_back(std::move(h));
  59 + states_.push_back(std::move(c));
  60 +
  61 + triggered_ = false;
  62 + current_sample_ = 0;
  63 + temp_start_ = 0;
  64 + temp_end_ = 0;
  65 + }
  66 +
  67 + bool IsSpeech(const float *samples, int32_t n) {
  68 + if (n != config_.silero_vad.window_size) {
  69 + SHERPA_ONNX_LOGE("n: %d != window_size: %d", n,
  70 + config_.silero_vad.window_size);
  71 + exit(-1);
  72 + }
  73 +
  74 + auto memory_info =
  75 + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  76 +
  77 + std::array<int64_t, 2> x_shape = {1, n};
  78 +
  79 + Ort::Value x =
  80 + Ort::Value::CreateTensor(memory_info, const_cast<float *>(samples), n,
  81 + x_shape.data(), x_shape.size());
  82 +
  83 + int64_t sr_shape = 1;
  84 + Ort::Value sr =
  85 + Ort::Value::CreateTensor(memory_info, &sample_rate_, 1, &sr_shape, 1);
  86 +
  87 + std::array<Ort::Value, 4> inputs = {std::move(x), std::move(sr),
  88 + std::move(states_[0]),
  89 + std::move(states_[1])};
  90 +
  91 + auto out =
  92 + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
  93 + output_names_ptr_.data(), output_names_ptr_.size());
  94 +
  95 + states_[0] = std::move(out[1]);
  96 + states_[1] = std::move(out[2]);
  97 +
  98 + float prob = out[0].GetTensorData<float>()[0];
  99 +
  100 + float threshold = config_.silero_vad.threshold;
  101 +
  102 + current_sample_ += config_.silero_vad.window_size;
  103 +
  104 + if (prob > threshold && temp_end_ != 0) {
  105 + temp_end_ = 0;
  106 + }
  107 +
  108 + if (prob > threshold && temp_start_ == 0) {
  109 + // start speaking, but we require that it must satisfy
  110 + // min_speech_duration
  111 + temp_start_ = current_sample_;
  112 + return false;
  113 + }
  114 +
  115 + if (prob > threshold && temp_start_ != 0 && !triggered_) {
  116 + if (current_sample_ - temp_start_ < min_speech_samples_) {
  117 + return false;
  118 + }
  119 +
  120 + triggered_ = true;
  121 +
  122 + return true;
  123 + }
  124 +
  125 + if ((prob < threshold) && !triggered_) {
  126 + // silence
  127 + temp_start_ = 0;
  128 + temp_end_ = 0;
  129 + return false;
  130 + }
  131 +
  132 + if ((prob > threshold - 0.15) && triggered_) {
  133 + // speaking
  134 + return true;
  135 + }
  136 +
  137 + if ((prob > threshold) && !triggered_) {
  138 + // start speaking
  139 + triggered_ = true;
  140 +
  141 + return true;
  142 + }
  143 +
  144 + if ((prob < threshold) && triggered_) {
  145 + // stop to speak
  146 + if (temp_end_ == 0) {
  147 + temp_end_ = current_sample_;
  148 + }
  149 +
  150 + if (current_sample_ - temp_end_ < min_silence_samples_) {
  151 + // continue speaking
  152 + return true;
  153 + }
  154 + // stopped speaking
  155 + temp_start_ = 0;
  156 + temp_end_ = 0;
  157 + triggered_ = false;
  158 + return false;
  159 + }
  160 +
  161 + return false;
  162 + }
  163 +
  164 + int32_t WindowSize() const { return config_.silero_vad.window_size; }
  165 +
  166 + int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }
  167 +
  168 + int32_t MinSpeechDurationSamples() const { return min_speech_samples_; }
  169 +
  170 + private:
  171 + void Init(void *model_data, size_t model_data_length) {
  172 + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
  173 + sess_opts_);
  174 +
  175 + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
  176 + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
  177 + Check();
  178 +
  179 + Reset();
  180 + }
  181 +
  182 + void Check() {
  183 + if (input_names_.size() != 4) {
  184 + SHERPA_ONNX_LOGE("Expect 4 inputs. Given: %d",
  185 + static_cast<int32_t>(input_names_.size()));
  186 + exit(-1);
  187 + }
  188 +
  189 + if (input_names_[0] != "input") {
  190 + SHERPA_ONNX_LOGE("Input[0]: %s. Expected: input",
  191 + input_names_[0].c_str());
  192 + exit(-1);
  193 + }
  194 +
  195 + if (input_names_[1] != "sr") {
  196 + SHERPA_ONNX_LOGE("Input[1]: %s. Expected: sr", input_names_[1].c_str());
  197 + exit(-1);
  198 + }
  199 +
  200 + if (input_names_[2] != "h") {
  201 + SHERPA_ONNX_LOGE("Input[2]: %s. Expected: h", input_names_[2].c_str());
  202 + exit(-1);
  203 + }
  204 +
  205 + if (input_names_[3] != "c") {
  206 + SHERPA_ONNX_LOGE("Input[3]: %s. Expected: c", input_names_[3].c_str());
  207 + exit(-1);
  208 + }
  209 +
  210 + // Now for outputs
  211 + if (output_names_.size() != 3) {
  212 + SHERPA_ONNX_LOGE("Expect 3 outputs. Given: %d",
  213 + static_cast<int32_t>(output_names_.size()));
  214 + exit(-1);
  215 + }
  216 +
  217 + if (output_names_[0] != "output") {
  218 + SHERPA_ONNX_LOGE("Output[0]: %s. Expected: output",
  219 + output_names_[0].c_str());
  220 + exit(-1);
  221 + }
  222 +
  223 + if (output_names_[1] != "hn") {
  224 + SHERPA_ONNX_LOGE("Output[1]: %s. Expected: sr", output_names_[1].c_str());
  225 + exit(-1);
  226 + }
  227 +
  228 + if (output_names_[2] != "cn") {
  229 + SHERPA_ONNX_LOGE("Output[2]: %s. Expected: sr", output_names_[2].c_str());
  230 + exit(-1);
  231 + }
  232 + }
  233 +
  234 + private:
  235 + VadModelConfig config_;
  236 +
  237 + Ort::Env env_;
  238 + Ort::SessionOptions sess_opts_;
  239 + Ort::AllocatorWithDefaultOptions allocator_;
  240 +
  241 + std::unique_ptr<Ort::Session> sess_;
  242 +
  243 + std::vector<std::string> input_names_;
  244 + std::vector<const char *> input_names_ptr_;
  245 +
  246 + std::vector<std::string> output_names_;
  247 + std::vector<const char *> output_names_ptr_;
  248 +
  249 + std::vector<Ort::Value> states_;
  250 + int64_t sample_rate_;
  251 + int32_t min_silence_samples_;
  252 + int32_t min_speech_samples_;
  253 +
  254 + bool triggered_ = false;
  255 + int32_t current_sample_ = 0;
  256 + int32_t temp_start_ = 0;
  257 + int32_t temp_end_ = 0;
  258 +};
  259 +
// All public methods simply forward to the pimpl object.

SileroVadModel::SileroVadModel(const VadModelConfig &config)
    : impl_(std::make_unique<Impl>(config)) {}

SileroVadModel::~SileroVadModel() = default;

void SileroVadModel::Reset() { return impl_->Reset(); }

bool SileroVadModel::IsSpeech(const float *samples, int32_t n) {
  return impl_->IsSpeech(samples, n);
}

int32_t SileroVadModel::WindowSize() const { return impl_->WindowSize(); }

int32_t SileroVadModel::MinSilenceDurationSamples() const {
  return impl_->MinSilenceDurationSamples();
}

int32_t SileroVadModel::MinSpeechDurationSamples() const {
  return impl_->MinSpeechDurationSamples();
}
  280 +
  281 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/silero-vad-model.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_
  5 +#define SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_
  6 +
  7 +#include <memory>
  8 +
  9 +#include "sherpa-onnx/csrc/vad-model.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
// VadModel implementation backed by the silero-vad onnx model.
// See https://github.com/snakers4/silero-vad
class SileroVadModel : public VadModel {
 public:
  explicit SileroVadModel(const VadModelConfig &config);
  ~SileroVadModel() override;

  // reset the internal model states
  void Reset() override;

  /**
   * @param samples Pointer to a 1-d array containing audio samples.
   *                Each sample should be normalized to the range [-1, 1].
   * @param n Number of samples. Must equal WindowSize().
   *
   * @return Return true if speech is detected. Return false otherwise.
   */
  bool IsSpeech(const float *samples, int32_t n) override;

  // Number of samples IsSpeech() expects per call.
  int32_t WindowSize() const override;

  int32_t MinSilenceDurationSamples() const override;
  int32_t MinSpeechDurationSamples() const override;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};
  39 +
  40 +} // namespace sherpa_onnx
  41 +
  42 +#endif // SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_
  1 +// sherpa-onnx/csrc/vad-model-config.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/vad-model-config.h"
  6 +
  7 +#include <sstream>
  8 +#include <string>
  9 +
  10 +namespace sherpa_onnx {
  11 +
// Register command-line options for the VAD model (including the nested
// silero-vad options) with the given ParseOptions.
void VadModelConfig::Register(ParseOptions *po) {
  silero_vad.Register(po);

  po->Register("vad-sample-rate", &sample_rate,
               "Sample rate expected by the VAD model");

  po->Register("vad-num-threads", &num_threads,
               "Number of threads to run the VAD model");

  po->Register("vad-provider", &provider,
               "Specify a provider to run the VAD model. Supported values: "
               "cpu, cuda, coreml");

  po->Register("vad-debug", &debug,
               "true to display debug information when loading vad models");
}
  28 +
// Valid iff the nested silero-vad config is valid.
bool VadModelConfig::Validate() const { return silero_vad.Validate(); }
  30 +
  31 +std::string VadModelConfig::ToString() const {
  32 + std::ostringstream os;
  33 +
  34 + os << "VadModelConfig(";
  35 + os << "silero_vad=" << silero_vad.ToString() << ", ";
  36 + os << "sample_rate=" << sample_rate << ", ";
  37 + os << "num_threads=" << num_threads << ", ";
  38 + os << "provider=\"" << provider << "\", ";
  39 + os << "debug=" << (debug ? "True" : "False") << ")";
  40 +
  41 + return os.str();
  42 +}
  43 +
  44 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/vad-model-config.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_
  5 +#define SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_
  6 +
  7 +#include <string>
  8 +
  9 +#include "sherpa-onnx/csrc/parse-options.h"
  10 +#include "sherpa-onnx/csrc/silero-vad-model-config.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
// Top-level configuration for creating a VadModel.
struct VadModelConfig {
  // Options for the silero-vad model (currently the only supported model).
  SileroVadModelConfig silero_vad;

  // Sample rate the VAD model expects.
  int32_t sample_rate = 16000;
  int32_t num_threads = 1;
  // Execution provider, e.g. "cpu", "cuda", "coreml".
  std::string provider = "cpu";

  // true to show debug information when loading models
  bool debug = false;

  VadModelConfig() = default;

  VadModelConfig(const SileroVadModelConfig &silero_vad, int32_t sample_rate,
                 int32_t num_threads, const std::string &provider, bool debug)
      : silero_vad(silero_vad),
        sample_rate(sample_rate),
        num_threads(num_threads),
        provider(provider),
        debug(debug) {}

  void Register(ParseOptions *po);
  bool Validate() const;

  std::string ToString() const;
};
  39 +
  40 +} // namespace sherpa_onnx
  41 +
  42 +#endif // SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_
  1 +// sherpa-onnx/csrc/vad-model.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/vad-model.h"
  6 +
  7 +#include "sherpa-onnx/csrc/silero-vad-model.h"
  8 +
  9 +namespace sherpa_onnx {
  10 +
// Factory. At the moment it always returns a SileroVadModel.
std::unique_ptr<VadModel> VadModel::Create(const VadModelConfig &config) {
  // TODO(fangjun): Support other VAD models.
  return std::make_unique<SileroVadModel>(config);
}
  15 +
  16 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/vad-model.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_VAD_MODEL_H_
  5 +#define SHERPA_ONNX_CSRC_VAD_MODEL_H_
  6 +
  7 +#include <memory>
  8 +
  9 +#include "sherpa-onnx/csrc/vad-model-config.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
// Abstract interface for voice-activity-detection models.
class VadModel {
 public:
  virtual ~VadModel() = default;

  // Create a concrete VadModel from the given config.
  static std::unique_ptr<VadModel> Create(const VadModelConfig &config);

  // reset the internal model states
  virtual void Reset() = 0;

  /**
   * @param samples Pointer to a 1-d array containing audio samples.
   *                Each sample should be normalized to the range [-1, 1].
   * @param n Number of samples. Should be equal to WindowSize()
   *
   * @return Return true if speech is detected. Return false otherwise.
   */
  virtual bool IsSpeech(const float *samples, int32_t n) = 0;

  // Number of samples IsSpeech() expects per call.
  virtual int32_t WindowSize() const = 0;

  virtual int32_t MinSilenceDurationSamples() const = 0;
  virtual int32_t MinSpeechDurationSamples() const = 0;
};
  36 +
  37 +} // namespace sherpa_onnx
  38 +
  39 +#endif // SHERPA_ONNX_CSRC_VAD_MODEL_H_
  1 +// sherpa-onnx/csrc/voice-activity-detector.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/voice-activity-detector.h"
  6 +
  7 +#include <queue>
  8 +#include <utility>
  9 +
  10 +#include "sherpa-onnx/csrc/circular-buffer.h"
  11 +#include "sherpa-onnx/csrc/vad-model.h"
  12 +
  13 +namespace sherpa_onnx {
  14 +
  15 +class VoiceActivityDetector::Impl {
  16 + public:
  17 + explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60)
  18 + : model_(VadModel::Create(config)),
  19 + config_(config),
  20 + buffer_(buffer_size_in_seconds * config.sample_rate) {}
  21 +
  22 + void AcceptWaveform(const float *samples, int32_t n) {
  23 + buffer_.Push(samples, n);
  24 +
  25 + bool is_speech = model_->IsSpeech(samples, n);
  26 + if (is_speech) {
  27 + if (start_ == -1) {
  28 + // beginning of speech
  29 + start_ = buffer_.Tail() - 2 * model_->WindowSize() -
  30 + model_->MinSpeechDurationSamples();
  31 + }
  32 + } else {
  33 + // non-speech
  34 + if (start_ != -1) {
  35 + // end of speech, save the speech segment
  36 + int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
  37 +
  38 + std::vector<float> samples = buffer_.Get(start_, end - start_);
  39 + SpeechSegment segment;
  40 +
  41 + segment.start = start_;
  42 + segment.samples = std::move(samples);
  43 +
  44 + segments_.push(std::move(segment));
  45 +
  46 + buffer_.Pop(end - buffer_.Head());
  47 + }
  48 +
  49 + start_ = -1;
  50 + }
  51 + }
  52 +
  53 + bool Empty() const { return segments_.empty(); }
  54 +
  55 + void Pop() { segments_.pop(); }
  56 +
  57 + const SpeechSegment &Front() const { return segments_.front(); }
  58 +
  59 + void Reset() {
  60 + std::queue<SpeechSegment>().swap(segments_);
  61 +
  62 + model_->Reset();
  63 + buffer_.Reset();
  64 +
  65 + start_ = -1;
  66 + }
  67 +
  68 + bool IsSpeechDetected() const { return start_ != -1; }
  69 +
  70 + private:
  71 + std::queue<SpeechSegment> segments_;
  72 +
  73 + std::unique_ptr<VadModel> model_;
  74 + VadModelConfig config_;
  75 + CircularBuffer buffer_;
  76 +
  77 + int32_t start_ = -1;
  78 +};
  79 +
// All public methods simply forward to the pimpl object.

VoiceActivityDetector::VoiceActivityDetector(
    const VadModelConfig &config, float buffer_size_in_seconds /*= 60*/)
    : impl_(std::make_unique<Impl>(config, buffer_size_in_seconds)) {}

VoiceActivityDetector::~VoiceActivityDetector() = default;

void VoiceActivityDetector::AcceptWaveform(const float *samples, int32_t n) {
  impl_->AcceptWaveform(samples, n);
}

bool VoiceActivityDetector::Empty() const { return impl_->Empty(); }

void VoiceActivityDetector::Pop() { impl_->Pop(); }

const SpeechSegment &VoiceActivityDetector::Front() const {
  return impl_->Front();
}

void VoiceActivityDetector::Reset() { impl_->Reset(); }

bool VoiceActivityDetector::IsSpeechDetected() const {
  return impl_->IsSpeechDetected();
}
  103 +
  104 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/voice-activity-detector.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +#ifndef SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_
  5 +#define SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_
  6 +
  7 +#include <memory>
  8 +#include <vector>
  9 +
  10 +#include "sherpa-onnx/csrc/vad-model-config.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
// A contiguous run of speech extracted from the audio stream.
struct SpeechSegment {
  int32_t start;               // index of the first sample; in samples
  std::vector<float> samples;  // audio of this segment
};
  18 +
// Buffers incoming audio, runs a VadModel over it, and queues completed
// speech segments for retrieval via Front()/Pop().
class VoiceActivityDetector {
 public:
  // buffer_size_in_seconds controls the internal circular-buffer capacity.
  explicit VoiceActivityDetector(const VadModelConfig &config,
                                 float buffer_size_in_seconds = 60);
  ~VoiceActivityDetector();

  // Feed audio; n should equal the model's window size.
  void AcceptWaveform(const float *samples, int32_t n);
  // True if no completed speech segment is available.
  bool Empty() const;
  // Drop the oldest completed segment.
  void Pop();
  // Oldest completed segment. Call only when !Empty().
  const SpeechSegment &Front() const;

  // True while inside a not-yet-finished speech segment.
  bool IsSpeechDetected() const;

  // Clear all buffered audio, queued segments, and model state.
  void Reset();

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};
  38 +
  39 +} // namespace sherpa_onnx
  40 +
  41 +#endif // SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_
1 include_directories(${CMAKE_SOURCE_DIR}) 1 include_directories(${CMAKE_SOURCE_DIR})
2 2
3 pybind11_add_module(_sherpa_onnx 3 pybind11_add_module(_sherpa_onnx
  4 + circular-buffer.cc
4 display.cc 5 display.cc
5 endpoint.cc 6 endpoint.cc
6 features.cc 7 features.cc
@@ -20,6 +21,10 @@ pybind11_add_module(_sherpa_onnx @@ -20,6 +21,10 @@ pybind11_add_module(_sherpa_onnx
20 online-stream.cc 21 online-stream.cc
21 online-transducer-model-config.cc 22 online-transducer-model-config.cc
22 sherpa-onnx.cc 23 sherpa-onnx.cc
  24 + silero-vad-model-config.cc
  25 + vad-model-config.cc
  26 + vad-model.cc
  27 + voice-activity-detector.cc
23 ) 28 )
24 29
25 if(APPLE) 30 if(APPLE)
  1 +// sherpa-onnx/python/csrc/circular-buffer.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/circular-buffer.h"
  6 +
  7 +#include <vector>
  8 +
  9 +#include "sherpa-onnx/csrc/circular-buffer.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +void PybindCircularBuffer(py::module *m) {
  14 + using PyClass = CircularBuffer;
  15 + py::class_<PyClass>(*m, "CircularBuffer")
  16 + .def(py::init<int32_t>(), py::arg("capacity"))
  17 + .def(
  18 + "push",
  19 + [](PyClass &self, const std::vector<float> &samples) {
  20 + self.Push(samples.data(), samples.size());
  21 + },
  22 + py::arg("samples"))
  23 + .def("get", &PyClass::Get, py::arg("start_index"), py::arg("n"))
  24 + .def("pop", &PyClass::Pop, py::arg("n"))
  25 + .def("reset", &PyClass::Reset)
  26 + .def_property_readonly("size", &PyClass::Size)
  27 + .def_property_readonly("head", &PyClass::Head)
  28 + .def_property_readonly("tail", &PyClass::Tail);
  29 +}
  30 +
  31 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/python/csrc/circular-buffer.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
  6 +#define SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
  7 +
  8 +#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
  9 +
namespace sherpa_onnx {

// Registers the CircularBuffer Python bindings on module *m.
void PybindCircularBuffer(py::module *m);

}  // namespace sherpa_onnx
  15 +
  16 +#endif // SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
@@ -4,6 +4,7 @@ @@ -4,6 +4,7 @@
4 4
5 #include "sherpa-onnx/python/csrc/sherpa-onnx.h" 5 #include "sherpa-onnx/python/csrc/sherpa-onnx.h"
6 6
  7 +#include "sherpa-onnx/python/csrc/circular-buffer.h"
7 #include "sherpa-onnx/python/csrc/display.h" 8 #include "sherpa-onnx/python/csrc/display.h"
8 #include "sherpa-onnx/python/csrc/endpoint.h" 9 #include "sherpa-onnx/python/csrc/endpoint.h"
9 #include "sherpa-onnx/python/csrc/features.h" 10 #include "sherpa-onnx/python/csrc/features.h"
@@ -15,6 +16,9 @@ @@ -15,6 +16,9 @@
15 #include "sherpa-onnx/python/csrc/online-model-config.h" 16 #include "sherpa-onnx/python/csrc/online-model-config.h"
16 #include "sherpa-onnx/python/csrc/online-recognizer.h" 17 #include "sherpa-onnx/python/csrc/online-recognizer.h"
17 #include "sherpa-onnx/python/csrc/online-stream.h" 18 #include "sherpa-onnx/python/csrc/online-stream.h"
  19 +#include "sherpa-onnx/python/csrc/vad-model-config.h"
  20 +#include "sherpa-onnx/python/csrc/vad-model.h"
  21 +#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
18 22
19 namespace sherpa_onnx { 23 namespace sherpa_onnx {
20 24
@@ -34,6 +38,11 @@ PYBIND11_MODULE(_sherpa_onnx, m) { @@ -34,6 +38,11 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
34 PybindOfflineLMConfig(&m); 38 PybindOfflineLMConfig(&m);
35 PybindOfflineModelConfig(&m); 39 PybindOfflineModelConfig(&m);
36 PybindOfflineRecognizer(&m); 40 PybindOfflineRecognizer(&m);
  41 +
  42 + PybindVadModelConfig(&m);
  43 + PybindVadModel(&m);
  44 + PybindCircularBuffer(&m);
  45 + PybindVoiceActivityDetector(&m);
37 } 46 }
38 47
39 } // namespace sherpa_onnx 48 } // namespace sherpa_onnx
  1 +// sherpa-onnx/python/csrc/silero-vad-model-config.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
  6 +
  7 +#include <memory>
  8 +#include <string>
  9 +
  10 +#include "sherpa-onnx/csrc/silero-vad-model-config.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
  14 +void PybindSileroVadModelConfig(py::module *m) {
  15 + using PyClass = SileroVadModelConfig;
  16 + py::class_<PyClass>(*m, "SileroVadModelConfig")
  17 + .def(py::init<>())
  18 + .def(py::init([](const std::string &model, float threshold,
  19 + float min_silence_duration, float min_speech_duration,
  20 + int32_t window_size) -> std::unique_ptr<PyClass> {
  21 + auto ans = std::make_unique<PyClass>();
  22 +
  23 + ans->model = model;
  24 + ans->threshold = threshold;
  25 + ans->min_silence_duration = min_silence_duration;
  26 + ans->min_speech_duration = min_speech_duration;
  27 + ans->window_size = window_size;
  28 +
  29 + return ans;
  30 + }),
  31 + py::arg("model"), py::arg("threshold") = 0.5,
  32 + py::arg("min_silence_duration") = 0.5,
  33 + py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512)
  34 + .def_readwrite("model", &PyClass::model)
  35 + .def_readwrite("threshold", &PyClass::threshold)
  36 + .def_readwrite("min_silence_duration", &PyClass::min_silence_duration)
  37 + .def_readwrite("min_speech_duration", &PyClass::min_speech_duration)
  38 + .def_readwrite("window_size", &PyClass::window_size)
  39 + .def("__str__", &PyClass::ToString)
  40 + .def("validate", &PyClass::Validate);
  41 +}
  42 +
  43 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/python/csrc/silero-vad-model-config.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
  6 +#define SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
  7 +
  8 +#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
  9 +
namespace sherpa_onnx {

// Registers the SileroVadModelConfig Python bindings on module *m.
void PybindSileroVadModelConfig(py::module *m);

}  // namespace sherpa_onnx
  15 +
  16 +#endif // SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
// sherpa-onnx/python/csrc/vad-model-config.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/vad-model-config.h"
  6 +
  7 +#include <string>
  8 +
  9 +#include "sherpa-onnx/csrc/vad-model-config.h"
  10 +#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
  14 +void PybindVadModelConfig(py::module *m) {
  15 + PybindSileroVadModelConfig(m);
  16 +
  17 + using PyClass = VadModelConfig;
  18 + py::class_<PyClass>(*m, "VadModelConfig")
  19 + .def(py::init<>())
  20 + .def(py::init<const SileroVadModelConfig &, int32_t, int32_t,
  21 + const std::string &, bool>(),
  22 + py::arg("silero_vad"), py::arg("sample_rate") = 16000,
  23 + py::arg("num_threads") = 1, py::arg("provider") = "cpu",
  24 + py::arg("debug") = false)
  25 + .def_readwrite("silero_vad", &PyClass::silero_vad)
  26 + .def_readwrite("sample_rate", &PyClass::sample_rate)
  27 + .def_readwrite("num_threads", &PyClass::num_threads)
  28 + .def_readwrite("provider", &PyClass::provider)
  29 + .def_readwrite("debug", &PyClass::debug)
  30 + .def("__str__", &PyClass::ToString)
  31 + .def("validate", &PyClass::Validate);
  32 +}
  33 +
  34 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/python/csrc/vad-model-config.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
  6 +#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
  7 +
  8 +#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
  9 +
namespace sherpa_onnx {

// Registers the VadModelConfig Python bindings on module *m.
void PybindVadModelConfig(py::module *m);

}  // namespace sherpa_onnx
  15 +
  16 +#endif // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
  1 +// sherpa-onnx/python/csrc/vad-model.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/vad-model.h"
  6 +
  7 +#include <vector>
  8 +
  9 +#include "sherpa-onnx/csrc/vad-model.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +void PybindVadModel(py::module *m) {
  14 + using PyClass = VadModel;
  15 + py::class_<PyClass>(*m, "VadModel")
  16 + .def_static("create", &PyClass::Create, py::arg("config"))
  17 + .def("reset", &PyClass::Reset)
  18 + .def(
  19 + "is_speech",
  20 + [](PyClass &self, const std::vector<float> &samples) -> bool {
  21 + return self.IsSpeech(samples.data(), samples.size());
  22 + },
  23 + py::arg("samples"))
  24 + .def("window_size", &PyClass::WindowSize)
  25 + .def("min_silence_duration_samples", &PyClass::MinSilenceDurationSamples)
  26 + .def("min_speech_duration_samples", &PyClass::MinSpeechDurationSamples);
  27 +}
  28 +
  29 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/python/csrc/vad-model.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
  6 +#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
  7 +
  8 +#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
  9 +
namespace sherpa_onnx {

// Registers the VadModel Python bindings on module *m.
void PybindVadModel(py::module *m);

}  // namespace sherpa_onnx
  15 +
  16 +#endif // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
  1 +// sherpa-onnx/python/csrc/voice-activity-detector.cc
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
  6 +
  7 +#include <vector>
  8 +
  9 +#include "sherpa-onnx/csrc/voice-activity-detector.h"
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +void PybindSpeechSegment(py::module *m) {
  14 + using PyClass = SpeechSegment;
  15 + py::class_<PyClass>(*m, "SpeechSegment")
  16 + .def_property_readonly("start",
  17 + [](const PyClass &self) { return self.start; })
  18 + .def_property_readonly("samples",
  19 + [](const PyClass &self) { return self.samples; });
  20 +}
  21 +
  22 +void PybindVoiceActivityDetector(py::module *m) {
  23 + PybindSpeechSegment(m);
  24 + using PyClass = VoiceActivityDetector;
  25 + py::class_<PyClass>(*m, "VoiceActivityDetector")
  26 + .def(py::init<const VadModelConfig &, float>(), py::arg("config"),
  27 + py::arg("buffer_size_in_seconds") = 60)
  28 + .def(
  29 + "accept_waveform",
  30 + [](PyClass &self, const std::vector<float> &samples) {
  31 + self.AcceptWaveform(samples.data(), samples.size());
  32 + },
  33 + py::arg("samples"))
  34 + .def("empty", &PyClass::Empty)
  35 + .def("pop", &PyClass::Pop)
  36 + .def("is_speech_detected", &PyClass::IsSpeechDetected)
  37 + .def("reset", &PyClass::Reset)
  38 + .def_property_readonly("front", &PyClass::Front);
  39 +}
  40 +
  41 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/python/csrc/voice-activity-detector.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
  6 +#define SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
  7 +
  8 +#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
  9 +
namespace sherpa_onnx {

// Registers the VoiceActivityDetector Python bindings on module *m.
void PybindVoiceActivityDetector(py::module *m);

}  // namespace sherpa_onnx
  15 +
  16 +#endif // SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
1 from typing import Dict, List, Optional 1 from typing import Dict, List, Optional
2 2
3 -from _sherpa_onnx import Display, OfflineStream, OnlineStream 3 +from _sherpa_onnx import (
  4 + CircularBuffer,
  5 + Display,
  6 + OfflineStream,
  7 + OnlineStream,
  8 + SileroVadModelConfig,
  9 + SpeechSegment,
  10 + VadModel,
  11 + VadModelConfig,
  12 + VoiceActivityDetector,
  13 +)
4 14
5 from .offline_recognizer import OfflineRecognizer 15 from .offline_recognizer import OfflineRecognizer
6 from .online_recognizer import OnlineRecognizer 16 from .online_recognizer import OnlineRecognizer