Fangjun Kuang
Committed by GitHub

Add Silero VAD (#313)

cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)

# Fix: the version was assigned twice ("1.7.14" then "1.7.15"); the first
# assignment was dead diff residue. Keep only the current version.
set(SHERPA_ONNX_VERSION "1.7.15")
# Disable warning about
#
... ...
... ... @@ -136,6 +136,7 @@ class BuildExtension(build_ext):
binaries += ["sherpa-onnx-online-websocket-server"]
binaries += ["sherpa-onnx-offline-websocket-server"]
binaries += ["sherpa-onnx-online-websocket-client"]
binaries += ["sherpa-onnx-vad-microphone"]
if is_windows():
binaries += ["kaldi-native-fbank-core.dll"]
... ...
# File description
- [./http_server.py](./http_server.py) It defines which files to serve.
Files are saved in [./web](./web).
- [non_streaming_server.py](./non_streaming_server.py) WebSocket server for
non-streaming models.
- [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses
[silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech
segments and concatenate all speech segments into a single one.
... ...
#!/usr/bin/env python3
"""
This file shows how to remove non-speech segments
and merge all speech segments into a large segment
and save it to a file.
Usage
python3 ./vad-remove-non-speech-segments.py \
--silero-vad-model silero_vad.onnx
Please visit
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
to download silero_vad.onnx
For instance,
wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
"""
import argparse
import sys
import time
from pathlib import Path
import numpy as np
import sherpa_onnx
import soundfile as sf
try:
import sounddevice as sd
except ImportError:
print("Please install sounddevice first. You can use")
print()
print(" pip install sounddevice")
print()
print("to install it")
sys.exit(-1)
def assert_file_exists(filename: str):
    """Abort with an informative message if ``filename`` is not a regular file.

    Fix: the assertion message previously contained the literal text
    "(unknown)" instead of interpolating the offending path, which made the
    error useless for diagnosing which file was missing.
    """
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
def get_args():
    """Build the CLI parser and return the parsed arguments."""
    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=formatter)

    parser.add_argument(
        "--silero-vad-model",
        required=True,
        type=str,
        help="Path to silero_vad.onnx",
    )

    return parser.parse_args()
def main():
    """Record from the default microphone, run Silero VAD over the stream,
    and on Ctrl+C save both the detected speech and the full recording to
    timestamped wave files."""
    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)
    print(devices)

    # sd.default.device is (input_index, output_index); [0] is the input.
    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    args = get_args()
    assert_file_exists(args.silero_vad_model)

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    # The VAD model consumes audio in fixed-size windows of this many samples.
    window_size = config.silero_vad.window_size

    buffer = []
    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # Keeps every sample read from the microphone, speech or not.
    all_samples = []

    print("Started! Please speak")

    try:
        with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
            while True:
                samples, _ = s.read(samples_per_read)  # a blocking read
                samples = samples.reshape(-1)
                buffer = np.concatenate([buffer, samples])
                all_samples = np.concatenate([all_samples, samples])
                # Feed the VAD one window at a time.
                # NOTE(review): the comparison is strict (>), so a buffer of
                # exactly window_size samples waits for the next read —
                # confirm this off-by-one is intentional.
                while len(buffer) > window_size:
                    vad.accept_waveform(buffer[:window_size])
                    buffer = buffer[window_size:]
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Saving & Exiting")

    # Drain all detected speech segments and concatenate them into one array.
    speech_samples = []
    while not vad.empty():
        speech_samples.extend(vad.front.samples)
        vad.pop()

    speech_samples = np.array(speech_samples, dtype=np.float32)

    filename_for_speech = time.strftime("%Y%m%d-%H%M%S-speech.wav")
    sf.write(filename_for_speech, speech_samples, samplerate=sample_rate)

    filename_for_all = time.strftime("%Y%m%d-%H%M%S-all.wav")
    sf.write(filename_for_all, all_samples, samplerate=sample_rate)

    print(f"Saved to {filename_for_speech} and {filename_for_all}")
# Entry point when executed as a script.
if __name__ == "__main__":
    main()
... ...
... ... @@ -56,6 +56,7 @@ def get_binaries_to_install():
binaries += ["sherpa-onnx-online-websocket-server"]
binaries += ["sherpa-onnx-offline-websocket-server"]
binaries += ["sherpa-onnx-online-websocket-client"]
binaries += ["sherpa-onnx-vad-microphone"]
if is_windows():
binaries += ["kaldi-native-fbank-core.dll"]
binaries += ["sherpa-onnx-c-api.dll"]
... ... @@ -95,8 +96,8 @@ setuptools.setup(
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
entry_points={
'console_scripts': [
'sherpa-onnx-cli=sherpa_onnx.cli:cli',
"console_scripts": [
"sherpa-onnx-cli=sherpa_onnx.cli:cli",
],
},
license="Apache licensed, as found in the LICENSE file",
... ...
... ... @@ -13,6 +13,7 @@ endif()
set(sources
base64-decode.cc
cat.cc
circular-buffer.cc
context-graph.cc
endpoint.cc
features.cc
... ... @@ -66,6 +67,8 @@ set(sources
provider.cc
resample.cc
session.cc
silero-vad-model-config.cc
silero-vad-model.cc
slice.cc
stack.cc
symbol-table.cc
... ... @@ -73,6 +76,9 @@ set(sources
transpose.cc
unbind.cc
utils.cc
vad-model-config.cc
vad-model.cc
voice-activity-detector.cc
wave-reader.cc
)
... ... @@ -172,32 +178,42 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
microphone.cc
)
add_executable(sherpa-onnx-vad-microphone
sherpa-onnx-vad-microphone.cc
microphone.cc
)
if(BUILD_SHARED_LIBS)
set(PA_LIB portaudio)
else()
set(PA_LIB portaudio_static)
endif()
target_link_libraries(sherpa-onnx-microphone ${PA_LIB} sherpa-onnx-core)
target_link_libraries(sherpa-onnx-microphone-offline ${PA_LIB} sherpa-onnx-core)
set(exes
sherpa-onnx-microphone
sherpa-onnx-microphone-offline
sherpa-onnx-vad-microphone
)
foreach(exe IN LISTS exes)
target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core)
endforeach()
if(NOT WIN32)
target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
foreach(exe IN LISTS exes)
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
endforeach()
if(SHERPA_ONNX_ENABLE_PYTHON)
target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
foreach(exe IN LISTS exes)
target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
endforeach()
endif()
endif()
install(
TARGETS
sherpa-onnx-microphone
sherpa-onnx-microphone-offline
TARGETS ${exes}
DESTINATION
bin
)
... ... @@ -269,6 +285,7 @@ endif()
if(SHERPA_ONNX_ENABLE_TESTS)
set(sherpa_onnx_test_srcs
cat-test.cc
circular-buffer-test.cc
context-graph-test.cc
packed-sequence-test.cc
pad-sequence-test.cc
... ...
# File descriptions
- [./sherpa-onnx-alsa.cc](./sherpa-onnx-alsa.cc) For Linux only, especially for
embedded Linux, e.g., Raspberry Pi; it uses a streaming model for real-time
speech recognition with a microphone.
- [./sherpa-onnx-microphone.cc](./sherpa-onnx-microphone.cc)
For Linux/Windows/macOS; it uses a streaming model for real-time speech
recognition with a microphone.
- [./sherpa-onnx-microphone-offline.cc](./sherpa-onnx-microphone-offline.cc)
For Linux/Windows/macOS; it uses a non-streaming model for speech
recognition with a microphone.
- [./sherpa-onnx.cc](./sherpa-onnx.cc)
It uses a streaming model to decode wave files
- [./sherpa-onnx-offline.cc](./sherpa-onnx-offline.cc)
It uses a non-streaming model to decode wave files
- [./online-websocket-server.cc](./online-websocket-server.cc)
WebSocket server for streaming models.
- [./offline-websocket-server.cc](./offline-websocket-server.cc)
WebSocket server for non-streaming models.
- [./sherpa-onnx-vad-microphone.cc](./sherpa-onnx-vad-microphone.cc)
  Use silero VAD to detect speech with a microphone.
... ...
// sherpa-onnx/csrc/circular-buffer-test.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/circular-buffer.h"
#include <vector>
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
// Fill an empty buffer, verify the linear head/tail bookkeeping, then push a
// second chunk and read both chunks back via Get().
TEST(CircularBuffer, Push) {
  CircularBuffer buffer(10);
  EXPECT_EQ(buffer.Size(), 0);
  EXPECT_EQ(buffer.Head(), 0);
  EXPECT_EQ(buffer.Tail(), 0);

  std::vector<float> a = {0, 1, 2, 3, 4, 5};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 6);
  EXPECT_EQ(buffer.Head(), 0);
  EXPECT_EQ(buffer.Tail(), 6);

  auto c = buffer.Get(0, a.size());
  EXPECT_EQ(a.size(), c.size());
  for (int32_t i = 0; i != a.size(); ++i) {
    EXPECT_EQ(a[i], c[i]);
  }

  std::vector<float> d = {-6, -7, -8, -9};
  buffer.Push(d.data(), d.size());
  // Get() takes a linear (never-wrapping) start index, so the second chunk
  // begins at a.size().
  c = buffer.Get(a.size(), d.size());
  EXPECT_EQ(d.size(), c.size());
  for (int32_t i = 0; i != d.size(); ++i) {
    EXPECT_EQ(d[i], c[i]);
  }
}
// Exercise wrap-around behavior: interleave Push/Pop so that the linear
// (never-wrapping) head_/tail_ indexes move past the physical capacity of 5.
TEST(CircularBuffer, PushAndPop) {
  CircularBuffer buffer(5);
  std::vector<float> a = {0, 1, 2, 3};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 4);
  EXPECT_EQ(buffer.Head(), 0);
  EXPECT_EQ(buffer.Tail(), 4);

  buffer.Pop(2);
  EXPECT_EQ(buffer.Size(), 2);
  EXPECT_EQ(buffer.Head(), 2);
  EXPECT_EQ(buffer.Tail(), 4);

  auto c = buffer.Get(2, 2);
  EXPECT_EQ(c.size(), 2);
  EXPECT_EQ(c[0], 2);
  EXPECT_EQ(c[1], 3);

  // This push wraps around the physical end of the storage.
  a = {10, 20, 30};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 2);
  EXPECT_EQ(buffer.Tail(), 7);

  c = buffer.Get(2, 5);
  EXPECT_EQ(c.size(), 5);
  EXPECT_EQ(c[0], 2);
  EXPECT_EQ(c[1], 3);
  EXPECT_EQ(c[2], 10);
  EXPECT_EQ(c[3], 20);
  EXPECT_EQ(c[4], 30);

  c = buffer.Get(3, 4);
  EXPECT_EQ(c.size(), 4);
  EXPECT_EQ(c[0], 3);
  EXPECT_EQ(c[1], 10);
  EXPECT_EQ(c[2], 20);
  EXPECT_EQ(c[3], 30);

  c = buffer.Get(4, 3);
  EXPECT_EQ(c.size(), 3);
  EXPECT_EQ(c[0], 10);
  EXPECT_EQ(c[1], 20);
  EXPECT_EQ(c[2], 30);

  buffer.Pop(4);
  EXPECT_EQ(buffer.Size(), 1);
  EXPECT_EQ(buffer.Head(), 6);
  EXPECT_EQ(buffer.Tail(), 7);

  c = buffer.Get(6, 1);
  EXPECT_EQ(c.size(), 1);
  EXPECT_EQ(c[0], 30);

  a = {100, 200, 300, 400};
  buffer.Push(a.data(), a.size());
  // Fix: the original asserted Size() twice in a row; one check suffices.
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 6);
  EXPECT_EQ(buffer.Tail(), 11);

  c = buffer.Get(6, 5);
  EXPECT_EQ(c.size(), 5);
  EXPECT_EQ(c[0], 30);
  EXPECT_EQ(c[1], 100);
  EXPECT_EQ(c[2], 200);
  EXPECT_EQ(c[3], 300);
  EXPECT_EQ(c[4], 400);

  buffer.Pop(3);
  EXPECT_EQ(buffer.Size(), 2);
  EXPECT_EQ(buffer.Head(), 9);
  EXPECT_EQ(buffer.Tail(), 11);

  c = buffer.Get(10, 1);
  EXPECT_EQ(c.size(), 1);
  EXPECT_EQ(c[0], 400);

  a = {1000, 2000, 3000};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 9);
  EXPECT_EQ(buffer.Tail(), 14);

  buffer.Pop(1);
  EXPECT_EQ(buffer.Size(), 4);
  EXPECT_EQ(buffer.Head(), 10);
  EXPECT_EQ(buffer.Tail(), 14);

  a = {4000};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 10);
  EXPECT_EQ(buffer.Tail(), 15);

  c = buffer.Get(13, 2);
  EXPECT_EQ(c.size(), 2);
  EXPECT_EQ(c[0], 3000);
  EXPECT_EQ(c[1], 4000);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/circular-buffer.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/circular-buffer.h"
#include <algorithm>
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
// Allocate the backing storage. A non-positive capacity is a programming
// error, so we log and abort rather than continue with an unusable buffer.
CircularBuffer::CircularBuffer(int32_t capacity) {
  if (capacity > 0) {
    buffer_.resize(capacity);
    return;
  }

  SHERPA_ONNX_LOGE("Please specify a positive capacity. Given: %d\n",
                   capacity);
  exit(-1);
}
// Append n samples from p. The buffer never overwrites unread data: if the
// new samples would exceed the capacity, an error is logged and the program
// exits.
void CircularBuffer::Push(const float *p, int32_t n) {
  int32_t capacity = buffer_.size();
  int32_t size = Size();
  if (n + size > capacity) {
    SHERPA_ONNX_LOGE("Overflow! n: %d, size: %d, n+size: %d, capacity: %d", n,
                     size, n + size, capacity);
    exit(-1);
  }

  // Map the linear tail index onto the physical ring position.
  int32_t start = tail_ % capacity;

  tail_ += n;

  // Fix: use <= so a chunk that ends exactly at the physical end of the
  // storage takes the single-copy path. The previous strict < sent that case
  // through the wrap-around path with an empty second copy (harmless, but an
  // off-by-one in the boundary condition).
  if (start + n <= capacity) {
    std::copy(p, p + n, buffer_.begin() + start);
    return;
  }

  // The chunk wraps around the end of the storage: copy it in two parts.
  int32_t part1_size = capacity - start;

  std::copy(p, p + part1_size, buffer_.begin() + start);
  std::copy(p + part1_size, p + n, buffer_.begin());
}
// Return a copy of n elements starting at the linear index start_index.
// On any invalid argument an error is logged and an empty vector is returned.
std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
  if (start_index < head_ || start_index >= tail_) {
    SHERPA_ONNX_LOGE("Invalid start_index: %d. head_: %d, tail_: %d",
                     start_index, head_, tail_);
    return {};
  }

  int32_t size = Size();
  if (n < 0 || n > size) {
    SHERPA_ONNX_LOGE("Invalid n: %d. size: %d", n, size);
    return {};
  }

  int32_t capacity = buffer_.size();

  // The requested range must lie entirely inside [head_, tail_).
  if (start_index - head_ + n > size) {
    SHERPA_ONNX_LOGE("Invalid start_index: %d and n: %d. head_: %d, size: %d",
                     start_index, n, head_, size);
    return {};
  }

  // Map the linear index onto the physical ring position.
  int32_t start = start_index % capacity;

  if (start + n < capacity) {
    // The requested range is contiguous in the underlying storage.
    return {buffer_.begin() + start, buffer_.begin() + start + n};
  }

  // The range wraps around: copy the tail of the storage first, then the
  // part at the front.
  std::vector<float> ans(n);
  std::copy(buffer_.begin() + start, buffer_.end(), ans.begin());
  int32_t part1_size = capacity - start;
  int32_t part2_size = n - part1_size;

  std::copy(buffer_.begin(), buffer_.begin() + part2_size,
            ans.begin() + part1_size);
  return ans;
}
void CircularBuffer::Pop(int32_t n) {
int32_t size = Size();
if (n < 0 || n > size) {
SHERPA_ONNX_LOGE("Invalid n: %d. size: %d", n, size);
return;
}
head_ += n;
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/circular-buffer.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_
#define SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_
#include <cstdint>
#include <vector>
namespace sherpa_onnx {
// A fixed-capacity circular buffer of floats. Head and tail are kept as
// linear (monotonically increasing) indexes; only the backing storage wraps.
class CircularBuffer {
 public:
  // Capacity of this buffer. Should be large enough.
  // If it is full, we just print a message and exit the program.
  explicit CircularBuffer(int32_t capacity);

  // Push an array
  //
  // @param p Pointer to the start address of the array
  // @param n Number of elements in the array
  //
  // Note: If n + Size() > capacity, we print an error message and exit.
  void Push(const float *p, int32_t n);

  // @param start_index Should be in the range [head_, tail_)
  // @param n Number of elements to get
  // @return Return a vector of size n containing the requested elements
  std::vector<float> Get(int32_t start_index, int32_t n) const;

  // Remove n elements from the buffer
  //
  // @param n Should be in the range [0, size_]
  void Pop(int32_t n);

  // Number of elements in the buffer.
  int32_t Size() const { return tail_ - head_; }

  // Current position of the head (oldest element), as a linear index.
  int32_t Head() const { return head_; }

  // Current position of the tail (one past the newest element), linear index.
  int32_t Tail() const { return tail_; }

  // Discard all contents and reset the linear indexes; capacity is unchanged.
  void Reset() {
    head_ = 0;
    tail_ = 0;
  }

 private:
  std::vector<float> buffer_;  // fixed-size backing storage (the ring)
  int32_t head_ = 0;  // linear index; always increasing; never wraps around
  int32_t tail_ = 0;  // linear index, always increasing; never wraps around.
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_
... ...
... ... @@ -76,4 +76,8 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) {
return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider);
}
Ort::SessionOptions GetSessionOptions(const VadModelConfig &config) {
return GetSessionOptionsImpl(config.num_threads, config.provider);
}
} // namespace sherpa_onnx
... ...
... ... @@ -10,6 +10,7 @@
#include "sherpa-onnx/csrc/offline-model-config.h"
#include "sherpa-onnx/csrc/online-lm-config.h"
#include "sherpa-onnx/csrc/online-model-config.h"
#include "sherpa-onnx/csrc/vad-model-config.h"
namespace sherpa_onnx {
... ... @@ -20,6 +21,8 @@ Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config);
Ort::SessionOptions GetSessionOptions(const OfflineLMConfig &config);
Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config);
Ort::SessionOptions GetSessionOptions(const VadModelConfig &config);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_SESSION_H_
... ...
// sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc
//
// Copyright (c) 2022-2023 Xiaomi Corporation
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <mutex> // NOLINT
#include "portaudio.h" // NOLINT
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/microphone.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
bool stop = false;
std::mutex mutex;
sherpa_onnx::CircularBuffer buffer(16000 * 60);
// PortAudio input callback: append the captured float32 samples to the global
// circular buffer. Runs on PortAudio's audio thread; `mutex` serializes
// access with the consumer loop in main().
static int32_t RecordCallback(const void *input_buffer,
                              void * /*output_buffer*/,
                              unsigned long frames_per_buffer,  // NOLINT
                              const PaStreamCallbackTimeInfo * /*time_info*/,
                              PaStreamCallbackFlags /*status_flags*/,
                              void *user_data) {
  std::lock_guard<std::mutex> lock(mutex);

  buffer.Push(reinterpret_cast<const float *>(input_buffer), frames_per_buffer);

  // `stop` is set by the SIGINT handler without synchronization — it is a
  // best-effort shutdown flag. NOTE(review): consider std::atomic<bool>.
  return stop ? paComplete : paContinue;
}
// SIGINT (Ctrl+C) handler: request both the audio callback and the main loop
// to finish. NOTE(review): `stop` is a plain bool shared across threads and
// a signal handler; volatile sig_atomic_t or std::atomic would be safer.
static void Handler(int32_t sig) {
  stop = true;
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
// Entry point: open the default microphone via PortAudio, stream audio into
// the global circular buffer from the audio callback, and run Silero VAD over
// fixed-size windows on the main thread, printing the duration of each
// detected speech segment.
int32_t main(int32_t argc, char *argv[]) {
  signal(SIGINT, Handler);

  const char *kUsageMessage = R"usage(
This program shows how to use VAD in sherpa-onnx.
./bin/sherpa-onnx-vad-microphone \
--silero-vad-model=/path/to/silero_vad.onnx \
--provider=cpu \
--num-threads=1
Please download silero_vad.onnx from
https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
For instance, use
wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
)usage";

  sherpa_onnx::ParseOptions po(kUsageMessage);
  sherpa_onnx::VadModelConfig config;
  config.Register(&po);
  po.Read(argc, argv);
  // This program takes only options, no positional arguments.
  if (po.NumArgs() != 0) {
    po.PrintUsage();
    exit(EXIT_FAILURE);
  }

  fprintf(stderr, "%s\n", config.ToString().c_str());

  if (!config.Validate()) {
    fprintf(stderr, "Errors in config!\n");
    return -1;
  }

  // RAII wrapper around Pa_Initialize()/Pa_Terminate().
  sherpa_onnx::Microphone mic;

  PaDeviceIndex num_devices = Pa_GetDeviceCount();
  fprintf(stderr, "Num devices: %d\n", num_devices);

  PaStreamParameters param;

  param.device = Pa_GetDefaultInputDevice();
  if (param.device == paNoDevice) {
    fprintf(stderr, "No default input device found\n");
    exit(EXIT_FAILURE);
  }
  fprintf(stderr, "Use default device: %d\n", param.device);

  const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
  fprintf(stderr, " Name: %s\n", info->name);
  fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels);

  param.channelCount = 1;
  param.sampleFormat = paFloat32;

  param.suggestedLatency = info->defaultLowInputLatency;
  param.hostApiSpecificStreamInfo = nullptr;
  float sample_rate = 16000;

  PaStream *stream;
  // NOTE(review): the user_data argument points at window_size but
  // RecordCallback ignores its user_data parameter — confirm this is
  // intentional leftover.
  PaError err =
      Pa_OpenStream(&stream, &param, nullptr, /* &outputParameters, */
                    sample_rate,
                    0,          // frames per buffer
                    paClipOff,  // we won't output out of range samples
                                // so don't bother clipping them
                    RecordCallback, &config.silero_vad.window_size);
  if (err != paNoError) {
    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
    exit(EXIT_FAILURE);
  }

  err = Pa_StartStream(stream);

  auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config);

  fprintf(stderr, "Started\n");

  if (err != paNoError) {
    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
    exit(EXIT_FAILURE);
  }

  int32_t window_size = config.silero_vad.window_size;
  bool printed = false;

  // Main loop: drain the shared circular buffer one VAD window at a time.
  while (!stop) {
    {
      std::lock_guard<std::mutex> lock(mutex);
      while (buffer.Size() >= window_size) {
        std::vector<float> samples = buffer.Get(buffer.Head(), window_size);
        buffer.Pop(window_size);

        vad->AcceptWaveform(samples.data(), samples.size());

        // Print "Detected speech" once per speech segment.
        if (vad->IsSpeechDetected() && !printed) {
          printed = true;
          fprintf(stderr, "\nDetected speech!\n");
        }
        if (!vad->IsSpeechDetected()) {
          printed = false;
        }

        // Report the duration of each completed speech segment.
        while (!vad->Empty()) {
          float duration = vad->Front().samples.size() / sample_rate;
          vad->Pop();
          fprintf(stderr, "Duration: %.3f seconds\n", duration);
        }
      }
    }

    Pa_Sleep(100);  // sleep for 100ms
  }

  err = Pa_CloseStream(stream);
  if (err != paNoError) {
    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
    exit(EXIT_FAILURE);
  }

  return 0;
}
... ...
// sherpa-onnx/csrc/silero-vad-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/silero-vad-model-config.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
// Register all silero-VAD command-line options with the option parser.
// Fix: corrected the "perfomance" typo in the user-facing help text of
// --silero-vad-window-size.
void SileroVadModelConfig::Register(ParseOptions *po) {
  po->Register("silero-vad-model", &model, "Path to silero VAD ONNX model.");

  po->Register("silero-vad-threshold", &threshold,
               "Speech threshold. Silero VAD outputs speech probabilities for "
               "each audio chunk, probabilities ABOVE this value are "
               "considered as SPEECH. It is better to tune this parameter for "
               "each dataset separately, but lazy "
               "0.5 is pretty good for most datasets.");

  po->Register(
      "silero-vad-min-silence-duration", &min_silence_duration,
      "In seconds. In the end of each speech chunk wait for "
      "--silero-vad-min-silence-duration seconds before separating it");

  po->Register("silero-vad-min-speech-duration", &min_speech_duration,
               "In seconds. In the end of each silence chunk wait for "
               "--silero-vad-min-speech-duration seconds before separating it");

  po->Register(
      "silero-vad-window-size", &window_size,
      "In samples. Audio chunks of --silero-vad-window-size samples are fed "
      "to the silero VAD model. WARNING! Silero VAD models were trained using "
      "512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples "
      "for 8000 sample rate. Values other than these may affect model "
      "performance!");
}
// Validate the configuration: the model file must exist and the speech
// threshold must lie in a sensible open interval. Returns false (after
// logging) on the first violation found.
bool SileroVadModelConfig::Validate() const {
  if (model.empty()) {
    SHERPA_ONNX_LOGE("Please provide --silero-vad-model");
    return false;
  }

  if (!FileExists(model)) {
    SHERPA_ONNX_LOGE("Silero vad model file %s does not exist", model.c_str());
    return false;
  }

  // A threshold close to 0 would classify almost everything as speech.
  if (threshold < 0.01) {
    SHERPA_ONNX_LOGE(
        "Please use a larger value for --silero-vad-threshold. Given: %f",
        threshold);
    return false;
  }

  // A threshold of 1 or more could never be exceeded by a probability.
  if (threshold >= 1) {
    SHERPA_ONNX_LOGE(
        "Please use a smaller value for --silero-vad-threshold. Given: %f",
        threshold);
    return false;
  }

  return true;
}
// Return a human-readable representation of this config for logging.
// Fix: the printed type name was misspelled "SilerVadModelConfig".
std::string SileroVadModelConfig::ToString() const {
  std::ostringstream os;

  os << "SileroVadModelConfig(";
  os << "model=\"" << model << "\", ";
  os << "threshold=" << threshold << ", ";
  os << "min_silence_duration=" << min_silence_duration << ", ";
  os << "min_speech_duration=" << min_speech_duration << ", ";
  os << "window_size=" << window_size << ")";

  return os.str();
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/silero-vad-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_
#define SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_
#include <string>
#include "sherpa-onnx/csrc/parse-options.h"
namespace sherpa_onnx {
// Configuration for the Silero VAD ONNX model.
struct SileroVadModelConfig {
  // Path to the silero VAD ONNX model file.
  std::string model;

  // threshold to classify a segment as speech
  //
  // If the predicted probability of a segment is larger than this
  // value, then it is classified as speech.
  float threshold = 0.5;

  float min_silence_duration = 0.5;  // in seconds

  float min_speech_duration = 0.25;  // in seconds

  // Window sizes the model supports:
  // 512, 1024, 1536 samples for 16000 Hz
  // 256, 512, 768 samples for 8000 Hz  (fixed comment: was "800 Hz")
  int window_size = 512;  // in samples

  SileroVadModelConfig() = default;

  void Register(ParseOptions *po);

  bool Validate() const;

  std::string ToString() const;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_
... ...
// sherpa-onnx/csrc/silero-vad-model.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/silero-vad-model.h"

#include <array>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/session.h"
namespace sherpa_onnx {
// Private implementation of SileroVadModel (pimpl). Wraps the ONNX Runtime
// session for the Silero VAD model and implements the segment-triggering
// state machine (min speech / min silence hysteresis).
// Fix: Check() logged "Expected: sr" for outputs 1 and 2; the expected names
// are "hn" and "cn".
class SileroVadModel::Impl {
 public:
  explicit Impl(const VadModelConfig &config)
      : config_(config),
        env_(ORT_LOGGING_LEVEL_ERROR),
        sess_opts_(GetSessionOptions(config)),
        allocator_{} {
    auto buf = ReadFile(config.silero_vad.model);
    Init(buf.data(), buf.size());

    // Only 16 kHz models are supported here.
    sample_rate_ = config.sample_rate;
    if (sample_rate_ != 16000) {
      SHERPA_ONNX_LOGE("Expected sample rate 16000. Given: %d",
                       config.sample_rate);
      exit(-1);
    }

    // Convert the duration thresholds from seconds to sample counts.
    min_silence_samples_ =
        sample_rate_ * config_.silero_vad.min_silence_duration;

    min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration;
  }

  // Zero the LSTM states and reset the triggering state machine.
  void Reset() {
    // 2 - number of LSTM layer
    // 1 - batch size
    // 64 - hidden dim
    std::array<int64_t, 3> shape{2, 1, 64};

    Ort::Value h =
        Ort::Value::CreateTensor<float>(allocator_, shape.data(), shape.size());

    Ort::Value c =
        Ort::Value::CreateTensor<float>(allocator_, shape.data(), shape.size());

    Fill<float>(&h, 0);
    Fill<float>(&c, 0);

    states_.clear();

    states_.reserve(2);
    states_.push_back(std::move(h));
    states_.push_back(std::move(c));

    triggered_ = false;
    current_sample_ = 0;
    temp_start_ = 0;
    temp_end_ = 0;
  }

  // Run the model on one window of samples and update the state machine.
  // Returns true while speech is considered active.
  bool IsSpeech(const float *samples, int32_t n) {
    if (n != config_.silero_vad.window_size) {
      SHERPA_ONNX_LOGE("n: %d != window_size: %d", n,
                       config_.silero_vad.window_size);
      exit(-1);
    }

    auto memory_info =
        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

    std::array<int64_t, 2> x_shape = {1, n};

    Ort::Value x =
        Ort::Value::CreateTensor(memory_info, const_cast<float *>(samples), n,
                                 x_shape.data(), x_shape.size());

    int64_t sr_shape = 1;
    Ort::Value sr =
        Ort::Value::CreateTensor(memory_info, &sample_rate_, 1, &sr_shape, 1);

    std::array<Ort::Value, 4> inputs = {std::move(x), std::move(sr),
                                        std::move(states_[0]),
                                        std::move(states_[1])};

    auto out =
        sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
                   output_names_ptr_.data(), output_names_ptr_.size());

    // Carry the recurrent states over to the next call.
    states_[0] = std::move(out[1]);
    states_[1] = std::move(out[2]);

    float prob = out[0].GetTensorData<float>()[0];

    float threshold = config_.silero_vad.threshold;

    current_sample_ += config_.silero_vad.window_size;

    // Speech resumed before the pending silence ran out: cancel it.
    if (prob > threshold && temp_end_ != 0) {
      temp_end_ = 0;
    }

    if (prob > threshold && temp_start_ == 0) {
      // start speaking, but we require that it must satisfy
      // min_speech_duration
      temp_start_ = current_sample_;
      return false;
    }

    if (prob > threshold && temp_start_ != 0 && !triggered_) {
      if (current_sample_ - temp_start_ < min_speech_samples_) {
        return false;
      }

      triggered_ = true;

      return true;
    }

    if ((prob < threshold) && !triggered_) {
      // silence
      temp_start_ = 0;
      temp_end_ = 0;
      return false;
    }

    // Hysteresis: once triggered, keep reporting speech while the probability
    // stays above (threshold - 0.15).
    if ((prob > threshold - 0.15) && triggered_) {
      // speaking
      return true;
    }

    // NOTE(review): this branch appears unreachable — the prob > threshold
    // cases above already return; kept for parity with the reference
    // implementation.
    if ((prob > threshold) && !triggered_) {
      // start speaking
      triggered_ = true;

      return true;
    }

    if ((prob < threshold) && triggered_) {
      // stop to speak
      if (temp_end_ == 0) {
        temp_end_ = current_sample_;
      }

      if (current_sample_ - temp_end_ < min_silence_samples_) {
        // continue speaking
        return true;
      }

      // stopped speaking
      temp_start_ = 0;
      temp_end_ = 0;
      triggered_ = false;
      return false;
    }

    return false;
  }

  int32_t WindowSize() const { return config_.silero_vad.window_size; }

  int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }

  int32_t MinSpeechDurationSamples() const { return min_speech_samples_; }

 private:
  // Create the ONNX Runtime session from the in-memory model and verify its
  // input/output signature.
  void Init(void *model_data, size_t model_data_length) {
    sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
                                           sess_opts_);

    GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);

    GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);

    Check();

    Reset();
  }

  // Verify the model exposes exactly the inputs (input, sr, h, c) and
  // outputs (output, hn, cn) that the Silero VAD model is expected to have.
  void Check() {
    if (input_names_.size() != 4) {
      SHERPA_ONNX_LOGE("Expect 4 inputs. Given: %d",
                       static_cast<int32_t>(input_names_.size()));
      exit(-1);
    }

    if (input_names_[0] != "input") {
      SHERPA_ONNX_LOGE("Input[0]: %s. Expected: input",
                       input_names_[0].c_str());
      exit(-1);
    }

    if (input_names_[1] != "sr") {
      SHERPA_ONNX_LOGE("Input[1]: %s. Expected: sr", input_names_[1].c_str());
      exit(-1);
    }

    if (input_names_[2] != "h") {
      SHERPA_ONNX_LOGE("Input[2]: %s. Expected: h", input_names_[2].c_str());
      exit(-1);
    }

    if (input_names_[3] != "c") {
      SHERPA_ONNX_LOGE("Input[3]: %s. Expected: c", input_names_[3].c_str());
      exit(-1);
    }

    // Now for outputs
    if (output_names_.size() != 3) {
      SHERPA_ONNX_LOGE("Expect 3 outputs. Given: %d",
                       static_cast<int32_t>(output_names_.size()));
      exit(-1);
    }

    if (output_names_[0] != "output") {
      SHERPA_ONNX_LOGE("Output[0]: %s. Expected: output",
                       output_names_[0].c_str());
      exit(-1);
    }

    // Fix: the messages below said "Expected: sr"; the expected output
    // names are hn and cn.
    if (output_names_[1] != "hn") {
      SHERPA_ONNX_LOGE("Output[1]: %s. Expected: hn", output_names_[1].c_str());
      exit(-1);
    }

    if (output_names_[2] != "cn") {
      SHERPA_ONNX_LOGE("Output[2]: %s. Expected: cn", output_names_[2].c_str());
      exit(-1);
    }
  }

 private:
  VadModelConfig config_;
  Ort::Env env_;
  Ort::SessionOptions sess_opts_;
  Ort::AllocatorWithDefaultOptions allocator_;

  std::unique_ptr<Ort::Session> sess_;

  std::vector<std::string> input_names_;
  std::vector<const char *> input_names_ptr_;

  std::vector<std::string> output_names_;
  std::vector<const char *> output_names_ptr_;

  // {h, c} recurrent LSTM states carried across windows.
  std::vector<Ort::Value> states_;

  int64_t sample_rate_;
  int32_t min_silence_samples_;
  int32_t min_speech_samples_;

  bool triggered_ = false;
  int32_t current_sample_ = 0;  // linear count of samples processed so far
  int32_t temp_start_ = 0;      // candidate speech start (sample index)
  int32_t temp_end_ = 0;        // candidate speech end (sample index)
};
SileroVadModel::SileroVadModel(const VadModelConfig &config)
    : impl_(std::make_unique<Impl>(config)) {}

// Defined out-of-line so the pimpl's Impl type is complete at destruction.
SileroVadModel::~SileroVadModel() = default;

// The remaining methods simply forward to the pimpl.
void SileroVadModel::Reset() { return impl_->Reset(); }

bool SileroVadModel::IsSpeech(const float *samples, int32_t n) {
  return impl_->IsSpeech(samples, n);
}

int32_t SileroVadModel::WindowSize() const { return impl_->WindowSize(); }

int32_t SileroVadModel::MinSilenceDurationSamples() const {
  return impl_->MinSilenceDurationSamples();
}

int32_t SileroVadModel::MinSpeechDurationSamples() const {
  return impl_->MinSpeechDurationSamples();
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/silero-vad-model.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_
#define SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_
#include <memory>
#include "sherpa-onnx/csrc/vad-model.h"
namespace sherpa_onnx {
// Silero VAD implementation of the VadModel interface, backed by an ONNX
// Runtime session (see silero-vad-model.cc for details).
class SileroVadModel : public VadModel {
 public:
  explicit SileroVadModel(const VadModelConfig &config);

  ~SileroVadModel() override;

  // reset the internal model states
  void Reset() override;

  /**
   * @param samples Pointer to a 1-d array containing audio samples.
   *                Each sample should be normalized to the range [-1, 1].
   * @param n Number of samples.
   *
   * @return Return true if speech is detected. Return false otherwise.
   */
  bool IsSpeech(const float *samples, int32_t n) override;

  int32_t WindowSize() const override;

  int32_t MinSilenceDurationSamples() const override;
  int32_t MinSpeechDurationSamples() const override;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;  // pimpl hides the ONNX Runtime dependency
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_
... ...
// sherpa-onnx/csrc/vad-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/vad-model-config.h"
#include <sstream>
#include <string>
namespace sherpa_onnx {
// Register general VAD options plus all options of the silero sub-config.
void VadModelConfig::Register(ParseOptions *po) {
  silero_vad.Register(po);

  po->Register("vad-sample-rate", &sample_rate,
               "Sample rate expected by the VAD model");

  po->Register("vad-num-threads", &num_threads,
               "Number of threads to run the VAD model");

  po->Register("vad-provider", &provider,
               "Specify a provider to run the VAD model. Supported values: "
               "cpu, cuda, coreml");

  po->Register("vad-debug", &debug,
               "true to display debug information when loading vad models");
}
// Only the silero sub-config currently carries validatable fields.
bool VadModelConfig::Validate() const { return silero_vad.Validate(); }
// Render this config (including the silero sub-config) for logging.
std::string VadModelConfig::ToString() const {
  std::ostringstream os;

  os << "VadModelConfig(silero_vad=" << silero_vad.ToString()
     << ", sample_rate=" << sample_rate << ", num_threads=" << num_threads
     << ", provider=\"" << provider << "\", debug="
     << (debug ? "True" : "False") << ")";

  return os.str();
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/vad-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_
#define SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_
#include <string>
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/silero-vad-model-config.h"
namespace sherpa_onnx {
// Top-level VAD configuration: model-independent runtime settings plus the
// silero-specific sub-config.
struct VadModelConfig {
  SileroVadModelConfig silero_vad;

  int32_t sample_rate = 16000;
  int32_t num_threads = 1;
  std::string provider = "cpu";

  // true to show debug information when loading models
  bool debug = false;

  VadModelConfig() = default;

  VadModelConfig(const SileroVadModelConfig &silero_vad, int32_t sample_rate,
                 int32_t num_threads, const std::string &provider, bool debug)
      : silero_vad(silero_vad),
        sample_rate(sample_rate),
        num_threads(num_threads),
        provider(provider),
        debug(debug) {}

  void Register(ParseOptions *po);
  bool Validate() const;

  std::string ToString() const;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_
... ...
// sherpa-onnx/csrc/vad-model.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/vad-model.h"
#include "sherpa-onnx/csrc/silero-vad-model.h"
namespace sherpa_onnx {
// Factory for VAD model implementations.
//
// TODO(fangjun): Support other VAD models.
//
// Silero VAD is the only implementation at the moment, so it is always
// instantiated regardless of the config contents.
std::unique_ptr<VadModel> VadModel::Create(const VadModelConfig &config) {
  std::unique_ptr<VadModel> model = std::make_unique<SileroVadModel>(config);
  return model;
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/vad-model.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_VAD_MODEL_H_
#define SHERPA_ONNX_CSRC_VAD_MODEL_H_
#include <memory>
#include "sherpa-onnx/csrc/vad-model-config.h"
namespace sherpa_onnx {
// Abstract interface for voice-activity-detection models.
class VadModel {
 public:
  virtual ~VadModel() = default;

  // Creates a concrete VAD model from the given configuration.
  static std::unique_ptr<VadModel> Create(const VadModelConfig &config);

  // reset the internal model states
  virtual void Reset() = 0;

  /**
   * @param samples Pointer to a 1-d array containing audio samples.
   *                Each sample should be normalized to the range [-1, 1].
   * @param n Number of samples. Should be equal to WindowSize()
   *
   * @return Return true if speech is detected. Return false otherwise.
   */
  virtual bool IsSpeech(const float *samples, int32_t n) = 0;

  // Number of samples IsSpeech() expects per invocation.
  virtual int32_t WindowSize() const = 0;

  // Minimum silence duration expressed in samples.
  virtual int32_t MinSilenceDurationSamples() const = 0;

  // Minimum speech duration expressed in samples.
  virtual int32_t MinSpeechDurationSamples() const = 0;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_VAD_MODEL_H_
... ...
// sherpa-onnx/csrc/voice-activity-detector.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include <queue>
#include <utility>
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/vad-model.h"
namespace sherpa_onnx {
// Pimpl implementation of VoiceActivityDetector.
//
// Incoming audio is appended to a circular buffer; the VAD model classifies
// each pushed chunk.  A speech segment opens on the first speech chunk and
// closes (and is queued) on the first non-speech chunk.
class VoiceActivityDetector::Impl {
 public:
  explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60)
      : model_(VadModel::Create(config)),
        config_(config),
        buffer_(buffer_size_in_seconds * config.sample_rate) {}

  // Feeds one chunk of audio.  `n` should equal model_->WindowSize() and
  // samples should be normalized to [-1, 1] (see VadModel::IsSpeech).
  void AcceptWaveform(const float *samples, int32_t n) {
    buffer_.Push(samples, n);
    bool is_speech = model_->IsSpeech(samples, n);

    if (is_speech) {
      if (start_ == -1) {
        // beginning of speech
        //
        // NOTE(review): the start index is rewound by two windows plus the
        // minimum speech duration, presumably to compensate for the model's
        // detection latency — confirm against SileroVadModel.  This value can
        // also fall below buffer_.Head(); assumes CircularBuffer::Get
        // tolerates/clamps that — TODO confirm.
        start_ = buffer_.Tail() - 2 * model_->WindowSize() -
                 model_->MinSpeechDurationSamples();
      }
    } else {
      // non-speech
      if (start_ != -1) {
        // end of speech, save the speech segment
        //
        // The trailing MinSilenceDurationSamples() worth of audio is excluded
        // from the segment since it was classified as silence.
        int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();

        std::vector<float> samples = buffer_.Get(start_, end - start_);

        SpeechSegment segment;

        segment.start = start_;
        segment.samples = std::move(samples);

        segments_.push(std::move(segment));

        // Discard everything up to `end`; later segments can only start
        // after the silence that closed this one.
        buffer_.Pop(end - buffer_.Head());
      }

      start_ = -1;
    }
  }

  // True if no completed speech segments are queued.
  bool Empty() const { return segments_.empty(); }

  // Discards the oldest queued segment.
  void Pop() { segments_.pop(); }

  // Oldest queued segment.  Precondition: !Empty().
  const SpeechSegment &Front() const { return segments_.front(); }

  // Drops all queued segments and resets both the model and the buffer.
  void Reset() {
    std::queue<SpeechSegment>().swap(segments_);

    model_->Reset();
    buffer_.Reset();

    start_ = -1;
  }

  // True while an unfinished speech segment is in progress.
  bool IsSpeechDetected() const { return start_ != -1; }

 private:
  // Completed speech segments waiting to be consumed by the caller.
  std::queue<SpeechSegment> segments_;

  std::unique_ptr<VadModel> model_;
  VadModelConfig config_;  // only sample_rate is read here (in the ctor)
  CircularBuffer buffer_;

  // Buffer index where the in-progress speech segment starts;
  // -1 means no speech segment is currently open.
  int32_t start_ = -1;
};
// All public methods below simply forward to the pimpl.

VoiceActivityDetector::VoiceActivityDetector(
    const VadModelConfig &config, float buffer_size_in_seconds /*= 60*/)
    : impl_(std::make_unique<Impl>(config, buffer_size_in_seconds)) {}

// Defined out-of-line so Impl is a complete type when impl_ is destroyed.
VoiceActivityDetector::~VoiceActivityDetector() = default;

void VoiceActivityDetector::AcceptWaveform(const float *samples, int32_t n) {
  impl_->AcceptWaveform(samples, n);
}

bool VoiceActivityDetector::Empty() const { return impl_->Empty(); }

void VoiceActivityDetector::Pop() { impl_->Pop(); }

const SpeechSegment &VoiceActivityDetector::Front() const {
  return impl_->Front();
}

void VoiceActivityDetector::Reset() { impl_->Reset(); }

bool VoiceActivityDetector::IsSpeechDetected() const {
  return impl_->IsSpeechDetected();
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/csrc/voice-activity-detector.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_
#define SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_
#include <memory>
#include <vector>
#include "sherpa-onnx/csrc/vad-model-config.h"
namespace sherpa_onnx {
// A contiguous chunk of audio that was classified as speech.
struct SpeechSegment {
  // Start position of the segment, in samples.  This is the index used with
  // the detector's internal CircularBuffer — presumably a running count from
  // the first sample ever fed to the detector; confirm against
  // CircularBuffer's Head()/Tail() semantics.
  int32_t start;  // in samples
  std::vector<float> samples;
};
// Splits a stream of audio into speech segments using a VAD model.
//
// Typical usage: repeatedly call AcceptWaveform(); whenever Empty() is
// false, consume Front() and then Pop().
class VoiceActivityDetector {
 public:
  // @param buffer_size_in_seconds Capacity of the internal audio buffer.
  explicit VoiceActivityDetector(const VadModelConfig &config,
                                 float buffer_size_in_seconds = 60);
  ~VoiceActivityDetector();

  // Feeds `n` audio samples; samples should be normalized to [-1, 1].
  void AcceptWaveform(const float *samples, int32_t n);

  // True if no completed speech segments are available.
  bool Empty() const;

  // Discards the oldest completed segment.
  void Pop();

  // Oldest completed segment.  Precondition: !Empty().
  const SpeechSegment &Front() const;

  // True while speech is currently in progress (segment not yet closed).
  bool IsSpeechDetected() const;

  // Clears all state: queued segments, model state, and buffered audio.
  void Reset();

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_
... ...
include_directories(${CMAKE_SOURCE_DIR})
pybind11_add_module(_sherpa_onnx
circular-buffer.cc
display.cc
endpoint.cc
features.cc
... ... @@ -20,6 +21,10 @@ pybind11_add_module(_sherpa_onnx
online-stream.cc
online-transducer-model-config.cc
sherpa-onnx.cc
silero-vad-model-config.cc
vad-model-config.cc
vad-model.cc
voice-activity-detector.cc
)
if(APPLE)
... ...
// sherpa-onnx/python/csrc/circular-buffer.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/circular-buffer.h"
#include <vector>
#include "sherpa-onnx/csrc/circular-buffer.h"
namespace sherpa_onnx {
// Exposes CircularBuffer to Python as `CircularBuffer`.
void PybindCircularBuffer(py::module *m) {
  using PyClass = CircularBuffer;

  auto buffer = py::class_<PyClass>(*m, "CircularBuffer");

  buffer.def(py::init<int32_t>(), py::arg("capacity"));

  // Accept a Python sequence of floats; pybind11 converts it to
  // std::vector<float> before forwarding to Push().
  buffer.def(
      "push",
      [](PyClass &self, const std::vector<float> &samples) {
        self.Push(samples.data(), samples.size());
      },
      py::arg("samples"));

  buffer.def("get", &PyClass::Get, py::arg("start_index"), py::arg("n"));
  buffer.def("pop", &PyClass::Pop, py::arg("n"));
  buffer.def("reset", &PyClass::Reset);

  // Read-only views of the buffer's bookkeeping counters.
  buffer.def_property_readonly("size", &PyClass::Size);
  buffer.def_property_readonly("head", &PyClass::Head);
  buffer.def_property_readonly("tail", &PyClass::Tail);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/python/csrc/circular-buffer.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
#define SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {

// Registers the CircularBuffer binding on the given Python module.
void PybindCircularBuffer(py::module *m);

}  // namespace sherpa_onnx
#endif  // SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
... ...
... ... @@ -4,6 +4,7 @@
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
#include "sherpa-onnx/python/csrc/circular-buffer.h"
#include "sherpa-onnx/python/csrc/display.h"
#include "sherpa-onnx/python/csrc/endpoint.h"
#include "sherpa-onnx/python/csrc/features.h"
... ... @@ -15,6 +16,9 @@
#include "sherpa-onnx/python/csrc/online-model-config.h"
#include "sherpa-onnx/python/csrc/online-recognizer.h"
#include "sherpa-onnx/python/csrc/online-stream.h"
#include "sherpa-onnx/python/csrc/vad-model-config.h"
#include "sherpa-onnx/python/csrc/vad-model.h"
#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
namespace sherpa_onnx {
... ... @@ -34,6 +38,11 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
PybindOfflineLMConfig(&m);
PybindOfflineModelConfig(&m);
PybindOfflineRecognizer(&m);
PybindVadModelConfig(&m);
PybindVadModel(&m);
PybindCircularBuffer(&m);
PybindVoiceActivityDetector(&m);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/python/csrc/silero-vad-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
#include <memory>
#include <string>
#include "sherpa-onnx/csrc/silero-vad-model-config.h"
namespace sherpa_onnx {
// Exposes SileroVadModelConfig to Python as `SileroVadModelConfig`.
void PybindSileroVadModelConfig(py::module *m) {
  using PyClass = SileroVadModelConfig;
  py::class_<PyClass>(*m, "SileroVadModelConfig")
      .def(py::init<>())
      // Keyword constructor; `model` is required, the rest have defaults.
      // NOTE(review): the defaults below should stay in sync with the
      // member defaults declared in the C++ struct — confirm against
      // sherpa-onnx/csrc/silero-vad-model-config.h.
      .def(py::init([](const std::string &model, float threshold,
                       float min_silence_duration, float min_speech_duration,
                       int32_t window_size) -> std::unique_ptr<PyClass> {
             auto ans = std::make_unique<PyClass>();

             ans->model = model;
             ans->threshold = threshold;
             ans->min_silence_duration = min_silence_duration;
             ans->min_speech_duration = min_speech_duration;
             ans->window_size = window_size;

             return ans;
           }),
           py::arg("model"), py::arg("threshold") = 0.5,
           py::arg("min_silence_duration") = 0.5,
           py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512)
      .def_readwrite("model", &PyClass::model)
      .def_readwrite("threshold", &PyClass::threshold)
      .def_readwrite("min_silence_duration", &PyClass::min_silence_duration)
      .def_readwrite("min_speech_duration", &PyClass::min_speech_duration)
      .def_readwrite("window_size", &PyClass::window_size)
      .def("__str__", &PyClass::ToString)
      .def("validate", &PyClass::Validate);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/python/csrc/silero-vad-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
#define SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {

// Registers the SileroVadModelConfig binding on the given Python module.
void PybindSileroVadModelConfig(py::module *m);

}  // namespace sherpa_onnx
#endif  // SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
... ...
// sherpa-onnx/python/csrc/vad-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/vad-model-config.h"
#include <string>
#include "sherpa-onnx/csrc/vad-model-config.h"
#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
namespace sherpa_onnx {
// Exposes VadModelConfig to Python as `VadModelConfig`.
//
// Also registers the nested SileroVadModelConfig binding first so that the
// `silero_vad` field's type is known to pybind11.
void PybindVadModelConfig(py::module *m) {
  PybindSileroVadModelConfig(m);

  using PyClass = VadModelConfig;
  py::class_<PyClass>(*m, "VadModelConfig")
      .def(py::init<>())
      // Keyword constructor mirroring the C++ constructor; the defaults
      // match the member defaults of the C++ struct.
      .def(py::init<const SileroVadModelConfig &, int32_t, int32_t,
                    const std::string &, bool>(),
           py::arg("silero_vad"), py::arg("sample_rate") = 16000,
           py::arg("num_threads") = 1, py::arg("provider") = "cpu",
           py::arg("debug") = false)
      .def_readwrite("silero_vad", &PyClass::silero_vad)
      .def_readwrite("sample_rate", &PyClass::sample_rate)
      .def_readwrite("num_threads", &PyClass::num_threads)
      .def_readwrite("provider", &PyClass::provider)
      .def_readwrite("debug", &PyClass::debug)
      .def("__str__", &PyClass::ToString)
      .def("validate", &PyClass::Validate);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/python/csrc/vad-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {

// Registers the VadModelConfig binding (and its nested Silero config
// binding) on the given Python module.
void PybindVadModelConfig(py::module *m);

}  // namespace sherpa_onnx
#endif  // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
... ...
// sherpa-onnx/python/csrc/vad-model.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/vad-model.h"
#include <vector>
#include "sherpa-onnx/csrc/vad-model.h"
namespace sherpa_onnx {
// Exposes the abstract VadModel interface to Python as `VadModel`.
//
// Instances are obtained via the static `create` factory; the class itself
// has no Python-visible constructor.
void PybindVadModel(py::module *m) {
  using PyClass = VadModel;
  py::class_<PyClass>(*m, "VadModel")
      .def_static("create", &PyClass::Create, py::arg("config"))
      .def("reset", &PyClass::Reset)
      // Accept a Python sequence of floats normalized to [-1, 1]; its
      // length should equal window_size() (see VadModel::IsSpeech).
      .def(
          "is_speech",
          [](PyClass &self, const std::vector<float> &samples) -> bool {
            return self.IsSpeech(samples.data(), samples.size());
          },
          py::arg("samples"))
      .def("window_size", &PyClass::WindowSize)
      .def("min_silence_duration_samples", &PyClass::MinSilenceDurationSamples)
      .def("min_speech_duration_samples", &PyClass::MinSpeechDurationSamples);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/python/csrc/vad-model.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {

// Registers the VadModel binding on the given Python module.
void PybindVadModel(py::module *m);

}  // namespace sherpa_onnx
#endif  // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
... ...
// sherpa-onnx/python/csrc/voice-activity-detector.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
#include <vector>
#include "sherpa-onnx/csrc/voice-activity-detector.h"
namespace sherpa_onnx {
// Exposes SpeechSegment to Python as `SpeechSegment` with read-only
// `start` and `samples` attributes.
//
// Lambdas (rather than def_readonly) are used so each access returns a
// copy of the member, keeping the Python object independent of the C++
// segment's lifetime.
void PybindSpeechSegment(py::module *m) {
  using PyClass = SpeechSegment;
  py::class_<PyClass>(*m, "SpeechSegment")
      .def_property_readonly("start",
                             [](const PyClass &self) { return self.start; })
      .def_property_readonly("samples",
                             [](const PyClass &self) { return self.samples; });
}
// Exposes VoiceActivityDetector to Python as `VoiceActivityDetector`.
//
// The SpeechSegment binding is registered first since `front` returns it.
void PybindVoiceActivityDetector(py::module *m) {
  PybindSpeechSegment(m);

  using PyClass = VoiceActivityDetector;

  auto vad = py::class_<PyClass>(*m, "VoiceActivityDetector");

  vad.def(py::init<const VadModelConfig &, float>(), py::arg("config"),
          py::arg("buffer_size_in_seconds") = 60);

  // Accept a Python sequence of floats normalized to [-1, 1].
  vad.def(
      "accept_waveform",
      [](PyClass &self, const std::vector<float> &samples) {
        self.AcceptWaveform(samples.data(), samples.size());
      },
      py::arg("samples"));

  vad.def("empty", &PyClass::Empty);
  vad.def("pop", &PyClass::Pop);
  vad.def("is_speech_detected", &PyClass::IsSpeechDetected);
  vad.def("reset", &PyClass::Reset);

  vad.def_property_readonly("front", &PyClass::Front);
}
} // namespace sherpa_onnx
... ...
// sherpa-onnx/python/csrc/voice-activity-detector.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
#define SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {

// Registers the VoiceActivityDetector binding (and the SpeechSegment
// binding it depends on) on the given Python module.
void PybindVoiceActivityDetector(py::module *m);

}  // namespace sherpa_onnx
#endif  // SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
... ...
from typing import Dict, List, Optional

# All C++ bindings come from the `_sherpa_onnx` extension module.
# Note: the previous separate single-line import of Display, OfflineStream,
# and OnlineStream was fully subsumed by this grouped import, so it has been
# removed to avoid importing the same names twice.
from _sherpa_onnx import (
    CircularBuffer,
    Display,
    OfflineStream,
    OnlineStream,
    SileroVadModelConfig,
    SpeechSegment,
    VadModel,
    VadModelConfig,
    VoiceActivityDetector,
)

from .offline_recognizer import OfflineRecognizer
from .online_recognizer import OnlineRecognizer
... ...