正在显示
36 个修改的文件
包含
1683 行增加
和
16 行删除
| @@ -136,6 +136,7 @@ class BuildExtension(build_ext): | @@ -136,6 +136,7 @@ class BuildExtension(build_ext): | ||
| 136 | binaries += ["sherpa-onnx-online-websocket-server"] | 136 | binaries += ["sherpa-onnx-online-websocket-server"] |
| 137 | binaries += ["sherpa-onnx-offline-websocket-server"] | 137 | binaries += ["sherpa-onnx-offline-websocket-server"] |
| 138 | binaries += ["sherpa-onnx-online-websocket-client"] | 138 | binaries += ["sherpa-onnx-online-websocket-client"] |
| 139 | + binaries += ["sherpa-onnx-vad-microphone"] | ||
| 139 | 140 | ||
| 140 | if is_windows(): | 141 | if is_windows(): |
| 141 | binaries += ["kaldi-native-fbank-core.dll"] | 142 | binaries += ["kaldi-native-fbank-core.dll"] |
python-api-examples/README.md
0 → 100644
| 1 | +# File description | ||
| 2 | + | ||
| 3 | +- [./http_server.py](./http_server.py) It defines which files to serve. | ||
| 4 | + Files are saved in [./web](./web). | ||
| 5 | +- [non_streaming_server.py](./non_streaming_server.py) WebSocket server for | ||
| 6 | + non-streaming models. | ||
| 7 | +- [vad-remove-non-speech-segments.py](./vad-remove-non-speech-segments.py) It uses | ||
| 8 | + [silero-vad](https://github.com/snakers4/silero-vad) to remove non-speech | ||
| 9 | + segments and concatenate all speech segments into a single one. |
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +This file shows how to remove non-speech segments | ||
| 5 | +and merge all speech segments into a large segment | ||
| 6 | +and save it to a file. | ||
| 7 | + | ||
| 8 | +Usage | ||
| 9 | + | ||
| 10 | +python3 ./vad-remove-non-speech-segments.py \ | ||
| 11 | + --silero-vad-model silero_vad.onnx | ||
| 12 | + | ||
| 13 | +Please visit | ||
| 14 | +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 15 | +to download silero_vad.onnx | ||
| 16 | + | ||
| 17 | +For instance, | ||
| 18 | + | ||
| 19 | +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 20 | +""" | ||
| 21 | + | ||
| 22 | +import argparse | ||
| 23 | +import sys | ||
| 24 | +import time | ||
| 25 | +from pathlib import Path | ||
| 26 | + | ||
| 27 | +import numpy as np | ||
| 28 | +import sherpa_onnx | ||
| 29 | +import soundfile as sf | ||
| 30 | + | ||
| 31 | +try: | ||
| 32 | + import sounddevice as sd | ||
| 33 | +except ImportError: | ||
| 34 | + print("Please install sounddevice first. You can use") | ||
| 35 | + print() | ||
| 36 | + print(" pip install sounddevice") | ||
| 37 | + print() | ||
| 38 | + print("to install it") | ||
| 39 | + sys.exit(-1) | ||
| 40 | + | ||
| 41 | + | ||
def assert_file_exists(filename: str):
    """Fail with a helpful message unless ``filename`` is an existing file.

    Args:
      filename: Path of the file that must exist.

    Raises:
      AssertionError: If ``filename`` is not a regular file.
    """
    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # stripped when Python runs with -O, which would silently skip the check.
    if not Path(filename).is_file():
        raise AssertionError(
            f"{filename} does not exist!\n"
            "Please refer to "
            "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
        )
| 48 | + | ||
| 49 | + | ||
def get_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``--silero-vad-model`` (required), the path to
    silero_vad.onnx.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument(
        "--silero-vad-model",
        required=True,
        type=str,
        help="Path to silero_vad.onnx",
    )
    parsed = arg_parser.parse_args()
    return parsed
| 63 | + | ||
| 64 | + | ||
def main():
    """Record from the default microphone until Ctrl + C, then save two files.

    Audio is read in 100 ms chunks at 16 kHz and fed to the voice activity
    detector one window at a time. On Ctrl + C, two wave files are written to
    the current directory: one with only the detected speech segments
    concatenated, and one with everything that was recorded.
    """
    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)
    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    args = get_args()
    assert_file_exists(args.silero_vad_model)

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    # Samples that were read but do not yet fill a whole VAD window.
    buffer = []
    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # Keep the chunks in a list and join them once at the end. Concatenating
    # onto one growing array on every read would copy the whole recording
    # each time, i.e. quadratic work in the recording length.
    recorded_chunks = []

    print("Started! Please speak")

    try:
        with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
            while True:
                samples, _ = s.read(samples_per_read)  # a blocking read
                samples = samples.reshape(-1)
                recorded_chunks.append(samples)

                buffer = np.concatenate([buffer, samples])

                # ">=" so that an exactly full window is processed right away
                # instead of waiting for the next read.
                while len(buffer) >= window_size:
                    vad.accept_waveform(buffer[:window_size])
                    buffer = buffer[window_size:]
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Saving & Exiting")

    # Collect every speech segment the detector has produced.
    speech_samples = []
    while not vad.empty():
        speech_samples.extend(vad.front.samples)
        vad.pop()

    speech_samples = np.array(speech_samples, dtype=np.float32)

    # np.concatenate() rejects an empty list, so handle "nothing recorded".
    if recorded_chunks:
        all_samples = np.concatenate(recorded_chunks)
    else:
        all_samples = np.zeros(0, dtype=np.float32)

    filename_for_speech = time.strftime("%Y%m%d-%H%M%S-speech.wav")
    sf.write(filename_for_speech, speech_samples, samplerate=sample_rate)

    filename_for_all = time.strftime("%Y%m%d-%H%M%S-all.wav")
    sf.write(filename_for_all, all_samples, samplerate=sample_rate)

    print(f"Saved to {filename_for_speech} and {filename_for_all}")
| 123 | + | ||
| 124 | + | ||
| 125 | +if __name__ == "__main__": | ||
| 126 | + main() |
| @@ -56,6 +56,7 @@ def get_binaries_to_install(): | @@ -56,6 +56,7 @@ def get_binaries_to_install(): | ||
| 56 | binaries += ["sherpa-onnx-online-websocket-server"] | 56 | binaries += ["sherpa-onnx-online-websocket-server"] |
| 57 | binaries += ["sherpa-onnx-offline-websocket-server"] | 57 | binaries += ["sherpa-onnx-offline-websocket-server"] |
| 58 | binaries += ["sherpa-onnx-online-websocket-client"] | 58 | binaries += ["sherpa-onnx-online-websocket-client"] |
| 59 | + binaries += ["sherpa-onnx-vad-microphone"] | ||
| 59 | if is_windows(): | 60 | if is_windows(): |
| 60 | binaries += ["kaldi-native-fbank-core.dll"] | 61 | binaries += ["kaldi-native-fbank-core.dll"] |
| 61 | binaries += ["sherpa-onnx-c-api.dll"] | 62 | binaries += ["sherpa-onnx-c-api.dll"] |
| @@ -95,8 +96,8 @@ setuptools.setup( | @@ -95,8 +96,8 @@ setuptools.setup( | ||
| 95 | "Topic :: Scientific/Engineering :: Artificial Intelligence", | 96 | "Topic :: Scientific/Engineering :: Artificial Intelligence", |
| 96 | ], | 97 | ], |
| 97 | entry_points={ | 98 | entry_points={ |
| 98 | - 'console_scripts': [ | ||
| 99 | - 'sherpa-onnx-cli=sherpa_onnx.cli:cli', | 99 | + "console_scripts": [ |
| 100 | + "sherpa-onnx-cli=sherpa_onnx.cli:cli", | ||
| 100 | ], | 101 | ], |
| 101 | }, | 102 | }, |
| 102 | license="Apache licensed, as found in the LICENSE file", | 103 | license="Apache licensed, as found in the LICENSE file", |
| @@ -13,6 +13,7 @@ endif() | @@ -13,6 +13,7 @@ endif() | ||
| 13 | set(sources | 13 | set(sources |
| 14 | base64-decode.cc | 14 | base64-decode.cc |
| 15 | cat.cc | 15 | cat.cc |
| 16 | + circular-buffer.cc | ||
| 16 | context-graph.cc | 17 | context-graph.cc |
| 17 | endpoint.cc | 18 | endpoint.cc |
| 18 | features.cc | 19 | features.cc |
| @@ -66,6 +67,8 @@ set(sources | @@ -66,6 +67,8 @@ set(sources | ||
| 66 | provider.cc | 67 | provider.cc |
| 67 | resample.cc | 68 | resample.cc |
| 68 | session.cc | 69 | session.cc |
| 70 | + silero-vad-model-config.cc | ||
| 71 | + silero-vad-model.cc | ||
| 69 | slice.cc | 72 | slice.cc |
| 70 | stack.cc | 73 | stack.cc |
| 71 | symbol-table.cc | 74 | symbol-table.cc |
| @@ -73,6 +76,9 @@ set(sources | @@ -73,6 +76,9 @@ set(sources | ||
| 73 | transpose.cc | 76 | transpose.cc |
| 74 | unbind.cc | 77 | unbind.cc |
| 75 | utils.cc | 78 | utils.cc |
| 79 | + vad-model-config.cc | ||
| 80 | + vad-model.cc | ||
| 81 | + voice-activity-detector.cc | ||
| 76 | wave-reader.cc | 82 | wave-reader.cc |
| 77 | ) | 83 | ) |
| 78 | 84 | ||
| @@ -172,32 +178,42 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | @@ -172,32 +178,42 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) | ||
| 172 | microphone.cc | 178 | microphone.cc |
| 173 | ) | 179 | ) |
| 174 | 180 | ||
| 181 | + add_executable(sherpa-onnx-vad-microphone | ||
| 182 | + sherpa-onnx-vad-microphone.cc | ||
| 183 | + microphone.cc | ||
| 184 | + ) | ||
| 185 | + | ||
| 175 | if(BUILD_SHARED_LIBS) | 186 | if(BUILD_SHARED_LIBS) |
| 176 | set(PA_LIB portaudio) | 187 | set(PA_LIB portaudio) |
| 177 | else() | 188 | else() |
| 178 | set(PA_LIB portaudio_static) | 189 | set(PA_LIB portaudio_static) |
| 179 | endif() | 190 | endif() |
| 180 | 191 | ||
| 181 | - target_link_libraries(sherpa-onnx-microphone ${PA_LIB} sherpa-onnx-core) | ||
| 182 | - target_link_libraries(sherpa-onnx-microphone-offline ${PA_LIB} sherpa-onnx-core) | 192 | + set(exes |
| 193 | + sherpa-onnx-microphone | ||
| 194 | + sherpa-onnx-microphone-offline | ||
| 195 | + sherpa-onnx-vad-microphone | ||
| 196 | + ) | ||
| 197 | + foreach(exe IN LISTS exes) | ||
| 198 | + target_link_libraries(${exe} ${PA_LIB} sherpa-onnx-core) | ||
| 199 | + endforeach() | ||
| 183 | 200 | ||
| 184 | if(NOT WIN32) | 201 | if(NOT WIN32) |
| 185 | - target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib") | ||
| 186 | - target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib") | ||
| 187 | - | ||
| 188 | - target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib") | ||
| 189 | - target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib") | 202 | + foreach(exe IN LISTS exes) |
| 203 | + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib") | ||
| 204 | + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib") | ||
| 205 | + endforeach() | ||
| 190 | 206 | ||
| 191 | if(SHERPA_ONNX_ENABLE_PYTHON) | 207 | if(SHERPA_ONNX_ENABLE_PYTHON) |
| 192 | - target_link_libraries(sherpa-onnx-microphone "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib") | ||
| 193 | - target_link_libraries(sherpa-onnx-microphone-offline "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib") | 208 | + |
| 209 | + foreach(exe IN LISTS exes) | ||
| 210 | + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib") | ||
| 211 | + endforeach() | ||
| 194 | endif() | 212 | endif() |
| 195 | endif() | 213 | endif() |
| 196 | 214 | ||
| 197 | install( | 215 | install( |
| 198 | - TARGETS | ||
| 199 | - sherpa-onnx-microphone | ||
| 200 | - sherpa-onnx-microphone-offline | 216 | + TARGETS ${exes} |
| 201 | DESTINATION | 217 | DESTINATION |
| 202 | bin | 218 | bin |
| 203 | ) | 219 | ) |
| @@ -269,6 +285,7 @@ endif() | @@ -269,6 +285,7 @@ endif() | ||
| 269 | if(SHERPA_ONNX_ENABLE_TESTS) | 285 | if(SHERPA_ONNX_ENABLE_TESTS) |
| 270 | set(sherpa_onnx_test_srcs | 286 | set(sherpa_onnx_test_srcs |
| 271 | cat-test.cc | 287 | cat-test.cc |
| 288 | + circular-buffer-test.cc | ||
| 272 | context-graph-test.cc | 289 | context-graph-test.cc |
| 273 | packed-sequence-test.cc | 290 | packed-sequence-test.cc |
| 274 | pad-sequence-test.cc | 291 | pad-sequence-test.cc |
sherpa-onnx/csrc/README.md
0 → 100644
| 1 | +# File descriptions | ||
| 2 | + | ||
| 3 | +- [./sherpa-onnx-alsa.cc](./sherpa-onnx-alsa.cc) For Linux only, especially for | ||
| 4 | + embedded Linux, e.g., Raspberry Pi; it uses a streaming model for real-time | ||
| 5 | + speech recognition with a microphone. | ||
| 6 | + | ||
| 7 | +- [./sherpa-onnx-microphone.cc](./sherpa-onnx-microphone.cc) | ||
| 8 | + For Linux/Windows/macOS; it uses a streaming model for real-time speech | ||
| 9 | + recognition with a microphone. | ||
| 10 | + | ||
| 11 | +- [./sherpa-onnx-microphone-offline.cc](./sherpa-onnx-microphone-offline.cc) | ||
| 12 | + For Linux/Windows/macOS; it uses a non-streaming model for speech | ||
| 13 | + recognition with a microphone. | ||
| 14 | + | ||
| 15 | +- [./sherpa-onnx.cc](./sherpa-onnx.cc) | ||
| 16 | + It uses a streaming model to decode wave files | ||
| 17 | + | ||
| 18 | +- [./sherpa-onnx-offline.cc](./sherpa-onnx-offline.cc) | ||
| 19 | + It uses a non-streaming model to decode wave files | ||
| 20 | + | ||
| 21 | +- [./online-websocket-server.cc](./online-websocket-server.cc) | ||
| 22 | + WebSocket server for streaming models. | ||
| 23 | + | ||
| 24 | +- [./offline-websocket-server.cc](./offline-websocket-server.cc) | ||
| 25 | + WebSocket server for non-streaming models. | ||
| 26 | + | ||
| 27 | +- [./sherpa-onnx-vad-microphone.cc](./sherpa-onnx-vad-microphone.cc) | ||
| 28 | + Use silero VAD to detect speech with a microphone. | ||
| 29 | + |
sherpa-onnx/csrc/circular-buffer-test.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/circular-buffer-test.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/circular-buffer.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "gtest/gtest.h" | ||
| 10 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +TEST(CircularBuffer, Push) { | ||
| 15 | + CircularBuffer buffer(10); | ||
| 16 | + EXPECT_EQ(buffer.Size(), 0); | ||
| 17 | + EXPECT_EQ(buffer.Head(), 0); | ||
| 18 | + EXPECT_EQ(buffer.Tail(), 0); | ||
| 19 | + | ||
| 20 | + std::vector<float> a = {0, 1, 2, 3, 4, 5}; | ||
| 21 | + buffer.Push(a.data(), a.size()); | ||
| 22 | + | ||
| 23 | + EXPECT_EQ(buffer.Size(), 6); | ||
| 24 | + EXPECT_EQ(buffer.Head(), 0); | ||
| 25 | + EXPECT_EQ(buffer.Tail(), 6); | ||
| 26 | + | ||
| 27 | + auto c = buffer.Get(0, a.size()); | ||
| 28 | + EXPECT_EQ(a.size(), c.size()); | ||
| 29 | + for (int32_t i = 0; i != a.size(); ++i) { | ||
| 30 | + EXPECT_EQ(a[i], c[i]); | ||
| 31 | + } | ||
| 32 | + | ||
| 33 | + std::vector<float> d = {-6, -7, -8, -9}; | ||
| 34 | + buffer.Push(d.data(), d.size()); | ||
| 35 | + | ||
| 36 | + c = buffer.Get(a.size(), d.size()); | ||
| 37 | + EXPECT_EQ(d.size(), c.size()); | ||
| 38 | + for (int32_t i = 0; i != d.size(); ++i) { | ||
| 39 | + EXPECT_EQ(d[i], c[i]); | ||
| 40 | + } | ||
| 41 | +} | ||
| 42 | + | ||
// Interleaves Push() and Pop() on a buffer of capacity 5 so that the stored
// data wraps around the end of the underlying storage several times. Head()
// and Tail() are linear indexes, so they keep growing past the capacity.
TEST(CircularBuffer, PushAndPop) {
  CircularBuffer buffer(5);
  std::vector<float> a = {0, 1, 2, 3};
  buffer.Push(a.data(), a.size());

  EXPECT_EQ(buffer.Size(), 4);
  EXPECT_EQ(buffer.Head(), 0);
  EXPECT_EQ(buffer.Tail(), 4);

  // Drop the two oldest samples; only the head moves.
  buffer.Pop(2);

  EXPECT_EQ(buffer.Size(), 2);
  EXPECT_EQ(buffer.Head(), 2);
  EXPECT_EQ(buffer.Tail(), 4);

  auto c = buffer.Get(2, 2);
  EXPECT_EQ(c.size(), 2);
  EXPECT_EQ(c[0], 2);
  EXPECT_EQ(c[1], 3);

  // Fill the buffer completely; linear indexes 5 and 6 wrap to the front of
  // the storage.
  a = {10, 20, 30};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 2);
  EXPECT_EQ(buffer.Tail(), 7);

  // Reads that start before the wrap point and continue past it.
  c = buffer.Get(2, 5);
  EXPECT_EQ(c.size(), 5);
  EXPECT_EQ(c[0], 2);
  EXPECT_EQ(c[1], 3);
  EXPECT_EQ(c[2], 10);
  EXPECT_EQ(c[3], 20);
  EXPECT_EQ(c[4], 30);

  c = buffer.Get(3, 4);
  EXPECT_EQ(c.size(), 4);
  EXPECT_EQ(c[0], 3);
  EXPECT_EQ(c[1], 10);
  EXPECT_EQ(c[2], 20);
  EXPECT_EQ(c[3], 30);

  c = buffer.Get(4, 3);
  EXPECT_EQ(c.size(), 3);
  EXPECT_EQ(c[0], 10);
  EXPECT_EQ(c[1], 20);
  EXPECT_EQ(c[2], 30);

  // Pop everything except the newest sample; the head is now past the
  // capacity (6 > 5).
  buffer.Pop(4);
  EXPECT_EQ(buffer.Size(), 1);
  EXPECT_EQ(buffer.Head(), 6);
  EXPECT_EQ(buffer.Tail(), 7);

  c = buffer.Get(6, 1);
  EXPECT_EQ(c.size(), 1);
  EXPECT_EQ(c[0], 30);

  // Refill to capacity again.
  a = {100, 200, 300, 400};
  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);

  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 6);
  EXPECT_EQ(buffer.Tail(), 11);

  c = buffer.Get(6, 5);
  EXPECT_EQ(c.size(), 5);
  EXPECT_EQ(c[0], 30);
  EXPECT_EQ(c[1], 100);
  EXPECT_EQ(c[2], 200);
  EXPECT_EQ(c[3], 300);
  EXPECT_EQ(c[4], 400);

  buffer.Pop(3);
  EXPECT_EQ(buffer.Size(), 2);
  EXPECT_EQ(buffer.Head(), 9);
  EXPECT_EQ(buffer.Tail(), 11);

  // A read that starts after the head (10 > 9).
  c = buffer.Get(10, 1);
  EXPECT_EQ(c.size(), 1);
  EXPECT_EQ(c[0], 400);

  a = {1000, 2000, 3000};
  buffer.Push(a.data(), a.size());

  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 9);
  EXPECT_EQ(buffer.Tail(), 14);

  buffer.Pop(1);

  EXPECT_EQ(buffer.Size(), 4);
  EXPECT_EQ(buffer.Head(), 10);
  EXPECT_EQ(buffer.Tail(), 14);

  // Push a single sample into the one free slot.
  a = {4000};

  buffer.Push(a.data(), a.size());
  EXPECT_EQ(buffer.Size(), 5);
  EXPECT_EQ(buffer.Head(), 10);
  EXPECT_EQ(buffer.Tail(), 15);

  // The two newest samples.
  c = buffer.Get(13, 2);
  EXPECT_EQ(c.size(), 2);
  EXPECT_EQ(c[0], 3000);
  EXPECT_EQ(c[1], 4000);
}
| 149 | + | ||
| 150 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/circular-buffer.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/circular-buffer.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/circular-buffer.h" | ||
| 6 | + | ||
| 7 | +#include <algorithm> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +CircularBuffer::CircularBuffer(int32_t capacity) { | ||
| 14 | + if (capacity <= 0) { | ||
| 15 | + SHERPA_ONNX_LOGE("Please specify a positive capacity. Given: %d\n", | ||
| 16 | + capacity); | ||
| 17 | + exit(-1); | ||
| 18 | + } | ||
| 19 | + buffer_.resize(capacity); | ||
| 20 | +} | ||
| 21 | + | ||
| 22 | +void CircularBuffer::Push(const float *p, int32_t n) { | ||
| 23 | + int32_t capacity = buffer_.size(); | ||
| 24 | + int32_t size = Size(); | ||
| 25 | + if (n + size > capacity) { | ||
| 26 | + SHERPA_ONNX_LOGE("Overflow! n: %d, size: %d, n+size: %d, capacity: %d", n, | ||
| 27 | + size, n + size, capacity); | ||
| 28 | + exit(-1); | ||
| 29 | + } | ||
| 30 | + | ||
| 31 | + int32_t start = tail_ % capacity; | ||
| 32 | + | ||
| 33 | + tail_ += n; | ||
| 34 | + | ||
| 35 | + if (start + n < capacity) { | ||
| 36 | + std::copy(p, p + n, buffer_.begin() + start); | ||
| 37 | + return; | ||
| 38 | + } | ||
| 39 | + | ||
| 40 | + int32_t part1_size = capacity - start; | ||
| 41 | + | ||
| 42 | + std::copy(p, p + part1_size, buffer_.begin() + start); | ||
| 43 | + | ||
| 44 | + std::copy(p + part1_size, p + n, buffer_.begin()); | ||
| 45 | +} | ||
| 46 | + | ||
| 47 | +std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const { | ||
| 48 | + if (start_index < head_ || start_index >= tail_) { | ||
| 49 | + SHERPA_ONNX_LOGE("Invalid start_index: %d. head_: %d, tail_: %d", | ||
| 50 | + start_index, head_, tail_); | ||
| 51 | + return {}; | ||
| 52 | + } | ||
| 53 | + | ||
| 54 | + int32_t size = Size(); | ||
| 55 | + if (n < 0 || n > size) { | ||
| 56 | + SHERPA_ONNX_LOGE("Invalid n: %d. size: %d", n, size); | ||
| 57 | + return {}; | ||
| 58 | + } | ||
| 59 | + | ||
| 60 | + int32_t capacity = buffer_.size(); | ||
| 61 | + | ||
| 62 | + if (start_index - head_ + n > size) { | ||
| 63 | + SHERPA_ONNX_LOGE("Invalid start_index: %d and n: %d. head_: %d, size: %d", | ||
| 64 | + start_index, n, head_, size); | ||
| 65 | + return {}; | ||
| 66 | + } | ||
| 67 | + | ||
| 68 | + int32_t start = start_index % capacity; | ||
| 69 | + | ||
| 70 | + if (start + n < capacity) { | ||
| 71 | + return {buffer_.begin() + start, buffer_.begin() + start + n}; | ||
| 72 | + } | ||
| 73 | + | ||
| 74 | + std::vector<float> ans(n); | ||
| 75 | + | ||
| 76 | + std::copy(buffer_.begin() + start, buffer_.end(), ans.begin()); | ||
| 77 | + | ||
| 78 | + int32_t part1_size = capacity - start; | ||
| 79 | + int32_t part2_size = n - part1_size; | ||
| 80 | + std::copy(buffer_.begin(), buffer_.begin() + part2_size, | ||
| 81 | + ans.begin() + part1_size); | ||
| 82 | + | ||
| 83 | + return ans; | ||
| 84 | +} | ||
| 85 | + | ||
| 86 | +void CircularBuffer::Pop(int32_t n) { | ||
| 87 | + int32_t size = Size(); | ||
| 88 | + if (n < 0 || n > size) { | ||
| 89 | + SHERPA_ONNX_LOGE("Invalid n: %d. size: %d", n, size); | ||
| 90 | + return; | ||
| 91 | + } | ||
| 92 | + | ||
| 93 | + head_ += n; | ||
| 94 | +} | ||
| 95 | + | ||
| 96 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/circular-buffer.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/circular-buffer.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_ | ||
| 6 | + | ||
| 7 | +#include <cstdint> | ||
| 8 | +#include <vector> | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
// A fixed-capacity FIFO of float samples addressed by linear indexes.
//
// Head() and Tail() only ever grow; the number of stored samples is
// Tail() - Head().
class CircularBuffer {
 public:
  // @param capacity Maximum number of samples the buffer can hold. It should
  //                 be large enough: when a Push() does not fit, a message is
  //                 printed and the program exits.
  explicit CircularBuffer(int32_t capacity);

  // Append an array of samples.
  //
  // @param p Pointer to the first element of the array
  // @param n Number of elements in the array
  //
  // Note: If n + Size() > capacity, an error message is printed and the
  // program exits.
  void Push(const float *p, int32_t n);

  // Copy samples out of the buffer without removing them.
  //
  // @param start_index Linear index of the first sample; should be in the
  //                    range [Head(), Tail())
  // @param n Number of elements to get
  // @return Return a vector of size n containing the requested elements
  std::vector<float> Get(int32_t start_index, int32_t n) const;

  // Remove the n oldest elements from the buffer.
  //
  // @param n Should be in the range [0, Size()]
  void Pop(int32_t n);

  // Number of elements currently stored.
  int32_t Size() const { return tail_ - head_; }

  // Linear index of the oldest stored element.
  int32_t Head() const { return head_; }

  // Linear index one past the newest stored element.
  int32_t Tail() const { return tail_; }

  // Forget all stored elements and restart both indexes from 0.
  void Reset() { head_ = tail_ = 0; }

 private:
  std::vector<float> buffer_;

  // Both are linear indexes: always increasing, never wrapping around.
  int32_t head_ = 0;
  int32_t tail_ = 0;
};
| 56 | + | ||
| 57 | +} // namespace sherpa_onnx | ||
| 58 | + | ||
| 59 | +#endif // SHERPA_ONNX_CSRC_CIRCULAR_BUFFER_H_ |
| @@ -76,4 +76,8 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) { | @@ -76,4 +76,8 @@ Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config) { | ||
| 76 | return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider); | 76 | return GetSessionOptionsImpl(config.lm_num_threads, config.lm_provider); |
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | +Ort::SessionOptions GetSessionOptions(const VadModelConfig &config) { | ||
| 80 | + return GetSessionOptionsImpl(config.num_threads, config.provider); | ||
| 81 | +} | ||
| 82 | + | ||
| 79 | } // namespace sherpa_onnx | 83 | } // namespace sherpa_onnx |
| @@ -10,6 +10,7 @@ | @@ -10,6 +10,7 @@ | ||
| 10 | #include "sherpa-onnx/csrc/offline-model-config.h" | 10 | #include "sherpa-onnx/csrc/offline-model-config.h" |
| 11 | #include "sherpa-onnx/csrc/online-lm-config.h" | 11 | #include "sherpa-onnx/csrc/online-lm-config.h" |
| 12 | #include "sherpa-onnx/csrc/online-model-config.h" | 12 | #include "sherpa-onnx/csrc/online-model-config.h" |
| 13 | +#include "sherpa-onnx/csrc/vad-model-config.h" | ||
| 13 | 14 | ||
| 14 | namespace sherpa_onnx { | 15 | namespace sherpa_onnx { |
| 15 | 16 | ||
| @@ -20,6 +21,8 @@ Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config); | @@ -20,6 +21,8 @@ Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config); | ||
| 20 | Ort::SessionOptions GetSessionOptions(const OfflineLMConfig &config); | 21 | Ort::SessionOptions GetSessionOptions(const OfflineLMConfig &config); |
| 21 | 22 | ||
| 22 | Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config); | 23 | Ort::SessionOptions GetSessionOptions(const OnlineLMConfig &config); |
| 24 | + | ||
| 25 | +Ort::SessionOptions GetSessionOptions(const VadModelConfig &config); | ||
| 23 | } // namespace sherpa_onnx | 26 | } // namespace sherpa_onnx |
| 24 | 27 | ||
| 25 | #endif // SHERPA_ONNX_CSRC_SESSION_H_ | 28 | #endif // SHERPA_ONNX_CSRC_SESSION_H_ |
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-vad-microphone.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2023 Xiaomi Corporation | ||
| 4 | + | ||
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <atomic>
#include <memory>
#include <mutex>  // NOLINT
#include <vector>

#include "portaudio.h"  // NOLINT
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/microphone.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
| 16 | + | ||
| 17 | +bool stop = false; | ||
| 18 | +std::mutex mutex; | ||
| 19 | +sherpa_onnx::CircularBuffer buffer(16000 * 60); | ||
| 20 | + | ||
| 21 | +static int32_t RecordCallback(const void *input_buffer, | ||
| 22 | + void * /*output_buffer*/, | ||
| 23 | + unsigned long frames_per_buffer, // NOLINT | ||
| 24 | + const PaStreamCallbackTimeInfo * /*time_info*/, | ||
| 25 | + PaStreamCallbackFlags /*status_flags*/, | ||
| 26 | + void *user_data) { | ||
| 27 | + std::lock_guard<std::mutex> lock(mutex); | ||
| 28 | + buffer.Push(reinterpret_cast<const float *>(input_buffer), frames_per_buffer); | ||
| 29 | + | ||
| 30 | + return stop ? paComplete : paContinue; | ||
| 31 | +} | ||
| 32 | + | ||
| 33 | +static void Handler(int32_t sig) { | ||
| 34 | + stop = true; | ||
| 35 | + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); | ||
| 36 | +} | ||
| 37 | + | ||
| 38 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 39 | + signal(SIGINT, Handler); | ||
| 40 | + | ||
| 41 | + const char *kUsageMessage = R"usage( | ||
| 42 | +This program shows how to use VAD in sherpa-onnx. | ||
| 43 | + | ||
| 44 | + ./bin/sherpa-onnx-vad-microphone \ | ||
| 45 | + --silero-vad-model=/path/to/silero_vad.onnx \ | ||
| 46 | + --provider=cpu \ | ||
| 47 | + --num-threads=1 | ||
| 48 | + | ||
| 49 | +Please download silero_vad.onnx from | ||
| 50 | +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 51 | + | ||
| 52 | +For instance, use | ||
| 53 | +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 54 | +)usage"; | ||
| 55 | + | ||
| 56 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 57 | + sherpa_onnx::VadModelConfig config; | ||
| 58 | + | ||
| 59 | + config.Register(&po); | ||
| 60 | + po.Read(argc, argv); | ||
| 61 | + if (po.NumArgs() != 0) { | ||
| 62 | + po.PrintUsage(); | ||
| 63 | + exit(EXIT_FAILURE); | ||
| 64 | + } | ||
| 65 | + | ||
| 66 | + fprintf(stderr, "%s\n", config.ToString().c_str()); | ||
| 67 | + | ||
| 68 | + if (!config.Validate()) { | ||
| 69 | + fprintf(stderr, "Errors in config!\n"); | ||
| 70 | + return -1; | ||
| 71 | + } | ||
| 72 | + | ||
| 73 | + sherpa_onnx::Microphone mic; | ||
| 74 | + | ||
| 75 | + PaDeviceIndex num_devices = Pa_GetDeviceCount(); | ||
| 76 | + fprintf(stderr, "Num devices: %d\n", num_devices); | ||
| 77 | + | ||
| 78 | + PaStreamParameters param; | ||
| 79 | + | ||
| 80 | + param.device = Pa_GetDefaultInputDevice(); | ||
| 81 | + if (param.device == paNoDevice) { | ||
| 82 | + fprintf(stderr, "No default input device found\n"); | ||
| 83 | + exit(EXIT_FAILURE); | ||
| 84 | + } | ||
| 85 | + fprintf(stderr, "Use default device: %d\n", param.device); | ||
| 86 | + | ||
| 87 | + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); | ||
| 88 | + fprintf(stderr, " Name: %s\n", info->name); | ||
| 89 | + fprintf(stderr, " Max input channels: %d\n", info->maxInputChannels); | ||
| 90 | + | ||
| 91 | + param.channelCount = 1; | ||
| 92 | + param.sampleFormat = paFloat32; | ||
| 93 | + | ||
| 94 | + param.suggestedLatency = info->defaultLowInputLatency; | ||
| 95 | + param.hostApiSpecificStreamInfo = nullptr; | ||
| 96 | + float sample_rate = 16000; | ||
| 97 | + | ||
| 98 | + PaStream *stream; | ||
| 99 | + PaError err = | ||
| 100 | + Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ | ||
| 101 | + sample_rate, | ||
| 102 | + 0, // frames per buffer | ||
| 103 | + paClipOff, // we won't output out of range samples | ||
| 104 | + // so don't bother clipping them | ||
| 105 | + RecordCallback, &config.silero_vad.window_size); | ||
| 106 | + if (err != paNoError) { | ||
| 107 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 108 | + exit(EXIT_FAILURE); | ||
| 109 | + } | ||
| 110 | + | ||
| 111 | + err = Pa_StartStream(stream); | ||
| 112 | + | ||
| 113 | + auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config); | ||
| 114 | + | ||
| 115 | + fprintf(stderr, "Started\n"); | ||
| 116 | + | ||
| 117 | + if (err != paNoError) { | ||
| 118 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 119 | + exit(EXIT_FAILURE); | ||
| 120 | + } | ||
| 121 | + | ||
| 122 | + int32_t window_size = config.silero_vad.window_size; | ||
| 123 | + bool printed = false; | ||
| 124 | + | ||
| 125 | + while (!stop) { | ||
| 126 | + { | ||
| 127 | + std::lock_guard<std::mutex> lock(mutex); | ||
| 128 | + | ||
| 129 | + while (buffer.Size() >= window_size) { | ||
| 130 | + std::vector<float> samples = buffer.Get(buffer.Head(), window_size); | ||
| 131 | + buffer.Pop(window_size); | ||
| 132 | + vad->AcceptWaveform(samples.data(), samples.size()); | ||
| 133 | + | ||
| 134 | + if (vad->IsSpeechDetected() && !printed) { | ||
| 135 | + printed = true; | ||
| 136 | + fprintf(stderr, "\nDetected speech!\n"); | ||
| 137 | + } | ||
| 138 | + if (!vad->IsSpeechDetected()) { | ||
| 139 | + printed = false; | ||
| 140 | + } | ||
| 141 | + | ||
| 142 | + while (!vad->Empty()) { | ||
| 143 | + float duration = vad->Front().samples.size() / sample_rate; | ||
| 144 | + vad->Pop(); | ||
| 145 | + fprintf(stderr, "Duration: %.3f seconds\n", duration); | ||
| 146 | + } | ||
| 147 | + } | ||
| 148 | + } | ||
| 149 | + Pa_Sleep(100); // sleep for 100ms | ||
| 150 | + } | ||
| 151 | + | ||
| 152 | + err = Pa_CloseStream(stream); | ||
| 153 | + if (err != paNoError) { | ||
| 154 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 155 | + exit(EXIT_FAILURE); | ||
| 156 | + } | ||
| 157 | + | ||
| 158 | + return 0; | ||
| 159 | +} |
sherpa-onnx/csrc/silero-vad-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/silero-vad-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/silero-vad-model-config.h" | ||
| 6 | + | ||
| 7 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 8 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void SileroVadModelConfig::Register(ParseOptions *po) { | ||
| 13 | + po->Register("silero-vad-model", &model, "Path to silero VAD ONNX model."); | ||
| 14 | + | ||
| 15 | + po->Register("silero-vad-threshold", &threshold, | ||
| 16 | + "Speech threshold. Silero VAD outputs speech probabilities for " | ||
| 17 | + "each audio chunk, probabilities ABOVE this value are " | ||
| 18 | + "considered as SPEECH. It is better to tune this parameter for " | ||
| 19 | + "each dataset separately, but lazy " | ||
| 20 | + "0.5 is pretty good for most datasets."); | ||
| 21 | + | ||
| 22 | + po->Register( | ||
| 23 | + "silero-vad-min-silence-duration", &min_silence_duration, | ||
| 24 | + "In seconds. In the end of each speech chunk wait for " | ||
| 25 | + "--silero-vad-min-silence-duration seconds before separating it"); | ||
| 26 | + | ||
| 27 | + po->Register("silero-vad-min-speech-duration", &min_speech_duration, | ||
| 28 | + "In seconds. Speech must last for at least " | ||
| 29 | + "--silero-vad-min-speech-duration seconds before it is reported as speech"); | ||
| 30 | + | ||
| 31 | + po->Register( | ||
| 32 | + "silero-vad-window-size", &window_size, | ||
| 33 | + "In samples. Audio chunks of --silero-vad-window-size samples are fed " | ||
| 34 | + "to the silero VAD model. WARNING! Silero VAD models were trained using " | ||
| 35 | + "512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples " | ||
| 36 | + "for 8000 sample rate. Values other than these may affect model " | ||
| 37 | + "performance!"); | ||
| 38 | +} | ||
| 39 | + | ||
| 40 | +bool SileroVadModelConfig::Validate() const { | ||
| 41 | + if (model.empty()) { | ||
| 42 | + SHERPA_ONNX_LOGE("Please provide --silero-vad-model"); | ||
| 43 | + return false; | ||
| 44 | + } | ||
| 45 | + | ||
| 46 | + if (!FileExists(model)) { | ||
| 47 | + SHERPA_ONNX_LOGE("Silero vad model file %s does not exist", model.c_str()); | ||
| 48 | + return false; | ||
| 49 | + } | ||
| 50 | + | ||
| 51 | + if (threshold < 0.01) { | ||
| 52 | + SHERPA_ONNX_LOGE( | ||
| 53 | + "Please use a larger value for --silero-vad-threshold. Given: %f", | ||
| 54 | + threshold); | ||
| 55 | + return false; | ||
| 56 | + } | ||
| 57 | + | ||
| 58 | + if (threshold >= 1) { | ||
| 59 | + SHERPA_ONNX_LOGE( | ||
| 60 | + "Please use a smaller value for --silero-vad-threshold. Given: %f", | ||
| 61 | + threshold); | ||
| 62 | + return false; | ||
| 63 | + } | ||
| 64 | + | ||
| 65 | + return true; | ||
| 66 | +} | ||
| 67 | + | ||
| 68 | +std::string SileroVadModelConfig::ToString() const { | ||
| 69 | + std::ostringstream os; | ||
| 70 | + | ||
| 71 | + os << "SileroVadModelConfig("; | ||
| 72 | + os << "model=\"" << model << "\", "; | ||
| 73 | + os << "threshold=" << threshold << ", "; | ||
| 74 | + os << "min_silence_duration=" << min_silence_duration << ", "; | ||
| 75 | + os << "min_speech_duration=" << min_speech_duration << ", "; | ||
| 76 | + os << "window_size=" << window_size << ")"; | ||
| 77 | + | ||
| 78 | + return os.str(); | ||
| 79 | +} | ||
| 80 | + | ||
| 81 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/silero-vad-model-config.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/silero-vad-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_ | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +struct SileroVadModelConfig { | ||
| 14 | + std::string model; | ||
| 15 | + | ||
| 16 | + // threshold to classify a segment as speech | ||
| 17 | + // | ||
| 18 | + // If the predicted probability of a segment is larger than this | ||
| 19 | + // value, then it is classified as speech. | ||
| 20 | + float threshold = 0.5; | ||
| 21 | + | ||
| 22 | + float min_silence_duration = 0.5; // in seconds | ||
| 23 | + | ||
| 24 | + float min_speech_duration = 0.25; // in seconds | ||
| 25 | + | ||
| 26 | + // 512, 1024, 1536 samples for 16000 Hz | ||
| 27 | + // 256, 512, 768 samples for 8000 Hz | ||
| 28 | + int window_size = 512; // in samples | ||
| 29 | + | ||
| 30 | + SileroVadModelConfig() = default; | ||
| 31 | + | ||
| 32 | + void Register(ParseOptions *po); | ||
| 33 | + | ||
| 34 | + bool Validate() const; | ||
| 35 | + | ||
| 36 | + std::string ToString() const; | ||
| 37 | +}; | ||
| 38 | + | ||
| 39 | +} // namespace sherpa_onnx | ||
| 40 | + | ||
| 41 | +#endif // SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_CONFIG_H_ |
sherpa-onnx/csrc/silero-vad-model.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/silero-vad-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/silero-vad-model.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | +#include <utility> | ||
| 9 | +#include <vector> | ||
| 10 | + | ||
| 11 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 12 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 13 | +#include "sherpa-onnx/csrc/session.h" | ||
| 14 | + | ||
| 15 | +namespace sherpa_onnx { | ||
| 16 | + | ||
| 17 | +class SileroVadModel::Impl { | ||
| 18 | + public: | ||
| 19 | + explicit Impl(const VadModelConfig &config) | ||
| 20 | + : config_(config), | ||
| 21 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 22 | + sess_opts_(GetSessionOptions(config)), | ||
| 23 | + allocator_{} { | ||
| 24 | + auto buf = ReadFile(config.silero_vad.model); | ||
| 25 | + Init(buf.data(), buf.size()); | ||
| 26 | + | ||
| 27 | + sample_rate_ = config.sample_rate; | ||
| 28 | + if (sample_rate_ != 16000) { | ||
| 29 | + SHERPA_ONNX_LOGE("Expected sample rate 16000. Given: %d", | ||
| 30 | + config.sample_rate); | ||
| 31 | + exit(-1); | ||
| 32 | + } | ||
| 33 | + | ||
| 34 | + min_silence_samples_ = | ||
| 35 | + sample_rate_ * config_.silero_vad.min_silence_duration; | ||
| 36 | + | ||
| 37 | + min_speech_samples_ = sample_rate_ * config_.silero_vad.min_speech_duration; | ||
| 38 | + } | ||
| 39 | + | ||
| 40 | + void Reset() { | ||
| 41 | + // 2 - number of LSTM layer | ||
| 42 | + // 1 - batch size | ||
| 43 | + // 64 - hidden dim | ||
| 44 | + std::array<int64_t, 3> shape{2, 1, 64}; | ||
| 45 | + | ||
| 46 | + Ort::Value h = | ||
| 47 | + Ort::Value::CreateTensor<float>(allocator_, shape.data(), shape.size()); | ||
| 48 | + | ||
| 49 | + Ort::Value c = | ||
| 50 | + Ort::Value::CreateTensor<float>(allocator_, shape.data(), shape.size()); | ||
| 51 | + | ||
| 52 | + Fill<float>(&h, 0); | ||
| 53 | + Fill<float>(&c, 0); | ||
| 54 | + | ||
| 55 | + states_.clear(); | ||
| 56 | + | ||
| 57 | + states_.reserve(2); | ||
| 58 | + states_.push_back(std::move(h)); | ||
| 59 | + states_.push_back(std::move(c)); | ||
| 60 | + | ||
| 61 | + triggered_ = false; | ||
| 62 | + current_sample_ = 0; | ||
| 63 | + temp_start_ = 0; | ||
| 64 | + temp_end_ = 0; | ||
| 65 | + } | ||
| 66 | + | ||
| 67 | + bool IsSpeech(const float *samples, int32_t n) { | ||
| 68 | + if (n != config_.silero_vad.window_size) { | ||
| 69 | + SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, | ||
| 70 | + config_.silero_vad.window_size); | ||
| 71 | + exit(-1); | ||
| 72 | + } | ||
| 73 | + | ||
| 74 | + auto memory_info = | ||
| 75 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 76 | + | ||
| 77 | + std::array<int64_t, 2> x_shape = {1, n}; | ||
| 78 | + | ||
| 79 | + Ort::Value x = | ||
| 80 | + Ort::Value::CreateTensor(memory_info, const_cast<float *>(samples), n, | ||
| 81 | + x_shape.data(), x_shape.size()); | ||
| 82 | + | ||
| 83 | + int64_t sr_shape = 1; | ||
| 84 | + Ort::Value sr = | ||
| 85 | + Ort::Value::CreateTensor(memory_info, &sample_rate_, 1, &sr_shape, 1); | ||
| 86 | + | ||
| 87 | + std::array<Ort::Value, 4> inputs = {std::move(x), std::move(sr), | ||
| 88 | + std::move(states_[0]), | ||
| 89 | + std::move(states_[1])}; | ||
| 90 | + | ||
| 91 | + auto out = | ||
| 92 | + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), | ||
| 93 | + output_names_ptr_.data(), output_names_ptr_.size()); | ||
| 94 | + | ||
| 95 | + states_[0] = std::move(out[1]); | ||
| 96 | + states_[1] = std::move(out[2]); | ||
| 97 | + | ||
| 98 | + float prob = out[0].GetTensorData<float>()[0]; | ||
| 99 | + | ||
| 100 | + float threshold = config_.silero_vad.threshold; | ||
| 101 | + | ||
| 102 | + current_sample_ += config_.silero_vad.window_size; | ||
| 103 | + | ||
| 104 | + if (prob > threshold && temp_end_ != 0) { | ||
| 105 | + temp_end_ = 0; | ||
| 106 | + } | ||
| 107 | + | ||
| 108 | + if (prob > threshold && temp_start_ == 0) { | ||
| 109 | + // start speaking, but we require that it must satisfy | ||
| 110 | + // min_speech_duration | ||
| 111 | + temp_start_ = current_sample_; | ||
| 112 | + return false; | ||
| 113 | + } | ||
| 114 | + | ||
| 115 | + if (prob > threshold && temp_start_ != 0 && !triggered_) { | ||
| 116 | + if (current_sample_ - temp_start_ < min_speech_samples_) { | ||
| 117 | + return false; | ||
| 118 | + } | ||
| 119 | + | ||
| 120 | + triggered_ = true; | ||
| 121 | + | ||
| 122 | + return true; | ||
| 123 | + } | ||
| 124 | + | ||
| 125 | + if ((prob < threshold) && !triggered_) { | ||
| 126 | + // silence | ||
| 127 | + temp_start_ = 0; | ||
| 128 | + temp_end_ = 0; | ||
| 129 | + return false; | ||
| 130 | + } | ||
| 131 | + | ||
| 132 | + if ((prob > threshold - 0.15) && triggered_) { | ||
| 133 | + // speaking | ||
| 134 | + return true; | ||
| 135 | + } | ||
| 136 | + | ||
| 137 | + if ((prob > threshold) && !triggered_) { | ||
| 138 | + // start speaking | ||
| 139 | + triggered_ = true; | ||
| 140 | + | ||
| 141 | + return true; | ||
| 142 | + } | ||
| 143 | + | ||
| 144 | + if ((prob < threshold) && triggered_) { | ||
| 145 | + // stop to speak | ||
| 146 | + if (temp_end_ == 0) { | ||
| 147 | + temp_end_ = current_sample_; | ||
| 148 | + } | ||
| 149 | + | ||
| 150 | + if (current_sample_ - temp_end_ < min_silence_samples_) { | ||
| 151 | + // continue speaking | ||
| 152 | + return true; | ||
| 153 | + } | ||
| 154 | + // stopped speaking | ||
| 155 | + temp_start_ = 0; | ||
| 156 | + temp_end_ = 0; | ||
| 157 | + triggered_ = false; | ||
| 158 | + return false; | ||
| 159 | + } | ||
| 160 | + | ||
| 161 | + return false; | ||
| 162 | + } | ||
| 163 | + | ||
| 164 | + int32_t WindowSize() const { return config_.silero_vad.window_size; } | ||
| 165 | + | ||
| 166 | + int32_t MinSilenceDurationSamples() const { return min_silence_samples_; } | ||
| 167 | + | ||
| 168 | + int32_t MinSpeechDurationSamples() const { return min_speech_samples_; } | ||
| 169 | + | ||
| 170 | + private: | ||
| 171 | + void Init(void *model_data, size_t model_data_length) { | ||
| 172 | + sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length, | ||
| 173 | + sess_opts_); | ||
| 174 | + | ||
| 175 | + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); | ||
| 176 | + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); | ||
| 177 | + Check(); | ||
| 178 | + | ||
| 179 | + Reset(); | ||
| 180 | + } | ||
| 181 | + | ||
| 182 | + void Check() { | ||
| 183 | + if (input_names_.size() != 4) { | ||
| 184 | + SHERPA_ONNX_LOGE("Expect 4 inputs. Given: %d", | ||
| 185 | + static_cast<int32_t>(input_names_.size())); | ||
| 186 | + exit(-1); | ||
| 187 | + } | ||
| 188 | + | ||
| 189 | + if (input_names_[0] != "input") { | ||
| 190 | + SHERPA_ONNX_LOGE("Input[0]: %s. Expected: input", | ||
| 191 | + input_names_[0].c_str()); | ||
| 192 | + exit(-1); | ||
| 193 | + } | ||
| 194 | + | ||
| 195 | + if (input_names_[1] != "sr") { | ||
| 196 | + SHERPA_ONNX_LOGE("Input[1]: %s. Expected: sr", input_names_[1].c_str()); | ||
| 197 | + exit(-1); | ||
| 198 | + } | ||
| 199 | + | ||
| 200 | + if (input_names_[2] != "h") { | ||
| 201 | + SHERPA_ONNX_LOGE("Input[2]: %s. Expected: h", input_names_[2].c_str()); | ||
| 202 | + exit(-1); | ||
| 203 | + } | ||
| 204 | + | ||
| 205 | + if (input_names_[3] != "c") { | ||
| 206 | + SHERPA_ONNX_LOGE("Input[3]: %s. Expected: c", input_names_[3].c_str()); | ||
| 207 | + exit(-1); | ||
| 208 | + } | ||
| 209 | + | ||
| 210 | + // Now for outputs | ||
| 211 | + if (output_names_.size() != 3) { | ||
| 212 | + SHERPA_ONNX_LOGE("Expect 3 outputs. Given: %d", | ||
| 213 | + static_cast<int32_t>(output_names_.size())); | ||
| 214 | + exit(-1); | ||
| 215 | + } | ||
| 216 | + | ||
| 217 | + if (output_names_[0] != "output") { | ||
| 218 | + SHERPA_ONNX_LOGE("Output[0]: %s. Expected: output", | ||
| 219 | + output_names_[0].c_str()); | ||
| 220 | + exit(-1); | ||
| 221 | + } | ||
| 222 | + | ||
| 223 | + if (output_names_[1] != "hn") { | ||
| 224 | + SHERPA_ONNX_LOGE("Output[1]: %s. Expected: hn", output_names_[1].c_str()); | ||
| 225 | + exit(-1); | ||
| 226 | + } | ||
| 227 | + | ||
| 228 | + if (output_names_[2] != "cn") { | ||
| 229 | + SHERPA_ONNX_LOGE("Output[2]: %s. Expected: cn", output_names_[2].c_str()); | ||
| 230 | + exit(-1); | ||
| 231 | + } | ||
| 232 | + } | ||
| 233 | + | ||
| 234 | + private: | ||
| 235 | + VadModelConfig config_; | ||
| 236 | + | ||
| 237 | + Ort::Env env_; | ||
| 238 | + Ort::SessionOptions sess_opts_; | ||
| 239 | + Ort::AllocatorWithDefaultOptions allocator_; | ||
| 240 | + | ||
| 241 | + std::unique_ptr<Ort::Session> sess_; | ||
| 242 | + | ||
| 243 | + std::vector<std::string> input_names_; | ||
| 244 | + std::vector<const char *> input_names_ptr_; | ||
| 245 | + | ||
| 246 | + std::vector<std::string> output_names_; | ||
| 247 | + std::vector<const char *> output_names_ptr_; | ||
| 248 | + | ||
| 249 | + std::vector<Ort::Value> states_; | ||
| 250 | + int64_t sample_rate_; | ||
| 251 | + int32_t min_silence_samples_; | ||
| 252 | + int32_t min_speech_samples_; | ||
| 253 | + | ||
| 254 | + bool triggered_ = false; | ||
| 255 | + int32_t current_sample_ = 0; | ||
| 256 | + int32_t temp_start_ = 0; | ||
| 257 | + int32_t temp_end_ = 0; | ||
| 258 | +}; | ||
| 259 | + | ||
| 260 | +SileroVadModel::SileroVadModel(const VadModelConfig &config) | ||
| 261 | + : impl_(std::make_unique<Impl>(config)) {} | ||
| 262 | + | ||
| 263 | +SileroVadModel::~SileroVadModel() = default; | ||
| 264 | + | ||
| 265 | +void SileroVadModel::Reset() { return impl_->Reset(); } | ||
| 266 | + | ||
| 267 | +bool SileroVadModel::IsSpeech(const float *samples, int32_t n) { | ||
| 268 | + return impl_->IsSpeech(samples, n); | ||
| 269 | +} | ||
| 270 | + | ||
| 271 | +int32_t SileroVadModel::WindowSize() const { return impl_->WindowSize(); } | ||
| 272 | + | ||
| 273 | +int32_t SileroVadModel::MinSilenceDurationSamples() const { | ||
| 274 | + return impl_->MinSilenceDurationSamples(); | ||
| 275 | +} | ||
| 276 | + | ||
| 277 | +int32_t SileroVadModel::MinSpeechDurationSamples() const { | ||
| 278 | + return impl_->MinSpeechDurationSamples(); | ||
| 279 | +} | ||
| 280 | + | ||
| 281 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/silero-vad-model.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/silero-vad-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_ | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/vad-model.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +class SileroVadModel : public VadModel { | ||
| 14 | + public: | ||
| 15 | + explicit SileroVadModel(const VadModelConfig &config); | ||
| 16 | + ~SileroVadModel() override; | ||
| 17 | + | ||
| 18 | + // reset the internal model states | ||
| 19 | + void Reset() override; | ||
| 20 | + | ||
| 21 | + /** | ||
| 22 | + * @param samples Pointer to a 1-d array containing audio samples. | ||
| 23 | + * Each sample should be normalized to the range [-1, 1]. | ||
| 24 | + * @param n Number of samples. | ||
| 25 | + * | ||
| 26 | + * @return Return true if speech is detected. Return false otherwise. | ||
| 27 | + */ | ||
| 28 | + bool IsSpeech(const float *samples, int32_t n) override; | ||
| 29 | + | ||
| 30 | + int32_t WindowSize() const override; | ||
| 31 | + | ||
| 32 | + int32_t MinSilenceDurationSamples() const override; | ||
| 33 | + int32_t MinSpeechDurationSamples() const override; | ||
| 34 | + | ||
| 35 | + private: | ||
| 36 | + class Impl; | ||
| 37 | + std::unique_ptr<Impl> impl_; | ||
| 38 | +}; | ||
| 39 | + | ||
| 40 | +} // namespace sherpa_onnx | ||
| 41 | + | ||
| 42 | +#endif // SHERPA_ONNX_CSRC_SILERO_VAD_MODEL_H_ |
sherpa-onnx/csrc/vad-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/vad-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/vad-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <sstream> | ||
| 8 | +#include <string> | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void VadModelConfig::Register(ParseOptions *po) { | ||
| 13 | + silero_vad.Register(po); | ||
| 14 | + | ||
| 15 | + po->Register("vad-sample-rate", &sample_rate, | ||
| 16 | + "Sample rate expected by the VAD model"); | ||
| 17 | + | ||
| 18 | + po->Register("vad-num-threads", &num_threads, | ||
| 19 | + "Number of threads to run the VAD model"); | ||
| 20 | + | ||
| 21 | + po->Register("vad-provider", &provider, | ||
| 22 | + "Specify a provider to run the VAD model. Supported values: " | ||
| 23 | + "cpu, cuda, coreml"); | ||
| 24 | + | ||
| 25 | + po->Register("vad-debug", &debug, | ||
| 26 | + "true to display debug information when loading vad models"); | ||
| 27 | +} | ||
| 28 | + | ||
| 29 | +bool VadModelConfig::Validate() const { return silero_vad.Validate(); } | ||
| 30 | + | ||
| 31 | +std::string VadModelConfig::ToString() const { | ||
| 32 | + std::ostringstream os; | ||
| 33 | + | ||
| 34 | + os << "VadModelConfig("; | ||
| 35 | + os << "silero_vad=" << silero_vad.ToString() << ", "; | ||
| 36 | + os << "sample_rate=" << sample_rate << ", "; | ||
| 37 | + os << "num_threads=" << num_threads << ", "; | ||
| 38 | + os << "provider=\"" << provider << "\", "; | ||
| 39 | + os << "debug=" << (debug ? "True" : "False") << ")"; | ||
| 40 | + | ||
| 41 | + return os.str(); | ||
| 42 | +} | ||
| 43 | + | ||
| 44 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/vad-model-config.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/vad-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_ | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 10 | +#include "sherpa-onnx/csrc/silero-vad-model-config.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +struct VadModelConfig { | ||
| 15 | + SileroVadModelConfig silero_vad; | ||
| 16 | + | ||
| 17 | + int32_t sample_rate = 16000; | ||
| 18 | + int32_t num_threads = 1; | ||
| 19 | + std::string provider = "cpu"; | ||
| 20 | + | ||
| 21 | + // true to show debug information when loading models | ||
| 22 | + bool debug = false; | ||
| 23 | + | ||
| 24 | + VadModelConfig() = default; | ||
| 25 | + | ||
| 26 | + VadModelConfig(const SileroVadModelConfig &silero_vad, int32_t sample_rate, | ||
| 27 | + int32_t num_threads, const std::string &provider, bool debug) | ||
| 28 | + : silero_vad(silero_vad), | ||
| 29 | + sample_rate(sample_rate), | ||
| 30 | + num_threads(num_threads), | ||
| 31 | + provider(provider), | ||
| 32 | + debug(debug) {} | ||
| 33 | + | ||
| 34 | + void Register(ParseOptions *po); | ||
| 35 | + bool Validate() const; | ||
| 36 | + | ||
| 37 | + std::string ToString() const; | ||
| 38 | +}; | ||
| 39 | + | ||
| 40 | +} // namespace sherpa_onnx | ||
| 41 | + | ||
| 42 | +#endif // SHERPA_ONNX_CSRC_VAD_MODEL_CONFIG_H_ |
sherpa-onnx/csrc/vad-model.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/vad-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/vad-model.h" | ||
| 6 | + | ||
| 7 | +#include "sherpa-onnx/csrc/silero-vad-model.h" | ||
| 8 | + | ||
| 9 | +namespace sherpa_onnx { | ||
| 10 | + | ||
| 11 | +std::unique_ptr<VadModel> VadModel::Create(const VadModelConfig &config) { | ||
| 12 | + // TODO(fangjun): Support other VAD models. | ||
| 13 | + return std::make_unique<SileroVadModel>(config); | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/vad-model.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/vad-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_VAD_MODEL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_VAD_MODEL_H_ | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/vad-model-config.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +class VadModel { | ||
| 14 | + public: | ||
| 15 | + virtual ~VadModel() = default; | ||
| 16 | + | ||
| 17 | + static std::unique_ptr<VadModel> Create(const VadModelConfig &config); | ||
| 18 | + | ||
| 19 | + // reset the internal model states | ||
| 20 | + virtual void Reset() = 0; | ||
| 21 | + | ||
| 22 | + /** | ||
| 23 | + * @param samples Pointer to a 1-d array containing audio samples. | ||
| 24 | + * Each sample should be normalized to the range [-1, 1]. | ||
| 25 | + * @param n Number of samples. Should be equal to WindowSize() | ||
| 26 | + * | ||
| 27 | + * @return Return true if speech is detected. Return false otherwise. | ||
| 28 | + */ | ||
| 29 | + virtual bool IsSpeech(const float *samples, int32_t n) = 0; | ||
| 30 | + | ||
| 31 | + virtual int32_t WindowSize() const = 0; | ||
| 32 | + | ||
| 33 | + virtual int32_t MinSilenceDurationSamples() const = 0; | ||
| 34 | + virtual int32_t MinSpeechDurationSamples() const = 0; | ||
| 35 | +}; | ||
| 36 | + | ||
| 37 | +} // namespace sherpa_onnx | ||
| 38 | + | ||
| 39 | +#endif // SHERPA_ONNX_CSRC_VAD_MODEL_H_ |
sherpa-onnx/csrc/voice-activity-detector.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/voice-activity-detector.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/voice-activity-detector.h" | ||
| 6 | + | ||
| 7 | +#include <queue> | ||
| 8 | +#include <utility> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/circular-buffer.h" | ||
| 11 | +#include "sherpa-onnx/csrc/vad-model.h" | ||
| 12 | + | ||
| 13 | +namespace sherpa_onnx { | ||
| 14 | + | ||
| 15 | +class VoiceActivityDetector::Impl { | ||
| 16 | + public: | ||
| 17 | + explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60) | ||
| 18 | + : model_(VadModel::Create(config)), | ||
| 19 | + config_(config), | ||
| 20 | + buffer_(buffer_size_in_seconds * config.sample_rate) {} | ||
| 21 | + | ||
| 22 | + void AcceptWaveform(const float *samples, int32_t n) { | ||
| 23 | + buffer_.Push(samples, n); | ||
| 24 | + | ||
| 25 | + bool is_speech = model_->IsSpeech(samples, n); | ||
| 26 | + if (is_speech) { | ||
| 27 | + if (start_ == -1) { | ||
| 28 | + // beginning of speech | ||
| 29 | + start_ = buffer_.Tail() - 2 * model_->WindowSize() - | ||
| 30 | + model_->MinSpeechDurationSamples(); | ||
| 31 | + } | ||
| 32 | + } else { | ||
| 33 | + // non-speech | ||
| 34 | + if (start_ != -1) { | ||
| 35 | + // end of speech, save the speech segment | ||
| 36 | + int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples(); | ||
| 37 | + | ||
| 38 | + std::vector<float> samples = buffer_.Get(start_, end - start_); | ||
| 39 | + SpeechSegment segment; | ||
| 40 | + | ||
| 41 | + segment.start = start_; | ||
| 42 | + segment.samples = std::move(samples); | ||
| 43 | + | ||
| 44 | + segments_.push(std::move(segment)); | ||
| 45 | + | ||
| 46 | + buffer_.Pop(end - buffer_.Head()); | ||
| 47 | + } | ||
| 48 | + | ||
| 49 | + start_ = -1; | ||
| 50 | + } | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | + bool Empty() const { return segments_.empty(); } | ||
| 54 | + | ||
| 55 | + void Pop() { segments_.pop(); } | ||
| 56 | + | ||
| 57 | + const SpeechSegment &Front() const { return segments_.front(); } | ||
| 58 | + | ||
| 59 | + void Reset() { | ||
| 60 | + std::queue<SpeechSegment>().swap(segments_); | ||
| 61 | + | ||
| 62 | + model_->Reset(); | ||
| 63 | + buffer_.Reset(); | ||
| 64 | + | ||
| 65 | + start_ = -1; | ||
| 66 | + } | ||
| 67 | + | ||
| 68 | + bool IsSpeechDetected() const { return start_ != -1; } | ||
| 69 | + | ||
| 70 | + private: | ||
| 71 | + std::queue<SpeechSegment> segments_; | ||
| 72 | + | ||
| 73 | + std::unique_ptr<VadModel> model_; | ||
| 74 | + VadModelConfig config_; | ||
| 75 | + CircularBuffer buffer_; | ||
| 76 | + | ||
| 77 | + int32_t start_ = -1; | ||
| 78 | +}; | ||
| 79 | + | ||
| 80 | +VoiceActivityDetector::VoiceActivityDetector( | ||
| 81 | + const VadModelConfig &config, float buffer_size_in_seconds /*= 60*/) | ||
| 82 | + : impl_(std::make_unique<Impl>(config, buffer_size_in_seconds)) {} | ||
| 83 | + | ||
| 84 | +VoiceActivityDetector::~VoiceActivityDetector() = default; | ||
| 85 | + | ||
| 86 | +void VoiceActivityDetector::AcceptWaveform(const float *samples, int32_t n) { | ||
| 87 | + impl_->AcceptWaveform(samples, n); | ||
| 88 | +} | ||
| 89 | + | ||
| 90 | +bool VoiceActivityDetector::Empty() const { return impl_->Empty(); } | ||
| 91 | + | ||
| 92 | +void VoiceActivityDetector::Pop() { impl_->Pop(); } | ||
| 93 | + | ||
| 94 | +const SpeechSegment &VoiceActivityDetector::Front() const { | ||
| 95 | + return impl_->Front(); | ||
| 96 | +} | ||
| 97 | + | ||
| 98 | +void VoiceActivityDetector::Reset() { impl_->Reset(); } | ||
| 99 | + | ||
| 100 | +bool VoiceActivityDetector::IsSpeechDetected() const { | ||
| 101 | + return impl_->IsSpeechDetected(); | ||
| 102 | +} | ||
| 103 | + | ||
| 104 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/voice-activity-detector.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/voice-activity-detector.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_ | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | +#include <vector> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/vad-model-config.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +struct SpeechSegment { | ||
| 15 | + int32_t start; // in samples | ||
| 16 | + std::vector<float> samples; | ||
| 17 | +}; | ||
| 18 | + | ||
| 19 | +class VoiceActivityDetector { | ||
| 20 | + public: | ||
| 21 | + explicit VoiceActivityDetector(const VadModelConfig &config, | ||
| 22 | + float buffer_size_in_seconds = 60); | ||
| 23 | + ~VoiceActivityDetector(); | ||
| 24 | + | ||
| 25 | + void AcceptWaveform(const float *samples, int32_t n); | ||
| 26 | + bool Empty() const; | ||
| 27 | + void Pop(); | ||
| 28 | + const SpeechSegment &Front() const; | ||
| 29 | + | ||
| 30 | + bool IsSpeechDetected() const; | ||
| 31 | + | ||
| 32 | + void Reset(); | ||
| 33 | + | ||
| 34 | + private: | ||
| 35 | + class Impl; | ||
| 36 | + std::unique_ptr<Impl> impl_; | ||
| 37 | +}; | ||
| 38 | + | ||
| 39 | +} // namespace sherpa_onnx | ||
| 40 | + | ||
| 41 | +#endif // SHERPA_ONNX_CSRC_VOICE_ACTIVITY_DETECTOR_H_ |
| 1 | include_directories(${CMAKE_SOURCE_DIR}) | 1 | include_directories(${CMAKE_SOURCE_DIR}) |
| 2 | 2 | ||
| 3 | pybind11_add_module(_sherpa_onnx | 3 | pybind11_add_module(_sherpa_onnx |
| 4 | + circular-buffer.cc | ||
| 4 | display.cc | 5 | display.cc |
| 5 | endpoint.cc | 6 | endpoint.cc |
| 6 | features.cc | 7 | features.cc |
| @@ -20,6 +21,10 @@ pybind11_add_module(_sherpa_onnx | @@ -20,6 +21,10 @@ pybind11_add_module(_sherpa_onnx | ||
| 20 | online-stream.cc | 21 | online-stream.cc |
| 21 | online-transducer-model-config.cc | 22 | online-transducer-model-config.cc |
| 22 | sherpa-onnx.cc | 23 | sherpa-onnx.cc |
| 24 | + silero-vad-model-config.cc | ||
| 25 | + vad-model-config.cc | ||
| 26 | + vad-model.cc | ||
| 27 | + voice-activity-detector.cc | ||
| 23 | ) | 28 | ) |
| 24 | 29 | ||
| 25 | if(APPLE) | 30 | if(APPLE) |
sherpa-onnx/python/csrc/circular-buffer.cc
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/circular-buffer.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/circular-buffer.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/circular-buffer.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +void PybindCircularBuffer(py::module *m) { | ||
| 14 | + using PyClass = CircularBuffer; | ||
| 15 | + py::class_<PyClass>(*m, "CircularBuffer") | ||
| 16 | + .def(py::init<int32_t>(), py::arg("capacity")) | ||
| 17 | + .def( | ||
| 18 | + "push", | ||
| 19 | + [](PyClass &self, const std::vector<float> &samples) { | ||
| 20 | + self.Push(samples.data(), samples.size()); | ||
| 21 | + }, | ||
| 22 | + py::arg("samples")) | ||
| 23 | + .def("get", &PyClass::Get, py::arg("start_index"), py::arg("n")) | ||
| 24 | + .def("pop", &PyClass::Pop, py::arg("n")) | ||
| 25 | + .def("reset", &PyClass::Reset) | ||
| 26 | + .def_property_readonly("size", &PyClass::Size) | ||
| 27 | + .def_property_readonly("head", &PyClass::Head) | ||
| 28 | + .def_property_readonly("tail", &PyClass::Tail); | ||
| 29 | +} | ||
| 30 | + | ||
| 31 | +} // namespace sherpa_onnx |
sherpa-onnx/python/csrc/circular-buffer.h
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/circular-buffer.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindCircularBuffer(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_ |
| @@ -4,6 +4,7 @@ | @@ -4,6 +4,7 @@ | ||
| 4 | 4 | ||
| 5 | #include "sherpa-onnx/python/csrc/sherpa-onnx.h" | 5 | #include "sherpa-onnx/python/csrc/sherpa-onnx.h" |
| 6 | 6 | ||
| 7 | +#include "sherpa-onnx/python/csrc/circular-buffer.h" | ||
| 7 | #include "sherpa-onnx/python/csrc/display.h" | 8 | #include "sherpa-onnx/python/csrc/display.h" |
| 8 | #include "sherpa-onnx/python/csrc/endpoint.h" | 9 | #include "sherpa-onnx/python/csrc/endpoint.h" |
| 9 | #include "sherpa-onnx/python/csrc/features.h" | 10 | #include "sherpa-onnx/python/csrc/features.h" |
| @@ -15,6 +16,9 @@ | @@ -15,6 +16,9 @@ | ||
| 15 | #include "sherpa-onnx/python/csrc/online-model-config.h" | 16 | #include "sherpa-onnx/python/csrc/online-model-config.h" |
| 16 | #include "sherpa-onnx/python/csrc/online-recognizer.h" | 17 | #include "sherpa-onnx/python/csrc/online-recognizer.h" |
| 17 | #include "sherpa-onnx/python/csrc/online-stream.h" | 18 | #include "sherpa-onnx/python/csrc/online-stream.h" |
| 19 | +#include "sherpa-onnx/python/csrc/vad-model-config.h" | ||
| 20 | +#include "sherpa-onnx/python/csrc/vad-model.h" | ||
| 21 | +#include "sherpa-onnx/python/csrc/voice-activity-detector.h" | ||
| 18 | 22 | ||
| 19 | namespace sherpa_onnx { | 23 | namespace sherpa_onnx { |
| 20 | 24 | ||
| @@ -34,6 +38,11 @@ PYBIND11_MODULE(_sherpa_onnx, m) { | @@ -34,6 +38,11 @@ PYBIND11_MODULE(_sherpa_onnx, m) { | ||
| 34 | PybindOfflineLMConfig(&m); | 38 | PybindOfflineLMConfig(&m); |
| 35 | PybindOfflineModelConfig(&m); | 39 | PybindOfflineModelConfig(&m); |
| 36 | PybindOfflineRecognizer(&m); | 40 | PybindOfflineRecognizer(&m); |
| 41 | + | ||
| 42 | + PybindVadModelConfig(&m); | ||
| 43 | + PybindVadModel(&m); | ||
| 44 | + PybindCircularBuffer(&m); | ||
| 45 | + PybindVoiceActivityDetector(&m); | ||
| 37 | } | 46 | } |
| 38 | 47 | ||
| 39 | } // namespace sherpa_onnx | 48 | } // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/python/csrc/silero-vad-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/silero-vad-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <memory> | ||
| 8 | +#include <string> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/silero-vad-model-config.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
// Registers the C++ struct SileroVadModelConfig on module `m` as the Python
// class "SileroVadModelConfig".
//
// Python-side surface, as bound below:
//  - SileroVadModelConfig(): default construction.
//  - SileroVadModelConfig(model, threshold=0.5, min_silence_duration=0.5,
//    min_speech_duration=0.25, window_size=512): only `model` has no default.
//  - read/write attributes: model, threshold, min_silence_duration,
//    min_speech_duration, window_size.
//  - __str__ forwards to ToString(); validate() forwards to Validate().
//
// NOTE(review): units are not visible in this file. The two durations are
// presumably seconds and window_size presumably a number of samples -- confirm
// against sherpa-onnx/csrc/silero-vad-model-config.h.
| 14 | +void PybindSileroVadModelConfig(py::module *m) { | ||
| 15 | + using PyClass = SileroVadModelConfig; | ||
| 16 | + py::class_<PyClass>(*m, "SileroVadModelConfig") | ||
| 17 | + .def(py::init<>()) | ||
// Keyword constructor: start from a default-constructed config, then copy
// every argument into the field of the same name.
| 18 | + .def(py::init([](const std::string &model, float threshold, | ||
| 19 | + float min_silence_duration, float min_speech_duration, | ||
| 20 | + int32_t window_size) -> std::unique_ptr<PyClass> { | ||
| 21 | + auto ans = std::make_unique<PyClass>(); | ||
| 22 | + | ||
| 23 | + ans->model = model; | ||
| 24 | + ans->threshold = threshold; | ||
| 25 | + ans->min_silence_duration = min_silence_duration; | ||
| 26 | + ans->min_speech_duration = min_speech_duration; | ||
| 27 | + ans->window_size = window_size; | ||
| 28 | + | ||
| 29 | + return ans; | ||
| 30 | + }), | ||
// Argument names and defaults as seen from Python.
| 31 | + py::arg("model"), py::arg("threshold") = 0.5, | ||
| 32 | + py::arg("min_silence_duration") = 0.5, | ||
| 33 | + py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512) | ||
| 34 | + .def_readwrite("model", &PyClass::model) | ||
| 35 | + .def_readwrite("threshold", &PyClass::threshold) | ||
| 36 | + .def_readwrite("min_silence_duration", &PyClass::min_silence_duration) | ||
| 37 | + .def_readwrite("min_speech_duration", &PyClass::min_speech_duration) | ||
| 38 | + .def_readwrite("window_size", &PyClass::window_size) | ||
| 39 | + .def("__str__", &PyClass::ToString) | ||
| 40 | + .def("validate", &PyClass::Validate); | ||
| 41 | +} | ||
| 42 | + | ||
| 43 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/python/csrc/silero-vad-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
// Registers the Python class "SileroVadModelConfig" on module `m`.
// Defined in silero-vad-model-config.cc.
| 12 | +void PybindSileroVadModelConfig(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_ |
sherpa-onnx/python/csrc/vad-model-config.cc
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/vad-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/vad-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/vad-model-config.h" | ||
| 10 | +#include "sherpa-onnx/python/csrc/silero-vad-model-config.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
// Registers the C++ struct VadModelConfig on module `m` as the Python class
// "VadModelConfig".
//
// Python-side surface, as bound below:
//  - VadModelConfig(): default construction.
//  - VadModelConfig(silero_vad, sample_rate=16000, num_threads=1,
//    provider="cpu", debug=False): forwards to the C++ constructor taking
//    (const SileroVadModelConfig &, int32_t, int32_t, const std::string &,
//    bool).
//  - read/write attributes: silero_vad, sample_rate, num_threads, provider,
//    debug.
//  - __str__ forwards to ToString(); validate() forwards to Validate().
| 14 | +void PybindVadModelConfig(py::module *m) { | ||
// Register the nested config type first, so the type of the `silero_vad`
// argument/attribute is exposed on the same module.
| 15 | + PybindSileroVadModelConfig(m); | ||
| 16 | + | ||
| 17 | + using PyClass = VadModelConfig; | ||
| 18 | + py::class_<PyClass>(*m, "VadModelConfig") | ||
| 19 | + .def(py::init<>()) | ||
| 20 | + .def(py::init<const SileroVadModelConfig &, int32_t, int32_t, | ||
| 21 | + const std::string &, bool>(), | ||
| 22 | + py::arg("silero_vad"), py::arg("sample_rate") = 16000, | ||
| 23 | + py::arg("num_threads") = 1, py::arg("provider") = "cpu", | ||
| 24 | + py::arg("debug") = false) | ||
| 25 | + .def_readwrite("silero_vad", &PyClass::silero_vad) | ||
| 26 | + .def_readwrite("sample_rate", &PyClass::sample_rate) | ||
| 27 | + .def_readwrite("num_threads", &PyClass::num_threads) | ||
| 28 | + .def_readwrite("provider", &PyClass::provider) | ||
| 29 | + .def_readwrite("debug", &PyClass::debug) | ||
| 30 | + .def("__str__", &PyClass::ToString) | ||
| 31 | + .def("validate", &PyClass::Validate); | ||
| 32 | +} | ||
| 33 | + | ||
| 34 | +} // namespace sherpa_onnx |
sherpa-onnx/python/csrc/vad-model-config.h
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/vad-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
// Registers the Python classes "SileroVadModelConfig" and "VadModelConfig" on
// module `m`. Defined in vad-model-config.cc.
| 12 | +void PybindVadModelConfig(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_ |
sherpa-onnx/python/csrc/vad-model.cc
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/vad-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/vad-model.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/vad-model.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
// Registers the C++ class VadModel on module `m` as the Python class
// "VadModel".
//
// No constructor is bound: from Python, instances are obtained through the
// static method create(config), which forwards to VadModel::Create.
//
// Bound methods:
//  - reset() forwards to Reset().
//  - is_speech(samples) -> bool forwards to IsSpeech().
//  - window_size(), min_silence_duration_samples() and
//    min_speech_duration_samples() are bound as plain methods, not
//    properties, so Python callers must use call syntax.
| 13 | +void PybindVadModel(py::module *m) { | ||
| 14 | + using PyClass = VadModel; | ||
| 15 | + py::class_<PyClass>(*m, "VadModel") | ||
| 16 | + .def_static("create", &PyClass::Create, py::arg("config")) | ||
| 17 | + .def("reset", &PyClass::Reset) | ||
// Adapter: the C++ API takes (pointer, count), so the Python argument is
// received as a std::vector<float> and unpacked here.
// NOTE(review): converting a Python sequence to std::vector<float>
// presumably relies on pybind11's STL casters being pulled in through
// sherpa-onnx/python/csrc/sherpa-onnx.h -- confirm.
| 18 | + .def( | ||
| 19 | + "is_speech", | ||
| 20 | + [](PyClass &self, const std::vector<float> &samples) -> bool { | ||
| 21 | + return self.IsSpeech(samples.data(), samples.size()); | ||
| 22 | + }, | ||
| 23 | + py::arg("samples")) | ||
| 24 | + .def("window_size", &PyClass::WindowSize) | ||
| 25 | + .def("min_silence_duration_samples", &PyClass::MinSilenceDurationSamples) | ||
| 26 | + .def("min_speech_duration_samples", &PyClass::MinSpeechDurationSamples); | ||
| 27 | +} | ||
| 28 | + | ||
| 29 | +} // namespace sherpa_onnx |
sherpa-onnx/python/csrc/vad-model.h
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/vad-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
// Registers the Python class "VadModel" on module `m`.
// Defined in vad-model.cc.
| 12 | +void PybindVadModel(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_ |
| 1 | +// sherpa-onnx/python/csrc/voice-activity-detector.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/voice-activity-detector.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/voice-activity-detector.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
// Registers the C++ struct SpeechSegment on module `m` as the Python class
// "SpeechSegment".
//
// No constructor is bound, so Python code cannot create a SpeechSegment
// directly. Both attributes are read-only properties. Each getter lambda
// returns its field by value, so every access to `samples` hands Python a
// fresh copy rather than a view into the C++ object.
//
// NOTE(review): `start` is presumably the segment's starting sample index
// within the input stream -- confirm against
// sherpa-onnx/csrc/voice-activity-detector.h.
| 13 | +void PybindSpeechSegment(py::module *m) { | ||
| 14 | + using PyClass = SpeechSegment; | ||
| 15 | + py::class_<PyClass>(*m, "SpeechSegment") | ||
| 16 | + .def_property_readonly("start", | ||
| 17 | + [](const PyClass &self) { return self.start; }) | ||
| 18 | + .def_property_readonly("samples", | ||
| 19 | + [](const PyClass &self) { return self.samples; }); | ||
| 20 | +} | ||
| 21 | + | ||
// Registers the C++ class VoiceActivityDetector on module `m` as the Python
// class "VoiceActivityDetector".
//
// Python-side surface, as bound below:
//  - VoiceActivityDetector(config, buffer_size_in_seconds=60): forwards to
//    the C++ constructor taking (const VadModelConfig &, float).
//  - accept_waveform(samples) forwards to AcceptWaveform().
//  - empty(), pop(), is_speech_detected() and reset() forward to the C++
//    methods Empty(), Pop(), IsSpeechDetected() and Reset().
//  - `front` is a read-only property (no call syntax) backed by Front().
//
// NOTE(review): whether `front` may be read while empty() is true is not
// visible here -- presumably callers must check empty() first; confirm
// against sherpa-onnx/csrc/voice-activity-detector.h.
| 22 | +void PybindVoiceActivityDetector(py::module *m) { | ||
// Register SpeechSegment on the same module as well.
| 23 | + PybindSpeechSegment(m); | ||
| 24 | + using PyClass = VoiceActivityDetector; | ||
| 25 | + py::class_<PyClass>(*m, "VoiceActivityDetector") | ||
| 26 | + .def(py::init<const VadModelConfig &, float>(), py::arg("config"), | ||
| 27 | + py::arg("buffer_size_in_seconds") = 60) | ||
// Adapter: the C++ API takes (pointer, count), so the Python argument is
// received as a std::vector<float> and unpacked here.
| 28 | + .def( | ||
| 29 | + "accept_waveform", | ||
| 30 | + [](PyClass &self, const std::vector<float> &samples) { | ||
| 31 | + self.AcceptWaveform(samples.data(), samples.size()); | ||
| 32 | + }, | ||
| 33 | + py::arg("samples")) | ||
| 34 | + .def("empty", &PyClass::Empty) | ||
| 35 | + .def("pop", &PyClass::Pop) | ||
| 36 | + .def("is_speech_detected", &PyClass::IsSpeechDetected) | ||
| 37 | + .def("reset", &PyClass::Reset) | ||
| 38 | + .def_property_readonly("front", &PyClass::Front); | ||
| 39 | +} | ||
| 40 | + | ||
| 41 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/python/csrc/voice-activity-detector.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
// Registers the Python classes "SpeechSegment" and "VoiceActivityDetector" on
// module `m`. Defined in voice-activity-detector.cc.
| 12 | +void PybindVoiceActivityDetector(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_ |
| 1 | from typing import Dict, List, Optional | 1 | from typing import Dict, List, Optional |
| 2 | 2 | ||
| 3 | -from _sherpa_onnx import Display, OfflineStream, OnlineStream | 3 | +from _sherpa_onnx import ( |
| 4 | + CircularBuffer, | ||
| 5 | + Display, | ||
| 6 | + OfflineStream, | ||
| 7 | + OnlineStream, | ||
| 8 | + SileroVadModelConfig, | ||
| 9 | + SpeechSegment, | ||
| 10 | + VadModel, | ||
| 11 | + VadModelConfig, | ||
| 12 | + VoiceActivityDetector, | ||
| 13 | +) | ||
| 4 | 14 | ||
| 5 | from .offline_recognizer import OfflineRecognizer | 15 | from .offline_recognizer import OfflineRecognizer |
| 6 | from .online_recognizer import OnlineRecognizer | 16 | from .online_recognizer import OnlineRecognizer |
-
请 注册 或 登录 后发表评论