Fangjun Kuang
Committed by GitHub

Add VAD examples using ALSA for recording (#739)

@@ -58,7 +58,6 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 @@ -58,7 +58,6 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
58 node ./test-online-zipformer2-ctc.js 58 node ./test-online-zipformer2-ctc.js
59 rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 59 rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
60 60
61 -  
62 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 61 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
63 tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 62 tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
64 rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 63 rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
@@ -70,9 +69,9 @@ rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 @@ -70,9 +69,9 @@ rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
70 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 69 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
71 tar xf vits-piper-en_US-amy-low.tar.bz2 70 tar xf vits-piper-en_US-amy-low.tar.bz2
72 node ./test-offline-tts-en.js 71 node ./test-offline-tts-en.js
73 -rm vits-piper-en_US-amy-low* 72 +rm -rf vits-piper-en_US-amy-low*
74 73
75 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 74 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
76 tar xvf vits-icefall-zh-aishell3.tar.bz2 75 tar xvf vits-icefall-zh-aishell3.tar.bz2
77 node ./test-offline-tts-zh.js 76 node ./test-offline-tts-zh.js
78 -rm vits-icefall-zh-aishell3* 77 +rm -rf vits-icefall-zh-aishell3*
@@ -59,8 +59,27 @@ jobs: @@ -59,8 +59,27 @@ jobs:
59 run: | 59 run: |
60 ls -lh ./wheelhouse/ 60 ls -lh ./wheelhouse/
61 61
  62 + - name: Install patchelf
  63 + if: matrix.os == 'ubuntu-latest'
  64 + shell: bash
  65 + run: |
  66 + sudo apt-get update -q
  67 + sudo apt-get install -q -y patchelf
  68 + patchelf --help
  69 +
  70 + - name: Patch wheels
  71 + shell: bash
  72 + if: matrix.os == 'ubuntu-latest'
  73 + run: |
  74 + mkdir ./wheels
  75 + sudo ./scripts/wheel/patch_wheel.py --in-dir ./wheelhouse --out-dir ./wheels
  76 +
  77 + ls -lh ./wheels/
  78 + rm -rf ./wheelhouse
  79 + mv ./wheels ./wheelhouse
  80 +
62 - name: Publish to huggingface 81 - name: Publish to huggingface
63 - if: matrix.python-version == 'cp38' && matrix.manylinux == 'manylinux2014' 82 + if: (matrix.python-version == 'cp38' || matrix.python-version == 'cp39' ) && matrix.manylinux == 'manylinux2014'
64 env: 83 env:
65 HF_TOKEN: ${{ secrets.HF_TOKEN }} 84 HF_TOKEN: ${{ secrets.HF_TOKEN }}
66 uses: nick-fields/retry@v3 85 uses: nick-fields/retry@v3
@@ -186,7 +186,7 @@ class MainActivity : AppCompatActivity() { @@ -186,7 +186,7 @@ class MainActivity : AppCompatActivity() {
186 // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 186 // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
187 // modelDir = "vits-icefall-zh-aishell3" 187 // modelDir = "vits-icefall-zh-aishell3"
188 // modelName = "model.onnx" 188 // modelName = "model.onnx"
189 - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst," 189 + // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst"
190 // ruleFars = "vits-icefall-zh-aishell3/rule.far" 190 // ruleFars = "vits-icefall-zh-aishell3/rule.far"
191 // lexicon = "lexicon.txt" 191 // lexicon = "lexicon.txt"
192 192
@@ -67,6 +67,7 @@ def get_binaries(): @@ -67,6 +67,7 @@ def get_binaries():
67 "sherpa-onnx-alsa-offline", 67 "sherpa-onnx-alsa-offline",
68 "sherpa-onnx-alsa-offline-speaker-identification", 68 "sherpa-onnx-alsa-offline-speaker-identification",
69 "sherpa-onnx-offline-tts-play-alsa", 69 "sherpa-onnx-offline-tts-play-alsa",
  70 + "sherpa-onnx-vad-alsa",
70 ] 71 ]
71 72
72 if is_windows(): 73 if is_windows():
@@ -75,6 +75,10 @@ function(download_openfst) @@ -75,6 +75,10 @@ function(download_openfst)
75 set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst") 75 set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst")
76 set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar") 76 set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar")
77 77
  78 + if(LINUX)
  79 + target_compile_options(fst PUBLIC -Wno-missing-template-keyword)
  80 + endif()
  81 +
78 target_include_directories(fst 82 target_include_directories(fst
79 PUBLIC 83 PUBLIC
80 ${openfst_SOURCE_DIR}/src/include 84 ${openfst_SOURCE_DIR}/src/include
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +This script works only on Linux. It uses ALSA for recording.
  5 +"""
  6 +
  7 +import argparse
  8 +from pathlib import Path
  9 +
  10 +import sherpa_onnx
  11 +
  12 +
def get_args():
    """Parse and return the command-line arguments for this example."""
    device_help = """
The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help=device_help,
    )

    return parser.parse_args()
  51 +
  52 +
def main():
    """Record audio via ALSA and save each detected speech segment to a wav file."""
    args = get_args()
    if not Path(args.silero_vad_model).is_file():
        raise RuntimeError(
            f"{args.silero_vad_model} does not exist. Please download it from "
            "https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx"
        )

    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    sample_rate = 16000  # the silero VAD model expects 16 kHz input
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    print("Started! Please speak. Press Ctrl C to exit")

    printed = False  # guards against printing "Detected speech" repeatedly
    k = 0  # index used to build unique output file names
    try:
        while True:
            samples = alsa.read(samples_per_read)  # a blocking read

            vad.accept_waveform(samples)

            if vad.is_speech_detected() and not printed:
                print("Detected speech")
                printed = True

            if not vad.is_speech_detected():
                printed = False

            # Each completed speech segment is written to its own wav file.
            while not vad.empty():
                samples = vad.front.samples
                duration = len(samples) / sample_rate
                filename = f"seg-{k}-{duration:.3f}-seconds.wav"
                k += 1
                sherpa_onnx.write_wave(filename, samples, sample_rate)
                print(f"Duration: {duration:.3f} seconds")
                # Fixed: the message previously did not interpolate the
                # output filename.
                print(f"Saved to {filename}")
                print("----------")

                vad.pop()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exit")


if __name__ == "__main__":
    main()
  1 +#!/usr/bin/env python3
  2 +
  3 +import argparse
  4 +import os
  5 +import sys
  6 +from pathlib import Path
  7 +
  8 +try:
  9 + import sounddevice as sd
  10 +except ImportError:
  11 + print("Please install sounddevice first. You can use")
  12 + print()
  13 + print(" pip install sounddevice")
  14 + print()
  15 + print("to install it")
  16 + sys.exit(-1)
  17 +
  18 +import sherpa_onnx
  19 +
  20 +
def get_args():
    """Parse and return the command-line arguments for this example."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model", type=str, required=True, help="Path to silero_vad.onnx"
    )

    return parser.parse_args()
  34 +
  35 +
def main():
    """Record from the microphone via sounddevice and save detected speech segments."""
    args = get_args()
    if not Path(args.silero_vad_model).is_file():
        raise RuntimeError(
            f"{args.silero_vad_model} does not exist. Please download it from "
            "https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx"
        )

    # The capture rate can be overridden via the environment, e.g. for
    # microphones that cannot record at 16 kHz.
    mic_sample_rate = 16000
    if "SHERPA_ONNX_MIC_SAMPLE_RATE" in os.environ:
        mic_sample_rate = int(os.environ.get("SHERPA_ONNX_MIC_SAMPLE_RATE"))
        print(f"Change microphone sample rate to {mic_sample_rate}")

    sample_rate = 16000  # the silero VAD model expects 16 kHz input
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # python3 -m sounddevice
    # can also be used to list all devices

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        print(
            "If you are using Linux and you are sure there is a microphone "
            "on your system, please use "
            "./vad-alsa.py"
        )
        sys.exit(0)

    print(devices)

    if "SHERPA_ONNX_MIC_DEVICE" in os.environ:
        input_device_idx = int(os.environ.get("SHERPA_ONNX_MIC_DEVICE"))
        sd.default.device[0] = input_device_idx
        print(f'Use selected device: {devices[input_device_idx]["name"]}')
    else:
        input_device_idx = sd.default.device[0]
        print(f'Use default device: {devices[input_device_idx]["name"]}')

    print("Started! Please speak. Press Ctrl C to exit")

    printed = False  # guards against printing "Detected speech" repeatedly
    k = 0  # index used to build unique output file names
    try:
        with sd.InputStream(
            channels=1, dtype="float32", samplerate=mic_sample_rate
        ) as s:
            while True:
                samples, _ = s.read(samples_per_read)  # a blocking read
                samples = samples.reshape(-1)

                if mic_sample_rate != sample_rate:
                    # librosa is imported lazily so it is required only when
                    # resampling is actually needed
                    import librosa

                    samples = librosa.resample(
                        samples, orig_sr=mic_sample_rate, target_sr=sample_rate
                    )

                vad.accept_waveform(samples)

                if vad.is_speech_detected() and not printed:
                    print("Detected speech")
                    printed = True

                if not vad.is_speech_detected():
                    printed = False

                # Each completed speech segment is written to its own wav file.
                while not vad.empty():
                    samples = vad.front.samples
                    duration = len(samples) / sample_rate
                    filename = f"seg-{k}-{duration:.3f}-seconds.wav"
                    k += 1
                    sherpa_onnx.write_wave(filename, samples, sample_rate)
                    print(f"Duration: {duration:.3f} seconds")
                    # Fixed: the message previously did not interpolate the
                    # output filename.
                    print(f"Saved to {filename}")
                    print("----------")

                    vad.pop()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exit")


if __name__ == "__main__":
    main()
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +This file shows how to remove non-speech segments
  5 +and merge all speech segments into a large segment
  6 +and save it to a file.
  7 +
  8 +Different from ./vad-remove-non-speech-segments.py, this file supports only
  9 +Linux.
  10 +
  11 +Usage
  12 +
  13 +python3 ./vad-remove-non-speech-segments-alsa.py \
  14 + --silero-vad-model silero_vad.onnx
  15 +
  16 +Please visit
  17 +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  18 +to download silero_vad.onnx
  19 +
  20 +For instance,
  21 +
  22 +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
  23 +"""
  24 +
  25 +import argparse
  26 +import time
  27 +from pathlib import Path
  28 +
  29 +import numpy as np
  30 +import sherpa_onnx
  31 +import soundfile as sf
  32 +
  33 +
def assert_file_exists(filename: str):
    """Raise an AssertionError with a download hint if `filename` is not an existing file."""
    assert Path(filename).is_file(), (
        # Fixed: the message previously did not interpolate the filename.
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
  40 +
  41 +
def get_args():
    """Parse and return the command-line arguments for this example."""
    device_help = """
The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help=device_help,
    )

    return parser.parse_args()
  80 +
  81 +
def main():
    """Record via ALSA until Ctrl+C, then save the speech-only audio and the
    full recording to two timestamped wav files."""
    args = get_args()
    assert_file_exists(args.silero_vad_model)

    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # The VAD consumes fixed-size windows; anything shorter stays pending.
    window_size = config.silero_vad.window_size
    pending = []

    # Everything ever recorded, speech or not.
    recorded = []

    print("Started! Please speak. Press Ctrl C to exit")

    try:
        while True:
            chunk = np.array(alsa.read(samples_per_read))  # a blocking read

            recorded = np.concatenate([recorded, chunk])
            pending = np.concatenate([pending, chunk])

            # Feed the detector one full window at a time.
            while len(pending) > window_size:
                vad.accept_waveform(pending[:window_size])
                pending = pending[window_size:]
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Saving & Exiting")

    # Drain all detected speech segments into a single flat buffer.
    kept = []
    while not vad.empty():
        kept.extend(vad.front.samples)
        vad.pop()

    kept = np.array(kept, dtype=np.float32)

    filename_for_speech = time.strftime("%Y%m%d-%H%M%S-speech.wav")
    sf.write(filename_for_speech, kept, samplerate=sample_rate)

    filename_for_all = time.strftime("%Y%m%d-%H%M%S-all.wav")
    sf.write(filename_for_all, recorded, samplerate=sample_rate)

    print(f"Saved to {filename_for_speech} and {filename_for_all}")


if __name__ == "__main__":
    main()
@@ -66,6 +66,11 @@ def main(): @@ -66,6 +66,11 @@ def main():
66 devices = sd.query_devices() 66 devices = sd.query_devices()
67 if len(devices) == 0: 67 if len(devices) == 0:
68 print("No microphone devices found") 68 print("No microphone devices found")
  69 + print(
  70 + "If you are using Linux and you are sure there is a microphone "
  71 + "on your system, please use "
  72 + "./vad-remove-non-speech-segments-alsa.py"
  73 + )
69 sys.exit(0) 74 sys.exit(0)
70 75
71 print(devices) 76 print(devices)
@@ -89,7 +94,7 @@ def main(): @@ -89,7 +94,7 @@ def main():
89 94
90 all_samples = [] 95 all_samples = []
91 96
92 - print("Started! Please speak") 97 + print("Started! Please speak. Press Ctrl C to exit")
93 98
94 try: 99 try:
95 with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s: 100 with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
@@ -251,6 +251,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) @@ -251,6 +251,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY)
251 add_executable(sherpa-onnx-keyword-spotter-alsa sherpa-onnx-keyword-spotter-alsa.cc alsa.cc) 251 add_executable(sherpa-onnx-keyword-spotter-alsa sherpa-onnx-keyword-spotter-alsa.cc alsa.cc)
252 add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc) 252 add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc)
253 add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) 253 add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc)
  254 + add_executable(sherpa-onnx-vad-alsa sherpa-onnx-vad-alsa.cc alsa.cc)
254 255
255 256
256 if(SHERPA_ONNX_ENABLE_TTS) 257 if(SHERPA_ONNX_ENABLE_TTS)
@@ -259,9 +260,10 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) @@ -259,9 +260,10 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY)
259 260
260 set(exes 261 set(exes
261 sherpa-onnx-alsa 262 sherpa-onnx-alsa
262 - sherpa-onnx-keyword-spotter-alsa  
263 sherpa-onnx-alsa-offline 263 sherpa-onnx-alsa-offline
264 sherpa-onnx-alsa-offline-speaker-identification 264 sherpa-onnx-alsa-offline-speaker-identification
  265 + sherpa-onnx-keyword-spotter-alsa
  266 + sherpa-onnx-vad-alsa
265 ) 267 )
266 268
267 if(SHERPA_ONNX_ENABLE_TTS) 269 if(SHERPA_ONNX_ENABLE_TTS)
  1 +// sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <memory>
#include <string>
#include <vector>

#include "sherpa-onnx/csrc/alsa.h"
#include "sherpa-onnx/csrc/circular-buffer.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-writer.h"
  15 +
  16 +bool stop = false;
  17 +static void Handler(int32_t sig) {
  18 + stop = true;
  19 + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
  20 +}
  21 +
  22 +int32_t main(int32_t argc, char *argv[]) {
  23 + signal(SIGINT, Handler);
  24 +
  25 + const char *kUsageMessage = R"usage(
  26 +This program shows how to use VAD in sherpa-onnx.
  27 +
  28 + ./bin/sherpa-onnx-vad-alsa \
  29 + --silero-vad-model=/path/to/silero_vad.onnx \
  30 + device_name
  31 +
  32 +Please download silero_vad.onnx from
  33 +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  34 +
  35 +For instance, use
  36 +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
  37 +
  38 +The device name specifies which microphone to use in case there are several
  39 +on your system. You can use
  40 +
  41 + arecord -l
  42 +
  43 +to find all available microphones on your computer. For instance, if it outputs
  44 +
  45 +**** List of CAPTURE Hardware Devices ****
  46 +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  47 + Subdevices: 1/1
  48 + Subdevice #0: subdevice #0
  49 +
  50 +and if you want to select card 3 and the device 0 on that card, please use:
  51 +
  52 + plughw:3,0
  53 +
  54 +as the device_name.
  55 +)usage";
  56 +
  57 + sherpa_onnx::ParseOptions po(kUsageMessage);
  58 + sherpa_onnx::VadModelConfig config;
  59 +
  60 + config.Register(&po);
  61 + po.Read(argc, argv);
  62 + if (po.NumArgs() != 1) {
  63 + fprintf(stderr, "Please provide only 1 argument: the device name\n");
  64 + po.PrintUsage();
  65 + exit(EXIT_FAILURE);
  66 + }
  67 +
  68 + fprintf(stderr, "%s\n", config.ToString().c_str());
  69 +
  70 + if (!config.Validate()) {
  71 + fprintf(stderr, "Errors in config!\n");
  72 + return -1;
  73 + }
  74 +
  75 + std::string device_name = po.GetArg(1);
  76 + sherpa_onnx::Alsa alsa(device_name.c_str());
  77 + fprintf(stderr, "Use recording device: %s\n", device_name.c_str());
  78 +
  79 + int32_t sample_rate = 16000;
  80 +
  81 + if (alsa.GetExpectedSampleRate() != sample_rate) {
  82 + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
  83 + sample_rate);
  84 + exit(-1);
  85 + }
  86 +
  87 + int32_t chunk = 0.1 * alsa.GetActualSampleRate();
  88 +
  89 + auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config);
  90 +
  91 + fprintf(stderr, "Started. Please speak\n");
  92 +
  93 + int32_t window_size = config.silero_vad.window_size;
  94 + bool printed = false;
  95 +
  96 + int32_t k = 0;
  97 + while (!stop) {
  98 + {
  99 + const std::vector<float> &samples = alsa.Read(chunk);
  100 +
  101 + vad->AcceptWaveform(samples.data(), samples.size());
  102 +
  103 + if (vad->IsSpeechDetected() && !printed) {
  104 + printed = true;
  105 + fprintf(stderr, "\nDetected speech!\n");
  106 + }
  107 + if (!vad->IsSpeechDetected()) {
  108 + printed = false;
  109 + }
  110 +
  111 + while (!vad->Empty()) {
  112 + const auto &segment = vad->Front();
  113 + float duration =
  114 + segment.samples.size() / static_cast<float>(sample_rate);
  115 +
  116 + fprintf(stderr, "Duration: %.3f seconds\n", duration);
  117 +
  118 + char filename[128];
  119 + snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration);
  120 + k += 1;
  121 + sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(),
  122 + segment.samples.size());
  123 + fprintf(stderr, "Saved to %s\n", filename);
  124 + fprintf(stderr, "----------\n");
  125 +
  126 + vad->Pop();
  127 + }
  128 + }
  129 + }
  130 +
  131 + return 0;
  132 +}
@@ -13,6 +13,7 @@ @@ -13,6 +13,7 @@
13 #include "sherpa-onnx/csrc/circular-buffer.h" 13 #include "sherpa-onnx/csrc/circular-buffer.h"
14 #include "sherpa-onnx/csrc/microphone.h" 14 #include "sherpa-onnx/csrc/microphone.h"
15 #include "sherpa-onnx/csrc/voice-activity-detector.h" 15 #include "sherpa-onnx/csrc/voice-activity-detector.h"
  16 +#include "sherpa-onnx/csrc/wave-writer.h"
16 17
17 bool stop = false; 18 bool stop = false;
18 std::mutex mutex; 19 std::mutex mutex;
@@ -122,6 +123,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx @@ -122,6 +123,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
122 int32_t window_size = config.silero_vad.window_size; 123 int32_t window_size = config.silero_vad.window_size;
123 bool printed = false; 124 bool printed = false;
124 125
  126 + int32_t k = 0;
125 while (!stop) { 127 while (!stop) {
126 { 128 {
127 std::lock_guard<std::mutex> lock(mutex); 129 std::lock_guard<std::mutex> lock(mutex);
@@ -140,9 +142,19 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx @@ -140,9 +142,19 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
140 } 142 }
141 143
142 while (!vad->Empty()) { 144 while (!vad->Empty()) {
143 - float duration = vad->Front().samples.size() / sample_rate;  
144 - vad->Pop(); 145 + const auto &segment = vad->Front();
  146 + float duration = segment.samples.size() / sample_rate;
145 fprintf(stderr, "Duration: %.3f seconds\n", duration); 147 fprintf(stderr, "Duration: %.3f seconds\n", duration);
  148 +
  149 + char filename[128];
  150 + snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration);
  151 + k += 1;
  152 + sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(),
  153 + segment.samples.size());
  154 + fprintf(stderr, "Saved to %s\n", filename);
  155 + fprintf(stderr, "----------\n");
  156 +
  157 + vad->Pop();
146 } 158 }
147 } 159 }
148 } 160 }
@@ -35,6 +35,7 @@ set(srcs @@ -35,6 +35,7 @@ set(srcs
35 vad-model-config.cc 35 vad-model-config.cc
36 vad-model.cc 36 vad-model.cc
37 voice-activity-detector.cc 37 voice-activity-detector.cc
  38 + wave-writer.cc
38 ) 39 )
39 if(SHERPA_ONNX_HAS_ALSA) 40 if(SHERPA_ONNX_HAS_ALSA)
40 list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc) 41 list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc)
@@ -26,6 +26,7 @@ @@ -26,6 +26,7 @@
26 #include "sherpa-onnx/python/csrc/vad-model-config.h" 26 #include "sherpa-onnx/python/csrc/vad-model-config.h"
27 #include "sherpa-onnx/python/csrc/vad-model.h" 27 #include "sherpa-onnx/python/csrc/vad-model.h"
28 #include "sherpa-onnx/python/csrc/voice-activity-detector.h" 28 #include "sherpa-onnx/python/csrc/voice-activity-detector.h"
  29 +#include "sherpa-onnx/python/csrc/wave-writer.h"
29 30
30 #if SHERPA_ONNX_ENABLE_TTS == 1 31 #if SHERPA_ONNX_ENABLE_TTS == 1
31 #include "sherpa-onnx/python/csrc/offline-tts.h" 32 #include "sherpa-onnx/python/csrc/offline-tts.h"
@@ -36,6 +37,8 @@ namespace sherpa_onnx { @@ -36,6 +37,8 @@ namespace sherpa_onnx {
36 PYBIND11_MODULE(_sherpa_onnx, m) { 37 PYBIND11_MODULE(_sherpa_onnx, m) {
37 m.doc() = "pybind11 binding of sherpa-onnx"; 38 m.doc() = "pybind11 binding of sherpa-onnx";
38 39
  40 + PybindWaveWriter(&m);
  41 +
39 PybindFeatures(&m); 42 PybindFeatures(&m);
40 PybindOnlineCtcFstDecoderConfig(&m); 43 PybindOnlineCtcFstDecoderConfig(&m);
41 PybindOnlineModelConfig(&m); 44 PybindOnlineModelConfig(&m);
  1 +// sherpa-onnx/python/csrc/wave-writer.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/python/csrc/wave-writer.h"
  6 +
  7 +#include <string>
  8 +#include <vector>
  9 +
  10 +#include "sherpa-onnx/csrc/wave-writer.h"
  11 +
  12 +namespace sherpa_onnx {
  13 +
  14 +void PybindWaveWriter(py::module *m) {
  15 + m->def(
  16 + "write_wave",
  17 + [](const std::string &filename, const std::vector<float> &samples,
  18 + int32_t sample_rate) -> bool {
  19 + bool ok =
  20 + WriteWave(filename, sample_rate, samples.data(), samples.size());
  21 +
  22 + return ok;
  23 + },
  24 + py::arg("filename"), py::arg("samples"), py::arg("sample_rate"));
  25 +}
  26 +
  27 +} // namespace sherpa_onnx
// sherpa-onnx/python/csrc/wave-writer.h
//
// Copyright (c) 2024 Xiaomi Corporation

#ifndef SHERPA_ONNX_PYTHON_CSRC_WAVE_WRITER_H_
#define SHERPA_ONNX_PYTHON_CSRC_WAVE_WRITER_H_

#include "sherpa-onnx/python/csrc/sherpa-onnx.h"

namespace sherpa_onnx {

// Registers the `write_wave` binding on the given pybind11 module.
void PybindWaveWriter(py::module *m);

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_PYTHON_CSRC_WAVE_WRITER_H_
@@ -19,6 +19,7 @@ from _sherpa_onnx import ( @@ -19,6 +19,7 @@ from _sherpa_onnx import (
19 VadModel, 19 VadModel,
20 VadModelConfig, 20 VadModelConfig,
21 VoiceActivityDetector, 21 VoiceActivityDetector,
  22 + write_wave,
22 ) 23 )
23 24
24 from .keyword_spotter import KeywordSpotter 25 from .keyword_spotter import KeywordSpotter