Committed by GitHub
Add VAD examples using ALSA for recording (#739)
Showing 17 changed files with 601 additions and 9 deletions
| @@ -58,7 +58,6 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 | @@ -58,7 +58,6 @@ rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 | ||
| 58 | node ./test-online-zipformer2-ctc.js | 58 | node ./test-online-zipformer2-ctc.js |
| 59 | rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 | 59 | rm -rf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 |
| 60 | 60 | ||
| 61 | - | ||
| 62 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 | 61 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 |
| 63 | tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 | 62 | tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 |
| 64 | rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 | 63 | rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 |
| @@ -70,9 +69,9 @@ rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 | @@ -70,9 +69,9 @@ rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 | ||
| 70 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | 69 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 71 | tar xf vits-piper-en_US-amy-low.tar.bz2 | 70 | tar xf vits-piper-en_US-amy-low.tar.bz2 |
| 72 | node ./test-offline-tts-en.js | 71 | node ./test-offline-tts-en.js |
| 73 | -rm vits-piper-en_US-amy-low* | 72 | +rm -rf vits-piper-en_US-amy-low* |
| 74 | 73 | ||
| 75 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 74 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 76 | tar xvf vits-icefall-zh-aishell3.tar.bz2 | 75 | tar xvf vits-icefall-zh-aishell3.tar.bz2 |
| 77 | node ./test-offline-tts-zh.js | 76 | node ./test-offline-tts-zh.js |
| 78 | -rm vits-icefall-zh-aishell3* | 77 | +rm -rf vits-icefall-zh-aishell3* |
| @@ -59,8 +59,27 @@ jobs: | @@ -59,8 +59,27 @@ jobs: | ||
| 59 | run: | | 59 | run: | |
| 60 | ls -lh ./wheelhouse/ | 60 | ls -lh ./wheelhouse/ |
| 61 | 61 | ||
| 62 | + - name: Install patchelf | ||
| 63 | + if: matrix.os == 'ubuntu-latest' | ||
| 64 | + shell: bash | ||
| 65 | + run: | | ||
| 66 | + sudo apt-get update -q | ||
| 67 | + sudo apt-get install -q -y patchelf | ||
| 68 | + patchelf --help | ||
| 69 | + | ||
| 70 | + - name: Patch wheels | ||
| 71 | + shell: bash | ||
| 72 | + if: matrix.os == 'ubuntu-latest' | ||
| 73 | + run: | | ||
| 74 | + mkdir ./wheels | ||
| 75 | + sudo ./scripts/wheel/patch_wheel.py --in-dir ./wheelhouse --out-dir ./wheels | ||
| 76 | + | ||
| 77 | + ls -lh ./wheels/ | ||
| 78 | + rm -rf ./wheelhouse | ||
| 79 | + mv ./wheels ./wheelhouse | ||
| 80 | + | ||
| 62 | - name: Publish to huggingface | 81 | - name: Publish to huggingface |
| 63 | - if: matrix.python-version == 'cp38' && matrix.manylinux == 'manylinux2014' | 82 | + if: (matrix.python-version == 'cp38' || matrix.python-version == 'cp39' ) && matrix.manylinux == 'manylinux2014' |
| 64 | env: | 83 | env: |
| 65 | HF_TOKEN: ${{ secrets.HF_TOKEN }} | 84 | HF_TOKEN: ${{ secrets.HF_TOKEN }} |
| 66 | uses: nick-fields/retry@v3 | 85 | uses: nick-fields/retry@v3 |
| @@ -186,7 +186,7 @@ class MainActivity : AppCompatActivity() { | @@ -186,7 +186,7 @@ class MainActivity : AppCompatActivity() { | ||
| 186 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 186 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 187 | // modelDir = "vits-icefall-zh-aishell3" | 187 | // modelDir = "vits-icefall-zh-aishell3" |
| 188 | // modelName = "model.onnx" | 188 | // modelName = "model.onnx" |
| 189 | - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst," | 189 | + // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst" |
| 190 | // ruleFars = "vits-icefall-zh-aishell3/rule.far" | 190 | // ruleFars = "vits-icefall-zh-aishell3/rule.far" |
| 191 | // lexicon = "lexicon.txt" | 191 | // lexicon = "lexicon.txt" |
| 192 | 192 |
| @@ -67,6 +67,7 @@ def get_binaries(): | @@ -67,6 +67,7 @@ def get_binaries(): | ||
| 67 | "sherpa-onnx-alsa-offline", | 67 | "sherpa-onnx-alsa-offline", |
| 68 | "sherpa-onnx-alsa-offline-speaker-identification", | 68 | "sherpa-onnx-alsa-offline-speaker-identification", |
| 69 | "sherpa-onnx-offline-tts-play-alsa", | 69 | "sherpa-onnx-offline-tts-play-alsa", |
| 70 | + "sherpa-onnx-vad-alsa", | ||
| 70 | ] | 71 | ] |
| 71 | 72 | ||
| 72 | if is_windows(): | 73 | if is_windows(): |
| @@ -75,6 +75,10 @@ function(download_openfst) | @@ -75,6 +75,10 @@ function(download_openfst) | ||
| 75 | set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst") | 75 | set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst") |
| 76 | set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar") | 76 | set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar") |
| 77 | 77 | ||
| 78 | + if(LINUX) | ||
| 79 | + target_compile_options(fst PUBLIC -Wno-missing-template-keyword) | ||
| 80 | + endif() | ||
| 81 | + | ||
| 78 | target_include_directories(fst | 82 | target_include_directories(fst |
| 79 | PUBLIC | 83 | PUBLIC |
| 80 | ${openfst_SOURCE_DIR}/src/include | 84 | ${openfst_SOURCE_DIR}/src/include |
python-api-examples/vad-alsa.py
0 → 100755
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +This script works only on Linux. It uses ALSA for recording. | ||
| 5 | +""" | ||
| 6 | + | ||
| 7 | +import argparse | ||
| 8 | +from pathlib import Path | ||
| 9 | + | ||
| 10 | +import sherpa_onnx | ||
| 11 | + | ||
| 12 | + | ||
def get_args():
    """Parse command-line arguments for the ALSA VAD example.

    Returns:
      An argparse.Namespace with ``silero_vad_model`` (path to the model
      file) and ``device_name`` (the ALSA capture device to record from).
    """
    device_name_help = """
The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help=device_name_help,
    )

    return parser.parse_args()
| 51 | + | ||
| 52 | + | ||
def main():
    """Record from an ALSA capture device, run the silero VAD on the audio,
    and save every detected speech segment to its own wave file in the
    current directory.

    Runs until interrupted with Ctrl + C.

    Raises:
      RuntimeError: if the given silero VAD model file does not exist.
    """
    args = get_args()
    if not Path(args.silero_vad_model).is_file():
        raise RuntimeError(
            f"{args.silero_vad_model} does not exist. Please download it from "
            "https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx"
        )

    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    # The silero VAD model expects 16 kHz audio.
    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    print("Started! Please speak. Press Ctrl C to exit")

    printed = False  # so "Detected speech" is printed once per utterance
    k = 0  # index used to name the output wave files
    try:
        while True:
            samples = alsa.read(samples_per_read)  # a blocking read

            vad.accept_waveform(samples)

            if vad.is_speech_detected() and not printed:
                print("Detected speech")
                printed = True

            if not vad.is_speech_detected():
                printed = False

            # Drain all completed speech segments and save each one.
            while not vad.empty():
                samples = vad.front.samples
                duration = len(samples) / sample_rate
                filename = f"seg-{k}-{duration:.3f}-seconds.wav"
                k += 1
                sherpa_onnx.write_wave(filename, samples, sample_rate)
                print(f"Duration: {duration:.3f} seconds")
                # Fix: the message previously did not interpolate the
                # just-written file name.
                print(f"Saved to {filename}")
                print("----------")

                vad.pop()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exit")


if __name__ == "__main__":
    main()
python-api-examples/vad-microphone.py
0 → 100755
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +import argparse | ||
| 4 | +import os | ||
| 5 | +import sys | ||
| 6 | +from pathlib import Path | ||
| 7 | + | ||
| 8 | +try: | ||
| 9 | + import sounddevice as sd | ||
| 10 | +except ImportError: | ||
| 11 | + print("Please install sounddevice first. You can use") | ||
| 12 | + print() | ||
| 13 | + print(" pip install sounddevice") | ||
| 14 | + print() | ||
| 15 | + print("to install it") | ||
| 16 | + sys.exit(-1) | ||
| 17 | + | ||
| 18 | +import sherpa_onnx | ||
| 19 | + | ||
| 20 | + | ||
def get_args():
    """Parse command-line arguments: only the path to silero_vad.onnx.

    Returns:
      An argparse.Namespace with attribute ``silero_vad_model``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    model_opts = {
        "type": str,
        "required": True,
        "help": "Path to silero_vad.onnx",
    }
    parser.add_argument("--silero-vad-model", **model_opts)

    return parser.parse_args()
| 35 | + | ||
def main():
    """Record from a microphone via sounddevice, run the silero VAD, and save
    every detected speech segment to its own wave file in the current
    directory. Runs until interrupted with Ctrl + C.

    Environment variables:
      SHERPA_ONNX_MIC_SAMPLE_RATE: sample rate of the microphone
        (default 16000). If it differs from 16000, librosa must be installed
        for resampling.
      SHERPA_ONNX_MIC_DEVICE: index of the input device to use instead of
        the system default.

    Raises:
      RuntimeError: if the given silero VAD model file does not exist.
    """
    args = get_args()
    if not Path(args.silero_vad_model).is_file():
        raise RuntimeError(
            f"{args.silero_vad_model} does not exist. Please download it from "
            "https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx"
        )

    mic_sample_rate = 16000
    if "SHERPA_ONNX_MIC_SAMPLE_RATE" in os.environ:
        mic_sample_rate = int(os.environ.get("SHERPA_ONNX_MIC_SAMPLE_RATE"))
        print(f"Change microphone sample rate to {mic_sample_rate}")

    # The silero VAD model expects 16 kHz audio.
    sample_rate = 16000
    # NOTE(review): the read size is derived from the VAD sample rate, so a
    # block is exactly 100 ms only when the microphone also runs at 16 kHz.
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # python3 -m sounddevice
    # can also be used to list all devices

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        print(
            "If you are using Linux and you are sure there is a microphone "
            "on your system, please use "
            "./vad-alsa.py"
        )
        sys.exit(0)

    print(devices)

    if "SHERPA_ONNX_MIC_DEVICE" in os.environ:
        input_device_idx = int(os.environ.get("SHERPA_ONNX_MIC_DEVICE"))
        sd.default.device[0] = input_device_idx
        print(f'Use selected device: {devices[input_device_idx]["name"]}')
    else:
        input_device_idx = sd.default.device[0]
        print(f'Use default device: {devices[input_device_idx]["name"]}')

    print("Started! Please speak. Press Ctrl C to exit")

    printed = False  # so "Detected speech" is printed once per utterance
    k = 0  # index used to name the output wave files
    try:
        with sd.InputStream(
            channels=1, dtype="float32", samplerate=mic_sample_rate
        ) as s:
            while True:
                samples, _ = s.read(samples_per_read)  # a blocking read
                samples = samples.reshape(-1)

                if mic_sample_rate != sample_rate:
                    # Imported lazily so librosa is only required when
                    # resampling is actually needed.
                    import librosa

                    samples = librosa.resample(
                        samples, orig_sr=mic_sample_rate, target_sr=sample_rate
                    )

                vad.accept_waveform(samples)

                if vad.is_speech_detected() and not printed:
                    print("Detected speech")
                    printed = True

                if not vad.is_speech_detected():
                    printed = False

                # Drain all completed speech segments and save each one.
                while not vad.empty():
                    samples = vad.front.samples
                    duration = len(samples) / sample_rate
                    filename = f"seg-{k}-{duration:.3f}-seconds.wav"
                    k += 1
                    sherpa_onnx.write_wave(filename, samples, sample_rate)
                    print(f"Duration: {duration:.3f} seconds")
                    # Fix: the message previously did not interpolate the
                    # just-written file name.
                    print(f"Saved to {filename}")
                    print("----------")

                    vad.pop()
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exit")


if __name__ == "__main__":
    main()
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +This file shows how to remove non-speech segments | ||
| 5 | +and merge all speech segments into a large segment | ||
| 6 | +and save it to a file. | ||
| 7 | + | ||
| 8 | +Different from ./vad-remove-non-speech-segments.py, this file supports only | ||
| 9 | +Linux. | ||
| 10 | + | ||
| 11 | +Usage | ||
| 12 | + | ||
| 13 | +python3 ./vad-remove-non-speech-segments-alsa.py \ | ||
| 14 | + --silero-vad-model silero_vad.onnx | ||
| 15 | + | ||
| 16 | +Please visit | ||
| 17 | +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 18 | +to download silero_vad.onnx | ||
| 19 | + | ||
| 20 | +For instance, | ||
| 21 | + | ||
| 22 | +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 23 | +""" | ||
| 24 | + | ||
| 25 | +import argparse | ||
| 26 | +import time | ||
| 27 | +from pathlib import Path | ||
| 28 | + | ||
| 29 | +import numpy as np | ||
| 30 | +import sherpa_onnx | ||
| 31 | +import soundfile as sf | ||
| 32 | + | ||
| 33 | + | ||
def assert_file_exists(filename: str) -> None:
    """Assert that ``filename`` is an existing regular file.

    Args:
      filename: Path to check.

    Raises:
      AssertionError: if the file does not exist; the message names the
        missing file and points at the pretrained-model download page.
    """
    # Fix: the message previously did not interpolate the missing file name.
    assert Path(filename).is_file(), (
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
| 41 | + | ||
def get_args():
    """Parse command-line arguments for the ALSA recording example.

    Returns:
      An argparse.Namespace with ``silero_vad_model`` (path to the model
      file) and ``device_name`` (the ALSA capture device to record from).
    """
    device_name_help = """
The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    parser.add_argument(
        "--device-name",
        type=str,
        required=True,
        help=device_name_help,
    )

    return parser.parse_args()
| 80 | + | ||
| 81 | + | ||
def main():
    """Record from an ALSA device until Ctrl + C, then write two wave files:
    one containing everything that was recorded and one containing only the
    speech segments kept by the VAD."""
    args = get_args()
    assert_file_exists(args.silero_vad_model)

    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    buffer = []  # samples recorded but not yet fed to the VAD
    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    all_samples = []  # everything recorded, speech and non-speech alike

    print("Started! Please speak. Press Ctrl C to exit")

    try:
        while True:
            chunk = np.array(alsa.read(samples_per_read))  # a blocking read

            buffer = np.concatenate([buffer, chunk])
            all_samples = np.concatenate([all_samples, chunk])

            # Feed the VAD exactly one model window at a time.
            while len(buffer) > window_size:
                vad.accept_waveform(buffer[:window_size])
                buffer = buffer[window_size:]
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Saving & Exiting")

    # Collect every speech segment the VAD has produced.
    speech_samples = []
    while not vad.empty():
        speech_samples.extend(vad.front.samples)
        vad.pop()
    speech_samples = np.array(speech_samples, dtype=np.float32)

    filename_for_speech = time.strftime("%Y%m%d-%H%M%S-speech.wav")
    sf.write(filename_for_speech, speech_samples, samplerate=sample_rate)

    filename_for_all = time.strftime("%Y%m%d-%H%M%S-all.wav")
    sf.write(filename_for_all, all_samples, samplerate=sample_rate)

    print(f"Saved to {filename_for_speech} and {filename_for_all}")


if __name__ == "__main__":
    main()
| @@ -66,6 +66,11 @@ def main(): | @@ -66,6 +66,11 @@ def main(): | ||
| 66 | devices = sd.query_devices() | 66 | devices = sd.query_devices() |
| 67 | if len(devices) == 0: | 67 | if len(devices) == 0: |
| 68 | print("No microphone devices found") | 68 | print("No microphone devices found") |
| 69 | + print( | ||
| 70 | + "If you are using Linux and you are sure there is a microphone " | ||
| 71 | + "on your system, please use " | ||
| 72 | + "./vad-remove-non-speech-segments-alsa.py" | ||
| 73 | + ) | ||
| 69 | sys.exit(0) | 74 | sys.exit(0) |
| 70 | 75 | ||
| 71 | print(devices) | 76 | print(devices) |
| @@ -89,7 +94,7 @@ def main(): | @@ -89,7 +94,7 @@ def main(): | ||
| 89 | 94 | ||
| 90 | all_samples = [] | 95 | all_samples = [] |
| 91 | 96 | ||
| 92 | - print("Started! Please speak") | 97 | + print("Started! Please speak. Press Ctrl C to exit") |
| 93 | 98 | ||
| 94 | try: | 99 | try: |
| 95 | with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s: | 100 | with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s: |
| @@ -251,6 +251,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | @@ -251,6 +251,7 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | ||
| 251 | add_executable(sherpa-onnx-keyword-spotter-alsa sherpa-onnx-keyword-spotter-alsa.cc alsa.cc) | 251 | add_executable(sherpa-onnx-keyword-spotter-alsa sherpa-onnx-keyword-spotter-alsa.cc alsa.cc) |
| 252 | add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc) | 252 | add_executable(sherpa-onnx-alsa-offline sherpa-onnx-alsa-offline.cc alsa.cc) |
| 253 | add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) | 253 | add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc) |
| 254 | + add_executable(sherpa-onnx-vad-alsa sherpa-onnx-vad-alsa.cc alsa.cc) | ||
| 254 | 255 | ||
| 255 | 256 | ||
| 256 | if(SHERPA_ONNX_ENABLE_TTS) | 257 | if(SHERPA_ONNX_ENABLE_TTS) |
| @@ -259,9 +260,10 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | @@ -259,9 +260,10 @@ if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) | ||
| 259 | 260 | ||
| 260 | set(exes | 261 | set(exes |
| 261 | sherpa-onnx-alsa | 262 | sherpa-onnx-alsa |
| 262 | - sherpa-onnx-keyword-spotter-alsa | ||
| 263 | sherpa-onnx-alsa-offline | 263 | sherpa-onnx-alsa-offline |
| 264 | sherpa-onnx-alsa-offline-speaker-identification | 264 | sherpa-onnx-alsa-offline-speaker-identification |
| 265 | + sherpa-onnx-keyword-spotter-alsa | ||
| 266 | + sherpa-onnx-vad-alsa | ||
| 265 | ) | 267 | ) |
| 266 | 268 | ||
| 267 | if(SHERPA_ONNX_ENABLE_TTS) | 269 | if(SHERPA_ONNX_ENABLE_TTS) |
sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <signal.h> | ||
| 6 | +#include <stdio.h> | ||
| 7 | +#include <stdlib.h> | ||
| 8 | + | ||
| 9 | +#include <algorithm> | ||
| 10 | + | ||
| 11 | +#include "sherpa-onnx/csrc/alsa.h" | ||
| 12 | +#include "sherpa-onnx/csrc/circular-buffer.h" | ||
| 13 | +#include "sherpa-onnx/csrc/voice-activity-detector.h" | ||
| 14 | +#include "sherpa-onnx/csrc/wave-writer.h" | ||
| 15 | + | ||
// Set to 1 by the SIGINT handler to request a graceful exit of the recording
// loop in main(). volatile sig_atomic_t (rather than plain bool) is the type
// the C/C++ standards guarantee can be safely written from a signal handler
// and observed by the interrupted program.
volatile sig_atomic_t stop = 0;

// SIGINT (Ctrl + C) handler: ask the main loop to stop.
// NOTE(review): fprintf is not async-signal-safe; kept here for parity with
// the other sherpa-onnx ALSA examples, which print the same message.
static void Handler(int32_t /*sig*/) {
  stop = 1;
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
| 22 | +int32_t main(int32_t argc, char *argv[]) { | ||
| 23 | + signal(SIGINT, Handler); | ||
| 24 | + | ||
| 25 | + const char *kUsageMessage = R"usage( | ||
| 26 | +This program shows how to use VAD in sherpa-onnx. | ||
| 27 | + | ||
| 28 | + ./bin/sherpa-onnx-vad-alsa \ | ||
| 29 | + --silero-vad-model=/path/to/silero_vad.onnx \ | ||
| 30 | + device_name | ||
| 31 | + | ||
| 32 | +Please download silero_vad.onnx from | ||
| 33 | +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 34 | + | ||
| 35 | +For instance, use | ||
| 36 | +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 37 | + | ||
| 38 | +The device name specifies which microphone to use in case there are several | ||
| 39 | +on your system. You can use | ||
| 40 | + | ||
| 41 | + arecord -l | ||
| 42 | + | ||
| 43 | +to find all available microphones on your computer. For instance, if it outputs | ||
| 44 | + | ||
| 45 | +**** List of CAPTURE Hardware Devices **** | ||
| 46 | +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] | ||
| 47 | + Subdevices: 1/1 | ||
| 48 | + Subdevice #0: subdevice #0 | ||
| 49 | + | ||
| 50 | +and if you want to select card 3 and the device 0 on that card, please use: | ||
| 51 | + | ||
| 52 | + plughw:3,0 | ||
| 53 | + | ||
| 54 | +as the device_name. | ||
| 55 | +)usage"; | ||
| 56 | + | ||
| 57 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 58 | + sherpa_onnx::VadModelConfig config; | ||
| 59 | + | ||
| 60 | + config.Register(&po); | ||
| 61 | + po.Read(argc, argv); | ||
| 62 | + if (po.NumArgs() != 1) { | ||
| 63 | + fprintf(stderr, "Please provide only 1 argument: the device name\n"); | ||
| 64 | + po.PrintUsage(); | ||
| 65 | + exit(EXIT_FAILURE); | ||
| 66 | + } | ||
| 67 | + | ||
| 68 | + fprintf(stderr, "%s\n", config.ToString().c_str()); | ||
| 69 | + | ||
| 70 | + if (!config.Validate()) { | ||
| 71 | + fprintf(stderr, "Errors in config!\n"); | ||
| 72 | + return -1; | ||
| 73 | + } | ||
| 74 | + | ||
| 75 | + std::string device_name = po.GetArg(1); | ||
| 76 | + sherpa_onnx::Alsa alsa(device_name.c_str()); | ||
| 77 | + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); | ||
| 78 | + | ||
| 79 | + int32_t sample_rate = 16000; | ||
| 80 | + | ||
| 81 | + if (alsa.GetExpectedSampleRate() != sample_rate) { | ||
| 82 | + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), | ||
| 83 | + sample_rate); | ||
| 84 | + exit(-1); | ||
| 85 | + } | ||
| 86 | + | ||
| 87 | + int32_t chunk = 0.1 * alsa.GetActualSampleRate(); | ||
| 88 | + | ||
| 89 | + auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config); | ||
| 90 | + | ||
| 91 | + fprintf(stderr, "Started. Please speak\n"); | ||
| 92 | + | ||
| 93 | + int32_t window_size = config.silero_vad.window_size; | ||
| 94 | + bool printed = false; | ||
| 95 | + | ||
| 96 | + int32_t k = 0; | ||
| 97 | + while (!stop) { | ||
| 98 | + { | ||
| 99 | + const std::vector<float> &samples = alsa.Read(chunk); | ||
| 100 | + | ||
| 101 | + vad->AcceptWaveform(samples.data(), samples.size()); | ||
| 102 | + | ||
| 103 | + if (vad->IsSpeechDetected() && !printed) { | ||
| 104 | + printed = true; | ||
| 105 | + fprintf(stderr, "\nDetected speech!\n"); | ||
| 106 | + } | ||
| 107 | + if (!vad->IsSpeechDetected()) { | ||
| 108 | + printed = false; | ||
| 109 | + } | ||
| 110 | + | ||
| 111 | + while (!vad->Empty()) { | ||
| 112 | + const auto &segment = vad->Front(); | ||
| 113 | + float duration = | ||
| 114 | + segment.samples.size() / static_cast<float>(sample_rate); | ||
| 115 | + | ||
| 116 | + fprintf(stderr, "Duration: %.3f seconds\n", duration); | ||
| 117 | + | ||
| 118 | + char filename[128]; | ||
| 119 | + snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration); | ||
| 120 | + k += 1; | ||
| 121 | + sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(), | ||
| 122 | + segment.samples.size()); | ||
| 123 | + fprintf(stderr, "Saved to %s\n", filename); | ||
| 124 | + fprintf(stderr, "----------\n"); | ||
| 125 | + | ||
| 126 | + vad->Pop(); | ||
| 127 | + } | ||
| 128 | + } | ||
| 129 | + } | ||
| 130 | + | ||
| 131 | + return 0; | ||
| 132 | +} |
| @@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
| 13 | #include "sherpa-onnx/csrc/circular-buffer.h" | 13 | #include "sherpa-onnx/csrc/circular-buffer.h" |
| 14 | #include "sherpa-onnx/csrc/microphone.h" | 14 | #include "sherpa-onnx/csrc/microphone.h" |
| 15 | #include "sherpa-onnx/csrc/voice-activity-detector.h" | 15 | #include "sherpa-onnx/csrc/voice-activity-detector.h" |
| 16 | +#include "sherpa-onnx/csrc/wave-writer.h" | ||
| 16 | 17 | ||
| 17 | bool stop = false; | 18 | bool stop = false; |
| 18 | std::mutex mutex; | 19 | std::mutex mutex; |
| @@ -122,6 +123,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | @@ -122,6 +123,7 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 122 | int32_t window_size = config.silero_vad.window_size; | 123 | int32_t window_size = config.silero_vad.window_size; |
| 123 | bool printed = false; | 124 | bool printed = false; |
| 124 | 125 | ||
| 126 | + int32_t k = 0; | ||
| 125 | while (!stop) { | 127 | while (!stop) { |
| 126 | { | 128 | { |
| 127 | std::lock_guard<std::mutex> lock(mutex); | 129 | std::lock_guard<std::mutex> lock(mutex); |
| @@ -140,9 +142,19 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | @@ -140,9 +142,19 @@ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 140 | } | 142 | } |
| 141 | 143 | ||
| 142 | while (!vad->Empty()) { | 144 | while (!vad->Empty()) { |
| 143 | - float duration = vad->Front().samples.size() / sample_rate; | ||
| 144 | - vad->Pop(); | 145 | + const auto &segment = vad->Front(); |
| 146 | + float duration = segment.samples.size() / sample_rate; | ||
| 145 | fprintf(stderr, "Duration: %.3f seconds\n", duration); | 147 | fprintf(stderr, "Duration: %.3f seconds\n", duration); |
| 148 | + | ||
| 149 | + char filename[128]; | ||
| 150 | + snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration); | ||
| 151 | + k += 1; | ||
| 152 | + sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(), | ||
| 153 | + segment.samples.size()); | ||
| 154 | + fprintf(stderr, "Saved to %s\n", filename); | ||
| 155 | + fprintf(stderr, "----------\n"); | ||
| 156 | + | ||
| 157 | + vad->Pop(); | ||
| 146 | } | 158 | } |
| 147 | } | 159 | } |
| 148 | } | 160 | } |
| @@ -35,6 +35,7 @@ set(srcs | @@ -35,6 +35,7 @@ set(srcs | ||
| 35 | vad-model-config.cc | 35 | vad-model-config.cc |
| 36 | vad-model.cc | 36 | vad-model.cc |
| 37 | voice-activity-detector.cc | 37 | voice-activity-detector.cc |
| 38 | + wave-writer.cc | ||
| 38 | ) | 39 | ) |
| 39 | if(SHERPA_ONNX_HAS_ALSA) | 40 | if(SHERPA_ONNX_HAS_ALSA) |
| 40 | list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc) | 41 | list(APPEND srcs ${CMAKE_SOURCE_DIR}/sherpa-onnx/csrc/alsa.cc alsa.cc) |
| @@ -26,6 +26,7 @@ | @@ -26,6 +26,7 @@ | ||
| 26 | #include "sherpa-onnx/python/csrc/vad-model-config.h" | 26 | #include "sherpa-onnx/python/csrc/vad-model-config.h" |
| 27 | #include "sherpa-onnx/python/csrc/vad-model.h" | 27 | #include "sherpa-onnx/python/csrc/vad-model.h" |
| 28 | #include "sherpa-onnx/python/csrc/voice-activity-detector.h" | 28 | #include "sherpa-onnx/python/csrc/voice-activity-detector.h" |
| 29 | +#include "sherpa-onnx/python/csrc/wave-writer.h" | ||
| 29 | 30 | ||
| 30 | #if SHERPA_ONNX_ENABLE_TTS == 1 | 31 | #if SHERPA_ONNX_ENABLE_TTS == 1 |
| 31 | #include "sherpa-onnx/python/csrc/offline-tts.h" | 32 | #include "sherpa-onnx/python/csrc/offline-tts.h" |
| @@ -36,6 +37,8 @@ namespace sherpa_onnx { | @@ -36,6 +37,8 @@ namespace sherpa_onnx { | ||
| 36 | PYBIND11_MODULE(_sherpa_onnx, m) { | 37 | PYBIND11_MODULE(_sherpa_onnx, m) { |
| 37 | m.doc() = "pybind11 binding of sherpa-onnx"; | 38 | m.doc() = "pybind11 binding of sherpa-onnx"; |
| 38 | 39 | ||
| 40 | + PybindWaveWriter(&m); | ||
| 41 | + | ||
| 39 | PybindFeatures(&m); | 42 | PybindFeatures(&m); |
| 40 | PybindOnlineCtcFstDecoderConfig(&m); | 43 | PybindOnlineCtcFstDecoderConfig(&m); |
| 41 | PybindOnlineModelConfig(&m); | 44 | PybindOnlineModelConfig(&m); |
sherpa-onnx/python/csrc/wave-writer.cc
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/wave-writer.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/wave-writer.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | +#include <vector> | ||
| 9 | + | ||
| 10 | +#include "sherpa-onnx/csrc/wave-writer.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +void PybindWaveWriter(py::module *m) { | ||
| 15 | + m->def( | ||
| 16 | + "write_wave", | ||
| 17 | + [](const std::string &filename, const std::vector<float> &samples, | ||
| 18 | + int32_t sample_rate) -> bool { | ||
| 19 | + bool ok = | ||
| 20 | + WriteWave(filename, sample_rate, samples.data(), samples.size()); | ||
| 21 | + | ||
| 22 | + return ok; | ||
| 23 | + }, | ||
| 24 | + py::arg("filename"), py::arg("samples"), py::arg("sample_rate")); | ||
| 25 | +} | ||
| 26 | + | ||
| 27 | +} // namespace sherpa_onnx |
sherpa-onnx/python/csrc/wave-writer.h
0 → 100644
| 1 | +// sherpa-onnx/python/csrc/wave-writer.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_WAVE_WRITER_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_WAVE_WRITER_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindWaveWriter(py::module *m); | ||
| 13 | + | ||
| 14 | +} | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_WAVE_WRITER_H_ |
| @@ -19,6 +19,7 @@ from _sherpa_onnx import ( | @@ -19,6 +19,7 @@ from _sherpa_onnx import ( | ||
| 19 | VadModel, | 19 | VadModel, |
| 20 | VadModelConfig, | 20 | VadModelConfig, |
| 21 | VoiceActivityDetector, | 21 | VoiceActivityDetector, |
| 22 | + write_wave, | ||
| 22 | ) | 23 | ) |
| 23 | 24 | ||
| 24 | from .keyword_spotter import KeywordSpotter | 25 | from .keyword_spotter import KeywordSpotter |
-
Please register or log in to post a comment.