vad-remove-non-speech-segments-alsa.py 3.5 KB
#!/usr/bin/env python3

"""
This file shows how to remove non-speech segments
and merge all speech segments into a large segment
and save it to a file.

Unlike ./vad-remove-non-speech-segments.py, this script reads from the
microphone via ALSA and therefore supports only Linux.

Usage

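python3 ./vad-remove-non-speech-segments-alsa.py \
        --silero-vad-model silero_vad.onnx \
        --device-name plughw:3,0

(--device-name is required; run with --help to see how to find its value.)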

Please visit
https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
to download silero_vad.onnx

For instance,

wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
"""

import argparse
import time
from pathlib import Path

import numpy as np
import sherpa_onnx
import soundfile as sf


def assert_file_exists(filename: str) -> None:
    """Fail with a download hint unless ``filename`` is an existing file.

    Args:
      filename:
        Path to check. It must point to a regular file (a directory does
        not count).

    Raises:
      AssertionError: If ``filename`` is not an existing file. The exception
        is raised explicitly rather than through an ``assert`` statement so
        the check still runs when Python is started with ``-O``, which
        strips ``assert`` statements.
    """
    if Path(filename).is_file():
        return

    raise AssertionError(
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )


def get_args():
    """Parse the command line and return the resulting namespace.

    Two options are defined, both mandatory:

      --silero-vad-model  path to the silero VAD onnx model
      --device-name       ALSA capture device, e.g. plughw:3,0
    """
    # Kept as a separate literal so the long help text does not bury the
    # option definitions below. The text itself is user-facing; do not reflow.
    device_name_help = """
The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and device 0 on that card, please use:

  plughw:3,0

as the device_name.
        """

    # (flag, help text) in the order they should appear in --help output.
    required_options = (
        ("--silero-vad-model", "Path to silero_vad.onnx"),
        ("--device-name", device_name_help),
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    return cli.parse_args()


def main():
    """Record from an ALSA device, run the VAD on it, and save on Ctrl+C.

    Audio is read in 100 ms chunks from the device given by --device-name and
    fed to the voice activity detector one window at a time. When the user
    presses Ctrl+C, two WAV files are written to the current directory:

      <timestamp>-speech.wav  all detected speech segments, concatenated
      <timestamp>-all.wav     everything that was read from the device

    Both files share the same timestamp prefix.
    """
    args = get_args()
    assert_file_exists(args.silero_vad_model)

    device_name = args.device_name
    print(f"device_name: {device_name}")
    alsa = sherpa_onnx.Alsa(device_name)

    sample_rate = 16000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    # Samples read from the device but not yet handed to the VAD; the VAD is
    # fed exactly window_size samples per call.
    buffer = []
    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # Keep every chunk and join them once at exit. Concatenating onto one
    # ever-growing array on each read would copy the whole recording every
    # 100 ms, i.e. quadratic work over the length of the session.
    recorded_chunks = []

    print("Started! Please speak. Press Ctrl C to exit")

    try:
        while True:
            samples = alsa.read(samples_per_read)  # a blocking read
            samples = np.array(samples)

            buffer = np.concatenate([buffer, samples])
            recorded_chunks.append(samples)

            while len(buffer) > window_size:
                vad.accept_waveform(buffer[:window_size])
                buffer = buffer[window_size:]
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Saving & Exiting")

        # Drain every segment the VAD has queued so far.
        # NOTE(review): only segments already in the queue are saved here;
        # whether speech still in progress at Ctrl+C is included depends on
        # the VAD implementation — confirm if trailing speech goes missing.
        speech_samples = []
        while not vad.empty():
            speech_samples.extend(vad.front.samples)
            vad.pop()

        speech_samples = np.array(speech_samples, dtype=np.float32)

        # np.concatenate rejects an empty list, which happens when Ctrl+C
        # arrives before the first read completes.
        if recorded_chunks:
            all_samples = np.concatenate(recorded_chunks)
        else:
            all_samples = np.array([])

        # Take the timestamp once so both files get the same prefix even if
        # the clock ticks over to the next second between the two writes.
        timestamp = time.strftime("%Y%m%d-%H%M%S")

        filename_for_speech = f"{timestamp}-speech.wav"
        sf.write(filename_for_speech, speech_samples, samplerate=sample_rate)

        filename_for_all = f"{timestamp}-all.wav"
        sf.write(filename_for_all, all_samples, samplerate=sample_rate)

        print(f"Saved to {filename_for_speech} and {filename_for_all}")


# Start recording only when this file is executed as a script, so importing
# it (e.g. to reuse get_args) does not open the audio device.
if __name__ == "__main__":
    main()