Fangjun Kuang
Committed by GitHub

Add a VAD Python example to remove silences from a file. (#963)

  1 +#!/usr/bin/env python3
  2 +
  3 +"""
This file shows how to remove non-speech segments from a wave file,
merge the remaining speech segments into a single segment,
and save the result to a file.
  7 +
  8 +Usage
  9 +
  10 +python3 ./vad-remove-non-speech-segments-from-file.py \
  11 + --silero-vad-model silero_vad.onnx \
  12 + input.wav \
  13 + output.wav
  14 +
  15 +Please visit
  16 +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  17 +to download silero_vad.onnx
  18 +
  19 +For instance,
  20 +
  21 +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
  22 +"""
  23 +
  24 +import argparse
  25 +from pathlib import Path
  26 +from typing import Tuple
  27 +
  28 +import numpy as np
  29 +import sherpa_onnx
  30 +import soundfile as sf
  31 +
  32 +
def assert_file_exists(filename: str) -> None:
    """Raise ``AssertionError`` unless ``filename`` is an existing regular file.

    The check is an explicit ``raise`` rather than an ``assert`` statement so
    that it still runs when Python is started with ``-O``, which strips
    ``assert`` statements.

    Args:
      filename: Path of the file that must exist.

    Raises:
      AssertionError: If ``filename`` does not exist or is not a regular file.
    """
    if Path(filename).is_file():
        return

    raise AssertionError(
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
  39 +
  40 +
def get_args():
    """Parse the command line of this script.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``, with the
      attributes ``silero_vad_model``, ``input`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # The only option; it is mandatory.
    cli.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    # Positional arguments, registered in the order they appear on the
    # command line.
    positionals = (
        ("input", "Path to input.wav"),
        ("output", "Path to output.wav"),
    )
    for arg_name, description in positionals:
        cli.add_argument(arg_name, type=str, help=description)

    return cli.parse_args()
  66 +
  67 +
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return its first channel.

    Args:
      filename: Path of the audio file, read with ``soundfile``.

    Returns:
      A tuple ``(samples, sample_rate)``. ``samples`` is a contiguous
      float32 array holding the first channel only; ``sample_rate`` is the
      rate reported by ``soundfile``.
    """
    frames, rate = sf.read(filename, always_2d=True, dtype="float32")

    # The data is requested as 2-D, so column 0 is the first channel; any
    # other channels are discarded.
    first_channel = frames[:, 0]

    return np.ascontiguousarray(first_channel), rate
  77 +
  78 +
def _collect_speech(vad, segments: list) -> None:
    """Move every speech segment currently queued in ``vad`` into ``segments``."""
    while not vad.empty():
        segments.append(np.asarray(vad.front.samples, dtype=np.float32))
        vad.pop()


def main():
    """Strip non-speech audio from the input file and save the speech.

    Steps:
      1. Parse the command line and check that the model and input exist.
      2. Load the first channel of the input and resample it to 16 kHz if
         needed.
      3. Feed the audio to the VAD one window at a time and collect the
         detected speech segments.
      4. Concatenate the segments and write them to the output file.

    If no speech is detected, an empty output file is written.
    """
    args = get_args()
    assert_file_exists(args.silero_vad_model)
    assert_file_exists(args.input)

    samples, sample_rate = load_audio(args.input)
    if sample_rate != 16000:
        # Imported lazily so librosa is only required for non-16 kHz input.
        import librosa

        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # Collect whole segments as arrays instead of extending a list with
    # individual Python floats, which is far more memory hungry.
    segments = []

    # Feed every complete window, including the last one when the number
    # of samples is an exact multiple of the window size.
    last_start = len(samples) - window_size
    for start in range(0, last_start + 1, window_size):
        vad.accept_waveform(samples[start : start + window_size])
        # Drain after every window so finished segments do not pile up in
        # the detector's fixed-size buffer.
        _collect_speech(vad, segments)

    # Without a flush, speech still in progress at the end of the input is
    # never emitted. NOTE(review): flush() is guarded because it may be
    # missing from older sherpa-onnx releases -- confirm and drop the guard.
    if hasattr(vad, "flush"):
        vad.flush()
        _collect_speech(vad, segments)

    if segments:
        speech_samples = np.concatenate(segments)
    else:
        # np.concatenate() rejects an empty list; keep writing an empty file.
        speech_samples = np.zeros(0, dtype=np.float32)

    sf.write(args.output, speech_samples, samplerate=sample_rate)

    print(f"Saved to {args.output}")
  113 +
  114 +
# Run the tool only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()