Add a VAD Python example to remove silences from a file. (#963)

Fangjun Kuang · GitHub
Commit b31b9f3a2d634f7bd43eace26316ed002647ee9e b31b9f3a 1 parent 9edb78e2
python-api-examples/vad-remove-non-speech-segments-from-file.py
--- a/python-api-examples/vad-remove-non-speech-segments-from-file.py 0 → 100755
查看文件 @b31b9f3
+++ b/python-api-examples/vad-remove-non-speech-segments-from-file.py 0 → 100755
查看文件 @b31b9f3
+ #!/usr/bin/env python3
+ 
+ """
+ This file shows how to remove non-speech segments
+ and merge all speech segments into a large segment
+ and save it to a file.
+ 
+ Usage
+ 
+ python3 ./vad-remove-non-speech-segments-from-file.py \
+         --silero-vad-model silero_vad.onnx \
+         input.wav \
+         output.wav
+ 
+ Please visit
+ https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+ to download silero_vad.onnx
+ 
+ For instance,
+ 
+ wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+ """
+ 
+ import argparse
+ from pathlib import Path
+ from typing import Tuple
+ 
+ import numpy as np
+ import sherpa_onnx
+ import soundfile as sf
+ 
+ 
def assert_file_exists(filename: str) -> None:
    """Fail with a download hint unless ``filename`` is an existing file.

    Args:
      filename:
        Path that must refer to an existing regular file.

    Raises:
      AssertionError:
        If ``filename`` does not exist or is not a regular file.
    """
    # Raise explicitly instead of using an ``assert`` statement: assert
    # statements are removed when Python runs with ``-O``, which would
    # silently disable this check.  The exception type is kept as
    # AssertionError so existing callers observe the same behavior.
    if not Path(filename).is_file():
        raise AssertionError(
            f"{filename} does not exist!\n"
            "Please refer to "
            "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
        )
+ 
+ 
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      The namespace produced by ``argparse`` with the string attributes
      ``silero_vad_model``, ``input`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    cli.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )

    # Positional arguments, listed in the order they appear on the
    # command line.
    positionals = (
        ("input", "Path to input.wav"),
        ("output", "Path to output.wav"),
    )
    for dest, description in positionals:
        cli.add_argument(dest, type=str, help=description)

    return cli.parse_args()
+ 
+ 
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return its first channel.

    Args:
      filename:
        Path of the audio file, passed to ``soundfile.read``.

    Returns:
      A tuple ``(samples, sample_rate)``. ``samples`` is a contiguous 1-D
      array holding the first channel, read with dtype ``float32``;
      ``sample_rate`` is the rate reported by ``soundfile.read``.
    """
    # always_2d=True makes the column index below valid for mono files too.
    audio, rate = sf.read(filename, always_2d=True, dtype="float32")

    # Keep only the first channel; slicing a column yields a strided view,
    # so make it contiguous before handing it on.
    first_channel = np.ascontiguousarray(audio[:, 0])
    return first_channel, rate
+ 
+ 
def _collect_detected_segments(vad, segments: list) -> None:
    """Move every speech segment queued in ``vad`` to the end of ``segments``."""
    while not vad.empty():
        # np.array() copies the samples, so they stay valid after pop().
        segments.append(np.array(vad.front.samples, dtype=np.float32))
        vad.pop()


def main():
    """Remove non-speech parts from the input file and save the speech.

    The input is loaded (first channel only), resampled to 16 kHz when
    necessary, and fed to the VAD window by window. All detected speech
    segments are joined in order and written to the output file. If no
    speech is detected, an empty output file is written.
    """
    args = get_args()
    assert_file_exists(args.silero_vad_model)
    assert_file_exists(args.input)

    samples, sample_rate = load_audio(args.input)
    if sample_rate != 16000:
        # librosa is needed only for resampling, so it is imported lazily.
        import librosa

        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # One float32 array per detected segment. Collecting arrays and
    # concatenating once avoids building a huge list of Python floats.
    segments = []

    # Feed complete windows only. The range end is chosen so that a final
    # window of exactly ``window_size`` samples is processed as well; a
    # shorter remainder is not fed to the VAD.
    for start in range(0, len(samples) - window_size + 1, window_size):
        vad.accept_waveform(samples[start : start + window_size])
        _collect_detected_segments(vad, segments)

    # Without a flush, speech that is still in progress when the input ends
    # is never emitted as a segment and would be missing from the output.
    # flush() may be absent in older sherpa_onnx releases, hence the guarded
    # call.
    flush = getattr(vad, "flush", None)
    if callable(flush):
        flush()
        _collect_detected_segments(vad, segments)

    if segments:
        speech_samples = np.concatenate(segments)
    else:
        # np.concatenate() rejects an empty list; keep writing an empty file.
        speech_samples = np.array([], dtype=np.float32)

    sf.write(args.output, speech_samples, samplerate=sample_rate)

    print(f"Saved to {args.output}")
+ 
+ 
# Run the tool only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()