Committed by
GitHub
Add a VAD Python example to remove silences from a file. (#963)
正在显示
1 个修改的文件
包含
116 行增加
和
0 行删除
| 1 | +#!/usr/bin/env python3 | ||
| 2 | + | ||
| 3 | +""" | ||
| 4 | +This file shows how to remove non-speech segments | ||
| 5 | +and merge all speech segments into a large segment | ||
| 6 | +and save it to a file. | ||
| 7 | + | ||
| 8 | +Usage | ||
| 9 | + | ||
| 10 | +python3 ./vad-remove-non-speech-segments-from-file.py \ | ||
| 11 | + --silero-vad-model silero_vad.onnx \ | ||
| 12 | + input.wav \ | ||
| 13 | + output.wav | ||
| 14 | + | ||
| 15 | +Please visit | ||
| 16 | +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 17 | +to download silero_vad.onnx | ||
| 18 | + | ||
| 19 | +For instance, | ||
| 20 | + | ||
| 21 | +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx | ||
| 22 | +""" | ||
| 23 | + | ||
| 24 | +import argparse | ||
| 25 | +from pathlib import Path | ||
| 26 | +from typing import Tuple | ||
| 27 | + | ||
| 28 | +import numpy as np | ||
| 29 | +import sherpa_onnx | ||
| 30 | +import soundfile as sf | ||
| 31 | + | ||
| 32 | + | ||
def assert_file_exists(filename: str):
    """Raise ``AssertionError`` unless ``filename`` is an existing regular file.

    An explicit ``raise`` is used instead of an ``assert`` statement so that
    the check still runs when Python is started with ``-O``, which strips
    ``assert`` statements.

    Args:
      filename: Path of the file to check.

    Raises:
      AssertionError: If ``filename`` does not refer to a regular file.
    """
    if Path(filename).is_file():
        return

    raise AssertionError(
        f"{filename} does not exist!\n"
        "Please refer to "
        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
    )
| 39 | + | ||
| 40 | + | ||
def get_args():
    """Parse the command line of this script.

    The command line consists of one required option,
    ``--silero-vad-model``, followed by two positional arguments:
    the input wave path and the output wave path.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``, with the
      attributes ``silero_vad_model``, ``input`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    cli.add_argument(
        "--silero-vad-model", type=str, required=True, help="Path to silero_vad.onnx"
    )

    # The two positional paths differ only in their name.
    for positional in ("input", "output"):
        cli.add_argument(positional, type=str, help=f"Path to {positional}.wav")

    return cli.parse_args()
| 66 | + | ||
| 67 | + | ||
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return its first channel as float32 samples.

    Args:
      filename: Path of the audio file to read.

    Returns:
      A tuple ``(samples, sample_rate)``. ``samples`` is a contiguous 1-D
      array holding only the first channel of the file, and ``sample_rate``
      is the rate reported by ``soundfile``.
    """
    # always_2d=True gives a 2-D array even for mono input, so the column
    # indexing below works regardless of the channel count.
    all_channels, sample_rate = sf.read(filename, always_2d=True, dtype="float32")

    first_channel = all_channels[:, 0]  # the remaining channels are ignored
    return np.ascontiguousarray(first_channel), sample_rate
| 77 | + | ||
| 78 | + | ||
def main():
    """Remove non-speech segments from the input file and save the rest.

    Steps:
      1. Parse the command line and check that the VAD model and the input
         file exist.
      2. Load the first channel of the input and resample it to 16 kHz if
         the file uses a different sample rate.
      3. Feed the samples to the voice activity detector one window at a
         time and collect every speech segment it reports.
      4. Concatenate the collected segments and write them to the output
         file. If no speech was detected, an empty file is written.
    """
    args = get_args()
    assert_file_exists(args.silero_vad_model)
    assert_file_exists(args.input)

    samples, sample_rate = load_audio(args.input)
    if sample_rate != 16000:
        # librosa is needed only for resampling, so it is imported lazily.
        import librosa

        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    # One float32 array per detected segment. Concatenating once at the end
    # avoids building a Python list with one float object per sample.
    speech_segments = []

    def collect_finished_segments():
        # Move every segment the detector has completed into speech_segments.
        while not vad.empty():
            speech_segments.append(np.asarray(vad.front.samples, dtype=np.float32))
            vad.pop()

    # Feed every complete window, including the last one when the number of
    # samples is an exact multiple of window_size. A trailing partial window
    # (shorter than window_size) is not fed.
    for start in range(0, len(samples) - window_size + 1, window_size):
        vad.accept_waveform(samples[start : start + window_size])
        # Drain after every window so finished segments do not pile up
        # inside the detector.
        collect_finished_segments()

    # Without a flush, a speech segment that is still open when the input
    # ends is never reported and would be missing from the output.
    # NOTE(review): confirm the installed sherpa_onnx version provides flush().
    vad.flush()
    collect_finished_segments()

    if speech_segments:
        speech_samples = np.concatenate(speech_segments)
    else:
        # No speech detected: write an empty file.
        speech_samples = np.zeros(0, dtype=np.float32)

    sf.write(args.output, speech_samples, samplerate=sample_rate)

    print(f"Saved to {args.output}")
| 113 | + | ||
| 114 | + | ||
| 115 | +if __name__ == "__main__": | ||
| 116 | + main() |
-
请 注册 或 登录 后发表评论