vad-remove-non-speech-segments-from-file.py
3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
"""
This file shows how to remove non-speech segments from an audio file,
merge the remaining speech segments into a single segment,
and save the result to a file.

Usage:

  python3 ./vad-remove-non-speech-segments-from-file.py \
    --silero-vad-model silero_vad.onnx \
    input.wav \
    output.wav

Please visit
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
to download silero_vad.onnx
For instance,
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
"""
import argparse
from pathlib import Path
from typing import Tuple
import numpy as np
import sherpa_onnx
import soundfile as sf
def assert_file_exists(filename: str):
    """Raise ``AssertionError`` unless ``filename`` is an existing regular file.

    The check is an explicit ``raise`` instead of an ``assert`` statement so
    that it still runs when Python is started with ``-O``, which strips
    ``assert`` statements. The exception type and message are unchanged.

    Args:
      filename: Path of the file that must exist.

    Raises:
      AssertionError: If ``filename`` does not name an existing file.
    """
    if not Path(filename).is_file():
        raise AssertionError(
            f"{filename} does not exist!\n"
            "Please refer to "
            "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
        )
def get_args():
    """Parse the command line.

    Returns:
      An ``argparse.Namespace`` with ``silero_vad_model`` (required option)
      and the two positional paths ``input`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    cli.add_argument(
        "--silero-vad-model",
        required=True,
        type=str,
        help="Path to silero_vad.onnx",
    )

    # The two positionals share everything except their name and help text;
    # they are registered in the same order as they appear on the command line.
    for positional, description in (
        ("input", "Path to input.wav"),
        ("output", "Path to output.wav"),
    ):
        cli.add_argument(positional, type=str, help=description)

    namespace = cli.parse_args()
    return namespace
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return its first channel with the sample rate.

    Args:
      filename: Path of the audio file, read with ``soundfile``.

    Returns:
      A tuple ``(samples, sample_rate)``. ``samples`` is a contiguous 1-D
      array taken from the first channel only; the data is requested as
      float32.
    """
    # always_2d=True yields a 2-D array even for mono input, so selecting
    # column 0 below works regardless of the channel count.
    frames, sample_rate = sf.read(filename, always_2d=True, dtype="float32")
    first_channel = np.ascontiguousarray(frames[:, 0])
    return first_channel, sample_rate
def _collect_ready_segments(vad, collected: list) -> None:
    """Move every segment currently queued in ``vad`` into ``collected``.

    Each segment's samples are appended as one float32 array, and the
    segment is popped from the detector afterwards.
    """
    while not vad.empty():
        collected.append(np.asarray(vad.front.samples, dtype=np.float32))
        vad.pop()


def main():
    """Strip non-speech audio from the input file and save the speech only.

    Steps:
      1. Parse the command line and check that the model and input exist.
      2. Load the first channel of the input and resample it to 16 kHz
         when its sample rate differs.
      3. Feed the audio to the silero VAD one window at a time and collect
         the detected speech segments.
      4. Concatenate the segments and write them to the output file.
    """
    args = get_args()
    assert_file_exists(args.silero_vad_model)
    assert_file_exists(args.input)

    samples, sample_rate = load_audio(args.input)
    if sample_rate != 16000:
        # Imported lazily so librosa is only needed for non-16 kHz input.
        import librosa

        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    config = sherpa_onnx.VadModelConfig()
    config.silero_vad.model = args.silero_vad_model
    config.silero_vad.threshold = 0.5
    config.silero_vad.min_silence_duration = 0.25  # seconds
    config.silero_vad.min_speech_duration = 0.25  # seconds

    # If the current segment is larger than this value, then it increases
    # the threshold to 0.9 internally. After detecting this segment,
    # it resets the threshold to its original value.
    config.silero_vad.max_speech_duration = 5  # seconds
    config.sample_rate = sample_rate

    window_size = config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

    segments = []

    # Feed every complete window, including the last one when the audio
    # length is an exact multiple of the window size (the previous
    # ``len(samples) > window_size`` test skipped that final window).
    # A trailing remainder shorter than one window is still not fed.
    full_length = (len(samples) // window_size) * window_size
    for start in range(0, full_length, window_size):
        vad.accept_waveform(samples[start : start + window_size])
        _collect_ready_segments(vad, segments)

    # Flush the detector, then collect whatever segments it still holds.
    vad.flush()
    _collect_ready_segments(vad, segments)

    if segments:
        speech_samples = np.concatenate(segments)
    else:
        # No speech detected: write an empty float32 signal, as before.
        speech_samples = np.zeros(0, dtype=np.float32)

    sf.write(args.output, speech_samples, samplerate=sample_rate)

    print(f"Saved to {args.output}")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()