speaker-identification-with-vad-dynamic.py
6.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python3
"""
This script shows how to use Python APIs for speaker identification with
a microphone and a VAD model
Usage:
(1) Download a model for computing speaker embeddings
Please visit
https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
to download a model. An example is given below:
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx
Note that `zh` means Chinese, while `en` means English.
(2) Download the VAD model
Please visit
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
to download silero_vad.onnx
For instance,
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
(3) Run this script
python3 ./python-api-examples/speaker-identification-with-vad-dynamic.py \
--silero-vad-model=/path/to/silero_vad.onnx \
--model ./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx
"""
import argparse
import sys

import numpy as np
import sherpa_onnx

try:
    import sounddevice as sd
except ImportError:
    # sounddevice is an optional third-party dependency; print an
    # actionable install hint instead of a raw ImportError traceback.
    print("Please install sounddevice first. You can use")
    print()
    print(" pip install sounddevice")
    print()
    print("to install it")
    sys.exit(-1)

# Sample rate (Hz) fed to both the VAD and the speaker embedding model;
# the example models are 16 kHz models.
g_sample_rate = 16000
def _str2bool(value):
    """Parse a command-line boolean such as 'true'/'false' (case-insensitive).

    Raises argparse.ArgumentTypeError for unrecognized values.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


def get_args():
    """Parse and return the command-line arguments for this example."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to the speaker embedding model file.",
    )
    parser.add_argument(
        "--silero-vad-model",
        type=str,
        required=True,
        help="Path to silero_vad.onnx",
    )
    # Similarity threshold used when searching the registered speakers.
    parser.add_argument("--threshold", type=float, default=0.4)
    parser.add_argument(
        "--num-threads",
        type=int,
        default=1,
        help="Number of threads for neural network computation",
    )
    parser.add_argument(
        "--debug",
        # Fix: type=bool is an argparse pitfall -- bool("False") is True
        # because any non-empty string is truthy, so "--debug False" used
        # to enable debugging. _str2bool parses the string properly.
        type=_str2bool,
        default=False,
        help="True to show debug messages",
    )
    parser.add_argument(
        "--provider",
        type=str,
        default="cpu",
        help="Valid values: cpu, cuda, coreml",
    )
    return parser.parse_args()
def load_speaker_embedding_model(args):
    """Build a SpeakerEmbeddingExtractor from the parsed CLI arguments.

    Raises:
      ValueError: if the assembled configuration fails validation.
    """
    config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
        model=args.model,
        num_threads=args.num_threads,
        debug=args.debug,
        provider=args.provider,
    )
    if config.validate():
        return sherpa_onnx.SpeakerEmbeddingExtractor(config)
    raise ValueError(f"Invalid config. {config}")
def compute_speaker_embedding(
    samples: np.ndarray,
    extractor: sherpa_onnx.SpeakerEmbeddingExtractor,
) -> np.ndarray:
    """
    Args:
      samples:
        A 1-D float32 array.
      extractor:
        The return value of function load_speaker_embedding_model().
    Returns:
      Return a 1-D float32 array.
    """
    # Warn (but continue) when the clip is shorter than one second of audio.
    if len(samples) < g_sample_rate:
        print(f"Your input contains only {len(samples)} samples!")

    stream = extractor.create_stream()
    stream.accept_waveform(sample_rate=g_sample_rate, waveform=samples)
    stream.input_finished()

    assert extractor.is_ready(stream)
    return np.array(extractor.compute(stream))
def main():
    """Read microphone audio forever, segment it with a VAD, and identify
    or register the speaker of each detected speech segment.
    """
    args = get_args()
    print(args)

    devices = sd.query_devices()
    if len(devices) == 0:
        print("No microphone devices found")
        sys.exit(0)

    print(devices)

    # If you want to select a different device, please change
    # sd.default.device[0]. For instance, if you want to select device 4,
    # please use
    #
    #   sd.default.device[0] = 4
    #   print(devices)
    #
    default_input_device_idx = sd.default.device[0]
    print(f'Use default device: {devices[default_input_device_idx]["name"]}')

    extractor = load_speaker_embedding_model(args)
    # Registry of speaker-name -> embedding; used to match new segments
    # against speakers seen earlier in this session.
    manager = sherpa_onnx.SpeakerEmbeddingManager(extractor.dim)

    vad_config = sherpa_onnx.VadModelConfig()
    vad_config.silero_vad.model = args.silero_vad_model
    vad_config.silero_vad.min_silence_duration = 0.25
    vad_config.silero_vad.min_speech_duration = 1.0
    vad_config.sample_rate = g_sample_rate

    # The silero VAD consumes audio in fixed-size windows of this many samples.
    window_size = vad_config.silero_vad.window_size

    vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100)

    samples_per_read = int(0.1 * g_sample_rate)  # 0.1 second = 100 ms

    print("Started! Please speak")

    line_num = 0
    speaker_id = 0
    # Holds leftover samples smaller than one VAD window between reads.
    buffer = []
    with sd.InputStream(channels=1, dtype="float32", samplerate=g_sample_rate) as s:
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
            samples = samples.reshape(-1)
            buffer = np.concatenate([buffer, samples])
            # Feed the VAD whole windows; keep the remainder for next read.
            while len(buffer) > window_size:
                vad.accept_waveform(buffer[:window_size])
                buffer = buffer[window_size:]
            # Drain every finished speech segment queued inside the VAD.
            while not vad.empty():
                if len(vad.front.samples) < 0.5 * g_sample_rate:
                    # this segment is too short, skip it
                    vad.pop()
                    continue
                stream = extractor.create_stream()
                stream.accept_waveform(
                    sample_rate=g_sample_rate, waveform=vad.front.samples
                )
                vad.pop()
                stream.input_finished()
                embedding = extractor.compute(stream)
                embedding = np.array(embedding)
                # Empty result means no registered speaker matched above
                # the threshold.
                name = manager.search(embedding, threshold=args.threshold)
                if not name:
                    # register it
                    new_name = f"speaker_{speaker_id}"
                    status = manager.add(new_name, embedding)
                    if not status:
                        raise RuntimeError(f"Failed to register speaker {new_name}")
                    print(
                        f"{line_num}: Detected new speaker. Register it as {new_name}"
                    )
                    speaker_id += 1
                else:
                    print(f"{line_num}: Detected existing speaker: {name}")
                line_num += 1
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl + C is the expected way to stop this endless-loop demo.
        print("\nCaught Ctrl + C. Exiting")