Fangjun Kuang
Committed by GitHub

Add Python API examples for speaker recognition with VAD and ASR. (#532)

  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +This script shows how to use Python APIs for speaker identification with
  5 +a microphone, a VAD model, and a non-streaming ASR model.
  6 +
  7 +Please see also ./generate-subtitles.py
  8 +
  9 +Usage:
  10 +
  11 +(1) Prepare a text file mapping speaker names to wave files.
  12 +
  13 +Each line in the text file contains two columns. The first column is the
  14 +speaker name, while the second column is the path to a wave file of that speaker.
  15 +
  16 +If the text file contains multiple wave files for the same speaker, then the
  17 +embeddings of these files are averaged.
  18 +
  19 +An example text file is given below:
  20 +
  21 + foo /path/to/a.wav
  22 + bar /path/to/b.wav
  23 + foo /path/to/c.wav
  24 + foobar /path/to/d.wav
  25 +
  26 +Each wave file should contain only a single channel; the sample format
  27 +should be int16_t; the sample rate can be arbitrary.
  28 +
  29 +(2) Download a model for computing speaker embeddings
  30 +
  31 +Please visit
  32 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  33 +to download a model. An example is given below:
  34 +
  35 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_zh_cnceleb_resnet34.onnx
  36 +
  37 +Note that `zh` means Chinese, while `en` means English.
  38 +
  39 +(3) Download the VAD model
  40 +
  41 +Please visit https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  42 +to download silero_vad.onnx.
  43 +
  44 +For instance,
  45 +
  46 +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
  47 +
  48 +(4) Please refer to ./generate-subtitles.py
  49 +to download a non-streaming ASR model.
  50 +
  51 +(5) Run this script
  52 +
  53 +Assume the filename of the text file is speaker.txt. Remember to also pass
  54 +the non-streaming ASR model arguments from step (4), e.g., --tokens and --paraformer:
  55 +
  56 +python3 ./python-api-examples/speaker-identification-with-vad-non-streaming-asr.py \
  57 + --silero-vad-model=/path/to/silero_vad.onnx --speaker-file ./speaker.txt \
  58 + --model ./wespeaker_zh_cnceleb_resnet34.onnx
  59 +"""
  60 +import argparse
  61 +import sys
  62 +from collections import defaultdict
  63 +from pathlib import Path
  64 +from typing import Dict, List, Tuple
  65 +
  66 +import numpy as np
  67 +import sherpa_onnx
  68 +import torchaudio
  69 +
  70 +try:
  71 + import sounddevice as sd
  72 +except ImportError:
  73 + print("Please install sounddevice first. You can use")
  74 + print()
  75 + print(" pip install sounddevice")
  76 + print()
  77 + print("to install it")
  78 + sys.exit(-1)
  79 +
  80 +g_sample_rate = 16000
  81 +
  82 +
  83 +def register_non_streaming_asr_model_args(parser):
  84 + parser.add_argument(
  85 + "--tokens",
  86 + type=str,
  87 + help="Path to tokens.txt",
  88 + )
  89 +
  90 + parser.add_argument(
  91 + "--encoder",
  92 + default="",
  93 + type=str,
  94 + help="Path to the transducer encoder model",
  95 + )
  96 +
  97 + parser.add_argument(
  98 + "--decoder",
  99 + default="",
  100 + type=str,
  101 + help="Path to the transducer decoder model",
  102 + )
  103 +
  104 + parser.add_argument(
  105 + "--joiner",
  106 + default="",
  107 + type=str,
  108 + help="Path to the transducer joiner model",
  109 + )
  110 +
  111 + parser.add_argument(
  112 + "--paraformer",
  113 + default="",
  114 + type=str,
  115 + help="Path to the model.onnx from Paraformer",
  116 + )
  117 +
  118 + parser.add_argument(
  119 + "--wenet-ctc",
  120 + default="",
  121 + type=str,
  122 + help="Path to the CTC model.onnx from WeNet",
  123 + )
  124 +
  125 + parser.add_argument(
  126 + "--whisper-encoder",
  127 + default="",
  128 + type=str,
  129 + help="Path to whisper encoder model",
  130 + )
  131 +
  132 + parser.add_argument(
  133 + "--whisper-decoder",
  134 + default="",
  135 + type=str,
  136 + help="Path to whisper decoder model",
  137 + )
  138 +
  139 + parser.add_argument(
  140 + "--whisper-language",
  141 + default="",
  142 + type=str,
  143 + help="""It specifies the spoken language in the input file.
  144 + Example values: en, fr, de, zh, ja.
  145 + Available languages for multilingual models can be found at
  146 + https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
  147 + If not specified, we infer the language from the input audio file.
  148 + """,
  149 + )
  150 +
  151 + parser.add_argument(
  152 + "--whisper-task",
  153 + default="transcribe",
  154 + choices=["transcribe", "translate"],
  155 + type=str,
  156 + help="""For multilingual models, if you specify translate, the output
  157 + will be in English.
  158 + """,
  159 + )
  160 +
  161 + parser.add_argument(
  162 + "--whisper-tail-paddings",
  163 + default=-1,
  164 + type=int,
  165 + help="""Number of tail padding frames.
  166 + We have removed the 30-second constraint from whisper, so you need to
  167 + choose the number of tail padding frames yourself.
  168 + Use -1 to use a default value for tail padding.
  169 + """,
  170 + )
  171 +
  172 + parser.add_argument(
  173 + "--decoding-method",
  174 + type=str,
  175 + default="greedy_search",
  176 + help="""Valid values are greedy_search and modified_beam_search.
  177 + modified_beam_search is valid only for transducer models.
  178 + """,
  179 + )
  180 +
  181 + parser.add_argument(
  182 + "--feature-dim",
  183 + type=int,
  184 + default=80,
  185 + help="Feature dimension. Must match the one expected by the model",
  186 + )
  187 +
  188 +
  189 +def get_args():
  190 + parser = argparse.ArgumentParser(
  191 + formatter_class=argparse.ArgumentDefaultsHelpFormatter
  192 + )
  193 +
  194 + register_non_streaming_asr_model_args(parser)
  195 +
  196 + parser.add_argument(
  197 + "--speaker-file",
  198 + type=str,
  199 + required=True,
  200 + help="""Path to the speaker file. Read the help doc at the beginning of this
  201 + file for the format.""",
  202 + )
  203 +
  204 + parser.add_argument(
  205 + "--model",
  206 + type=str,
  207 + required=True,
  208 + help="Path to the speaker embedding model file.",
  209 + )
  210 +
  211 + parser.add_argument(
  212 + "--silero-vad-model",
  213 + type=str,
  214 + required=True,
  215 + help="Path to silero_vad.onnx",
  216 + )
  217 +
  218 + parser.add_argument("--threshold", type=float, default=0.6)
  219 +
  220 + parser.add_argument(
  221 + "--num-threads",
  222 + type=int,
  223 + default=1,
  224 + help="Number of threads for neural network computation",
  225 + )
  226 +
  227 + parser.add_argument(
  228 + "--debug",
  229 + type=bool,
  230 + default=False,
  231 + help="True to show debug messages",
  232 + )
  233 +
  234 + parser.add_argument(
  235 + "--provider",
  236 + type=str,
  237 + default="cpu",
  238 + help="Valid values: cpu, cuda, coreml",
  239 + )
  240 +
  241 + return parser.parse_args()
  242 +
  243 +
  244 +def assert_file_exists(filename: str):
  245 + assert Path(filename).is_file(), (
  246 + f"{filename} does not exist!\n"
  247 + "Please refer to "
  248 + "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
  249 + )
  250 +
  251 +
  252 +def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
  253 + if args.encoder:
  254 + assert len(args.paraformer) == 0, args.paraformer
  255 + assert len(args.wenet_ctc) == 0, args.wenet_ctc
  256 + assert len(args.whisper_encoder) == 0, args.whisper_encoder
  257 + assert len(args.whisper_decoder) == 0, args.whisper_decoder
  258 +
  259 + assert_file_exists(args.encoder)
  260 + assert_file_exists(args.decoder)
  261 + assert_file_exists(args.joiner)
  262 +
  263 + recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
  264 + encoder=args.encoder,
  265 + decoder=args.decoder,
  266 + joiner=args.joiner,
  267 + tokens=args.tokens,
  268 + num_threads=args.num_threads,
  269 + sample_rate=g_sample_rate,
  270 + feature_dim=args.feature_dim,
  271 + decoding_method=args.decoding_method,
  272 + debug=args.debug,
  273 + )
  274 + elif args.paraformer:
  275 + assert len(args.wenet_ctc) == 0, args.wenet_ctc
  276 + assert len(args.whisper_encoder) == 0, args.whisper_encoder
  277 + assert len(args.whisper_decoder) == 0, args.whisper_decoder
  278 +
  279 + assert_file_exists(args.paraformer)
  280 +
  281 + recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
  282 + paraformer=args.paraformer,
  283 + tokens=args.tokens,
  284 + num_threads=args.num_threads,
  285 + sample_rate=g_sample_rate,
  286 + feature_dim=args.feature_dim,
  287 + decoding_method=args.decoding_method,
  288 + debug=args.debug,
  289 + )
  290 + elif args.wenet_ctc:
  291 + assert len(args.whisper_encoder) == 0, args.whisper_encoder
  292 + assert len(args.whisper_decoder) == 0, args.whisper_decoder
  293 +
  294 + assert_file_exists(args.wenet_ctc)
  295 +
  296 + recognizer = sherpa_onnx.OfflineRecognizer.from_wenet_ctc(
  297 + model=args.wenet_ctc,
  298 + tokens=args.tokens,
  299 + num_threads=args.num_threads,
  300 + sample_rate=g_sample_rate,
  301 + feature_dim=args.feature_dim,
  302 + decoding_method=args.decoding_method,
  303 + debug=args.debug,
  304 + )
  305 + elif args.whisper_encoder:
  306 + assert_file_exists(args.whisper_encoder)
  307 + assert_file_exists(args.whisper_decoder)
  308 +
  309 + recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
  310 + encoder=args.whisper_encoder,
  311 + decoder=args.whisper_decoder,
  312 + tokens=args.tokens,
  313 + num_threads=args.num_threads,
  314 + decoding_method=args.decoding_method,
  315 + debug=args.debug,
  316 + language=args.whisper_language,
  317 + task=args.whisper_task,
  318 + tail_paddings=args.whisper_tail_paddings,
  319 + )
  320 + else:
  321 + raise ValueError("Please specify at least one model")
  322 +
  323 + return recognizer
  324 +
  325 +
  326 +def load_speaker_embedding_model(args):
  327 + config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
  328 + model=args.model,
  329 + num_threads=args.num_threads,
  330 + debug=args.debug,
  331 + provider=args.provider,
  332 + )
  333 + if not config.validate():
  334 + raise ValueError(f"Invalid config. {config}")
  335 + extractor = sherpa_onnx.SpeakerEmbeddingExtractor(config)
  336 + return extractor
  337 +
  338 +
  339 +def load_speaker_file(args) -> Dict[str, List[str]]:
  340 + if not Path(args.speaker_file).is_file():
  341 + raise ValueError(f"--speaker-file {args.speaker_file} does not exist")
  342 +
  343 + ans = defaultdict(list)
  344 + with open(args.speaker_file) as f:
  345 + for line in f:
  346 + line = line.strip()
  347 + if not line:
  348 + continue
  349 +
  350 + fields = line.split()
  351 + if len(fields) != 2:
  352 + raise ValueError(f"Invalid line: {line}. Fields: {fields}")
  353 +
  354 + speaker_name, filename = fields
  355 + ans[speaker_name].append(filename)
  356 + return ans
  357 +
  358 +
  359 +def load_audio(filename: str) -> Tuple[np.ndarray, int]:
  360 + samples, sample_rate = torchaudio.load(filename)
  361 + return samples[0].contiguous().numpy(), sample_rate
  362 +
  363 +
  364 +def compute_speaker_embedding(
  365 + filenames: List[str],
  366 + extractor: sherpa_onnx.SpeakerEmbeddingExtractor,
  367 +) -> np.ndarray:
  368 + assert len(filenames) > 0, "filenames is empty"
  369 +
  370 + ans = None
  371 + for filename in filenames:
  372 + print(f"processing {filename}")
  373 + samples, sample_rate = load_audio(filename)
  374 + stream = extractor.create_stream()
  375 + stream.accept_waveform(sample_rate=sample_rate, waveform=samples)
  376 + stream.input_finished()
  377 +
  378 + assert extractor.is_ready(stream)
  379 + embedding = extractor.compute(stream)
  380 + embedding = np.array(embedding)
  381 + if ans is None:
  382 + ans = embedding
  383 + else:
  384 + ans += embedding
  385 +
  386 + return ans / len(filenames)
  387 +
  388 +
  389 +def main():
  390 + args = get_args()
  391 + print(args)
  392 + recognizer = create_recognizer(args)
  393 + extractor = load_speaker_embedding_model(args)
  394 + speaker_file = load_speaker_file(args)
  395 +
  396 + manager = sherpa_onnx.SpeakerEmbeddingManager(extractor.dim)
  397 + for name, filename_list in speaker_file.items():
  398 + embedding = compute_speaker_embedding(
  399 + filenames=filename_list,
  400 + extractor=extractor,
  401 + )
  402 + status = manager.add(name, embedding)
  403 + if not status:
  404 + raise RuntimeError(f"Failed to register speaker {name}")
  405 +
  406 + vad_config = sherpa_onnx.VadModelConfig()
  407 + vad_config.silero_vad.model = args.silero_vad_model
  408 + vad_config.silero_vad.min_silence_duration = 0.25
  409 + vad_config.silero_vad.min_speech_duration = 0.25
  410 + vad_config.sample_rate = g_sample_rate
  411 +
  412 + window_size = vad_config.silero_vad.window_size
  413 + vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100)
  414 +
  415 + samples_per_read = int(0.1 * g_sample_rate) # 0.1 second = 100 ms
  416 +
  417 + devices = sd.query_devices()
  418 + if len(devices) == 0:
  419 + print("No microphone devices found")
  420 + sys.exit(0)
  421 +
  422 + print(devices)
  423 + default_input_device_idx = sd.default.device[0]
  424 + print(f'Use default device: {devices[default_input_device_idx]["name"]}')
  425 +
  426 + print("Started! Please speak")
  427 +
  428 + idx = 0
  429 + buffer = []
  430 + with sd.InputStream(channels=1, dtype="float32", samplerate=g_sample_rate) as s:
  431 + while True:
  432 + samples, _ = s.read(samples_per_read) # a blocking read
  433 + samples = samples.reshape(-1)
  434 + buffer = np.concatenate([buffer, samples])
  435 + while len(buffer) > window_size:
  436 + vad.accept_waveform(buffer[:window_size])
  437 + buffer = buffer[window_size:]
  438 +
  439 + while not vad.empty():
  440 + if len(vad.front.samples) < 0.5 * g_sample_rate:
  441 + # this segment is too short, skip it
  442 + vad.pop()
  443 + continue
  444 + stream = extractor.create_stream()
  445 + stream.accept_waveform(
  446 + sample_rate=g_sample_rate, waveform=vad.front.samples
  447 + )
  448 + stream.input_finished()
  449 +
  450 + embedding = extractor.compute(stream)
  451 + embedding = np.array(embedding)
  452 + name = manager.search(embedding, threshold=args.threshold)
  453 + if not name:
  454 + name = "unknown"
  455 +
  456 + # Now for non-streaming ASR
  457 + asr_stream = recognizer.create_stream()
  458 + asr_stream.accept_waveform(
  459 + sample_rate=g_sample_rate, waveform=vad.front.samples
  460 + )
  461 + recognizer.decode_stream(asr_stream)
  462 + text = asr_stream.result.text
  463 +
  464 + vad.pop()
  465 +
  466 + print(f"\r{idx}-{name}: {text}")
  467 + idx += 1
  468 +
  469 +
  470 +if __name__ == "__main__":
  471 + try:
  472 + main()
  473 + except KeyboardInterrupt:
  474 + print("\nCaught Ctrl + C. Exiting")
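For readers who want the gist of the script above without the argparse plumbing, here is a minimal sketch of the enroll/search/decode flow it implements. All file paths are placeholders, and a paraformer model is assumed for the ASR step (the script equally supports transducer, WeNet CTC, and whisper models):

#!/usr/bin/env python3
# Minimal sketch of the flow in the script above: enroll a speaker,
# identify a test utterance, then transcribe it. All paths are
# placeholders; a paraformer ASR model is assumed here.
import numpy as np
import sherpa_onnx
import torchaudio


def load_audio(filename: str):
    # Returns a 1-D float32 waveform and its sample rate.
    samples, sample_rate = torchaudio.load(filename)
    return samples[0].contiguous().numpy(), sample_rate


def embed(extractor, filename: str) -> np.ndarray:
    # Compute a speaker embedding for a single wave file.
    samples, sample_rate = load_audio(filename)
    stream = extractor.create_stream()
    stream.accept_waveform(sample_rate=sample_rate, waveform=samples)
    stream.input_finished()
    return np.array(extractor.compute(stream))


config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
    model="./wespeaker_zh_cnceleb_resnet34.onnx",  # from step (2)
    num_threads=1,
    debug=False,
    provider="cpu",
)
assert config.validate(), config
extractor = sherpa_onnx.SpeakerEmbeddingExtractor(config)

# Enroll one utterance for speaker "foo".
manager = sherpa_onnx.SpeakerEmbeddingManager(extractor.dim)
assert manager.add("foo", embed(extractor, "/path/to/a.wav"))

# Identify the speaker of a test utterance; search() returns an empty
# string when no registered speaker is within the threshold.
name = manager.search(embed(extractor, "/path/to/test.wav"), threshold=0.6)
print("speaker:", name or "unknown")

# Transcribe the same utterance with a non-streaming ASR model (step (4)).
recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
    paraformer="/path/to/model.onnx",
    tokens="/path/to/tokens.txt",
    num_threads=1,
    sample_rate=16000,
    feature_dim=80,
    decoding_method="greedy_search",
    debug=False,
)
samples, sample_rate = load_audio("/path/to/test.wav")
asr_stream = recognizer.create_stream()
asr_stream.accept_waveform(sample_rate=sample_rate, waveform=samples)
recognizer.decode_stream(asr_stream)
print("text:", asr_stream.result.text)

The 0.6 threshold matches the script's --threshold default.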
  1 +#!/usr/bin/env python3
  2 +
  3 +"""
  4 +This script shows how to use Python APIs for speaker identification with
  5 +a microphone and a VAD model.
  6 +
  7 +Usage:
  8 +
  9 +(1) Prepare a text file mapping speaker names to wave files.
  10 +
  11 +Each line in the text file contains two columns. The first column is the
  12 +speaker name, while the second column is the path to a wave file of that speaker.
  13 +
  14 +If the text file contains multiple wave files for the same speaker, then the
  15 +embeddings of these files are averaged.
  16 +
  17 +An example text file is given below:
  18 +
  19 + foo /path/to/a.wav
  20 + bar /path/to/b.wav
  21 + foo /path/to/c.wav
  22 + foobar /path/to/d.wav
  23 +
  24 +Each wave file should contain only a single channel; the sample format
  25 +should be int16_t; the sample rate can be arbitrary.
  26 +
  27 +(2) Download a model for computing speaker embeddings
  28 +
  29 +Please visit
  30 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  31 +to download a model. An example is given below:
  32 +
  33 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_zh_cnceleb_resnet34.onnx
  34 +
  35 +Note that `zh` means Chinese, while `en` means English.
  36 +
  37 +(3) Download the VAD model
  38 +
  39 +Please visit https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  40 +to download silero_vad.onnx.
  41 +
  42 +For instance,
  43 +
  44 +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
  45 +
  46 +(4) Run this script
  47 +
  48 +Assume the filename of the text file is speaker.txt.
  49 +
  50 +python3 ./python-api-examples/speaker-identification-with-vad.py \
  51 + --silero-vad-model=/path/to/silero_vad.onnx \
  52 + --speaker-file ./speaker.txt \
  53 + --model ./wespeaker_zh_cnceleb_resnet34.onnx
  54 +"""
  55 +import argparse
  56 +import sys
  57 +from collections import defaultdict
  58 +from pathlib import Path
  59 +from typing import Dict, List, Tuple
  60 +
  61 +import numpy as np
  62 +import sherpa_onnx
  63 +import torchaudio
  64 +
  65 +try:
  66 + import sounddevice as sd
  67 +except ImportError:
  68 + print("Please install sounddevice first. You can use")
  69 + print()
  70 + print(" pip install sounddevice")
  71 + print()
  72 + print("to install it")
  73 + sys.exit(-1)
  74 +
  75 +
  76 +def get_args():
  77 + parser = argparse.ArgumentParser(
  78 + formatter_class=argparse.ArgumentDefaultsHelpFormatter
  79 + )
  80 +
  81 + parser.add_argument(
  82 + "--speaker-file",
  83 + type=str,
  84 + required=True,
  85 + help="""Path to the speaker file. Read the help doc at the beginning of this
  86 + file for the format.""",
  87 + )
  88 +
  89 + parser.add_argument(
  90 + "--model",
  91 + type=str,
  92 + required=True,
  93 + help="Path to the speaker embedding model file.",
  94 + )
  95 +
  96 + parser.add_argument(
  97 + "--silero-vad-model",
  98 + type=str,
  99 + required=True,
  100 + help="Path to silero_vad.onnx",
  101 + )
  102 +
  103 + parser.add_argument("--threshold", type=float, default=0.6)
  104 +
  105 + parser.add_argument(
  106 + "--num-threads",
  107 + type=int,
  108 + default=1,
  109 + help="Number of threads for neural network computation",
  110 + )
  111 +
  112 + parser.add_argument(
  113 + "--debug",
  114 + type=bool,
  115 + default=False,
  116 + help="True to show debug messages",
  117 + )
  118 +
  119 + parser.add_argument(
  120 + "--provider",
  121 + type=str,
  122 + default="cpu",
  123 + help="Valid values: cpu, cuda, coreml",
  124 + )
  125 +
  126 + return parser.parse_args()
  127 +
  128 +
  129 +def load_speaker_embedding_model(args):
  130 + config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
  131 + model=args.model,
  132 + num_threads=args.num_threads,
  133 + debug=args.debug,
  134 + provider=args.provider,
  135 + )
  136 + if not config.validate():
  137 + raise ValueError(f"Invalid config. {config}")
  138 + extractor = sherpa_onnx.SpeakerEmbeddingExtractor(config)
  139 + return extractor
  140 +
  141 +
  142 +def load_speaker_file(args) -> Dict[str, List[str]]:
  143 + if not Path(args.speaker_file).is_file():
  144 + raise ValueError(f"--speaker-file {args.speaker_file} does not exist")
  145 +
  146 + ans = defaultdict(list)
  147 + with open(args.speaker_file) as f:
  148 + for line in f:
  149 + line = line.strip()
  150 + if not line:
  151 + continue
  152 +
  153 + fields = line.split()
  154 + if len(fields) != 2:
  155 + raise ValueError(f"Invalid line: {line}. Fields: {fields}")
  156 +
  157 + speaker_name, filename = fields
  158 + ans[speaker_name].append(filename)
  159 + return ans
  160 +
  161 +
  162 +def load_audio(filename: str) -> Tuple[np.ndarray, int]:
  163 + samples, sample_rate = torchaudio.load(filename)
  164 + return samples[0].contiguous().numpy(), sample_rate
  165 +
  166 +
  167 +def compute_speaker_embedding(
  168 + filenames: List[str],
  169 + extractor: sherpa_onnx.SpeakerEmbeddingExtractor,
  170 +) -> np.ndarray:
  171 + assert len(filenames) > 0, "filenames is empty"
  172 +
  173 + ans = None
  174 + for filename in filenames:
  175 + print(f"processing {filename}")
  176 + samples, sample_rate = load_audio(filename)
  177 + stream = extractor.create_stream()
  178 + stream.accept_waveform(sample_rate=sample_rate, waveform=samples)
  179 + stream.input_finished()
  180 +
  181 + assert extractor.is_ready(stream)
  182 + embedding = extractor.compute(stream)
  183 + embedding = np.array(embedding)
  184 + if ans is None:
  185 + ans = embedding
  186 + else:
  187 + ans += embedding
  188 +
  189 + return ans / len(filenames)
  190 +
  191 +
  192 +g_sample_rate = 16000
  193 +
  194 +
  195 +def main():
  196 + args = get_args()
  197 + print(args)
  198 + extractor = load_speaker_embedding_model(args)
  199 + speaker_file = load_speaker_file(args)
  200 +
  201 + manager = sherpa_onnx.SpeakerEmbeddingManager(extractor.dim)
  202 + for name, filename_list in speaker_file.items():
  203 + embedding = compute_speaker_embedding(
  204 + filenames=filename_list,
  205 + extractor=extractor,
  206 + )
  207 + status = manager.add(name, embedding)
  208 + if not status:
  209 + raise RuntimeError(f"Failed to register speaker {name}")
  210 +
  211 + vad_config = sherpa_onnx.VadModelConfig()
  212 + vad_config.silero_vad.model = args.silero_vad_model
  213 + vad_config.silero_vad.min_silence_duration = 0.25
  214 + vad_config.silero_vad.min_speech_duration = 0.25
  215 + vad_config.sample_rate = g_sample_rate
  216 +
  217 + window_size = vad_config.silero_vad.window_size
  218 + vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100)
  219 +
  220 + samples_per_read = int(0.1 * g_sample_rate) # 0.1 second = 100 ms
  221 +
  222 + devices = sd.query_devices()
  223 + if len(devices) == 0:
  224 + print("No microphone devices found")
  225 + sys.exit(0)
  226 +
  227 + print(devices)
  228 + default_input_device_idx = sd.default.device[0]
  229 + print(f'Use default device: {devices[default_input_device_idx]["name"]}')
  230 +
  231 + print("Started! Please speak")
  232 +
  233 + idx = 0
  234 + buffer = []
  235 + with sd.InputStream(channels=1, dtype="float32", samplerate=g_sample_rate) as s:
  236 + while True:
  237 + samples, _ = s.read(samples_per_read) # a blocking read
  238 + samples = samples.reshape(-1)
  239 + buffer = np.concatenate([buffer, samples])
  240 + while len(buffer) > window_size:
  241 + vad.accept_waveform(buffer[:window_size])
  242 + buffer = buffer[window_size:]
  243 +
  244 + while not vad.empty():
  245 + if len(vad.front.samples) < 0.5 * g_sample_rate:
  246 + # this segment is too short, skip it
  247 + vad.pop()
  248 + continue
  249 + stream = extractor.create_stream()
  250 + stream.accept_waveform(
  251 + sample_rate=g_sample_rate, waveform=vad.front.samples
  252 + )
  253 + vad.pop()
  254 + stream.input_finished()
  255 +
  256 + print("Computing", end="", flush=True)
  257 + embedding = extractor.compute(stream)
  258 + embedding = np.array(embedding)
  259 + name = manager.search(embedding, threshold=args.threshold)
  260 + if not name:
  261 + name = "unknown"
  262 + print(f"\r{idx}: Predicted name: {name}")
  263 + idx += 1
  264 +
  265 +
  266 +if __name__ == "__main__":
  267 + try:
  268 + main()
  269 + except KeyboardInterrupt:
  270 + print("\nCaught Ctrl + C. Exiting")
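The microphone loop in both scripts carries over to offline input unchanged. Below is a minimal sketch that runs the same VAD segmentation over a wave file instead of a live stream; the input path is a placeholder, and the file is assumed to be 16 kHz single channel since the samples are fed to the VAD without resampling:

#!/usr/bin/env python3
# Minimal sketch: the VAD segmentation loop from the scripts above, applied
# to a wave file instead of a microphone. /path/to/test.wav is a placeholder
# and is assumed to be 16 kHz, single channel.
import sherpa_onnx
import torchaudio

g_sample_rate = 16000

vad_config = sherpa_onnx.VadModelConfig()
vad_config.silero_vad.model = "/path/to/silero_vad.onnx"  # from step (3)
vad_config.silero_vad.min_silence_duration = 0.25
vad_config.silero_vad.min_speech_duration = 0.25
vad_config.sample_rate = g_sample_rate

window_size = vad_config.silero_vad.window_size
vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100)

samples, _ = torchaudio.load("/path/to/test.wav")
buffer = samples[0].contiguous().numpy()

idx = 0
# Feed the waveform to the VAD window by window; each completed speech
# segment stays available through vad.front until vad.pop() is called.
while len(buffer) > window_size:
    vad.accept_waveform(buffer[:window_size])
    buffer = buffer[window_size:]
    while not vad.empty():
        duration = len(vad.front.samples) / g_sample_rate
        print(f"segment {idx}: {duration:.2f} seconds of speech")
        vad.pop()
        idx += 1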
  1   1  #!/usr/bin/env python3
  2   2  
  3   3  """
  4      -This script shows how to use Python APIs for speaker identification.
      4  +This script shows how to use Python APIs for speaker identification with
      5  +a microphone.
  5   6  
  6   7  Usage:
  7   8  
@@ -43,6 +44,7 @@ python3 ./python-api-examples/speaker-identification.py \
 43  44  """
 44  45  import argparse
 45  46  import queue
     47  +import sys
 46  48  import threading
 47  49  from collections import defaultdict
 48  50  from pathlib import Path
@@ -151,7 +153,7 @@ def compute_speaker_embedding(
151 153      filenames: List[str],
152 154      extractor: sherpa_onnx.SpeakerEmbeddingExtractor,
153 155  ) -> np.ndarray:
154      -    assert len(filenames) > 0, f"filenames is empty"
    156  +    assert len(filenames) > 0, "filenames is empty"
155 157  
156 158      ans = None
157 159      for filename in filenames:
@@ -215,7 +217,7 @@ def main():
215 217      global g_stop
216 218      global g_read_mic_thread
217 219      while True:
218      -        key = input("Press enter to start recording")
    220  +        key = input("Press Enter to start recording")
219 221          if key.lower() in ("q", "quit"):
220 222              g_stop = True
221 223              break
@@ -224,7 +226,7 @@
224 226          g_buffer.queue.clear()
225 227          g_read_mic_thread = threading.Thread(target=read_mic)
226 228          g_read_mic_thread.start()
227      -        input("Press enter to stop recording")
    229  +        input("Press Enter to stop recording")
228 230          g_stop = True
229 231          g_read_mic_thread.join()
230 232          print("Compute embedding")
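One practical note on step (1) of both docstrings: the enrollment wave files must be single channel with int16 samples. If a recording is in another format, a one-off conversion with torchaudio (already a dependency of these scripts) is one way to meet that requirement; the paths below are placeholders:

#!/usr/bin/env python3
# Minimal sketch: convert an audio file to the single-channel, 16-bit PCM
# wave format expected by the files listed in speaker.txt. Paths are
# placeholders.
import torchaudio

samples, sample_rate = torchaudio.load("/path/to/input.mp3")
mono = samples.mean(dim=0, keepdim=True)  # down-mix to a single channel
torchaudio.save(
    "/path/to/a.wav",
    mono,
    sample_rate,
    encoding="PCM_S",       # signed PCM
    bits_per_sample=16,     # int16_t samples
)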