Fangjun Kuang
Committed by GitHub

Speaker diarization example with onnxruntime Python API (#1395)

name: speaker-diarization

on:
  push:
    branches:
      - speaker-diarization

  workflow_dispatch:

concurrency:
  group: speaker-diarization-${{ github.ref }}
  cancel-in-progress: true

jobs:
  linux:
    name: speaker diarization
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [macos-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2
        with:
          key: ${{ matrix.os }}-speaker-diarization

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install pyannote
        shell: bash
        run: |
          pip install pyannote.audio onnx onnxruntime

      - name: Install sherpa-onnx from source
        shell: bash
        run: |
          python3 -m pip install --upgrade pip
          python3 -m pip install wheel twine setuptools

          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"

          cat sherpa-onnx/python/sherpa_onnx/__init__.py

          python3 setup.py bdist_wheel
          ls -lh dist
          pip install ./dist/*.whl

      - name: Run tests
        shell: bash
        run: |
          pushd scripts/pyannote/segmentation

          python3 -c "import sherpa_onnx; print(sherpa_onnx.__file__)"
          python3 -c "import sherpa_onnx; print(sherpa_onnx.__version__)"
          python3 -c "import sherpa_onnx; print(dir(sherpa_onnx))"

          curl -SL -O https://huggingface.co/csukuangfj/pyannote-models/resolve/main/segmentation-3.0/pytorch_model.bin

          test_wavs=(
            0-two-speakers-zh.wav
            1-two-speakers-en.wav
            2-two-speakers-en.wav
            3-two-speakers-en.wav
          )

          for w in ${test_wavs[@]}; do
            curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/$w
          done

          soxi *.wav

          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
          tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
          rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
          ls -lh sherpa-onnx-pyannote-segmentation-3-0

          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

          for w in ${test_wavs[@]}; do
            echo "---------test $w (onnx)----------"
            time ./speaker-diarization-onnx.py \
              --seg-model ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx \
              --speaker-embedding-model ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \
              --wav $w

            echo "---------test $w (torch)----------"
            time ./speaker-diarization-torch.py --wav $w
          done
... ...
@@ -118,3 +118,5 @@ vits-melo-tts-zh_en
 *.o
 *.ppu
 sherpa-onnx-online-punct-en-2024-08-06
+*.mp4
+*.mp3
... ...
# File description
Please download test wave files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
## 0-two-speakers-zh.wav
This file is from
https://www.modelscope.cn/models/iic/speech_campplus_speaker-diarization_common/file/view/master?fileName=examples%252F2speakers_example.wav&status=0
Note that we have renamed it from `2speakers_example.wav` to `0-two-speakers-zh.wav`.
## 1-two-speakers-en.wav
This file is from
https://github.com/pengzhendong/pyannote-onnx/blob/master/data/test_16k.wav
and it contains speech from two speakers.
Note that we have renamed it from `test_16k.wav` to `1-two-speakers-en.wav`.
## 2-two-speakers-en.wav
This file is from
https://huggingface.co/spaces/Xenova/whisper-speaker-diarization
Note that the original file is `./fcf059e3-689f-47ec-a000-bdace87f0113.mp4`.
We use the following command to convert it to `2-two-speakers-en.wav`:
```bash
ffmpeg -i ./fcf059e3-689f-47ec-a000-bdace87f0113.mp4 -ac 1 -ar 16000 ./2-two-speakers-en.wav
```
## 3-two-speakers-en.wav
This file is from
https://aws.amazon.com/blogs/machine-learning/deploy-a-hugging-face-pyannote-speaker-diarization-model-on-amazon-sagemaker-as-an-asynchronous-endpoint/
Note that the original file is `ML16091-Audio.mp3`. We use the following
command to convert it to `3-two-speakers-en.wav`:
```bash
sox ML16091-Audio.mp3 3-two-speakers-en.wav
```
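
If the source audio is not already 16 kHz mono (the format used by the other
test files), you can also resample and downmix while converting. The following
is a possible variant of the command above, not necessarily the exact command
used to produce the released file:

```bash
sox ML16091-Audio.mp3 -r 16000 -c 1 3-two-speakers-en.wav
```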
... ...
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
"""
Please refer to
https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/speaker-diarization.yaml
for usages.
"""
import argparse
from datetime import timedelta
from pathlib import Path
from typing import List
import librosa
import numpy as np
import onnxruntime as ort
import sherpa_onnx
import soundfile as sf
from numpy.lib.stride_tricks import as_strided
class Segment:
    def __init__(
        self,
        start,
        end,
        speaker,
    ):
        assert start < end
        self.start = start
        self.end = end
        self.speaker = speaker

    def merge(self, other, gap=0.5):
        assert self.speaker == other.speaker, (self.speaker, other.speaker)
        if self.end < other.start and self.end + gap >= other.start:
            return Segment(start=self.start, end=other.end, speaker=self.speaker)
        elif other.end < self.start and other.end + gap >= self.start:
            return Segment(start=other.start, end=self.end, speaker=self.speaker)
        else:
            return None

    @property
    def duration(self):
        return self.end - self.start

    def __str__(self):
        s = f"{timedelta(seconds=self.start)}"[:-3]
        s += " --> "
        s += f"{timedelta(seconds=self.end)}"[:-3]
        s += f" speaker_{self.speaker:02d}"
        return s
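# For example, str(Segment(start=1.234, end=5.678, speaker=0)) gives
#   "0:00:01.234 --> 0:00:05.678 speaker_00"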
def merge_segment_list(in_out: List[Segment], min_duration_off: float):
    changed = True
    while changed:
        changed = False
        for i in range(len(in_out)):
            if i + 1 >= len(in_out):
                continue

            new_segment = in_out[i].merge(in_out[i + 1], gap=min_duration_off)
            if new_segment is None:
                continue

            del in_out[i + 1]
            in_out[i] = new_segment
            changed = True
            break


def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--seg-model",
        type=str,
        required=True,
        help="Path to model.onnx for segmentation",
    )

    parser.add_argument(
        "--speaker-embedding-model",
        type=str,
        required=True,
        help="Path to model.onnx for speaker embedding extractor",
    )

    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")

    return parser.parse_args()


class OnnxSegmentationModel:
    def __init__(self, filename):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)

        self.window_size = int(meta["window_size"])
        self.sample_rate = int(meta["sample_rate"])
        self.window_shift = int(0.1 * self.window_size)
        self.receptive_field_size = int(meta["receptive_field_size"])
        self.receptive_field_shift = int(meta["receptive_field_shift"])
        self.num_speakers = int(meta["num_speakers"])
        self.powerset_max_classes = int(meta["powerset_max_classes"])
        self.num_classes = int(meta["num_classes"])

    def __call__(self, x):
        """
        Args:
          x: (N, num_samples)
        Returns:
          A tensor of shape (N, num_frames, num_classes)
        """
        x = np.expand_dims(x, axis=1)

        (y,) = self.model.run(
            [self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: x}
        )

        return y


def load_wav(filename, expected_sample_rate) -> np.ndarray:
    audio, sample_rate = sf.read(filename, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != expected_sample_rate:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=expected_sample_rate,
        )
    return audio


def get_powerset_mapping(num_classes, num_speakers, powerset_max_classes):
    mapping = np.zeros((num_classes, num_speakers))

    k = 1
    for i in range(1, powerset_max_classes + 1):
        if i == 1:
            for j in range(0, num_speakers):
                mapping[k, j] = 1
                k += 1
        elif i == 2:
            for j in range(0, num_speakers):
                for m in range(j + 1, num_speakers):
                    mapping[k, j] = 1
                    mapping[k, m] = 1
                    k += 1
        elif i == 3:
            raise RuntimeError("Unsupported")

    return mapping
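# Example: get_powerset_mapping(num_classes=7, num_speakers=3, powerset_max_classes=2),
# which matches the metadata exported for the pyannote segmentation-3.0 model used
# above, produces the following (num_classes, num_speakers) matrix:
#
#   [[0, 0, 0],   # class 0: no speaker is active
#    [1, 0, 0],   # class 1: only speaker 0
#    [0, 1, 0],   # class 2: only speaker 1
#    [0, 0, 1],   # class 3: only speaker 2
#    [1, 1, 0],   # class 4: speakers 0 and 1 overlap
#    [1, 0, 1],   # class 5: speakers 0 and 2 overlap
#    [0, 1, 1]]   # class 6: speakers 1 and 2 overlap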
def to_multi_label(y, mapping):
    """
    Args:
      y: (num_chunks, num_frames, num_classes)
    Returns:
      A tensor of shape (num_chunks, num_frames, num_speakers)
    """
    y = np.argmax(y, axis=-1)
    labels = mapping[y.reshape(-1)].reshape(y.shape[0], y.shape[1], -1)
    return labels


# speaker count per frame
def speaker_count(labels, seg_m):
    """
    Args:
      labels: (num_chunks, num_frames, num_speakers)
      seg_m: Segmentation model
    Returns:
      An integer array of shape (num_total_frames,)
    """
    labels = labels.sum(axis=-1)
    # Now labels: (num_chunks, num_frames)

    num_frames = (
        int(
            (seg_m.window_size + (labels.shape[0] - 1) * seg_m.window_shift)
            / seg_m.receptive_field_shift
        )
        + 1
    )

    ans = np.zeros((num_frames,))
    count = np.zeros((num_frames,))

    for i in range(labels.shape[0]):
        this_chunk = labels[i]
        start = int(i * seg_m.window_shift / seg_m.receptive_field_shift + 0.5)
        end = start + this_chunk.shape[0]
        ans[start:end] += this_chunk
        count[start:end] += 1

    ans /= np.maximum(count, 1e-12)

    return (ans + 0.5).astype(np.int8)
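# Note on speaker_count(): an output frame covered by several overlapping chunks
# receives the average of the per-chunk counts, rounded to the nearest integer.
# For instance, if two chunks predict 1 and 2 active speakers for the same frame,
# the averaged value 1.5 is rounded to 2.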
def load_speaker_embedding_model(filename):
    config = sherpa_onnx.SpeakerEmbeddingExtractorConfig(
        model=filename,
        num_threads=1,
        debug=0,
    )
    if not config.validate():
        raise ValueError(f"Invalid config. {config}")
    extractor = sherpa_onnx.SpeakerEmbeddingExtractor(config)
    return extractor


def get_embeddings(embedding_filename, audio, labels, seg_m, exclude_overlap):
    """
    Args:
      embedding_filename: Path to the speaker embedding extractor model
      audio: (num_samples,)
      labels: (num_chunks, num_frames, num_speakers)
      seg_m: segmentation model
      exclude_overlap: If True, drop frames where more than one speaker is active
    Returns:
      Return a list of (chunk_idx, speaker_idx) pairs and a 2-D array of
      speaker embeddings of shape (num_pairs, embedding_dim)
    """
    if exclude_overlap:
        labels = labels * (labels.sum(axis=-1, keepdims=True) < 2)

    extractor = load_speaker_embedding_model(embedding_filename)
    buffer = np.empty(seg_m.window_size)

    num_chunks, num_frames, num_speakers = labels.shape

    ans_chunk_speaker_pair = []
    ans_embeddings = []

    for i in range(num_chunks):
        labels_T = labels[i].T
        # labels_T: (num_speakers, num_frames)
        sample_offset = i * seg_m.window_shift
        for j in range(num_speakers):
            frames = labels_T[j]
            if frames.sum() < 10:
                # skip segments with fewer than 10 active frames,
                # i.e., about 0.2 seconds
                continue

            start = None
            start_samples = 0
            idx = 0
            for k in range(num_frames):
                if frames[k] != 0:
                    if start is None:
                        start = k
                elif start is not None:
                    start_samples = (
                        int(start / num_frames * seg_m.window_size) + sample_offset
                    )
                    end_samples = (
                        int(k / num_frames * seg_m.window_size) + sample_offset
                    )
                    num_samples = end_samples - start_samples
                    buffer[idx : idx + num_samples] = audio[start_samples:end_samples]
                    idx += num_samples

                    start = None

            if start is not None:
                start_samples = (
                    int(start / num_frames * seg_m.window_size) + sample_offset
                )
                end_samples = int(k / num_frames * seg_m.window_size) + sample_offset
                num_samples = end_samples - start_samples
                buffer[idx : idx + num_samples] = audio[start_samples:end_samples]
                idx += num_samples

            stream = extractor.create_stream()
            stream.accept_waveform(sample_rate=seg_m.sample_rate, waveform=buffer[:idx])
            stream.input_finished()

            assert extractor.is_ready(stream)
            embedding = extractor.compute(stream)
            embedding = np.array(embedding)

            ans_chunk_speaker_pair.append([i, j])
            ans_embeddings.append(embedding)

    assert len(ans_chunk_speaker_pair) == len(ans_embeddings), (
        len(ans_chunk_speaker_pair),
        len(ans_embeddings),
    )

    return ans_chunk_speaker_pair, np.array(ans_embeddings)
def main():
    args = get_args()
    assert Path(args.seg_model).is_file(), args.seg_model
    assert Path(args.wav).is_file(), args.wav

    seg_m = OnnxSegmentationModel(args.seg_model)
    audio = load_wav(args.wav, seg_m.sample_rate)
    # audio: (num_samples,)

    num = (audio.shape[0] - seg_m.window_size) // seg_m.window_shift + 1
    samples = as_strided(
        audio,
        shape=(num, seg_m.window_size),
        strides=(seg_m.window_shift * audio.strides[0], audio.strides[0]),
    )

    # or use torch.Tensor.unfold
    # samples = torch.from_numpy(audio).unfold(0, seg_m.window_size, seg_m.window_shift).numpy()

    if (
        audio.shape[0] < seg_m.window_size
        or (audio.shape[0] - seg_m.window_size) % seg_m.window_shift > 0
    ):
        has_last_chunk = True
    else:
        has_last_chunk = False

    num_chunks = samples.shape[0]
    batch_size = 32
    output = []
    for i in range(0, num_chunks, batch_size):
        start = i
        end = i + batch_size
        # it's perfectly ok to use end > num_chunks
        y = seg_m(samples[start:end])
        output.append(y)

    if has_last_chunk:
        last_chunk = audio[num_chunks * seg_m.window_shift :]  # noqa
        pad_size = seg_m.window_size - last_chunk.shape[0]
        last_chunk = np.pad(last_chunk, (0, pad_size))
        last_chunk = np.expand_dims(last_chunk, axis=0)
        y = seg_m(last_chunk)
        output.append(y)

    y = np.vstack(output)
    # y: (num_chunks, num_frames, num_classes)

    mapping = get_powerset_mapping(
        num_classes=seg_m.num_classes,
        num_speakers=seg_m.num_speakers,
        powerset_max_classes=seg_m.powerset_max_classes,
    )
    labels = to_multi_label(y, mapping=mapping)
    # labels: (num_chunks, num_frames, num_speakers)

    inactive = (labels.sum(axis=1) == 0).astype(np.int8)
    # inactive: (num_chunks, num_speakers)

    speakers_per_frame = speaker_count(labels=labels, seg_m=seg_m)
    # speakers_per_frame: (num_total_frames,)

    if speakers_per_frame.max() == 0:
        print("No speakers found in the audio file!")
        return

    # if users specify only 1 speaker for clustering, then return the
    # result directly

    # Now, get embeddings
    chunk_speaker_pair, embeddings = get_embeddings(
        args.speaker_embedding_model,
        audio=audio,
        labels=labels,
        seg_m=seg_m,
        # exclude_overlap=True,
        exclude_overlap=False,
    )
    # chunk_speaker_pair: a list of (chunk_idx, speaker_idx)
    # embeddings: (num_chunk_speaker_pairs, embedding_dim)

    # Please change num_clusters or threshold by yourself.
    clustering_config = sherpa_onnx.FastClusteringConfig(num_clusters=2)
    # clustering_config = sherpa_onnx.FastClusteringConfig(threshold=0.8)
    clustering = sherpa_onnx.FastClustering(clustering_config)
    cluster_labels = clustering(embeddings)

    chunk_speaker_to_cluster = dict()
    for (chunk_idx, speaker_idx), cluster_idx in zip(
        chunk_speaker_pair, cluster_labels
    ):
        if inactive[chunk_idx, speaker_idx] == 1:
            print("skip ", chunk_idx, speaker_idx)
            continue
        chunk_speaker_to_cluster[(chunk_idx, speaker_idx)] = cluster_idx

    num_speakers = max(cluster_labels) + 1
    relabels = np.zeros((labels.shape[0], labels.shape[1], num_speakers))
    for i in range(labels.shape[0]):
        for j in range(labels.shape[1]):
            for k in range(labels.shape[2]):
                if (i, k) not in chunk_speaker_to_cluster:
                    continue
                t = chunk_speaker_to_cluster[(i, k)]
                if labels[i, j, k] == 1:
                    relabels[i, j, t] = 1

    num_frames = (
        int(
            (seg_m.window_size + (relabels.shape[0] - 1) * seg_m.window_shift)
            / seg_m.receptive_field_shift
        )
        + 1
    )

    count = np.zeros((num_frames, relabels.shape[-1]))
    for i in range(relabels.shape[0]):
        this_chunk = relabels[i]
        start = int(i * seg_m.window_shift / seg_m.receptive_field_shift + 0.5)
        end = start + this_chunk.shape[0]
        count[start:end] += this_chunk

    if has_last_chunk:
        stop_frame = int(audio.shape[0] / seg_m.receptive_field_shift)
        count = count[:stop_frame]

    sorted_count = np.argsort(-count, axis=-1)
    final = np.zeros((count.shape[0], count.shape[1]))

    for i, (c, sc) in enumerate(zip(speakers_per_frame, sorted_count)):
        for k in range(c):
            final[i, sc[k]] = 1

    min_duration_off = 0.5
    min_duration_on = 0.3
    onset = 0.5
    offset = 0.5

    # final: (num_frames, num_speakers)
    final = final.T

    for kk in range(final.shape[0]):
        segment_list = []
        frames = final[kk]
        is_active = frames[0] > onset
        start = None
        if is_active:
            start = 0

        scale = seg_m.receptive_field_shift / seg_m.sample_rate
        scale_offset = seg_m.receptive_field_size / seg_m.sample_rate * 0.5

        for i in range(1, len(frames)):
            if is_active:
                if frames[i] < offset:
                    segment = Segment(
                        start=start * scale + scale_offset,
                        end=i * scale + scale_offset,
                        speaker=kk,
                    )
                    segment_list.append(segment)
                    is_active = False
            else:
                if frames[i] > onset:
                    start = i
                    is_active = True

        if is_active:
            segment = Segment(
                start=start * scale + scale_offset,
                end=(len(frames) - 1) * scale + scale_offset,
                speaker=kk,
            )
            segment_list.append(segment)

        if len(segment_list) > 1:
            merge_segment_list(segment_list, min_duration_off=min_duration_off)

        for s in segment_list:
            if s.duration < min_duration_on:
                continue
            print(s)


if __name__ == "__main__":
    main()
... ...
#!/usr/bin/env python3
"""
Please refer to
https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/speaker-diarization.yaml
for usages.
"""
"""
1. Go to https://huggingface.co/hbredin/wespeaker-voxceleb-resnet34-LM/tree/main
wget https://huggingface.co/hbredin/wespeaker-voxceleb-resnet34-LM/resolve/main/speaker-embedding.onnx
2. Change line 166 of pyannote/audio/pipelines/speaker_diarization.py
```
# self._embedding = PretrainedSpeakerEmbedding(
# self.embedding, use_auth_token=use_auth_token
# )
self._embedding = embedding
```
"""
import argparse
from pathlib import Path
import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import SpeakerDiarization as SpeakerDiarizationPipeline
from pyannote.audio.pipelines.speaker_verification import (
    ONNXWeSpeakerPretrainedSpeakerEmbedding,
)


def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")

    return parser.parse_args()


def build_pipeline():
    embedding_filename = "./speaker-embedding.onnx"
    if Path(embedding_filename).is_file():
        # You need to modify line 166
        # of pyannote/audio/pipelines/speaker_diarization.py
        # Please see the comments at the start of this script for details
        embedding = ONNXWeSpeakerPretrainedSpeakerEmbedding(embedding_filename)
    else:
        embedding = "hbredin/wespeaker-voxceleb-resnet34-LM"

    pt_filename = "./pytorch_model.bin"
    segmentation = Model.from_pretrained(pt_filename)
    segmentation.eval()

    pipeline = SpeakerDiarizationPipeline(
        segmentation=segmentation,
        embedding=embedding,
        embedding_exclude_overlap=True,
    )

    params = {
        "clustering": {
            "method": "centroid",
            "min_cluster_size": 12,
            "threshold": 0.7045654963945799,
        },
        "segmentation": {"min_duration_off": 0.5},
    }
    pipeline.instantiate(params)

    return pipeline


@torch.no_grad()
def main():
    args = get_args()
    assert Path(args.wav).is_file(), args.wav

    pipeline = build_pipeline()
    print(pipeline)

    t = pipeline(args.wav)
    print(type(t))
    print(t)


if __name__ == "__main__":
    main()
... ...
@@ -52,7 +52,7 @@ class FastClustering::Impl {
     std::vector<double> height(num_rows - 1);
     fastclustercpp::hclust_fast(num_rows, distance.data(),
-                                fastclustercpp::HCLUST_METHOD_SINGLE,
+                                fastclustercpp::HCLUST_METHOD_COMPLETE,
                                 merge.data(), height.data());
     std::vector<int32_t> labels(num_rows);
... ...