Fangjun Kuang

Export Pyannote speaker segmentation models to onnx (#1382)

name: export-pyannote-segmentation-to-onnx
on:
workflow_dispatch:
concurrency:
group: export-pyannote-segmentation-to-onnx-${{ github.ref }}
cancel-in-progress: true
jobs:
export-pyannote-segmentation-to-onnx:
if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
name: export Pyannote segmentation models to ONNX
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [macos-latest]
python-version: ["3.10"]
steps:
- uses: actions/checkout@v4
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install pyannote
shell: bash
run: |
pip install pyannote.audio onnx onnxruntime
- name: Run
shell: bash
run: |
d=sherpa-onnx-pyannote-segmentation-3-0
src=$PWD/$d
mkdir -p $src
pushd scripts/pyannote/segmentation
./run.sh
cp ./*.onnx $src/
cp ./README.md $src/
cp ./LICENSE $src/
cp ./run.sh $src/
cp ./*.py $src/
popd
ls -lh $d
tar cjfv $d.tar.bz2 $d
- name: Release
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
file: ./*.tar.bz2
overwrite: true
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: speaker-segmentation-models
- name: Publish to huggingface
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
d=sherpa-onnx-pyannote-segmentation-3-0
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://huggingface.co/csukuangfj/$d huggingface
cp -v $d/* ./huggingface
cd huggingface
git lfs track "*.onnx"
git status
git add .
git status
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
... ...
#!/usr/bin/env python3
from typing import Any, Dict
import onnx
import torch
from onnxruntime.quantization import QuantType, quantize_dynamic
from pyannote.audio import Model
from pyannote.audio.core.task import Problem, Resolution
def add_meta_data(filename: str, meta_data: Dict[str, Any]):
"""Add meta data to an ONNX model. It is changed in-place.
Args:
filename:
Filename of the ONNX model to be changed.
meta_data:
Key-value pairs.
"""
model = onnx.load(filename)
while len(model.metadata_props):
model.metadata_props.pop()
for key, value in meta_data.items():
meta = model.metadata_props.add()
meta.key = key
meta.value = str(value)
onnx.save(model, filename)
@torch.no_grad()
def main():
# You can download ./pytorch_model.bin from
# https://hf-mirror.com/csukuangfj/pyannote-models/tree/main/segmentation-3.0
pt_filename = "./pytorch_model.bin"
model = Model.from_pretrained(pt_filename)
model.eval()
assert model.dimension == 7, model.dimension
print(model.specifications)
assert (
model.specifications.problem == Problem.MONO_LABEL_CLASSIFICATION
), model.specifications.problem
assert (
model.specifications.resolution == Resolution.FRAME
), model.specifications.resolution
assert model.specifications.duration == 10.0, model.specifications.duration
assert model.audio.sample_rate == 16000, model.audio.sample_rate
# (batch, num_channels, num_samples)
assert list(model.example_input_array.shape) == [
1,
1,
16000 * 10,
], model.example_input_array.shape
example_output = model(model.example_input_array)
# (batch, num_frames, num_classes)
assert list(example_output.shape) == [1, 589, 7], example_output.shape
assert model.receptive_field.step == 0.016875, model.receptive_field.step
assert model.receptive_field.duration == 0.0619375, model.receptive_field.duration
assert model.receptive_field.step * 16000 == 270, model.receptive_field.step * 16000
assert model.receptive_field.duration * 16000 == 991, (
model.receptive_field.duration * 16000
)
opset_version = 18
filename = "model.onnx"
torch.onnx.export(
model,
model.example_input_array,
filename,
opset_version=opset_version,
input_names=["x"],
output_names=["y"],
dynamic_axes={
"x": {0: "N", 2: "T"},
"y": {0: "N", 1: "T"},
},
)
sample_rate = model.audio.sample_rate
window_size = int(model.specifications.duration) * 16000
receptive_field_size = int(model.receptive_field.duration * 16000)
receptive_field_shift = int(model.receptive_field.step * 16000)
meta_data = {
"num_speakers": len(model.specifications.classes),
"powerset_max_classes": model.specifications.powerset_max_classes,
"num_classes": model.dimension,
"sample_rate": sample_rate,
"window_size": window_size,
"receptive_field_size": receptive_field_size,
"receptive_field_shift": receptive_field_shift,
"model_type": "pyannote-segmentation-3.0",
"version": "1",
"model_author": "pyannote",
"maintainer": "k2-fsa",
"url_1": "https://huggingface.co/pyannote/segmentation-3.0",
"url_2": "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0",
"license": "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE",
}
add_meta_data(filename=filename, meta_data=meta_data)
print("Generate int8 quantization models")
filename_int8 = "model.int8.onnx"
quantize_dynamic(
model_input=filename,
model_output=filename_int8,
weight_type=QuantType.QUInt8,
)
print(f"Saved to {filename} and {filename_int8}")
if __name__ == "__main__":
main()
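An optional sanity check (not one of the exported scripts): compare the ONNX model against the PyTorch model on random input. It assumes `./model.onnx` and `./pytorch_model.bin` are in the current directory.
```python3
#!/usr/bin/env python3
# Hypothetical check script; assumes ./model.onnx and ./pytorch_model.bin exist.
import numpy as np
import onnxruntime as ort
import torch
from pyannote.audio import Model

torch_model = Model.from_pretrained("./pytorch_model.bin")
torch_model.eval()

sess = ort.InferenceSession("./model.onnx", providers=["CPUExecutionProvider"])

# 10 seconds of random audio at 16 kHz: (batch, num_channels, num_samples)
x = np.random.rand(1, 1, 160000).astype(np.float32)

with torch.no_grad():
    expected = torch_model(torch.from_numpy(x)).numpy()
(actual,) = sess.run(["y"], {"x": x})

print(expected.shape, actual.shape)     # both (1, 589, 7)
print(np.abs(expected - actual).max())  # expected to be small, e.g. < 1e-4
```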
... ...
# config.yaml
```yaml
task:
_target_: pyannote.audio.tasks.SpeakerDiarization
duration: 10.0
max_speakers_per_chunk: 3
max_speakers_per_frame: 2
model:
_target_: pyannote.audio.models.segmentation.PyanNet
sample_rate: 16000
num_channels: 1
sincnet:
stride: 10
lstm:
hidden_size: 128
num_layers: 4
bidirectional: true
monolithic: true
linear:
hidden_size: 128
num_layers: 2
```
# Model architecture of ./pytorch_model.bin
`print(model)`:
```python3
PyanNet(
(sincnet): SincNet(
(wav_norm1d): InstanceNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
(conv1d): ModuleList(
(0): Encoder(
(filterbank): ParamSincFB()
)
(1): Conv1d(80, 60, kernel_size=(5,), stride=(1,))
(2): Conv1d(60, 60, kernel_size=(5,), stride=(1,))
)
(pool1d): ModuleList(
(0-2): 3 x MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
)
(norm1d): ModuleList(
(0): InstanceNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
(1-2): 2 x InstanceNorm1d(60, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
)
)
(lstm): LSTM(60, 128, num_layers=4, batch_first=True, dropout=0.5, bidirectional=True)
(linear): ModuleList(
(0): Linear(in_features=256, out_features=128, bias=True)
(1): Linear(in_features=128, out_features=128, bias=True)
)
(classifier): Linear(in_features=128, out_features=7, bias=True)
(activation): LogSoftmax(dim=-1)
)
```
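For reference, the 991-sample receptive field and 270-sample shift asserted in `export-onnx.py` can be recovered from the SincNet layers above with the standard receptive-field recursion. This is a sketch; the 251-sample, stride-10 encoder parameters are inferred from the output-shape expression shown in `preprocess.sh` below, not printed in the architecture dump.
```python3
# (kernel_size, stride) of each SincNet layer, in forward order.
# The Encoder/ParamSincFB values (251, 10) are inferred assumptions.
layers = [
    (251, 10),  # Encoder / ParamSincFB
    (3, 3),     # MaxPool1d
    (5, 1),     # Conv1d(80, 60)
    (3, 3),     # MaxPool1d
    (5, 1),     # Conv1d(60, 60)
    (3, 3),     # MaxPool1d
]
size, shift = 1, 1
for kernel, stride in layers:
    size += (kernel - 1) * shift
    shift *= stride
print(size, shift)  # 991 270 -> 0.0619375 s and 0.016875 s at 16 kHz
```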
```python3
>>> list(model.specifications)
[Specifications(problem=<Problem.MONO_LABEL_CLASSIFICATION: 1>, resolution=<Resolution.FRAME: 1>, duration=10.0, min_duration=None, warm_up=(0.0, 0.0), classes=['speaker#1', 'speaker#2', 'speaker#3'], powerset_max_classes=2, permutation_invariant=True)]
```
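The assertion `model.dimension == 7` in `export-onnx.py` follows from the powerset encoding: with 3 local speakers and `powerset_max_classes=2`, the classes are the empty set, the 3 singletons, and the 3 pairs. A small sketch enumerating them in the same order used by `get_powerset_mapping` in `vad-onnx.py` below:
```python3
from itertools import combinations

speakers = ["speaker#1", "speaker#2", "speaker#3"]
powerset_max_classes = 2

classes = [
    subset
    for k in range(powerset_max_classes + 1)
    for subset in combinations(speakers, k)
]
print(len(classes))  # 7
# [(), ('speaker#1',), ('speaker#2',), ('speaker#3',),
#  ('speaker#1', 'speaker#2'), ('speaker#1', 'speaker#3'),
#  ('speaker#2', 'speaker#3')]
```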
```python3
>>> model.hparams
"linear": {'hidden_size': 128, 'num_layers': 2}
"lstm": {'hidden_size': 128, 'num_layers': 4, 'bidirectional': True, 'monolithic': True, 'dropout': 0.5, 'batch_first': True}
"num_channels": 1
"sample_rate": 16000
"sincnet": {'stride': 10, 'sample_rate': 16000}
```
## Papers
- [pyannote.audio 2.1 speaker diarization pipeline: principle, benchmark, and recipe](https://hal.science/hal-04247212/document)
- [pyannote.audio speaker diarization pipeline at VoxSRC 2023](https://mmai.io/datasets/voxceleb/voxsrc/data_workshop_2023/reports/pyannote_report.pdf)
... ...
#!/usr/bin/env bash
python3 -m onnxruntime.quantization.preprocess --input model.onnx --output tmp.preprocessed.onnx
mv ./tmp.preprocessed.onnx ./model.onnx
./show-onnx.py --filename ./model.onnx
<<EOF
=========./model.onnx==========
NodeArg(name='x', type='tensor(float)', shape=[1, 1, 'T'])
-----
NodeArg(name='y', type='tensor(float)', shape=[1, 'floor(floor(floor(floor(T/10 - 251/10)/3 - 2/3)/3)/3 - 8/3) + 1', 7])
floor(floor(floor(floor(T/10 - 251/10)/3 - 2/3)/3)/3 - 8/3) + 1
= floor(floor(floor(floor(T - 251)/30 - 2/3)/3)/3 - 8/3) + 1
= floor(floor(floor(floor(T - 271)/30)/3)/3 - 8/3) + 1
= floor(floor(floor(floor(T - 271)/90))/3 - 8/3) + 1
= floor(floor(floor(T - 271)/90)/3 - 8/3) + 1
= floor(floor((T - 271)/90)/3 - 8/3) + 1
= floor(floor((T - 271)/90 - 8)/3) + 1
= floor(floor((T - 271 - 720)/90)/3) + 1
= floor(floor((T - 991)/90)/3) + 1
= floor(floor((T - 991)/270)) + 1
= (T - 991)/270 + 1
= (T - 991 + 270)/270
= (T - 721)/270
It means:
- The number of input samples should be at least 721.
- One output frame corresponds to 270 input samples: if we feed T + 270 samples, the model outputs one more frame.
EOF
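A quick numeric check of the derivation above (a separate Python sketch, not part of preprocess.sh). Keeping the floor that is dropped in the last two steps, the 10-second training window of 160000 samples yields exactly the 589 frames asserted in `export-onnx.py`:
```python3
def num_frames(T: int) -> int:
    # floor((T - 991) / 270) + 1, i.e. the form before the floors are dropped
    return (T - 991) // 270 + 1

print(num_frames(160000))        # 589, matches the (1, 589, 7) output shape
print(num_frames(160000 + 270))  # 590, one extra frame per extra 270 samples
```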
... ...
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
set -ex
function install_pyannote() {
pip install pyannote.audio onnx onnxruntime
}
function download_test_files() {
curl -SL -O https://huggingface.co/csukuangfj/pyannote-models/resolve/main/segmentation-3.0/pytorch_model.bin
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
}
install_pyannote
download_test_files
./export-onnx.py
./preprocess.sh
echo "----------torch----------"
./vad-torch.py
echo "----------onnx model.onnx----------"
./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav
echo "----------onnx model.int8.onnx----------"
./vad-onnx.py --model ./model.int8.onnx --wav ./lei-jun-test.wav
cat >README.md << EOF
# Introduction
Models in this folder are converted from
https://huggingface.co/pyannote/segmentation-3.0/tree/main
EOF
cat >LICENSE <<EOF
MIT License
Copyright (c) 2022 CNRS
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
EOF
... ...
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
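"""
Usage:

  ./show-onnx.py --filename ./model.onnx
"""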
import onnxruntime
import argparse
def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--filename",
type=str,
required=True,
help="Path to model.onnx",
)
return parser.parse_args()
def show(filename):
session_opts = onnxruntime.SessionOptions()
session_opts.log_severity_level = 3
sess = onnxruntime.InferenceSession(filename, session_opts)
for i in sess.get_inputs():
print(i)
print("-----")
for i in sess.get_outputs():
print(i)
def main():
args = get_args()
print(f"========={args.filename}==========")
show(args.filename)
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env python3
"""
./export-onnx.py
./preprocess.sh
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav
"""
import argparse
from pathlib import Path
import librosa
import numpy as np
import onnxruntime as ort
import soundfile as sf
from numpy.lib.stride_tricks import as_strided
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True, help="Path to model.onnx")
parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")
return parser.parse_args()
class OnnxModel:
def __init__(self, filename):
session_opts = ort.SessionOptions()
session_opts.inter_op_num_threads = 1
session_opts.intra_op_num_threads = 1
self.session_opts = session_opts
self.model = ort.InferenceSession(
filename,
sess_options=self.session_opts,
providers=["CPUExecutionProvider"],
)
meta = self.model.get_modelmeta().custom_metadata_map
print(meta)
self.window_size = int(meta["window_size"])
self.sample_rate = int(meta["sample_rate"])
self.window_shift = int(0.1 * self.window_size)
self.receptive_field_size = int(meta["receptive_field_size"])
self.receptive_field_shift = int(meta["receptive_field_shift"])
self.num_speakers = int(meta["num_speakers"])
self.powerset_max_classes = int(meta["powerset_max_classes"])
self.num_classes = int(meta["num_classes"])
def __call__(self, x):
"""
Args:
x: (N, num_samples)
Returns:
A tensor of shape (N, num_frames, num_classes)
"""
x = np.expand_dims(x, axis=1)
(y,) = self.model.run(
[self.model.get_outputs()[0].name], {self.model.get_inputs()[0].name: x}
)
return y
def load_wav(filename, expected_sample_rate) -> np.ndarray:
audio, sample_rate = sf.read(filename, dtype="float32", always_2d=True)
audio = audio[:, 0] # only use the first channel
if sample_rate != expected_sample_rate:
audio = librosa.resample(
audio,
orig_sr=sample_rate,
target_sr=expected_sample_rate,
)
return audio
def get_powerset_mapping(num_classes, num_speakers, powerset_max_classes):
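    """Map each powerset class to the speakers it contains.

    Returns a (num_classes, num_speakers) 0/1 matrix: row 0 is the
    silence class (no speaker), rows 1..num_speakers are the
    single-speaker classes, and the remaining rows are the
    two-speaker pairs.
    """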
mapping = np.zeros((num_classes, num_speakers))
k = 1
for i in range(1, powerset_max_classes + 1):
if i == 1:
for j in range(0, num_speakers):
mapping[k, j] = 1
k += 1
elif i == 2:
for j in range(0, num_speakers):
for m in range(j + 1, num_speakers):
mapping[k, j] = 1
mapping[k, m] = 1
k += 1
elif i == 3:
raise RuntimeError("Unsupported")
return mapping
def to_multi_label(y, mapping):
"""
Args:
y: (num_chunks, num_frames, num_classes)
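      mapping: (num_classes, num_speakers), as returned by get_powerset_mapping()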
Returns:
A tensor of shape (num_chunks, num_frames, num_speakers)
"""
y = np.argmax(y, axis=-1)
labels = mapping[y.reshape(-1)].reshape(y.shape[0], y.shape[1], -1)
return labels
def main():
args = get_args()
assert Path(args.model).is_file(), args.model
assert Path(args.wav).is_file(), args.wav
m = OnnxModel(args.model)
audio = load_wav(args.wav, m.sample_rate)
# audio: (num_samples,)
print("audio", audio.shape, audio.min(), audio.max(), audio.sum())
num = (audio.shape[0] - m.window_size) // m.window_shift + 1
samples = as_strided(
audio,
shape=(num, m.window_size),
strides=(m.window_shift * audio.strides[0], audio.strides[0]),
)
# or use torch.Tensor.unfold
# samples = torch.from_numpy(audio).unfold(0, m.window_size, m.window_shift).numpy()
print(
"samples",
samples.shape,
samples.mean(),
samples.sum(),
samples[:3, :3].sum(axis=-1),
)
if (
audio.shape[0] < m.window_size
or (audio.shape[0] - m.window_size) % m.window_shift > 0
):
has_last_chunk = True
else:
has_last_chunk = False
num_chunks = samples.shape[0]
batch_size = 32
output = []
for i in range(0, num_chunks, batch_size):
start = i
end = i + batch_size
# it's perfectly ok to use end > num_chunks
y = m(samples[start:end])
output.append(y)
if has_last_chunk:
last_chunk = audio[num_chunks * m.window_shift :] # noqa
pad_size = m.window_size - last_chunk.shape[0]
last_chunk = np.pad(last_chunk, (0, pad_size))
last_chunk = np.expand_dims(last_chunk, axis=0)
y = m(last_chunk)
output.append(y)
y = np.vstack(output)
# y: (num_chunks, num_frames, num_classes)
mapping = get_powerset_mapping(
num_classes=m.num_classes,
num_speakers=m.num_speakers,
powerset_max_classes=m.powerset_max_classes,
)
labels = to_multi_label(y, mapping=mapping)
# labels: (num_chunks, num_frames, num_speakers)
# binary classification
labels = np.max(labels, axis=-1)
# labels: (num_chunk, num_frames)
num_frames = (
int(
(m.window_size + (labels.shape[0] - 1) * m.window_shift)
/ m.receptive_field_shift
)
+ 1
)
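    # Overlap-add: accumulate Hamming-weighted scores from overlapping chunks
    # into per-frame totals, then normalize by the accumulated weights.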
count = np.zeros((num_frames,))
classification = np.zeros((num_frames,))
weight = np.hamming(labels.shape[1])
for i in range(labels.shape[0]):
this_chunk = labels[i]
start = int(i * m.window_shift / m.receptive_field_shift + 0.5)
end = start + this_chunk.shape[0]
classification[start:end] += this_chunk * weight
count[start:end] += weight
classification /= np.maximum(count, 1e-12)
if has_last_chunk:
stop_frame = int(audio.shape[0] / m.receptive_field_shift)
classification = classification[:stop_frame]
classification = classification.tolist()
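    # Hysteresis thresholding: a speech segment starts when the score rises
    # above `onset` and ends when it falls below `offset`.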
onset = 0.5
offset = 0.5
is_active = classification[0] > onset
start = None
scale = m.receptive_field_shift / m.sample_rate
scale_offset = m.receptive_field_size / m.sample_rate * 0.5
for i in range(len(classification)):
if is_active:
if classification[i] < offset:
print(
f"{start*scale + scale_offset:.3f} -- {i*scale + scale_offset:.3f}"
)
is_active = False
else:
if classification[i] > onset:
start = i
is_active = True
if is_active:
print(
f"{start*scale + scale_offset:.3f} -- {(len(classification)-1)*scale + scale_offset:.3f}"
)
if __name__ == "__main__":
main()
... ...
#!/usr/bin/env python3
import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import (
VoiceActivityDetection as VoiceActivityDetectionPipeline,
)
@torch.no_grad()
def main():
# Please download it from
# https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0
pt_filename = "./pytorch_model.bin"
model = Model.from_pretrained(pt_filename)
model.eval()
pipeline = VoiceActivityDetectionPipeline(segmentation=model)
# https://huggingface.co/pyannote/voice-activity-detection/blob/main/config.yaml
# https://github.com/pyannote/pyannote-audio/issues/1215
initial_params = {
"min_duration_on": 0.0,
"min_duration_off": 0.0,
}
pipeline.onset = 0.5
pipeline.offset = 0.5
pipeline.instantiate(initial_params)
# wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
t = pipeline("./lei-jun-test.wav")
print(type(t))
print(t)
if __name__ == "__main__":
main()
... ...