@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - apk
-      - android-demo-speaker-diarization-2

   workflow_dispatch:

@@ -76,6 +75,11 @@ jobs:
         run: |
           cd scripts/apk

+          total=${{ matrix.total }}
+          index=${{ matrix.index }}
+
+          python3 ./generate-speaker-diarization-apk-script.py --total $total --index $index
+
           chmod +x build-apk-speaker-diarization.sh
           mv -v ./build-apk-speaker-diarization.sh ../..

@@ -0,0 +1,86 @@
+name: export-revai-segmentation-to-onnx
+
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: export-revai-segmentation-to-onnx-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-revai-segmentation-to-onnx:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export revai segmentation models to ONNX
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install pyannote
+        shell: bash
+        run: |
+          pip install pyannote.audio onnx==1.15.0 onnxruntime==1.16.3
+
+      - name: Run
+        shell: bash
+        run: |
+          d=sherpa-onnx-reverb-diarization-v1
+          src=$PWD/$d
+          mkdir -p $src
+
+          pushd scripts/pyannote/segmentation
+          ./run-revai.sh
+          cp ./*.onnx $src/
+          cp ./README.md $src/
+          cp ./LICENSE $src/
+          cp ./run-revai.sh $src/run.sh
+          cp ./*.py $src/
+
+          popd
+          ls -lh $d
+          tar cjfv $d.tar.bz2 $d
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: speaker-segmentation-models
+
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            d=sherpa-onnx-reverb-diarization-v1
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+            git clone https://huggingface.co/csukuangfj/$d huggingface
+            cp -v $d/* ./huggingface
+            cd huggingface
+            git lfs track "*.onnx"
+            git status
+            git add .
+            git status
+            git commit -m "add models"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
@@ -31,15 +31,24 @@ log "====================x86===================="

 mkdir -p apks

+{% for model in model_list %}
+
 pushd ./android/SherpaOnnxSpeakerDiarization/app/src/main/assets/

-curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
-tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
-rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
-mv sherpa-onnx-pyannote-segmentation-3-0/model.onnx segmentation.onnx
-rm -rf sherpa-onnx-pyannote-segmentation-3-0
+ls -lh
+
+model_name={{ model.model_name }}
+short_name={{ model.short_name }}

-curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/$model_name.tar.bz2
+tar xvf $model_name.tar.bz2
+rm $model_name.tar.bz2
+mv $model_name/model.onnx segmentation.onnx
+rm -rf $model_name
+
+if [ ! -f 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+fi

 echo "pwd: $PWD"
 ls -lh
@@ -65,9 +74,13 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do
   ./gradlew build
   popd

-  mv android/SherpaOnnxSpeakerDiarization/app/build/outputs/apk/debug/app-debug.apk ./apks/sherpa-onnx-${SHERPA_ONNX_VERSION}-$arch-speaker-diarization-pyannote_audio-3dspeaker.apk
+  mv android/SherpaOnnxSpeakerDiarization/app/build/outputs/apk/debug/app-debug.apk ./apks/sherpa-onnx-${SHERPA_ONNX_VERSION}-$arch-speaker-diarization-$short_name-3dspeaker.apk
   ls -lh apks
   rm -v ./android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/$arch/*.so
 done

+rm -rf ./android/SherpaOnnxSpeakerDiarization/app/src/main/assets/segmentation.onnx
+
+{% endfor %}
+
 ls -lh apks
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+import argparse
+from dataclasses import dataclass
+from typing import List
+
+import jinja2
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--total",
+        type=int,
+        default=1,
+        help="Number of runners",
+    )
+    parser.add_argument(
+        "--index",
+        type=int,
+        default=0,
+        help="Index of the current runner",
+    )
+    return parser.parse_args()
+
+
+@dataclass
+class SpeakerSegmentationModel:
+    model_name: str
+    short_name: str = ""
+
+
+def get_models() -> List[SpeakerSegmentationModel]:
+    models = [
+        SpeakerSegmentationModel(
+            model_name="sherpa-onnx-pyannote-segmentation-3-0",
+            short_name="pyannote_audio",
+        ),
+        SpeakerSegmentationModel(
+            model_name="sherpa-onnx-reverb-diarization-v1",
+            short_name="revai_v1",
+        ),
+    ]
+
+    return models
+
+
+def main():
+    args = get_args()
+    index = args.index
+    total = args.total
+    assert 0 <= index < total, (index, total)
+
+    all_model_list = get_models()
+
+    num_models = len(all_model_list)
+
+    num_per_runner = num_models // total
+    if num_per_runner <= 0:
+        raise ValueError(f"num_models: {num_models}, num_runners: {total}")
+
+    start = index * num_per_runner
+    end = start + num_per_runner
+
+    remaining = num_models - args.total * num_per_runner
+
+    print(f"{index}/{total}: {start}-{end}/{num_models}")
+
+    d = dict()
+    d["model_list"] = all_model_list[start:end]
+    if index < remaining:
+        s = args.total * num_per_runner + index
+        d["model_list"].append(all_model_list[s])
+        print(f"{s}/{num_models}")
+
+    filename_list = ["./build-apk-speaker-diarization.sh"]
+    for filename in filename_list:
+        environment = jinja2.Environment()
+        with open(f"{filename}.in") as f:
+            s = f.read()
+        template = environment.from_string(s)

+        s = template.render(**d)
+        with open(filename, "w") as f:
+            print(s, file=f)
+
+
+if __name__ == "__main__":
+    main()
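For reference, the partitioning in `main()` above works like this: each runner gets `num_models // total` models, and the first `num_models % total` runners each pick up one leftover model. A standalone sketch of the same arithmetic (illustrative only, not part of the committed script):

```python
from typing import List


def shard(items: List[str], total: int, index: int) -> List[str]:
    """Same partitioning as generate-speaker-diarization-apk-script.py."""
    num = len(items)
    per_runner = num // total
    if per_runner <= 0:
        raise ValueError(f"num items: {num}, num runners: {total}")

    start = index * per_runner
    selected = items[start : start + per_runner]

    # Leftover items are handed out one per runner, starting from runner 0.
    remaining = num - total * per_runner
    if index < remaining:
        selected.append(items[total * per_runner + index])
    return selected


# With 2 models and 2 runners, runner 0 builds the pyannote APKs and
# runner 1 builds the Rev.ai ones.
models = ["sherpa-onnx-pyannote-segmentation-3-0", "sherpa-onnx-reverb-diarization-v1"]
assert shard(models, total=2, index=0) == ["sherpa-onnx-pyannote-segmentation-3-0"]
assert shard(models, total=2, index=1) == ["sherpa-onnx-reverb-diarization-v1"]
```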
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
+# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

+import os
 from typing import Any, Dict

 import onnx
@@ -35,6 +37,8 @@ def add_meta_data(filename: str, meta_data: Dict[str, Any]):
 def main():
     # You can download ./pytorch_model.bin from
     # https://hf-mirror.com/csukuangfj/pyannote-models/tree/main/segmentation-3.0
+    # or from
+    # https://huggingface.co/Revai/reverb-diarization-v1/tree/main
     pt_filename = "./pytorch_model.bin"
     model = Model.from_pretrained(pt_filename)
     model.eval()
@@ -94,6 +98,22 @@ def main():
     receptive_field_size = int(model.receptive_field.duration * 16000)
     receptive_field_shift = int(model.receptive_field.step * 16000)

+    is_revai = os.getenv("SHERPA_ONNX_IS_REVAI", "")
+    if is_revai == "":
+        url_1 = "https://huggingface.co/pyannote/segmentation-3.0"
+        url_2 = "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0"
+        license_url = (
+            "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE"
+        )
+        model_author = "pyannote-audio"
+    else:
+        url_1 = "https://huggingface.co/Revai/reverb-diarization-v1"
+        url_2 = "https://huggingface.co/csukuangfj/sherpa-onnx-reverb-diarization-v1"
+        license_url = (
+            "https://huggingface.co/Revai/reverb-diarization-v1/blob/main/LICENSE"
+        )
+        model_author = "Revai"
+
     meta_data = {
         "num_speakers": len(model.specifications.classes),
         "powerset_max_classes": model.specifications.powerset_max_classes,
@@ -104,11 +124,11 @@ def main():
         "receptive_field_shift": receptive_field_shift,
         "model_type": "pyannote-segmentation-3.0",
         "version": "1",
-        "model_author": "pyannote",
+        "model_author": model_author,
         "maintainer": "k2-fsa",
-        "url_1": "https://huggingface.co/pyannote/segmentation-3.0",
-        "url_2": "https://huggingface.co/csukuangfj/pyannote-models/tree/main/segmentation-3.0",
-        "license": "https://huggingface.co/pyannote/segmentation-3.0/blob/main/LICENSE",
+        "url_1": url_1,
+        "url_2": url_2,
+        "license": license_url,
     }
     add_meta_data(filename=filename, meta_data=meta_data)

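To sanity-check an exported file, the metadata written by `add_meta_data()` can be read back with onnxruntime. A minimal sketch (not part of this change; the printed values are simply what the code above writes):

```python
import onnxruntime as ort

# Illustrative check of the custom metadata attached to the exported model.
sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
meta = sess.get_modelmeta().custom_metadata_map

# "Revai" when SHERPA_ONNX_IS_REVAI was set during export, else "pyannote-audio"
print(meta["model_author"])
print(meta["url_1"])
print(meta["license"])
```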
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)


 python3 -m onnxruntime.quantization.preprocess --input model.onnx --output tmp.preprocessed.onnx
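The hunk above only shows the shape-inference preprocessing step of preprocess.sh. The model.int8.onnx that run-revai.sh tests later is presumably produced by dynamically quantizing the preprocessed model, roughly along these lines (a sketch under that assumption, not the script's exact contents):

```python
from onnxruntime.quantization import QuantType, quantize_dynamic

# Assumed follow-up to the preprocess step: quantize weights to 8 bit.
quantize_dynamic(
    model_input="tmp.preprocessed.onnx",
    model_output="model.int8.onnx",
    weight_type=QuantType.QUInt8,  # weight type is an assumption; the script may use QInt8
)
```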
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
+
+export SHERPA_ONNX_IS_REVAI=1
+
+set -ex
+function install_pyannote() {
+  pip install pyannote.audio onnx onnxruntime
+}
+
+function download_test_files() {
+  curl -SL -O https://huggingface.co/Revai/reverb-diarization-v1/resolve/main/pytorch_model.bin
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+}
+
+install_pyannote
+download_test_files
+
+./export-onnx.py
+./preprocess.sh
+
+echo "----------torch----------"
+./vad-torch.py
+
+echo "----------onnx model.onnx----------"
+./vad-onnx.py --model ./model.onnx --wav ./lei-jun-test.wav
+
+echo "----------onnx model.int8.onnx----------"
+./vad-onnx.py --model ./model.int8.onnx --wav ./lei-jun-test.wav
+
+curl -SL -O https://huggingface.co/Revai/reverb-diarization-v1/resolve/main/LICENSE
+
+cat >README.md << EOF
+# Introduction
+
+Models in this file are converted from
+https://huggingface.co/Revai/reverb-diarization-v1/tree/main
+
+Note that it is accessible under a non-commercial license.
+
+Please see ./LICENSE for details.
+
+See also
+https://www.rev.com/blog/speech-to-text-technology/introducing-reverb-open-source-asr-diarization
+
+EOF
+
+
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

 """
 Please refer to
@@ -216,6 +216,8 @@ def main():

     is_active = classification[0] > onset
     start = None
+    if is_active:
+        start = 0

     scale = m.receptive_field_shift / m.sample_rate
     scale_offset = m.receptive_field_size / m.sample_rate * 0.5
221 scale_offset = m.receptive_field_size / m.sample_rate * 0.5 223 scale_offset = m.receptive_field_size / m.sample_rate * 0.5