Fangjun Kuang
Committed by GitHub

WebAssembly example for speaker diarization (#1411)

正在显示 37 个修改的文件 包含 1008 行增加24 行删除
@@ -29,7 +29,7 @@ jobs: @@ -29,7 +29,7 @@ jobs:
29 - name: Install emsdk 29 - name: Install emsdk
30 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
31 with: 31 with:
32 - version: 3.1.51 32 + version: 3.1.53
33 actions-cache-folder: 'emsdk-cache' 33 actions-cache-folder: 'emsdk-cache'
34 34
35 - name: View emsdk version 35 - name: View emsdk version
@@ -28,7 +28,7 @@ jobs: @@ -28,7 +28,7 @@ jobs:
28 - name: Install emsdk 28 - name: Install emsdk
29 uses: mymindstorm/setup-emsdk@v14 29 uses: mymindstorm/setup-emsdk@v14
30 with: 30 with:
31 - version: 3.1.51 31 + version: 3.1.53
32 actions-cache-folder: 'emsdk-cache' 32 actions-cache-folder: 'emsdk-cache'
33 33
34 - name: View emsdk version 34 - name: View emsdk version
@@ -29,7 +29,7 @@ jobs: @@ -29,7 +29,7 @@ jobs:
29 - name: Install emsdk 29 - name: Install emsdk
30 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
31 with: 31 with:
32 - version: 3.1.51 32 + version: 3.1.53
33 actions-cache-folder: 'emsdk-cache' 33 actions-cache-folder: 'emsdk-cache'
34 34
35 - name: View emsdk version 35 - name: View emsdk version
@@ -29,7 +29,7 @@ jobs: @@ -29,7 +29,7 @@ jobs:
29 - name: Install emsdk 29 - name: Install emsdk
30 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
31 with: 31 with:
32 - version: 3.1.51 32 + version: 3.1.53
33 actions-cache-folder: 'emsdk-cache' 33 actions-cache-folder: 'emsdk-cache'
34 34
35 - name: View emsdk version 35 - name: View emsdk version
  1 +name: wasm-simd-hf-space-speaker-diarization
  2 +
  3 +on:
  4 + push:
  5 + branches:
  6 + - wasm
  7 + - wasm-speaker-diarization
  8 + tags:
  9 + - 'v[0-9]+.[0-9]+.[0-9]+*'
  10 +
  11 + workflow_dispatch:
  12 +
  13 +concurrency:
  14 + group: wasm-simd-hf-space-speaker-diarization-${{ github.ref }}
  15 + cancel-in-progress: true
  16 +
  17 +jobs:
  18 + wasm-simd-hf-space-speaker-diarization:
  19 + runs-on: ${{ matrix.os }}
  20 + strategy:
  21 + fail-fast: false
  22 + matrix:
  23 + os: [ubuntu-latest]
  24 +
  25 + steps:
  26 + - uses: actions/checkout@v4
  27 + with:
  28 + fetch-depth: 0
  29 +
  30 + - name: Install emsdk
  31 + uses: mymindstorm/setup-emsdk@v14
  32 + with:
  33 + version: 3.1.53
  34 + actions-cache-folder: 'emsdk-cache'
  35 +
  36 + - name: View emsdk version
  37 + shell: bash
  38 + run: |
  39 + emcc -v
  40 + echo "--------------------"
  41 + emcc --check
  42 +
  43 + - name: Download model files
  44 + shell: bash
  45 + run: |
  46 + cd wasm/speaker-diarization/assets/
  47 + ls -lh
  48 + echo "----------"
  49 +
  50 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  51 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  52 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  53 + mv sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
  54 + rm -rf sherpa-onnx-pyannote-segmentation-3-0
  55 +
  56 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  57 + mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx
  58 +
  59 + echo "----------"
  60 +
  61 + ls -lh
  62 +
  63 + - name: Build sherpa-onnx for WebAssembly
  64 + shell: bash
  65 + run: |
  66 + ./build-wasm-simd-speaker-diarization.sh
  67 +
  68 + - name: collect files
  69 + shell: bash
  70 + run: |
  71 + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
  72 +
  73 + dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-speaker-diarization
  74 + mv build-wasm-simd-speaker-diarization/install/bin/wasm/speaker-diarization $dst
  75 + ls -lh $dst
  76 + tar cjfv $dst.tar.bz2 ./$dst
  77 +
  78 + - name: Upload wasm files
  79 + uses: actions/upload-artifact@v4
  80 + with:
  81 + name: sherpa-onnx-wasm-simd-speaker-diarization
  82 + path: ./sherpa-onnx-wasm-simd-*.tar.bz2
  83 +
  84 + - name: Release
  85 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
  86 + uses: svenstaro/upload-release-action@v2
  87 + with:
  88 + file_glob: true
  89 + overwrite: true
  90 + file: ./*.tar.bz2
  91 +
  92 + - name: Publish to ModelScope
  93 + # if: false
  94 + env:
  95 + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }}
  96 + uses: nick-fields/retry@v2
  97 + with:
  98 + max_attempts: 20
  99 + timeout_seconds: 200
  100 + shell: bash
  101 + command: |
  102 + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
  103 +
  104 + git config --global user.email "csukuangfj@gmail.com"
  105 + git config --global user.name "Fangjun Kuang"
  106 +
  107 + rm -rf ms
  108 + export GIT_LFS_SKIP_SMUDGE=1
  109 + export GIT_CLONE_PROTECTION_ACTIVE=false
  110 +
  111 + git clone https://www.modelscope.cn/studios/csukuangfj/web-assembly-speaker-diarization-sherpa-onnx.git ms
  112 + cd ms
  113 + rm -fv *.js
  114 + rm -fv *.data
  115 + git fetch
  116 + git pull
  117 + git merge -m "merge remote" --ff origin main
  118 +
  119 + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .
  120 +
  121 + git status
  122 + git lfs track "*.data"
  123 + git lfs track "*.wasm"
  124 + ls -lh
  125 +
  126 + git add .
  127 + git commit -m "update model"
  128 + git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/csukuangfj/web-assembly-speaker-diarization-sherpa-onnx.git
  129 +
  130 + - name: Publish to huggingface
  131 + env:
  132 + HF_TOKEN: ${{ secrets.HF_TOKEN }}
  133 + uses: nick-fields/retry@v2
  134 + with:
  135 + max_attempts: 20
  136 + timeout_seconds: 200
  137 + shell: bash
  138 + command: |
  139 + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
  140 +
  141 + git config --global user.email "csukuangfj@gmail.com"
  142 + git config --global user.name "Fangjun Kuang"
  143 +
  144 + rm -rf huggingface
  145 + export GIT_LFS_SKIP_SMUDGE=1
  146 + export GIT_CLONE_PROTECTION_ACTIVE=false
  147 +
  148 + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-speaker-diarization-sherpa-onnx huggingface
  149 + ls -lh
  150 +
  151 + cd huggingface
  152 + rm -fv *.js
  153 + rm -fv *.data
  154 + git fetch
  155 + git pull
  156 + git merge -m "merge remote" --ff origin main
  157 +
  158 + cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .
  159 +
  160 + git status
  161 + git lfs track "*.data"
  162 + git lfs track "*.wasm"
  163 + ls -lh
  164 +
  165 + git add .
  166 + git commit -m "update model"
  167 + git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-speaker-diarization-sherpa-onnx main
@@ -37,7 +37,7 @@ jobs: @@ -37,7 +37,7 @@ jobs:
37 - name: Install emsdk 37 - name: Install emsdk
38 uses: mymindstorm/setup-emsdk@v14 38 uses: mymindstorm/setup-emsdk@v14
39 with: 39 with:
40 - version: 3.1.51 40 + version: 3.1.53
41 actions-cache-folder: 'emsdk-cache' 41 actions-cache-folder: 'emsdk-cache'
42 42
43 - name: View emsdk version 43 - name: View emsdk version
@@ -29,7 +29,7 @@ jobs: @@ -29,7 +29,7 @@ jobs:
29 - name: Install emsdk 29 - name: Install emsdk
30 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
31 with: 31 with:
32 - version: 3.1.51 32 + version: 3.1.53
33 actions-cache-folder: 'emsdk-cache' 33 actions-cache-folder: 'emsdk-cache'
34 34
35 - name: View emsdk version 35 - name: View emsdk version
@@ -29,7 +29,7 @@ jobs: @@ -29,7 +29,7 @@ jobs:
29 - name: Install emsdk 29 - name: Install emsdk
30 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
31 with: 31 with:
32 - version: 3.1.51 32 + version: 3.1.53
33 actions-cache-folder: 'emsdk-cache' 33 actions-cache-folder: 'emsdk-cache'
34 34
35 - name: View emsdk version 35 - name: View emsdk version
@@ -29,7 +29,7 @@ jobs: @@ -29,7 +29,7 @@ jobs:
29 - name: Install emsdk 29 - name: Install emsdk
30 uses: mymindstorm/setup-emsdk@v14 30 uses: mymindstorm/setup-emsdk@v14
31 with: 31 with:
32 - version: 3.1.51 32 + version: 3.1.53
33 actions-cache-folder: 'emsdk-cache' 33 actions-cache-folder: 'emsdk-cache'
34 34
35 - name: View emsdk version 35 - name: View emsdk version
@@ -32,6 +32,7 @@ option(SHERPA_ONNX_ENABLE_WEBSOCKET "Whether to build webscoket server/client" O @@ -32,6 +32,7 @@ option(SHERPA_ONNX_ENABLE_WEBSOCKET "Whether to build webscoket server/client" O
32 option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF) 32 option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF)
33 option(SHERPA_ONNX_ENABLE_DIRECTML "Enable ONNX Runtime DirectML support" OFF) 33 option(SHERPA_ONNX_ENABLE_DIRECTML "Enable ONNX Runtime DirectML support" OFF)
34 option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF) 34 option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF)
  35 +option(SHERPA_ONNX_ENABLE_WASM_SPEAKER_DIARIZATION "Whether to enable WASM for speaker diarization" OFF)
35 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) 36 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF)
36 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) 37 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF)
37 option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) 38 option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF)
@@ -135,6 +136,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_C_API ${SHERPA_ONNX_ENABLE_C_API}") @@ -135,6 +136,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_C_API ${SHERPA_ONNX_ENABLE_C_API}")
135 message(STATUS "SHERPA_ONNX_ENABLE_WEBSOCKET ${SHERPA_ONNX_ENABLE_WEBSOCKET}") 136 message(STATUS "SHERPA_ONNX_ENABLE_WEBSOCKET ${SHERPA_ONNX_ENABLE_WEBSOCKET}")
136 message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}") 137 message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}")
137 message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}") 138 message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}")
  139 +message(STATUS "SHERPA_ONNX_ENABLE_WASM_SPEAKER_DIARIZATION ${SHERPA_ONNX_ENABLE_WASM_SPEAKER_DIARIZATION}")
138 message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") 140 message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}")
139 message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") 141 message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}")
140 message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") 142 message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}")
@@ -196,9 +198,19 @@ else() @@ -196,9 +198,19 @@ else()
196 add_definitions(-DSHERPA_ONNX_ENABLE_DIRECTML=0) 198 add_definitions(-DSHERPA_ONNX_ENABLE_DIRECTML=0)
197 endif() 199 endif()
198 200
  201 +if(SHERPA_ONNX_ENABLE_WASM_SPEAKER_DIARIZATION)
  202 + if(NOT SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION)
  203 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION to ON if you want to build WASM for speaker diarization")
  204 + endif()
  205 +
  206 + if(NOT SHERPA_ONNX_ENABLE_WASM)
  207 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for speaker diarization")
  208 + endif()
  209 +endif()
  210 +
199 if(SHERPA_ONNX_ENABLE_WASM_TTS) 211 if(SHERPA_ONNX_ENABLE_WASM_TTS)
200 if(NOT SHERPA_ONNX_ENABLE_TTS) 212 if(NOT SHERPA_ONNX_ENABLE_TTS)
201 - message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_TTS to ON if you want to build wasm TTS") 213 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_TTS to ON if you want to build WASM for TTS")
202 endif() 214 endif()
203 215
204 if(NOT SHERPA_ONNX_ENABLE_WASM) 216 if(NOT SHERPA_ONNX_ENABLE_WASM)
@@ -116,6 +116,7 @@ We also have spaces built using WebAssembly. They are listed below: @@ -116,6 +116,7 @@ We also have spaces built using WebAssembly. They are listed below:
116 |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]| 116 |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]|
117 |Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]| 117 |Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]|
118 |Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]| 118 |Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]|
  119 +|Speaker diarization |[Click me][wasm-hf-speaker-diarization]|[地址][wasm-ms-speaker-diarization]|
119 120
120 ### Links for pre-built Android APKs 121 ### Links for pre-built Android APKs
121 122
@@ -173,6 +174,7 @@ We also have spaces built using WebAssembly. They are listed below: @@ -173,6 +174,7 @@ We also have spaces built using WebAssembly. They are listed below:
173 | Speaker identification (Speaker ID) | [Address][sid-models] | 174 | Speaker identification (Speaker ID) | [Address][sid-models] |
174 | Spoken language identification (Language ID)| See multi-lingual [Whisper][Whisper] ASR models from [Speech recognition][asr-models]| 175 | Spoken language identification (Language ID)| See multi-lingual [Whisper][Whisper] ASR models from [Speech recognition][asr-models]|
175 | Punctuation | [Address][punct-models] | 176 | Punctuation | [Address][punct-models] |
  177 +| Speaker segmentation | [Address][speaker-segmentation-models] |
176 178
177 ### Useful links 179 ### Useful links
178 180
@@ -261,6 +263,8 @@ Video demo in Chinese: [爆了!炫神教你开打字挂!真正影响胜率 @@ -261,6 +263,8 @@ Video demo in Chinese: [爆了!炫神教你开打字挂!真正影响胜率
261 [wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en 263 [wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en
262 [wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de 264 [wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de
263 [wasm-ms-tts-piper-de]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de 265 [wasm-ms-tts-piper-de]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de
  266 +[wasm-hf-speaker-diarization]: https://huggingface.co/spaces/k2-fsa/web-assembly-speaker-diarization-sherpa-onnx
  267 +[wasm-ms-speaker-diarization]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-speaker-diarization-sherpa-onnx
264 [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html 268 [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html
265 [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html 269 [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html
266 [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html 270 [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html
@@ -303,5 +307,6 @@ Video demo in Chinese: [爆了!炫神教你开打字挂!真正影响胜率 @@ -303,5 +307,6 @@ Video demo in Chinese: [爆了!炫神教你开打字挂!真正影响胜率
303 [sid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models 307 [sid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
304 [slid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models 308 [slid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
305 [punct-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models 309 [punct-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models
  310 +[speaker-segmentation-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
306 [GigaSpeech]: https://github.com/SpeechColab/GigaSpeech 311 [GigaSpeech]: https://github.com/SpeechColab/GigaSpeech
307 [WenetSpeech]: https://github.com/wenet-e2e/WenetSpeech 312 [WenetSpeech]: https://github.com/wenet-e2e/WenetSpeech
@@ -14,8 +14,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then @@ -14,8 +14,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then
14 echo "git clone https://github.com/emscripten-core/emsdk.git" 14 echo "git clone https://github.com/emscripten-core/emsdk.git"
15 echo "cd emsdk" 15 echo "cd emsdk"
16 echo "git pull" 16 echo "git pull"
17 - echo "./emsdk install latest"  
18 - echo "./emsdk activate latest" 17 + echo "./emsdk install 3.1.53"
  18 + echo "./emsdk activate 3.1.53"
19 echo "source ./emsdk_env.sh" 19 echo "source ./emsdk_env.sh"
20 exit 1 20 exit 1
21 else 21 else
@@ -9,8 +9,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then @@ -9,8 +9,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then
9 echo "git clone https://github.com/emscripten-core/emsdk.git" 9 echo "git clone https://github.com/emscripten-core/emsdk.git"
10 echo "cd emsdk" 10 echo "cd emsdk"
11 echo "git pull" 11 echo "git pull"
12 - echo "./emsdk install latest"  
13 - echo "./emsdk activate latest" 12 + echo "./emsdk install 3.1.53"
  13 + echo "./emsdk activate 3.1.53"
14 echo "source ./emsdk_env.sh" 14 echo "source ./emsdk_env.sh"
15 exit 1 15 exit 1
16 else 16 else
@@ -16,8 +16,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then @@ -16,8 +16,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then
16 echo "git clone https://github.com/emscripten-core/emsdk.git" 16 echo "git clone https://github.com/emscripten-core/emsdk.git"
17 echo "cd emsdk" 17 echo "cd emsdk"
18 echo "git pull" 18 echo "git pull"
19 - echo "./emsdk install latest"  
20 - echo "./emsdk activate latest" 19 + echo "./emsdk install 3.1.53"
  20 + echo "./emsdk activate 3.1.53"
21 echo "source ./emsdk_env.sh" 21 echo "source ./emsdk_env.sh"
22 exit 1 22 exit 1
23 else 23 else
  1 +#!/usr/bin/env bash
  2 +# Copyright (c) 2024 Xiaomi Corporation
  3 +#
  4 +# This script is to build sherpa-onnx for WebAssembly (speaker diarization)
  5 +
  6 +set -ex
  7 +
  8 +if [ x"$EMSCRIPTEN" == x"" ]; then
  9 + if ! command -v emcc &> /dev/null; then
  10 + echo "Please install emscripten first"
  11 + echo ""
  12 + echo "You can use the following commands to install it:"
  13 + echo ""
  14 + echo "git clone https://github.com/emscripten-core/emsdk.git"
  15 + echo "cd emsdk"
  16 + echo "git pull"
  17 + echo "./emsdk install 3.1.53"
  18 + echo "./emsdk activate 3.1.53"
  19 + echo "source ./emsdk_env.sh"
  20 + exit 1
  21 + else
  22 + EMSCRIPTEN=$(dirname $(realpath $(which emcc)))
  23 + fi
  24 +fi
  25 +
  26 +export EMSCRIPTEN=$EMSCRIPTEN
  27 +echo "EMSCRIPTEN: $EMSCRIPTEN"
  28 +if [ ! -f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then
  29 + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake"
  30 + echo "Please make sure you have installed emsdk correctly"
  31 + exit 1
  32 +fi
  33 +
  34 +mkdir -p build-wasm-simd-speaker-diarization
  35 +pushd build-wasm-simd-speaker-diarization
  36 +
  37 +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON
  38 +
  39 +cmake \
  40 + -DCMAKE_INSTALL_PREFIX=./install \
  41 + -DCMAKE_BUILD_TYPE=Release \
  42 + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \
  43 + \
  44 + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  45 + -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  46 + -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  47 + -DBUILD_SHARED_LIBS=OFF \
  48 + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  49 + -DSHERPA_ONNX_ENABLE_JNI=OFF \
  50 + -DSHERPA_ONNX_ENABLE_C_API=ON \
  51 + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
  52 + -DSHERPA_ONNX_ENABLE_GPU=OFF \
  53 + -DSHERPA_ONNX_ENABLE_WASM=ON \
  54 + -DSHERPA_ONNX_ENABLE_WASM_SPEAKER_DIARIZATION=ON \
  55 + -DSHERPA_ONNX_ENABLE_BINARY=OFF \
  56 + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \
  57 + ..
  58 +make -j2
  59 +make install
  60 +
  61 +ls -lh install/bin/wasm/speaker-diarization
@@ -14,8 +14,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then @@ -14,8 +14,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then
14 echo "git clone https://github.com/emscripten-core/emsdk.git" 14 echo "git clone https://github.com/emscripten-core/emsdk.git"
15 echo "cd emsdk" 15 echo "cd emsdk"
16 echo "git pull" 16 echo "git pull"
17 - echo "./emsdk install latest"  
18 - echo "./emsdk activate latest" 17 + echo "./emsdk install 3.1.53"
  18 + echo "./emsdk activate 3.1.53"
19 echo "source ./emsdk_env.sh" 19 echo "source ./emsdk_env.sh"
20 exit 1 20 exit 1
21 else 21 else
@@ -15,8 +15,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then @@ -15,8 +15,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then
15 echo "git clone https://github.com/emscripten-core/emsdk.git" 15 echo "git clone https://github.com/emscripten-core/emsdk.git"
16 echo "cd emsdk" 16 echo "cd emsdk"
17 echo "git pull" 17 echo "git pull"
18 - echo "./emsdk install latest"  
19 - echo "./emsdk activate latest" 18 + echo "./emsdk install 3.1.53"
  19 + echo "./emsdk activate 3.1.53"
20 echo "source ./emsdk_env.sh" 20 echo "source ./emsdk_env.sh"
21 exit 1 21 exit 1
22 else 22 else
@@ -14,8 +14,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then @@ -14,8 +14,8 @@ if [ x"$EMSCRIPTEN" == x"" ]; then
14 echo "git clone https://github.com/emscripten-core/emsdk.git" 14 echo "git clone https://github.com/emscripten-core/emsdk.git"
15 echo "cd emsdk" 15 echo "cd emsdk"
16 echo "git pull" 16 echo "git pull"
17 - echo "./emsdk install latest"  
18 - echo "./emsdk activate latest" 17 + echo "./emsdk install 3.1.53"
  18 + echo "./emsdk activate 3.1.53"
19 echo "source ./emsdk_env.sh" 19 echo "source ./emsdk_env.sh"
20 exit 1 20 exit 1
21 else 21 else
@@ -16,6 +16,11 @@ namespace SherpaOnnx @@ -16,6 +16,11 @@ namespace SherpaOnnx
16 _handle = new HandleRef(this, h); 16 _handle = new HandleRef(this, h);
17 } 17 }
18 18
  19 + public void SetConfig(OfflineSpeakerDiarizationConfig config)  // Forwards a new config to the native diarizer referenced by _handle.
  20 + {
  21 + SherpaOnnxOfflineSpeakerDiarizationSetConfig(_handle.Handle, ref config);  // config is handed to the P/Invoke entry point by reference
  22 + }
  23 +
19 public OfflineSpeakerDiarizationSegment[] Process(float[] samples) 24 public OfflineSpeakerDiarizationSegment[] Process(float[] samples)
20 { 25 {
21 IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcess(_handle.Handle, samples, samples.Length); 26 IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcess(_handle.Handle, samples, samples.Length);
@@ -117,6 +122,9 @@ namespace SherpaOnnx @@ -117,6 +122,9 @@ namespace SherpaOnnx
117 122
118 [DllImport(Dll.Filename)] 123 [DllImport(Dll.Filename)]
119 private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(IntPtr handle); 124 private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(IntPtr handle);
  125 +
  126 + [DllImport(Dll.Filename)]
  127 + private static extern void SherpaOnnxOfflineSpeakerDiarizationSetConfig(IntPtr handle, ref OfflineSpeakerDiarizationConfig config);
120 } 128 }
121 } 129 }
122 130
@@ -1276,6 +1276,16 @@ func (sd *OfflineSpeakerDiarization) SampleRate() int { @@ -1276,6 +1276,16 @@ func (sd *OfflineSpeakerDiarization) SampleRate() int {
1276 return int(C.SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd.impl)) 1276 return int(C.SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd.impl))
1277 } 1277 }
1278 1278
  1279 +// only config.Clustering is used. All other fields are ignored
  1280 +func (sd *OfflineSpeakerDiarization) SetConfig(config *OfflineSpeakerDiarizationConfig) {
  1281 + c := C.struct_SherpaOnnxOfflineSpeakerDiarizationConfig{}
  1282 +
  1283 + c.clustering.num_clusters = C.int(config.Clustering.NumClusters)
  1284 + c.clustering.threshold = C.float(config.Clustering.Threshold)
  1285 +
  1286 + SherpaOnnxOfflineSpeakerDiarizationSetConfig(sd.impl, &c)
  1287 +}
  1288 +
1279 type OfflineSpeakerDiarizationSegment struct { 1289 type OfflineSpeakerDiarizationSegment struct {
1280 Start float32 1290 Start float32
1281 End float32 1291 End float32
@@ -25,6 +25,11 @@ class OfflineSpeakerDiarization { @@ -25,6 +25,11 @@ class OfflineSpeakerDiarization {
25 process(samples) { 25 process(samples) {
26 return addon.offlineSpeakerDiarizationProcess(this.handle, samples); 26 return addon.offlineSpeakerDiarizationProcess(this.handle, samples);
27 } 27 }
  28 +
  29 + setConfig(config) {
  30 + addon.offlineSpeakerDiarizationSetConfig(config);
  31 + this.config.clustering = config.clustering;
  32 + }
28 } 33 }
29 34
30 module.exports = { 35 module.exports = {
@@ -251,6 +251,46 @@ static Napi::Array OfflineSpeakerDiarizationProcessWrapper( @@ -251,6 +251,46 @@ static Napi::Array OfflineSpeakerDiarizationProcessWrapper(
251 return ans; 251 return ans;
252 } 252 }
253 253
  254 +static void OfflineSpeakerDiarizationSetConfigWrapper(
  255 + const Napi::CallbackInfo &info) {
  256 + Napi::Env env = info.Env();
  257 +
  258 + if (info.Length() != 2) {
  259 + std::ostringstream os;
  260 + os << "Expect only 2 arguments. Given: " << info.Length();
  261 +
  262 + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
  263 +
  264 + return;
  265 + }
  266 +
  267 + if (!info[0].IsExternal()) {
  268 + Napi::TypeError::New(
  269 + env, "Argument 0 should be an offline speaker diarization pointer.")
  270 + .ThrowAsJavaScriptException();
  271 +
  272 + return;
  273 + }
  274 +
  275 + const SherpaOnnxOfflineSpeakerDiarization *sd =
  276 + info[0].As<Napi::External<SherpaOnnxOfflineSpeakerDiarization>>().Data();
  277 +
  278 + if (!info[1].IsObject()) {
  279 + Napi::TypeError::New(env, "Expect an object as the argument")
  280 + .ThrowAsJavaScriptException();
  281 +
  282 + return;
  283 + }
  284 +
  285 + Napi::Object o = info[0].As<Napi::Object>();
  286 +
  287 + SherpaOnnxOfflineSpeakerDiarizationConfig c;
  288 + memset(&c, 0, sizeof(c));
  289 +
  290 + c.clustering = GetFastClusteringConfig(o);
  291 + SherpaOnnxOfflineSpeakerDiarizationSetConfig(sd, &c);
  292 +}
  293 +
254 void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) { 294 void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) {
255 exports.Set(Napi::String::New(env, "createOfflineSpeakerDiarization"), 295 exports.Set(Napi::String::New(env, "createOfflineSpeakerDiarization"),
256 Napi::Function::New(env, CreateOfflineSpeakerDiarizationWrapper)); 296 Napi::Function::New(env, CreateOfflineSpeakerDiarizationWrapper));
@@ -262,4 +302,8 @@ void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) { @@ -262,4 +302,8 @@ void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) {
262 exports.Set( 302 exports.Set(
263 Napi::String::New(env, "offlineSpeakerDiarizationProcess"), 303 Napi::String::New(env, "offlineSpeakerDiarizationProcess"),
264 Napi::Function::New(env, OfflineSpeakerDiarizationProcessWrapper)); 304 Napi::Function::New(env, OfflineSpeakerDiarizationProcessWrapper));
  305 +
  306 + exports.Set(
  307 + Napi::String::New(env, "offlineSpeakerDiarizationSetConfig"),
  308 + Napi::Function::New(env, OfflineSpeakerDiarizationSetConfigWrapper));
265 } 309 }
@@ -1749,6 +1749,20 @@ int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate( @@ -1749,6 +1749,20 @@ int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(
1749 return sd->impl->SampleRate(); 1749 return sd->impl->SampleRate();
1750 } 1750 }
1751 1751
  1752 +void SherpaOnnxOfflineSpeakerDiarizationSetConfig(
  1753 + const SherpaOnnxOfflineSpeakerDiarization *sd,
  1754 + const SherpaOnnxOfflineSpeakerDiarizationConfig *config) {  // C entry point: copies the clustering fields of *config into a C++ config and applies it
  1755 + sherpa_onnx::OfflineSpeakerDiarizationConfig sd_config;  // all non-clustering fields keep their default-constructed values
  1756 +
  1757 + sd_config.clustering.num_clusters =
  1758 + SHERPA_ONNX_OR(config->clustering.num_clusters, -1);  // NOTE(review): presumably falls back to -1 when the field is left at 0 - confirm against the macro
  1759 +
  1760 + sd_config.clustering.threshold =
  1761 + SHERPA_ONNX_OR(config->clustering.threshold, 0.5);  // NOTE(review): presumably falls back to 0.5 when the field is left at 0 - confirm against the macro
  1762 +
  1763 + sd->impl->SetConfig(sd_config);
  1764 +}
  1765 +
1752 int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers( 1766 int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers(
1753 const SherpaOnnxOfflineSpeakerDiarizationResult *r) { 1767 const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
1754 return r->impl.NumSpeakers(); 1768 return r->impl.NumSpeakers();
@@ -1449,6 +1449,11 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineSpeakerDiarization( @@ -1449,6 +1449,11 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineSpeakerDiarization(
1449 SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate( 1449 SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(
1450 const SherpaOnnxOfflineSpeakerDiarization *sd); 1450 const SherpaOnnxOfflineSpeakerDiarization *sd);
1451 1451
  1452 +// Only config->clustering is used. All other fields are ignored
  1453 +SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationSetConfig(
  1454 + const SherpaOnnxOfflineSpeakerDiarization *sd,
  1455 + const SherpaOnnxOfflineSpeakerDiarizationConfig *config);
  1456 +
1452 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationResult 1457 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationResult
1453 SherpaOnnxOfflineSpeakerDiarizationResult; 1458 SherpaOnnxOfflineSpeakerDiarizationResult;
1454 1459
@@ -20,6 +20,10 @@ class OfflineSpeakerDiarizationImpl { @@ -20,6 +20,10 @@ class OfflineSpeakerDiarizationImpl {
20 20
21 virtual int32_t SampleRate() const = 0; 21 virtual int32_t SampleRate() const = 0;
22 22
  23 + // Note: Only config.clustering is used. All other fields in config are
  24 + // ignored
  25 + virtual void SetConfig(const OfflineSpeakerDiarizationConfig &config) = 0;
  26 +
23 virtual OfflineSpeakerDiarizationResult Process( 27 virtual OfflineSpeakerDiarizationResult Process(
24 const float *audio, int32_t n, 28 const float *audio, int32_t n,
25 OfflineSpeakerDiarizationProgressCallback callback = nullptr, 29 OfflineSpeakerDiarizationProgressCallback callback = nullptr,
@@ -60,7 +60,7 @@ class OfflineSpeakerDiarizationPyannoteImpl @@ -60,7 +60,7 @@ class OfflineSpeakerDiarizationPyannoteImpl
60 : config_(config), 60 : config_(config),
61 segmentation_model_(config_.segmentation), 61 segmentation_model_(config_.segmentation),
62 embedding_extractor_(config_.embedding), 62 embedding_extractor_(config_.embedding),
63 - clustering_(config_.clustering) { 63 + clustering_(std::make_unique<FastClustering>(config_.clustering)) {
64 Init(); 64 Init();
65 } 65 }
66 66
@@ -70,6 +70,15 @@ class OfflineSpeakerDiarizationPyannoteImpl @@ -70,6 +70,15 @@ class OfflineSpeakerDiarizationPyannoteImpl
70 return meta_data.sample_rate; 70 return meta_data.sample_rate;
71 } 71 }
72 72
  73 + void SetConfig(const OfflineSpeakerDiarizationConfig &config) override {  // Replaces the clustering settings; other fields of config are not read here
  74 + if (!config.clustering.Validate()) {
  75 + SHERPA_ONNX_LOGE("Invalid clustering config. Skip it");
  76 + return;  // keep the current clusterer and config_ unchanged
  77 + }
  78 + clustering_ = std::make_unique<FastClustering>(config.clustering);  // rebuild the clusterer from the new settings
  79 + config_.clustering = config.clustering;  // only the clustering part of the stored config is updated
  80 + }
  81 +
73 OfflineSpeakerDiarizationResult Process( 82 OfflineSpeakerDiarizationResult Process(
74 const float *audio, int32_t n, 83 const float *audio, int32_t n,
75 OfflineSpeakerDiarizationProgressCallback callback = nullptr, 84 OfflineSpeakerDiarizationProgressCallback callback = nullptr,
@@ -105,7 +114,7 @@ class OfflineSpeakerDiarizationPyannoteImpl @@ -105,7 +114,7 @@ class OfflineSpeakerDiarizationPyannoteImpl
105 ComputeEmbeddings(audio, n, chunk_speaker_samples_list_pair.second, 114 ComputeEmbeddings(audio, n, chunk_speaker_samples_list_pair.second,
106 std::move(callback), callback_arg); 115 std::move(callback), callback_arg);
107 116
108 - std::vector<int32_t> cluster_labels = clustering_.Cluster( 117 + std::vector<int32_t> cluster_labels = clustering_->Cluster(
109 &embeddings(0, 0), embeddings.rows(), embeddings.cols()); 118 &embeddings(0, 0), embeddings.rows(), embeddings.cols());
110 119
111 int32_t max_cluster_index = 120 int32_t max_cluster_index =
@@ -636,7 +645,7 @@ class OfflineSpeakerDiarizationPyannoteImpl @@ -636,7 +645,7 @@ class OfflineSpeakerDiarizationPyannoteImpl
636 OfflineSpeakerDiarizationConfig config_; 645 OfflineSpeakerDiarizationConfig config_;
637 OfflineSpeakerSegmentationPyannoteModel segmentation_model_; 646 OfflineSpeakerSegmentationPyannoteModel segmentation_model_;
638 SpeakerEmbeddingExtractor embedding_extractor_; 647 SpeakerEmbeddingExtractor embedding_extractor_;
639 - FastClustering clustering_; 648 + std::unique_ptr<FastClustering> clustering_;
640 Matrix2DInt32 powerset_mapping_; 649 Matrix2DInt32 powerset_mapping_;
641 }; 650 };
642 651
@@ -79,6 +79,11 @@ int32_t OfflineSpeakerDiarization::SampleRate() const { @@ -79,6 +79,11 @@ int32_t OfflineSpeakerDiarization::SampleRate() const {
79 return impl_->SampleRate(); 79 return impl_->SampleRate();
80 } 80 }
81 81
  82 +void OfflineSpeakerDiarization::SetConfig(
  83 + const OfflineSpeakerDiarizationConfig &config) {
  84 + impl_->SetConfig(config);
  85 +}
  86 +
82 OfflineSpeakerDiarizationResult OfflineSpeakerDiarization::Process( 87 OfflineSpeakerDiarizationResult OfflineSpeakerDiarization::Process(
83 const float *audio, int32_t n, 88 const float *audio, int32_t n,
84 OfflineSpeakerDiarizationProgressCallback callback /*= nullptr*/, 89 OfflineSpeakerDiarizationProgressCallback callback /*= nullptr*/,
@@ -62,6 +62,10 @@ class OfflineSpeakerDiarization { @@ -62,6 +62,10 @@ class OfflineSpeakerDiarization {
62 // Expected sample rate of the input audio samples 62 // Expected sample rate of the input audio samples
63 int32_t SampleRate() const; 63 int32_t SampleRate() const;
64 64
  65 + // Note: Only config.clustering is used. All other fields in config are
  66 + // ignored
  67 + void SetConfig(const OfflineSpeakerDiarizationConfig &config);
  68 +
65 OfflineSpeakerDiarizationResult Process( 69 OfflineSpeakerDiarizationResult Process(
66 const float *audio, int32_t n, 70 const float *audio, int32_t n,
67 OfflineSpeakerDiarizationProgressCallback callback = nullptr, 71 OfflineSpeakerDiarizationProgressCallback callback = nullptr,
@@ -68,6 +68,7 @@ void PybindOfflineSpeakerDiarization(py::module *m) { @@ -68,6 +68,7 @@ void PybindOfflineSpeakerDiarization(py::module *m) {
68 .def(py::init<const OfflineSpeakerDiarizationConfig &>(), 68 .def(py::init<const OfflineSpeakerDiarizationConfig &>(),
69 py::arg("config")) 69 py::arg("config"))
70 .def_property_readonly("sample_rate", &PyClass::SampleRate) 70 .def_property_readonly("sample_rate", &PyClass::SampleRate)
  71 + .def("set_config", &PyClass::SetConfig, py::arg("config"))
71 .def( 72 .def(
72 "process", 73 "process",
73 [](const PyClass &self, const std::vector<float> samples, 74 [](const PyClass &self, const std::vector<float> samples,
@@ -1161,6 +1161,11 @@ class SherpaOnnxOfflineSpeakerDiarizationWrapper { @@ -1161,6 +1161,11 @@ class SherpaOnnxOfflineSpeakerDiarizationWrapper {
1161 return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl)) 1161 return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl))
1162 } 1162 }
1163 1163
  1164 + // only config.clustering is used. All other fields are ignored
  1165 + func setConfig(config: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationConfig>!) {
  1166 + SherpaOnnxOfflineSpeakerDiarizationSetConfig(impl, config)
  1167 + }
  1168 +
1164 func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] { 1169 func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] {
1165 let result = SherpaOnnxOfflineSpeakerDiarizationProcess( 1170 let result = SherpaOnnxOfflineSpeakerDiarizationProcess(
1166 impl, samples, Int32(samples.count)) 1171 impl, samples, Int32(samples.count))
@@ -18,6 +18,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR) @@ -18,6 +18,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR)
18 add_subdirectory(vad-asr) 18 add_subdirectory(vad-asr)
19 endif() 19 endif()
20 20
  21 +if(SHERPA_ONNX_ENABLE_WASM_SPEAKER_DIARIZATION)
  22 + add_subdirectory(speaker-diarization)
  23 +endif()
  24 +
21 if(SHERPA_ONNX_ENABLE_WASM_NODEJS) 25 if(SHERPA_ONNX_ENABLE_WASM_NODEJS)
22 add_subdirectory(nodejs) 26 add_subdirectory(nodejs)
23 endif() 27 endif()
  1 +if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH})
  2 + message(FATAL_ERROR "Please use ./build-wasm-simd-speaker-diarization.sh to build for WASM for speaker diarization")
  3 +endif()
  4 +
  5 +if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/segmentation.onnx" OR NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/embedding.onnx")
  6 + message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue")
  7 +endif()
  8 +
  9 +set(exported_functions
  10 + MyPrint
  11 + SherpaOnnxCreateOfflineSpeakerDiarization
  12 + SherpaOnnxDestroyOfflineSpeakerDiarization
  13 + SherpaOnnxOfflineSpeakerDiarizationDestroyResult
  14 + SherpaOnnxOfflineSpeakerDiarizationDestroySegment
  15 + SherpaOnnxOfflineSpeakerDiarizationGetSampleRate
  16 + SherpaOnnxOfflineSpeakerDiarizationProcess
  17 + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback
  18 + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments
  19 + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime
  20 + SherpaOnnxOfflineSpeakerDiarizationSetConfig
  21 +)
  22 +set(mangled_exported_functions)
  23 +foreach(x IN LISTS exported_functions)
  24 + list(APPEND mangled_exported_functions "_${x}")
  25 +endforeach()
  26 +list(JOIN mangled_exported_functions "," all_exported_functions)
  27 +
  28 +
  29 +include_directories(${CMAKE_SOURCE_DIR})
  30 +set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=512MB -s ALLOW_MEMORY_GROWTH=1")
  31 +string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB
  32 +string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ")
  33 +string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ")
  34 +string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ")
  35 +
  36 +message(STATUS "MY_FLAGS: ${MY_FLAGS}")
  37 +
  38 +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}")
  39 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}")
  40 +set(CMAKE_EXECUTBLE_LINKER_FLAGS "${CMAKE_EXECUTBLE_LINKER_FLAGS} ${MY_FLAGS}")
  41 +
  42 +if (NOT CMAKE_EXECUTABLE_SUFFIX STREQUAL ".js")
  43 + message(FATAL_ERROR "The default suffix for building executables should be .js!")
  44 +endif()
  45 +# set(CMAKE_EXECUTABLE_SUFFIX ".html")
  46 +
  47 +add_executable(sherpa-onnx-wasm-main-speaker-diarization sherpa-onnx-wasm-main-speaker-diarization.cc)
  48 +target_link_libraries(sherpa-onnx-wasm-main-speaker-diarization sherpa-onnx-c-api)
  49 +install(TARGETS sherpa-onnx-wasm-main-speaker-diarization DESTINATION bin/wasm/speaker-diarization)
  50 +
  51 +install(
  52 + FILES
  53 + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-speaker-diarization>/sherpa-onnx-wasm-main-speaker-diarization.js"
  54 + "index.html"
  55 + "sherpa-onnx-speaker-diarization.js"
  56 + "app-speaker-diarization.js"
  57 + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-speaker-diarization>/sherpa-onnx-wasm-main-speaker-diarization.wasm"
  58 + "$<TARGET_FILE_DIR:sherpa-onnx-wasm-main-speaker-diarization>/sherpa-onnx-wasm-main-speaker-diarization.data"
  59 + DESTINATION
  60 + bin/wasm/speaker-diarization
  61 +)
  1 +const startBtn = document.getElementById('startBtn');
  2 +const hint = document.getElementById('hint');
  3 +const numClustersInput = document.getElementById('numClustersInputID');
  4 +const thresholdInput = document.getElementById('thresholdInputID');
  5 +const textArea = document.getElementById('text');
  6 +
  7 +const fileSelectCtrl = document.getElementById('file');
  8 +
  9 +let sd = null;
  10 +let float32Samples = null;
  11 +
// Emscripten looks up the global `Module` object and invokes its
// onRuntimeInitialized hook once the wasm runtime (and the preloaded model
// files) are ready. At that point we create the diarizer and unlock the file
// picker.
Module = {
  onRuntimeInitialized() {
    console.log('Model files downloaded!');
    console.log('Initializing speaker diarization ......');

    sd = createOfflineSpeakerDiarization(Module);
    console.log('sampleRate', sd.sampleRate);

    const readyMessage =
        'Initialized! Please select a wave file and click the Start button.';
    hint.innerText = readyMessage;

    // The file input starts out disabled in index.html.
    fileSelectCtrl.disabled = false;
  },
};
  25 +
// Handler for the file <input>.
//
// Reads the selected file, decodes it with an AudioContext created at the
// diarizer's sample rate (sd.sampleRate) and stores channel 0 of the decoded
// audio in the global `float32Samples`. The Start button is enabled only
// after decoding succeeded.
function onFileChange() {
  const files = document.getElementById('file').files;

  // Forget the previously decoded audio right away so that clicking Start
  // while the new file is still being decoded cannot process stale samples.
  float32Samples = null;
  startBtn.disabled = true;

  if (files.length == 0) {
    console.log('No file selected');
    return;
  }
  textArea.value = '';

  console.log('files: ' + files);

  const file = files[0];
  console.log(file);
  console.log('file.name ' + file.name);
  console.log('file.type ' + file.type);
  console.log('file.size ' + file.size);

  const audioCtx = new AudioContext({sampleRate: sd.sampleRate});

  // The context is only needed for decoding. Close it afterwards; otherwise
  // every file selection leaves another live AudioContext behind.
  function releaseContext() {
    audioCtx.close();
  }

  function decodedDone(decoded) {
    float32Samples = decoded.getChannelData(0);
    startBtn.disabled = false;
    releaseContext();
  }

  function decodeFailed(err) {
    console.log('Failed to decode ' + file.name, err);
    releaseContext();
  }

  const reader = new FileReader();
  reader.onload = function() {
    console.log('reading file!');
    audioCtx.decodeAudioData(reader.result, decodedDone, decodeFailed);
  };
  reader.onerror = function() {
    console.log('Failed to read ' + file.name, reader.error);
    releaseContext();
  };

  reader.readAsArrayBuffer(file);
}
  62 +
// Click handler for the Start button.
//
// Validates the "Number of speakers" input and, when that number is not
// positive, the "Clustering threshold" input. It then pushes both values to
// the diarizer via sd.setConfig(), runs sd.process() on the decoded samples
// and prints one "start -- end speaker_N" line per segment in the text area.
startBtn.onclick = function() {
  textArea.value = '';
  if (float32Samples == null) {
    alert('Empty audio samples!');

    startBtn.disabled = true;
    return;
  }

  let numClusters = numClustersInput.value.trim();
  if (numClusters.length == 0) {
    alert(
        'Please provide numClusters. Use -1 if you are not sure how many speakers are there');
    return;
  }

  // An optional leading '-' must be accepted: -1 is the documented value for
  // "number of speakers unknown" and is also the default of the input field.
  if (!numClusters.match(/^-?\d+$/)) {
    alert(`number of clusters ${
        numClusters} is not an integer .\nPlease enter an integer`);
    return;
  }
  numClusters = parseInt(numClusters, 10);
  if (numClusters < -1) {
    alert(`Number of clusters should be >= -1`);
    return;
  }

  // The threshold is only read from the page when the number of speakers is
  // not given; otherwise the fixed value below is passed along.
  let threshold = 0.5;
  if (numClusters <= 0) {
    const thresholdText = thresholdInput.value;
    if (thresholdText.trim().length == 0) {
      alert('Please provide a threshold.');
      return;
    }

    threshold = parseFloat(thresholdText);
    // parseFloat() yields NaN for non-numeric text; NaN < 0 is false, so it
    // has to be rejected explicitly.
    if (isNaN(threshold) || threshold < 0) {
      alert(`Pleaser enter a positive threshold`);
      return;
    }
  }

  const config = sd.config;
  config.clustering = {numClusters: numClusters, threshold: threshold};
  sd.setConfig(config);

  const segments = sd.process(float32Samples);
  if (segments == null || segments.length == 0) {
    textArea.value = 'No speakers detected';
    return;
  }

  const lines = [];
  for (const seg of segments) {
    // clang-format off
    lines.push(`${seg.start.toFixed(2)} -- ${seg.end.toFixed(2)} speaker_${seg.speaker}`);
    // clang-format on
  }
  textArea.value = lines.join('\n');
};
  1 +# Introduction
  2 +
  3 +Please refer to
  4 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  5 +to download a speaker segmentation model
  6 +and
  7 +refer to
  8 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  9 +to download a speaker embedding extraction model.
  10 +
  11 +Remember to rename the downloaded files.
  12 +
  13 +The following is an example.
  14 +
  15 +
  16 +```bash
  17 +cd wasm/speaker-diarization/assets/
  18 +
  19 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  20 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  21 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  22 +cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx
  23 +rm -rf sherpa-onnx-pyannote-segmentation-3-0
  24 +
  25 +
  26 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  27 +mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx
  28 +
  29 +
  30 +```
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width" />
  <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Speaker Diarization</title>
  <style>
    h1,div {
      text-align: center;
    }
    textarea {
      width:100%;
    }
  </style>
</head>

<body>
  <h1>
    Next-gen Kaldi + WebAssembly<br/>
    Speaker Diarization <br> with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a>
  </h1>
  <div>
    <span id="hint">Loading model ... ...</span>
    <br/>
    <br/>
    <!-- A label's "for" attribute must name the id of its control. -->
    <label for="file">Choose a wav file:</label>
    <input type="file" id="file" accept=".wav" onchange="onFileChange()" disabled />
    <br/>
    <br/>
    <label for="numClustersInputID" id="numClustersID">Number of speakers: </label>
    <input type="text" id="numClustersInputID" name="numClusters" value="-1" />
    <br/>
    <br/>
    <label for="thresholdInputID" id="thresholdID">Clustering threshold: </label>
    <input type="text" id="thresholdInputID" name="clusteringThreshold" value="0.5" />
    <br/>
    <br/>

    <textarea id="text" rows="10" placeholder="If you know the actual number of speakers in the input wave file, please provide it via Number of speakers. Otherwise, please leave Number of speakers to -1 and provide Clustering threshold instead. A larger threshold leads to fewer clusters, i.e., fewer speakers; a smaller threshold leads to more clusters, i.e., more speakers."></textarea>
    <br/>
    <br/>
    <button id="startBtn" disabled>Start</button>
  </div>

  <!--
    Order matters: app-speaker-diarization.js defines the global Module
    object, which must exist before the Emscripten-generated script runs.
  -->
  <script src="app-speaker-diarization.js"></script>
  <script src="sherpa-onnx-speaker-diarization.js"></script>
  <script src="sherpa-onnx-wasm-main-speaker-diarization.js"></script>
</body>

</html>
  1 +
// Releases the wasm-heap memory owned by a marshalled config object.
//
// `config` is an object produced by one of the init*Config() helpers: it
// always has `ptr`, may have a string `buffer`, and may embed child config
// objects under `config`, `segmentation`, `embedding` or `clustering`.
// The string buffer is freed first, then every child (recursively), and
// finally the struct itself.
function freeConfig(config, Module) {
  if ('buffer' in config) {
    Module._free(config.buffer);
  }

  const childKeys = ['config', 'segmentation', 'embedding', 'clustering'];
  for (const key of childKeys) {
    if (key in config) {
      freeConfig(config[key], Module);
    }
  }

  Module._free(config.ptr);
}
  25 +
// Marshals {model: string} into the wasm heap.
//
// Allocates a NUL-terminated UTF-8 copy of config.model (empty string when
// absent) plus a 4-byte struct whose only field is the pointer to that
// string. Returns {buffer, ptr, len}; both allocations must later be released
// by the caller.
function initSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(
    config, Module) {
  const model = config.model || '';
  const modelBytes = Module.lengthBytesUTF8(model) + 1;  // +1 for '\0'

  const stringStorage = Module._malloc(modelBytes);

  const structSize = 1 * 4;  // a single char* field
  const structPtr = Module._malloc(structSize);

  Module.stringToUTF8(model, stringStorage, modelBytes);
  Module.setValue(structPtr, stringStorage, 'i8*');

  return {buffer: stringStorage, ptr: structPtr, len: structSize};
}
  46 +
// Marshals the segmentation model config into the wasm heap.
//
// Struct layout written here (4-byte fields):
//   [pyannote config][numThreads: i32][debug: i32][provider: char*]
//
// config.pyannote defaults to {model: ''} (and is stored back on `config`),
// numThreads defaults to 1, provider to 'cpu'. debug defaults to 1 when it is
// not specified; an explicit falsy value such as 0 or false now really turns
// debugging off (previously `config.debug || 1` forced it back to 1).
//
// Returns {buffer, ptr, len, config} where `buffer` holds the provider string
// and `config` is the marshalled pyannote sub-config. The caller owns all of
// them.
function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) {
  if (!('pyannote' in config)) {
    config.pyannote = {
      model: '',
    };
  }

  const pyannote = initSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(
      config.pyannote, Module);

  const len = pyannote.len + 3 * 4;
  const ptr = Module._malloc(len);

  let offset = 0;
  Module._CopyHeap(pyannote.ptr, pyannote.len, ptr + offset);
  offset += pyannote.len;

  Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
  offset += 4;

  // Only fall back to the default when debug is absent.
  const debug = (config.debug == null) ? 1 : (config.debug ? 1 : 0);
  Module.setValue(ptr + offset, debug, 'i32');
  offset += 4;

  const provider = config.provider || 'cpu';
  const providerLen = Module.lengthBytesUTF8(provider) + 1;
  const buffer = Module._malloc(providerLen);
  Module.stringToUTF8(provider, buffer, providerLen);
  Module.setValue(ptr + offset, buffer, 'i8*');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
    config: pyannote,
  };
}
  82 +
// Marshals the speaker embedding extractor config into the wasm heap.
//
// Struct layout written here (four 4-byte fields):
//   [model: char*][numThreads: i32][debug: i32][provider: char*]
//
// Both strings live back to back in one allocation (`buffer`): the model
// path first, the provider name right after its terminating '\0'.
// model defaults to '', provider to 'cpu', numThreads to 1. debug defaults
// to 1 when it is not specified; an explicit falsy value such as 0 or false
// now really turns debugging off (previously `config.debug || 1` forced it
// back to 1).
//
// Returns {buffer, ptr, len}; the caller owns both allocations.
function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) {
  const model = config.model || '';
  const provider = config.provider || 'cpu';

  const modelLen = Module.lengthBytesUTF8(model) + 1;
  const providerLen = Module.lengthBytesUTF8(provider) + 1;
  const n = modelLen + providerLen;
  const buffer = Module._malloc(n);

  const len = 4 * 4;
  const ptr = Module._malloc(len);

  Module.stringToUTF8(model, buffer, modelLen);
  Module.stringToUTF8(provider, buffer + modelLen, providerLen);

  let offset = 0;
  Module.setValue(ptr + offset, buffer, 'i8*');
  offset += 4;

  Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
  offset += 4;

  // Only fall back to the default when debug is absent.
  const debug = (config.debug == null) ? 1 : (config.debug ? 1 : 0);
  Module.setValue(ptr + offset, debug, 'i32');
  offset += 4;

  Module.setValue(ptr + offset, buffer + modelLen, 'i8*');
  offset += 4;

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
  118 +
// Marshals the clustering config into the wasm heap.
//
// Struct layout written here (two 4-byte fields):
//   [numClusters: i32][threshold: float]
//
// numClusters falls back to -1 when it is missing or 0. threshold falls back
// to 0.5 only when it is missing (or NaN); an explicit threshold of 0 is
// passed through unchanged (previously `config.threshold || 0.5` silently
// replaced it with 0.5).
//
// Returns {ptr, len}; the caller owns the allocation.
function initSherpaOnnxFastClusteringConfig(config, Module) {
  const len = 2 * 4;
  const ptr = Module._malloc(len);

  let offset = 0;
  Module.setValue(ptr + offset, config.numClusters || -1, 'i32');
  offset += 4;

  const hasThreshold =
      config.threshold != null && !Number.isNaN(config.threshold);
  const threshold = hasThreshold ? config.threshold : 0.5;
  Module.setValue(ptr + offset, threshold, 'float');
  offset += 4;

  return {
    ptr: ptr,
    len: len,
  };
}
  135 +
// Marshals a complete speaker diarization config into the wasm heap.
//
// Struct layout written here:
//   [segmentation config][embedding config][clustering config]
//   [minDurationOn: float][minDurationOff: float]
//
// Missing `segmentation`, `embedding` or `clustering` sections are filled in
// with defaults and stored back on `config`. minDurationOn falls back to 0.2
// and minDurationOff to 0.5 only when they are missing (or NaN); an explicit
// 0 is passed through unchanged (previously `|| default` replaced it).
//
// Returns {ptr, len, segmentation, embedding, clustering}. The sub-objects
// are the marshalled sections, which the caller must release together with
// `ptr`.
function initSherpaOnnxOfflineSpeakerDiarizationConfig(config, Module) {
  if (!('segmentation' in config)) {
    config.segmentation = {
      pyannote: {model: ''},
      numThreads: 1,
      debug: 0,
      provider: 'cpu',
    };
  }

  if (!('embedding' in config)) {
    config.embedding = {
      model: '',
      numThreads: 1,
      debug: 0,
      provider: 'cpu',
    };
  }

  if (!('clustering' in config)) {
    config.clustering = {
      numClusters: -1,
      threshold: 0.5,
    };
  }

  const segmentation = initSherpaOnnxOfflineSpeakerSegmentationModelConfig(
      config.segmentation, Module);

  const embedding =
      initSherpaOnnxSpeakerEmbeddingExtractorConfig(config.embedding, Module);

  const clustering =
      initSherpaOnnxFastClusteringConfig(config.clustering, Module);

  const len = segmentation.len + embedding.len + clustering.len + 2 * 4;
  const ptr = Module._malloc(len);

  // The sub-structs are embedded by value, so copy their bytes in order.
  let offset = 0;
  Module._CopyHeap(segmentation.ptr, segmentation.len, ptr + offset);
  offset += segmentation.len;

  Module._CopyHeap(embedding.ptr, embedding.len, ptr + offset);
  offset += embedding.len;

  Module._CopyHeap(clustering.ptr, clustering.len, ptr + offset);
  offset += clustering.len;

  // Use the fallback only for an absent (or NaN) value, never for 0.
  const valueOr = (value, fallback) =>
      (value == null || Number.isNaN(value)) ? fallback : value;

  Module.setValue(ptr + offset, valueOr(config.minDurationOn, 0.2), 'float');
  offset += 4;

  Module.setValue(ptr + offset, valueOr(config.minDurationOff, 0.5), 'float');
  offset += 4;

  return {
    ptr: ptr,
    len: len,
    segmentation: segmentation,
    embedding: embedding,
    clustering: clustering,
  };
}
  195 +
// JavaScript wrapper around the sherpa-onnx C API for offline speaker
// diarization.
//
// Properties:
//   handle     - pointer returned by SherpaOnnxCreateOfflineSpeakerDiarization
//   sampleRate - value of SherpaOnnxOfflineSpeakerDiarizationGetSampleRate
//   Module     - the Emscripten module all calls go through
//   config     - the plain JS config object the instance was created with
class OfflineSpeakerDiarization {
  // configObj: plain JS config (see createOfflineSpeakerDiarization).
  // Module: the Emscripten module.
  constructor(configObj, Module) {
    const config =
        initSherpaOnnxOfflineSpeakerDiarizationConfig(configObj, Module);
    // Module._MyPrint(config.ptr);

    const handle =
        Module._SherpaOnnxCreateOfflineSpeakerDiarization(config.ptr);

    // The marshalled copy is only needed for the create call.
    freeConfig(config, Module);

    this.handle = handle;
    this.sampleRate =
        Module._SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(this.handle);
    this.Module = Module;

    this.config = configObj;
  }

  // Destroys the underlying C object. The instance must not be used
  // afterwards.
  free() {
    this.Module._SherpaOnnxDestroyOfflineSpeakerDiarization(this.handle);
    this.handle = 0;
  }

  // Passes configObj to SherpaOnnxOfflineSpeakerDiarizationSetConfig and
  // remembers its clustering section. Does nothing when configObj has no
  // `clustering` entry.
  setConfig(configObj) {
    if (!('clustering' in configObj)) {
      return;
    }

    const config =
        initSherpaOnnxOfflineSpeakerDiarizationConfig(configObj, this.Module);

    this.Module._SherpaOnnxOfflineSpeakerDiarizationSetConfig(
        this.handle, config.ptr);

    // Free through the module this instance was created with rather than
    // through a global named `Module`, which need not exist (e.g. in Node.js).
    freeConfig(config, this.Module);

    this.config.clustering = configObj.clustering;
  }

  // Runs diarization on `samples` (a Float32Array) and returns an array of
  // {start, end, speaker} objects in the order produced by
  // SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime. Returns an
  // empty array when the C API hands back a null result.
  process(samples) {
    const pointer =
        this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT);
    this.Module.HEAPF32.set(samples, pointer / samples.BYTES_PER_ELEMENT);

    const r = this.Module._SherpaOnnxOfflineSpeakerDiarizationProcess(
        this.handle, pointer, samples.length);
    this.Module._free(pointer);

    if (!r) {
      return [];
    }

    const numSegments =
        this.Module._SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r);

    const segments =
        this.Module._SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(
            r);

    const ans = [];

    // Each segment is read as three 4-byte fields:
    // float start, float end, int32 speaker.
    const sizeOfSegment = 3 * 4;
    for (let i = 0; i < numSegments; ++i) {
      const p = segments + i * sizeOfSegment;

      const start = this.Module.HEAPF32[p / 4 + 0];
      const end = this.Module.HEAPF32[p / 4 + 1];
      const speaker = this.Module.HEAP32[p / 4 + 2];

      ans.push({start: start, end: end, speaker: speaker});
    }

    this.Module._SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments);
    this.Module._SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r);

    return ans;
  }
}
  271 +
// Creates an OfflineSpeakerDiarization instance.
//
// Module:   the Emscripten module.
// myConfig: optional config object. When it is omitted (or falsy), a default
//           config is used that loads ./segmentation.onnx and
//           ./embedding.onnx.
function createOfflineSpeakerDiarization(Module, myConfig) {
  const defaultConfig = {
    segmentation: {
      pyannote: {model: './segmentation.onnx'},
    },
    embedding: {model: './embedding.onnx'},
    clustering: {numClusters: -1, threshold: 0.5},
    minDurationOn: 0.3,
    minDurationOff: 0.5,
  };

  // Select the config in an expression instead of re-assigning a `const`
  // binding, which threw a TypeError whenever myConfig was supplied.
  const config = myConfig ? myConfig : defaultConfig;

  return new OfflineSpeakerDiarization(config, Module);
}
  289 +
  290 +if (typeof process == 'object' && typeof process.versions == 'object' &&
  291 + typeof process.versions.node == 'string') {
  292 + module.exports = {
  293 + createOfflineSpeakerDiarization,
  294 + };
  295 +}
  1 +// wasm/sherpa-onnx-wasm-main-speaker-diarization.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +#include <stdio.h>
  5 +
  6 +#include <algorithm>
  7 +#include <memory>
  8 +
  9 +#include "sherpa-onnx/c-api/c-api.h"
  10 +
  11 +// see also
  12 +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html
  13 +
  14 +extern "C" {
  15 +
  16 +static_assert(sizeof(SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig) ==
  17 + 1 * 4,
  18 + "");
  19 +
  20 +static_assert(
  21 + sizeof(SherpaOnnxOfflineSpeakerSegmentationModelConfig) ==
  22 + sizeof(SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig) + 3 * 4,
  23 + "");
  24 +
  25 +static_assert(sizeof(SherpaOnnxFastClusteringConfig) == 2 * 4, "");
  26 +
  27 +static_assert(sizeof(SherpaOnnxSpeakerEmbeddingExtractorConfig) == 4 * 4, "");
  28 +
  29 +static_assert(sizeof(SherpaOnnxOfflineSpeakerDiarizationConfig) ==
  30 + sizeof(SherpaOnnxOfflineSpeakerSegmentationModelConfig) +
  31 + sizeof(SherpaOnnxSpeakerEmbeddingExtractorConfig) +
  32 + sizeof(SherpaOnnxFastClusteringConfig) + 2 * 4,
  33 + "");
  34 +
  35 +void MyPrint(const SherpaOnnxOfflineSpeakerDiarizationConfig *sd_config) {
  36 + const auto &segmentation = sd_config->segmentation;
  37 + const auto &embedding = sd_config->embedding;
  38 + const auto &clustering = sd_config->clustering;
  39 +
  40 + fprintf(stdout, "----------segmentation config----------\n");
  41 + fprintf(stdout, "pyannote model: %s\n", segmentation.pyannote.model);
  42 + fprintf(stdout, "num threads: %d\n", segmentation.num_threads);
  43 + fprintf(stdout, "debug: %d\n", segmentation.debug);
  44 + fprintf(stdout, "provider: %s\n", segmentation.provider);
  45 +
  46 + fprintf(stdout, "----------embedding config----------\n");
  47 + fprintf(stdout, "model: %s\n", embedding.model);
  48 + fprintf(stdout, "num threads: %d\n", embedding.num_threads);
  49 + fprintf(stdout, "debug: %d\n", embedding.debug);
  50 + fprintf(stdout, "provider: %s\n", embedding.provider);
  51 +
  52 + fprintf(stdout, "----------clustering config----------\n");
  53 + fprintf(stdout, "num_clusters: %d\n", clustering.num_clusters);
  54 + fprintf(stdout, "threshold: %.3f\n", clustering.threshold);
  55 +
  56 + fprintf(stdout, "min_duration_on: %.3f\n", sd_config->min_duration_on);
  57 + fprintf(stdout, "min_duration_off: %.3f\n", sd_config->min_duration_off);
  58 +}
  59 +
// Copies num_bytes bytes starting at src to dst. It is exported so that the
// JavaScript side can copy one region of the wasm heap into another.
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
  std::copy_n(src, num_bytes, dst);
}
  63 +}