Fangjun Kuang
Committed by GitHub

Swift API for speaker diarization (#1404)

@@ -7,6 +7,11 @@ echo "pwd: $PWD" @@ -7,6 +7,11 @@ echo "pwd: $PWD"
7 cd swift-api-examples 7 cd swift-api-examples
8 ls -lh 8 ls -lh
9 9
  10 +./run-speaker-diarization.sh
  11 +rm -rf *.onnx
  12 +rm -rf sherpa-onnx-pyannote-segmentation-3-0
  13 +rm -fv *.wav
  14 +
10 ./run-add-punctuations.sh 15 ./run-add-punctuations.sh
11 rm ./add-punctuations 16 rm ./add-punctuations
12 rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12 17 rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12
@@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper { @@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper {
1078 return ans 1078 return ans
1079 } 1079 }
1080 } 1080 }
  1081 +
  1082 +func sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: String)
  1083 + -> SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
  1084 +{
  1085 + return SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: toCPointer(model))
  1086 +}
  1087 +
  1088 +func sherpaOnnxOfflineSpeakerSegmentationModelConfig(
  1089 + pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig,
  1090 + numThreads: Int = 1,
  1091 + debug: Int = 0,
  1092 + provider: String = "cpu"
  1093 +) -> SherpaOnnxOfflineSpeakerSegmentationModelConfig {
  1094 + return SherpaOnnxOfflineSpeakerSegmentationModelConfig(
  1095 + pyannote: pyannote,
  1096 + num_threads: Int32(numThreads),
  1097 + debug: Int32(debug),
  1098 + provider: toCPointer(provider)
  1099 + )
  1100 +}
  1101 +
  1102 +func sherpaOnnxFastClusteringConfig(numClusters: Int = -1, threshold: Float = 0.5)
  1103 + -> SherpaOnnxFastClusteringConfig
  1104 +{
  1105 + return SherpaOnnxFastClusteringConfig(num_clusters: Int32(numClusters), threshold: threshold)
  1106 +}
  1107 +
  1108 +func sherpaOnnxSpeakerEmbeddingExtractorConfig(
  1109 + model: String,
  1110 + numThreads: Int = 1,
  1111 + debug: Int = 0,
  1112 + provider: String = "cpu"
  1113 +) -> SherpaOnnxSpeakerEmbeddingExtractorConfig {
  1114 + return SherpaOnnxSpeakerEmbeddingExtractorConfig(
  1115 + model: toCPointer(model),
  1116 + num_threads: Int32(numThreads),
  1117 + debug: Int32(debug),
  1118 + provider: toCPointer(provider)
  1119 + )
  1120 +}
  1121 +
  1122 +func sherpaOnnxOfflineSpeakerDiarizationConfig(
  1123 + segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig,
  1124 + embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig,
  1125 + clustering: SherpaOnnxFastClusteringConfig,
  1126 + minDurationOn: Float = 0.3,
  1127 + minDurationOff: Float = 0.5
  1128 +) -> SherpaOnnxOfflineSpeakerDiarizationConfig {
  1129 + return SherpaOnnxOfflineSpeakerDiarizationConfig(
  1130 + segmentation: segmentation,
  1131 + embedding: embedding,
  1132 + clustering: clustering,
  1133 + min_duration_on: minDurationOn,
  1134 + min_duration_off: minDurationOff
  1135 + )
  1136 +}
  1137 +
  1138 +struct SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper {
  1139 + var start: Float = 0
  1140 + var end: Float = 0
  1141 + var speaker: Int = 0
  1142 +}
  1143 +
  1144 +class SherpaOnnxOfflineSpeakerDiarizationWrapper {
  1145 + /// A pointer to the underlying counterpart in C
  1146 + let impl: OpaquePointer!
  1147 +
  1148 + init(
  1149 + config: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationConfig>!
  1150 + ) {
  1151 + impl = SherpaOnnxCreateOfflineSpeakerDiarization(config)
  1152 + }
  1153 +
  1154 + deinit {
  1155 + if let impl {
  1156 + SherpaOnnxDestroyOfflineSpeakerDiarization(impl)
  1157 + }
  1158 + }
  1159 +
  1160 + var sampleRate: Int {
  1161 + return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl))
  1162 + }
  1163 +
  1164 + func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] {
  1165 + let result = SherpaOnnxOfflineSpeakerDiarizationProcess(
  1166 + impl, samples, Int32(samples.count))
  1167 +
  1168 + if result == nil {
  1169 + return []
  1170 + }
  1171 +
  1172 + let numSegments = Int(SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result))
  1173 +
  1174 + let p: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationSegment>? =
  1175 + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result)
  1176 +
  1177 + if p == nil {
  1178 + return []
  1179 + }
  1180 +
  1181 + var ans: [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] = []
  1182 + for i in 0..<numSegments {
  1183 + ans.append(
  1184 + SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper(
  1185 + start: p![i].start, end: p![i].end, speaker: Int(p![i].speaker)))
  1186 + }
  1187 +
  1188 + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(p)
  1189 + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result)
  1190 +
  1191 + return ans
  1192 + }
  1193 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  4 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  5 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  6 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  7 +fi
  8 +
  9 +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  10 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  11 +fi
  12 +
  13 +if [ ! -f ./0-four-speakers-zh.wav ]; then
  14 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  15 +fi
  16 +
  17 +if [ ! -e ./speaker-diarization ]; then
  18 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  19 + swiftc \
  20 + -lc++ \
  21 + -I ../build-swift-macos/install/include \
  22 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  23 + ./speaker-diarization.swift ./SherpaOnnx.swift \
  24 + -L ../build-swift-macos/install/lib/ \
  25 + -l sherpa-onnx \
  26 + -l onnxruntime \
  27 + -o speaker-diarization
  28 +
  29 + strip speaker-diarization
  30 +else
  31 + echo "./speaker-diarization exists - skip building"
  32 +fi
  33 +
  34 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  35 +./speaker-diarization
  1 +import AVFoundation
  2 +
  3 +extension AudioBuffer {
  4 + func array() -> [Float] {
  5 + return Array(UnsafeBufferPointer(self))
  6 + }
  7 +}
  8 +
  9 +extension AVAudioPCMBuffer {
  10 + func array() -> [Float] {
  11 + return self.audioBufferList.pointee.mBuffers.array()
  12 + }
  13 +}
  14 +
  15 +func run() {
  16 + let segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"
  17 + let embeddingExtractorModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
  18 + let waveFilename = "./0-four-speakers-zh.wav"
  19 +
  20 + // There are 4 speakers in ./0-four-speakers-zh.wav, so we use 4 here
  21 + let numSpeakers = 4
  22 + var config = sherpaOnnxOfflineSpeakerDiarizationConfig(
  23 + segmentation: sherpaOnnxOfflineSpeakerSegmentationModelConfig(
  24 + pyannote: sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: segmentationModel)),
  25 + embedding: sherpaOnnxSpeakerEmbeddingExtractorConfig(model: embeddingExtractorModel),
  26 + clustering: sherpaOnnxFastClusteringConfig(numClusters: numSpeakers)
  27 + )
  28 +
  29 + let sd = SherpaOnnxOfflineSpeakerDiarizationWrapper(config: &config)
  30 +
  31 + let fileURL: NSURL = NSURL(fileURLWithPath: waveFilename)
  32 + let audioFile = try! AVAudioFile(forReading: fileURL as URL)
  33 +
  34 + let audioFormat = audioFile.processingFormat
  35 + assert(Int(audioFormat.sampleRate) == sd.sampleRate)
  36 + assert(audioFormat.channelCount == 1)
  37 + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
  38 +
  39 + let audioFrameCount = UInt32(audioFile.length)
  40 + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  41 +
  42 + try! audioFile.read(into: audioFileBuffer!)
  43 + let array: [Float]! = audioFileBuffer?.array()
  44 + print("Started!")
  45 + let segments = sd.process(samples: array)
  46 + for i in 0..<segments.count {
  47 + print("\(segments[i].start) -- \(segments[i].end) speaker_\(segments[i].speaker)")
  48 + }
  49 +}
  50 +
  51 +@main
  52 +struct App {
  53 + static func main() {
  54 + run()
  55 + }
  56 +}