Committed by
GitHub
Swift API for speaker diarization (#1404)
正在显示
4 个修改的文件
包含
209 行增加
和
0 行删除
| @@ -7,6 +7,11 @@ echo "pwd: $PWD" | @@ -7,6 +7,11 @@ echo "pwd: $PWD" | ||
| 7 | cd swift-api-examples | 7 | cd swift-api-examples |
| 8 | ls -lh | 8 | ls -lh |
| 9 | 9 | ||
| 10 | +./run-speaker-diarization.sh | ||
| 11 | +rm -rf *.onnx | ||
| 12 | +rm -rf sherpa-onnx-pyannote-segmentation-3-0 | ||
| 13 | +rm -fv *.wav | ||
| 14 | + | ||
| 10 | ./run-add-punctuations.sh | 15 | ./run-add-punctuations.sh |
| 11 | rm ./add-punctuations | 16 | rm ./add-punctuations |
| 12 | rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12 | 17 | rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12 |
| @@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper { | @@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper { | ||
| 1078 | return ans | 1078 | return ans |
| 1079 | } | 1079 | } |
| 1080 | } | 1080 | } |
| 1081 | + | ||
/// Builds the C configuration struct for a pyannote speaker segmentation model.
///
/// - Parameter model: Path to the segmentation model file; it is converted to a
///   C string with `toCPointer` before being stored in the struct.
/// - Returns: A `SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig` whose
///   `model` field refers to the converted path.
func sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: String)
  -> SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
{
  let modelPath = toCPointer(model)
  return SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: modelPath)
}
| 1087 | + | ||
/// Builds the C configuration struct for the speaker segmentation stage.
///
/// - Parameters:
///   - pyannote: Configuration of the pyannote segmentation model.
///   - numThreads: Stored in `num_threads` after conversion to `Int32`.
///   - debug: Stored in `debug` after conversion to `Int32`
///     (presumably non-zero enables debug output; confirm against the C API).
///   - provider: Execution provider name, converted to a C string with `toCPointer`.
/// - Returns: The populated `SherpaOnnxOfflineSpeakerSegmentationModelConfig`.
func sherpaOnnxOfflineSpeakerSegmentationModelConfig(
  pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxOfflineSpeakerSegmentationModelConfig {
  // Convert the Swift-friendly arguments to the types the C struct expects.
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)
  let providerName = toCPointer(provider)

  return SherpaOnnxOfflineSpeakerSegmentationModelConfig(
    pyannote: pyannote,
    num_threads: threadCount,
    debug: debugFlag,
    provider: providerName)
}
| 1101 | + | ||
/// Builds the C configuration struct for the clustering stage.
///
/// - Parameters:
///   - numClusters: Stored in `num_clusters` after conversion to `Int32`.
///     Defaults to -1 (presumably "number of speakers unknown, rely on
///     `threshold`"; confirm against the C API).
///   - threshold: Stored unchanged in `threshold`.
/// - Returns: The populated `SherpaOnnxFastClusteringConfig`.
func sherpaOnnxFastClusteringConfig(numClusters: Int = -1, threshold: Float = 0.5)
  -> SherpaOnnxFastClusteringConfig
{
  let clusterCount = Int32(numClusters)
  return SherpaOnnxFastClusteringConfig(
    num_clusters: clusterCount,
    threshold: threshold)
}
| 1107 | + | ||
/// Builds the C configuration struct for the speaker embedding extractor.
///
/// - Parameters:
///   - model: Path to the embedding extractor model file, converted to a C
///     string with `toCPointer`.
///   - numThreads: Stored in `num_threads` after conversion to `Int32`.
///   - debug: Stored in `debug` after conversion to `Int32`.
///   - provider: Execution provider name, converted to a C string with `toCPointer`.
/// - Returns: The populated `SherpaOnnxSpeakerEmbeddingExtractorConfig`.
func sherpaOnnxSpeakerEmbeddingExtractorConfig(
  model: String,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxSpeakerEmbeddingExtractorConfig {
  // Convert the Swift-friendly arguments to the types the C struct expects.
  let modelPath = toCPointer(model)
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)
  let providerName = toCPointer(provider)

  return SherpaOnnxSpeakerEmbeddingExtractorConfig(
    model: modelPath,
    num_threads: threadCount,
    debug: debugFlag,
    provider: providerName)
}
| 1121 | + | ||
/// Assembles the top-level C configuration struct for offline speaker diarization.
///
/// - Parameters:
///   - segmentation: Configuration of the segmentation stage.
///   - embedding: Configuration of the speaker embedding extractor.
///   - clustering: Configuration of the clustering stage.
///   - minDurationOn: Stored unchanged in `min_duration_on`
///     (presumably a duration in seconds; confirm against the C API).
///   - minDurationOff: Stored unchanged in `min_duration_off`
///     (presumably a duration in seconds; confirm against the C API).
/// - Returns: The populated `SherpaOnnxOfflineSpeakerDiarizationConfig`.
func sherpaOnnxOfflineSpeakerDiarizationConfig(
  segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig,
  embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig,
  clustering: SherpaOnnxFastClusteringConfig,
  minDurationOn: Float = 0.3,
  minDurationOff: Float = 0.5
) -> SherpaOnnxOfflineSpeakerDiarizationConfig {
  let config = SherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: segmentation,
    embedding: embedding,
    clustering: clustering,
    min_duration_on: minDurationOn,
    min_duration_off: minDurationOff)
  return config
}
| 1137 | + | ||
/// A Swift value copy of one diarization segment returned by the C API.
struct SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper {
  /// Start of the segment (presumably in seconds; confirm against the C API).
  var start: Float = 0.0

  /// End of the segment, in the same unit as `start`.
  var end: Float = 0.0

  /// Integer label of the speaker this segment is attributed to.
  var speaker: Int = 0
}
| 1143 | + | ||
/// Swift wrapper that owns an offline speaker diarization object created by the
/// C API and releases it on deinitialization.
class SherpaOnnxOfflineSpeakerDiarizationWrapper {
  /// A pointer to the underlying counterpart in C
  let impl: OpaquePointer!

  /// Creates the underlying C object from `config`.
  ///
  /// - Parameter config: Pointer to the diarization configuration; it is only
  ///   passed through to `SherpaOnnxCreateOfflineSpeakerDiarization`.
  init(
    config: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationConfig>!
  ) {
    impl = SherpaOnnxCreateOfflineSpeakerDiarization(config)
  }

  deinit {
    // `impl` may be nil if creation did not yield an object; only destroy a real one.
    if let impl {
      SherpaOnnxDestroyOfflineSpeakerDiarization(impl)
    }
  }

  /// The sample rate reported by the underlying C object.
  var sampleRate: Int {
    return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl))
  }

  /// Runs diarization on `samples` and returns the resulting segments.
  ///
  /// - Parameter samples: Audio samples to process; the array and its count are
  ///   passed directly to `SherpaOnnxOfflineSpeakerDiarizationProcess`.
  /// - Returns: One wrapper per segment, in the order delivered by
  ///   `SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime`, or an empty
  ///   array if the C API returns no result or no segment array.
  func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] {
    guard
      let result = SherpaOnnxOfflineSpeakerDiarizationProcess(
        impl, samples, Int32(samples.count))
    else {
      return []
    }
    // Release the C result on every exit path, including the early return below;
    // previously that path leaked the result.
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result) }

    let numSegments = Int(SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result))

    guard let segments = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result) else {
      return []
    }
    // Defers run in reverse order, so the segment array is destroyed before the
    // result, matching the original cleanup order.
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments) }

    // Copy each C segment into a Swift value before the C memory is released.
    return (0..<numSegments).map { index in
      let segment = segments[index]
      return SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper(
        start: segment.start, end: segment.end, speaker: Int(segment.speaker))
    }
  }
}
#!/usr/bin/env bash
#
# Downloads the models and the test wave file used by the speaker diarization
# example (when they are missing), builds ./speaker-diarization (when it is
# missing), and then runs it.

# Stop at the first failing command so that a failed download or build does not
# fall through to running a missing or stale binary.
set -e

# -f makes curl exit with an error on HTTP failures instead of saving the
# server's error page under the model's file name.
if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

if [ ! -e ./speaker-diarization ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./speaker-diarization.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o speaker-diarization

  strip speaker-diarization
else
  echo "./speaker-diarization exists - skip building"
fi

# Prepend the install directory; only append the previous value (with its
# separator) when it is non-empty, to avoid a trailing empty path entry.
export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
./speaker-diarization
swift-api-examples/speaker-diarization.swift
0 → 100644
| 1 | +import AVFoundation | ||
| 2 | + | ||
extension AudioBuffer {
  /// Copies the contents of this audio buffer into a Swift array, viewing the
  /// underlying memory as `Float` values.
  func array() -> [Float] {
    let samples = UnsafeBufferPointer<Float>(self)
    return [Float](samples)
  }
}
| 8 | + | ||
extension AVAudioPCMBuffer {
  /// Returns the samples held in the `mBuffers` entry of this buffer's
  /// `audioBufferList` as a `[Float]`.
  func array() -> [Float] {
    let buffer = audioBufferList.pointee.mBuffers
    return buffer.array()
  }
}
| 14 | + | ||
/// Runs the speaker diarization example: builds the configuration, loads
/// `./0-four-speakers-zh.wav`, processes it, and prints one line per segment.
///
/// Any setup failure (diarizer not created, file not readable, unexpected audio
/// format) terminates the program with a message describing what went wrong.
func run() {
  let segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"
  let embeddingExtractorModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
  let waveFilename = "./0-four-speakers-zh.wav"

  // There are 4 speakers in ./0-four-speakers-zh.wav, so we use 4 here
  let numSpeakers = 4
  var config = sherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: sherpaOnnxOfflineSpeakerSegmentationModelConfig(
      pyannote: sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: segmentationModel)),
    embedding: sherpaOnnxSpeakerEmbeddingExtractorConfig(model: embeddingExtractorModel),
    clustering: sherpaOnnxFastClusteringConfig(numClusters: numSpeakers)
  )

  let sd = SherpaOnnxOfflineSpeakerDiarizationWrapper(config: &config)
  // Do not hand a nil handle to the C API in the calls below.
  guard sd.impl != nil else {
    fatalError("Failed to create the speaker diarization object. Please check the model files.")
  }

  let audioFile: AVAudioFile
  do {
    audioFile = try AVAudioFile(forReading: URL(fileURLWithPath: waveFilename))
  } catch {
    fatalError("Failed to open \(waveFilename): \(error)")
  }

  // `precondition` (unlike `assert`) is also checked in optimized builds, so a
  // mismatching file can never be fed to the model silently.
  let audioFormat = audioFile.processingFormat
  precondition(
    Int(audioFormat.sampleRate) == sd.sampleRate,
    "Expected sample rate \(sd.sampleRate), got \(audioFormat.sampleRate)")
  precondition(
    audioFormat.channelCount == 1,
    "Expected a single channel, got \(audioFormat.channelCount)")
  precondition(
    audioFormat.commonFormat == .pcmFormatFloat32,
    "Expected 32-bit float samples")

  guard
    let audioFrameCount = AVAudioFrameCount(exactly: audioFile.length),
    let audioFileBuffer = AVAudioPCMBuffer(
      pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  else {
    fatalError("Failed to allocate a buffer for \(audioFile.length) frames of \(waveFilename)")
  }

  do {
    try audioFile.read(into: audioFileBuffer)
  } catch {
    fatalError("Failed to read \(waveFilename): \(error)")
  }

  let samples = audioFileBuffer.array()
  print("Started!")
  for segment in sd.process(samples: samples) {
    print("\(segment.start) -- \(segment.end) speaker_\(segment.speaker)")
  }
}
| 50 | + | ||
/// Program entry point for the speaker diarization example.
@main
struct App {
  /// Invokes `run()` and returns when it finishes.
  static func main() {
    run()
  }
}
-
请 注册 或 登录 后发表评论