Fangjun Kuang
Committed by GitHub

Swift API for speaker diarization (#1404)

... ... @@ -7,6 +7,11 @@ echo "pwd: $PWD"
cd swift-api-examples
ls -lh
./run-speaker-diarization.sh
rm -rf *.onnx
rm -rf sherpa-onnx-pyannote-segmentation-3-0
rm -fv *.wav
./run-add-punctuations.sh
rm ./add-punctuations
rm -rf sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12
... ...
... ... @@ -1078,3 +1078,116 @@ class SherpaOnnxOfflinePunctuationWrapper {
return ans
}
}
/// Wraps a model path in the C struct that configures the pyannote
/// segmentation model.
///
/// - Parameter model: Path to the pyannote segmentation ONNX model file.
/// - Returns: The populated C config struct.
func sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: String)
  -> SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig
{
  let modelPtr = toCPointer(model)
  return SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: modelPtr)
}
/// Assembles the offline speaker-segmentation model config for the C API.
///
/// - Parameters:
///   - pyannote: Config for the pyannote segmentation model.
///   - numThreads: Forwarded to the C config's `num_threads` field.
///   - debug: Forwarded to `debug`; non-zero presumably enables debug output.
///   - provider: Execution provider name, e.g. "cpu".
/// - Returns: The populated C config struct.
func sherpaOnnxOfflineSpeakerSegmentationModelConfig(
  pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxOfflineSpeakerSegmentationModelConfig {
  let threadCount = Int32(numThreads)
  let debugFlag = Int32(debug)
  return SherpaOnnxOfflineSpeakerSegmentationModelConfig(
    pyannote: pyannote,
    num_threads: threadCount,
    debug: debugFlag,
    provider: toCPointer(provider))
}
/// Builds the fast-clustering config used by speaker diarization.
///
/// - Parameters:
///   - numClusters: Forwarded as `num_clusters` (default -1).
///   - threshold: Forwarded unchanged (default 0.5).
/// - Returns: The populated C config struct.
func sherpaOnnxFastClusteringConfig(numClusters: Int = -1, threshold: Float = 0.5)
  -> SherpaOnnxFastClusteringConfig
{
  let clusterCount = Int32(numClusters)
  return SherpaOnnxFastClusteringConfig(num_clusters: clusterCount, threshold: threshold)
}
/// Builds the speaker-embedding extractor config for the C API.
///
/// - Parameters:
///   - model: Path to the speaker-embedding ONNX model file.
///   - numThreads: Forwarded to the C config's `num_threads` field.
///   - debug: Forwarded to `debug`; non-zero presumably enables debug output.
///   - provider: Execution provider name, e.g. "cpu".
/// - Returns: The populated C config struct.
func sherpaOnnxSpeakerEmbeddingExtractorConfig(
  model: String,
  numThreads: Int = 1,
  debug: Int = 0,
  provider: String = "cpu"
) -> SherpaOnnxSpeakerEmbeddingExtractorConfig {
  let modelPtr = toCPointer(model)
  let providerPtr = toCPointer(provider)
  return SherpaOnnxSpeakerEmbeddingExtractorConfig(
    model: modelPtr,
    num_threads: Int32(numThreads),
    debug: Int32(debug),
    provider: providerPtr)
}
/// Combines the segmentation, embedding and clustering configs into the
/// top-level offline speaker-diarization config.
///
/// - Parameters:
///   - segmentation: Speaker-segmentation model config.
///   - embedding: Speaker-embedding extractor config.
///   - clustering: Fast-clustering config.
///   - minDurationOn: Forwarded as `min_duration_on` (default 0.3).
///   - minDurationOff: Forwarded as `min_duration_off` (default 0.5).
/// - Returns: The populated C config struct.
func sherpaOnnxOfflineSpeakerDiarizationConfig(
  segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig,
  embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig,
  clustering: SherpaOnnxFastClusteringConfig,
  minDurationOn: Float = 0.3,
  minDurationOff: Float = 0.5
) -> SherpaOnnxOfflineSpeakerDiarizationConfig {
  let config = SherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: segmentation,
    embedding: embedding,
    clustering: clustering,
    min_duration_on: minDurationOn,
    min_duration_off: minDurationOff)
  return config
}
/// Swift-friendly copy of one diarization segment returned by the C API
/// (see `SherpaOnnxOfflineSpeakerDiarizationWrapper.process`).
struct SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper {
  // Segment start time as reported by the C result (presumably seconds —
  // TODO confirm against the sherpa-onnx C API docs).
  var start: Float = 0
  // Segment end time, same unit as `start`.
  var end: Float = 0
  // Index identifying which speaker this segment belongs to.
  var speaker: Int = 0
}
/// Swift wrapper around the C offline speaker-diarization API. Owns the
/// underlying C object and releases it on deinit.
class SherpaOnnxOfflineSpeakerDiarizationWrapper {
  /// A pointer to the underlying counterpart in C.
  let impl: OpaquePointer!

  /// Creates the diarization engine.
  ///
  /// - Parameter config: Pointer to a fully populated C config struct.
  init(
    config: UnsafePointer<SherpaOnnxOfflineSpeakerDiarizationConfig>!
  ) {
    impl = SherpaOnnxCreateOfflineSpeakerDiarization(config)
  }

  deinit {
    if let impl {
      SherpaOnnxDestroyOfflineSpeakerDiarization(impl)
    }
  }

  /// Sample rate the engine expects for input samples.
  var sampleRate: Int {
    return Int(SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(impl))
  }

  /// Runs diarization on the given audio and returns the segments sorted
  /// by start time.
  ///
  /// - Parameter samples: Mono audio samples at `sampleRate`.
  /// - Returns: Segments copied into Swift structs; empty on failure.
  func process(samples: [Float]) -> [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] {
    guard
      let result = SherpaOnnxOfflineSpeakerDiarizationProcess(
        impl, samples, Int32(samples.count))
    else {
      return []
    }
    // Fix: the original leaked `result` when the sorted-segment pointer was
    // nil; `defer` guarantees the C result is destroyed on every exit path.
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result) }

    let numSegments = Int(SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result))
    guard
      let p = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result)
    else {
      return []
    }
    defer { SherpaOnnxOfflineSpeakerDiarizationDestroySegment(p) }

    var ans: [SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper] = []
    ans.reserveCapacity(numSegments)
    for i in 0..<numSegments {
      ans.append(
        SherpaOnnxOfflineSpeakerDiarizationSegmentWrapper(
          start: p[i].start, end: p[i].end, speaker: Int(p[i].speaker)))
    }
    return ans
  }
}
... ...
#!/usr/bin/env bash
# Downloads the models and test audio for the speaker-diarization example,
# builds the Swift binary (once), and runs it.

# Fetch the pyannote speaker-segmentation model unless it is already unpacked.
if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

# Fetch the 3D-Speaker embedding-extractor model.
# NOTE(review): "recongition" is misspelled, but it appears to match the
# upstream release tag — do not "correct" it without checking the release page.
if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

# Fetch the test wave file (contains four speakers).
if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

# Build the example only if the binary does not exist yet.
if [ ! -e ./speaker-diarization ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./speaker-diarization.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o speaker-diarization

  # Shrink the binary by removing symbol tables.
  strip speaker-diarization
else
  echo "./speaker-diarization exists - skip building"
fi

# Make the dynamic linker find libsherpa-onnx/libonnxruntime at run time.
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH

./speaker-diarization
... ...
import AVFoundation
extension AudioBuffer {
  /// Copies this buffer's contents into a Swift `[Float]`.
  func array() -> [Float] {
    let floats = UnsafeBufferPointer<Float>(self)
    return [Float](floats)
  }
}
extension AVAudioPCMBuffer {
  /// Returns the samples of the first (and, in this example, only) channel
  /// as a Swift `[Float]`.
  func array() -> [Float] {
    let firstBuffer = self.audioBufferList.pointee.mBuffers
    return firstBuffer.array()
  }
}
/// Loads the models and the test wave file, runs offline speaker
/// diarization, and prints one line per segment.
func run() {
  let segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"
  let embeddingExtractorModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
  let waveFilename = "./0-four-speakers-zh.wav"

  // There are 4 speakers in ./0-four-speakers-zh.wav, so we use 4 here
  let numSpeakers = 4

  var config = sherpaOnnxOfflineSpeakerDiarizationConfig(
    segmentation: sherpaOnnxOfflineSpeakerSegmentationModelConfig(
      pyannote: sherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(model: segmentationModel)),
    embedding: sherpaOnnxSpeakerEmbeddingExtractorConfig(model: embeddingExtractorModel),
    clustering: sherpaOnnxFastClusteringConfig(numClusters: numSpeakers)
  )

  let sd = SherpaOnnxOfflineSpeakerDiarizationWrapper(config: &config)

  // Use URL directly instead of NSURL + cast; report failures with a clear
  // message instead of crashing via try!/force-unwrap.
  let fileURL = URL(fileURLWithPath: waveFilename)
  let audioFile: AVAudioFile
  do {
    audioFile = try AVAudioFile(forReading: fileURL)
  } catch {
    print("Failed to open \(waveFilename): \(error)")
    return
  }

  let audioFormat = audioFile.processingFormat
  assert(Int(audioFormat.sampleRate) == sd.sampleRate)
  assert(audioFormat.channelCount == 1)
  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  let audioFrameCount = UInt32(audioFile.length)
  guard
    let audioFileBuffer = AVAudioPCMBuffer(
      pcmFormat: audioFormat, frameCapacity: audioFrameCount)
  else {
    print("Failed to allocate a buffer of \(audioFrameCount) frames")
    return
  }

  do {
    try audioFile.read(into: audioFileBuffer)
  } catch {
    print("Failed to read \(waveFilename): \(error)")
    return
  }

  let array = audioFileBuffer.array()

  print("Started!")

  let segments = sd.process(samples: array)
  for segment in segments {
    print("\(segment.start) -- \(segment.end) speaker_\(segment.speaker)")
  }
}
/// Program entry point: delegates all work to `run()`.
@main
struct App {
  static func main() {
    run()
  }
}
... ...