Fangjun Kuang
Committed by GitHub

Add Swift API for computing speaker embeddings (#2492)

... ... @@ -9,6 +9,9 @@ ls -lh
./run-test-version.sh
./run-compute-speaker-embeddings.sh
rm -fv *.wav *.onnx
./run-tts-kitten-en.sh
ls -lh
rm -rf kitten-*
... ...
... ... @@ -21,3 +21,4 @@ test-version
zipformer-ctc-asr
dolphin-ctc-asr
tts-kitten-en
compute-speaker-embeddings
... ...
/// swift-api-examples/SherpaOnnx.swift
/// Copyright (c) 2023 Xiaomi Corporation
import Foundation // For NSString
... ... @@ -936,6 +936,41 @@ func sherpaOnnxOfflineTtsConfig(
)
}
/// Owns a `SherpaOnnxWave` obtained from `SherpaOnnxReadWave` and releases it
/// with `SherpaOnnxFreeWave` when the wrapper is deallocated.
///
/// The wrapped pointer is implicitly unwrapped and may be `nil` (presumably
/// when the file could not be read -- confirm against the C API). `deinit`
/// already tolerates `nil`, so every accessor does too instead of crashing.
class SherpaOnnxWaveWrapper {
  /// A pointer to the underlying counterpart in C; may be `nil`.
  let wave: UnsafePointer<SherpaOnnxWave>!

  /// Reads `filename` through `SherpaOnnxReadWave` and wraps the result.
  ///
  /// - Parameter filename: Path handed to the C reader.
  /// - Returns: A wrapper around whatever pointer the C reader returned.
  class func readWave(filename: String) -> SherpaOnnxWaveWrapper {
    let wave = SherpaOnnxReadWave(toCPointer(filename))
    return SherpaOnnxWaveWrapper(wave: wave)
  }

  /// Wraps `wave`; the pointer is freed in `deinit` when it is non-nil.
  init(wave: UnsafePointer<SherpaOnnxWave>!) {
    self.wave = wave
  }

  deinit {
    if let wave {
      SherpaOnnxFreeWave(wave)
    }
  }

  /// Number of samples in the wave, or 0 when no wave is held.
  var numSamples: Int {
    guard let wave else { return 0 }
    return Int(wave.pointee.num_samples)
  }

  /// Sample rate stored in the wave, or 0 when no wave is held.
  var sampleRate: Int {
    guard let wave else { return 0 }
    return Int(wave.pointee.sample_rate)
  }

  /// A copy of the samples as a Swift array; empty when no wave is held,
  /// when the sample buffer is `nil`, or when the wave has no samples.
  var samples: [Float] {
    guard let wave, let buffer = wave.pointee.samples else { return [] }

    let count = Int(wave.pointee.num_samples)
    // UnsafeBufferPointer traps on a negative count, so bail out early.
    guard count > 0 else { return [] }

    return [Float](UnsafeBufferPointer(start: buffer, count: count))
  }
}
class SherpaOnnxGeneratedAudioWrapper {
/// A pointer to the underlying counterpart in C
let audio: UnsafePointer<SherpaOnnxGeneratedAudio>!
... ... @@ -960,14 +995,9 @@ class SherpaOnnxGeneratedAudioWrapper {
/// The samples copied into a Swift array; empty when
/// `audio.pointee.samples` is `nil`.
var samples: [Float] {
  guard let p = audio.pointee.samples else { return [] }
  // Copy the C buffer in one step rather than appending element by element.
  return [Float](UnsafeBufferPointer(start: p, count: Int(n)))
}
... ... @@ -1432,6 +1462,72 @@ class SherpaOnnxOfflineSpeakerDiarizationWrapper {
}
}
/// Holds an online stream handle and destroys it with
/// `SherpaOnnxDestroyOnlineStream` when the wrapper is deallocated.
class SherpaOnnxOnlineStreamWrapper {
  /// A pointer to the underlying counterpart in C
  let impl: OpaquePointer!

  /// Wraps an existing stream handle.
  init(impl: OpaquePointer!) {
    self.impl = impl
  }

  deinit {
    guard let handle = impl else { return }
    SherpaOnnxDestroyOnlineStream(handle)
  }

  /// Forwards `samples` and `sampleRate` to
  /// `SherpaOnnxOnlineStreamAcceptWaveform`.
  ///
  /// - Parameters:
  ///   - samples: Audio samples to hand to the stream.
  ///   - sampleRate: Sample rate of `samples`; defaults to 16000.
  func acceptWaveform(samples: [Float], sampleRate: Int = 16000) {
    let rate = Int32(sampleRate)
    let count = Int32(samples.count)
    SherpaOnnxOnlineStreamAcceptWaveform(impl, rate, samples, count)
  }

  /// Calls `SherpaOnnxOnlineStreamInputFinished` on the stream.
  func inputFinished() {
    SherpaOnnxOnlineStreamInputFinished(impl)
  }
}
/// Owns a speaker embedding extractor created from a C config and destroys it
/// with `SherpaOnnxDestroySpeakerEmbeddingExtractor` on deallocation.
class SherpaOnnxSpeakerEmbeddingExtractorWrapper {
  /// A pointer to the underlying counterpart in C
  let impl: OpaquePointer!

  /// Creates the extractor via `SherpaOnnxCreateSpeakerEmbeddingExtractor`.
  ///
  /// - Parameter config: Pointer to the extractor configuration.
  init(
    config: UnsafePointer<SherpaOnnxSpeakerEmbeddingExtractorConfig>!
  ) {
    impl = SherpaOnnxCreateSpeakerEmbeddingExtractor(config)
  }

  deinit {
    if let impl {
      SherpaOnnxDestroySpeakerEmbeddingExtractor(impl)
    }
  }

  /// Embedding dimension reported by `SherpaOnnxSpeakerEmbeddingExtractorDim`.
  var dim: Int {
    return Int(SherpaOnnxSpeakerEmbeddingExtractorDim(impl))
  }

  /// Creates a new stream for this extractor and wraps it.
  func createStream() -> SherpaOnnxOnlineStreamWrapper {
    let newStream = SherpaOnnxSpeakerEmbeddingExtractorCreateStream(impl)
    return SherpaOnnxOnlineStreamWrapper(impl: newStream)
  }

  /// Returns `true` when `SherpaOnnxSpeakerEmbeddingExtractorIsReady`
  /// reports 1 for `stream`.
  func isReady(stream: SherpaOnnxOnlineStreamWrapper) -> Bool {
    return SherpaOnnxSpeakerEmbeddingExtractorIsReady(impl, stream.impl) == 1
  }

  /// Computes the embedding for `stream`.
  ///
  /// - Returns: `dim` floats copied from the C buffer, or an empty array when
  ///   the stream is not ready or the C call returns no buffer.
  func compute(stream: SherpaOnnxOnlineStreamWrapper) -> [Float] {
    guard isReady(stream: stream) else { return [] }

    // A nil start with a non-zero count traps in UnsafeBufferPointer, so a
    // missing buffer is reported as an empty embedding instead.
    guard let p = SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(impl, stream.impl) else {
      return []
    }
    defer {
      SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(p)
    }

    return [Float](UnsafeBufferPointer(start: p, count: dim))
  }
}
func sherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(model: String = "")
-> SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig
{
... ...
/// swift-api-examples/compute-speaker-embeddings.swift
/// Copyright (c) 2025 Xiaomi Corporation
/*
Please download test files used in this script from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
*/
/// Returns the cosine similarity of two equally sized vectors.
///
/// - Parameters:
///   - a: First vector.
///   - b: Second vector; must have the same length as `a`.
/// - Returns: `dot(a, b) / (|a| * |b|)`, or 0 when either magnitude is not
///   greater than zero.
func cosineSimilarity(_ a: [Float], _ b: [Float]) -> Float {
  precondition(a.count == b.count, "Vectors must have the same length")

  // Accumulate the dot product and both squared norms in a single pass.
  var dot: Float = 0
  var sumSquaresA: Float = 0
  var sumSquaresB: Float = 0
  for (x, y) in zip(a, b) {
    dot += x * y
    sumSquaresA += x * x
    sumSquaresB += y * y
  }

  let normA = sumSquaresA.squareRoot()
  let normB = sumSquaresB.squareRoot()

  // Avoid division by zero
  guard normA > 0, normB > 0 else { return 0 }

  return dot / (normA * normB)
}
/// Reads a wave file, pushes all of its samples through a fresh stream of
/// `extractor`, and returns what `extractor.compute` yields for that stream.
///
/// - Parameters:
///   - extractor: Extractor used to create the stream and compute the result.
///   - waveFilename: Path of the wave file to read.
func computeEmbedding(extractor: SherpaOnnxSpeakerEmbeddingExtractorWrapper, waveFilename: String)
  -> [Float]
{
  let wave = SherpaOnnxWaveWrapper.readWave(filename: waveFilename)
  let waveStream = extractor.createStream()

  let pcm = wave.samples
  let rate = wave.sampleRate
  waveStream.acceptWaveform(samples: pcm, sampleRate: rate)
  waveStream.inputFinished()

  let embedding = extractor.compute(stream: waveStream)
  return embedding
}
/// Computes embeddings for the three test recordings and prints the pairwise
/// cosine similarity scores.
func run() {
  var config = sherpaOnnxSpeakerEmbeddingExtractorConfig(
    model: "./wespeaker_zh_cnceleb_resnet34.onnx")
  let extractor = SherpaOnnxSpeakerEmbeddingExtractorWrapper(config: &config)

  // Embeddings are computed in file order: spk1, spk2, spk3.
  let waveFiles = ["./fangjun-sr-1.wav", "./fangjun-sr-2.wav", "./leijun-sr-1.wav"]
  let embeddings = waveFiles.map { path in
    computeEmbedding(extractor: extractor, waveFilename: path)
  }

  let score12 = cosineSimilarity(embeddings[0], embeddings[1])
  let score13 = cosineSimilarity(embeddings[0], embeddings[2])
  let score23 = cosineSimilarity(embeddings[1], embeddings[2])

  print("Score between spk1 and spk2: \(score12)")
  print("Score between spk1 and spk3: \(score13)")
  print("Score between spk2 and spk3: \(score23)")
}
/// Program entry point; delegates straight to `run()`.
@main
struct App {
  static func main() { run() }
}
... ...
#!/usr/bin/env bash
#
# Downloads the test model and wave files if they are missing, builds the
# compute-speaker-embeddings example if it has not been built yet, and runs it.

set -ex

if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

release_url=https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models

if [ ! -f ./wespeaker_zh_cnceleb_resnet34.onnx ]; then
  echo "Please download the pre-trained model for testing."
  echo "You can refer to"
  echo ""
  echo "https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models"
  echo ""
  echo "for help"

  # -f makes curl fail on HTTP errors instead of saving the error page as the
  # model, which later runs would mistake for a finished download.
  curl -fSL -O "$release_url/wespeaker_zh_cnceleb_resnet34.onnx"
fi

for wav in fangjun-sr-1.wav fangjun-sr-2.wav leijun-sr-1.wav; do
  if [ ! -f "./$wav" ]; then
    curl -fSL -O "$release_url/$wav"
  fi
done

if [ ! -e ./compute-speaker-embeddings ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./compute-speaker-embeddings.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o compute-speaker-embeddings

  strip compute-speaker-embeddings
else
  echo "./compute-speaker-embeddings exists - skip building"
fi

# Append the previous value only when it is non-empty, so the search path
# never ends with an empty element.
export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./compute-speaker-embeddings
... ...