Fangjun Kuang
Committed by GitHub

Add Swift API for computing speaker embeddings (#2492)

@@ -9,6 +9,9 @@ ls -lh @@ -9,6 +9,9 @@ ls -lh
9 9
10 ./run-test-version.sh 10 ./run-test-version.sh
11 11
  12 +./run-compute-speaker-embeddings.sh
  13 +rm -fv *.wav *.onnx
  14 +
12 ./run-tts-kitten-en.sh 15 ./run-tts-kitten-en.sh
13 ls -lh 16 ls -lh
14 rm -rf kitten-* 17 rm -rf kitten-*
@@ -21,3 +21,4 @@ test-version @@ -21,3 +21,4 @@ test-version
21 zipformer-ctc-asr 21 zipformer-ctc-asr
22 dolphin-ctc-asr 22 dolphin-ctc-asr
23 tts-kitten-en 23 tts-kitten-en
  24 +compute-speaker-embeddings
1 -/// swfit-api-examples/SherpaOnnx.swift 1 +/// swift-api-examples/SherpaOnnx.swift
2 /// Copyright (c) 2023 Xiaomi Corporation 2 /// Copyright (c) 2023 Xiaomi Corporation
3 3
4 import Foundation // For NSString 4 import Foundation // For NSString
@@ -936,6 +936,41 @@ func sherpaOnnxOfflineTtsConfig( @@ -936,6 +936,41 @@ func sherpaOnnxOfflineTtsConfig(
936 ) 936 )
937 } 937 }
938 938
  939 +class SherpaOnnxWaveWrapper {
  940 + let wave: UnsafePointer<SherpaOnnxWave>!
  941 +
  942 + class func readWave(filename: String) -> SherpaOnnxWaveWrapper {
  943 + let wave = SherpaOnnxReadWave(toCPointer(filename))
  944 + return SherpaOnnxWaveWrapper(wave: wave)
  945 + }
  946 +
  947 + init(wave: UnsafePointer<SherpaOnnxWave>!) {
  948 + self.wave = wave
  949 + }
  950 +
  951 + deinit {
  952 + if let wave {
  953 + SherpaOnnxFreeWave(wave)
  954 + }
  955 + }
  956 +
  957 + var numSamples: Int {
  958 + return Int(wave.pointee.num_samples)
  959 + }
  960 +
  961 + var sampleRate: Int {
  962 + return Int(wave.pointee.sample_rate)
  963 + }
  964 +
  965 + var samples: [Float] {
  966 + if numSamples == 0 {
  967 + return []
  968 + } else {
  969 + return [Float](UnsafeBufferPointer(start: wave.pointee.samples, count: numSamples))
  970 + }
  971 + }
  972 +}
  973 +
939 class SherpaOnnxGeneratedAudioWrapper { 974 class SherpaOnnxGeneratedAudioWrapper {
940 /// A pointer to the underlying counterpart in C 975 /// A pointer to the underlying counterpart in C
941 let audio: UnsafePointer<SherpaOnnxGeneratedAudio>! 976 let audio: UnsafePointer<SherpaOnnxGeneratedAudio>!
@@ -960,14 +995,9 @@ class SherpaOnnxGeneratedAudioWrapper { @@ -960,14 +995,9 @@ class SherpaOnnxGeneratedAudioWrapper {
960 995
961 var samples: [Float] { 996 var samples: [Float] {
962 if let p = audio.pointee.samples { 997 if let p = audio.pointee.samples {
963 - var samples: [Float] = []  
964 - for index in 0..<n {  
965 - samples.append(p[Int(index)])  
966 - }  
967 - return samples 998 + return [Float](UnsafeBufferPointer(start: p, count: Int(n)))
968 } else { 999 } else {
969 - let samples: [Float] = []  
970 - return samples 1000 + return []
971 } 1001 }
972 } 1002 }
973 1003
@@ -1432,6 +1462,72 @@ class SherpaOnnxOfflineSpeakerDiarizationWrapper { @@ -1432,6 +1462,72 @@ class SherpaOnnxOfflineSpeakerDiarizationWrapper {
1432 } 1462 }
1433 } 1463 }
1434 1464
  1465 +class SherpaOnnxOnlineStreamWrapper {
  1466 + /// A pointer to the underlying counterpart in C
  1467 + let impl: OpaquePointer!
  1468 + init(impl: OpaquePointer!) {
  1469 + self.impl = impl
  1470 + }
  1471 +
  1472 + deinit {
  1473 + if let impl {
  1474 + SherpaOnnxDestroyOnlineStream(impl)
  1475 + }
  1476 + }
  1477 +
  1478 + func acceptWaveform(samples: [Float], sampleRate: Int = 16000) {
  1479 + SherpaOnnxOnlineStreamAcceptWaveform(impl, Int32(sampleRate), samples, Int32(samples.count))
  1480 + }
  1481 +
  1482 + func inputFinished() {
  1483 + SherpaOnnxOnlineStreamInputFinished(impl)
  1484 + }
  1485 +}
  1486 +
  1487 +class SherpaOnnxSpeakerEmbeddingExtractorWrapper {
  1488 + /// A pointer to the underlying counterpart in C
  1489 + let impl: OpaquePointer!
  1490 +
  1491 + init(
  1492 + config: UnsafePointer<SherpaOnnxSpeakerEmbeddingExtractorConfig>!
  1493 + ) {
  1494 + impl = SherpaOnnxCreateSpeakerEmbeddingExtractor(config)
  1495 + }
  1496 +
  1497 + deinit {
  1498 + if let impl {
  1499 + SherpaOnnxDestroySpeakerEmbeddingExtractor(impl)
  1500 + }
  1501 + }
  1502 +
  1503 + var dim: Int {
  1504 + return Int(SherpaOnnxSpeakerEmbeddingExtractorDim(impl))
  1505 + }
  1506 +
  1507 + func createStream() -> SherpaOnnxOnlineStreamWrapper {
  1508 + let newStream = SherpaOnnxSpeakerEmbeddingExtractorCreateStream(impl)
  1509 + return SherpaOnnxOnlineStreamWrapper(impl: newStream)
  1510 + }
  1511 +
  1512 + func isReady(stream: SherpaOnnxOnlineStreamWrapper) -> Bool {
  1513 + return SherpaOnnxSpeakerEmbeddingExtractorIsReady(impl, stream.impl) == 1 ? true : false
  1514 + }
  1515 +
  1516 + func compute(stream: SherpaOnnxOnlineStreamWrapper) -> [Float] {
  1517 + if !isReady(stream: stream) {
  1518 + return []
  1519 + }
  1520 +
  1521 + let p = SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(impl, stream.impl)
  1522 +
  1523 + defer {
  1524 + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(p)
  1525 + }
  1526 +
  1527 + return [Float](UnsafeBufferPointer(start: p, count: dim))
  1528 + }
  1529 +}
  1530 +
1435 func sherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(model: String = "") 1531 func sherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(model: String = "")
1436 -> SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig 1532 -> SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig
1437 { 1533 {
  1 +/// swift-api-examples/compute-speaker-embeddings.swift
  2 +/// Copyright (c) 2025 Xiaomi Corporation
  3 +/*
  4 +Please download test files used in this script from
  5 +
  6 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  7 +*/
  8 +func cosineSimilarity(_ a: [Float], _ b: [Float]) -> Float {
  9 + precondition(a.count == b.count, "Vectors must have the same length")
  10 +
  11 + // Dot product
  12 + let dotProduct = zip(a, b).reduce(0) { $0 + $1.0 * $1.1 }
  13 +
  14 + // Magnitudes
  15 + let magA = sqrt(a.reduce(0) { $0 + $1 * $1 })
  16 + let magB = sqrt(b.reduce(0) { $0 + $1 * $1 })
  17 +
  18 + // Avoid division by zero
  19 + guard magA > 0 && magB > 0 else { return 0 }
  20 +
  21 + return dotProduct / (magA * magB)
  22 +}
  23 +
  24 +func computeEmbedding(extractor: SherpaOnnxSpeakerEmbeddingExtractorWrapper, waveFilename: String)
  25 + -> [Float]
  26 +{
  27 + let audio = SherpaOnnxWaveWrapper.readWave(filename: waveFilename)
  28 + let stream = extractor.createStream()
  29 + stream.acceptWaveform(samples: audio.samples, sampleRate: audio.sampleRate)
  30 + stream.inputFinished()
  31 + return extractor.compute(stream: stream)
  32 +}
  33 +
  34 +func run() {
  35 + let model = "./wespeaker_zh_cnceleb_resnet34.onnx"
  36 + var config = sherpaOnnxSpeakerEmbeddingExtractorConfig(model: model)
  37 + let extractor = SherpaOnnxSpeakerEmbeddingExtractorWrapper(config: &config)
  38 + let embedding1 = computeEmbedding(extractor: extractor, waveFilename: "./fangjun-sr-1.wav")
  39 + let embedding2 = computeEmbedding(extractor: extractor, waveFilename: "./fangjun-sr-2.wav")
  40 + let embedding3 = computeEmbedding(extractor: extractor, waveFilename: "./leijun-sr-1.wav")
  41 +
  42 + let score12 = cosineSimilarity(embedding1, embedding2)
  43 + let score13 = cosineSimilarity(embedding1, embedding3)
  44 + let score23 = cosineSimilarity(embedding2, embedding3)
  45 +
  46 + print("Score between spk1 and spk2: \(score12)")
  47 + print("Score between spk1 and spk3: \(score13)")
  48 + print("Score between spk2 and spk3: \(score23)")
  49 +}
  50 +
  51 +@main
  52 +struct App {
  53 + static func main() {
  54 + run()
  55 + }
  56 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -d ../build-swift-macos ]; then
  6 + echo "Please run ../build-swift-macos.sh first!"
  7 + exit 1
  8 +fi
  9 +
  10 +if [ ! -f ./wespeaker_zh_cnceleb_resnet34.onnx ]; then
  11 + echo "Please download the pre-trained model for testing."
  12 + echo "You can refer to"
  13 + echo ""
  14 + echo "https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models"
  15 + echo ""
  16 + echo "for help"
  17 +
  18 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_zh_cnceleb_resnet34.onnx
  19 +fi
  20 +
  21 +if [ ! -f ./fangjun-sr-1.wav ]; then
  22 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/fangjun-sr-1.wav
  23 +fi
  24 +
  25 +if [ ! -f ./fangjun-sr-2.wav ]; then
  26 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/fangjun-sr-2.wav
  27 +fi
  28 +
  29 +if [ ! -f ./leijun-sr-1.wav ]; then
  30 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/leijun-sr-1.wav
  31 +fi
  32 +
  33 +if [ ! -e ./compute-speaker-embeddings ]; then
  34 + # Note: We use -lc++ to link against libc++ instead of libstdc++
  35 + swiftc \
  36 + -lc++ \
  37 + -I ../build-swift-macos/install/include \
  38 + -import-objc-header ./SherpaOnnx-Bridging-Header.h \
  39 + ./compute-speaker-embeddings.swift ./SherpaOnnx.swift \
  40 + -L ../build-swift-macos/install/lib/ \
  41 + -l sherpa-onnx \
  42 + -l onnxruntime \
  43 + -o compute-speaker-embeddings
  44 +
  45 + strip compute-speaker-embeddings
  46 +else
  47 + echo "./compute-speaker-embeddings exists - skip building"
  48 +fi
  49 +
  50 +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
  51 +./compute-speaker-embeddings