Committed by
GitHub
Add Swift API for computing speaker embeddings (#2492)
正在显示
5 个修改的文件
包含
215 行增加
和
8 行删除
| 1 | -/// swfit-api-examples/SherpaOnnx.swift | 1 | +/// swift-api-examples/SherpaOnnx.swift |
| 2 | /// Copyright (c) 2023 Xiaomi Corporation | 2 | /// Copyright (c) 2023 Xiaomi Corporation |
| 3 | 3 | ||
| 4 | import Foundation // For NSString | 4 | import Foundation // For NSString |
| @@ -936,6 +936,41 @@ func sherpaOnnxOfflineTtsConfig( | @@ -936,6 +936,41 @@ func sherpaOnnxOfflineTtsConfig( | ||
| 936 | ) | 936 | ) |
| 937 | } | 937 | } |
| 938 | 938 | ||
/// Wraps a `SherpaOnnxWave` obtained from the C API and frees it on deallocation.
class SherpaOnnxWaveWrapper {
  /// A pointer to the underlying counterpart in C.
  /// It may be nil (`deinit` already guards for that), so every accessor
  /// below checks it before dereferencing.
  let wave: UnsafePointer<SherpaOnnxWave>!

  /// Reads `filename` with `SherpaOnnxReadWave` and wraps whatever pointer it returns.
  class func readWave(filename: String) -> SherpaOnnxWaveWrapper {
    let wave = SherpaOnnxReadWave(toCPointer(filename))
    return SherpaOnnxWaveWrapper(wave: wave)
  }

  /// Stores `wave`; it is released with `SherpaOnnxFreeWave` in `deinit`.
  init(wave: UnsafePointer<SherpaOnnxWave>!) {
    self.wave = wave
  }

  deinit {
    if let wave {
      SherpaOnnxFreeWave(wave)
    }
  }

  /// Number of samples in the wave, or 0 when there is no underlying wave.
  var numSamples: Int {
    guard let wave else { return 0 }
    return Int(wave.pointee.num_samples)
  }

  /// Sample rate of the wave, or 0 when there is no underlying wave.
  var sampleRate: Int {
    guard let wave else { return 0 }
    return Int(wave.pointee.sample_rate)
  }

  /// A copy of the samples, or an empty array when there is nothing to copy.
  var samples: [Float] {
    // UnsafeBufferPointer traps on a nil start with a non-zero count and on a
    // negative count, so rule out both before building the buffer.
    guard let wave, let start = wave.pointee.samples else { return [] }
    let count = Int(wave.pointee.num_samples)
    guard count > 0 else { return [] }
    return [Float](UnsafeBufferPointer(start: start, count: count))
  }
}
| 973 | + | ||
| 939 | class SherpaOnnxGeneratedAudioWrapper { | 974 | class SherpaOnnxGeneratedAudioWrapper { |
| 940 | /// A pointer to the underlying counterpart in C | 975 | /// A pointer to the underlying counterpart in C |
| 941 | let audio: UnsafePointer<SherpaOnnxGeneratedAudio>! | 976 | let audio: UnsafePointer<SherpaOnnxGeneratedAudio>! |
| @@ -960,14 +995,9 @@ class SherpaOnnxGeneratedAudioWrapper { | @@ -960,14 +995,9 @@ class SherpaOnnxGeneratedAudioWrapper { | ||
| 960 | 995 | ||
| 961 | var samples: [Float] { | 996 | var samples: [Float] { |
| 962 | if let p = audio.pointee.samples { | 997 | if let p = audio.pointee.samples { |
| 963 | - var samples: [Float] = [] | ||
| 964 | - for index in 0..<n { | ||
| 965 | - samples.append(p[Int(index)]) | ||
| 966 | - } | ||
| 967 | - return samples | 998 | + return [Float](UnsafeBufferPointer(start: p, count: Int(n))) |
| 968 | } else { | 999 | } else { |
| 969 | - let samples: [Float] = [] | ||
| 970 | - return samples | 1000 | + return [] |
| 971 | } | 1001 | } |
| 972 | } | 1002 | } |
| 973 | 1003 | ||
| @@ -1432,6 +1462,72 @@ class SherpaOnnxOfflineSpeakerDiarizationWrapper { | @@ -1432,6 +1462,72 @@ class SherpaOnnxOfflineSpeakerDiarizationWrapper { | ||
| 1432 | } | 1462 | } |
| 1433 | } | 1463 | } |
| 1434 | 1464 | ||
/// Wraps a C online stream handle and destroys it on deallocation.
class SherpaOnnxOnlineStreamWrapper {
  /// A pointer to the underlying counterpart in C
  let impl: OpaquePointer!

  /// Stores the given handle; it is destroyed in `deinit` when non-nil.
  init(impl: OpaquePointer!) {
    self.impl = impl
  }

  deinit {
    guard let handle = impl else { return }
    SherpaOnnxDestroyOnlineStream(handle)
  }

  /// Passes `samples` and `sampleRate` to `SherpaOnnxOnlineStreamAcceptWaveform`.
  func acceptWaveform(samples: [Float], sampleRate: Int = 16000) {
    let rate = Int32(sampleRate)
    let sampleCount = Int32(samples.count)
    SherpaOnnxOnlineStreamAcceptWaveform(impl, rate, samples, sampleCount)
  }

  /// Calls `SherpaOnnxOnlineStreamInputFinished` on the wrapped stream.
  func inputFinished() {
    SherpaOnnxOnlineStreamInputFinished(impl)
  }
}
| 1486 | + | ||
/// Wraps the C speaker embedding extractor and destroys it on deallocation.
class SherpaOnnxSpeakerEmbeddingExtractorWrapper {
  /// A pointer to the underlying counterpart in C
  let impl: OpaquePointer!

  /// Creates the extractor from `config` via `SherpaOnnxCreateSpeakerEmbeddingExtractor`.
  init(
    config: UnsafePointer<SherpaOnnxSpeakerEmbeddingExtractorConfig>!
  ) {
    impl = SherpaOnnxCreateSpeakerEmbeddingExtractor(config)
  }

  deinit {
    if let impl {
      SherpaOnnxDestroySpeakerEmbeddingExtractor(impl)
    }
  }

  /// The value reported by `SherpaOnnxSpeakerEmbeddingExtractorDim`.
  var dim: Int {
    return Int(SherpaOnnxSpeakerEmbeddingExtractorDim(impl))
  }

  /// Creates a new stream for this extractor, wrapped so it is destroyed automatically.
  func createStream() -> SherpaOnnxOnlineStreamWrapper {
    let newStream = SherpaOnnxSpeakerEmbeddingExtractorCreateStream(impl)
    return SherpaOnnxOnlineStreamWrapper(impl: newStream)
  }

  /// Returns true when the C API reports 1 for `stream`.
  func isReady(stream: SherpaOnnxOnlineStreamWrapper) -> Bool {
    return SherpaOnnxSpeakerEmbeddingExtractorIsReady(impl, stream.impl) == 1
  }

  /// Computes the embedding for `stream`.
  ///
  /// - Returns: A copy of the embedding, or an empty array when the stream is
  ///   not ready or no embedding could be obtained.
  func compute(stream: SherpaOnnxOnlineStreamWrapper) -> [Float] {
    guard isReady(stream: stream) else { return [] }

    // UnsafeBufferPointer traps on a nil start with a non-zero count, so a nil
    // result is handled here instead of being copied (and destroyed) blindly.
    guard let embedding = SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(impl, stream.impl)
    else {
      return []
    }

    // The C buffer is copied into a Swift array below, then released.
    defer {
      SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(embedding)
    }

    let count = dim
    guard count > 0 else { return [] }

    return [Float](UnsafeBufferPointer(start: embedding, count: count))
  }
}
| 1530 | + | ||
| 1435 | func sherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(model: String = "") | 1531 | func sherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig(model: String = "") |
| 1436 | -> SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig | 1532 | -> SherpaOnnxOfflineSpeechDenoiserGtcrnModelConfig |
| 1437 | { | 1533 | { |
| 1 | +/// swift-api-examples/compute-speaker-embeddings.swift | ||
| 2 | +/// Copyright (c) 2025 Xiaomi Corporation | ||
| 3 | +/* | ||
| 4 | +Please download test files used in this script from | ||
| 5 | + | ||
| 6 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 7 | +*/ | ||
/// Returns the cosine similarity of `a` and `b`.
///
/// Both vectors must have the same length. The result is 0 when either
/// magnitude is not strictly positive, which avoids dividing by zero.
func cosineSimilarity(_ a: [Float], _ b: [Float]) -> Float {
  precondition(a.count == b.count, "Vectors must have the same length")

  // Accumulate the dot product and both squared norms in one pass.
  var dot: Float = 0
  var squaredNormA: Float = 0
  var squaredNormB: Float = 0
  for (x, y) in zip(a, b) {
    dot += x * y
    squaredNormA += x * x
    squaredNormB += y * y
  }

  let normA = squaredNormA.squareRoot()
  let normB = squaredNormB.squareRoot()

  if normA > 0 && normB > 0 {
    return dot / (normA * normB)
  }
  return 0
}
| 23 | + | ||
/// Reads `waveFilename`, feeds all of its samples into a fresh stream of
/// `extractor`, and returns what `extractor.compute(stream:)` produces.
func computeEmbedding(extractor: SherpaOnnxSpeakerEmbeddingExtractorWrapper, waveFilename: String)
  -> [Float]
{
  let wave = SherpaOnnxWaveWrapper.readWave(filename: waveFilename)
  let onlineStream = extractor.createStream()

  // Hand over the whole file at once, then mark the input as finished.
  onlineStream.acceptWaveform(samples: wave.samples, sampleRate: wave.sampleRate)
  onlineStream.inputFinished()

  let embedding = extractor.compute(stream: onlineStream)
  return embedding
}
| 33 | + | ||
/// Computes embeddings for the three test waves and prints the pairwise
/// cosine similarity scores.
func run() {
  var config = sherpaOnnxSpeakerEmbeddingExtractorConfig(
    model: "./wespeaker_zh_cnceleb_resnet34.onnx")
  let extractor = SherpaOnnxSpeakerEmbeddingExtractorWrapper(config: &config)

  let waveFilenames = ["./fangjun-sr-1.wav", "./fangjun-sr-2.wav", "./leijun-sr-1.wav"]
  let embeddings = waveFilenames.map { filename in
    computeEmbedding(extractor: extractor, waveFilename: filename)
  }

  // All scores are computed before anything is printed.
  let pairs = [(0, 1), (0, 2), (1, 2)]
  let scores = pairs.map { pair in
    cosineSimilarity(embeddings[pair.0], embeddings[pair.1])
  }

  print("Score between spk1 and spk2: \(scores[0])")
  print("Score between spk1 and spk3: \(scores[1])")
  print("Score between spk2 and spk3: \(scores[2])")
}
| 50 | + | ||
/// Program entry point; it only calls `run()`.
@main
struct App {
  static func main() { run() }
}
#!/usr/bin/env bash
#
# Downloads the test model and wave files if they are missing, builds the
# compute-speaker-embeddings example if it is not built yet, and runs it.

set -ex

if [ ! -d ../build-swift-macos ]; then
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
fi

release_url=https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models

if [ ! -f ./wespeaker_zh_cnceleb_resnet34.onnx ]; then
  echo "Please download the pre-trained model for testing."
  echo "You can refer to"
  echo ""
  echo "https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models"
  echo ""
  echo "for help"
fi

# -f makes curl fail on HTTP errors instead of saving the error page under the
# target filename, which would make later runs skip the download.
for f in \
  wespeaker_zh_cnceleb_resnet34.onnx \
  fangjun-sr-1.wav \
  fangjun-sr-2.wav \
  leijun-sr-1.wav; do
  if [ ! -f "./$f" ]; then
    curl -fSL -O "$release_url/$f"
  fi
done

if [ ! -e ./compute-speaker-embeddings ]; then
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./compute-speaker-embeddings.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o compute-speaker-embeddings

  strip compute-speaker-embeddings
else
  echo "./compute-speaker-embeddings exists - skip building"
fi

# Append the previous value only when it is set, so no empty entry is added.
export DYLD_LIBRARY_PATH="$PWD/../build-swift-macos/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
./compute-speaker-embeddings
-
请 注册 或 登录 后发表评论