Fangjun Kuang
Committed by GitHub

Add Swift API for spoken language identification. (#696)

@@ -7,6 +7,9 @@ echo "pwd: $PWD"
 cd swift-api-examples
 ls -lh

+./run-spoken-language-identification.sh
+rm -rf sherpa-onnx-whisper*
+
 mkdir -p /Users/fangjun/Desktop
 pushd /Users/fangjun/Desktop
 curl -SL -O https://huggingface.co/csukuangfj/test-data/resolve/main/Obama.wav
@@ -1,6 +1,7 @@
 decode-file
 decode-file-non-streaming
 generate-subtitles
+spoken-language-identification
 tts
 vits-vctk
 sherpa-onnx-paraformer-zh-2023-09-14
swift-api-examples/SherpaOnnx.swift
@@ -713,3 +713,86 @@ class SherpaOnnxOfflineTtsWrapper {
     return SherpaOnnxGeneratedAudioWrapper(audio: audio)
   }
 }
+
+// spoken language identification
+
+func sherpaOnnxSpokenLanguageIdentificationWhisperConfig(
+  encoder: String,
+  decoder: String,
+  tailPaddings: Int = -1
+) -> SherpaOnnxSpokenLanguageIdentificationWhisperConfig {
+  return SherpaOnnxSpokenLanguageIdentificationWhisperConfig(
+    encoder: toCPointer(encoder),
+    decoder: toCPointer(decoder),
+    tail_paddings: Int32(tailPaddings))
+}
+
+func sherpaOnnxSpokenLanguageIdentificationConfig(
+  whisper: SherpaOnnxSpokenLanguageIdentificationWhisperConfig,
+  numThreads: Int = 1,
+  debug: Int = 0,
+  provider: String = "cpu"
+) -> SherpaOnnxSpokenLanguageIdentificationConfig {
+  return SherpaOnnxSpokenLanguageIdentificationConfig(
+    whisper: whisper,
+    num_threads: Int32(numThreads),
+    debug: Int32(debug),
+    provider: toCPointer(provider))
+}
+
+class SherpaOnnxSpokenLanguageIdentificationResultWrapper {
+  /// A pointer to the underlying counterpart in C
+  let result: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationResult>!
+
+  /// Return the detected language.
+  /// en for English
+  /// zh for Chinese
+  /// es for Spanish
+  /// de for German
+  /// etc.
+  var lang: String {
+    return String(cString: result.pointee.lang)
+  }
+
+  init(result: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationResult>!) {
+    self.result = result
+  }
+
+  deinit {
+    if let result {
+      SherpaOnnxDestroySpokenLanguageIdentificationResult(result)
+    }
+  }
+}
+
+class SherpaOnnxSpokenLanguageIdentificationWrapper {
+  /// A pointer to the underlying counterpart in C
+  let slid: OpaquePointer!
+
+  init(
+    config: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationConfig>!
+  ) {
+    slid = SherpaOnnxCreateSpokenLanguageIdentification(config)
+  }
+
+  deinit {
+    if let slid {
+      SherpaOnnxDestroySpokenLanguageIdentification(slid)
+    }
+  }
+
+  func decode(samples: [Float], sampleRate: Int = 16000)
+    -> SherpaOnnxSpokenLanguageIdentificationResultWrapper
+  {
+    let stream: OpaquePointer! = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid)
+    AcceptWaveformOffline(stream, Int32(sampleRate), samples, Int32(samples.count))
+
+    let result: UnsafePointer<SherpaOnnxSpokenLanguageIdentificationResult>? =
+      SherpaOnnxSpokenLanguageIdentificationCompute(
+        slid,
+        stream)
+
+    DestroyOfflineStream(stream)
+    return SherpaOnnxSpokenLanguageIdentificationResultWrapper(result: result)
+  }
+}
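The wrappers above follow the same pattern as the existing offline TTS/ASR wrappers in this file: a small Swift helper fills in each C config struct, and a class owns the C pointer and releases it in deinit. A minimal usage sketch (assuming the sherpa-onnx-whisper-tiny model fetched by the script below is present and that `samples` already holds 16 kHz mono audio):

let whisperConfig = sherpaOnnxSpokenLanguageIdentificationWhisperConfig(
  encoder: "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx",
  decoder: "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"
)
var config = sherpaOnnxSpokenLanguageIdentificationConfig(whisper: whisperConfig)
let slid = SherpaOnnxSpokenLanguageIdentificationWrapper(config: &config)
// decode() creates a fresh offline stream, feeds the samples, computes the
// result, and destroys the stream, so the wrapper stays reusable across files.
let result = slid.decode(samples: samples)
print(result.lang)  // e.g. "en"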
swift-api-examples/run-spoken-language-identification.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -d ../build-swift-macos ]; then
+  echo "Please run ../build-swift-macos.sh first!"
+  exit 1
+fi
+
+if [ ! -d ./sherpa-onnx-whisper-tiny ]; then
+  echo "Download a pre-trained model for testing."
+
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+  rm sherpa-onnx-whisper-tiny.tar.bz2
+fi
+
+if [ ! -e ./spoken-language-identification ]; then
+  # Note: We use -lc++ to link against libc++ instead of libstdc++
+  swiftc \
+    -lc++ \
+    -I ../build-swift-macos/install/include \
+    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
+    ./spoken-language-identification.swift ./SherpaOnnx.swift \
+    -L ../build-swift-macos/install/lib/ \
+    -l sherpa-onnx \
+    -l onnxruntime \
+    -o spoken-language-identification
+
+  strip spoken-language-identification
+else
+  echo "./spoken-language-identification exists - skip building"
+fi
+
+export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
+./spoken-language-identification
swift-api-examples/spoken-language-identification.swift
@@ -0,0 +1,57 @@
+import AVFoundation
+
+extension AudioBuffer {
+  func array() -> [Float] {
+    return Array(UnsafeBufferPointer(self))
+  }
+}
+
+extension AVAudioPCMBuffer {
+  func array() -> [Float] {
+    return self.audioBufferList.pointee.mBuffers.array()
+  }
+}
+
+func run() {
+  let encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx"
+  let decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"
+
+  let whisperConfig = sherpaOnnxSpokenLanguageIdentificationWhisperConfig(
+    encoder: encoder,
+    decoder: decoder
+  )
+
+  var config = sherpaOnnxSpokenLanguageIdentificationConfig(
+    whisper: whisperConfig,
+    numThreads: 1,
+    debug: 1,
+    provider: "cpu"
+  )
+  let filePath = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav"
+
+  let slid = SherpaOnnxSpokenLanguageIdentificationWrapper(config: &config)
+
+  let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
+  let audioFile = try! AVAudioFile(forReading: fileURL as URL)
+
+  let audioFormat = audioFile.processingFormat
+  assert(audioFormat.sampleRate == 16000)
+  assert(audioFormat.channelCount == 1)
+  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
+
+  let audioFrameCount = UInt32(audioFile.length)
+  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
+
+  try! audioFile.read(into: audioFileBuffer!)
+  let array: [Float]! = audioFileBuffer?.array()
+  let result = slid.decode(samples: array)
+
+  print("\nDetected language is:\n\(result.lang)")
+}
+
+@main
+struct App {
+  static func main() {
+    run()
+  }
+}
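Note that run() asserts the input file is already 16 kHz, mono, Float32 and aborts on anything else. For other inputs, one option (not part of this commit; a sketch using only standard AVFoundation APIs, with the helper name loadAs16kMono being hypothetical) is to resample with AVAudioConverter before calling decode(samples:):

import AVFoundation

// Hypothetical helper (not in this commit): read any audio file and convert it
// to 16 kHz mono Float32 samples with AVAudioConverter.
func loadAs16kMono(path: String) -> [Float]? {
  guard let file = try? AVAudioFile(forReading: URL(fileURLWithPath: path)) else { return nil }
  let outFormat = AVAudioFormat(
    commonFormat: .pcmFormatFloat32, sampleRate: 16000, channels: 1, interleaved: false)!
  guard let converter = AVAudioConverter(from: file.processingFormat, to: outFormat) else {
    return nil
  }
  // Output capacity scaled by the sample-rate ratio, plus some headroom.
  let ratio = 16000.0 / file.processingFormat.sampleRate
  let capacity = AVAudioFrameCount(Double(file.length) * ratio) + 1024
  guard let outBuffer = AVAudioPCMBuffer(pcmFormat: outFormat, frameCapacity: capacity) else {
    return nil
  }
  var error: NSError?
  // The input block feeds successive chunks of the file until EOF.
  let status = converter.convert(to: outBuffer, error: &error) { inNumPackets, outStatus in
    guard
      let inBuffer = AVAudioPCMBuffer(
        pcmFormat: file.processingFormat, frameCapacity: inNumPackets),
      (try? file.read(into: inBuffer)) != nil,
      inBuffer.frameLength > 0
    else {
      outStatus.pointee = .endOfStream
      return nil
    }
    outStatus.pointee = .haveData
    return inBuffer
  }
  guard status != .error, error == nil else { return nil }
  return Array(
    UnsafeBufferPointer(start: outBuffer.floatChannelData![0], count: Int(outBuffer.frameLength)))
}

With such a helper, the asserts and manual buffer handling in run() could be replaced by a single call like let samples = loadAs16kMono(path: filePath).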