SherpaOnnx.swift
5.0 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/// swift-api-examples/SherpaOnnx.swift
/// Copyright (c) 2023 Xiaomi Corporation
import Foundation // For NSString
/// Bridge a Swift `String` to a C string so it can be handed to the C API.
///
/// The string is bridged to `NSString` and the pointer exposed by its
/// `utf8String` property is returned.
///
/// NOTE(review): the returned pointer refers to storage managed by the bridged
/// `NSString`, not by the caller. Confirm that it stays valid for as long as
/// the C side reads it.
///
/// - Parameters:
///   - s: The String to convert.
/// - Returns: A pointer that can be passed to C as `const char*`
func toCPointer(_ s: String) -> UnsafePointer<Int8>! {
  let bridged = s as NSString
  return UnsafePointer<Int8>(bridged.utf8String)
}
/// Build a `SherpaOnnxOnlineTransducerModelConfig` from Swift values.
///
/// Model files can be downloaded by following
/// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
///
/// Every path is converted with `toCPointer(_:)` and the integers are narrowed
/// to `Int32` (which traps if a value does not fit in 32 bits).
///
/// - Parameters:
///   - encoder: Path to encoder.onnx
///   - decoder: Path to decoder.onnx
///   - joiner: Path to joiner.onnx
///   - tokens: Path to tokens.txt
///   - numThreads: Number of threads to use for neural network computation.
///     Defaults to 2.
///   - debug: Value stored in the `debug` field. Defaults to 0.
///     NOTE(review): presumably a non-zero value turns on debug output in the
///     native library; confirm against the C API.
///
/// - Returns: The populated `SherpaOnnxOnlineTransducerModelConfig`
func sherpaOnnxOnlineTransducerModelConfig(
  encoder: String,
  decoder: String,
  joiner: String,
  tokens: String,
  numThreads: Int = 2,
  debug: Int = 0
) -> SherpaOnnxOnlineTransducerModelConfig {
  // Convert everything up front, in the same order the fields are declared.
  let encoderPath = toCPointer(encoder)
  let decoderPath = toCPointer(decoder)
  let joinerPath = toCPointer(joiner)
  let tokensPath = toCPointer(tokens)
  let threadCount = Int32(numThreads)
  let debugValue = Int32(debug)

  return SherpaOnnxOnlineTransducerModelConfig(
    encoder: encoderPath,
    decoder: decoderPath,
    joiner: joinerPath,
    tokens: tokensPath,
    num_threads: threadCount,
    debug: debugValue
  )
}
/// Build a `SherpaOnnxFeatureConfig` from Swift integers.
///
/// Both values are narrowed to `Int32` before being stored, which traps if a
/// value does not fit in 32 bits.
///
/// - Parameters:
///   - sampleRate: Value stored in `sample_rate`. Defaults to 16000.
///   - featureDim: Value stored in `feature_dim`. Defaults to 80.
///
/// - Returns: The populated `SherpaOnnxFeatureConfig`
func sherpaOnnxFeatureConfig(
  sampleRate: Int = 16000,
  featureDim: Int = 80
) -> SherpaOnnxFeatureConfig {
  let rate = Int32(sampleRate)
  let dimension = Int32(featureDim)
  return SherpaOnnxFeatureConfig(sample_rate: rate, feature_dim: dimension)
}
/// Build a `SherpaOnnxOnlineRecognizerConfig` from its parts.
///
/// - Parameters:
///   - featConfig: Feature configuration, stored in `feat_config`.
///   - modelConfig: Model configuration, stored in `model_config`.
///   - enableEndpoint: Stored in `enable_endpoint` as 1 when `true` and 0 when
///     `false`. Defaults to `false`.
///   - rule1MinTrailingSilence: Stored in `rule1_min_trailing_silence`.
///     Defaults to 2.4. NOTE(review): presumably seconds; confirm.
///   - rule2MinTrailingSilence: Stored in `rule2_min_trailing_silence`.
///     Defaults to 1.2. NOTE(review): presumably seconds; confirm.
///   - rule3MinUtteranceLength: Stored in `rule3_min_utterance_length`.
///     Defaults to 30. NOTE(review): presumably seconds; confirm.
///
/// - Returns: The populated `SherpaOnnxOnlineRecognizerConfig`
func sherpaOnnxOnlineRecognizerConfig(
  featConfig: SherpaOnnxFeatureConfig,
  modelConfig: SherpaOnnxOnlineTransducerModelConfig,
  enableEndpoint: Bool = false,
  rule1MinTrailingSilence: Float = 2.4,
  rule2MinTrailingSilence: Float = 1.2,
  rule3MinUtteranceLength: Float = 30
) -> SherpaOnnxOnlineRecognizerConfig {
  SherpaOnnxOnlineRecognizerConfig(
    feat_config: featConfig,
    model_config: modelConfig,
    // The C struct carries the flag as an integer rather than a Bool.
    enable_endpoint: enableEndpoint ? 1 : 0,
    rule1_min_trailing_silence: rule1MinTrailingSilence,
    rule2_min_trailing_silence: rule2MinTrailingSilence,
    rule3_min_utterance_length: rule3MinUtteranceLength
  )
}
/// Wrapper for recognition result.
///
/// Usage:
///
///     let result = recognizer.getResult()
///     print("text: \(result.text)")
///
/// The wrapper takes over the C result it is constructed with: a non-nil
/// pointer is handed to `DestroyOnlineRecognizerResult` when the wrapper is
/// deallocated.
class SherpaOnnxOnlineRecongitionResult {
  /// A pointer to the underlying counterpart in C. It may be `nil`.
  let result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>!

  /// Return the actual recognition result.
  /// For English models, it contains words separated by spaces.
  /// For Chinese models, it contains Chinese words.
  ///
  /// Returns an empty string when there is no underlying C result or when its
  /// `text` field is a null pointer, instead of trapping on the nil pointer.
  var text: String {
    // `deinit` already tolerates a nil `result`; do the same here so that a
    // missing result (or missing text) cannot crash the caller.
    guard let result, let cText = result.pointee.text else {
      return ""
    }
    return String(cString: cText)
  }

  /// Wrap a C recognition result.
  ///
  /// - Parameters:
  ///   - result: Pointer to the C result; it is released in `deinit`.
  init(result: UnsafePointer<SherpaOnnxOnlineRecognizerResult>!) {
    self.result = result
  }

  deinit {
    // Only release when there is something to release.
    if let result {
      DestroyOnlineRecognizerResult(result)
    }
  }
}
/// Swift wrapper around the C online recognizer together with one stream
/// created from it.
///
/// Both C objects are created in `init` and destroyed in `deinit`.
class SherpaOnnxRecognizer {
  /// A pointer to the underlying recognizer in C
  let recognizer: OpaquePointer!

  /// A pointer to the underlying stream in C, created from `recognizer`
  let stream: OpaquePointer!

  /// Create the C recognizer from `config` and then a stream from that
  /// recognizer.
  ///
  /// - Parameters:
  ///   - config: Pointer to the recognizer configuration passed to
  ///     `CreateOnlineRecognizer`.
  init(
    config: UnsafePointer<SherpaOnnxOnlineRecognizerConfig>!
  ) {
    let handle = CreateOnlineRecognizer(config)
    recognizer = handle
    stream = CreateOnlineStream(handle)
  }

  deinit {
    // The stream is released first, then the recognizer it was created from.
    if let streamHandle = stream {
      DestoryOnlineStream(streamHandle)
    }
    if let recognizerHandle = recognizer {
      DestroyOnlineRecognizer(recognizerHandle)
    }
  }

  /// Decode wave samples.
  ///
  /// - Parameters:
  ///   - samples: Audio samples normalized to the range [-1, 1]
  ///   - sampleRate: Sample rate of the input audio samples. Must match
  ///                 the one expected by the model. It must be 16000 for
  ///                 models from icefall.
  func acceptWaveform(samples: [Float], sampleRate: Float = 16000) {
    // The C API takes the sample count as a 32-bit integer.
    let sampleCount = Int32(samples.count)
    AcceptWaveform(stream, sampleRate, samples, sampleCount)
  }

  /// Return true when `IsOnlineStreamReady` reports 1 for this stream.
  func isReady() -> Bool {
    IsOnlineStreamReady(recognizer, stream) == 1
  }

  /// If there are enough number of feature frames, it invokes the neural
  /// network computation and decoding. Otherwise, it is a no-op.
  func decode() {
    DecodeOnlineStream(recognizer, stream)
  }

  /// Get the decoding results so far
  ///
  /// - Returns: A wrapper around the pointer returned by
  ///   `GetOnlineStreamResult`.
  func getResult() -> SherpaOnnxOnlineRecongitionResult {
    let raw: UnsafeMutablePointer<SherpaOnnxOnlineRecognizerResult>? =
      GetOnlineStreamResult(recognizer, stream)
    return SherpaOnnxOnlineRecongitionResult(result: raw)
  }

  /// Reset the recognizer, which clears the neural network model state
  /// and the state for decoding.
  func reset() {
    Reset(recognizer, stream)
  }

  /// Signal that no more audio samples would be available.
  /// After this call, you cannot call acceptWaveform() any more.
  func inputFinished() {
    InputFinished(stream)
  }

  /// Return true if an endpoint has been detected.
  func isEndpoint() -> Bool {
    IsEndpoint(recognizer, stream) == 1
  }
}