Committed by
GitHub
Support Kokoro TTS for HarmonyOS. (#1743)
正在显示
5 个修改的文件
包含
82 行增加
和
35 行删除
| 1 | /** | 1 | /** |
| 2 | * Use these variables when you tailor your ArkTS code. They must be of the const type. | 2 | * Use these variables when you tailor your ArkTS code. They must be of the const type. |
| 3 | */ | 3 | */ |
| 4 | -export const HAR_VERSION = '1.10.37'; | 4 | +export const HAR_VERSION = '1.10.40'; |
| 5 | export const BUILD_MODE_NAME = 'debug'; | 5 | export const BUILD_MODE_NAME = 'debug'; |
| 6 | export const DEBUG = true; | 6 | export const DEBUG = true; |
| 7 | export const TARGET_NAME = 'default'; | 7 | export const TARGET_NAME = 'default'; |
| @@ -31,7 +31,8 @@ export { OnlineStream, | @@ -31,7 +31,8 @@ export { OnlineStream, | ||
| 31 | OnlineRecognizer, | 31 | OnlineRecognizer, |
| 32 | } from './src/main/ets/components/StreamingAsr'; | 32 | } from './src/main/ets/components/StreamingAsr'; |
| 33 | 33 | ||
| 34 | -export { OfflineTtsMatchaModelConfig, | 34 | +export { OfflineTtsKokoroModelConfig, |
| 35 | + OfflineTtsMatchaModelConfig, | ||
| 35 | OfflineTtsVitsModelConfig, | 36 | OfflineTtsVitsModelConfig, |
| 36 | OfflineTtsModelConfig, | 37 | OfflineTtsModelConfig, |
| 37 | OfflineTtsConfig, | 38 | OfflineTtsConfig, |
| @@ -28,9 +28,18 @@ export class OfflineTtsMatchaModelConfig { | @@ -28,9 +28,18 @@ export class OfflineTtsMatchaModelConfig { | ||
| 28 | public lengthScale: number = 1.0; | 28 | public lengthScale: number = 1.0; |
| 29 | } | 29 | } |
| 30 | 30 | ||
| 31 | +export class OfflineTtsKokoroModelConfig { | ||
| 32 | + public model: string = ''; | ||
| 33 | + public voices: string = ''; | ||
| 34 | + public tokens: string = ''; | ||
| 35 | + public dataDir: string = ''; | ||
| 36 | + public lengthScale: number = 1.0; | ||
| 37 | +} | ||
| 38 | + | ||
| 31 | export class OfflineTtsModelConfig { | 39 | export class OfflineTtsModelConfig { |
| 32 | public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig(); | 40 | public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig(); |
| 33 | public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig(); | 41 | public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig(); |
| 42 | + public kokoro: OfflineTtsKokoroModelConfig = new OfflineTtsKokoroModelConfig(); | ||
| 34 | public numThreads: number = 1; | 43 | public numThreads: number = 1; |
| 35 | public debug: boolean = false; | 44 | public debug: boolean = false; |
| 36 | public provider: string = 'cpu'; | 45 | public provider: string = 'cpu'; |
| @@ -66,6 +66,7 @@ struct Index { | @@ -66,6 +66,7 @@ struct Index { | ||
| 66 | @State initTtsDone: boolean = false; | 66 | @State initTtsDone: boolean = false; |
| 67 | @State ttsGeneratedDone: boolean = true; | 67 | @State ttsGeneratedDone: boolean = true; |
| 68 | @State numSpeakers: number = 1; | 68 | @State numSpeakers: number = 1; |
| 69 | + @State numThreads: number = 1; | ||
| 69 | @State initAudioDone: boolean = false; | 70 | @State initAudioDone: boolean = false; |
| 70 | private controller: TabsController = new TabsController(); | 71 | private controller: TabsController = new TabsController(); |
| 71 | private cancelled: boolean = false; | 72 | private cancelled: boolean = false; |
| @@ -135,6 +136,7 @@ struct Index { | @@ -135,6 +136,7 @@ struct Index { | ||
| 135 | this.info = 'Model initialized!\nPlease enter text and press start.'; | 136 | this.info = 'Model initialized!\nPlease enter text and press start.'; |
| 136 | this.sampleRate = e.data['sampleRate'] as number; | 137 | this.sampleRate = e.data['sampleRate'] as number; |
| 137 | this.numSpeakers = e.data['numSpeakers'] as number; | 138 | this.numSpeakers = e.data['numSpeakers'] as number; |
| 139 | + this.numThreads = e.data['numThreads'] as number; | ||
| 138 | 140 | ||
| 139 | this.initTtsDone = true; | 141 | this.initTtsDone = true; |
| 140 | } | 142 | } |
| @@ -177,6 +179,7 @@ struct Index { | @@ -177,6 +179,7 @@ struct Index { | ||
| 177 | this.info = `Audio duration: ${audioDuration} s | 179 | this.info = `Audio duration: ${audioDuration} s |
| 178 | Elapsed: ${elapsedSeconds} s | 180 | Elapsed: ${elapsedSeconds} s |
| 179 | RTF = ${elapsedSeconds.toFixed(2)}/${audioDuration.toFixed(2)} = ${RTF.toFixed(3)} | 181 | RTF = ${elapsedSeconds.toFixed(2)}/${audioDuration.toFixed(2)} = ${RTF.toFixed(3)} |
| 182 | +Number of threads: ${this.numThreads} | ||
| 180 | `; | 183 | `; |
| 181 | if (this.cancelled) { | 184 | if (this.cancelled) { |
| 182 | this.info += '\nCancelled.'; | 185 | this.info += '\nCancelled.'; |
| @@ -2,7 +2,7 @@ import worker, { ThreadWorkerGlobalScope, MessageEvents, ErrorEvent } from '@oho | @@ -2,7 +2,7 @@ import worker, { ThreadWorkerGlobalScope, MessageEvents, ErrorEvent } from '@oho | ||
| 2 | 2 | ||
| 3 | import { fileIo as fs } from '@kit.CoreFileKit'; | 3 | import { fileIo as fs } from '@kit.CoreFileKit'; |
| 4 | 4 | ||
| 5 | -import {OfflineTtsConfig, OfflineTts, listRawfileDir, TtsInput, TtsOutput} from 'sherpa_onnx'; | 5 | +import { OfflineTtsConfig, OfflineTts, listRawfileDir, TtsInput, TtsOutput } from 'sherpa_onnx'; |
| 6 | import { buffer } from '@kit.ArkTS'; | 6 | import { buffer } from '@kit.ArkTS'; |
| 7 | 7 | ||
| 8 | const workerPort: ThreadWorkerGlobalScope = worker.workerPort; | 8 | const workerPort: ThreadWorkerGlobalScope = worker.workerPort; |
| @@ -42,18 +42,22 @@ function copyRawFileDirToSandbox(context: Context, srcDir: string) { | @@ -42,18 +42,22 @@ function copyRawFileDirToSandbox(context: Context, srcDir: string) { | ||
| 42 | } | 42 | } |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | -function copyRawFileToSandbox(context: Context, src: string, dst: string) { | ||
| 46 | - // see https://blog.csdn.net/weixin_44640245/article/details/142634846 | ||
| 47 | - // https://developer.huawei.com/consumer/cn/doc/harmonyos-guides-V5/rawfile-guidelines-V5 | 45 | +function copyRawFileToSandbox(context: Context, src: string, |
| 46 | + dst: string) { | ||
| 47 | + /* see | ||
| 48 | + https://blog.csdn.net/weixin_44640245/article/details/142634846 | ||
| 49 | + https://developer.huawei.com/consumer/cn/doc/harmonyos-guides-V5/rawfile-guidelines-V5 | ||
| 50 | + */ | ||
| 48 | let uint8Array: Uint8Array = context.resourceManager.getRawFileContentSync(src); | 51 | let uint8Array: Uint8Array = context.resourceManager.getRawFileContentSync(src); |
| 49 | 52 | ||
| 50 | // https://developer.huawei.com/consumer/cn/doc/harmonyos-references-V5/js-apis-file-fs-V5#fsmkdir | 53 | // https://developer.huawei.com/consumer/cn/doc/harmonyos-references-V5/js-apis-file-fs-V5#fsmkdir |
| 51 | let sandboxPath: string = context.getApplicationContext().filesDir; | 54 | let sandboxPath: string = context.getApplicationContext().filesDir; |
| 52 | - let filepath = sandboxPath + '/' + dst; | 55 | + let filepath = sandboxPath + '/' + dst; |
| 53 | 56 | ||
| 54 | if (fs.accessSync(filepath)) { | 57 | if (fs.accessSync(filepath)) { |
| 55 | - // if the destination exists and has the expected file size, | ||
| 56 | - // then we skip copying it | 58 | + /* if the destination exists and has the expected file size |
| 59 | + then we skip copying it | ||
| 60 | + */ | ||
| 57 | let stat = fs.statSync(filepath); | 61 | let stat = fs.statSync(filepath); |
| 58 | if (stat.size == uint8Array.length) { | 62 | if (stat.size == uint8Array.length) { |
| 59 | return; | 63 | return; |
| @@ -66,11 +70,12 @@ function copyRawFileToSandbox(context: Context, src: string, dst: string) { | @@ -66,11 +70,12 @@ function copyRawFileToSandbox(context: Context, src: string, dst: string) { | ||
| 66 | } | 70 | } |
| 67 | 71 | ||
| 68 | function initTts(context: Context): OfflineTts { | 72 | function initTts(context: Context): OfflineTts { |
| 69 | - // Such a design is to make it easier to build flutter APPs with | ||
| 70 | - // github actions for a variety of tts models | ||
| 71 | - // | ||
| 72 | - // See https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/flutter/generate-tts.py | ||
| 73 | - // for details | 73 | + /* Such a design is to make it easier to build flutter APPs with |
| 74 | + github actions for a variety of tts models | ||
| 75 | + | ||
| 76 | + See https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/flutter/generate-tts.py | ||
| 77 | + for details | ||
| 78 | + */ | ||
| 74 | 79 | ||
| 75 | let modelDir = ''; | 80 | let modelDir = ''; |
| 76 | 81 | ||
| @@ -83,13 +88,19 @@ function initTts(context: Context): OfflineTts { | @@ -83,13 +88,19 @@ function initTts(context: Context): OfflineTts { | ||
| 83 | let vocoder = ''; | 88 | let vocoder = ''; |
| 84 | // for Matcha end | 89 | // for Matcha end |
| 85 | 90 | ||
| 91 | + // for Kokoro begin | ||
| 92 | + let voices = ''; | ||
| 93 | + // for Kokoro end | ||
| 94 | + | ||
| 86 | let ruleFsts = ''; | 95 | let ruleFsts = ''; |
| 87 | let ruleFars = ''; | 96 | let ruleFars = ''; |
| 88 | let lexicon = ''; | 97 | let lexicon = ''; |
| 89 | let dataDir = ''; | 98 | let dataDir = ''; |
| 90 | let dictDir = ''; | 99 | let dictDir = ''; |
| 91 | - // You can select an example below and change it according to match your | ||
| 92 | - // selected tts model | 100 | + /* |
| 101 | + You can select an example below and change it according to match your | ||
| 102 | + selected tts model | ||
| 103 | + */ | ||
| 93 | 104 | ||
| 94 | // ============================================================ | 105 | // ============================================================ |
| 95 | // Your change starts here | 106 | // Your change starts here |
| @@ -146,19 +157,26 @@ function initTts(context: Context): OfflineTts { | @@ -146,19 +157,26 @@ function initTts(context: Context): OfflineTts { | ||
| 146 | // Example 8 | 157 | // Example 8 |
| 147 | // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 158 | // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 148 | // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker | 159 | // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker |
| 149 | - // modelDir = 'matcha-icefall-zh-baker' | ||
| 150 | - // acousticModelName = 'model-steps-3.onnx' | ||
| 151 | - // vocoder = 'hifigan_v2.onnx' | ||
| 152 | - // lexicon = 'lexicon.txt' | 160 | + // modelDir = 'matcha-icefall-zh-baker'; |
| 161 | + // acousticModelName = 'model-steps-3.onnx'; | ||
| 162 | + // vocoder = 'hifigan_v2.onnx'; | ||
| 163 | + // lexicon = 'lexicon.txt'; | ||
| 153 | // dictDir = 'dict'; | 164 | // dictDir = 'dict'; |
| 154 | // ruleFsts = `date.fst,phone.fst,number.fst`; | 165 | // ruleFsts = `date.fst,phone.fst,number.fst`; |
| 155 | 166 | ||
| 156 | // Example 9 | 167 | // Example 9 |
| 157 | // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | 168 | // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 158 | // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker | 169 | // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker |
| 159 | - // modelDir = 'matcha-icefall-en_US-ljspeech' | ||
| 160 | - // acousticModelName = 'model-steps-3.onnx' | ||
| 161 | - // vocoder = 'hifigan_v2.onnx' | 170 | + // modelDir = 'matcha-icefall-en_US-ljspeech'; |
| 171 | + // acousticModelName = 'model-steps-3.onnx'; | ||
| 172 | + // vocoder = 'hifigan_v2.onnx'; | ||
| 173 | + // dataDir = 'espeak-ng-data'; | ||
| 174 | + | ||
| 175 | + // Example 10 | ||
| 176 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html#kokoro-en-v0-19-english-11-speakers | ||
| 177 | + // modelDir = 'kokoro-en-v0_19'; | ||
| 178 | + // modelName = 'model.onnx'; | ||
| 179 | + // voices = 'voices.bin' | ||
| 162 | // dataDir = 'espeak-ng-data'; | 180 | // dataDir = 'espeak-ng-data'; |
| 163 | 181 | ||
| 164 | // ============================================================ | 182 | // ============================================================ |
| @@ -185,6 +203,10 @@ function initTts(context: Context): OfflineTts { | @@ -185,6 +203,10 @@ function initTts(context: Context): OfflineTts { | ||
| 185 | acousticModelName = modelDir + '/' + acousticModelName; | 203 | acousticModelName = modelDir + '/' + acousticModelName; |
| 186 | } | 204 | } |
| 187 | 205 | ||
| 206 | + if (voices != '') { | ||
| 207 | + voices = modelDir + '/' + voices; | ||
| 208 | + } | ||
| 209 | + | ||
| 188 | if (ruleFsts != '') { | 210 | if (ruleFsts != '') { |
| 189 | let fsts = ruleFsts.split(',') | 211 | let fsts = ruleFsts.split(',') |
| 190 | let tmp: string[] = []; | 212 | let tmp: string[] = []; |
| @@ -210,19 +232,24 @@ function initTts(context: Context): OfflineTts { | @@ -210,19 +232,24 @@ function initTts(context: Context): OfflineTts { | ||
| 210 | if (dataDir != '') { | 232 | if (dataDir != '') { |
| 211 | copyRawFileDirToSandbox(context, modelDir + '/' + dataDir) | 233 | copyRawFileDirToSandbox(context, modelDir + '/' + dataDir) |
| 212 | let sandboxPath: string = context.getApplicationContext().filesDir; | 234 | let sandboxPath: string = context.getApplicationContext().filesDir; |
| 213 | - dataDir = sandboxPath + '/' + modelDir + '/' + dataDir; | 235 | + dataDir = sandboxPath + '/' + modelDir + '/' + dataDir; |
| 214 | } | 236 | } |
| 215 | 237 | ||
| 216 | if (dictDir != '') { | 238 | if (dictDir != '') { |
| 217 | copyRawFileDirToSandbox(context, modelDir + '/' + dictDir) | 239 | copyRawFileDirToSandbox(context, modelDir + '/' + dictDir) |
| 218 | let sandboxPath: string = context.getApplicationContext().filesDir; | 240 | let sandboxPath: string = context.getApplicationContext().filesDir; |
| 219 | - dictDir = sandboxPath + '/' + modelDir + '/' + dictDir; | 241 | + dictDir = sandboxPath + '/' + modelDir + '/' + dictDir; |
| 220 | } | 242 | } |
| 221 | 243 | ||
| 222 | const tokens = modelDir + '/tokens.txt'; | 244 | const tokens = modelDir + '/tokens.txt'; |
| 223 | 245 | ||
| 224 | const config: OfflineTtsConfig = new OfflineTtsConfig(); | 246 | const config: OfflineTtsConfig = new OfflineTtsConfig(); |
| 225 | - config.model.vits.model = modelName; | 247 | + if (voices != '') { |
| 248 | + config.model.vits.model = ''; | ||
| 249 | + } else { | ||
| 250 | + config.model.vits.model = modelName; | ||
| 251 | + } | ||
| 252 | + | ||
| 226 | config.model.vits.lexicon = lexicon; | 253 | config.model.vits.lexicon = lexicon; |
| 227 | config.model.vits.tokens = tokens; | 254 | config.model.vits.tokens = tokens; |
| 228 | config.model.vits.dataDir = dataDir; | 255 | config.model.vits.dataDir = dataDir; |
| @@ -235,6 +262,15 @@ function initTts(context: Context): OfflineTts { | @@ -235,6 +262,15 @@ function initTts(context: Context): OfflineTts { | ||
| 235 | config.model.matcha.dataDir = dataDir; | 262 | config.model.matcha.dataDir = dataDir; |
| 236 | config.model.matcha.dictDir = dictDir; | 263 | config.model.matcha.dictDir = dictDir; |
| 237 | 264 | ||
| 265 | + if (voices != '') { | ||
| 266 | + config.model.kokoro.model = modelName; | ||
| 267 | + } else { | ||
| 268 | + config.model.kokoro.model = ''; | ||
| 269 | + } | ||
| 270 | + config.model.kokoro.voices = voices; | ||
| 271 | + config.model.kokoro.tokens = tokens; | ||
| 272 | + config.model.kokoro.dataDir = dataDir; | ||
| 273 | + | ||
| 238 | config.model.numThreads = 2; | 274 | config.model.numThreads = 2; |
| 239 | config.model.debug = true; | 275 | config.model.debug = true; |
| 240 | config.ruleFsts = ruleFsts; | 276 | config.ruleFsts = ruleFsts; |
| @@ -250,14 +286,12 @@ interface TtsCallbackData { | @@ -250,14 +286,12 @@ interface TtsCallbackData { | ||
| 250 | 286 | ||
| 251 | function callback(data: TtsCallbackData): number { | 287 | function callback(data: TtsCallbackData): number { |
| 252 | workerPort.postMessage({ | 288 | workerPort.postMessage({ |
| 253 | - 'msgType': 'tts-generate-partial', | ||
| 254 | - samples: Float32Array.from(data.samples), | ||
| 255 | - progress: data.progress, | 289 | + 'msgType': 'tts-generate-partial', samples: Float32Array.from(data.samples), progress: data.progress, |
| 256 | }); | 290 | }); |
| 257 | 291 | ||
| 258 | // 0 means to stop generating in C++ | 292 | // 0 means to stop generating in C++ |
| 259 | // 1 means to continue generating in C++ | 293 | // 1 means to continue generating in C++ |
| 260 | - return cancelled? 0 : 1; | 294 | + return cancelled ? 0 : 1; |
| 261 | } | 295 | } |
| 262 | 296 | ||
| 263 | /** | 297 | /** |
| @@ -272,9 +306,11 @@ workerPort.onmessage = (e: MessageEvents) => { | @@ -272,9 +306,11 @@ workerPort.onmessage = (e: MessageEvents) => { | ||
| 272 | if (msgType == 'init-tts' && !tts) { | 306 | if (msgType == 'init-tts' && !tts) { |
| 273 | const context = e.data['context'] as Context; | 307 | const context = e.data['context'] as Context; |
| 274 | tts = initTts(context); | 308 | tts = initTts(context); |
| 275 | - workerPort.postMessage({ 'msgType': 'init-tts-done', | 309 | + workerPort.postMessage({ |
| 310 | + 'msgType': 'init-tts-done', | ||
| 276 | sampleRate: tts.sampleRate, | 311 | sampleRate: tts.sampleRate, |
| 277 | numSpeakers: tts.numSpeakers, | 312 | numSpeakers: tts.numSpeakers, |
| 313 | + numThreads: tts.config.model.numThreads, | ||
| 278 | }); | 314 | }); |
| 279 | } | 315 | } |
| 280 | 316 | ||
| @@ -297,16 +333,14 @@ workerPort.onmessage = (e: MessageEvents) => { | @@ -297,16 +333,14 @@ workerPort.onmessage = (e: MessageEvents) => { | ||
| 297 | console.log(`sampleRate: ${ttsOutput.sampleRate}`); | 333 | console.log(`sampleRate: ${ttsOutput.sampleRate}`); |
| 298 | 334 | ||
| 299 | workerPort.postMessage({ | 335 | workerPort.postMessage({ |
| 300 | - 'msgType': 'tts-generate-done', | ||
| 301 | - samples: Float32Array.from(ttsOutput.samples), | 336 | + 'msgType': 'tts-generate-done', samples: Float32Array.from(ttsOutput.samples), |
| 302 | }); | 337 | }); |
| 303 | 338 | ||
| 304 | }); | 339 | }); |
| 305 | } else { | 340 | } else { |
| 306 | const ttsOutput: TtsOutput = tts.generate(input); | 341 | const ttsOutput: TtsOutput = tts.generate(input); |
| 307 | workerPort.postMessage({ | 342 | workerPort.postMessage({ |
| 308 | - 'msgType': 'tts-generate-done', | ||
| 309 | - samples: Float32Array.from(ttsOutput.samples), | 343 | + 'msgType': 'tts-generate-done', samples: Float32Array.from(ttsOutput.samples), |
| 310 | }); | 344 | }); |
| 311 | } | 345 | } |
| 312 | 346 |
-
请 注册 或 登录 后发表评论