Fangjun Kuang
Committed by GitHub

Support Kokoro TTS for HarmonyOS. (#1743)

1 /** 1 /**
2 * Use these variables when you tailor your ArkTS code. They must be of the const type. 2 * Use these variables when you tailor your ArkTS code. They must be of the const type.
3 */ 3 */
4 -export const HAR_VERSION = '1.10.37'; 4 +export const HAR_VERSION = '1.10.40';
5 export const BUILD_MODE_NAME = 'debug'; 5 export const BUILD_MODE_NAME = 'debug';
6 export const DEBUG = true; 6 export const DEBUG = true;
7 export const TARGET_NAME = 'default'; 7 export const TARGET_NAME = 'default';
@@ -31,7 +31,8 @@ export { OnlineStream, @@ -31,7 +31,8 @@ export { OnlineStream,
31 OnlineRecognizer, 31 OnlineRecognizer,
32 } from './src/main/ets/components/StreamingAsr'; 32 } from './src/main/ets/components/StreamingAsr';
33 33
34 -export { OfflineTtsMatchaModelConfig, 34 +export { OfflineTtsKokoroModelConfig,
  35 + OfflineTtsMatchaModelConfig,
35 OfflineTtsVitsModelConfig, 36 OfflineTtsVitsModelConfig,
36 OfflineTtsModelConfig, 37 OfflineTtsModelConfig,
37 OfflineTtsConfig, 38 OfflineTtsConfig,
@@ -28,9 +28,18 @@ export class OfflineTtsMatchaModelConfig { @@ -28,9 +28,18 @@ export class OfflineTtsMatchaModelConfig {
28 public lengthScale: number = 1.0; 28 public lengthScale: number = 1.0;
29 } 29 }
30 30
  31 +export class OfflineTtsKokoroModelConfig {
  32 + public model: string = '';
  33 + public voices: string = '';
  34 + public tokens: string = '';
  35 + public dataDir: string = '';
  36 + public lengthScale: number = 1.0;
  37 +}
  38 +
31 export class OfflineTtsModelConfig { 39 export class OfflineTtsModelConfig {
32 public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig(); 40 public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig();
33 public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig(); 41 public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig();
  42 + public kokoro: OfflineTtsKokoroModelConfig = new OfflineTtsKokoroModelConfig();
34 public numThreads: number = 1; 43 public numThreads: number = 1;
35 public debug: boolean = false; 44 public debug: boolean = false;
36 public provider: string = 'cpu'; 45 public provider: string = 'cpu';
@@ -66,6 +66,7 @@ struct Index { @@ -66,6 +66,7 @@ struct Index {
66 @State initTtsDone: boolean = false; 66 @State initTtsDone: boolean = false;
67 @State ttsGeneratedDone: boolean = true; 67 @State ttsGeneratedDone: boolean = true;
68 @State numSpeakers: number = 1; 68 @State numSpeakers: number = 1;
  69 + @State numThreads: number = 1;
69 @State initAudioDone: boolean = false; 70 @State initAudioDone: boolean = false;
70 private controller: TabsController = new TabsController(); 71 private controller: TabsController = new TabsController();
71 private cancelled: boolean = false; 72 private cancelled: boolean = false;
@@ -135,6 +136,7 @@ struct Index { @@ -135,6 +136,7 @@ struct Index {
135 this.info = 'Model initialized!\nPlease enter text and press start.'; 136 this.info = 'Model initialized!\nPlease enter text and press start.';
136 this.sampleRate = e.data['sampleRate'] as number; 137 this.sampleRate = e.data['sampleRate'] as number;
137 this.numSpeakers = e.data['numSpeakers'] as number; 138 this.numSpeakers = e.data['numSpeakers'] as number;
  139 + this.numThreads = e.data['numThreads'] as number;
138 140
139 this.initTtsDone = true; 141 this.initTtsDone = true;
140 } 142 }
@@ -177,6 +179,7 @@ struct Index { @@ -177,6 +179,7 @@ struct Index {
177 this.info = `Audio duration: ${audioDuration} s 179 this.info = `Audio duration: ${audioDuration} s
178 Elapsed: ${elapsedSeconds} s 180 Elapsed: ${elapsedSeconds} s
179 RTF = ${elapsedSeconds.toFixed(2)}/${audioDuration.toFixed(2)} = ${RTF.toFixed(3)} 181 RTF = ${elapsedSeconds.toFixed(2)}/${audioDuration.toFixed(2)} = ${RTF.toFixed(3)}
  182 +Number of threads: ${this.numThreads}
180 `; 183 `;
181 if (this.cancelled) { 184 if (this.cancelled) {
182 this.info += '\nCancelled.'; 185 this.info += '\nCancelled.';
@@ -2,7 +2,7 @@ import worker, { ThreadWorkerGlobalScope, MessageEvents, ErrorEvent } from '@oho @@ -2,7 +2,7 @@ import worker, { ThreadWorkerGlobalScope, MessageEvents, ErrorEvent } from '@oho
2 2
3 import { fileIo as fs } from '@kit.CoreFileKit'; 3 import { fileIo as fs } from '@kit.CoreFileKit';
4 4
5 -import {OfflineTtsConfig, OfflineTts, listRawfileDir, TtsInput, TtsOutput} from 'sherpa_onnx'; 5 +import { OfflineTtsConfig, OfflineTts, listRawfileDir, TtsInput, TtsOutput } from 'sherpa_onnx';
6 import { buffer } from '@kit.ArkTS'; 6 import { buffer } from '@kit.ArkTS';
7 7
8 const workerPort: ThreadWorkerGlobalScope = worker.workerPort; 8 const workerPort: ThreadWorkerGlobalScope = worker.workerPort;
@@ -42,18 +42,22 @@ function copyRawFileDirToSandbox(context: Context, srcDir: string) { @@ -42,18 +42,22 @@ function copyRawFileDirToSandbox(context: Context, srcDir: string) {
42 } 42 }
43 } 43 }
44 44
45 -function copyRawFileToSandbox(context: Context, src: string, dst: string) {  
46 - // see https://blog.csdn.net/weixin_44640245/article/details/142634846  
47 - // https://developer.huawei.com/consumer/cn/doc/harmonyos-guides-V5/rawfile-guidelines-V5 45 +function copyRawFileToSandbox(context: Context, src: string,
  46 + dst: string) {
  47 + /* see
  48 + https://blog.csdn.net/weixin_44640245/article/details/142634846
  49 + https://developer.huawei.com/consumer/cn/doc/harmonyos-guides-V5/rawfile-guidelines-V5
  50 + */
48 let uint8Array: Uint8Array = context.resourceManager.getRawFileContentSync(src); 51 let uint8Array: Uint8Array = context.resourceManager.getRawFileContentSync(src);
49 52
50 // https://developer.huawei.com/consumer/cn/doc/harmonyos-references-V5/js-apis-file-fs-V5#fsmkdir 53 // https://developer.huawei.com/consumer/cn/doc/harmonyos-references-V5/js-apis-file-fs-V5#fsmkdir
51 let sandboxPath: string = context.getApplicationContext().filesDir; 54 let sandboxPath: string = context.getApplicationContext().filesDir;
52 - let filepath = sandboxPath + '/' + dst; 55 + let filepath = sandboxPath + '/' + dst;
53 56
54 if (fs.accessSync(filepath)) { 57 if (fs.accessSync(filepath)) {
55 - // if the destination exists and has the expected file size,  
56 - // then we skip copying it 58 + /* if the destination exists and has the expected file size
  59 + then we skip copying it
  60 + */
57 let stat = fs.statSync(filepath); 61 let stat = fs.statSync(filepath);
58 if (stat.size == uint8Array.length) { 62 if (stat.size == uint8Array.length) {
59 return; 63 return;
@@ -66,11 +70,12 @@ function copyRawFileToSandbox(context: Context, src: string, dst: string) { @@ -66,11 +70,12 @@ function copyRawFileToSandbox(context: Context, src: string, dst: string) {
66 } 70 }
67 71
68 function initTts(context: Context): OfflineTts { 72 function initTts(context: Context): OfflineTts {
69 - // Such a design is to make it easier to build flutter APPs with  
70 - // github actions for a variety of tts models  
71 - //  
72 - // See https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/flutter/generate-tts.py  
73 - // for details 73 + /* Such a design is to make it easier to build flutter APPs with
  74 + github actions for a variety of tts models
  75 +
  76 + See https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/flutter/generate-tts.py
  77 + for details
  78 + */
74 79
75 let modelDir = ''; 80 let modelDir = '';
76 81
@@ -83,13 +88,19 @@ function initTts(context: Context): OfflineTts { @@ -83,13 +88,19 @@ function initTts(context: Context): OfflineTts {
83 let vocoder = ''; 88 let vocoder = '';
84 // for Matcha end 89 // for Matcha end
85 90
  91 + // for Kokoro begin
  92 + let voices = '';
  93 + // for Kokoro end
  94 +
86 let ruleFsts = ''; 95 let ruleFsts = '';
87 let ruleFars = ''; 96 let ruleFars = '';
88 let lexicon = ''; 97 let lexicon = '';
89 let dataDir = ''; 98 let dataDir = '';
90 let dictDir = ''; 99 let dictDir = '';
91 - // You can select an example below and change it according to match your  
92 - // selected tts model 100 + /*
  101 + You can select an example below and change it to match your
  102 + selected tts model
  103 + */
93 104
94 // ============================================================ 105 // ============================================================
95 // Your change starts here 106 // Your change starts here
@@ -146,19 +157,26 @@ function initTts(context: Context): OfflineTts { @@ -146,19 +157,26 @@ function initTts(context: Context): OfflineTts {
146 // Example 8 157 // Example 8
147 // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models 158 // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
148 // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker 159 // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
149 - // modelDir = 'matcha-icefall-zh-baker'  
150 - // acousticModelName = 'model-steps-3.onnx'  
151 - // vocoder = 'hifigan_v2.onnx'  
152 - // lexicon = 'lexicon.txt' 160 + // modelDir = 'matcha-icefall-zh-baker';
  161 + // acousticModelName = 'model-steps-3.onnx';
  162 + // vocoder = 'hifigan_v2.onnx';
  163 + // lexicon = 'lexicon.txt';
153 // dictDir = 'dict'; 164 // dictDir = 'dict';
154 // ruleFsts = `date.fst,phone.fst,number.fst`; 165 // ruleFsts = `date.fst,phone.fst,number.fst`;
155 166
156 // Example 9 167 // Example 9
157 // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models 168 // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
158 // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker 169 // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
159 - // modelDir = 'matcha-icefall-en_US-ljspeech'  
160 - // acousticModelName = 'model-steps-3.onnx'  
161 - // vocoder = 'hifigan_v2.onnx' 170 + // modelDir = 'matcha-icefall-en_US-ljspeech';
  171 + // acousticModelName = 'model-steps-3.onnx';
  172 + // vocoder = 'hifigan_v2.onnx';
  173 + // dataDir = 'espeak-ng-data';
  174 +
  175 + // Example 10
  176 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html#kokoro-en-v0-19-english-11-speakers
  177 + // modelDir = 'kokoro-en-v0_19';
  178 + // modelName = 'model.onnx';
  179 + // voices = 'voices.bin';
162 // dataDir = 'espeak-ng-data'; 180 // dataDir = 'espeak-ng-data';
163 181
164 // ============================================================ 182 // ============================================================
@@ -185,6 +203,10 @@ function initTts(context: Context): OfflineTts { @@ -185,6 +203,10 @@ function initTts(context: Context): OfflineTts {
185 acousticModelName = modelDir + '/' + acousticModelName; 203 acousticModelName = modelDir + '/' + acousticModelName;
186 } 204 }
187 205
  206 + if (voices != '') {
  207 + voices = modelDir + '/' + voices;
  208 + }
  209 +
188 if (ruleFsts != '') { 210 if (ruleFsts != '') {
189 let fsts = ruleFsts.split(',') 211 let fsts = ruleFsts.split(',')
190 let tmp: string[] = []; 212 let tmp: string[] = [];
@@ -210,19 +232,24 @@ function initTts(context: Context): OfflineTts { @@ -210,19 +232,24 @@ function initTts(context: Context): OfflineTts {
210 if (dataDir != '') { 232 if (dataDir != '') {
211 copyRawFileDirToSandbox(context, modelDir + '/' + dataDir) 233 copyRawFileDirToSandbox(context, modelDir + '/' + dataDir)
212 let sandboxPath: string = context.getApplicationContext().filesDir; 234 let sandboxPath: string = context.getApplicationContext().filesDir;
213 - dataDir = sandboxPath + '/' + modelDir + '/' + dataDir; 235 + dataDir = sandboxPath + '/' + modelDir + '/' + dataDir;
214 } 236 }
215 237
216 if (dictDir != '') { 238 if (dictDir != '') {
217 copyRawFileDirToSandbox(context, modelDir + '/' + dictDir) 239 copyRawFileDirToSandbox(context, modelDir + '/' + dictDir)
218 let sandboxPath: string = context.getApplicationContext().filesDir; 240 let sandboxPath: string = context.getApplicationContext().filesDir;
219 - dictDir = sandboxPath + '/' + modelDir + '/' + dictDir; 241 + dictDir = sandboxPath + '/' + modelDir + '/' + dictDir;
220 } 242 }
221 243
222 const tokens = modelDir + '/tokens.txt'; 244 const tokens = modelDir + '/tokens.txt';
223 245
224 const config: OfflineTtsConfig = new OfflineTtsConfig(); 246 const config: OfflineTtsConfig = new OfflineTtsConfig();
225 - config.model.vits.model = modelName; 247 + if (voices != '') {
  248 + config.model.vits.model = '';
  249 + } else {
  250 + config.model.vits.model = modelName;
  251 + }
  252 +
226 config.model.vits.lexicon = lexicon; 253 config.model.vits.lexicon = lexicon;
227 config.model.vits.tokens = tokens; 254 config.model.vits.tokens = tokens;
228 config.model.vits.dataDir = dataDir; 255 config.model.vits.dataDir = dataDir;
@@ -235,6 +262,15 @@ function initTts(context: Context): OfflineTts { @@ -235,6 +262,15 @@ function initTts(context: Context): OfflineTts {
235 config.model.matcha.dataDir = dataDir; 262 config.model.matcha.dataDir = dataDir;
236 config.model.matcha.dictDir = dictDir; 263 config.model.matcha.dictDir = dictDir;
237 264
  265 + if (voices != '') {
  266 + config.model.kokoro.model = modelName;
  267 + } else {
  268 + config.model.kokoro.model = '';
  269 + }
  270 + config.model.kokoro.voices = voices;
  271 + config.model.kokoro.tokens = tokens;
  272 + config.model.kokoro.dataDir = dataDir;
  273 +
238 config.model.numThreads = 2; 274 config.model.numThreads = 2;
239 config.model.debug = true; 275 config.model.debug = true;
240 config.ruleFsts = ruleFsts; 276 config.ruleFsts = ruleFsts;
@@ -250,14 +286,12 @@ interface TtsCallbackData { @@ -250,14 +286,12 @@ interface TtsCallbackData {
250 286
251 function callback(data: TtsCallbackData): number { 287 function callback(data: TtsCallbackData): number {
252 workerPort.postMessage({ 288 workerPort.postMessage({
253 - 'msgType': 'tts-generate-partial',  
254 - samples: Float32Array.from(data.samples),  
255 - progress: data.progress, 289 + 'msgType': 'tts-generate-partial', samples: Float32Array.from(data.samples), progress: data.progress,
256 }); 290 });
257 291
258 // 0 means to stop generating in C++ 292 // 0 means to stop generating in C++
259 // 1 means to continue generating in C++ 293 // 1 means to continue generating in C++
260 - return cancelled? 0 : 1; 294 + return cancelled ? 0 : 1;
261 } 295 }
262 296
263 /** 297 /**
@@ -272,9 +306,11 @@ workerPort.onmessage = (e: MessageEvents) => { @@ -272,9 +306,11 @@ workerPort.onmessage = (e: MessageEvents) => {
272 if (msgType == 'init-tts' && !tts) { 306 if (msgType == 'init-tts' && !tts) {
273 const context = e.data['context'] as Context; 307 const context = e.data['context'] as Context;
274 tts = initTts(context); 308 tts = initTts(context);
275 - workerPort.postMessage({ 'msgType': 'init-tts-done', 309 + workerPort.postMessage({
  310 + 'msgType': 'init-tts-done',
276 sampleRate: tts.sampleRate, 311 sampleRate: tts.sampleRate,
277 numSpeakers: tts.numSpeakers, 312 numSpeakers: tts.numSpeakers,
  313 + numThreads: tts.config.model.numThreads,
278 }); 314 });
279 } 315 }
280 316
@@ -297,16 +333,14 @@ workerPort.onmessage = (e: MessageEvents) => { @@ -297,16 +333,14 @@ workerPort.onmessage = (e: MessageEvents) => {
297 console.log(`sampleRate: ${ttsOutput.sampleRate}`); 333 console.log(`sampleRate: ${ttsOutput.sampleRate}`);
298 334
299 workerPort.postMessage({ 335 workerPort.postMessage({
300 - 'msgType': 'tts-generate-done',  
301 - samples: Float32Array.from(ttsOutput.samples), 336 + 'msgType': 'tts-generate-done', samples: Float32Array.from(ttsOutput.samples),
302 }); 337 });
303 338
304 }); 339 });
305 } else { 340 } else {
306 const ttsOutput: TtsOutput = tts.generate(input); 341 const ttsOutput: TtsOutput = tts.generate(input);
307 workerPort.postMessage({ 342 workerPort.postMessage({
308 - 'msgType': 'tts-generate-done',  
309 - samples: Float32Array.from(ttsOutput.samples), 343 + 'msgType': 'tts-generate-done', samples: Float32Array.from(ttsOutput.samples),
310 }); 344 });
311 } 345 }
312 346