Fangjun Kuang
Committed by GitHub

Add JavaScript (WebAssembly) API for Kokoro TTS models. (#1726)

@@ -10,7 +10,15 @@ ls -lh @@ -10,7 +10,15 @@ ls -lh
10 ls -lh node_modules 10 ls -lh node_modules
11 11
12 # offline tts 12 # offline tts
13 -# 13 +
  14 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  15 +tar xf kokoro-en-v0_19.tar.bz2
  16 +rm kokoro-en-v0_19.tar.bz2
  17 +
  18 +node ./test-offline-tts-kokoro-en.js
  19 +
  20 +ls -lh
  21 +
14 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 22 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
15 tar xvf matcha-icefall-zh-baker.tar.bz2 23 tar xvf matcha-icefall-zh-baker.tar.bz2
16 rm matcha-icefall-zh-baker.tar.bz2 24 rm matcha-icefall-zh-baker.tar.bz2
@@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js @@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js
42 42
43 In the following, we demonstrate how to run text-to-speech. 43 In the following, we demonstrate how to run text-to-speech.
44 44
  45 +## ./test-offline-tts-kokoro-en.js
  46 +
  47 +[./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use
  48 +[kokoro-en-v0_19](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2)
  49 +for text-to-speech.
  50 +
  51 +You can use the following commands to run it:
  52 +
  53 +```bash
  54 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  55 +tar xf kokoro-en-v0_19.tar.bz2
  56 +rm kokoro-en-v0_19.tar.bz2
  57 +
  58 +node ./test-offline-tts-kokoro-en.js
  59 +```
  60 +
45 ## ./test-offline-tts-matcha-zh.js 61 ## ./test-offline-tts-matcha-zh.js
46 62
47 [./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use 63 [./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +const sherpa_onnx = require('sherpa-onnx');
  4 +
  5 +function createOfflineTts() {
  6 + let offlineTtsKokoroModelConfig = {
  7 + model: './kokoro-en-v0_19/model.onnx',
  8 + voices: './kokoro-en-v0_19/voices.bin',
  9 + tokens: './kokoro-en-v0_19/tokens.txt',
  10 + dataDir: './kokoro-en-v0_19/espeak-ng-data',
  11 + lengthScale: 1.0,
  12 + };
  13 + let offlineTtsModelConfig = {
  14 + offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
  15 + numThreads: 1,
  16 + debug: 1,
  17 + provider: 'cpu',
  18 + };
  19 +
  20 + let offlineTtsConfig = {
  21 + offlineTtsModelConfig: offlineTtsModelConfig,
  22 + maxNumSentences: 1,
  23 + };
  24 +
  25 + return sherpa_onnx.createOfflineTts(offlineTtsConfig);
  26 +}
  27 +
  28 +const tts = createOfflineTts();
  29 +const speakerId = 0;
  30 +const speed = 1.0;
  31 +const text =
  32 + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  33 +
  34 +const audio = tts.generate({text: text, sid: speakerId, speed: speed});
  35 +tts.save('./test-kokoro-en.wav', audio);
  36 +console.log('Saved to test-kokoro-en.wav successfully.');
  37 +tts.free();
@@ -8,8 +8,12 @@ function freeConfig(config, Module) { @@ -8,8 +8,12 @@ function freeConfig(config, Module) {
8 freeConfig(config.config, Module) 8 freeConfig(config.config, Module)
9 } 9 }
10 10
11 - if ('config2' in config) {  
12 - freeConfig(config.config2, Module) 11 + if ('matcha' in config) {
  12 + freeConfig(config.matcha, Module)
  13 + }
  14 +
  15 + if ('kokoro' in config) {
  16 + freeConfig(config.kokoro, Module)
13 } 17 }
14 18
15 Module._free(config.ptr); 19 Module._free(config.ptr);
@@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) { @@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
132 } 136 }
133 } 137 }
134 138
  139 +function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
  140 + const modelLen = Module.lengthBytesUTF8(config.model) + 1;
  141 + const voicesLen = Module.lengthBytesUTF8(config.voices) + 1;
  142 + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
  143 + const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
  144 +
  145 + const n = modelLen + voicesLen + tokensLen + dataDirLen;
  146 +
  147 + const buffer = Module._malloc(n);
  148 +
  149 + const len = 5 * 4;
  150 + const ptr = Module._malloc(len);
  151 +
  152 + let offset = 0;
  153 + Module.stringToUTF8(config.model || '', buffer + offset, modelLen);
  154 + offset += modelLen;
  155 +
  156 + Module.stringToUTF8(config.voices || '', buffer + offset, voicesLen);
  157 + offset += voicesLen;
  158 +
  159 + Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen);
  160 + offset += tokensLen;
  161 +
  162 + Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen);
  163 + offset += dataDirLen;
  164 +
  165 + offset = 0;
  166 + Module.setValue(ptr, buffer + offset, 'i8*');
  167 + offset += modelLen;
  168 +
  169 + Module.setValue(ptr + 4, buffer + offset, 'i8*');
  170 + offset += voicesLen;
  171 +
  172 + Module.setValue(ptr + 8, buffer + offset, 'i8*');
  173 + offset += tokensLen;
  174 +
  175 + Module.setValue(ptr + 12, buffer + offset, 'i8*');
  176 + offset += dataDirLen;
  177 +
  178 + Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');
  179 +
  180 + return {
  181 + buffer: buffer, ptr: ptr, len: len,
  182 + }
  183 +}
  184 +
135 function initSherpaOnnxOfflineTtsModelConfig(config, Module) { 185 function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
136 if (!('offlineTtsVitsModelConfig' in config)) { 186 if (!('offlineTtsVitsModelConfig' in config)) {
137 config.offlineTtsVitsModelConfig = { 187 config.offlineTtsVitsModelConfig = {
@@ -159,6 +209,16 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -159,6 +209,16 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
159 }; 209 };
160 } 210 }
161 211
  212 + if (!('offlineTtsKokoroModelConfig' in config)) {
  213 + config.offlineTtsKokoroModelConfig = {
  214 + model: '',
  215 + voices: '',
  216 + tokens: '',
  217 + lengthScale: 1.0,
  218 + dataDir: '',
  219 + };
  220 + }
  221 +
162 222
163 const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( 223 const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
164 config.offlineTtsVitsModelConfig, Module); 224 config.offlineTtsVitsModelConfig, Module);
@@ -166,7 +226,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -166,7 +226,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
166 const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig( 226 const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
167 config.offlineTtsMatchaModelConfig, Module); 227 config.offlineTtsMatchaModelConfig, Module);
168 228
169 - const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4; 229 + const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig(
  230 + config.offlineTtsKokoroModelConfig, Module);
  231 +
  232 + const len = vitsModelConfig.len + matchaModelConfig.len +
  233 + kokoroModelConfig.len + 3 * 4;
  234 +
170 const ptr = Module._malloc(len); 235 const ptr = Module._malloc(len);
171 236
172 let offset = 0; 237 let offset = 0;
@@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
188 Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset); 253 Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
189 offset += matchaModelConfig.len; 254 offset += matchaModelConfig.len;
190 255
  256 + Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset);
  257 + offset += kokoroModelConfig.len;
  258 +
191 return { 259 return {
192 buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, 260 buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
193 - config2: matchaModelConfig 261 + matcha: matchaModelConfig, kokoro: kokoroModelConfig,
194 } 262 }
195 } 263 }
196 264
@@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) { @@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) {
308 lengthScale: 1.0, 376 lengthScale: 1.0,
309 }; 377 };
310 378
  379 + const offlineTtsKokoroModelConfig = {
  380 + model: '',
  381 + voices: '',
  382 + tokens: '',
  383 + dataDir: '',
  384 + lengthScale: 1.0,
  385 + };
  386 +
311 const offlineTtsModelConfig = { 387 const offlineTtsModelConfig = {
312 offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, 388 offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
313 offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, 389 offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
  390 + offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
314 numThreads: 1, 391 numThreads: 1,
315 debug: 1, 392 debug: 1,
316 provider: 'cpu', 393 provider: 'cpu',
@@ -15,9 +15,11 @@ extern "C" { @@ -15,9 +15,11 @@ extern "C" {
15 15
16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); 16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); 17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
  18 +static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 5 * 4, "");
18 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == 19 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
19 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 20 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
20 - sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4, 21 + sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
  22 + sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
21 ""); 23 "");
22 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == 24 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
23 sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, 25 sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
@@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
27 auto tts_model_config = &tts_config->model; 29 auto tts_model_config = &tts_config->model;
28 auto vits_model_config = &tts_model_config->vits; 30 auto vits_model_config = &tts_model_config->vits;
29 auto matcha_model_config = &tts_model_config->matcha; 31 auto matcha_model_config = &tts_model_config->matcha;
  32 + auto kokoro = &tts_model_config->kokoro;
30 fprintf(stdout, "----------vits model config----------\n"); 33 fprintf(stdout, "----------vits model config----------\n");
31 fprintf(stdout, "model: %s\n", vits_model_config->model); 34 fprintf(stdout, "model: %s\n", vits_model_config->model);
32 fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); 35 fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
@@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
47 fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale); 50 fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
48 fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir); 51 fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);
49 52
  53 + fprintf(stdout, "----------kokoro model config----------\n");
  54 + fprintf(stdout, "model: %s\n", kokoro->model);
  55 + fprintf(stdout, "voices: %s\n", kokoro->voices);
  56 + fprintf(stdout, "tokens: %s\n", kokoro->tokens);
  57 + fprintf(stdout, "data_dir: %s\n", kokoro->data_dir);
  58 + fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale);
  59 +
50 fprintf(stdout, "----------tts model config----------\n"); 60 fprintf(stdout, "----------tts model config----------\n");
51 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); 61 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
52 fprintf(stdout, "debug: %d\n", tts_model_config->debug); 62 fprintf(stdout, "debug: %d\n", tts_model_config->debug);