Fangjun Kuang
Committed by GitHub

Add JavaScript API (WebAssembly) for KittenTTS (#2471)

@@ -9,6 +9,14 @@ git status @@ -9,6 +9,14 @@ git status
9 ls -lh 9 ls -lh
10 ls -lh node_modules 10 ls -lh node_modules
11 11
  12 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  13 +tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  14 +rm kitten-nano-en-v0_1-fp16.tar.bz2
  15 +
  16 +node ./test-offline-tts-kitten-en.js
  17 +ls -lh *.wav
  18 +rm -rf kitten-nano-en-v0_1-fp16
  19 +
12 # online asr 20 # online asr
13 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 21 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
14 tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 22 tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
@@ -85,9 +93,8 @@ rm gtcrn_simple.onnx @@ -85,9 +93,8 @@ rm gtcrn_simple.onnx
85 rm inp_16k.wav 93 rm inp_16k.wav
86 rm enhanced-16k.wav 94 rm enhanced-16k.wav
87 95
88 -  
89 # offline tts 96 # offline tts
90 -# 97 +
91 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 98 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
92 tar xf kokoro-multi-lang-v1_0.tar.bz2 99 tar xf kokoro-multi-lang-v1_0.tar.bz2
93 rm kokoro-multi-lang-v1_0.tar.bz2 100 rm kokoro-multi-lang-v1_0.tar.bz2
@@ -54,6 +54,22 @@ node ./test-offline-speaker-diarization.js @@ -54,6 +54,22 @@ node ./test-offline-speaker-diarization.js
54 54
55 In the following, we demonstrate how to run text-to-speech. 55 In the following, we demonstrate how to run text-to-speech.
56 56
  57 +## ./test-offline-tts-kitten-en.js
  58 +
  59 +[./test-offline-tts-kitten-en.js](./test-offline-tts-kitten-en.js) shows how to use
  60 +[kitten-nano-en-v0_1-fp16](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2)
  61 +for text-to-speech.
  62 +
  63 +You can use the following command to run it:
  64 +
  65 +```bash
  66 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
  67 +tar xf kitten-nano-en-v0_1-fp16.tar.bz2
  68 +rm kitten-nano-en-v0_1-fp16.tar.bz2
  69 +
  70 +node ./test-offline-tts-kitten-en.js
  71 +```
  72 +
57 ## ./test-offline-tts-kokoro-en.js 73 ## ./test-offline-tts-kokoro-en.js
58 74
59 [./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use 75 [./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +const sherpa_onnx = require('sherpa-onnx');
  4 +
  5 +function createOfflineTts() {
  6 + let offlineTtsKittenModelConfig = {
  7 + model: './kitten-nano-en-v0_1-fp16/model.fp16.onnx',
  8 + voices: './kitten-nano-en-v0_1-fp16/voices.bin',
  9 + tokens: './kitten-nano-en-v0_1-fp16/tokens.txt',
  10 + dataDir: './kitten-nano-en-v0_1-fp16/espeak-ng-data',
  11 + lengthScale: 1.0,
  12 + };
  13 + let offlineTtsModelConfig = {
  14 + offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
  15 + numThreads: 1,
  16 + debug: 1,
  17 + provider: 'cpu',
  18 + };
  19 +
  20 + let offlineTtsConfig = {
  21 + offlineTtsModelConfig: offlineTtsModelConfig,
  22 + maxNumSentences: 1,
  23 + };
  24 +
  25 + return sherpa_onnx.createOfflineTts(offlineTtsConfig);
  26 +}
  27 +
  28 +const tts = createOfflineTts();
  29 +const speakerId = 0;
  30 +const speed = 1.0;
  31 +const text =
  32 + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  33 +
  34 +const audio = tts.generate({text: text, sid: speakerId, speed: speed});
  35 +tts.save('./test-kitten-en.wav', audio);
  36 +console.log('Saved to test-kitten-en.wav successfully.');
  37 +tts.free();
@@ -16,6 +16,10 @@ function freeConfig(config, Module) { @@ -16,6 +16,10 @@ function freeConfig(config, Module) {
16 freeConfig(config.kokoro, Module) 16 freeConfig(config.kokoro, Module)
17 } 17 }
18 18
  19 + if ('kitten' in config) {
  20 + freeConfig(config.kitten, Module)
  21 + }
  22 +
19 Module._free(config.ptr); 23 Module._free(config.ptr);
20 } 24 }
21 25
@@ -204,6 +208,52 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { @@ -204,6 +208,52 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
204 } 208 }
205 } 209 }
206 210
  211 +function initSherpaOnnxOfflineTtsKittenModelConfig(config, Module) {
  212 + const modelLen = Module.lengthBytesUTF8(config.model) + 1;
  213 + const voicesLen = Module.lengthBytesUTF8(config.voices) + 1;
  214 + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
  215 + const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
  216 +
  217 + const n = modelLen + voicesLen + tokensLen + dataDirLen;
  218 +
  219 + const buffer = Module._malloc(n);
  220 +
  221 + const len = 5 * 4;
  222 + const ptr = Module._malloc(len);
  223 +
  224 + let offset = 0;
  225 + Module.stringToUTF8(config.model || '', buffer + offset, modelLen);
  226 + offset += modelLen;
  227 +
  228 + Module.stringToUTF8(config.voices || '', buffer + offset, voicesLen);
  229 + offset += voicesLen;
  230 +
  231 + Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen);
  232 + offset += tokensLen;
  233 +
  234 + Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen);
  235 + offset += dataDirLen;
  236 +
  237 + offset = 0;
  238 + Module.setValue(ptr, buffer + offset, 'i8*');
  239 + offset += modelLen;
  240 +
  241 + Module.setValue(ptr + 4, buffer + offset, 'i8*');
  242 + offset += voicesLen;
  243 +
  244 + Module.setValue(ptr + 8, buffer + offset, 'i8*');
  245 + offset += tokensLen;
  246 +
  247 + Module.setValue(ptr + 12, buffer + offset, 'i8*');
  248 + offset += dataDirLen;
  249 +
  250 + Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');
  251 +
  252 + return {
  253 + buffer: buffer, ptr: ptr, len: len,
  254 + }
  255 +}
  256 +
207 function initSherpaOnnxOfflineTtsModelConfig(config, Module) { 257 function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
208 if (!('offlineTtsVitsModelConfig' in config)) { 258 if (!('offlineTtsVitsModelConfig' in config)) {
209 config.offlineTtsVitsModelConfig = { 259 config.offlineTtsVitsModelConfig = {
@@ -244,6 +294,15 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -244,6 +294,15 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
244 }; 294 };
245 } 295 }
246 296
  297 + if (!('offlineTtsKittenModelConfig' in config)) {
  298 + config.offlineTtsKittenModelConfig = {
  299 + model: '',
  300 + voices: '',
  301 + tokens: '',
  302 + lengthScale: 1.0,
  303 + };
  304 + }
  305 +
247 306
248 const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( 307 const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
249 config.offlineTtsVitsModelConfig, Module); 308 config.offlineTtsVitsModelConfig, Module);
@@ -254,8 +313,11 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -254,8 +313,11 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
254 const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig( 313 const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig(
255 config.offlineTtsKokoroModelConfig, Module); 314 config.offlineTtsKokoroModelConfig, Module);
256 315
  316 + const kittenModelConfig = initSherpaOnnxOfflineTtsKittenModelConfig(
  317 + config.offlineTtsKittenModelConfig, Module);
  318 +
257 const len = vitsModelConfig.len + matchaModelConfig.len + 319 const len = vitsModelConfig.len + matchaModelConfig.len +
258 - kokoroModelConfig.len + 3 * 4; 320 + kokoroModelConfig.len + kittenModelConfig.len + 3 * 4;
259 321
260 const ptr = Module._malloc(len); 322 const ptr = Module._malloc(len);
261 323
@@ -281,9 +343,13 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -281,9 +343,13 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
281 Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset); 343 Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset);
282 offset += kokoroModelConfig.len; 344 offset += kokoroModelConfig.len;
283 345
  346 + Module._CopyHeap(kittenModelConfig.ptr, kittenModelConfig.len, ptr + offset);
  347 + offset += kittenModelConfig.len;
  348 +
284 return { 349 return {
285 buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, 350 buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
286 matcha: matchaModelConfig, kokoro: kokoroModelConfig, 351 matcha: matchaModelConfig, kokoro: kokoroModelConfig,
  352 + kitten: kittenModelConfig,
287 } 353 }
288 } 354 }
289 355
@@ -413,12 +479,22 @@ function createOfflineTts(Module, myConfig) { @@ -413,12 +479,22 @@ function createOfflineTts(Module, myConfig) {
413 lengthScale: 1.0, 479 lengthScale: 1.0,
414 dictDir: '', 480 dictDir: '',
415 lexicon: '', 481 lexicon: '',
  482 + lang: '',
  483 + };
  484 +
  485 + const offlineTtsKittenModelConfig = {
  486 + model: '',
  487 + voices: '',
  488 + tokens: '',
  489 + dataDir: '',
  490 + lengthScale: 1.0,
416 }; 491 };
417 492
418 const offlineTtsModelConfig = { 493 const offlineTtsModelConfig = {
419 offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, 494 offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
420 offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, 495 offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
421 offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig, 496 offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
  497 + offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
422 numThreads: 1, 498 numThreads: 1,
423 debug: 1, 499 debug: 1,
424 provider: 'cpu', 500 provider: 'cpu',
@@ -16,10 +16,12 @@ extern "C" { @@ -16,10 +16,12 @@ extern "C" {
16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); 16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); 17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
18 static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, ""); 18 static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, "");
  19 +static_assert(sizeof(SherpaOnnxOfflineTtsKittenModelConfig) == 5 * 4, "");
19 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == 20 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
20 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 21 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
21 sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 22 sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
22 - sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, 23 + sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4 +
  24 + sizeof(SherpaOnnxOfflineTtsKittenModelConfig),
23 ""); 25 "");
24 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == 26 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
25 sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4, 27 sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4,
@@ -30,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -30,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
30 auto vits_model_config = &tts_model_config->vits; 32 auto vits_model_config = &tts_model_config->vits;
31 auto matcha_model_config = &tts_model_config->matcha; 33 auto matcha_model_config = &tts_model_config->matcha;
32 auto kokoro = &tts_model_config->kokoro; 34 auto kokoro = &tts_model_config->kokoro;
  35 + auto kitten = &tts_model_config->kitten;
33 fprintf(stdout, "----------vits model config----------\n"); 36 fprintf(stdout, "----------vits model config----------\n");
34 fprintf(stdout, "model: %s\n", vits_model_config->model); 37 fprintf(stdout, "model: %s\n", vits_model_config->model);
35 fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); 38 fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
@@ -58,6 +61,14 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -58,6 +61,14 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
58 fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale); 61 fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale);
59 fprintf(stdout, "dict_dir: %s\n", kokoro->dict_dir); 62 fprintf(stdout, "dict_dir: %s\n", kokoro->dict_dir);
60 fprintf(stdout, "lexicon: %s\n", kokoro->lexicon); 63 fprintf(stdout, "lexicon: %s\n", kokoro->lexicon);
  64 + fprintf(stdout, "lang: %s\n", kokoro->lang);
  65 +
  66 + fprintf(stdout, "----------kitten model config----------\n");
  67 + fprintf(stdout, "model: %s\n", kitten->model);
  68 + fprintf(stdout, "voices: %s\n", kitten->voices);
  69 + fprintf(stdout, "tokens: %s\n", kitten->tokens);
  70 + fprintf(stdout, "data_dir: %s\n", kitten->data_dir);
  71 + fprintf(stdout, "length scale: %.3f\n", kitten->length_scale);
61 72
62 fprintf(stdout, "----------tts model config----------\n"); 73 fprintf(stdout, "----------tts model config----------\n");
63 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); 74 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);