Fangjun Kuang
Committed by GitHub

Add JavaScript API (WebAssembly) for KittenTTS (#2471)

... ... @@ -9,6 +9,14 @@ git status
ls -lh
ls -lh node_modules
# offline tts (kitten): download the model, run the example, then clean up
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
tar xf kitten-nano-en-v0_1-fp16.tar.bz2
rm kitten-nano-en-v0_1-fp16.tar.bz2
node ./test-offline-tts-kitten-en.js
ls -lh *.wav
rm -rf kitten-nano-en-v0_1-fp16
# online asr
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
... ... @@ -85,9 +93,8 @@ rm gtcrn_simple.onnx
rm inp_16k.wav
rm enhanced-16k.wav
# offline tts
#
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
tar xf kokoro-multi-lang-v1_0.tar.bz2
rm kokoro-multi-lang-v1_0.tar.bz2
... ...
... ... @@ -54,6 +54,22 @@ node ./test-offline-speaker-diarization.js
In the following, we demonstrate how to run text-to-speech.
## ./test-offline-tts-kitten-en.js
[./test-offline-tts-kitten-en.js](./test-offline-tts-kitten-en.js) shows how to use
[kitten-nano-en-v0_1-fp16](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2)
for text-to-speech.
You can use the following command to run it:
```bash
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
tar xf kitten-nano-en-v0_1-fp16.tar.bz2
rm kitten-nano-en-v0_1-fp16.tar.bz2
node ./test-offline-tts-kitten-en.js
```
## ./test-offline-tts-kokoro-en.js
[./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use
... ...
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
const sherpa_onnx = require('sherpa-onnx');
/**
 * Create an offline TTS instance configured for the
 * kitten-nano-en-v0_1-fp16 model.
 *
 * All model paths are relative to the current working directory, so the
 * model archive must have been extracted there before this is called.
 *
 * @returns whatever sherpa_onnx.createOfflineTts() returns for the config.
 */
function createOfflineTts() {
  return sherpa_onnx.createOfflineTts({
    offlineTtsModelConfig: {
      offlineTtsKittenModelConfig: {
        model: './kitten-nano-en-v0_1-fp16/model.fp16.onnx',
        voices: './kitten-nano-en-v0_1-fp16/voices.bin',
        tokens: './kitten-nano-en-v0_1-fp16/tokens.txt',
        dataDir: './kitten-nano-en-v0_1-fp16/espeak-ng-data',
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  });
}
// Synthesize one sample sentence with speaker id 0 at speed 1.0, write it to
// a wave file in the current directory, and release the TTS instance.
const tts = createOfflineTts();

const generationRequest = {
  text:
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.',
  sid: 0,
  speed: 1.0,
};

const audio = tts.generate(generationRequest);
tts.save('./test-kitten-en.wav', audio);
console.log('Saved to test-kitten-en.wav successfully.');

tts.free();
... ...
... ... @@ -16,6 +16,10 @@ function freeConfig(config, Module) {
freeConfig(config.kokoro, Module)
}
if ('kitten' in config) {
freeConfig(config.kitten, Module)
}
Module._free(config.ptr);
}
... ... @@ -204,6 +208,52 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
}
}
/**
 * Serialize a KittenTTS model config into the WebAssembly heap.
 *
 * Two allocations are made with Module._malloc():
 *  - `buffer`: the four NUL-terminated UTF-8 strings (model, voices, tokens,
 *    dataDir) packed back to back;
 *  - `ptr`: a 5 * 4 byte record holding four pointers into `buffer`
 *    (offsets 0, 4, 8, 12) followed by the length scale as a float
 *    (offset 16).
 *
 * Neither allocation is freed here; the caller owns both.
 *
 * @param config Object with optional string fields `model`, `voices`,
 *     `tokens`, `dataDir` and optional number `lengthScale`. A missing or
 *     empty string field is written as the empty string. A missing or zero
 *     `lengthScale` is written as 1.0.
 * @param Module The Emscripten module providing lengthBytesUTF8,
 *     stringToUTF8, setValue and _malloc.
 * @returns {{buffer: number, ptr: number, len: number}}
 */
function initSherpaOnnxOfflineTtsKittenModelConfig(config, Module) {
  // Normalize every string exactly once so that the sizes used for the
  // allocation always match the strings that are copied afterwards. Sizing
  // `config.model` / `config.voices` directly throws when they are absent.
  const fields = [
    config.model || '',
    config.voices || '',
    config.tokens || '',
    config.dataDir || '',
  ];

  // +1 for the terminating NUL of each string.
  const sizes = [];
  let n = 0;
  for (const s of fields) {
    const size = Module.lengthBytesUTF8(s) + 1;
    sizes.push(size);
    n += size;
  }

  const buffer = Module._malloc(n);

  const len = 5 * 4;
  const ptr = Module._malloc(len);

  let offset = 0;
  for (let i = 0; i < fields.length; ++i) {
    Module.stringToUTF8(fields[i], buffer + offset, sizes[i]);
    // The i-th pointer slot refers to the i-th packed string.
    Module.setValue(ptr + 4 * i, buffer + offset, 'i8*');
    offset += sizes[i];
  }

  Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
if (!('offlineTtsVitsModelConfig' in config)) {
config.offlineTtsVitsModelConfig = {
... ... @@ -244,6 +294,15 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
};
}
if (!('offlineTtsKittenModelConfig' in config)) {
config.offlineTtsKittenModelConfig = {
model: '',
voices: '',
tokens: '',
lengthScale: 1.0,
};
}
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
config.offlineTtsVitsModelConfig, Module);
... ... @@ -254,8 +313,11 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig(
config.offlineTtsKokoroModelConfig, Module);
const kittenModelConfig = initSherpaOnnxOfflineTtsKittenModelConfig(
config.offlineTtsKittenModelConfig, Module);
const len = vitsModelConfig.len + matchaModelConfig.len +
kokoroModelConfig.len + 3 * 4;
kokoroModelConfig.len + kittenModelConfig.len + 3 * 4;
const ptr = Module._malloc(len);
... ... @@ -281,9 +343,13 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset);
offset += kokoroModelConfig.len;
Module._CopyHeap(kittenModelConfig.ptr, kittenModelConfig.len, ptr + offset);
offset += kittenModelConfig.len;
return {
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
matcha: matchaModelConfig, kokoro: kokoroModelConfig,
kitten: kittenModelConfig,
}
}
... ... @@ -413,12 +479,22 @@ function createOfflineTts(Module, myConfig) {
lengthScale: 1.0,
dictDir: '',
lexicon: '',
lang: '',
};
const offlineTtsKittenModelConfig = {
model: '',
voices: '',
tokens: '',
dataDir: '',
lengthScale: 1.0,
};
const offlineTtsModelConfig = {
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
numThreads: 1,
debug: 1,
provider: 'cpu',
... ...
... ... @@ -16,10 +16,12 @@ extern "C" {
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsKittenModelConfig) == 5 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4 +
sizeof(SherpaOnnxOfflineTtsKittenModelConfig),
"");
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4,
... ... @@ -30,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
auto vits_model_config = &tts_model_config->vits;
auto matcha_model_config = &tts_model_config->matcha;
auto kokoro = &tts_model_config->kokoro;
auto kitten = &tts_model_config->kitten;
fprintf(stdout, "----------vits model config----------\n");
fprintf(stdout, "model: %s\n", vits_model_config->model);
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
... ... @@ -58,6 +61,14 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale);
fprintf(stdout, "dict_dir: %s\n", kokoro->dict_dir);
fprintf(stdout, "lexicon: %s\n", kokoro->lexicon);
fprintf(stdout, "lang: %s\n", kokoro->lang);
fprintf(stdout, "----------kitten model config----------\n");
fprintf(stdout, "model: %s\n", kitten->model);
fprintf(stdout, "voices: %s\n", kitten->voices);
fprintf(stdout, "tokens: %s\n", kitten->tokens);
fprintf(stdout, "data_dir: %s\n", kitten->data_dir);
fprintf(stdout, "length scale: %.3f\n", kitten->length_scale);
fprintf(stdout, "----------tts model config----------\n");
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
... ...