Fangjun Kuang
Committed by GitHub

Add JavaScript API (WebAssembly) for Kokoro TTS 1.0 (#1809)

@@ -10,12 +10,21 @@ ls -lh @@ -10,12 +10,21 @@ ls -lh
10 ls -lh node_modules 10 ls -lh node_modules
11 11
12 # offline tts 12 # offline tts
  13 +#
  14 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
  15 +tar xf kokoro-multi-lang-v1_0.tar.bz2
  16 +rm kokoro-multi-lang-v1_0.tar.bz2
  17 +
  18 +node ./test-offline-tts-kokoro-zh-en.js
  19 +ls -lh *.wav
  20 +rm -rf kokoro-multi-lang-v1_0
13 21
14 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 22 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
15 tar xf kokoro-en-v0_19.tar.bz2 23 tar xf kokoro-en-v0_19.tar.bz2
16 rm kokoro-en-v0_19.tar.bz2 24 rm kokoro-en-v0_19.tar.bz2
17 25
18 node ./test-offline-tts-kokoro-en.js 26 node ./test-offline-tts-kokoro-en.js
  27 +rm -rf kokoro-en-v0_19
19 28
20 ls -lh 29 ls -lh
21 30
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +const sherpa_onnx = require('sherpa-onnx');
  4 +
  5 +function createOfflineTts() {
  6 + let offlineTtsKokoroModelConfig = {
  7 + model: './kokoro-multi-lang-v1_0/model.onnx',
  8 + voices: './kokoro-multi-lang-v1_0/voices.bin',
  9 + tokens: './kokoro-multi-lang-v1_0/tokens.txt',
  10 + dataDir: './kokoro-multi-lang-v1_0/espeak-ng-data',
  11 + dictDir: './kokoro-multi-lang-v1_0/dict',
  12 + lexicon:
  13 + './kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt',
  14 + lengthScale: 1.0,
  15 + };
  16 + let offlineTtsModelConfig = {
  17 + offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
  18 + numThreads: 1,
  19 + debug: 1,
  20 + provider: 'cpu',
  21 + };
  22 +
  23 + let offlineTtsConfig = {
  24 + offlineTtsModelConfig: offlineTtsModelConfig,
  25 + maxNumSentences: 1,
  26 + };
  27 +
  28 + return sherpa_onnx.createOfflineTts(offlineTtsConfig);
  29 +}
  30 +
  31 +const tts = createOfflineTts();
  32 +const speakerId = 49;
  33 +const speed = 1.0;
  34 +const text =
  35 + '中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?'
  36 +
  37 +const audio = tts.generate({text: text, sid: speakerId, speed: speed});
  38 +tts.save('./test-kokoro-zh-en-49.wav', audio);
  39 +console.log('Saved to test-kokoro-zh-en-49.wav successfully.');
  40 +tts.free();
@@ -141,12 +141,15 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { @@ -141,12 +141,15 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
141 const voicesLen = Module.lengthBytesUTF8(config.voices) + 1; 141 const voicesLen = Module.lengthBytesUTF8(config.voices) + 1;
142 const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; 142 const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
143 const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; 143 const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
  144 + const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1;
  145 + const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
144 146
145 - const n = modelLen + voicesLen + tokensLen + dataDirLen; 147 + const n =
  148 + modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + lexiconLen;
146 149
147 const buffer = Module._malloc(n); 150 const buffer = Module._malloc(n);
148 151
149 - const len = 5 * 4; 152 + const len = 7 * 4;
150 const ptr = Module._malloc(len); 153 const ptr = Module._malloc(len);
151 154
152 let offset = 0; 155 let offset = 0;
@@ -162,6 +165,12 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { @@ -162,6 +165,12 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
162 Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); 165 Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen);
163 offset += dataDirLen; 166 offset += dataDirLen;
164 167
  168 + Module.stringToUTF8(config.dictDir || '', buffer + offset, dictDirLen);
  169 + offset += dictDirLen;
  170 +
  171 + Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen);
  172 + offset += lexiconLen;
  173 +
165 offset = 0; 174 offset = 0;
166 Module.setValue(ptr, buffer + offset, 'i8*'); 175 Module.setValue(ptr, buffer + offset, 'i8*');
167 offset += modelLen; 176 offset += modelLen;
@@ -177,6 +186,12 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { @@ -177,6 +186,12 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
177 186
178 Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float'); 187 Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');
179 188
  189 + Module.setValue(ptr + 20, buffer + offset, 'i8*');
  190 + offset += dictDirLen;
  191 +
  192 + Module.setValue(ptr + 24, buffer + offset, 'i8*');
  193 + offset += lexiconLen;
  194 +
180 return { 195 return {
181 buffer: buffer, ptr: ptr, len: len, 196 buffer: buffer, ptr: ptr, len: len,
182 } 197 }
@@ -216,6 +231,8 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -216,6 +231,8 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
216 tokens: '', 231 tokens: '',
217 lengthScale: 1.0, 232 lengthScale: 1.0,
218 dataDir: '', 233 dataDir: '',
  234 + dictDir: '',
  235 + lexicon: '',
219 }; 236 };
220 } 237 }
221 238
@@ -382,6 +399,8 @@ function createOfflineTts(Module, myConfig) { @@ -382,6 +399,8 @@ function createOfflineTts(Module, myConfig) {
382 tokens: '', 399 tokens: '',
383 dataDir: '', 400 dataDir: '',
384 lengthScale: 1.0, 401 lengthScale: 1.0,
  402 + dictDir: '',
  403 + lexicon: '',
385 }; 404 };
386 405
387 const offlineTtsModelConfig = { 406 const offlineTtsModelConfig = {
@@ -15,7 +15,7 @@ extern "C" { @@ -15,7 +15,7 @@ extern "C" {
15 15
16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); 16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); 17 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
18 -static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 5 * 4, ""); 18 +static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 7 * 4, "");
19 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == 19 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
20 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 20 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
21 sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 21 sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
@@ -56,6 +56,8 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -56,6 +56,8 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
56 fprintf(stdout, "tokens: %s\n", kokoro->tokens); 56 fprintf(stdout, "tokens: %s\n", kokoro->tokens);
57 fprintf(stdout, "data_dir: %s\n", kokoro->data_dir); 57 fprintf(stdout, "data_dir: %s\n", kokoro->data_dir);
58 fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale); 58 fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale);
  59 + fprintf(stdout, "dict_dir: %s\n", kokoro->dict_dir);
  60 + fprintf(stdout, "lexicon: %s\n", kokoro->lexicon);
59 61
60 fprintf(stdout, "----------tts model config----------\n"); 62 fprintf(stdout, "----------tts model config----------\n");
61 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); 63 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);