Committed by
GitHub
Add JavaScript API (WebAssembly) for KittenTTS (#2471)
正在显示
5 个修改的文件
包含
151 行增加
和
4 行删除
| @@ -9,6 +9,14 @@ git status | @@ -9,6 +9,14 @@ git status | ||
| 9 | ls -lh | 9 | ls -lh |
| 10 | ls -lh node_modules | 10 | ls -lh node_modules |
| 11 | 11 | ||
| 12 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 13 | +tar xf kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 14 | +rm kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 15 | + | ||
| 16 | +node ./test-offline-tts-kitten-en.js | ||
| 17 | +ls -lh *.wav | ||
| 18 | +rm -rf kitten-nano-en-v0_1-fp16 | ||
| 19 | + | ||
| 12 | # online asr | 20 | # online asr |
| 13 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 | 21 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 |
| 14 | tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 | 22 | tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 |
| @@ -85,9 +93,8 @@ rm gtcrn_simple.onnx | @@ -85,9 +93,8 @@ rm gtcrn_simple.onnx | ||
| 85 | rm inp_16k.wav | 93 | rm inp_16k.wav |
| 86 | rm enhanced-16k.wav | 94 | rm enhanced-16k.wav |
| 87 | 95 | ||
| 88 | - | ||
| 89 | # offline tts | 96 | # offline tts |
| 90 | -# | 97 | + |
| 91 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 | 98 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 |
| 92 | tar xf kokoro-multi-lang-v1_0.tar.bz2 | 99 | tar xf kokoro-multi-lang-v1_0.tar.bz2 |
| 93 | rm kokoro-multi-lang-v1_0.tar.bz2 | 100 | rm kokoro-multi-lang-v1_0.tar.bz2 |
| @@ -54,6 +54,22 @@ node ./test-offline-speaker-diarization.js | @@ -54,6 +54,22 @@ node ./test-offline-speaker-diarization.js | ||
| 54 | 54 | ||
| 55 | In the following, we demonstrate how to run text-to-speech. | 55 | In the following, we demonstrate how to run text-to-speech. |
| 56 | 56 | ||
| 57 | +## ./test-offline-tts-kitten-en.js | ||
| 58 | + | ||
| 59 | +[./test-offline-tts-kitten-en.js](./test-offline-tts-kitten-en.js) shows how to use | ||
| 60 | +[kitten-nano-en-v0_1-fp16](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2) | ||
| 61 | +for text-to-speech. | ||
| 62 | + | ||
| 63 | +You can use the following command to run it: | ||
| 64 | + | ||
| 65 | +```bash | ||
| 66 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 67 | +tar xf kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 68 | +rm kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 69 | + | ||
| 70 | +node ./test-offline-tts-kitten-en.js | ||
| 71 | +``` | ||
| 72 | + | ||
| 57 | ## ./test-offline-tts-kokoro-en.js | 73 | ## ./test-offline-tts-kokoro-en.js |
| 58 | 74 | ||
| 59 | [./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use | 75 | [./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use |
| 1 | +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) | ||
| 2 | + | ||
| 3 | +const sherpa_onnx = require('sherpa-onnx'); | ||
| 4 | + | ||
/**
 * Create an offline text-to-speech engine configured for the
 * kitten-nano-en-v0_1-fp16 model.
 *
 * All model assets are referenced by relative paths under
 * ./kitten-nano-en-v0_1-fp16/.
 *
 * @returns {object} Whatever sherpa_onnx.createOfflineTts() returns for the
 *     assembled configuration.
 */
function createOfflineTts() {
  // Kitten-specific settings: model, voices, tokens, and the espeak-ng data
  // directory shipped inside the model archive.
  const kitten = {
    model: './kitten-nano-en-v0_1-fp16/model.fp16.onnx',
    voices: './kitten-nano-en-v0_1-fp16/voices.bin',
    tokens: './kitten-nano-en-v0_1-fp16/tokens.txt',
    dataDir: './kitten-nano-en-v0_1-fp16/espeak-ng-data',
    lengthScale: 1.0,
  };

  // Top-level TTS config wrapping the model config shared by all model types.
  const ttsConfig = {
    offlineTtsModelConfig: {
      offlineTtsKittenModelConfig: kitten,
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  };

  return sherpa_onnx.createOfflineTts(ttsConfig);
}
| 27 | + | ||
// Synthesize a fixed English sentence with speaker 0 at normal speed and
// write the result to ./test-kitten-en.wav.
const tts = createOfflineTts();
const speakerId = 0;
const speed = 1.0;
const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

try {
  const audio = tts.generate({text: text, sid: speakerId, speed: speed});
  tts.save('./test-kitten-en.wav', audio);
  console.log('Saved to test-kitten-en.wav successfully.');
} finally {
  // Always call free(), even when generate() or save() throws.
  // NOTE(review): presumably this releases the engine's WebAssembly-side
  // resources -- confirm against the sherpa-onnx package.
  tts.free();
}
| @@ -16,6 +16,10 @@ function freeConfig(config, Module) { | @@ -16,6 +16,10 @@ function freeConfig(config, Module) { | ||
| 16 | freeConfig(config.kokoro, Module) | 16 | freeConfig(config.kokoro, Module) |
| 17 | } | 17 | } |
| 18 | 18 | ||
| 19 | + if ('kitten' in config) { | ||
| 20 | + freeConfig(config.kitten, Module) | ||
| 21 | + } | ||
| 22 | + | ||
| 19 | Module._free(config.ptr); | 23 | Module._free(config.ptr); |
| 20 | } | 24 | } |
| 21 | 25 | ||
| @@ -204,6 +208,52 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { | @@ -204,6 +208,52 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { | ||
| 204 | } | 208 | } |
| 205 | } | 209 | } |
| 206 | 210 | ||
/**
 * Serialize a KittenTTS model config into memory obtained from
 * Module._malloc.
 *
 * Two allocations are made:
 *   - `buffer`: the four NUL-terminated UTF-8 strings (model, voices, tokens,
 *     dataDir) stored back to back;
 *   - `ptr`: a 5 * 4 byte record holding four pointers into `buffer` at
 *     offsets 0, 4, 8, 12 followed by the float lengthScale at offset 16.
 *
 * Missing or empty string fields are stored as empty strings; a missing or
 * zero lengthScale is stored as 1.0.
 *
 * @param {object} config Object with optional `model`, `voices`, `tokens`,
 *     `dataDir` (strings) and `lengthScale` (number).
 * @param {object} Module Object providing `lengthBytesUTF8`, `stringToUTF8`,
 *     `setValue` and `_malloc`.
 * @returns {{buffer: number, ptr: number, len: number}} Addresses of the two
 *     allocations and the byte length of the record at `ptr`.
 *     NOTE(review): the caller is presumably responsible for freeing both
 *     allocations -- confirm against freeConfig().
 */
function initSherpaOnnxOfflineTtsKittenModelConfig(config, Module) {
  // Normalize every string exactly once so the size computation and the write
  // below can never disagree (an undefined field becomes '').
  const strings = [
    config.model || '',
    config.voices || '',
    config.tokens || '',
    config.dataDir || '',
  ];

  // +1 for the terminating NUL of each string.
  const sizes = [];
  let n = 0;
  for (const s of strings) {
    const size = Module.lengthBytesUTF8(s) + 1;
    sizes.push(size);
    n += size;
  }

  const buffer = Module._malloc(n);

  const len = 5 * 4;
  const ptr = Module._malloc(len);

  // Write each string and record its address in the matching pointer slot.
  let offset = 0;
  for (let i = 0; i < strings.length; ++i) {
    Module.stringToUTF8(strings[i], buffer + offset, sizes[i]);
    Module.setValue(ptr + 4 * i, buffer + offset, 'i8*');
    offset += sizes[i];
  }

  Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer, ptr: ptr, len: len,
  };
}
| 256 | + | ||
| 207 | function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | 257 | function initSherpaOnnxOfflineTtsModelConfig(config, Module) { |
| 208 | if (!('offlineTtsVitsModelConfig' in config)) { | 258 | if (!('offlineTtsVitsModelConfig' in config)) { |
| 209 | config.offlineTtsVitsModelConfig = { | 259 | config.offlineTtsVitsModelConfig = { |
| @@ -244,6 +294,15 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -244,6 +294,15 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 244 | }; | 294 | }; |
| 245 | } | 295 | } |
| 246 | 296 | ||
| 297 | + if (!('offlineTtsKittenModelConfig' in config)) { | ||
| 298 | + config.offlineTtsKittenModelConfig = { | ||
| 299 | + model: '', | ||
| 300 | + voices: '', | ||
| 301 | + tokens: '', | ||
| 302 | + lengthScale: 1.0, | ||
| 303 | + }; | ||
| 304 | + } | ||
| 305 | + | ||
| 247 | 306 | ||
| 248 | const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( | 307 | const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( |
| 249 | config.offlineTtsVitsModelConfig, Module); | 308 | config.offlineTtsVitsModelConfig, Module); |
| @@ -254,8 +313,11 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -254,8 +313,11 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 254 | const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig( | 313 | const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig( |
| 255 | config.offlineTtsKokoroModelConfig, Module); | 314 | config.offlineTtsKokoroModelConfig, Module); |
| 256 | 315 | ||
| 316 | + const kittenModelConfig = initSherpaOnnxOfflineTtsKittenModelConfig( | ||
| 317 | + config.offlineTtsKittenModelConfig, Module); | ||
| 318 | + | ||
| 257 | const len = vitsModelConfig.len + matchaModelConfig.len + | 319 | const len = vitsModelConfig.len + matchaModelConfig.len + |
| 258 | - kokoroModelConfig.len + 3 * 4; | 320 | + kokoroModelConfig.len + kittenModelConfig.len + 3 * 4; |
| 259 | 321 | ||
| 260 | const ptr = Module._malloc(len); | 322 | const ptr = Module._malloc(len); |
| 261 | 323 | ||
| @@ -281,9 +343,13 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -281,9 +343,13 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 281 | Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset); | 343 | Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset); |
| 282 | offset += kokoroModelConfig.len; | 344 | offset += kokoroModelConfig.len; |
| 283 | 345 | ||
| 346 | + Module._CopyHeap(kittenModelConfig.ptr, kittenModelConfig.len, ptr + offset); | ||
| 347 | + offset += kittenModelConfig.len; | ||
| 348 | + | ||
| 284 | return { | 349 | return { |
| 285 | buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, | 350 | buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, |
| 286 | matcha: matchaModelConfig, kokoro: kokoroModelConfig, | 351 | matcha: matchaModelConfig, kokoro: kokoroModelConfig, |
| 352 | + kitten: kittenModelConfig, | ||
| 287 | } | 353 | } |
| 288 | } | 354 | } |
| 289 | 355 | ||
| @@ -413,12 +479,22 @@ function createOfflineTts(Module, myConfig) { | @@ -413,12 +479,22 @@ function createOfflineTts(Module, myConfig) { | ||
| 413 | lengthScale: 1.0, | 479 | lengthScale: 1.0, |
| 414 | dictDir: '', | 480 | dictDir: '', |
| 415 | lexicon: '', | 481 | lexicon: '', |
| 482 | + lang: '', | ||
| 483 | + }; | ||
| 484 | + | ||
| 485 | + const offlineTtsKittenModelConfig = { | ||
| 486 | + model: '', | ||
| 487 | + voices: '', | ||
| 488 | + tokens: '', | ||
| 489 | + dataDir: '', | ||
| 490 | + lengthScale: 1.0, | ||
| 416 | }; | 491 | }; |
| 417 | 492 | ||
| 418 | const offlineTtsModelConfig = { | 493 | const offlineTtsModelConfig = { |
| 419 | offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, | 494 | offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, |
| 420 | offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, | 495 | offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, |
| 421 | offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig, | 496 | offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig, |
| 497 | + offlineTtsKittenModelConfig: offlineTtsKittenModelConfig, | ||
| 422 | numThreads: 1, | 498 | numThreads: 1, |
| 423 | debug: 1, | 499 | debug: 1, |
| 424 | provider: 'cpu', | 500 | provider: 'cpu', |
| @@ -16,10 +16,12 @@ extern "C" { | @@ -16,10 +16,12 @@ extern "C" { | ||
| 16 | static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); | 16 | static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); |
| 17 | static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); | 17 | static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); |
| 18 | static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, ""); | 18 | static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, ""); |
| 19 | +static_assert(sizeof(SherpaOnnxOfflineTtsKittenModelConfig) == 5 * 4, ""); | ||
| 19 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == | 20 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == |
| 20 | sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + | 21 | sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + |
| 21 | sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + | 22 | sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + |
| 22 | - sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, | 23 | + sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4 + |
| 24 | + sizeof(SherpaOnnxOfflineTtsKittenModelConfig), | ||
| 23 | ""); | 25 | ""); |
| 24 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == | 26 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == |
| 25 | sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4, | 27 | sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4, |
| @@ -30,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -30,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 30 | auto vits_model_config = &tts_model_config->vits; | 32 | auto vits_model_config = &tts_model_config->vits; |
| 31 | auto matcha_model_config = &tts_model_config->matcha; | 33 | auto matcha_model_config = &tts_model_config->matcha; |
| 32 | auto kokoro = &tts_model_config->kokoro; | 34 | auto kokoro = &tts_model_config->kokoro; |
| 35 | + auto kitten = &tts_model_config->kitten; | ||
| 33 | fprintf(stdout, "----------vits model config----------\n"); | 36 | fprintf(stdout, "----------vits model config----------\n"); |
| 34 | fprintf(stdout, "model: %s\n", vits_model_config->model); | 37 | fprintf(stdout, "model: %s\n", vits_model_config->model); |
| 35 | fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); | 38 | fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); |
| @@ -58,6 +61,14 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -58,6 +61,14 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 58 | fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale); | 61 | fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale); |
| 59 | fprintf(stdout, "dict_dir: %s\n", kokoro->dict_dir); | 62 | fprintf(stdout, "dict_dir: %s\n", kokoro->dict_dir); |
| 60 | fprintf(stdout, "lexicon: %s\n", kokoro->lexicon); | 63 | fprintf(stdout, "lexicon: %s\n", kokoro->lexicon); |
| 64 | + fprintf(stdout, "lang: %s\n", kokoro->lang); | ||
| 65 | + | ||
| 66 | + fprintf(stdout, "----------kitten model config----------\n"); | ||
| 67 | + fprintf(stdout, "model: %s\n", kitten->model); | ||
| 68 | + fprintf(stdout, "voices: %s\n", kitten->voices); | ||
| 69 | + fprintf(stdout, "tokens: %s\n", kitten->tokens); | ||
| 70 | + fprintf(stdout, "data_dir: %s\n", kitten->data_dir); | ||
| 71 | + fprintf(stdout, "length scale: %.3f\n", kitten->length_scale); | ||
| 61 | 72 | ||
| 62 | fprintf(stdout, "----------tts model config----------\n"); | 73 | fprintf(stdout, "----------tts model config----------\n"); |
| 63 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); | 74 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); |
-
请 注册 或 登录 后发表评论