Committed by
GitHub
Add JavaScript (WebAssembly) API for Kokoro TTS models. (#1726)
正在显示
5 个修改的文件
包含
154 行增加
和
6 行删除
| @@ -10,7 +10,15 @@ ls -lh | @@ -10,7 +10,15 @@ ls -lh | ||
| 10 | ls -lh node_modules | 10 | ls -lh node_modules |
| 11 | 11 | ||
| 12 | # offline tts | 12 | # offline tts |
| 13 | -# | 13 | + |
| 14 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | ||
| 15 | +tar xf kokoro-en-v0_19.tar.bz2 | ||
| 16 | +rm kokoro-en-v0_19.tar.bz2 | ||
| 17 | + | ||
| 18 | +node ./test-offline-tts-kokoro-en.js | ||
| 19 | + | ||
| 20 | +ls -lh | ||
| 21 | + | ||
| 14 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 | 22 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 |
| 15 | tar xvf matcha-icefall-zh-baker.tar.bz2 | 23 | tar xvf matcha-icefall-zh-baker.tar.bz2 |
| 16 | rm matcha-icefall-zh-baker.tar.bz2 | 24 | rm matcha-icefall-zh-baker.tar.bz2 |
| @@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js | @@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js | ||
| 42 | 42 | ||
| 43 | In the following, we demonstrate how to run text-to-speech. | 43 | In the following, we demonstrate how to run text-to-speech. |
| 44 | 44 | ||
| 45 | +## ./test-offline-tts-kokoro-en.js | ||
| 46 | + | ||
| 47 | +[./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use | ||
| 48 | +[kokoro-en-v0_19](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2) | ||
| 49 | +for text-to-speech. | ||
| 50 | + | ||
| 51 | +You can use the following command to run it: | ||
| 52 | + | ||
| 53 | +```bash | ||
| 54 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 | ||
| 55 | +tar xf kokoro-en-v0_19.tar.bz2 | ||
| 56 | +rm kokoro-en-v0_19.tar.bz2 | ||
| 57 | + | ||
| 58 | +node ./test-offline-tts-kokoro-en.js | ||
| 59 | +``` | ||
| 60 | + | ||
| 45 | ## ./test-offline-tts-matcha-zh.js | 61 | ## ./test-offline-tts-matcha-zh.js |
| 46 | 62 | ||
| 47 | [./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use | 63 | [./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use |
| 1 | +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) | ||
| 2 | + | ||
| 3 | +const sherpa_onnx = require('sherpa-onnx'); | ||
| 4 | + | ||
/**
 * Create an offline TTS object backed by the kokoro-en-v0_19 model.
 *
 * All model paths are relative to the current working directory, so the
 * kokoro-en-v0_19 archive must have been extracted there beforehand.
 *
 * @returns whatever sherpa_onnx.createOfflineTts() returns for this config.
 */
function createOfflineTts() {
  // The whole configuration is written as one nested literal; the nesting
  // mirrors the key names expected by sherpa_onnx.createOfflineTts().
  return sherpa_onnx.createOfflineTts({
    offlineTtsModelConfig: {
      offlineTtsKokoroModelConfig: {
        model: './kokoro-en-v0_19/model.onnx',
        voices: './kokoro-en-v0_19/voices.bin',
        tokens: './kokoro-en-v0_19/tokens.txt',
        dataDir: './kokoro-en-v0_19/espeak-ng-data',
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  });
}
| 27 | + | ||
// Synthesize one English sentence with speaker 0 at normal speed and write
// the result to ./test-kokoro-en.wav.
const tts = createOfflineTts();
const speakerId = 0;
const speed = 1.0;
const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

try {
  const audio = tts.generate({text: text, sid: speakerId, speed: speed});
  tts.save('./test-kokoro-en.wav', audio);
  console.log('Saved to test-kokoro-en.wav successfully.');
} finally {
  // Always call tts.free(), even when generate() or save() throws.
  tts.free();
}
| @@ -8,8 +8,12 @@ function freeConfig(config, Module) { | @@ -8,8 +8,12 @@ function freeConfig(config, Module) { | ||
| 8 | freeConfig(config.config, Module) | 8 | freeConfig(config.config, Module) |
| 9 | } | 9 | } |
| 10 | 10 | ||
| 11 | - if ('config2' in config) { | ||
| 12 | - freeConfig(config.config2, Module) | 11 | + if ('matcha' in config) { |
| 12 | + freeConfig(config.matcha, Module) | ||
| 13 | + } | ||
| 14 | + | ||
| 15 | + if ('kokoro' in config) { | ||
| 16 | + freeConfig(config.kokoro, Module) | ||
| 13 | } | 17 | } |
| 14 | 18 | ||
| 15 | Module._free(config.ptr); | 19 | Module._free(config.ptr); |
| @@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) { | @@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) { | ||
| 132 | } | 136 | } |
| 133 | } | 137 | } |
| 134 | 138 | ||
/**
 * Serialize a Kokoro TTS model config into memory obtained from
 * Module._malloc().
 *
 * Two allocations are made:
 *  - `buffer`: the four strings (model, voices, tokens, dataDir) stored back
 *    to back, each written with Module.stringToUTF8() into a slot of
 *    lengthBytesUTF8(s) + 1 bytes.
 *  - `ptr`: a 5 * 4 byte record holding, at byte offsets 0/4/8/12, the
 *    address of each string inside `buffer` (written as 'i8*') and, at
 *    offset 16, the length scale (written as 'float').
 *
 * Any string field that is missing or falsy is treated as ''. A missing or
 * falsy lengthScale is treated as 1.0.
 *
 * @param config Object with optional fields model, voices, tokens, dataDir
 *     (strings) and lengthScale (number).
 * @param Module Object providing lengthBytesUTF8, stringToUTF8, setValue and
 *     _malloc.
 * @returns {{buffer: number, ptr: number, len: number}} The two allocations
 *     and the size in bytes of the record at `ptr`. Nothing is freed here.
 */
function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
  // Normalize every string once so that the size computation and the copy
  // below always see the same value, including when a field is omitted.
  const strings = [
    config.model || '',
    config.voices || '',
    config.tokens || '',
    config.dataDir || '',
  ];

  // +1 leaves room for the terminating NUL of each string.
  const sizes = strings.map((s) => Module.lengthBytesUTF8(s) + 1);

  let n = 0;
  for (const size of sizes) {
    n += size;
  }

  const buffer = Module._malloc(n);

  // 4 pointers + 1 float, 4 bytes each.
  const len = 5 * 4;
  const ptr = Module._malloc(len);

  let offset = 0;
  for (let i = 0; i < strings.length; ++i) {
    Module.stringToUTF8(strings[i], buffer + offset, sizes[i]);
    Module.setValue(ptr + 4 * i, buffer + offset, 'i8*');
    offset += sizes[i];
  }

  Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
| 184 | + | ||
| 135 | function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | 185 | function initSherpaOnnxOfflineTtsModelConfig(config, Module) { |
| 136 | if (!('offlineTtsVitsModelConfig' in config)) { | 186 | if (!('offlineTtsVitsModelConfig' in config)) { |
| 137 | config.offlineTtsVitsModelConfig = { | 187 | config.offlineTtsVitsModelConfig = { |
| @@ -159,6 +209,16 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -159,6 +209,16 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 159 | }; | 209 | }; |
| 160 | } | 210 | } |
| 161 | 211 | ||
| 212 | + if (!('offlineTtsKokoroModelConfig' in config)) { | ||
| 213 | + config.offlineTtsKokoroModelConfig = { | ||
| 214 | + model: '', | ||
| 215 | + voices: '', | ||
| 216 | + tokens: '', | ||
| 217 | + lengthScale: 1.0, | ||
| 218 | + dataDir: '', | ||
| 219 | + }; | ||
| 220 | + } | ||
| 221 | + | ||
| 162 | 222 | ||
| 163 | const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( | 223 | const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( |
| 164 | config.offlineTtsVitsModelConfig, Module); | 224 | config.offlineTtsVitsModelConfig, Module); |
| @@ -166,7 +226,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -166,7 +226,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 166 | const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig( | 226 | const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig( |
| 167 | config.offlineTtsMatchaModelConfig, Module); | 227 | config.offlineTtsMatchaModelConfig, Module); |
| 168 | 228 | ||
| 169 | - const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4; | 229 | + const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig( |
| 230 | + config.offlineTtsKokoroModelConfig, Module); | ||
| 231 | + | ||
| 232 | + const len = vitsModelConfig.len + matchaModelConfig.len + | ||
| 233 | + kokoroModelConfig.len + 3 * 4; | ||
| 234 | + | ||
| 170 | const ptr = Module._malloc(len); | 235 | const ptr = Module._malloc(len); |
| 171 | 236 | ||
| 172 | let offset = 0; | 237 | let offset = 0; |
| @@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 188 | Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset); | 253 | Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset); |
| 189 | offset += matchaModelConfig.len; | 254 | offset += matchaModelConfig.len; |
| 190 | 255 | ||
| 256 | + Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset); | ||
| 257 | + offset += kokoroModelConfig.len; | ||
| 258 | + | ||
| 191 | return { | 259 | return { |
| 192 | buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, | 260 | buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, |
| 193 | - config2: matchaModelConfig | 261 | + matcha: matchaModelConfig, kokoro: kokoroModelConfig, |
| 194 | } | 262 | } |
| 195 | } | 263 | } |
| 196 | 264 | ||
| @@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) { | @@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) { | ||
| 308 | lengthScale: 1.0, | 376 | lengthScale: 1.0, |
| 309 | }; | 377 | }; |
| 310 | 378 | ||
| 379 | + const offlineTtsKokoroModelConfig = { | ||
| 380 | + model: '', | ||
| 381 | + voices: '', | ||
| 382 | + tokens: '', | ||
| 383 | + dataDir: '', | ||
| 384 | + lengthScale: 1.0, | ||
| 385 | + }; | ||
| 386 | + | ||
| 311 | const offlineTtsModelConfig = { | 387 | const offlineTtsModelConfig = { |
| 312 | offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, | 388 | offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, |
| 313 | offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, | 389 | offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, |
| 390 | + offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig, | ||
| 314 | numThreads: 1, | 391 | numThreads: 1, |
| 315 | debug: 1, | 392 | debug: 1, |
| 316 | provider: 'cpu', | 393 | provider: 'cpu', |
| @@ -15,9 +15,11 @@ extern "C" { | @@ -15,9 +15,11 @@ extern "C" { | ||
| 15 | 15 | ||
| 16 | static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); | 16 | static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); |
| 17 | static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); | 17 | static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); |
| 18 | +static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 5 * 4, ""); | ||
| 18 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == | 19 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == |
| 19 | sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + | 20 | sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + |
| 20 | - sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4, | 21 | + sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + |
| 22 | + sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, | ||
| 21 | ""); | 23 | ""); |
| 22 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == | 24 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == |
| 23 | sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, | 25 | sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, |
| @@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 27 | auto tts_model_config = &tts_config->model; | 29 | auto tts_model_config = &tts_config->model; |
| 28 | auto vits_model_config = &tts_model_config->vits; | 30 | auto vits_model_config = &tts_model_config->vits; |
| 29 | auto matcha_model_config = &tts_model_config->matcha; | 31 | auto matcha_model_config = &tts_model_config->matcha; |
| 32 | + auto kokoro = &tts_model_config->kokoro; | ||
| 30 | fprintf(stdout, "----------vits model config----------\n"); | 33 | fprintf(stdout, "----------vits model config----------\n"); |
| 31 | fprintf(stdout, "model: %s\n", vits_model_config->model); | 34 | fprintf(stdout, "model: %s\n", vits_model_config->model); |
| 32 | fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); | 35 | fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); |
| @@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 47 | fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale); | 50 | fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale); |
| 48 | fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir); | 51 | fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir); |
| 49 | 52 | ||
| 53 | + fprintf(stdout, "----------kokoro model config----------\n"); | ||
| 54 | + fprintf(stdout, "model: %s\n", kokoro->model); | ||
| 55 | + fprintf(stdout, "voices: %s\n", kokoro->voices); | ||
| 56 | + fprintf(stdout, "tokens: %s\n", kokoro->tokens); | ||
| 57 | + fprintf(stdout, "data_dir: %s\n", kokoro->data_dir); | ||
| 58 | + fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale); | ||
| 59 | + | ||
| 50 | fprintf(stdout, "----------tts model config----------\n"); | 60 | fprintf(stdout, "----------tts model config----------\n"); |
| 51 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); | 61 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); |
| 52 | fprintf(stdout, "debug: %d\n", tts_model_config->debug); | 62 | fprintf(stdout, "debug: %d\n", tts_model_config->debug); |
-
请 注册 或 登录 后发表评论