Committed by GitHub
Add JavaScript API (node-addon) for KittenTTS (#2470)
正在显示 7 个修改的文件，包含 104 行增加和 3 行删除
| @@ -10,6 +10,14 @@ arch=$(node -p "require('os').arch()") | @@ -10,6 +10,14 @@ arch=$(node -p "require('os').arch()") | ||
| 10 | platform=$(node -p "require('os').platform()") | 10 | platform=$(node -p "require('os').platform()") |
| 11 | node_version=$(node -p "process.versions.node.split('.')[0]") | 11 | node_version=$(node -p "process.versions.node.split('.')[0]") |
| 12 | 12 | ||
| 13 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 14 | +tar xf kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 15 | +rm kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 16 | + | ||
| 17 | +node ./test_tts_non_streaming_kitten_en.js | ||
| 18 | + | ||
| 19 | +rm -rf kitten-nano-en-v0_1-fp16 | ||
| 20 | + | ||
| 13 | echo "----------non-streaming ASR NeMo Canary----------" | 21 | echo "----------non-streaming ASR NeMo Canary----------" |
| 14 | 22 | ||
| 15 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 | 23 | curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 |
| @@ -37,9 +37,9 @@ jobs: | @@ -37,9 +37,9 @@ jobs: | ||
| 37 | strategy: | 37 | strategy: |
| 38 | fail-fast: false | 38 | fail-fast: false |
| 39 | matrix: | 39 | matrix: |
| 40 | - os: [macos-latest, ubuntu-latest, ubuntu-latest] | 40 | + os: [macos-latest, ubuntu-latest] |
| 41 | node-version: ["16", "22"] | 41 | node-version: ["16", "22"] |
| 42 | - python-version: ["3.8"] | 42 | + python-version: ["3.10"] |
| 43 | 43 | ||
| 44 | steps: | 44 | steps: |
| 45 | - uses: actions/checkout@v4 | 45 | - uses: actions/checkout@v4 |
| @@ -36,7 +36,8 @@ export { OnlineStream, | @@ -36,7 +36,8 @@ export { OnlineStream, | ||
| 36 | OnlineRecognizer, | 36 | OnlineRecognizer, |
| 37 | } from './src/main/ets/components/StreamingAsr'; | 37 | } from './src/main/ets/components/StreamingAsr'; |
| 38 | 38 | ||
| 39 | -export { OfflineTtsKokoroModelConfig, | 39 | +export { OfflineTtsKittenModelConfig, |
| 40 | + OfflineTtsKokoroModelConfig, | ||
| 40 | OfflineTtsMatchaModelConfig, | 41 | OfflineTtsMatchaModelConfig, |
| 41 | OfflineTtsVitsModelConfig, | 42 | OfflineTtsVitsModelConfig, |
| 42 | OfflineTtsModelConfig, | 43 | OfflineTtsModelConfig, |
| @@ -75,6 +75,25 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig( | @@ -75,6 +75,25 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig( | ||
| 75 | return c; | 75 | return c; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | +static SherpaOnnxOfflineTtsKittenModelConfig GetOfflineTtsKittenModelConfig( | ||
| 79 | + Napi::Object obj) { | ||
| 80 | + SherpaOnnxOfflineTtsKittenModelConfig c; | ||
| 81 | + memset(&c, 0, sizeof(c)); | ||
| 82 | + | ||
| 83 | + if (!obj.Has("kitten") || !obj.Get("kitten").IsObject()) { | ||
| 84 | + return c; | ||
| 85 | + } | ||
| 86 | + | ||
| 87 | + Napi::Object o = obj.Get("kitten").As<Napi::Object>(); | ||
| 88 | + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model); | ||
| 89 | + SHERPA_ONNX_ASSIGN_ATTR_STR(voices, voices); | ||
| 90 | + SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); | ||
| 91 | + SHERPA_ONNX_ASSIGN_ATTR_STR(data_dir, dataDir); | ||
| 92 | + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale); | ||
| 93 | + | ||
| 94 | + return c; | ||
| 95 | +} | ||
| 96 | + | ||
| 78 | static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( | 97 | static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( |
| 79 | Napi::Object obj) { | 98 | Napi::Object obj) { |
| 80 | SherpaOnnxOfflineTtsModelConfig c; | 99 | SherpaOnnxOfflineTtsModelConfig c; |
| @@ -89,6 +108,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( | @@ -89,6 +108,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( | ||
| 89 | c.vits = GetOfflineTtsVitsModelConfig(o); | 108 | c.vits = GetOfflineTtsVitsModelConfig(o); |
| 90 | c.matcha = GetOfflineTtsMatchaModelConfig(o); | 109 | c.matcha = GetOfflineTtsMatchaModelConfig(o); |
| 91 | c.kokoro = GetOfflineTtsKokoroModelConfig(o); | 110 | c.kokoro = GetOfflineTtsKokoroModelConfig(o); |
| 111 | + c.kitten = GetOfflineTtsKittenModelConfig(o); | ||
| 92 | 112 | ||
| 93 | SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); | 113 | SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); |
| 94 | 114 | ||
| @@ -172,6 +192,11 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( | @@ -172,6 +192,11 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( | ||
| 172 | SHERPA_ONNX_DELETE_C_STR(c.model.matcha.data_dir); | 192 | SHERPA_ONNX_DELETE_C_STR(c.model.matcha.data_dir); |
| 173 | SHERPA_ONNX_DELETE_C_STR(c.model.matcha.dict_dir); | 193 | SHERPA_ONNX_DELETE_C_STR(c.model.matcha.dict_dir); |
| 174 | 194 | ||
| 195 | + SHERPA_ONNX_DELETE_C_STR(c.model.kitten.model); | ||
| 196 | + SHERPA_ONNX_DELETE_C_STR(c.model.kitten.voices); | ||
| 197 | + SHERPA_ONNX_DELETE_C_STR(c.model.kitten.tokens); | ||
| 198 | + SHERPA_ONNX_DELETE_C_STR(c.model.kitten.data_dir); | ||
| 199 | + | ||
| 175 | SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.model); | 200 | SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.model); |
| 176 | SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.voices); | 201 | SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.voices); |
| 177 | SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.tokens); | 202 | SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.tokens); |
| @@ -39,10 +39,19 @@ export class OfflineTtsKokoroModelConfig { | @@ -39,10 +39,19 @@ export class OfflineTtsKokoroModelConfig { | ||
| 39 | public lang: string = ''; | 39 | public lang: string = ''; |
| 40 | } | 40 | } |
| 41 | 41 | ||
| 42 | +export class OfflineTtsKittenModelConfig { | ||
| 43 | + public model: string = ''; | ||
| 44 | + public voices: string = ''; | ||
| 45 | + public tokens: string = ''; | ||
| 46 | + public dataDir: string = ''; | ||
| 47 | + public lengthScale: number = 1.0; | ||
| 48 | +} | ||
| 49 | + | ||
| 42 | export class OfflineTtsModelConfig { | 50 | export class OfflineTtsModelConfig { |
| 43 | public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig(); | 51 | public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig(); |
| 44 | public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig(); | 52 | public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig(); |
| 45 | public kokoro: OfflineTtsKokoroModelConfig = new OfflineTtsKokoroModelConfig(); | 53 | public kokoro: OfflineTtsKokoroModelConfig = new OfflineTtsKokoroModelConfig(); |
| 54 | + public kitten: OfflineTtsKittenModelConfig = new OfflineTtsKittenModelConfig(); | ||
| 46 | public numThreads: number = 1; | 55 | public numThreads: number = 1; |
| 47 | public debug: boolean = false; | 56 | public debug: boolean = false; |
| 48 | public provider: string = 'cpu'; | 57 | public provider: string = 'cpu'; |
| @@ -147,6 +147,7 @@ The following tables list the examples in this folder. | @@ -147,6 +147,7 @@ The following tables list the examples in this folder. | ||
| 147 | 147 | ||
| 148 | |File| Description| | 148 | |File| Description| |
| 149 | |---|---| | 149 | |---|---| |
| 150 | +|[./test_tts_non_streaming_kitten_en.js](./test_tts_non_streaming_kitten_en.js)| Text-to-speech with a KittenTTS English Model| | ||
| 150 | |[./test_tts_non_streaming_kokoro_en.js](./test_tts_non_streaming_kokoro_en.js)| Text-to-speech with a Kokoro English Model| | 151 | |[./test_tts_non_streaming_kokoro_en.js](./test_tts_non_streaming_kokoro_en.js)| Text-to-speech with a Kokoro English Model| |
| 151 | |[./test_tts_non_streaming_kokoro_zh_en.js](./test_tts_non_streaming_kokoro_zh_en.js)| Text-to-speech with a Kokoro Model supporting Chinese and English| | 152 | |[./test_tts_non_streaming_kokoro_zh_en.js](./test_tts_non_streaming_kokoro_zh_en.js)| Text-to-speech with a Kokoro Model supporting Chinese and English| |
| 152 | |[./test_tts_non_streaming_matcha_icefall_en.js](./test_tts_non_streaming_matcha_icefall_en.js)| Text-to-speech with a [MatchaTTS English Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)| | 153 | |[./test_tts_non_streaming_matcha_icefall_en.js](./test_tts_non_streaming_matcha_icefall_en.js)| Text-to-speech with a [MatchaTTS English Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)| |
| @@ -458,6 +459,16 @@ npm install naudiodon2 | @@ -458,6 +459,16 @@ npm install naudiodon2 | ||
| 458 | node ./test_vad_asr_non_streaming_sense_voice_microphone.js | 459 | node ./test_vad_asr_non_streaming_sense_voice_microphone.js |
| 459 | ``` | 460 | ``` |
| 460 | 461 | ||
| 462 | +### Text-to-speech with KittenTTS models (English TTS) | ||
| 463 | + | ||
| 464 | +```bash | ||
| 465 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 466 | +tar xf kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 467 | +rm kitten-nano-en-v0_1-fp16.tar.bz2 | ||
| 468 | + | ||
| 469 | +node ./test_tts_non_streaming_kitten_en.js | ||
| 470 | +``` | ||
| 471 | + | ||
| 461 | ### Text-to-speech with Kokoro TTS models (Chinese + English TTS) | 472 | ### Text-to-speech with Kokoro TTS models (Chinese + English TTS) |
| 462 | 473 | ||
| 463 | ```bash | 474 | ```bash |
| 1 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 2 | +const sherpa_onnx = require('sherpa-onnx-node'); | ||
| 3 | + | ||
| 4 | +// please refer to | ||
| 5 | +// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html | ||
| 6 | +// to download model files | ||
| 7 | +function createOfflineTts() { | ||
| 8 | + const config = { | ||
| 9 | + model: { | ||
| 10 | + kitten: { | ||
| 11 | + model: './kitten-nano-en-v0_1-fp16/model.fp16.onnx', | ||
| 12 | + voices: './kitten-nano-en-v0_1-fp16/voices.bin', | ||
| 13 | + tokens: './kitten-nano-en-v0_1-fp16/tokens.txt', | ||
| 14 | + dataDir: './kitten-nano-en-v0_1-fp16/espeak-ng-data', | ||
| 15 | + }, | ||
| 16 | + debug: true, | ||
| 17 | + numThreads: 1, | ||
| 18 | + provider: 'cpu', | ||
| 19 | + }, | ||
| 20 | + maxNumSentences: 1, | ||
| 21 | + }; | ||
| 22 | + return new sherpa_onnx.OfflineTts(config); | ||
| 23 | +} | ||
| 24 | + | ||
| 25 | +const tts = createOfflineTts(); | ||
| 26 | + | ||
| 27 | +const text = | ||
| 28 | + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.' | ||
| 29 | + | ||
| 30 | + | ||
| 31 | +let start = Date.now(); | ||
| 32 | +const audio = tts.generate({text: text, sid: 6, speed: 1.0}); | ||
| 33 | +let stop = Date.now(); | ||
| 34 | +const elapsed_seconds = (stop - start) / 1000; | ||
| 35 | +const duration = audio.samples.length / audio.sampleRate; | ||
| 36 | +const real_time_factor = elapsed_seconds / duration; | ||
| 37 | +console.log('Wave duration', duration.toFixed(3), 'seconds') | ||
| 38 | +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') | ||
| 39 | +console.log( | ||
| 40 | + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, | ||
| 41 | + real_time_factor.toFixed(3)) | ||
| 42 | + | ||
| 43 | +const filename = 'test-kitten-en-6.wav'; | ||
| 44 | +sherpa_onnx.writeWave( | ||
| 45 | + filename, {samples: audio.samples, sampleRate: audio.sampleRate}); | ||
| 46 | + | ||
| 47 | +console.log(`Saved to ${filename}`); |
-
请 注册 或 登录 后发表评论