Add JavaScript API (node-addon-api) for MatchaTTS models. (#1677)

Committed by GitHub

Showing 6 changed files with 194 additions and 4 deletions.
@@ -85,6 +85,25 @@ fi
 
 echo "----------tts----------"
 
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_en.js
+rm hifigan_v2.onnx
+rm -rf matcha-icefall-en_US-ljspeech
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_zh.js
+rm hifigan_v2.onnx
+rm -rf matcha-icefall-zh-baker
+ls -lh *.wav
+
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
 tar xf vits-piper-en_GB-cori-medium.tar.bz2
 rm vits-piper-en_GB-cori-medium.tar.bz2
@@ -31,6 +31,28 @@ static SherpaOnnxOfflineTtsVitsModelConfig GetOfflineTtsVitsModelConfig(
   return c;
 }
 
+static SherpaOnnxOfflineTtsMatchaModelConfig GetOfflineTtsMatchaModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineTtsMatchaModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("matcha") || !obj.Get("matcha").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("matcha").As<Napi::Object>();
+  SHERPA_ONNX_ASSIGN_ATTR_STR(acoustic_model, acousticModel);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(vocoder, vocoder);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(data_dir, dataDir);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(noise_scale, noiseScale);
+  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
+
+  return c;
+}
+
 static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
     Napi::Object obj) {
   SherpaOnnxOfflineTtsModelConfig c;
@@ -43,6 +65,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
   Napi::Object o = obj.Get("model").As<Napi::Object>();
 
   c.vits = GetOfflineTtsVitsModelConfig(o);
+  c.matcha = GetOfflineTtsMatchaModelConfig(o);
 
   SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
 
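A note on the `SHERPA_ONNX_ASSIGN_ATTR_STR` / `SHERPA_ONNX_ASSIGN_ATTR_FLOAT` macros used above: they are defined elsewhere in the addon sources and are not part of this diff. The following is only a minimal sketch of what the string variant presumably does — the helper name and exact details are assumptions for illustration, not code from this commit:

```c++
#include <algorithm>
#include <string>

#include "napi.h"  // from node-addon-api

// Hypothetical behaviour of SHERPA_ONNX_ASSIGN_ATTR_STR(acoustic_model, acousticModel):
// copy the JavaScript string property "acousticModel" of `o` into a freshly
// allocated, NUL-terminated char array stored in the C struct field.
static void AssignStringAttribute(Napi::Object o, const char *js_name,
                                  const char **field) {
  if (o.Has(js_name) && o.Get(js_name).IsString()) {
    std::string s = o.Get(js_name).As<Napi::String>().Utf8Value();
    char *p = new char[s.size() + 1];
    std::copy(s.begin(), s.end(), p);
    p[s.size()] = '\0';
    *field = p;  // released later with delete[] in CreateOfflineTtsWrapper()
  }
}
```

Under that assumption, every string field filled in `GetOfflineTtsMatchaModelConfig` owns a heap copy, which is why the matching `delete[]` calls are added to `CreateOfflineTtsWrapper` further down.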
@@ -107,9 +130,10 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
                   decltype(&OH_ResourceManager_ReleaseNativeResourceManager)>
       mgr(OH_ResourceManager_InitNativeResourceManager(env, info[1]),
           &OH_ResourceManager_ReleaseNativeResourceManager);
-  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get());
+  const SherpaOnnxOfflineTts *tts =
+      SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get());
 #else
-  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
+  const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
 #endif
 
   if (c.model.vits.model) {
@@ -132,6 +156,30 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
     delete[] c.model.vits.dict_dir;
   }
 
+  if (c.model.matcha.acoustic_model) {
+    delete[] c.model.matcha.acoustic_model;
+  }
+
+  if (c.model.matcha.vocoder) {
+    delete[] c.model.matcha.vocoder;
+  }
+
+  if (c.model.matcha.lexicon) {
+    delete[] c.model.matcha.lexicon;
+  }
+
+  if (c.model.matcha.tokens) {
+    delete[] c.model.matcha.tokens;
+  }
+
+  if (c.model.matcha.data_dir) {
+    delete[] c.model.matcha.data_dir;
+  }
+
+  if (c.model.matcha.dict_dir) {
+    delete[] c.model.matcha.dict_dir;
+  }
+
   if (c.model.provider) {
     delete[] c.model.provider;
   }
@@ -152,7 +200,8 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
   }
 
   return Napi::External<SherpaOnnxOfflineTts>::New(
-      env, tts, [](Napi::Env env, SherpaOnnxOfflineTts *tts) {
+      env, const_cast<SherpaOnnxOfflineTts *>(tts),
+      [](Napi::Env env, SherpaOnnxOfflineTts *tts) {
         SherpaOnnxDestroyOfflineTts(tts);
       });
 }
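The `const_cast` above is presumably needed because `Napi::External<T>::New()` takes a non-const `T *` for the wrapped pointer and hands a non-const `T *` back to the finalizer, while the C API now returns a `const SherpaOnnxOfflineTts *`. A self-contained sketch of that pattern — `Handle`, `CreateHandle`, and `DestroyHandle` are made-up stand-ins, not the real sherpa-onnx C API:

```c++
#include "napi.h"  // from node-addon-api

// Stand-ins for a C API that creates and destroys an opaque handle.
struct Handle {
  int dummy;
};
static const Handle *CreateHandle() { return new Handle{0}; }
static void DestroyHandle(const Handle *h) { delete h; }

static Napi::External<Handle> WrapHandle(Napi::Env env) {
  const Handle *h = CreateHandle();

  // Napi::External<T>::New() wants a T *, so constness is cast away at the
  // JS boundary; the finalizer simply forwards the pointer back to the
  // const-taking destroy function when the external is garbage collected.
  return Napi::External<Handle>::New(
      env, const_cast<Handle *>(h),
      [](Napi::Env /*env*/, Handle *h) { DestroyHandle(h); });
}
```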
@@ -133,6 +133,8 @@ The following tables list the examples in this folder.
 
 |File| Description|
 |---|---|
+|[./test_tts_non_streaming_matcha_icefall_en.js](./test_tts_non_streaming_matcha_icefall_en.js)| Text-to-speech with a [MatchaTTS English Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)|
+|[./test_tts_non_streaming_matcha_icefall_zh.js](./test_tts_non_streaming_matcha_icefall_zh.js)| Text-to-speech with a [MatchaTTS Chinese Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker)|
 |[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model|
 |[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model|
 |[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)|
@@ -345,6 +347,28 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_sense_voice_microphone.js
 ```
 
+### Text-to-speech with MatchaTTS models (English TTS)
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_en.js
+```
+
+### Text-to-speech with MatchaTTS models (Chinese TTS)
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test_tts_non_streaming_matcha_icefall_zh.js
+```
+
 ### Text-to-speech with piper VITS models (TTS)
 
 ```bash
New file: ./test_tts_non_streaming_matcha_icefall_en.js

@@ -0,0 +1,48 @@
+// Copyright (c) 2025 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// please refer to
+// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+// to download model files
+function createOfflineTts() {
+  const config = {
+    model: {
+      matcha: {
+        acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx',
+        vocoder: './hifigan_v2.onnx',
+        lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt',
+        tokens: './matcha-icefall-en_US-ljspeech/tokens.txt',
+        dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data',
+      },
+      debug: true,
+      numThreads: 1,
+      provider: 'cpu',
+    },
+    maxNumSentences: 1,
+  };
+  return new sherpa_onnx.OfflineTts(config);
+}
+
+const tts = createOfflineTts();
+
+const text =
+    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
+
+
+let start = Date.now();
+const audio = tts.generate({text: text, sid: 0, speed: 1.0});
+let stop = Date.now();
+const elapsed_seconds = (stop - start) / 1000;
+const duration = audio.samples.length / audio.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+
+const filename = 'test-matcha-en.wav';
+sherpa_onnx.writeWave(
+    filename, {samples: audio.samples, sampleRate: audio.sampleRate});
+
+console.log(`Saved to ${filename}`);
New file: ./test_tts_non_streaming_matcha_icefall_zh.js

@@ -0,0 +1,50 @@
+// Copyright (c) 2025 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// please refer to
+// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+// to download model files
+function createOfflineTts() {
+  const config = {
+    model: {
+      matcha: {
+        acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',
+        vocoder: './hifigan_v2.onnx',
+        lexicon: './matcha-icefall-zh-baker/lexicon.txt',
+        tokens: './matcha-icefall-zh-baker/tokens.txt',
+        dictDir: './matcha-icefall-zh-baker/dict',
+      },
+      debug: true,
+      numThreads: 1,
+      provider: 'cpu',
+    },
+    maxNumSentences: 1,
+    ruleFsts:
+        './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst',
+  };
+  return new sherpa_onnx.OfflineTts(config);
+}
+
+const tts = createOfflineTts();
+
+const text =
+    '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
+
+
+let start = Date.now();
+const audio = tts.generate({text: text, sid: 0, speed: 1.0});
+let stop = Date.now();
+const elapsed_seconds = (stop - start) / 1000;
+const duration = audio.samples.length / audio.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+
+const filename = 'test-matcha-zh.wav';
+sherpa_onnx.writeWave(
+    filename, {samples: audio.samples, sampleRate: audio.sampleRate});
+
+console.log(`Saved to ${filename}`);
@@ -3,7 +3,7 @@
   "version": "1.0.0",
   "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
   "dependencies": {
-    "cmake-js": "^7.0.0",
+    "cmake-js": "^7.3.0",
     "node-addon-api": "^8.3.0",
     "perf_hooks": "*"
   },