Fangjun Kuang
Committed by GitHub

Add JavaScript API (node-addon-api) for MatchaTTS models. (#1677)

@@ -85,6 +85,25 @@ fi @@ -85,6 +85,25 @@ fi
85 85
86 echo "----------tts----------" 86 echo "----------tts----------"
87 87
  88 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  89 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  90 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  91 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  92 +
  93 +node ./test_tts_non_streaming_matcha_icefall_en.js
  94 +rm hifigan_v2.onnx
  95 +rm -rf matcha-icefall-en_US-ljspeech
  96 +
  97 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  98 +tar xvf matcha-icefall-zh-baker.tar.bz2
  99 +rm matcha-icefall-zh-baker.tar.bz2
  100 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  101 +
  102 +node ./test_tts_non_streaming_matcha_icefall_zh.js
  103 +rm hifigan_v2.onnx
  104 +rm -rf matcha-icefall-zh-baker
  105 +ls -lh *.wav
  106 +
88 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2 107 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
89 tar xf vits-piper-en_GB-cori-medium.tar.bz2 108 tar xf vits-piper-en_GB-cori-medium.tar.bz2
90 rm vits-piper-en_GB-cori-medium.tar.bz2 109 rm vits-piper-en_GB-cori-medium.tar.bz2
@@ -31,6 +31,28 @@ static SherpaOnnxOfflineTtsVitsModelConfig GetOfflineTtsVitsModelConfig( @@ -31,6 +31,28 @@ static SherpaOnnxOfflineTtsVitsModelConfig GetOfflineTtsVitsModelConfig(
31 return c; 31 return c;
32 } 32 }
33 33
  34 +static SherpaOnnxOfflineTtsMatchaModelConfig GetOfflineTtsMatchaModelConfig(
  35 + Napi::Object obj) {
  36 + SherpaOnnxOfflineTtsMatchaModelConfig c;
  37 + memset(&c, 0, sizeof(c));
  38 +
  39 + if (!obj.Has("matcha") || !obj.Get("matcha").IsObject()) {
  40 + return c;
  41 + }
  42 +
  43 + Napi::Object o = obj.Get("matcha").As<Napi::Object>();
  44 + SHERPA_ONNX_ASSIGN_ATTR_STR(acoustic_model, acousticModel);
  45 + SHERPA_ONNX_ASSIGN_ATTR_STR(vocoder, vocoder);
  46 + SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
  47 + SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
  48 + SHERPA_ONNX_ASSIGN_ATTR_STR(data_dir, dataDir);
  49 + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(noise_scale, noiseScale);
  50 + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
  51 + SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
  52 +
  53 + return c;
  54 +}
  55 +
34 static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( 56 static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
35 Napi::Object obj) { 57 Napi::Object obj) {
36 SherpaOnnxOfflineTtsModelConfig c; 58 SherpaOnnxOfflineTtsModelConfig c;
@@ -43,6 +65,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( @@ -43,6 +65,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
43 Napi::Object o = obj.Get("model").As<Napi::Object>(); 65 Napi::Object o = obj.Get("model").As<Napi::Object>();
44 66
45 c.vits = GetOfflineTtsVitsModelConfig(o); 67 c.vits = GetOfflineTtsVitsModelConfig(o);
  68 + c.matcha = GetOfflineTtsMatchaModelConfig(o);
46 69
47 SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); 70 SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
48 71
@@ -107,9 +130,10 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( @@ -107,9 +130,10 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
107 decltype(&OH_ResourceManager_ReleaseNativeResourceManager)> 130 decltype(&OH_ResourceManager_ReleaseNativeResourceManager)>
108 mgr(OH_ResourceManager_InitNativeResourceManager(env, info[1]), 131 mgr(OH_ResourceManager_InitNativeResourceManager(env, info[1]),
109 &OH_ResourceManager_ReleaseNativeResourceManager); 132 &OH_ResourceManager_ReleaseNativeResourceManager);
110 - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get()); 133 + const SherpaOnnxOfflineTts *tts =
  134 + SherpaOnnxCreateOfflineTtsOHOS(&c, mgr.get());
111 #else 135 #else
112 - SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c); 136 + const SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
113 #endif 137 #endif
114 138
115 if (c.model.vits.model) { 139 if (c.model.vits.model) {
@@ -132,6 +156,30 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( @@ -132,6 +156,30 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
132 delete[] c.model.vits.dict_dir; 156 delete[] c.model.vits.dict_dir;
133 } 157 }
134 158
  159 + if (c.model.matcha.acoustic_model) {
  160 + delete[] c.model.matcha.acoustic_model;
  161 + }
  162 +
  163 + if (c.model.matcha.vocoder) {
  164 + delete[] c.model.matcha.vocoder;
  165 + }
  166 +
  167 + if (c.model.matcha.lexicon) {
  168 + delete[] c.model.matcha.lexicon;
  169 + }
  170 +
  171 + if (c.model.matcha.tokens) {
  172 + delete[] c.model.matcha.tokens;
  173 + }
  174 +
  175 + if (c.model.matcha.data_dir) {
  176 + delete[] c.model.matcha.data_dir;
  177 + }
  178 +
  179 + if (c.model.matcha.dict_dir) {
  180 + delete[] c.model.matcha.dict_dir;
  181 + }
  182 +
135 if (c.model.provider) { 183 if (c.model.provider) {
136 delete[] c.model.provider; 184 delete[] c.model.provider;
137 } 185 }
@@ -152,7 +200,8 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( @@ -152,7 +200,8 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
152 } 200 }
153 201
154 return Napi::External<SherpaOnnxOfflineTts>::New( 202 return Napi::External<SherpaOnnxOfflineTts>::New(
155 - env, tts, [](Napi::Env env, SherpaOnnxOfflineTts *tts) { 203 + env, const_cast<SherpaOnnxOfflineTts *>(tts),
  204 + [](Napi::Env env, SherpaOnnxOfflineTts *tts) {
156 SherpaOnnxDestroyOfflineTts(tts); 205 SherpaOnnxDestroyOfflineTts(tts);
157 }); 206 });
158 } 207 }
@@ -133,6 +133,8 @@ The following tables list the examples in this folder. @@ -133,6 +133,8 @@ The following tables list the examples in this folder.
133 133
134 |File| Description| 134 |File| Description|
135 |---|---| 135 |---|---|
  136 +|[./test_tts_non_streaming_matcha_icefall_en.js](./test_tts_non_streaming_matcha_icefall_en.js)| Text-to-speech with a [MatchaTTS English Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)|
  137 +|[./test_tts_non_streaming_matcha_icefall_zh.js](./test_tts_non_streaming_matcha_icefall_zh.js)| Text-to-speech with a [MatchaTTS Chinese Model](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker)|
136 |[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model| 138 |[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model|
137 |[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model| 139 |[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model|
138 |[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)| 140 |[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)|
@@ -345,6 +347,28 @@ npm install naudiodon2 @@ -345,6 +347,28 @@ npm install naudiodon2
345 node ./test_vad_asr_non_streaming_sense_voice_microphone.js 347 node ./test_vad_asr_non_streaming_sense_voice_microphone.js
346 ``` 348 ```
347 349
  350 +### Text-to-speech with MatchaTTS models (English TTS)
  351 +```bash
  352 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  353 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  354 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  355 +
  356 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  357 +
  358 +node ./test_tts_non_streaming_matcha_icefall_en.js
  359 +```
  360 +
  361 +### Text-to-speech with MatchaTTS models (Chinese TTS)
  362 +```bash
  363 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  364 +tar xvf matcha-icefall-zh-baker.tar.bz2
  365 +rm matcha-icefall-zh-baker.tar.bz2
  366 +
  367 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  368 +
  369 +node ./test_tts_non_streaming_matcha_icefall_zh.js
  370 +```
  371 +
348 ### Text-to-speech with piper VITS models (TTS) 372 ### Text-to-speech with piper VITS models (TTS)
349 373
350 ```bash 374 ```bash
  1 +// Copyright (c) 2025 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx-node');
  3 +
  4 +// please refer to
  5 +// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  6 +// to download model files
  7 +function createOfflineTts() {
  8 + const config = {
  9 + model: {
  10 + matcha: {
  11 + acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx',
  12 + vocoder: './hifigan_v2.onnx',
  13 + lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt',
  14 + tokens: './matcha-icefall-en_US-ljspeech/tokens.txt',
  15 + dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data',
  16 + },
  17 + debug: true,
  18 + numThreads: 1,
  19 + provider: 'cpu',
  20 + },
  21 + maxNumSentences: 1,
  22 + };
  23 + return new sherpa_onnx.OfflineTts(config);
  24 +}
  25 +
  26 +const tts = createOfflineTts();
  27 +
  28 +const text =
  29 + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  30 +
  31 +
  32 +let start = Date.now();
  33 +const audio = tts.generate({text: text, sid: 0, speed: 1.0});
  34 +let stop = Date.now();
  35 +const elapsed_seconds = (stop - start) / 1000;
  36 +const duration = audio.samples.length / audio.sampleRate;
  37 +const real_time_factor = elapsed_seconds / duration;
  38 +console.log('Wave duration', duration.toFixed(3), 'secodns')
  39 +console.log('Elapsed', elapsed_seconds.toFixed(3), 'secodns')
  40 +console.log(
  41 + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
  42 + real_time_factor.toFixed(3))
  43 +
  44 +const filename = 'test-matcha-en.wav';
  45 +sherpa_onnx.writeWave(
  46 + filename, {samples: audio.samples, sampleRate: audio.sampleRate});
  47 +
  48 +console.log(`Saved to ${filename}`);
  1 +// Copyright (c) 2025 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx-node');
  3 +
  4 +// please refer to
  5 +// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  6 +// to download model files
  7 +function createOfflineTts() {
  8 + const config = {
  9 + model: {
  10 + matcha: {
  11 + acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',
  12 + vocoder: './hifigan_v2.onnx',
  13 + lexicon: './matcha-icefall-zh-baker/lexicon.txt',
  14 + tokens: './matcha-icefall-zh-baker/tokens.txt',
  15 + dictDir: './matcha-icefall-zh-baker/dict',
  16 + },
  17 + debug: true,
  18 + numThreads: 1,
  19 + provider: 'cpu',
  20 + },
  21 + maxNumSentences: 1,
  22 + ruleFsts:
  23 + './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst',
  24 + };
  25 + return new sherpa_onnx.OfflineTts(config);
  26 +}
  27 +
  28 +const tts = createOfflineTts();
  29 +
  30 +const text =
  31 + '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
  32 +
  33 +
  34 +let start = Date.now();
  35 +const audio = tts.generate({text: text, sid: 0, speed: 1.0});
  36 +let stop = Date.now();
  37 +const elapsed_seconds = (stop - start) / 1000;
  38 +const duration = audio.samples.length / audio.sampleRate;
  39 +const real_time_factor = elapsed_seconds / duration;
  40 +console.log('Wave duration', duration.toFixed(3), 'secodns')
  41 +console.log('Elapsed', elapsed_seconds.toFixed(3), 'secodns')
  42 +console.log(
  43 + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
  44 + real_time_factor.toFixed(3))
  45 +
  46 +const filename = 'test-matcha-zh.wav';
  47 +sherpa_onnx.writeWave(
  48 + filename, {samples: audio.samples, sampleRate: audio.sampleRate});
  49 +
  50 +console.log(`Saved to ${filename}`);
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 "version": "1.0.0", 3 "version": "1.0.0",
4 "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection", 4 "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection",
5 "dependencies": { 5 "dependencies": {
6 - "cmake-js": "^7.0.0", 6 + "cmake-js": "^7.3.0",
7 "node-addon-api": "^8.3.0", 7 "node-addon-api": "^8.3.0",
8 "perf_hooks": "*" 8 "perf_hooks": "*"
9 }, 9 },