Fangjun Kuang
Committed by GitHub

Add TTS for node-addon-api (#871)

@@ -6,6 +6,8 @@ d=nodejs-addon-examples @@ -6,6 +6,8 @@ d=nodejs-addon-examples
6 echo "dir: $d" 6 echo "dir: $d"
7 cd $d 7 cd $d
8 8
  9 +echo "----------streaming asr----------"
  10 +
9 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 11 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
10 tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 12 tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
11 rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 13 rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
@@ -31,6 +33,8 @@ rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 @@ -31,6 +33,8 @@ rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
31 node ./test_asr_streaming_paraformer.js 33 node ./test_asr_streaming_paraformer.js
32 rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en 34 rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
33 35
  36 +echo "----------non-streaming asr----------"
  37 +
34 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 38 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
35 tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 39 tar xvf sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
36 rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2 40 rm sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
@@ -58,3 +62,35 @@ rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 @@ -58,3 +62,35 @@ rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
58 62
59 node ./test_asr_non_streaming_paraformer.js 63 node ./test_asr_non_streaming_paraformer.js
60 rm -rf sherpa-onnx-paraformer-zh-2023-03-28 64 rm -rf sherpa-onnx-paraformer-zh-2023-03-28
  65 +
  66 +echo "----------tts----------"
  67 +
  68 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
  69 +tar xvf vits-piper-en_GB-cori-medium.tar.bz2
  70 +rm vits-piper-en_GB-cori-medium.tar.bz2
  71 +
  72 +node ./test_tts_non_streaming_vits_piper_en.js
  73 +rm -rf vits-piper-en_GB-cori-medium
  74 +
  75 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2
  76 +tar xvf vits-coqui-de-css10.tar.bz2
  77 +rm vits-coqui-de-css10.tar.bz2
  78 +
  79 +node ./test_tts_non_streaming_vits_coqui_de.js
  80 +rm -rf vits-coqui-de-css10
  81 +
  82 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
  83 +tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
  84 +rm sherpa-onnx-vits-zh-ll.tar.bz2
  85 +
  86 +node ./test_tts_non_streaming_vits_zh_ll.js
  87 +rm -rf sherpa-onnx-vits-zh-ll
  88 +
  89 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
  90 +tar xvf vits-icefall-zh-aishell3.tar.bz2
  91 +rm vits-icefall-zh-aishell3.tar.bz2
  92 +
  93 +node ./test_tts_non_streaming_vits_zh_aishell3.js
  94 +rm -rf vits-icefall-zh-aishell3
  95 +
  96 +ls -lh
@@ -94,7 +94,7 @@ jobs: @@ -94,7 +94,7 @@ jobs:
94 -DSHERPA_ONNX_ENABLE_BINARY=OFF \ 94 -DSHERPA_ONNX_ENABLE_BINARY=OFF \
95 .. 95 ..
96 96
97 - make -j 97 + make -j2
98 make install 98 make install
99 cd .. 99 cd ..
100 100
@@ -105,3 +105,4 @@ sherpa-onnx-ced-* @@ -105,3 +105,4 @@ sherpa-onnx-ced-*
105 node_modules 105 node_modules
106 package-lock.json 106 package-lock.json
107 sherpa-onnx-nemo-* 107 sherpa-onnx-nemo-*
  108 +sherpa-onnx-vits-*
@@ -143,3 +143,43 @@ node ./test_asr_non_streaming_paraformer.js @@ -143,3 +143,43 @@ node ./test_asr_non_streaming_paraformer.js
143 npm install naudiodon2 143 npm install naudiodon2
144 node ./test_vad_asr_non_streaming_paraformer_microphone.js 144 node ./test_vad_asr_non_streaming_paraformer_microphone.js
145 ``` 145 ```
  146 +
  147 +## Text-to-speech with piper VITS models (TTS)
  148 +
  149 +```bash
  150 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
  151 +tar xvf vits-piper-en_GB-cori-medium.tar.bz2
  152 +rm vits-piper-en_GB-cori-medium.tar.bz2
  153 +
  154 +node ./test_tts_non_streaming_vits_piper_en.js
  155 +```
  156 +
  157 +## Text-to-speech with Coqui-ai/TTS VITS models (TTS)
  158 +
  159 +```bash
  160 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2
  161 +tar xvf vits-coqui-de-css10.tar.bz2
  162 +rm vits-coqui-de-css10.tar.bz2
  163 +
  164 +node ./test_tts_non_streaming_vits_coqui_de.js
  165 +```
  166 +
  167 +## Text-to-speech with vits Chinese models (1/2)
  168 +
  169 +```bash
  170 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
  171 +tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
  172 +rm sherpa-onnx-vits-zh-ll.tar.bz2
  173 +
  174 +node ./test_tts_non_streaming_vits_zh_ll.js
  175 +```
  176 +
  177 +## Text-to-speech with vits Chinese models (2/2)
  178 +
  179 +```bash
  180 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
  181 +tar xvf vits-icefall-zh-aishell3.tar.bz2
  182 +rm vits-icefall-zh-aishell3.tar.bz2
  183 +
  184 +node ./test_tts_non_streaming_vits_zh_aishell3.js
  185 +```
  1 +// Copyright (c) 2024 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx-node');
  3 +const performance = require('perf_hooks').performance;
  4 +
  5 +// please download model files from
  6 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
/**
 * Create an offline (non-streaming) TTS engine for the German Coqui VITS
 * model.
 *
 * The model files are expected in ./vits-coqui-de-css10 relative to the
 * current working directory.
 *
 * @returns {sherpa_onnx.OfflineTts} a ready-to-use TTS object.
 */
function createOfflineTts() {
  const config = {
    model: {
      vits: {
        model: './vits-coqui-de-css10/model.onnx',
        tokens: './vits-coqui-de-css10/tokens.txt',
      },
      debug: true,
      numThreads: 1,
      provider: 'cpu',
    },
    // The native addon looks up the key `maxNumSentences`; any other
    // spelling is silently ignored.
    maxNumSentences: 1,
  };
  return new sherpa_onnx.OfflineTts(config);
}
  22 +
const tts = createOfflineTts();

const text = 'Alles hat ein Ende, nur die Wurst hat zwei.';

// Measure only the synthesis call so that the real-time factor (RTF)
// reflects generation cost and not model loading or file writing.
const start = performance.now();
const audio = tts.generate({text: text, sid: 0, speed: 1.0});
const stop = performance.now();

const elapsed_seconds = (stop - start) / 1000;
const duration = audio.samples.length / audio.sampleRate;
// RTF = processing time / audio duration
const real_time_factor = elapsed_seconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));

const filename = 'test-coqui-de.wav';
sherpa_onnx.writeWave(
    filename, {samples: audio.samples, sampleRate: audio.sampleRate});

console.log(`Saved to ${filename}`);
  1 +// Copyright (c) 2024 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx-node');
  3 +const performance = require('perf_hooks').performance;
  4 +
  5 +// please download model files from
  6 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
/**
 * Create an offline (non-streaming) TTS engine for the English piper VITS
 * model (en_GB-cori-medium).
 *
 * The model files are expected in ./vits-piper-en_GB-cori-medium relative
 * to the current working directory.
 *
 * @returns {sherpa_onnx.OfflineTts} a ready-to-use TTS object.
 */
function createOfflineTts() {
  const config = {
    model: {
      vits: {
        model: './vits-piper-en_GB-cori-medium/en_GB-cori-medium.onnx',
        tokens: './vits-piper-en_GB-cori-medium/tokens.txt',
        dataDir: './vits-piper-en_GB-cori-medium/espeak-ng-data',
      },
      debug: true,
      numThreads: 1,
      provider: 'cpu',
    },
    // The native addon looks up the key `maxNumSentences`; any other
    // spelling is silently ignored.
    maxNumSentences: 1,
  };
  return new sherpa_onnx.OfflineTts(config);
}
  23 +
const tts = createOfflineTts();

const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

// Measure only the synthesis call so that the real-time factor (RTF)
// reflects generation cost and not model loading or file writing.
const start = performance.now();
const audio = tts.generate({text: text, sid: 0, speed: 1.0});
const stop = performance.now();

const elapsed_seconds = (stop - start) / 1000;
const duration = audio.samples.length / audio.sampleRate;
// RTF = processing time / audio duration
const real_time_factor = elapsed_seconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));

const filename = 'test-piper-en.wav';
sherpa_onnx.writeWave(
    filename, {samples: audio.samples, sampleRate: audio.sampleRate});

console.log(`Saved to ${filename}`);
  1 +// Copyright (c) 2024 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx-node');
  3 +const performance = require('perf_hooks').performance;
  4 +
  5 +// please download model files from
  6 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
/**
 * Create an offline (non-streaming) TTS engine for the Chinese
 * vits-icefall-zh-aishell3 model.
 *
 * The model files, rule FSTs and rule FAR are expected in
 * ./vits-icefall-zh-aishell3 relative to the current working directory.
 *
 * @returns {sherpa_onnx.OfflineTts} a ready-to-use TTS object.
 */
function createOfflineTts() {
  const config = {
    model: {
      vits: {
        model: './vits-icefall-zh-aishell3/model.onnx',
        tokens: './vits-icefall-zh-aishell3/tokens.txt',
        lexicon: './vits-icefall-zh-aishell3/lexicon.txt',
      },
      debug: true,
      numThreads: 1,
      provider: 'cpu',
    },
    // The native addon looks up the key `maxNumSentences`; any other
    // spelling is silently ignored.
    maxNumSentences: 1,
    // Comma-separated list of rule FST files.
    ruleFsts:
        './vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/number.fst,./vits-icefall-zh-aishell3/new_heteronym.fst',
    ruleFars: './vits-icefall-zh-aishell3/rule.far',
  };
  return new sherpa_onnx.OfflineTts(config);
}
  26 +
const tts = createOfflineTts();

const text =
    '他在长沙出生,长白山长大,去过长江,现在他是一个银行的行长,主管行政工作。有困难,请拨110,或者13020240513。今天是2024年5月13号, 他上个月的工资是12345块钱。';

// Measure only the synthesis call so that the real-time factor (RTF)
// reflects generation cost and not model loading or file writing.
const start = performance.now();
const audio = tts.generate({text: text, sid: 88, speed: 1.0});
const stop = performance.now();

const elapsed_seconds = (stop - start) / 1000;
const duration = audio.samples.length / audio.sampleRate;
// RTF = processing time / audio duration
const real_time_factor = elapsed_seconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));

const filename = 'test-zh-aishell3.wav';
sherpa_onnx.writeWave(
    filename, {samples: audio.samples, sampleRate: audio.sampleRate});

console.log(`Saved to ${filename}`);
  1 +// Copyright (c) 2024 Xiaomi Corporation
  2 +const sherpa_onnx = require('sherpa-onnx-node');
  3 +const performance = require('perf_hooks').performance;
  4 +
  5 +// please download model files from
  6 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
/**
 * Create an offline (non-streaming) TTS engine for the Chinese
 * sherpa-onnx-vits-zh-ll model.
 *
 * The model files, dictionary directory and rule FSTs are expected in
 * ./sherpa-onnx-vits-zh-ll relative to the current working directory.
 *
 * @returns {sherpa_onnx.OfflineTts} a ready-to-use TTS object.
 */
function createOfflineTts() {
  const config = {
    model: {
      vits: {
        model: './sherpa-onnx-vits-zh-ll/model.onnx',
        tokens: './sherpa-onnx-vits-zh-ll/tokens.txt',
        lexicon: './sherpa-onnx-vits-zh-ll/lexicon.txt',
        dictDir: './sherpa-onnx-vits-zh-ll/dict',
      },
      debug: true,
      numThreads: 1,
      provider: 'cpu',
    },
    // The native addon looks up the key `maxNumSentences`; any other
    // spelling is silently ignored.
    maxNumSentences: 1,
    // Comma-separated list of rule FST files.
    ruleFsts:
        './sherpa-onnx-vits-zh-ll/date.fst,./sherpa-onnx-vits-zh-ll/phone.fst,./sherpa-onnx-vits-zh-ll/number.fst',
  };
  return new sherpa_onnx.OfflineTts(config);
}
  26 +
const tts = createOfflineTts();

const text =
    '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月13号,拨打110或者18920240513。123456块钱。';

// Measure only the synthesis call so that the real-time factor (RTF)
// reflects generation cost and not model loading or file writing.
const start = performance.now();
const audio = tts.generate({text: text, sid: 2, speed: 1.0});
const stop = performance.now();

const elapsed_seconds = (stop - start) / 1000;
const duration = audio.samples.length / audio.sampleRate;
// RTF = processing time / audio duration
const real_time_factor = elapsed_seconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));

const filename = 'test-zh-ll.wav';
sherpa_onnx.writeWave(
    filename, {samples: audio.samples, sampleRate: audio.sampleRate});

console.log(`Saved to ${filename}`);
@@ -99,7 +99,7 @@ ai.on('data', data => { @@ -99,7 +99,7 @@ ai.on('data', data => {
99 .split(' ')[0]}.wav`; 99 .split(' ')[0]}.wav`;
100 sherpa_onnx.writeWave( 100 sherpa_onnx.writeWave(
101 filename, 101 filename,
102 - {samples: segment.samples, sampleRate: vad.config.sampleRate}) 102 + {samples: segment.samples, sampleRate: vad.config.sampleRate});
103 103
104 index += 1; 104 index += 1;
105 } 105 }
@@ -97,7 +97,7 @@ ai.on('data', data => { @@ -97,7 +97,7 @@ ai.on('data', data => {
97 .split(' ')[0]}.wav`; 97 .split(' ')[0]}.wav`;
98 sherpa_onnx.writeWave( 98 sherpa_onnx.writeWave(
99 filename, 99 filename,
100 - {samples: segment.samples, sampleRate: vad.config.sampleRate}) 100 + {samples: segment.samples, sampleRate: vad.config.sampleRate});
101 101
102 index += 1; 102 index += 1;
103 } 103 }
@@ -102,7 +102,7 @@ ai.on('data', data => { @@ -102,7 +102,7 @@ ai.on('data', data => {
102 .split(' ')[0]}.wav`; 102 .split(' ')[0]}.wav`;
103 sherpa_onnx.writeWave( 103 sherpa_onnx.writeWave(
104 filename, 104 filename,
105 - {samples: segment.samples, sampleRate: vad.config.sampleRate}) 105 + {samples: segment.samples, sampleRate: vad.config.sampleRate});
106 106
107 index += 1; 107 index += 1;
108 } 108 }
@@ -98,7 +98,7 @@ ai.on('data', data => { @@ -98,7 +98,7 @@ ai.on('data', data => {
98 .split(' ')[0]}.wav`; 98 .split(' ')[0]}.wav`;
99 sherpa_onnx.writeWave( 99 sherpa_onnx.writeWave(
100 filename, 100 filename,
101 - {samples: segment.samples, sampleRate: vad.config.sampleRate}) 101 + {samples: segment.samples, sampleRate: vad.config.sampleRate});
102 102
103 index += 1; 103 index += 1;
104 } 104 }
@@ -71,7 +71,7 @@ ai.on('data', data => { @@ -71,7 +71,7 @@ ai.on('data', data => {
71 .split(' ')[0]}.wav`; 71 .split(' ')[0]}.wav`;
72 sherpa_onnx.writeWave( 72 sherpa_onnx.writeWave(
73 filename, 73 filename,
74 - {samples: segment.samples, sampleRate: vad.config.sampleRate}) 74 + {samples: segment.samples, sampleRate: vad.config.sampleRate});
75 const duration = segment.samples.length / vad.config.sampleRate; 75 const duration = segment.samples.length / vad.config.sampleRate;
76 console.log(`${index} End of speech. Duration: ${duration} seconds`); 76 console.log(`${index} End of speech. Duration: ${duration} seconds`);
77 console.log(`Saved to ${filename}`); 77 console.log(`Saved to ${filename}`);
@@ -19,6 +19,7 @@ include_directories(${CMAKE_JS_INC}) @@ -19,6 +19,7 @@ include_directories(${CMAKE_JS_INC})
19 19
20 set(srcs 20 set(srcs
21 src/non-streaming-asr.cc 21 src/non-streaming-asr.cc
  22 + src/non-streaming-tts.cc
22 src/sherpa-onnx-node-addon-api.cc 23 src/sherpa-onnx-node-addon-api.cc
23 src/streaming-asr.cc 24 src/streaming-asr.cc
24 src/vad.cc 25 src/vad.cc
@@ -25,8 +25,8 @@ for (const p of possible_paths) { @@ -25,8 +25,8 @@ for (const p of possible_paths) {
25 } 25 }
26 26
27 if (!found) { 27 if (!found) {
28 - let msg =  
29 - `Could not find sherpa-onnx. Tried\n\n ${possible_paths.join('\n ')}\n` 28 + let msg = `Could not find sherpa-onnx-node. Tried\n\n ${
  29 + possible_paths.join('\n ')}\n`
30 if (os.platform() == 'darwin' && process.env.DYLD_LIBRARY_PATH && 30 if (os.platform() == 'darwin' && process.env.DYLD_LIBRARY_PATH &&
31 !process.env.DYLD_LIBRARY_PATH.includes( 31 !process.env.DYLD_LIBRARY_PATH.includes(
32 `node_modules/sherpa-onnx-${platform_arch}`)) { 32 `node_modules/sherpa-onnx-${platform_arch}`)) {
  1 +const addon = require('./addon.js');
  2 +
  3 +class OfflineTts {
  4 + constructor(config) {
  5 + this.handle = addon.createOfflineTts(config);
  6 + this.config = config;
  7 +
  8 + this.numSpeakers = addon.getOfflineTtsNumSpeakers(this.handle);
  9 + this.sampleRate = addon.getOfflineTtsSampleRate(this.handle);
  10 + }
  11 +
  12 + /*
  13 + input obj: {text: "xxxx", sid: 0, speed: 1.0}
  14 + where text is a string, sid is a int32, speed is a float
  15 +
  16 + return an object {samples: Float32Array, sampleRate: <a number>}
  17 + */
  18 + generate(obj) {
  19 + return addon.offlineTtsGenerate(this.handle, obj);
  20 + }
  21 +}
  22 +
  23 +module.exports = {
  24 + OfflineTts,
  25 +}
1 const addon = require('./addon.js') 1 const addon = require('./addon.js')
2 const streaming_asr = require('./streaming-asr.js'); 2 const streaming_asr = require('./streaming-asr.js');
3 const non_streaming_asr = require('./non-streaming-asr.js'); 3 const non_streaming_asr = require('./non-streaming-asr.js');
  4 +const non_streaming_tts = require('./non-streaming-tts.js');
4 const vad = require('./vad.js'); 5 const vad = require('./vad.js');
5 6
6 module.exports = { 7 module.exports = {
7 OnlineRecognizer: streaming_asr.OnlineRecognizer, 8 OnlineRecognizer: streaming_asr.OnlineRecognizer,
8 OfflineRecognizer: non_streaming_asr.OfflineRecognizer, 9 OfflineRecognizer: non_streaming_asr.OfflineRecognizer,
  10 + OfflineTts: non_streaming_tts.OfflineTts,
9 readWave: addon.readWave, 11 readWave: addon.readWave,
10 writeWave: addon.writeWave, 12 writeWave: addon.writeWave,
11 Display: streaming_asr.Display, 13 Display: streaming_asr.Display,
  1 +// scripts/node-addon-api/src/non-streaming-tts.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +#include <sstream>
  6 +
  7 +#include "napi.h" // NOLINT
  8 +#include "sherpa-onnx/c-api/c-api.h"
  9 +
  10 +static SherpaOnnxOfflineTtsVitsModelConfig GetOfflineTtsVitsModelConfig(
  11 + Napi::Object obj) {
  12 + SherpaOnnxOfflineTtsVitsModelConfig c;
  13 + memset(&c, 0, sizeof(c));
  14 +
  15 + if (!obj.Has("vits") || !obj.Get("vits").IsObject()) {
  16 + return c;
  17 + }
  18 +
  19 + Napi::Object o = obj.Get("vits").As<Napi::Object>();
  20 +
  21 + if (o.Has("model") && o.Get("model").IsString()) {
  22 + Napi::String model = o.Get("model").As<Napi::String>();
  23 + std::string s = model.Utf8Value();
  24 + char *p = new char[s.size() + 1];
  25 + std::copy(s.begin(), s.end(), p);
  26 + p[s.size()] = 0;
  27 +
  28 + c.model = p;
  29 + }
  30 +
  31 + if (o.Has("lexicon") && o.Get("lexicon").IsString()) {
  32 + Napi::String lexicon = o.Get("lexicon").As<Napi::String>();
  33 + std::string s = lexicon.Utf8Value();
  34 + char *p = new char[s.size() + 1];
  35 + std::copy(s.begin(), s.end(), p);
  36 + p[s.size()] = 0;
  37 +
  38 + c.lexicon = p;
  39 + }
  40 +
  41 + if (o.Has("tokens") && o.Get("tokens").IsString()) {
  42 + Napi::String tokens = o.Get("tokens").As<Napi::String>();
  43 + std::string s = tokens.Utf8Value();
  44 + char *p = new char[s.size() + 1];
  45 + std::copy(s.begin(), s.end(), p);
  46 + p[s.size()] = 0;
  47 +
  48 + c.tokens = p;
  49 + }
  50 +
  51 + if (o.Has("dataDir") && o.Get("dataDir").IsString()) {
  52 + Napi::String data_dir = o.Get("dataDir").As<Napi::String>();
  53 + std::string s = data_dir.Utf8Value();
  54 + char *p = new char[s.size() + 1];
  55 + std::copy(s.begin(), s.end(), p);
  56 + p[s.size()] = 0;
  57 +
  58 + c.data_dir = p;
  59 + }
  60 +
  61 + if (o.Has("noiseScale") && o.Get("noiseScale").IsNumber()) {
  62 + c.noise_scale = o.Get("noiseScale").As<Napi::Number>().FloatValue();
  63 + }
  64 +
  65 + if (o.Has("noiseScaleW") && o.Get("noiseScaleW").IsNumber()) {
  66 + c.noise_scale_w = o.Get("noiseScaleW").As<Napi::Number>().FloatValue();
  67 + }
  68 +
  69 + if (o.Has("lengthScale") && o.Get("lengthScale").IsNumber()) {
  70 + c.length_scale = o.Get("lengthScale").As<Napi::Number>().FloatValue();
  71 + }
  72 +
  73 + if (o.Has("dictDir") && o.Get("dictDir").IsString()) {
  74 + Napi::String dict_dir = o.Get("dictDir").As<Napi::String>();
  75 + std::string s = dict_dir.Utf8Value();
  76 + char *p = new char[s.size() + 1];
  77 + std::copy(s.begin(), s.end(), p);
  78 + p[s.size()] = 0;
  79 +
  80 + c.dict_dir = p;
  81 + }
  82 +
  83 + return c;
  84 +}
  85 +
  86 +static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig(
  87 + Napi::Object obj) {
  88 + SherpaOnnxOfflineTtsModelConfig c;
  89 + memset(&c, 0, sizeof(c));
  90 +
  91 + if (!obj.Has("model") || !obj.Get("model").IsObject()) {
  92 + return c;
  93 + }
  94 +
  95 + Napi::Object o = obj.Get("model").As<Napi::Object>();
  96 +
  97 + c.vits = GetOfflineTtsVitsModelConfig(o);
  98 +
  99 + if (o.Has("numThreads") && o.Get("numThreads").IsNumber()) {
  100 + c.num_threads = o.Get("numThreads").As<Napi::Number>().Int32Value();
  101 + }
  102 +
  103 + if (o.Has("debug") &&
  104 + (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
  105 + if (o.Get("debug").IsBoolean()) {
  106 + c.debug = o.Get("debug").As<Napi::Boolean>().Value();
  107 + } else {
  108 + c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
  109 + }
  110 + }
  111 +
  112 + if (o.Has("provider") && o.Get("provider").IsString()) {
  113 + Napi::String provider = o.Get("provider").As<Napi::String>();
  114 + std::string s = provider.Utf8Value();
  115 + char *p = new char[s.size() + 1];
  116 + std::copy(s.begin(), s.end(), p);
  117 + p[s.size()] = 0;
  118 +
  119 + c.provider = p;
  120 + }
  121 +
  122 + return c;
  123 +}
  124 +
  125 +static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
  126 + const Napi::CallbackInfo &info) {
  127 + Napi::Env env = info.Env();
  128 + if (info.Length() != 1) {
  129 + std::ostringstream os;
  130 + os << "Expect only 1 argument. Given: " << info.Length();
  131 +
  132 + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
  133 +
  134 + return {};
  135 + }
  136 +
  137 + if (!info[0].IsObject()) {
  138 + Napi::TypeError::New(env, "Expect an object as the argument")
  139 + .ThrowAsJavaScriptException();
  140 +
  141 + return {};
  142 + }
  143 +
  144 + Napi::Object o = info[0].As<Napi::Object>();
  145 +
  146 + SherpaOnnxOfflineTtsConfig c;
  147 + memset(&c, 0, sizeof(c));
  148 +
  149 + c.model = GetOfflineTtsModelConfig(o);
  150 +
  151 + if (o.Has("ruleFsts") && o.Get("ruleFsts").IsString()) {
  152 + Napi::String rule_fsts = o.Get("ruleFsts").As<Napi::String>();
  153 + std::string s = rule_fsts.Utf8Value();
  154 + char *p = new char[s.size() + 1];
  155 + std::copy(s.begin(), s.end(), p);
  156 + p[s.size()] = 0;
  157 +
  158 + c.rule_fsts = p;
  159 + }
  160 +
  161 + if (o.Has("maxNumSentences") && o.Get("maxNumSentences").IsNumber()) {
  162 + c.max_num_sentences =
  163 + o.Get("maxNumSentences").As<Napi::Number>().Int32Value();
  164 + }
  165 +
  166 + if (o.Has("ruleFars") && o.Get("ruleFars").IsString()) {
  167 + Napi::String rule_fars = o.Get("ruleFars").As<Napi::String>();
  168 + std::string s = rule_fars.Utf8Value();
  169 + char *p = new char[s.size() + 1];
  170 + std::copy(s.begin(), s.end(), p);
  171 + p[s.size()] = 0;
  172 +
  173 + c.rule_fars = p;
  174 + }
  175 +
  176 + SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&c);
  177 +
  178 + if (c.model.vits.model) {
  179 + delete[] c.model.vits.model;
  180 + }
  181 +
  182 + if (c.model.vits.lexicon) {
  183 + delete[] c.model.vits.lexicon;
  184 + }
  185 +
  186 + if (c.model.vits.tokens) {
  187 + delete[] c.model.vits.tokens;
  188 + }
  189 +
  190 + if (c.model.vits.data_dir) {
  191 + delete[] c.model.vits.data_dir;
  192 + }
  193 +
  194 + if (c.model.vits.dict_dir) {
  195 + delete[] c.model.vits.dict_dir;
  196 + }
  197 +
  198 + if (c.model.provider) {
  199 + delete[] c.model.provider;
  200 + }
  201 +
  202 + if (c.rule_fsts) {
  203 + delete[] c.rule_fsts;
  204 + }
  205 +
  206 + if (c.rule_fars) {
  207 + delete[] c.rule_fars;
  208 + }
  209 +
  210 + if (!tts) {
  211 + Napi::TypeError::New(env, "Please check your config!")
  212 + .ThrowAsJavaScriptException();
  213 +
  214 + return {};
  215 + }
  216 +
  217 + return Napi::External<SherpaOnnxOfflineTts>::New(
  218 + env, tts, [](Napi::Env env, SherpaOnnxOfflineTts *tts) {
  219 + SherpaOnnxDestroyOfflineTts(tts);
  220 + });
  221 +}
  222 +
  223 +static Napi::Number OfflineTtsSampleRateWrapper(
  224 + const Napi::CallbackInfo &info) {
  225 + Napi::Env env = info.Env();
  226 +
  227 + if (info.Length() != 1) {
  228 + std::ostringstream os;
  229 + os << "Expect only 1 argument. Given: " << info.Length();
  230 +
  231 + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
  232 +
  233 + return {};
  234 + }
  235 +
  236 + if (!info[0].IsExternal()) {
  237 + Napi::TypeError::New(env, "Argument 0 should be an offline tts pointer.")
  238 + .ThrowAsJavaScriptException();
  239 +
  240 + return {};
  241 + }
  242 +
  243 + SherpaOnnxOfflineTts *tts =
  244 + info[0].As<Napi::External<SherpaOnnxOfflineTts>>().Data();
  245 +
  246 + int32_t sample_rate = SherpaOnnxOfflineTtsSampleRate(tts);
  247 +
  248 + return Napi::Number::New(env, sample_rate);
  249 +}
  250 +
  251 +static Napi::Number OfflineTtsNumSpeakersWrapper(
  252 + const Napi::CallbackInfo &info) {
  253 + Napi::Env env = info.Env();
  254 +
  255 + if (info.Length() != 1) {
  256 + std::ostringstream os;
  257 + os << "Expect only 1 argument. Given: " << info.Length();
  258 +
  259 + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
  260 +
  261 + return {};
  262 + }
  263 +
  264 + if (!info[0].IsExternal()) {
  265 + Napi::TypeError::New(env, "Argument 0 should be an offline tts pointer.")
  266 + .ThrowAsJavaScriptException();
  267 +
  268 + return {};
  269 + }
  270 +
  271 + SherpaOnnxOfflineTts *tts =
  272 + info[0].As<Napi::External<SherpaOnnxOfflineTts>>().Data();
  273 +
  274 + int32_t num_speakers = SherpaOnnxOfflineTtsNumSpeakers(tts);
  275 +
  276 + return Napi::Number::New(env, num_speakers);
  277 +}
  278 +
  279 +static Napi::Object OfflineTtsGenerateWrapper(const Napi::CallbackInfo &info) {
  280 + Napi::Env env = info.Env();
  281 +
  282 + if (info.Length() != 2) {
  283 + std::ostringstream os;
  284 + os << "Expect only 1 argument. Given: " << info.Length();
  285 +
  286 + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
  287 +
  288 + return {};
  289 + }
  290 +
  291 + if (!info[0].IsExternal()) {
  292 + Napi::TypeError::New(env, "Argument 0 should be an offline tts pointer.")
  293 + .ThrowAsJavaScriptException();
  294 +
  295 + return {};
  296 + }
  297 +
  298 + SherpaOnnxOfflineTts *tts =
  299 + info[0].As<Napi::External<SherpaOnnxOfflineTts>>().Data();
  300 +
  301 + if (!info[1].IsObject()) {
  302 + Napi::TypeError::New(env, "Argument 1 should be an object")
  303 + .ThrowAsJavaScriptException();
  304 +
  305 + return {};
  306 + }
  307 +
  308 + Napi::Object obj = info[1].As<Napi::Object>();
  309 +
  310 + if (!obj.Has("text")) {
  311 + Napi::TypeError::New(env, "The argument object should have a field text")
  312 + .ThrowAsJavaScriptException();
  313 +
  314 + return {};
  315 + }
  316 +
  317 + if (!obj.Get("text").IsString()) {
  318 + Napi::TypeError::New(env, "The object['text'] should be a string")
  319 + .ThrowAsJavaScriptException();
  320 +
  321 + return {};
  322 + }
  323 +
  324 + if (!obj.Has("sid")) {
  325 + Napi::TypeError::New(env, "The argument object should have a field sid")
  326 + .ThrowAsJavaScriptException();
  327 +
  328 + return {};
  329 + }
  330 +
  331 + if (!obj.Get("sid").IsNumber()) {
  332 + Napi::TypeError::New(env, "The object['sid'] should be a number")
  333 + .ThrowAsJavaScriptException();
  334 +
  335 + return {};
  336 + }
  337 +
  338 + if (!obj.Has("speed")) {
  339 + Napi::TypeError::New(env, "The argument object should have a field speed")
  340 + .ThrowAsJavaScriptException();
  341 +
  342 + return {};
  343 + }
  344 +
  345 + if (!obj.Get("speed").IsNumber()) {
  346 + Napi::TypeError::New(env, "The object['speed'] should be a number")
  347 + .ThrowAsJavaScriptException();
  348 +
  349 + return {};
  350 + }
  351 +
  352 + Napi::String _text = obj.Get("text").As<Napi::String>();
  353 + std::string text = _text.Utf8Value();
  354 + int32_t sid = obj.Get("sid").As<Napi::Number>().Int32Value();
  355 + float speed = obj.Get("speed").As<Napi::Number>().FloatValue();
  356 +
  357 + const SherpaOnnxGeneratedAudio *audio =
  358 + SherpaOnnxOfflineTtsGenerate(tts, text.c_str(), sid, speed);
  359 +
  360 + Napi::ArrayBuffer arrayBuffer = Napi::ArrayBuffer::New(
  361 + env, const_cast<float *>(audio->samples), sizeof(float) * audio->n,
  362 + [](Napi::Env /*env*/, void * /*data*/,
  363 + const SherpaOnnxGeneratedAudio *hint) {
  364 + SherpaOnnxDestroyOfflineTtsGeneratedAudio(hint);
  365 + },
  366 + audio);
  367 + Napi::Float32Array float32Array =
  368 + Napi::Float32Array::New(env, audio->n, arrayBuffer, 0);
  369 +
  370 + Napi::Object ans = Napi::Object::New(env);
  371 + ans.Set(Napi::String::New(env, "samples"), float32Array);
  372 + ans.Set(Napi::String::New(env, "sampleRate"), audio->sample_rate);
  373 + return ans;
  374 +}
  375 +
  376 +void InitNonStreamingTts(Napi::Env env, Napi::Object exports) {
  377 + exports.Set(Napi::String::New(env, "createOfflineTts"),
  378 + Napi::Function::New(env, CreateOfflineTtsWrapper));
  379 +
  380 + exports.Set(Napi::String::New(env, "getOfflineTtsSampleRate"),
  381 + Napi::Function::New(env, OfflineTtsSampleRateWrapper));
  382 +
  383 + exports.Set(Napi::String::New(env, "getOfflineTtsNumSpeakers"),
  384 + Napi::Function::New(env, OfflineTtsNumSpeakersWrapper));
  385 +
  386 + exports.Set(Napi::String::New(env, "offlineTtsGenerate"),
  387 + Napi::Function::New(env, OfflineTtsGenerateWrapper));
  388 +}
@@ -7,6 +7,8 @@ void InitStreamingAsr(Napi::Env env, Napi::Object exports); @@ -7,6 +7,8 @@ void InitStreamingAsr(Napi::Env env, Napi::Object exports);
7 7
8 void InitNonStreamingAsr(Napi::Env env, Napi::Object exports); 8 void InitNonStreamingAsr(Napi::Env env, Napi::Object exports);
9 9
  10 +void InitNonStreamingTts(Napi::Env env, Napi::Object exports);
  11 +
10 void InitVad(Napi::Env env, Napi::Object exports); 12 void InitVad(Napi::Env env, Napi::Object exports);
11 13
12 void InitWaveReader(Napi::Env env, Napi::Object exports); 14 void InitWaveReader(Napi::Env env, Napi::Object exports);
@@ -16,6 +18,7 @@ void InitWaveWriter(Napi::Env env, Napi::Object exports); @@ -16,6 +18,7 @@ void InitWaveWriter(Napi::Env env, Napi::Object exports);
16 Napi::Object Init(Napi::Env env, Napi::Object exports) { 18 Napi::Object Init(Napi::Env env, Napi::Object exports) {
17 InitStreamingAsr(env, exports); 19 InitStreamingAsr(env, exports);
18 InitNonStreamingAsr(env, exports); 20 InitNonStreamingAsr(env, exports);
  21 + InitNonStreamingTts(env, exports);
19 InitVad(env, exports); 22 InitVad(env, exports);
20 InitWaveReader(env, exports); 23 InitWaveReader(env, exports);
21 InitWaveWriter(env, exports); 24 InitWaveWriter(env, exports);
@@ -605,7 +605,7 @@ static void InputFinishedWrapper(const Napi::CallbackInfo &info) { @@ -605,7 +605,7 @@ static void InputFinishedWrapper(const Napi::CallbackInfo &info) {
605 605
606 if (info.Length() != 1) { 606 if (info.Length() != 1) {
607 std::ostringstream os; 607 std::ostringstream os;
608 - os << "Expect only 1 arguments. Given: " << info.Length(); 608 + os << "Expect only 1 argument. Given: " << info.Length();
609 609
610 Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); 610 Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
611 611
@@ -823,7 +823,7 @@ SHERPA_ONNX_API int32_t @@ -823,7 +823,7 @@ SHERPA_ONNX_API int32_t
823 SherpaOnnxOfflineTtsNumSpeakers(const SherpaOnnxOfflineTts *tts); 823 SherpaOnnxOfflineTtsNumSpeakers(const SherpaOnnxOfflineTts *tts);
824 824
825 // Generate audio from the given text and speaker id (sid). 825 // Generate audio from the given text and speaker id (sid).
826 -// The user has to use DestroyOfflineTtsGeneratedAudio() to free the 826 +// The user has to use SherpaOnnxDestroyOfflineTtsGeneratedAudio() to free the
827 // returned pointer to avoid memory leak. 827 // returned pointer to avoid memory leak.
828 SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( 828 SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
829 const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, 829 const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,