Committed by GitHub
Add streaming CTC ASR APIs for node-addon-api (#867)
Showing 15 changed files with 445 additions and 31 deletions.
@@ -5,15 +5,6 @@ set -ex
 d=nodejs-addon-examples
 echo "dir: $d"
 cd $d
-npm install --verbose
-git status
-ls -lh
-ls -lh node_modules
-
-export DYLD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-darwin-x64:$DYLD_LIBRARY_PATH
-export DYLD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-darwin-arm64:$DYLD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PATH
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
 tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
@@ -22,3 +13,14 @@ rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
 node test_asr_streaming_transducer.js
 
 rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
+tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
+rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
+
+node ./test_asr_streaming_ctc.js
+
+# To decode with HLG.fst
+node ./test_asr_streaming_ctc_hlg.js
+
+rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
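For context, the two new test scripts exercise the streaming CTC API added in this PR. A minimal sketch of what they do, with the config shape taken from the example files added below (the model paths here are placeholders):

```js
const sherpa_onnx = require('sherpa-onnx-node');

// Build a streaming zipformer CTC recognizer; paths are placeholders.
const recognizer = new sherpa_onnx.OnlineRecognizer({
  featConfig: {sampleRate: 16000, featureDim: 80},
  modelConfig: {
    zipformer2Ctc: {model: './ctc-model.int8.onnx'},
    tokens: './tokens.txt',
    numThreads: 2,
    provider: 'cpu',
  },
});

// Decode one wave file end to end.
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave('./test.wav');
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
while (recognizer.isReady(stream)) {
  recognizer.decode(stream);
}
console.log(recognizer.getResult(stream));
```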
@@ -152,17 +152,23 @@ jobs:
 
           ./node_modules/.bin/cmake-js compile --log-level verbose
 
-      - name: Test streaming transducer
+      - name: Run tests
         shell: bash
         run: |
           export PATH=$PWD/build/install/lib:$PATH
           export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
-
-          cd scripts/node-addon-api
-          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
-          tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
-          rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
-
-          node test/test_asr_streaming_transducer.js
-
-          rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
+          d=nodejs-addon-examples
+          cd $d
+          files=$(ls *.js)
+          echo $files
+          for f in ${files[@]}; do
+            echo $f
+            sed -i.bak s%sherpa-onnx-node%./sherpa-onnx% ./$f
+          done
+          cd ..
+
+          cp -v scripts/node-addon-api/build/Release/sherpa-onnx.node $d/
+          cp -v scripts/node-addon-api/lib/*.js $d/
+          cp -v ./build/install/lib/lib* $d/
+
+          .github/scripts/test-nodejs-addon-npm.sh
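The `sed` invocation switches every example from the published npm package to the locally built addon. Conceptually it rewrites each import like this (a sketch; the `lib/*.js` files copied in the next step provide the local `./sherpa-onnx` entry point):

```js
// Before the rewrite: resolve the addon through the npm package.
// const sherpa_onnx = require('sherpa-onnx-node');

// After the rewrite: load the locally copied entry point, which in turn
// picks up ./sherpa-onnx.node produced by the cmake-js build above.
const sherpa_onnx = require('./sherpa-onnx');
```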
@@ -63,4 +63,19 @@ jobs:
       - name: Run tests
         shell: bash
         run: |
+          d=nodejs-addon-examples
+          echo "dir: $d"
+          cd $d
+          npm install --verbose
+          git status
+          ls -lh
+          ls -lh node_modules
+
+          export DYLD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-darwin-x64:$DYLD_LIBRARY_PATH
+          export DYLD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-darwin-arm64:$DYLD_LIBRARY_PATH
+          export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH
+          export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PATH
+
+          cd ../
+
           .github/scripts/test-nodejs-addon-npm.sh
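The `DYLD_LIBRARY_PATH`/`LD_LIBRARY_PATH` exports cover macOS and Linux respectively: `npm install` pulls in a platform-specific package (e.g. `sherpa-onnx-linux-x64`) that ships the prebuilt shared libraries, and the dynamic loader needs those directories on its search path at run time.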
@@ -27,6 +27,18 @@ export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PATH
 ```
 
+## Voice activity detection (VAD)
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+# To run the test with a microphone, you need to install the package naudiodon2
+npm install naudiodon2
+
+node ./test_vad_microphone.js
+```
+
 ## Streaming speech recognition with zipformer transducer
 
 ```bash
@@ -36,21 +48,27 @@ rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
 
 node ./test_asr_streaming_transducer.js
 
-# To run the test with microphone, you need to install the package naudiodon2
+# To run the test with a microphone, you need to install the package naudiodon2
 npm install naudiodon2
 
 node ./test_asr_streaming_transducer_microphone.js
 ```
 
-# VAD
+## Streaming speech recognition with zipformer CTC
 
 ```bash
-wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
+tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
+rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
 
+node ./test_asr_streaming_ctc.js
 
-# To run the test with microphone, you need to install the package naudiodon2
+# To decode with HLG.fst
+node ./test_asr_streaming_ctc_hlg.js
+
+# To run the test with a microphone, you need to install the package naudiodon2
 npm install naudiodon2
 
-node ./test_vad_microphone.js
+node ./test_asr_streaming_ctc_microphone.js
+node ./test_asr_streaming_ctc_hlg_microphone.js
 ```
-
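The two CTC examples differ only in how decoding is configured: adding a `ctcFstDecoderConfig` block switches the recognizer from greedy CTC decoding to HLG-based decoding. A sketch of the delta, with paths following the model directory used above:

```js
const config = {
  featConfig: {sampleRate: 16000, featureDim: 80},
  modelConfig: {
    zipformer2Ctc: {
      model:
          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
    },
    tokens: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
  },
  // Present only in the HLG example; omit it for plain greedy CTC decoding.
  ctcFstDecoderConfig: {
    graph: './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
  },
};
```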
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'zipformer2Ctc': {
+      'model':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+    },
+    'tokens':
+        './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename =
+    './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/0.wav';
+
+const recognizer = new sherpa_onnx.OnlineRecognizer(config);
+console.log('Started')
+let start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+const tailPadding = new Float32Array(wave.sampleRate * 0.4);
+stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
+
+while (recognizer.isReady(stream)) {
+  recognizer.decode(stream);
+}
+const result = recognizer.getResult(stream);
+let stop = performance.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
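Note the 0.4 s `tailPadding` of silence fed in after the wave: a streaming model needs some trailing audio to flush its final frames before the last result is read.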
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'zipformer2Ctc': {
+      'model':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+    },
+    'tokens':
+        './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  },
+  'ctcFstDecoderConfig': {
+    'graph': './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
+  },
+};
+
+const waveFilename =
+    './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/1.wav';
+
+const recognizer = new sherpa_onnx.OnlineRecognizer(config);
+console.log('Started')
+let start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+const tailPadding = new Float32Array(wave.sampleRate * 0.4);
+stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
+
+while (recognizer.isReady(stream)) {
+  recognizer.decode(stream);
+}
+const result = recognizer.getResult(stream);
+let stop = performance.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
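This file is identical to `test_asr_streaming_ctc.js` except for the added `ctcFstDecoderConfig` block pointing at `HLG.fst` and the different test wave.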
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createOnlineRecognizer() {
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'zipformer2Ctc': {
+        'model':
+            './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+      },
+      'tokens':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    },
+    'ctcFstDecoderConfig': {
+      'graph': './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
+    },
+    'enableEndpoint': true,
+    'rule1MinTrailingSilence': 2.4,
+    'rule2MinTrailingSilence': 1.2,
+    'rule3MinUtteranceLength': 20
+  };
+
+  return new sherpa_onnx.OnlineRecognizer(config);
+}
+
+const recognizer = createOnlineRecognizer();
+const stream = recognizer.createStream();
+
+let lastText = '';
+let segmentIndex = 0;
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: recognizer.config.featConfig.sampleRate
+  }
+});
+
+const display = new sherpa_onnx.Display(50);
+
+ai.on('data', data => {
+  const samples = new Float32Array(data.buffer);
+
+  stream.acceptWaveform(
+      {sampleRate: recognizer.config.featConfig.sampleRate, samples: samples});
+
+  while (recognizer.isReady(stream)) {
+    recognizer.decode(stream);
+  }
+
+  const isEndpoint = recognizer.isEndpoint(stream);
+  const text = recognizer.getResult(stream).text.toLowerCase();
+
+  if (text.length > 0 && lastText != text) {
+    lastText = text;
+    display.print(segmentIndex, lastText);
+  }
+  if (isEndpoint) {
+    if (text.length > 0) {
+      lastText = text;
+      segmentIndex += 1;
+    }
+    recognizer.reset(stream)
+  }
+});
+
+ai.on('close', () => {
+  console.log('Free resources');
+  stream.free();
+  recognizer.free();
+});
+
+ai.start();
+console.log('Started! Please speak')
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createOnlineRecognizer() {
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'zipformer2Ctc': {
+        'model':
+            './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+      },
+      'tokens':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    },
+    'decodingMethod': 'greedy_search',
+    'maxActivePaths': 4,
+    'enableEndpoint': true,
+    'rule1MinTrailingSilence': 2.4,
+    'rule2MinTrailingSilence': 1.2,
+    'rule3MinUtteranceLength': 20
+  };
+
+  return new sherpa_onnx.OnlineRecognizer(config);
+}
+
+const recognizer = createOnlineRecognizer();
+const stream = recognizer.createStream();
+
+let lastText = '';
+let segmentIndex = 0;
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected, if
+                         // set false then just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: recognizer.config.featConfig.sampleRate
+  }
+});
+
+const display = new sherpa_onnx.Display(50);
+
+ai.on('data', data => {
+  const samples = new Float32Array(data.buffer);
+
+  stream.acceptWaveform(
+      {sampleRate: recognizer.config.featConfig.sampleRate, samples: samples});
+
+  while (recognizer.isReady(stream)) {
+    recognizer.decode(stream);
+  }
+
+  const isEndpoint = recognizer.isEndpoint(stream);
+  const text = recognizer.getResult(stream).text.toLowerCase();
+
+  if (text.length > 0 && lastText != text) {
+    lastText = text;
+    display.print(segmentIndex, lastText);
+  }
+  if (isEndpoint) {
+    if (text.length > 0) {
+      lastText = text;
+      segmentIndex += 1;
+    }
+    recognizer.reset(stream)
+  }
+});
+
+ai.on('close', () => {
+  console.log('Free resources');
+  stream.free();
+  recognizer.free();
+});
+
+ai.start();
+console.log('Started! Please speak')
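The two microphone examples mirror the wave-file pair: the plain CTC one decodes with `decodingMethod: 'greedy_search'`, while the HLG one supplies a `ctcFstDecoderConfig` instead. In both, the `rule*` endpoint settings decide when an utterance is finalized, after which the stream is reset and the segment index advances.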
@@ -24,7 +24,6 @@ const config = {
     'numThreads': 2,
     'provider': 'cpu',
     'debug': 1,
-    'modelType': 'zipformer',
   }
 };
 
@@ -53,5 +52,8 @@ const duration = wave.samples.length / wave.sampleRate;
 const real_time_factor = elapsed_seconds / duration;
 console.log('Wave duration', duration.toFixed(3), 'seconds')
 console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
-console.log('RTF', real_time_factor.toFixed(3))
-console.log('result', result.text)
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
@@ -25,7 +25,6 @@ function createOnlineRecognizer() {
       'numThreads': 2,
       'provider': 'cpu',
       'debug': 1,
-      'modelType': 'zipformer',
     },
     'decodingMethod': 'greedy_search',
     'maxActivePaths': 4,
@@ -68,7 +67,7 @@ ai.on('data', data => {
   }
 
   const isEndpoint = recognizer.isEndpoint(stream);
-  const text = recognizer.getResult(stream).text;
+  const text = recognizer.getResult(stream).text.toLowerCase();
 
   if (text.length > 0 && lastText != text) {
     lastText = text;
@@ -158,7 +158,7 @@ def get_piper_models() -> List[TtsModel]:
         TtsModel(model_dir="vits-piper-fa_IR-gyro-medium"),
         TtsModel(model_dir="vits-piper-fi_FI-harri-low"),
         TtsModel(model_dir="vits-piper-fi_FI-harri-medium"),
-        TtsModel(model_dir="vits-piper-fr_FR-mls-medium"),
+        #  TtsModel(model_dir="vits-piper-fr_FR-mls-medium"),
         TtsModel(model_dir="vits-piper-fr_FR-siwis-low"),
         TtsModel(model_dir="vits-piper-fr_FR-siwis-medium"),
         TtsModel(model_dir="vits-piper-fr_FR-upmc-medium"),
@@ -9,6 +9,7 @@ const possible_paths = [
   '../build/Debug/sherpa-onnx.node',
   `./node_modules/sherpa-onnx-${platform_arch}/sherpa-onnx.node`,
   `../sherpa-onnx-${platform_arch}/sherpa-onnx.node`,
+  './sherpa-onnx.node',
 ];
 
 let found = false;
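Appending `'./sherpa-onnx.node'` lets the CI setup above, which copies the addon straight into the example directory, resolve it without an npm package. The surrounding loader simply tries each candidate in order; a simplified sketch of that logic, under the assumption that it probes paths with try/`require` (the real list also includes the `build/` and `node_modules/` candidates shown in the diff):

```js
// Simplified sketch of the probing loader around possible_paths.
const possible_paths = ['./sherpa-onnx.node'];

let addon = null;
for (const p of possible_paths) {
  try {
    addon = require(p);  // the first candidate that loads wins
    break;
  } catch (e) {
    // keep trying the remaining candidates
  }
}
if (addon === null) {
  throw new Error('Cannot find sherpa-onnx.node in any known location');
}
```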
scripts/node-addon-api/run.sh (new file, mode 100755)

+#!/usr/bin/env bash
+
+set -ex
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-core.dylib && ! -f ../../build/install/lib/libsherpa-onnx-core.so ]]; then
+  pushd ../../
+  mkdir -p build
+  cd build
+  cmake -DCMAKE_INSTALL_PREFIX=./install -DBUILD_SHARED_LIBS=ON ..
+  make install
+  popd
+fi
+export SHERPA_ONNX_INSTALL_DIR=$PWD/../../build/install
+
+./node_modules/.bin/cmake-js compile
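Judging by its relative paths, `run.sh` is meant to be executed from `scripts/node-addon-api`: it builds the shared sherpa-onnx libraries once if they are missing, then points the cmake-js compile at them via `SHERPA_ONNX_INSTALL_DIR`.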
@@ -89,6 +89,30 @@ static SherpaOnnxOnlineTransducerModelConfig GetOnlineTransducerModelConfig(
   return config;
 }
 
+static SherpaOnnxOnlineZipformer2CtcModelConfig
+GetOnlineZipformer2CtcModelConfig(Napi::Object obj) {
+  SherpaOnnxOnlineZipformer2CtcModelConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("zipformer2Ctc") || !obj.Get("zipformer2Ctc").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("zipformer2Ctc").As<Napi::Object>();
+
+  if (o.Has("model") && o.Get("model").IsString()) {
+    Napi::String model = o.Get("model").As<Napi::String>();
+    std::string s = model.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.model = p;
+  }
+
+  return config;
+}
+
 static SherpaOnnxOnlineModelConfig GetOnlineModelConfig(Napi::Object obj) {
   SherpaOnnxOnlineModelConfig config;
   memset(&config, 0, sizeof(config));
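`GetOnlineZipformer2CtcModelConfig` mirrors the existing transducer helper: it heap-copies the `zipformer2Ctc.model` string out of the JavaScript config, because the C struct holds a raw `char *` that must stay valid after the temporary `std::string` from `Utf8Value()` is gone. The object shape it consumes matches the examples above (the path here is illustrative):

```js
// Shape consumed by GetOnlineZipformer2CtcModelConfig.
const modelConfig = {
  zipformer2Ctc: {
    model: './ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
  },
};
```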
@@ -100,6 +124,7 @@ static SherpaOnnxOnlineModelConfig GetOnlineModelConfig(Napi::Object obj) {
   Napi::Object o = obj.Get("modelConfig").As<Napi::Object>();
 
   config.transducer = GetOnlineTransducerModelConfig(o);
+  config.zipformer2_ctc = GetOnlineZipformer2CtcModelConfig(o);
 
   if (o.Has("tokens") && o.Get("tokens").IsString()) {
     Napi::String tokens = o.Get("tokens").As<Napi::String>();
@@ -147,6 +172,35 @@ static SherpaOnnxOnlineModelConfig GetOnlineModelConfig(Napi::Object obj) {
   return config;
 }
 
+static SherpaOnnxOnlineCtcFstDecoderConfig GetCtcFstDecoderConfig(
+    Napi::Object obj) {
+  SherpaOnnxOnlineCtcFstDecoderConfig config;
+  memset(&config, 0, sizeof(config));
+
+  if (!obj.Has("ctcFstDecoderConfig") ||
+      !obj.Get("ctcFstDecoderConfig").IsObject()) {
+    return config;
+  }
+
+  Napi::Object o = obj.Get("ctcFstDecoderConfig").As<Napi::Object>();
+
+  if (o.Has("graph") && o.Get("graph").IsString()) {
+    Napi::String graph = o.Get("graph").As<Napi::String>();
+    std::string s = graph.Utf8Value();
+    char *p = new char[s.size() + 1];
+    std::copy(s.begin(), s.end(), p);
+    p[s.size()] = 0;
+
+    config.graph = p;
+  }
+
+  if (o.Has("maxActive") && o.Get("maxActive").IsNumber()) {
+    config.max_active = o.Get("maxActive").As<Napi::Number>().Int32Value();
+  }
+
+  return config;
+}
+
 static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
     const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
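`GetCtcFstDecoderConfig` accepts an optional `graph` path and an optional `maxActive` limit; when the block is absent the zero-initialized struct is returned, which leaves FST-based decoding disabled. The expected shape (the `maxActive` value here is illustrative; only `graph` appears in the shipped examples):

```js
const ctcFstDecoderConfig = {
  graph: './HLG.fst',  // copied into config.graph
  maxActive: 3000,     // optional; maps to config.max_active
};
```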
@@ -234,6 +288,8 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
         config.Get("hotwordsScore").As<Napi::Number>().FloatValue();
   }
 
+  c.ctc_fst_decoder_config = GetCtcFstDecoderConfig(config);
+
 #if 0
   printf("encoder: %s\n", c.model_config.transducer.encoder
                               ? c.model_config.transducer.encoder
@@ -277,6 +333,10 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
     delete[] c.model_config.transducer.joiner;
   }
 
+  if (c.model_config.zipformer2_ctc.model) {
+    delete[] c.model_config.zipformer2_ctc.model;
+  }
+
   if (c.model_config.tokens) {
     delete[] c.model_config.tokens;
   }
@@ -297,6 +357,10 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
     delete[] c.hotwords_file;
   }
 
+  if (c.ctc_fst_decoder_config.graph) {
+    delete[] c.ctc_fst_decoder_config.graph;
+  }
+
   if (!recognizer) {
     Napi::TypeError::New(env, "Please check your config!")
         .ThrowAsJavaScriptException();
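All of the heap-allocated string copies (`transducer.*`, `zipformer2_ctc.model`, `tokens`, `hotwords_file`, `ctc_fst_decoder_config.graph`) are released right after the recognizer is constructed and before the error check, which implies the underlying C API takes its own copies of the config strings during creation.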
@@ -216,6 +216,8 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
     // clear states
     s->SetStates(model_->GetInitStates());
 
+    s->GetFasterDecoderProcessedFrames() = 0;
+
     // Note: We only update counters. The underlying audio samples
     // are not discarded.
     s->Reset();
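Resetting `GetFasterDecoderProcessedFrames()` alongside the decoder states appears to matter for the FST-based (HLG) decoder: without it, the processed-frame counter would carry over across endpoints, so decoding of the next utterance would start from a stale offset.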