Fangjun Kuang
Committed by GitHub

Add VAD and keyword spotting for the Node package with WebAssembly (#1286)

Showing 40 changed files with 456 additions and 524 deletions
... ... @@ -9,6 +9,28 @@ git status
ls -lh
ls -lh node_modules
echo '-----vad+whisper----------'
curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
rm sherpa-onnx-whisper-tiny.en.tar.bz2
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
node ./test-vad-with-non-streaming-asr-whisper.js
rm Obama.wav
rm silero_vad.onnx
rm -rf sherpa-onnx-whisper-tiny.en
echo "----------keyword spotting----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
node ./test-keyword-spotter-transducer.js
rm -rf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01
# offline asr
#
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
... ...
name: npm
on:
push:
branches:
- npm
workflow_dispatch:
concurrency:
... ... @@ -27,6 +30,9 @@ jobs:
- name: Install emsdk
uses: mymindstorm/setup-emsdk@v14
with:
version: 3.1.51
actions-cache-folder: 'emsdk-cache'
- name: View emsdk version
shell: bash
... ... @@ -51,8 +57,6 @@ jobs:
- name: Build nodejs package
shell: bash
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
run: |
./build-wasm-simd-nodejs.sh
cp -v build-wasm-simd-nodejs/install/bin/wasm/nodejs/*.js ./scripts/nodejs/
... ... @@ -71,6 +75,29 @@ jobs:
rm package.json.bak
- name: Collect files
shell: bash
run: |
dst=sherpa-onnx-wasm-nodejs
mkdir $dst
cp -v scripts/nodejs/* $dst
tar cvjf $dst.tar.bz2 $dst
echo "---"
ls -lh $dst
- uses: actions/upload-artifact@v4
with:
name: sherpa-onnx-wasm-nodejs
path: ./*.tar.bz2
- name: Build nodejs package
shell: bash
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
run: |
cd scripts/nodejs
git diff
npm install
... ...
... ... @@ -55,6 +55,9 @@ jobs:
- name: Install emsdk
uses: mymindstorm/setup-emsdk@v14
with:
version: 3.1.51
actions-cache-folder: 'emsdk-cache'
- name: View emsdk version
shell: bash
... ... @@ -109,6 +112,7 @@ jobs:
node --version
npm --version
export d=scripts/nodejs
cat $d/index.js
pushd $d
npm install
... ...
## 1.10.23
* flutter: add lang, emotion, event to OfflineRecognizerResult (#1268)
* Use a separate thread to initialize models for lazarus examples. (#1270)
* Object pascal examples for recording and playing audio with portaudio. (#1271)
* Text to speech API for Object Pascal. (#1273)
* update kotlin api for better release native object and add user-friendly apis. (#1275)
* Update wave-reader.cc to support 8/16/32-bit waves (#1278)
* Add WebAssembly for VAD (#1281)
* WebAssembly example for VAD + Non-streaming ASR (#1284)
## 1.10.22
* Add Pascal API for reading wave files (#1243)
... ...
... ... @@ -11,7 +11,7 @@ project(sherpa-onnx)
# ./nodejs-addon-examples
# ./dart-api-examples/
# ./CHANGELOG.md
set(SHERPA_ONNX_VERSION "1.10.22")
set(SHERPA_ONNX_VERSION "1.10.23")
# Disable warning about
#
... ... @@ -206,6 +206,7 @@ if(SHERPA_ONNX_ENABLE_WASM_NODEJS)
if(NOT SHERPA_ONNX_ENABLE_WASM)
message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for NodeJS")
endif()
add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1)
endif()
if(SHERPA_ONNX_ENABLE_WASM)
... ...
... ... @@ -9,7 +9,7 @@ environment:
sdk: ^3.4.0
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -9,7 +9,7 @@ environment:
sdk: ^3.4.0
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -9,7 +9,7 @@ environment:
sdk: ^3.4.0
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
# sherpa_onnx:
# path: ../../flutter/sherpa_onnx
path: ^1.9.0
... ...
... ... @@ -10,7 +10,7 @@ environment:
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -9,7 +9,7 @@ environment:
sdk: ^3.4.0
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -11,7 +11,7 @@ environment:
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -8,7 +8,7 @@ environment:
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -10,7 +10,7 @@ environment:
sdk: ^3.4.0
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -9,7 +9,7 @@ environment:
sdk: ^3.4.0
dependencies:
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -5,7 +5,7 @@ description: >
publish_to: 'none'
version: 1.10.22
version: 1.10.23
topics:
- speech-recognition
... ... @@ -30,7 +30,7 @@ dependencies:
record: ^5.1.0
url_launcher: ^6.2.6
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
# sherpa_onnx:
# path: ../../flutter/sherpa_onnx
... ...
... ... @@ -5,7 +5,7 @@ description: >
publish_to: 'none' # Remove this line if you wish to publish to pub.dev
version: 1.10.22
version: 1.10.23
environment:
sdk: '>=3.4.0 <4.0.0'
... ... @@ -17,7 +17,7 @@ dependencies:
cupertino_icons: ^1.0.6
path_provider: ^2.1.3
path: ^1.9.0
sherpa_onnx: ^1.10.22
sherpa_onnx: ^1.10.23
url_launcher: ^6.2.6
audioplayers: ^5.0.0
... ...
... ... @@ -17,7 +17,7 @@ topics:
- voice-activity-detection
# remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
version: 1.10.22
version: 1.10.23
homepage: https://github.com/k2-fsa/sherpa-onnx
... ... @@ -30,23 +30,23 @@ dependencies:
flutter:
sdk: flutter
sherpa_onnx_android: ^1.10.22
sherpa_onnx_android: ^1.10.23
# sherpa_onnx_android:
# path: ../sherpa_onnx_android
sherpa_onnx_macos: ^1.10.22
sherpa_onnx_macos: ^1.10.23
# sherpa_onnx_macos:
# path: ../sherpa_onnx_macos
sherpa_onnx_linux: ^1.10.22
sherpa_onnx_linux: ^1.10.23
# sherpa_onnx_linux:
# path: ../sherpa_onnx_linux
#
sherpa_onnx_windows: ^1.10.22
sherpa_onnx_windows: ^1.10.23
# sherpa_onnx_windows:
# path: ../sherpa_onnx_windows
sherpa_onnx_ios: ^1.10.22
sherpa_onnx_ios: ^1.10.23
# sherpa_onnx_ios:
# path: ../sherpa_onnx_ios
... ...
... ... @@ -7,7 +7,7 @@
# https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
Pod::Spec.new do |s|
s.name = 'sherpa_onnx_ios'
s.version = '1.10.22'
s.version = '1.10.23'
s.summary = 'A new Flutter FFI plugin project.'
s.description = <<-DESC
A new Flutter FFI plugin project.
... ...
... ... @@ -4,7 +4,7 @@
#
Pod::Spec.new do |s|
s.name = 'sherpa_onnx_macos'
s.version = '1.10.22'
s.version = '1.10.23'
s.summary = 'sherpa-onnx Flutter FFI plugin project.'
s.description = <<-DESC
sherpa-onnx Flutter FFI plugin project.
... ...
#!/usr/bin/env bash
find flutter -name '*.yaml' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
find dart-api-examples -name '*.yaml' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
find flutter-examples -name '*.yaml' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
find flutter -name '*.podspec' -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
find nodejs-addon-examples -name package.json -type f -exec sed -i.bak 's/1\.10\.22/1\.10\.23/g' {} \;
... ...
{
"dependencies": {
"sherpa-onnx-node": "^1.10.22"
"sherpa-onnx-node": "^1.10.23"
}
}
... ...
... ... @@ -42,11 +42,11 @@ stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
const detectedKeywords = [];
while (kws.isReady(stream)) {
+  kws.decode(stream);
const keyword = kws.getResult(stream).keyword;
if (keyword != '') {
detectedKeywords.push(keyword);
}
-  kws.decode(stream);
}
let stop = Date.now();
... ...
... ... @@ -120,8 +120,8 @@ console.log('Done')
const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
-console.log('Wave duration', duration.toFixed(3), 'secodns')
-console.log('Elapsed', elapsed_seconds.toFixed(3), 'secodns')
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
console.log(
`RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
real_time_factor.toFixed(3))
... ...
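For example, 3.0 seconds of compute on a 60.0-second wave gives RTF = 3.0/60.0 = 0.05, i.e. decoding runs 20x faster than real time; any RTF below 1 means faster than real time.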
// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx');
function createKeywordSpotter() {
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models
const config = {
'modelConfig': {
'transducer': {
'encoder':
'./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx',
'decoder':
'./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx',
'joiner':
'./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx',
},
'tokens':
'./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt',
},
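    // Each keyword occupies one line: space-separated tokens from tokens.txt,
    // optionally followed by @ and the text to report when it is detected.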
keywords: 'w én s ēn t è k ǎ s uǒ @文森特卡索\n' +
'f ǎ g uó @法国'
};
return sherpa_onnx.createKws(config);
}
const kws = createKeywordSpotter();
const stream = kws.createStream();
const waveFilename =
'./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const tailPadding = new Float32Array(wave.sampleRate * 0.4);
stream.acceptWaveform(kws.config.featConfig.sampleRate, tailPadding);
const detectedKeywords = [];
while (kws.isReady(stream)) {
kws.decode(stream);
const keyword = kws.getResult(stream).keyword;
if (keyword != '') {
detectedKeywords.push(keyword);
}
}
console.log(detectedKeywords);
stream.free();
kws.free();
... ...
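The test above feeds one complete wave; in a live setting the same decode loop runs per audio chunk. A minimal sketch, assuming 16 kHz Float32Array chunks arriving from some capture callback (onAudioChunk is a hypothetical name), using only the APIs exercised above:

// Hypothetical per-chunk handler; not part of this PR.
function onAudioChunk(kws, stream, chunk, sampleRate) {
  stream.acceptWaveform(sampleRate, chunk);
  while (kws.isReady(stream)) {
    kws.decode(stream);
    const keyword = kws.getResult(stream).keyword;
    if (keyword != '') {
      console.log('Detected:', keyword);
    }
  }
}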
... ... @@ -7,27 +7,13 @@ const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
nemoCtc: {
model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx',
},
tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'nemo_ctc',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
modelConfig: {
nemoCtc: {
model: './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx',
},
tokens: './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt',
}
};
return sherpa_onnx.createOfflineRecognizer(config);
... ... @@ -38,63 +24,12 @@ const stream = recognizer.createStream();
const waveFilename =
'./sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel. Given ${channel}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {highWaterMark: 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();
... ...
... ... @@ -7,27 +7,15 @@ const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
paraformer: {
model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx',
},
tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'paraformer',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
ruleFsts: './itn_zh_number.fst',
};
... ... @@ -41,62 +29,12 @@ const stream = recognizer.createStream();
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav
const waveFilename = './itn-zh-number.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel. Given ${channel}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();
... ...
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
paraformer: {
model: './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx',
},
tokens: './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'paraformer',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
};
return sherpa_onnx.createOfflineRecognizer(config);
}
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-paraformer-zh-2023-09-14/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel. Given ${channel}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();
... ...
// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang)
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
senseVoice: {
model:
... ... @@ -20,82 +11,26 @@ function createOfflineRecognizer() {
useInverseTextNormalization: 1,
},
tokens: './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
};
return sherpa_onnx.createOfflineRecognizer(config);
}
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename =
'./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel. Given ${channel}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();
... ...
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
transducer: {
encoder:
... ... @@ -22,19 +13,11 @@ function createOfflineRecognizer() {
'./sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.int8.onnx',
},
tokens: './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'transducer',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
hotwordsFile: '',
hotwordsScore: 1.5,
};
return sherpa_onnx.createOfflineRecognizer(config);
... ... @@ -43,62 +26,12 @@ const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel. Given ${channel}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();
... ...
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
function createOfflineRecognizer() {
let featConfig = {
sampleRate: 16000,
featureDim: 80,
};
let modelConfig = {
whisper: {
encoder: './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
... ... @@ -21,83 +12,25 @@ function createOfflineRecognizer() {
tailPaddings: -1,
},
tokens: './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
numThreads: 1,
debug: 0,
provider: 'cpu',
modelType: 'whisper',
};
let config = {
featConfig: featConfig,
modelConfig: modelConfig,
decodingMethod: 'greedy_search',
};
return sherpa_onnx.createOfflineRecognizer(config);
}
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();
const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
const reader = new wav.Reader();
const readable = new Readable().wrap(reader);
const buf = [];
reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => {
if (sampleRate != recognizer.config.featConfig.sampleRate) {
throw new Error(`Only support sampleRate ${
recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`);
}
if (audioFormat != 1) {
throw new Error(`Only support PCM format. Given ${audioFormat}`);
}
if (channels != 1) {
throw new Error(`Only a single channel. Given ${channel}`);
}
if (bitDepth != 16) {
throw new Error(`Only support 16-bit samples. Given ${bitDepth}`);
}
});
fs.createReadStream(waveFilename, {'highWaterMark': 4096})
.pipe(reader)
.on('finish', function(err) {
// tail padding
const floatSamples =
new Float32Array(recognizer.config.featConfig.sampleRate * 0.5);
buf.push(floatSamples);
const flattened =
Float32Array.from(buf.reduce((a, b) => [...a, ...b], []));
stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened);
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
stream.free();
recognizer.free();
});
readable.on('readable', function() {
let chunk;
while ((chunk = readable.read()) != null) {
const int16Samples = new Int16Array(
chunk.buffer, chunk.byteOffset,
chunk.length / Int16Array.BYTES_PER_ELEMENT);
const floatSamples = new Float32Array(int16Samples.length);
for (let i = 0; i < floatSamples.length; i++) {
floatSamples[i] = int16Samples[i] / 32768.0;
}
recognizer.decode(stream);
const text = recognizer.getResult(stream).text;
console.log(text);
buf.push(floatSamples);
}
});
stream.free();
recognizer.free();
... ...
... ... @@ -16,22 +16,10 @@ function createOnlineRecognizer() {
let onlineModelConfig = {
paraformer: onlineParaformerModelConfig,
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
numThreads: 1,
provider: 'cpu',
debug: 1,
modelType: 'paraformer',
};
let featureConfig = {
sampleRate: 16000,
featureDim: 80,
};
let recognizerConfig = {
featConfig: featureConfig,
modelConfig: onlineModelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
enableEndpoint: 1,
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
... ...
... ... @@ -17,26 +17,10 @@ function createOnlineRecognizer() {
let onlineModelConfig = {
paraformer: onlineParaformerModelConfig,
tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
numThreads: 1,
provider: 'cpu',
debug: 1,
modelType: 'paraformer',
};
let featureConfig = {
sampleRate: 16000,
featureDim: 80,
};
let recognizerConfig = {
featConfig: featureConfig,
modelConfig: onlineModelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
enableEndpoint: 1,
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
... ...
... ... @@ -20,26 +20,10 @@ function createOnlineRecognizer() {
transducer: onlineTransducerModelConfig,
tokens:
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
numThreads: 1,
provider: 'cpu',
debug: 1,
modelType: 'zipformer',
};
let featureConfig = {
sampleRate: 16000,
featureDim: 80,
};
let recognizerConfig = {
featConfig: featureConfig,
modelConfig: onlineModelConfig,
decodingMethod: 'greedy_search',
maxActivePaths: 4,
enableEndpoint: 1,
rule1MinTrailingSilence: 2.4,
rule2MinTrailingSilence: 1.2,
rule3MinUtteranceLength: 20,
};
return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
... ...
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
const sherpa_onnx = require('sherpa-onnx');
function createRecognizer() {
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const config = {
'modelConfig': {
'whisper': {
'encoder': './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
'decoder': './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
'tailPaddings': 2000,
},
'tokens': './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
'debug': 0,
}
};
return sherpa_onnx.createOfflineRecognizer(config);
}
function createVad() {
// please download silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
const config = {
sileroVad: {
model: './silero_vad.onnx',
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
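    // 512 samples at 16 kHz is 32 ms per VAD decision window.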
windowSize: 512,
},
sampleRate: 16000,
debug: true,
numThreads: 1,
bufferSizeInSeconds: 60,
};
return sherpa_onnx.createVad(config);
}
const recognizer = createRecognizer();
const vad = createVad();
// please download ./Obama.wav from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const waveFilename = './Obama.wav';
const wave = sherpa_onnx.readWave(waveFilename);
if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
  throw new Error(`Expected sample rate: ${
      recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}`);
}
console.log('Started')
let start = Date.now();
const windowSize = vad.config.sileroVad.windowSize;
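// Feed the VAD in windowSize-sample chunks; subarray() returns views into
// wave.samples without copying.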
for (let i = 0; i < wave.samples.length; i += windowSize) {
const thisWindow = wave.samples.subarray(i, i + windowSize);
vad.acceptWaveform(thisWindow);
while (!vad.isEmpty()) {
const segment = vad.front();
vad.pop();
let start_time = segment.start / wave.sampleRate;
let end_time = start_time + segment.samples.length / wave.sampleRate;
start_time = start_time.toFixed(2);
end_time = end_time.toFixed(2);
const stream = recognizer.createStream();
stream.acceptWaveform(wave.sampleRate, segment.samples);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
if (r.text.length > 0) {
const text = r.text.toLowerCase().trim();
console.log(`${start_time} -- ${end_time}: ${text}`);
}
stream.free();
}
}
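// flush() pushes out any segment the VAD is still accumulating, so the drain
// loop below also sees trailing speech.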
vad.flush();
while (!vad.isEmpty()) {
const segment = vad.front();
vad.pop();
let start_time = segment.start / wave.sampleRate;
let end_time = start_time + segment.samples.length / wave.sampleRate;
start_time = start_time.toFixed(2);
end_time = end_time.toFixed(2);
const stream = recognizer.createStream();
stream.acceptWaveform(wave.sampleRate, segment.samples);
recognizer.decode(stream);
const r = recognizer.getResult(stream);
if (r.text.length > 0) {
const text = r.text.toLowerCase().trim();
console.log(`${start_time} -- ${end_time}: ${text}`);
}
}
let stop = Date.now();
console.log('Done')
const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds')
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
console.log(
`RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
real_time_factor.toFixed(3))
vad.free();
recognizer.free();
... ...
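The segment handling above is duplicated between the streaming loop and the post-flush drain. A hedged refactoring sketch (drainSegments is a hypothetical helper, built only from the APIs exercised in this file):

// Pop every pending speech segment and run non-streaming recognition on it.
function drainSegments(vad, recognizer, sampleRate) {
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    const startTime = (segment.start / sampleRate).toFixed(2);
    const endTime =
        ((segment.start + segment.samples.length) / sampleRate).toFixed(2);
    const stream = recognizer.createStream();
    stream.acceptWaveform(sampleRate, segment.samples);
    recognizer.decode(stream);
    const text = recognizer.getResult(stream).text.toLowerCase().trim();
    if (text.length > 0) {
      console.log(`${startTime} -- ${endTime}: ${text}`);
    }
    stream.free();
  }
}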
node_modules
jslint.mjs
sherpa-onnx-*.js
sherpa-onnx-*.wasm
... ...
... ... @@ -4,6 +4,9 @@
const wasmModule = require('./sherpa-onnx-wasm-nodejs.js')();
const sherpa_onnx_asr = require('./sherpa-onnx-asr.js');
const sherpa_onnx_tts = require('./sherpa-onnx-tts.js');
const sherpa_onnx_kws = require('./sherpa-onnx-kws.js');
const sherpa_onnx_wave = require('./sherpa-onnx-wave.js');
const sherpa_onnx_vad = require('./sherpa-onnx-vad.js');
function createOnlineRecognizer(config) {
return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config);
... ... @@ -17,10 +20,35 @@ function createOfflineTts(config) {
return sherpa_onnx_tts.createOfflineTts(wasmModule, config);
}
function createKws(config) {
return sherpa_onnx_kws.createKws(wasmModule, config);
}
function createCircularBuffer(capacity) {
return new sherpa_onnx_vad.CircularBuffer(capacity, wasmModule);
}
function createVad(config) {
return sherpa_onnx_vad.createVad(wasmModule, config);
}
function readWave(filename) {
return sherpa_onnx_wave.readWave(filename, wasmModule);
}
function writeWave(filename, data) {
sherpa_onnx_wave.writeWave(filename, data, wasmModule);
}
// Note: online means streaming and offline means non-streaming here.
// Neither requires an internet connection.
module.exports = {
createOnlineRecognizer,
createOfflineRecognizer,
createOfflineTts,
createKws,
readWave,
writeWave,
createCircularBuffer,
createVad,
};
... ...
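createCircularBuffer is newly exported but not exercised by any test in this PR. A hedged sketch of intended use; the method names (push, get, pop, size, free) are assumptions read off the exported C functions (SherpaOnnxCircularBufferPush, ...Get, ...Pop, ...Size, ...Free) and may not match the actual wrapper:

const sherpa_onnx = require('sherpa-onnx');

const buffer = sherpa_onnx.createCircularBuffer(16000 * 60);  // 60 s at 16 kHz
buffer.push(new Float32Array(512));  // append incoming samples
const n = buffer.size();             // number of buffered samples
const chunk = buffer.get(0, n);      // copy out samples [0, n)
buffer.pop(n);                       // discard what was consumed
buffer.free();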
... ... @@ -546,7 +546,7 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
Module.setValue(ptr + 12, buffer + offset, 'i8*');
offset += taskLen;
Module.setValue(ptr + 16, config.tailPaddings || -1, 'i32');
Module.setValue(ptr + 16, config.tailPaddings || 2000, 'i32');
return {
buffer: buffer, ptr: ptr, len: len,
... ...
... ... @@ -69,13 +69,14 @@ function initModelConfig(config, Module) {
const len = transducer.len + paraformer_len + ctc_len + 7 * 4;
const ptr = Module._malloc(len);
Module.HEAPU8.fill(0, ptr, ptr + len);
let offset = 0;
Module._CopyHeap(transducer.ptr, transducer.len, ptr + offset);
const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1;
const providerLen = Module.lengthBytesUTF8(config.provider) + 1;
const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1;
const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
const modelTypeLen = Module.lengthBytesUTF8(config.modelType || '') + 1;
const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1;
const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1;
const bufferLen =
... ... @@ -86,10 +87,10 @@ function initModelConfig(config, Module) {
Module.stringToUTF8(config.tokens, buffer, tokensLen);
offset += tokensLen;
Module.stringToUTF8(config.provider, buffer + offset, providerLen);
Module.stringToUTF8(config.provider || 'cpu', buffer + offset, providerLen);
offset += providerLen;
Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen);
Module.stringToUTF8(config.modelType || '', buffer + offset, modelTypeLen);
offset += modelTypeLen;
Module.stringToUTF8(
... ... @@ -103,7 +104,7 @@ function initModelConfig(config, Module) {
Module.setValue(ptr + offset, buffer, 'i8*'); // tokens
offset += 4;
Module.setValue(ptr + offset, config.numThreads, 'i32');
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
offset += 4;
Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider
... ... @@ -134,14 +135,21 @@ function initModelConfig(config, Module) {
function initFeatureExtractorConfig(config, Module) {
let ptr = Module._malloc(4 * 2);
Module.setValue(ptr, config.samplingRate, 'i32');
Module.setValue(ptr + 4, config.featureDim, 'i32');
Module.setValue(ptr, config.samplingRate || 16000, 'i32');
Module.setValue(ptr + 4, config.featureDim || 80, 'i32');
return {
ptr: ptr, len: 8,
}
}
function initKwsConfig(config, Module) {
if (!('featConfig' in config)) {
config.featConfig = {
sampleRate: 16000,
featureDim: 80,
};
}
let featConfig = initFeatureExtractorConfig(config.featConfig, Module);
let modelConfig = initModelConfig(config.modelConfig, Module);
... ... @@ -155,16 +163,16 @@ function initKwsConfig(config, Module) {
Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset)
offset += modelConfig.len;
Module.setValue(ptr + offset, config.maxActivePaths, 'i32');
Module.setValue(ptr + offset, config.maxActivePaths || 4, 'i32');
offset += 4;
Module.setValue(ptr + offset, config.numTrailingBlanks, 'i32');
Module.setValue(ptr + offset, config.numTrailingBlanks || 1, 'i32');
offset += 4;
Module.setValue(ptr + offset, config.keywordsScore, 'float');
Module.setValue(ptr + offset, config.keywordsScore || 1.0, 'float');
offset += 4;
Module.setValue(ptr + offset, config.keywordsThreshold, 'float');
Module.setValue(ptr + offset, config.keywordsThreshold || 0.25, 'float');
offset += 4;
let keywordsLen = Module.lengthBytesUTF8(config.keywords) + 1;
... ...
... ... @@ -49,6 +49,32 @@ set(exported_functions
SherpaOnnxDestroyKeywordSpotter
SherpaOnnxGetKeywordResult
SherpaOnnxIsKeywordStreamReady
# VAD
SherpaOnnxCreateCircularBuffer
SherpaOnnxDestroyCircularBuffer
SherpaOnnxCircularBufferPush
SherpaOnnxCircularBufferGet
SherpaOnnxCircularBufferFree
SherpaOnnxCircularBufferPop
SherpaOnnxCircularBufferSize
SherpaOnnxCircularBufferHead
SherpaOnnxCircularBufferReset
SherpaOnnxCreateVoiceActivityDetector
SherpaOnnxDestroyVoiceActivityDetector
SherpaOnnxVoiceActivityDetectorAcceptWaveform
SherpaOnnxVoiceActivityDetectorEmpty
SherpaOnnxVoiceActivityDetectorDetected
SherpaOnnxVoiceActivityDetectorPop
SherpaOnnxVoiceActivityDetectorClear
SherpaOnnxVoiceActivityDetectorFront
SherpaOnnxDestroySpeechSegment
SherpaOnnxVoiceActivityDetectorReset
SherpaOnnxVoiceActivityDetectorFlush
#
SherpaOnnxFileExists
SherpaOnnxReadWave
SherpaOnnxFreeWave
SherpaOnnxWriteWave
)
... ... @@ -82,6 +108,8 @@ install(
${CMAKE_SOURCE_DIR}/wasm/asr/sherpa-onnx-asr.js
${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js
${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
"$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
DESTINATION
... ...
// return an object
// {
// samples: a float32 array
// sampleRate: an integer
// }
function readWave(filename, Module) {
const filenameLen = Module.lengthBytesUTF8(filename) + 1;
const pFilename = Module._malloc(filenameLen);
Module.stringToUTF8(filename, pFilename, filenameLen);
const w = Module._SherpaOnnxReadWave(pFilename);
Module._free(pFilename);
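  // w points at a C struct laid out as
  //   { const float* samples; int32_t sample_rate; int32_t num_samples; }.
  // HEAP32 indexes 4-byte words, so w / 4 is the struct base; the samples
  // pointer is divided by 4 once more to index HEAPF32.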
const samplesPtr = Module.HEAP32[w / 4] / 4;
const sampleRate = Module.HEAP32[w / 4 + 1];
const numSamples = Module.HEAP32[w / 4 + 2];
const samples = new Float32Array(numSamples);
for (let i = 0; i < numSamples; i++) {
samples[i] = Module.HEAPF32[samplesPtr + i];
}
Module._SherpaOnnxFreeWave(w);
return {samples: samples, sampleRate: sampleRate};
}
// data is an object
// {
// samples: a float32 array
// sampleRate: an integer
// }
function writeWave(filename, data, Module) {
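  // Copy the float samples into the WASM heap. HEAPF32 indexes 4-byte floats,
  // hence the division by BYTES_PER_ELEMENT.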
const pSamples =
Module._malloc(data.samples.length * data.samples.BYTES_PER_ELEMENT);
Module.HEAPF32.set(data.samples, pSamples / data.samples.BYTES_PER_ELEMENT);
const filenameLen = Module.lengthBytesUTF8(filename) + 1;
const pFilename = Module._malloc(filenameLen);
Module.stringToUTF8(filename, pFilename, filenameLen);
Module._SherpaOnnxWriteWave(
pSamples, data.samples.length, data.sampleRate, pFilename);
Module._free(pFilename);
Module._free(pSamples);
}
if (typeof process == 'object' && typeof process.versions == 'object' &&
typeof process.versions.node == 'string') {
module.exports = {
readWave,
writeWave,
};
}
... ...
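A short round-trip sketch of the two helpers through the package entry point; input.wav and quiet.wav are placeholder filenames:

const sherpa_onnx = require('sherpa-onnx');

const wave = sherpa_onnx.readWave('./input.wav');
console.log('rate:', wave.sampleRate, 'samples:', wave.samples.length);

// Halve the volume and save a copy; Float32Array.map keeps the element type.
const quieter = wave.samples.map(s => s * 0.5);
sherpa_onnx.writeWave(
    './quiet.wav', {samples: quieter, sampleRate: wave.sampleRate});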