Fangjun Kuang
Committed by GitHub

Add JavaScript API for Moonshine models (#1480)

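In short, the commit threads a four-file Moonshine model config (preprocessor, encoder, uncachedDecoder, cachedDecoder) through the node-addon and WASM JavaScript bindings. As a quick orientation before the diff, here is a minimal sketch of the resulting node-addon usage, assuming the sherpa-onnx-moonshine-tiny-en-int8 release tarball used throughout this commit is unpacked next to the script (the full test files appear later in this diff):

```js
const sherpa_onnx = require('sherpa-onnx-node');

// Model paths follow the layout of the sherpa-onnx-moonshine-tiny-en-int8
// release tarball downloaded in the CI scripts below.
const recognizer = new sherpa_onnx.OfflineRecognizer({
  featConfig: {sampleRate: 16000, featureDim: 80},
  modelConfig: {
    moonshine: {
      preprocessor: './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
      encoder: './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
      uncachedDecoder:
          './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
      cachedDecoder:
          './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
    },
    tokens: './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
    numThreads: 2,
  },
});

const wave =
    sherpa_onnx.readWave('./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav');
const stream = recognizer.createStream();
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
recognizer.decode(stream);
console.log(recognizer.getResult(stream).text);
```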
@@ -10,6 +10,19 @@ arch=$(node -p "require('os').arch()")
 platform=$(node -p "require('os').platform()")
 node_version=$(node -p "process.versions.node.split('.')[0]")
 
+echo "----------non-streaming asr moonshine + vad----------"
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test_vad_with_non_streaming_asr_moonshine.js
+rm -rf sherpa-onnx-*
+rm *.wav
+rm *.onnx
+
 echo "----------non-streaming speaker diarization----------"
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
@@ -24,7 +37,7 @@ node ./test_offline_speaker_diarization.js
 
 rm -rfv *.onnx *.wav sherpa-onnx-pyannote-*
 
-echo "----------non-streaming asr + vad----------"
+echo "----------non-streaming asr whisper + vad----------"
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
 tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
 rm sherpa-onnx-whisper-tiny.en.tar.bz2
@@ -218,6 +231,11 @@ rm sherpa-onnx-whisper-tiny.en.tar.bz2
 node ./test_asr_non_streaming_whisper.js
 rm -rf sherpa-onnx-whisper-tiny.en
 
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
 
+node ./test_asr_non_streaming_moonshine.js
+rm -rf sherpa-onnx-*
 
 ls -lh
@@ -21,6 +21,19 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segm
 node ./test-offline-speaker-diarization.js
 rm -rfv *.wav *.onnx sherpa-onnx-pyannote-*
 
+echo '-----vad+moonshine----------'
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+node ./test-vad-with-non-streaming-asr-moonshine.js
+rm Obama.wav
+rm silero_vad.onnx
+rm -rf sherpa-onnx-moonshine-*
+
 echo '-----vad+whisper----------'
 
 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
@@ -90,6 +103,13 @@ rm sherpa-onnx-whisper-tiny.en.tar.bz2
 node ./test-offline-whisper.js
 rm -rf sherpa-onnx-whisper-tiny.en
 
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+node ./test-offline-moonshine.js
+rm -rf sherpa-onnx-moonshine-*
+
 # online asr
 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
 tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
@@ -112,6 +112,8 @@ The following tables list the examples in this folder.
 |[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model|
 |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
 |[./test_vad_with_non_streaming_asr_whisper.js](./test_vad_with_non_streaming_asr_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper) + [Silero VAD](https://github.com/snakers4/silero-vad)|
+|[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)|
+|[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)|
 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
 |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
@@ -122,6 +124,7 @@ The following tables list the examples in this folder.
 |---|---|
 |[./test_vad_asr_non_streaming_transducer_microphone.js](./test_vad_asr_non_streaming_transducer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer transducer model|
 |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
+|[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)|
 |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
 |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
@@ -260,6 +263,33 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_whisper_microphone.js
 ```
 
+### Non-streaming speech recognition with Moonshine
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+node ./test_asr_non_streaming_moonshine.js
+
+# To run VAD + non-streaming ASR with Moonshine using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_moonshine_microphone.js
+```
+
+### Non-streaming speech recognition with Moonshine + VAD
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test_vad_with_non_streaming_asr_moonshine.js
+```
+
 ### Non-streaming speech recognition with Whisper + VAD
 
 ```bash
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'moonshine': {
+      'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+      'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+      'uncachedDecoder':
+          './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+      'cachedDecoder':
+          './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename = './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = Date.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+const result = recognizer.getResult(stream);
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'moonshine': {
+        'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+        'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+        'uncachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+        'cachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+const bufferSizeInSeconds = 30;
+const buffer =
+    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close the stream if an audio error is detected; if
+                         // set to false, just log the error
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate: vad.config.sampleRate
+  }
+});
+
+let printed = false;
+let index = 0;
+ai.on('data', data => {
+  const windowSize = vad.config.sileroVad.windowSize;
+  buffer.push(new Float32Array(data.buffer));
+  while (buffer.size() > windowSize) {
+    const samples = buffer.get(buffer.head(), windowSize);
+    buffer.pop(windowSize);
+    vad.acceptWaveform(samples);
+  }
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+    const stream = recognizer.createStream();
+    stream.acceptWaveform({
+      samples: segment.samples,
+      sampleRate: recognizer.config.featConfig.sampleRate
+    });
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${index}: ${text}`);
+
+      const filename = `${index}-${text}-${
+          new Date()
+              .toLocaleTimeString('en-US', {hour12: false})
+              .split(' ')[0]}.wav`;
+      sherpa_onnx.writeWave(
+          filename,
+          {samples: segment.samples, sampleRate: vad.config.sampleRate});
+
+      index += 1;
+    }
+  }
+});
+
+ai.start();
+console.log('Started! Please speak')
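The microphone example above drains audio in fixed chunks because the Silero VAD is fed exactly `windowSize` samples per `acceptWaveform` call. A minimal sketch of the same draining pattern in plain JavaScript, with no audio or VAD dependencies (`pushSamples` and `processWindow` are hypothetical stand-ins for the CircularBuffer + `vad.acceptWaveform` pair):

```js
const windowSize = 512;  // Silero VAD window, as in the config above
let queue = new Float32Array(0);

function pushSamples(chunk) {
  // Append the new chunk, then hand off complete windows only.
  const merged = new Float32Array(queue.length + chunk.length);
  merged.set(queue);
  merged.set(chunk, queue.length);
  queue = merged;

  let offset = 0;
  while (queue.length - offset >= windowSize) {
    processWindow(queue.subarray(offset, offset + windowSize));
    offset += windowSize;
  }
  queue = queue.slice(offset);  // keep the incomplete remainder
}

function processWindow(samples) {
  console.log('window of', samples.length, 'samples');
}

pushSamples(new Float32Array(700));  // -> one window; 188 samples kept
pushSamples(new Float32Array(400));  // -> one more window; 76 samples kept
```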
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'moonshine': {
+        'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+        'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+        'uncachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+        'cachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      maxSpeechDuration: 5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+// please download ./Obama.wav from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const waveFilename = './Obama.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+
+if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
+  throw new Error(
+      `Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}`);
+}
+
+console.log('Started')
+let start = Date.now();
+
+const windowSize = vad.config.sileroVad.windowSize;
+for (let i = 0; i < wave.samples.length; i += windowSize) {
+  const thisWindow = wave.samples.subarray(i, i + windowSize);
+  vad.acceptWaveform(thisWindow);
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+
+    let start_time = segment.start / wave.sampleRate;
+    let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+    start_time = start_time.toFixed(2);
+    end_time = end_time.toFixed(2);
+
+    const stream = recognizer.createStream();
+    stream.acceptWaveform(
+        {samples: segment.samples, sampleRate: wave.sampleRate});
+
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${start_time} -- ${end_time}: ${text}`);
+    }
+  }
+}
+
+vad.flush();
+
+while (!vad.isEmpty()) {
+  const segment = vad.front();
+  vad.pop();
+
+  let start_time = segment.start / wave.sampleRate;
+  let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+  start_time = start_time.toFixed(2);
+  end_time = end_time.toFixed(2);
+
+  const stream = recognizer.createStream();
+  stream.acceptWaveform(
+      {samples: segment.samples, sampleRate: wave.sampleRate});
+
+  recognizer.decode(stream);
+  const r = recognizer.getResult(stream);
+  if (r.text.length > 0) {
+    const text = r.text.toLowerCase().trim();
+    console.log(`${start_time} -- ${end_time}: ${text}`);
+  }
+}
+
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
@@ -133,7 +133,25 @@ tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
 node ./test-offline-transducer.js
 ```
 
+## ./test-vad-with-non-streaming-asr-whisper.js
+
+[./test-vad-with-non-streaming-asr-whisper.js](./test-vad-with-non-streaming-asr-whisper.js)
+shows how to use VAD + Whisper to decode a very long file.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test-vad-with-non-streaming-asr-whisper.js
+```
+
 ## ./test-offline-whisper.js
+
 [./test-offline-whisper.js](./test-offline-whisper.js) demonstrates
 how to decode a file with a Whisper model. In the code we use
 [sherpa-onnx-whisper-tiny.en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html).
@@ -146,7 +164,40 @@ tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
 node ./test-offline-whisper.js
 ```
 
+## ./test-offline-moonshine.js
+
+[./test-offline-moonshine.js](./test-offline-moonshine.js) demonstrates
+how to decode a file with a Moonshine model. In the code we use
+[sherpa-onnx-moonshine-tiny-en-int8](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2).
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+node ./test-offline-moonshine.js
+```
+
+## ./test-vad-with-non-streaming-asr-moonshine.js
+
+[./test-vad-with-non-streaming-asr-moonshine.js](./test-vad-with-non-streaming-asr-moonshine.js)
+shows how to use VAD + Moonshine to decode a very long file.
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test-vad-with-non-streaming-asr-moonshine.js
+```
+
 ## ./test-online-paraformer-microphone.js
+
 [./test-online-paraformer-microphone.js](./test-online-paraformer-microphone.js)
 demonstrates how to do real-time speech recognition from a microphone
 with a streaming Paraformer model. In the code we use
+// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOfflineRecognizer() {
+  let modelConfig = {
+    moonshine: {
+      preprocessor: './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+      encoder: './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+      uncachedDecoder:
+          './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+      cachedDecoder:
+          './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+    },
+    tokens: './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+  };
+
+  let config = {
+    modelConfig: modelConfig,
+  };
+
+  return sherpa_onnx.createOfflineRecognizer(config);
+}
+
+const recognizer = createOfflineRecognizer();
+const stream = recognizer.createStream();
+
+const waveFilename = './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform(wave.sampleRate, wave.samples);
+
+recognizer.decode(stream);
+const text = recognizer.getResult(stream).text;
+console.log(text);
+
+stream.free();
+recognizer.free();
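Note the API difference between the two bindings exercised in this commit: the node-addon examples earlier (`require('sherpa-onnx-node')`) pass an object to `acceptWaveform` and rely on GC, while this WASM example (`require('sherpa-onnx')`) passes positional arguments and frees resources explicitly. Side by side, with both call shapes excerpted from the files in this diff:

```js
// node-addon binding (sherpa-onnx-node): object argument, no explicit free
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

// WASM binding (sherpa-onnx): positional arguments, explicit cleanup
stream.acceptWaveform(wave.sampleRate, wave.samples);
stream.free();
recognizer.free();
```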
+// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
+
+const sherpa_onnx = require('sherpa-onnx');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'modelConfig': {
+      'moonshine': {
+        'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+        'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+        'uncachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+        'cachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+      'debug': 0,
+    }
+  };
+
+  return sherpa_onnx.createOfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      maxSpeechDuration: 5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+    bufferSizeInSeconds: 60,
+  };
+
+  return sherpa_onnx.createVad(config);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+// please download ./Obama.wav from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const waveFilename = './Obama.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+
+if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
+  throw new Error(
+      `Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}`);
+}
+
+console.log('Started')
+let start = Date.now();
+
+const windowSize = vad.config.sileroVad.windowSize;
+for (let i = 0; i < wave.samples.length; i += windowSize) {
+  const thisWindow = wave.samples.subarray(i, i + windowSize);
+  vad.acceptWaveform(thisWindow);
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+
+    let start_time = segment.start / wave.sampleRate;
+    let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+    start_time = start_time.toFixed(2);
+    end_time = end_time.toFixed(2);
+
+    const stream = recognizer.createStream();
+    stream.acceptWaveform(wave.sampleRate, segment.samples);
+
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${start_time} -- ${end_time}: ${text}`);
+    }
+
+    stream.free();
+  }
+}
+
+vad.flush();
+
+while (!vad.isEmpty()) {
+  const segment = vad.front();
+  vad.pop();
+
+  let start_time = segment.start / wave.sampleRate;
+  let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+  start_time = start_time.toFixed(2);
+  end_time = end_time.toFixed(2);
+
+  const stream = recognizer.createStream();
+  stream.acceptWaveform(wave.sampleRate, segment.samples);
+
+  recognizer.decode(stream);
+  const r = recognizer.getResult(stream);
+  if (r.text.length > 0) {
+    const text = r.text.toLowerCase().trim();
+    console.log(`${start_time} -- ${end_time}: ${text}`);
+  }
+
+  // Free the per-segment stream here as well, mirroring the main loop.
+  stream.free();
+}
+
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+
+vad.free();
+recognizer.free();
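One more binding difference worth flagging: in the node-addon files earlier in this diff the VAD buffer size is a separate constructor argument, while the WASM binding takes it as a config field. Both shapes as they appear in this commit (`vadConfig` stands in for the sileroVad config objects shown above):

```js
// node-addon binding (sherpa-onnx-node): buffer size is a second argument
const vadA = new sherpa_onnx.Vad(vadConfig, 60 /* bufferSizeInSeconds */);

// WASM binding (sherpa-onnx): buffer size is a field of the config itself
const vadB = sherpa_onnx.createVad({...vadConfig, bufferSizeInSeconds: 60});
```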
@@ -41,4 +41,11 @@
   }                                 \
   } while (0)
 
+#define SHERPA_ONNX_DELETE_C_STR(p) \
+  do {                              \
+    if (p) {                        \
+      delete[] p;                   \
+    }                               \
+  } while (0)
+
 #endif  // SCRIPTS_NODE_ADDON_API_SRC_MACROS_H_
@@ -80,6 +80,25 @@ static SherpaOnnxOfflineWhisperModelConfig GetOfflineWhisperModelConfig(
   return c;
 }
 
+static SherpaOnnxOfflineMoonshineModelConfig GetOfflineMoonshineModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineMoonshineModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("moonshine") || !obj.Get("moonshine").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("moonshine").As<Napi::Object>();
+
+  SHERPA_ONNX_ASSIGN_ATTR_STR(preprocessor, preprocessor);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(encoder, encoder);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(uncached_decoder, uncachedDecoder);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(cached_decoder, cachedDecoder);
+
+  return c;
+}
+
 static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
     Napi::Object obj) {
   SherpaOnnxOfflineTdnnModelConfig c;
@@ -130,6 +149,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
   c.whisper = GetOfflineWhisperModelConfig(o);
   c.tdnn = GetOfflineTdnnModelConfig(o);
   c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
+  c.moonshine = GetOfflineMoonshineModelConfig(o);
 
   SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
   SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -206,97 +226,42 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
   const SherpaOnnxOfflineRecognizer *recognizer =
       SherpaOnnxCreateOfflineRecognizer(&c);
 
-  if (c.model_config.transducer.encoder) {
-    delete[] c.model_config.transducer.encoder;
-  }
-
-  if (c.model_config.transducer.decoder) {
-    delete[] c.model_config.transducer.decoder;
-  }
-
-  if (c.model_config.transducer.joiner) {
-    delete[] c.model_config.transducer.joiner;
-  }
-
-  if (c.model_config.paraformer.model) {
-    delete[] c.model_config.paraformer.model;
-  }
-
-  if (c.model_config.nemo_ctc.model) {
-    delete[] c.model_config.nemo_ctc.model;
-  }
-
-  if (c.model_config.whisper.encoder) {
-    delete[] c.model_config.whisper.encoder;
-  }
-
-  if (c.model_config.whisper.decoder) {
-    delete[] c.model_config.whisper.decoder;
-  }
-
-  if (c.model_config.whisper.language) {
-    delete[] c.model_config.whisper.language;
-  }
-
-  if (c.model_config.whisper.task) {
-    delete[] c.model_config.whisper.task;
-  }
-
-  if (c.model_config.tdnn.model) {
-    delete[] c.model_config.tdnn.model;
-  }
-
-  if (c.model_config.sense_voice.model) {
-    delete[] c.model_config.sense_voice.model;
-  }
-
-  if (c.model_config.sense_voice.language) {
-    delete[] c.model_config.sense_voice.language;
-  }
-
-  if (c.model_config.tokens) {
-    delete[] c.model_config.tokens;
-  }
-
-  if (c.model_config.provider) {
-    delete[] c.model_config.provider;
-  }
-
-  if (c.model_config.model_type) {
-    delete[] c.model_config.model_type;
-  }
-
-  if (c.model_config.modeling_unit) {
-    delete[] c.model_config.modeling_unit;
-  }
-
-  if (c.model_config.bpe_vocab) {
-    delete[] c.model_config.bpe_vocab;
-  }
-
-  if (c.model_config.telespeech_ctc) {
-    delete[] c.model_config.telespeech_ctc;
-  }
-
-  if (c.lm_config.model) {
-    delete[] c.lm_config.model;
-  }
-
-  if (c.decoding_method) {
-    delete[] c.decoding_method;
-  }
-
-  if (c.hotwords_file) {
-    delete[] c.hotwords_file;
-  }
-
-  if (c.rule_fsts) {
-    delete[] c.rule_fsts;
-  }
-
-  if (c.rule_fars) {
-    delete[] c.rule_fars;
-  }
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.encoder);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.decoder);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.joiner);
+
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.paraformer.model);
+
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.nemo_ctc.model);
+
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.encoder);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.decoder);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.language);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.task);
+
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.tdnn.model);
+
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.model);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.language);
+
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.preprocessor);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.encoder);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.uncached_decoder);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.cached_decoder);
+
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.provider);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.model_type);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.modeling_unit);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.bpe_vocab);
+  SHERPA_ONNX_DELETE_C_STR(c.model_config.telespeech_ctc);
+
+  SHERPA_ONNX_DELETE_C_STR(c.lm_config.model);
+
+  SHERPA_ONNX_DELETE_C_STR(c.decoding_method);
+  SHERPA_ONNX_DELETE_C_STR(c.hotwords_file);
+  SHERPA_ONNX_DELETE_C_STR(c.rule_fsts);
+  SHERPA_ONNX_DELETE_C_STR(c.rule_fars);
 
   if (!recognizer) {
     Napi::TypeError::New(env, "Please check your config!")
@@ -35,6 +35,10 @@ function freeConfig(config, Module) {
     freeConfig(config.whisper, Module)
   }
 
+  if ('moonshine' in config) {
+    freeConfig(config.moonshine, Module)
+  }
+
   if ('tdnn' in config) {
     freeConfig(config.tdnn, Module)
   }
@@ -563,7 +567,7 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
   const n = encoderLen + decoderLen + languageLen + taskLen;
   const buffer = Module._malloc(n);
 
-  const len = 5 * 4;  // 4 pointers
+  const len = 5 * 4;  // 4 pointers + 1 int32
   const ptr = Module._malloc(len);
 
   let offset = 0;
@@ -598,6 +602,55 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
   }
 }
 
+function initSherpaOnnxOfflineMoonshineModelConfig(config, Module) {
+  const preprocessorLen = Module.lengthBytesUTF8(config.preprocessor || '') + 1;
+  const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
+  const uncachedDecoderLen =
+      Module.lengthBytesUTF8(config.uncachedDecoder || '') + 1;
+  const cachedDecoderLen =
+      Module.lengthBytesUTF8(config.cachedDecoder || '') + 1;
+
+  const n =
+      preprocessorLen + encoderLen + uncachedDecoderLen + cachedDecoderLen;
+  const buffer = Module._malloc(n);
+
+  const len = 4 * 4;  // 4 pointers
+  const ptr = Module._malloc(len);
+
+  let offset = 0;
+  Module.stringToUTF8(
+      config.preprocessor || '', buffer + offset, preprocessorLen);
+  offset += preprocessorLen;
+
+  Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen);
+  offset += encoderLen;
+
+  Module.stringToUTF8(
+      config.uncachedDecoder || '', buffer + offset, uncachedDecoderLen);
+  offset += uncachedDecoderLen;
+
+  Module.stringToUTF8(
+      config.cachedDecoder || '', buffer + offset, cachedDecoderLen);
+  offset += cachedDecoderLen;
+
+  offset = 0;
+  Module.setValue(ptr, buffer + offset, 'i8*');
+  offset += preprocessorLen;
+
+  Module.setValue(ptr + 4, buffer + offset, 'i8*');
+  offset += encoderLen;
+
+  Module.setValue(ptr + 8, buffer + offset, 'i8*');
+  offset += uncachedDecoderLen;
+
+  Module.setValue(ptr + 12, buffer + offset, 'i8*');
+  offset += cachedDecoderLen;
+
+  return {
+    buffer: buffer, ptr: ptr, len: len,
+  }
+}
+
 function initSherpaOnnxOfflineTdnnModelConfig(config, Module) {
   const n = Module.lengthBytesUTF8(config.model || '') + 1;
   const buffer = Module._malloc(n);
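The function added above packs four NUL-terminated strings into one heap buffer and stores four 32-bit pointers at `ptr`. A sketch of reading that layout back, assuming Emscripten's standard `getValue` and `UTF8ToString` runtime helpers are exported on `Module` (whether they are depends on the build flags):

```js
const cfg = initSherpaOnnxOfflineMoonshineModelConfig({
  preprocessor: 'a.onnx',
  encoder: 'b.onnx',
  uncachedDecoder: 'c.onnx',
  cachedDecoder: 'd.onnx',
}, Module);

// Four consecutive 32-bit pointers, each into the shared string buffer.
for (let i = 0; i < 4; ++i) {
  const p = Module.getValue(cfg.ptr + i * 4, 'i8*');
  console.log(Module.UTF8ToString(p));  // a.onnx, b.onnx, c.onnx, d.onnx
}

Module._free(cfg.ptr);     // pointer table
Module._free(cfg.buffer);  // string storage
```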
@@ -693,6 +746,15 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
     };
   }
 
+  if (!('moonshine' in config)) {
+    config.moonshine = {
+      preprocessor: '',
+      encoder: '',
+      uncachedDecoder: '',
+      cachedDecoder: '',
+    };
+  }
+
   if (!('tdnn' in config)) {
     config.tdnn = {
       model: '',
@@ -724,8 +786,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   const senseVoice =
       initSherpaOnnxOfflineSenseVoiceModelConfig(config.senseVoice, Module);
 
+  const moonshine =
+      initSherpaOnnxOfflineMoonshineModelConfig(config.moonshine, Module);
+
   const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
-      tdnn.len + 8 * 4 + senseVoice.len;
+      tdnn.len + 8 * 4 + senseVoice.len + moonshine.len;
 
   const ptr = Module._malloc(len);
@@ -745,7 +810,6 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   Module._CopyHeap(tdnn.ptr, tdnn.len, ptr + offset);
   offset += tdnn.len;
 
-
   const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
   const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1;
   const modelTypeLen = Module.lengthBytesUTF8(config.modelType || '') + 1;
@@ -817,11 +881,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   offset += 4;
 
   Module._CopyHeap(senseVoice.ptr, senseVoice.len, ptr + offset);
+  offset += senseVoice.len;
+
+  Module._CopyHeap(moonshine.ptr, moonshine.len, ptr + offset);
 
   return {
     buffer: buffer, ptr: ptr, len: len, transducer: transducer,
     paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
-    senseVoice: senseVoice,
+    senseVoice: senseVoice, moonshine: moonshine,
   }
 }
 
@@ -15,6 +15,7 @@ static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");
 
 static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
+static_assert(sizeof(SherpaOnnxOfflineMoonshineModelConfig) == 4 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineLMConfig) == 2 * 4, "");
@@ -25,7 +26,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
                   sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) +
                   sizeof(SherpaOnnxOfflineWhisperModelConfig) +
                   sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 +
-                  sizeof(SherpaOnnxOfflineSenseVoiceModelConfig),
+                  sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
+                  sizeof(SherpaOnnxOfflineMoonshineModelConfig),
               "");
 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) ==
@@ -66,6 +68,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   auto whisper = &model_config->whisper;
   auto tdnn = &model_config->tdnn;
   auto sense_voice = &model_config->sense_voice;
+  auto moonshine = &model_config->moonshine;
 
   fprintf(stdout, "----------offline transducer model config----------\n");
   fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -93,6 +96,12 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   fprintf(stdout, "language: %s\n", sense_voice->language);
   fprintf(stdout, "use_itn: %d\n", sense_voice->use_itn);
 
+  fprintf(stdout, "----------offline moonshine model config----------\n");
+  fprintf(stdout, "preprocessor: %s\n", moonshine->preprocessor);
+  fprintf(stdout, "encoder: %s\n", moonshine->encoder);
+  fprintf(stdout, "uncached_decoder: %s\n", moonshine->uncached_decoder);
+  fprintf(stdout, "cached_decoder: %s\n", moonshine->cached_decoder);
+
 fprintf(stdout, "tokens: %s\n", model_config->tokens);
 fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
 fprintf(stdout, "provider: %s\n", model_config->provider);