Fangjun Kuang
Committed by GitHub

Add JavaScript (WebAssembly) API for Dolphin CTC models (#2093)
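At a glance, the new API is exercised by the `test-offline-dolphin-ctc.js` script added in this PR. Below is a condensed sketch of that script (the full file appears in the diff further down; paths assume you run from inside the unpacked `sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02` directory):

```js
const sherpa_onnx = require('sherpa-onnx');

// A Dolphin CTC model is selected via the new `dolphin` sub-config.
const recognizer = sherpa_onnx.createOfflineRecognizer({
  modelConfig: {
    dolphin: {model: './model.int8.onnx'},
    tokens: './tokens.txt',
  },
});

const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave('./test_wavs/0.wav');
stream.acceptWaveform(wave.sampleRate, wave.samples);

recognizer.decode(stream);
console.log(recognizer.getResult(stream).text);

stream.free();
recognizer.free();
```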

@@ -9,6 +9,13 @@ git status
 ls -lh
 ls -lh node_modules

+# asr with offline dolphin ctc
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+node ./test-offline-dolphin-ctc.js
+rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
+
 # speech enhancement
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav
@@ -56,7 +63,7 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/m
 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
 rm matcha-icefall-en_US-ljspeech.tar.bz2

-wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx

 node ./test-offline-tts-matcha-en.js

@@ -21,8 +21,8 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        total: ["8"]
-        index: ["0", "1", "2", "3", "4", "5", "6", "7"]
+        total: ["11"]
+        index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]

     steps:
       - uses: actions/checkout@v4
@@ -119,6 +119,7 @@ We also have spaces built using WebAssembly. They are listed below:
 |VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]|
 |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]|
 |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]|
+|VAD + speech recognition (多语种及多种中文方言) with [Dolphin][Dolphin]-base |[Click me][wasm-hf-vad-asr-multi-lang-dolphin-base]| [地址][wasm-ms-vad-asr-multi-lang-dolphin-base]|
 |Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]|
 |Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]|
 |Speaker diarization |[Click me][wasm-hf-speaker-diarization]|[地址][wasm-ms-speaker-diarization]|
@@ -390,6 +391,10 @@ It uses TTS from sherpa-onnx. See also [✨ Speak command that uses the new glob
 [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
 [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
 [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
+[Dolphin]: https://github.com/DataoceanAI/Dolphin
+[wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
+[wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
+
 [wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en
 [wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en
 [wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de
@@ -140,6 +140,20 @@ node ./test-offline-tts-vits-zh.js
 In the following, we demonstrate how to decode files and how to perform
 speech recognition with a microphone with `nodejs`.

+## ./test-offline-dolphin-ctc.js
+
+[./test-offline-dolphin-ctc.js](./test-offline-dolphin-ctc.js) demonstrates
+how to decode a file with a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+node ./test-offline-dolphin-ctc.js
+```
+
 ## ./test-offline-nemo-ctc.js

 [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
@@ -0,0 +1,37 @@
+// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const fs = require('fs');
+const {Readable} = require('stream');
+const wav = require('wav');
+
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOfflineRecognizer() {
+  let config = {
+    modelConfig: {
+      dolphin: {
+        model:
+            './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx',
+      },
+      tokens:
+          './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt',
+    }
+  };
+
+  return sherpa_onnx.createOfflineRecognizer(config);
+}
+
+const recognizer = createOfflineRecognizer();
+const stream = recognizer.createStream();
+
+const waveFilename =
+    './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform(wave.sampleRate, wave.samples);
+
+recognizer.decode(stream);
+const text = recognizer.getResult(stream).text;
+console.log(text);
+
+stream.free();
+recognizer.free();
@@ -197,6 +197,21 @@ def get_models():
             git diff
             """,
         ),
+        Model(
+            model_name="sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02",
+            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc",
+            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc",
+            short_name="vad-asr-multi_lang-dolphin_ctc",
+            cmd="""
+            pushd $model_name
+            mv model.int8.onnx ../dolphin.onnx
+            mv tokens.txt ../
+            popd
+            rm -rf $model_name
+            sed -i.bak 's%Zipformer%<a href="https://github.com/DataoceanAI/Dolphin">Dolphin</a> (多种中文方言及非常多种语言)%g' ../index.html
+            git diff
+            """,
+        ),
     ]
     return models

@@ -39,6 +39,10 @@ function freeConfig(config, Module) {
     freeConfig(config.fireRedAsr, Module)
   }

+  if ('dolphin' in config) {
+    freeConfig(config.dolphin, Module)
+  }
+
   if ('moonshine' in config) {
     freeConfig(config.moonshine, Module)
   }
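The body of `freeConfig` is outside this diff; assuming it follows the same pattern as the other sub-configs in `sherpa-onnx.js`, the new branch ends up releasing the two heap blocks allocated by `initSherpaOnnxOfflineDolphinModelConfig` (shown in the next hunk), roughly:

```js
// Hypothetical expansion of freeConfig(config.dolphin, Module);
// the real freeConfig is generic and is not part of this diff.
function freeDolphinConfig(dolphin, Module) {
  Module._free(dolphin.buffer);  // the UTF-8 copy of config.model
  Module._free(dolphin.ptr);     // the 4-byte struct holding the string pointer
}
```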
@@ -562,6 +566,23 @@ function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) {
   }
 }

+function initSherpaOnnxOfflineDolphinModelConfig(config, Module) {
+  const n = Module.lengthBytesUTF8(config.model || '') + 1;
+
+  const buffer = Module._malloc(n);
+
+  const len = 1 * 4;  // 1 pointer
+  const ptr = Module._malloc(len);
+
+  Module.stringToUTF8(config.model || '', buffer, n);
+
+  Module.setValue(ptr, buffer, 'i8*');
+
+  return {
+    buffer: buffer, ptr: ptr, len: len,
+  }
+}
+
 function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
   const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
   const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
@@ -769,6 +790,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
     };
   }

+  if (!('dolphin' in config)) {
+    config.dolphin = {
+      model: '',
+    };
+  }
+
   if (!('whisper' in config)) {
     config.whisper = {
       encoder: '',
@@ -832,8 +859,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   const fireRedAsr =
       initSherpaOnnxOfflineFireRedAsrModelConfig(config.fireRedAsr, Module);

+  const dolphin =
+      initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module);
+
   const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
-      tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len;
+      tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
+      dolphin.len;

   const ptr = Module._malloc(len);

@@ -932,10 +963,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   Module._CopyHeap(fireRedAsr.ptr, fireRedAsr.len, ptr + offset);
   offset += fireRedAsr.len;

+  Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset);
+  offset += dolphin.len;
+
   return {
     buffer: buffer, ptr: ptr, len: len, transducer: transducer,
     paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
-    senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr
+    senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
+    dolphin: dolphin
   }
 }

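The two hunks above follow the serialize-then-pack pattern used for every offline sub-config in this file: each `init*ModelConfig` helper writes its strings to the wasm heap and returns `{buffer, ptr, len}`, the lengths are summed into the outer struct's size (the extra `8 * 4` bytes cover the outer struct's own scalar and string fields, which the real code writes in between), and `_CopyHeap` appends each block at an increasing offset. A condensed sketch, with the hypothetical `parts` array standing in for the unrolled code above:

```js
// Pack the sub-config structs back to back; dolphin is appended last,
// matching its position at the end of the C struct.
const parts = [transducer, paraformer, nemoCtc, whisper, tdnn,
               senseVoice, moonshine, fireRedAsr, dolphin];
let offset = 0;
for (const p of parts) {
  Module._CopyHeap(p.ptr, p.len, ptr + offset);
  offset += p.len;
}
```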
@@ -13,6 +13,7 @@ extern "C" {
 static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");

+static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, "");
@@ -29,7 +30,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
                   sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 +
                   sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
                   sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
-                  sizeof(SherpaOnnxOfflineFireRedAsrModelConfig),
+                  sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
+                  sizeof(SherpaOnnxOfflineDolphinModelConfig),

               "");
 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
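These asserts keep the C struct layout and the sizes hard-coded on the JavaScript side in lock-step: in wasm32 every pointer is 4 bytes, so a config struct consisting only of `const char *` fields has size (number of fields) × 4. That is exactly the `len = 1 * 4` allocated in `initSherpaOnnxOfflineDolphinModelConfig` above, since `SherpaOnnxOfflineDolphinModelConfig` carries a single `model` string:

```js
// Correspondence between the JS-side allocation and the C static_assert
// (wasm32: sizeof(const char *) == 4).
const POINTER_SIZE = 4;  // wasm32 pointer width
const numFields = 1;     // SherpaOnnxOfflineDolphinModelConfig has only `model`
const dolphinLen = numFields * POINTER_SIZE;  // == 4, matching the assert
```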
@@ -73,6 +75,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   auto sense_voice = &model_config->sense_voice;
   auto moonshine = &model_config->moonshine;
   auto fire_red_asr = &model_config->fire_red_asr;
+  auto dolphin = &model_config->dolphin;

   fprintf(stdout, "----------offline transducer model config----------\n");
   fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -110,6 +113,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   fprintf(stdout, "encoder: %s\n", fire_red_asr->encoder);
   fprintf(stdout, "decoder: %s\n", fire_red_asr->decoder);

+  fprintf(stdout, "----------offline Dolphin model config----------\n");
+  fprintf(stdout, "model: %s\n", dolphin->model);
+
   fprintf(stdout, "tokens: %s\n", model_config->tokens);
   fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
   fprintf(stdout, "provider: %s\n", model_config->provider);
@@ -15,7 +15,7 @@ let resultList = [];
 clearBtn.onclick = function() {
   resultList = [];
   textArea.value = getDisplayResult();
-  textArea.scrollTop = textArea.scrollHeight; // auto scroll
+  textArea.scrollTop = textArea.scrollHeight;  // auto scroll
 };

 function getDisplayResult() {
@@ -46,11 +46,11 @@ let audioCtx;
 let mediaStream;

 let expectedSampleRate = 16000;
-let recordSampleRate; // the sampleRate of the microphone
-let recorder = null;  // the microphone
-let leftchannel = []; // TODO: Use a single channel
+let recordSampleRate;  // the sampleRate of the microphone
+let recorder = null;   // the microphone
+let leftchannel = [];  // TODO: Use a single channel

-let recordingLength = 0; // number of samples so far
+let recordingLength = 0;  // number of samples so far

 let vad = null;
 let buffer = null;
@@ -73,48 +73,50 @@ function createOfflineRecognizerSenseVoice() {}

 function initOfflineRecognizer() {
   let config = {
-    modelConfig : {
-      debug : 1,
-      tokens : './tokens.txt',
+    modelConfig: {
+      debug: 1,
+      tokens: './tokens.txt',
     },
   };
   if (fileExists('sense-voice.onnx') == 1) {
     config.modelConfig.senseVoice = {
-      model : './sense-voice.onnx',
-      useInverseTextNormalization : 1,
+      model: './sense-voice.onnx',
+      useInverseTextNormalization: 1,
     };
   } else if (fileExists('whisper-encoder.onnx')) {
     config.modelConfig.whisper = {
-      encoder : './whisper-encoder.onnx',
-      decoder : './whisper-decoder.onnx',
+      encoder: './whisper-encoder.onnx',
+      decoder: './whisper-decoder.onnx',
     };
   } else if (fileExists('transducer-encoder.onnx')) {
     config.modelConfig.transducer = {
-      encoder : './transducer-encoder.onnx',
-      decoder : './transducer-decoder.onnx',
-      joiner : './transducer-joiner.onnx',
+      encoder: './transducer-encoder.onnx',
+      decoder: './transducer-decoder.onnx',
+      joiner: './transducer-joiner.onnx',
     };
     config.modelConfig.modelType = 'transducer';
   } else if (fileExists('nemo-transducer-encoder.onnx')) {
     config.modelConfig.transducer = {
-      encoder : './nemo-transducer-encoder.onnx',
-      decoder : './nemo-transducer-decoder.onnx',
-      joiner : './nemo-transducer-joiner.onnx',
+      encoder: './nemo-transducer-encoder.onnx',
+      decoder: './nemo-transducer-decoder.onnx',
+      joiner: './nemo-transducer-joiner.onnx',
     };
     config.modelConfig.modelType = 'nemo_transducer';
   } else if (fileExists('paraformer.onnx')) {
     config.modelConfig.paraformer = {
-      model : './paraformer.onnx',
+      model: './paraformer.onnx',
     };
   } else if (fileExists('telespeech.onnx')) {
     config.modelConfig.telespeechCtc = './telespeech.onnx';
   } else if (fileExists('moonshine-preprocessor.onnx')) {
     config.modelConfig.moonshine = {
-      preprocessor : './moonshine-preprocessor.onnx',
-      encoder : './moonshine-encoder.onnx',
-      uncachedDecoder : './moonshine-uncached-decoder.onnx',
-      cachedDecoder : './moonshine-cached-decoder.onnx'
+      preprocessor: './moonshine-preprocessor.onnx',
+      encoder: './moonshine-encoder.onnx',
+      uncachedDecoder: './moonshine-uncached-decoder.onnx',
+      cachedDecoder: './moonshine-cached-decoder.onnx'
     };
+  } else if (fileExists('dolphin.onnx')) {
+    config.modelConfig.dolphin = {model: './dolphin.onnx'};
   } else {
     console.log('Please specify a model.');
     alert('Please specify a model.');
@@ -133,7 +135,7 @@ Module.locateFile = function(path, scriptDirectory = '') {
 Module.setStatus = function(status) {
   console.log(`status ${status}`);
   const statusElement = document.getElementById('status');
-  if (status == "Running...") {
+  if (status == 'Running...') {
     status = 'Model downloaded. Initializing recongizer...'
   }
   statusElement.textContent = status;
@@ -170,11 +172,11 @@ if (navigator.mediaDevices.getUserMedia) {
   console.log('getUserMedia supported.');

   // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
-  const constraints = {audio : true};
+  const constraints = {audio: true};

   let onSuccess = function(stream) {
     if (!audioCtx) {
-      audioCtx = new AudioContext({sampleRate : expectedSampleRate});
+      audioCtx = new AudioContext({sampleRate: expectedSampleRate});
     }
     console.log(audioCtx);
     recordSampleRate = audioCtx.sampleRate;
@@ -299,7 +301,7 @@ if (navigator.mediaDevices.getUserMedia) {
     }

     textArea.value = getDisplayResult();
-    textArea.scrollTop = textArea.scrollHeight; // auto scroll
+    textArea.scrollTop = textArea.scrollHeight;  // auto scroll
   };

   startBtn.onclick = function() {
@@ -330,8 +332,9 @@ if (navigator.mediaDevices.getUserMedia) {
     };
   };

-  let onError = function(
-      err) { console.log('The following error occured: ' + err); };
+  let onError = function(err) {
+    console.log('The following error occured: ' + err);
+  };

   navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
 } else {
@@ -364,22 +367,22 @@ function toWav(samples) {

   // http://soundfile.sapp.org/doc/WaveFormat/
   //                      F F I R
-  view.setUint32(0, 0x46464952, true); // chunkID
-  view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
+  view.setUint32(0, 0x46464952, true);               // chunkID
+  view.setUint32(4, 36 + samples.length * 2, true);  // chunkSize
   //                      E V A W
-  view.setUint32(8, 0x45564157, true); // format
-  //
+  view.setUint32(8, 0x45564157, true);  // format
+  //
   //                      t m f
-  view.setUint32(12, 0x20746d66, true); // subchunk1ID
-  view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
-  view.setUint32(20, 1, true); // audioFormat, 1 for PCM
-  view.setUint16(22, 1, true); // numChannels: 1 channel
-  view.setUint32(24, expectedSampleRate, true); // sampleRate
-  view.setUint32(28, expectedSampleRate * 2, true); // byteRate
-  view.setUint16(32, 2, true); // blockAlign
-  view.setUint16(34, 16, true); // bitsPerSample
-  view.setUint32(36, 0x61746164, true); // Subchunk2ID
-  view.setUint32(40, samples.length * 2, true); // subchunk2Size
+  view.setUint32(12, 0x20746d66, true);  // subchunk1ID
+  view.setUint32(16, 16, true);          // subchunk1Size, 16 for PCM
+  view.setUint32(20, 1, true);           // audioFormat, 1 for PCM
+  view.setUint16(22, 1, true);           // numChannels: 1 channel
+  view.setUint32(24, expectedSampleRate, true);      // sampleRate
+  view.setUint32(28, expectedSampleRate * 2, true);  // byteRate
+  view.setUint16(32, 2, true);           // blockAlign
+  view.setUint16(34, 16, true);          // bitsPerSample
+  view.setUint32(36, 0x61746164, true);  // Subchunk2ID
+  view.setUint32(40, samples.length * 2, true);  // subchunk2Size

   let offset = 44;
   for (let i = 0; i < samples.length; ++i) {
@@ -387,7 +390,7 @@ function toWav(samples) {
     offset += 2;
   }

-  return new Blob([ view ], {type : 'audio/wav'});
+  return new Blob([view], {type: 'audio/wav'});
 }

 // this function is copied from