Add Javascript (WebAssembly) API for Dolphin CTC models (#2093)
Committed by GitHub

Showing 9 changed files with 172 additions and 50 deletions.
@@ -9,6 +9,13 @@ git status
 ls -lh
 ls -lh node_modules
 
+# asr with offline dolphin ctc
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+node ./test-offline-dolphin-ctc.js
+rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
+
 # speech enhancement
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav
@@ -56,7 +63,7 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/m
 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
 rm matcha-icefall-en_US-ljspeech.tar.bz2
 
-wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
 
 node ./test-offline-tts-matcha-en.js
 
@@ -21,8 +21,8 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        total: ["8"]
-        index: ["0", "1", "2", "3", "4", "5", "6", "7"]
+        total: ["11"]
+        index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
 
     steps:
       - uses: actions/checkout@v4
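These nodejs tests are sharded across parallel CI jobs: `total` is the shard count and `index` selects one shard, so the matrix grows from 8 to 11 shards as tests (including the new Dolphin one) are added. Below is a hypothetical sketch of how a runner typically consumes such a pair; it is an illustration, not this repository's actual test script:

```js
// Hypothetical shard selector: deal the test list round-robin across
// `total` shards and keep only the shard named by `index`.
function selectShard(tests, total, index) {
  return tests.filter((_, i) => i % total === index);
}

// Shard 0 of 2 gets tests 0, 2, 4, ...
console.log(selectShard(['a.js', 'b.js', 'c.js'], 2, 0));  // ['a.js', 'c.js']
```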
@@ -119,6 +119,7 @@ We also have spaces built using WebAssembly. They are listed below:
 |VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]|
 |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]|
 |VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]|
+|VAD + speech recognition (多语种及多种中文方言) with [Dolphin][Dolphin]-base |[Click me][wasm-hf-vad-asr-multi-lang-dolphin-base]| [地址][wasm-ms-vad-asr-multi-lang-dolphin-base]|
 |Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]|
 |Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]|
 |Speaker diarization |[Click me][wasm-hf-speaker-diarization]|[地址][wasm-ms-speaker-diarization]|
@@ -390,6 +391,10 @@ It uses TTS from sherpa-onnx. See also [✨ Speak command that uses the new glob
 [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
 [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
 [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
+[Dolphin]: https://github.com/DataoceanAI/Dolphin
+[wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
+[wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
+
 [wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en
 [wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en
 [wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de
@@ -140,6 +140,20 @@ node ./test-offline-tts-vits-zh.js
 In the following, we demonstrate how to decode files and how to perform
 speech recognition with a microphone with `nodejs`.
 
+## ./test-offline-dolphin-ctc.js
+
+[./test-offline-dolphin-ctc.js](./test-offline-dolphin-ctc.js) demonstrates
+how to decode a file with a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+node ./test-offline-dolphin-ctc.js
+```
+
 ## ./test-offline-nemo-ctc.js
 
 [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
nodejs-examples/test-offline-dolphin-ctc.js (new file, mode 100644)

+// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const fs = require('fs');
+const {Readable} = require('stream');
+const wav = require('wav');
+
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOfflineRecognizer() {
+  let config = {
+    modelConfig: {
+      dolphin: {
+        model:
+            './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx',
+      },
+      tokens:
+          './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt',
+    }
+  };
+
+  return sherpa_onnx.createOfflineRecognizer(config);
+}
+
+const recognizer = createOfflineRecognizer();
+const stream = recognizer.createStream();
+
+const waveFilename =
+    './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform(wave.sampleRate, wave.samples);
+
+recognizer.decode(stream);
+const text = recognizer.getResult(stream).text;
+console.log(text);
+
+stream.free();
+recognizer.free();
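A usage note on the example above: the recognizer is reusable across files, and only the stream is per-utterance. A minimal sketch that decodes a second file with the same API calls follows; the wave path is hypothetical (any wav from the model's `test_wavs` directory works):

```js
// Hypothetical second test file from the same model tarball.
const waveFilename2 =
    './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/1.wav';

const stream2 = recognizer.createStream();  // one stream per utterance
const wave2 = sherpa_onnx.readWave(waveFilename2);
stream2.acceptWaveform(wave2.sampleRate, wave2.samples);
recognizer.decode(stream2);
console.log(recognizer.getResult(stream2).text);
stream2.free();  // free the stream; keep the recognizer for more files
```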
@@ -197,6 +197,21 @@ def get_models():
             git diff
             """,
         ),
+        Model(
+            model_name="sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02",
+            hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc",
+            ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc",
+            short_name="vad-asr-multi_lang-dolphin_ctc",
+            cmd="""
+            pushd $model_name
+            mv model.int8.onnx ../dolphin.onnx
+            mv tokens.txt ../
+            popd
+            rm -rf $model_name
+            sed -i.bak 's%Zipformer%<a href="https://github.com/DataoceanAI/Dolphin">Dolphin</a> (多种中文方言及非常多种语言)%g' ../index.html
+            git diff
+            """,
+        ),
     ]
     return models
 
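Note how `cmd` renames `model.int8.onnx` to `dolphin.onnx` while staging the WebAssembly space: that fixed filename is what the web demo's model-detection chain probes for. The corresponding branch added to the demo script (shown in full further below) reads:

```js
} else if (fileExists('dolphin.onnx')) {
  config.modelConfig.dolphin = {model: './dolphin.onnx'};
}
```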
@@ -39,6 +39,10 @@ function freeConfig(config, Module) {
     freeConfig(config.fireRedAsr, Module)
   }
 
+  if ('dolphin' in config) {
+    freeConfig(config.dolphin, Module)
+  }
+
   if ('moonshine' in config) {
     freeConfig(config.moonshine, Module)
   }
@@ -562,6 +566,23 @@ function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) {
   }
 }
 
+function initSherpaOnnxOfflineDolphinModelConfig(config, Module) {
+  const n = Module.lengthBytesUTF8(config.model || '') + 1;
+
+  const buffer = Module._malloc(n);
+
+  const len = 1 * 4;  // 1 pointer
+  const ptr = Module._malloc(len);
+
+  Module.stringToUTF8(config.model || '', buffer, n);
+
+  Module.setValue(ptr, buffer, 'i8*');
+
+  return {
+    buffer: buffer, ptr: ptr, len: len,
+  }
+}
+
 function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
   const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
   const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
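This follows the glue layer's usual marshalling pattern: the Dolphin config on the C side is a struct with a single `const char *model` field (see the size-4 `static_assert` later in this diff), so the JS side copies the UTF-8 string into wasm linear memory and then writes its address into a freshly allocated 4-byte struct. A condensed sketch of the same pattern, generalized to any one-string config; the helper name is illustrative, not part of the API:

```js
// Sketch: marshal {model: '...'} into a C struct { const char *model; }
// in wasm32 linear memory, where a pointer occupies 4 bytes.
function packOneStringConfig(str, Module) {
  const n = Module.lengthBytesUTF8(str || '') + 1;  // UTF-8 bytes + NUL
  const buffer = Module._malloc(n);                 // the string itself
  Module.stringToUTF8(str || '', buffer, n);

  const ptr = Module._malloc(4);         // sizeof(struct) == one pointer
  Module.setValue(ptr, buffer, 'i8*');   // struct.model = buffer
  return {buffer: buffer, ptr: ptr, len: 4};
}
```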
@@ -769,6 +790,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
     };
   }
 
+  if (!('dolphin' in config)) {
+    config.dolphin = {
+      model: '',
+    };
+  }
+
   if (!('whisper' in config)) {
     config.whisper = {
       encoder: '',
@@ -832,8 +859,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   const fireRedAsr =
       initSherpaOnnxOfflineFireRedAsrModelConfig(config.fireRedAsr, Module);
 
+  const dolphin =
+      initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module);
+
   const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
-      tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len;
+      tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
+      dolphin.len;
 
   const ptr = Module._malloc(len);
 
@@ -932,10 +963,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
   Module._CopyHeap(fireRedAsr.ptr, fireRedAsr.len, ptr + offset);
   offset += fireRedAsr.len;
 
+  Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset);
+  offset += dolphin.len;
+
   return {
     buffer: buffer, ptr: ptr, len: len, transducer: transducer,
     paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
-    senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr
+    senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
+    dolphin: dolphin
   }
 }
 
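The append order above is load-bearing: `_CopyHeap` lays the sub-configs out back-to-back in one flat buffer that the C side reinterprets as `SherpaOnnxOfflineModelConfig`, so `dolphin` has to be copied at the position where the `dolphin` field sits in the C struct (last, after `fire_red_asr`, going by this code). The `8 * 4` term in `len` covers the scalar and string members between the sub-structs (tokens, num_threads, provider, and so on; eight 4-byte fields is an inference from the arithmetic, not something this diff states). A cheap guard one could add while debugging, not part of the PR:

```js
// After the final copy, the write cursor should have consumed exactly the
// malloc'd length; a mismatch means the JS layout diverged from the C struct.
console.assert(offset === len, `packed ${offset} bytes, allocated ${len}`);
```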
@@ -13,6 +13,7 @@ extern "C" {
 static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");
 
+static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
 static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, "");
@@ -29,7 +30,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
               sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 +
               sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
               sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
-              sizeof(SherpaOnnxOfflineFireRedAsrModelConfig),
+              sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
+              sizeof(SherpaOnnxOfflineDolphinModelConfig),
 
               "");
 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
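These compile-time checks pin the C struct sizes that the JavaScript glue hard-codes; on wasm32 every pointer and `int` is 4 bytes, so the asserts catch any drift between the C API structs and the sizes the JS side assumes before it can corrupt memory at runtime. Restated on the JS side, the per-config byte counts implied by the asserts above are:

```js
// Byte sizes implied by the static_asserts (wasm32: 4-byte pointers/ints).
const CONFIG_SIZES = {
  transducer: 3 * 4,  // encoder, decoder, joiner
  paraformer: 1 * 4,  // model
  dolphin: 1 * 4,     // model (the new entry in this PR)
  nemoCtc: 1 * 4,     // model
  whisper: 5 * 4,
  fireRedAsr: 2 * 4,  // encoder, decoder
};
```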
@@ -73,6 +75,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   auto sense_voice = &model_config->sense_voice;
   auto moonshine = &model_config->moonshine;
   auto fire_red_asr = &model_config->fire_red_asr;
+  auto dolphin = &model_config->dolphin;
 
   fprintf(stdout, "----------offline transducer model config----------\n");
   fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -110,6 +113,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
   fprintf(stdout, "encoder: %s\n", fire_red_asr->encoder);
   fprintf(stdout, "decoder: %s\n", fire_red_asr->decoder);
 
+  fprintf(stdout, "----------offline Dolphin model config----------\n");
+  fprintf(stdout, "model: %s\n", dolphin->model);
+
   fprintf(stdout, "tokens: %s\n", model_config->tokens);
   fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
   fprintf(stdout, "provider: %s\n", model_config->provider);
@@ -15,7 +15,7 @@ let resultList = [];
 clearBtn.onclick = function() {
   resultList = [];
   textArea.value = getDisplayResult();
-  textArea.scrollTop = textArea.scrollHeight; // auto scroll
+  textArea.scrollTop = textArea.scrollHeight;  // auto scroll
 };
 
 function getDisplayResult() {
@@ -46,11 +46,11 @@ let audioCtx;
 let mediaStream;
 
 let expectedSampleRate = 16000;
-let recordSampleRate; // the sampleRate of the microphone
-let recorder = null;  // the microphone
-let leftchannel = []; // TODO: Use a single channel
+let recordSampleRate;    // the sampleRate of the microphone
+let recorder = null;     // the microphone
+let leftchannel = [];    // TODO: Use a single channel
 
-let recordingLength = 0; // number of samples so far
+let recordingLength = 0;  // number of samples so far
 
 let vad = null;
 let buffer = null;
@@ -73,48 +73,50 @@ function createOfflineRecognizerSenseVoice() {}
 
 function initOfflineRecognizer() {
   let config = {
-    modelConfig : {
-      debug : 1,
-      tokens : './tokens.txt',
+    modelConfig: {
+      debug: 1,
+      tokens: './tokens.txt',
     },
   };
   if (fileExists('sense-voice.onnx') == 1) {
     config.modelConfig.senseVoice = {
-      model : './sense-voice.onnx',
-      useInverseTextNormalization : 1,
+      model: './sense-voice.onnx',
+      useInverseTextNormalization: 1,
     };
   } else if (fileExists('whisper-encoder.onnx')) {
     config.modelConfig.whisper = {
-      encoder : './whisper-encoder.onnx',
-      decoder : './whisper-decoder.onnx',
+      encoder: './whisper-encoder.onnx',
+      decoder: './whisper-decoder.onnx',
     };
   } else if (fileExists('transducer-encoder.onnx')) {
     config.modelConfig.transducer = {
-      encoder : './transducer-encoder.onnx',
-      decoder : './transducer-decoder.onnx',
-      joiner : './transducer-joiner.onnx',
+      encoder: './transducer-encoder.onnx',
+      decoder: './transducer-decoder.onnx',
+      joiner: './transducer-joiner.onnx',
     };
     config.modelConfig.modelType = 'transducer';
   } else if (fileExists('nemo-transducer-encoder.onnx')) {
     config.modelConfig.transducer = {
-      encoder : './nemo-transducer-encoder.onnx',
-      decoder : './nemo-transducer-decoder.onnx',
-      joiner : './nemo-transducer-joiner.onnx',
+      encoder: './nemo-transducer-encoder.onnx',
+      decoder: './nemo-transducer-decoder.onnx',
+      joiner: './nemo-transducer-joiner.onnx',
     };
     config.modelConfig.modelType = 'nemo_transducer';
   } else if (fileExists('paraformer.onnx')) {
     config.modelConfig.paraformer = {
-      model : './paraformer.onnx',
+      model: './paraformer.onnx',
     };
   } else if (fileExists('telespeech.onnx')) {
     config.modelConfig.telespeechCtc = './telespeech.onnx';
   } else if (fileExists('moonshine-preprocessor.onnx')) {
     config.modelConfig.moonshine = {
-      preprocessor : './moonshine-preprocessor.onnx',
-      encoder : './moonshine-encoder.onnx',
-      uncachedDecoder : './moonshine-uncached-decoder.onnx',
-      cachedDecoder : './moonshine-cached-decoder.onnx'
+      preprocessor: './moonshine-preprocessor.onnx',
+      encoder: './moonshine-encoder.onnx',
+      uncachedDecoder: './moonshine-uncached-decoder.onnx',
+      cachedDecoder: './moonshine-cached-decoder.onnx'
     };
+  } else if (fileExists('dolphin.onnx')) {
+    config.modelConfig.dolphin = {model: './dolphin.onnx'};
   } else {
     console.log('Please specify a model.');
     alert('Please specify a model.');
@@ -133,7 +135,7 @@ Module.locateFile = function(path, scriptDirectory = '') {
 Module.setStatus = function(status) {
   console.log(`status ${status}`);
   const statusElement = document.getElementById('status');
-  if (status == "Running...") {
+  if (status == 'Running...') {
     status = 'Model downloaded. Initializing recongizer...'
   }
   statusElement.textContent = status;
@@ -170,11 +172,11 @@ if (navigator.mediaDevices.getUserMedia) {
   console.log('getUserMedia supported.');
 
   // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
-  const constraints = {audio : true};
+  const constraints = {audio: true};
 
   let onSuccess = function(stream) {
     if (!audioCtx) {
-      audioCtx = new AudioContext({sampleRate : expectedSampleRate});
+      audioCtx = new AudioContext({sampleRate: expectedSampleRate});
     }
     console.log(audioCtx);
     recordSampleRate = audioCtx.sampleRate;
@@ -299,7 +301,7 @@ if (navigator.mediaDevices.getUserMedia) {
     }
 
     textArea.value = getDisplayResult();
-    textArea.scrollTop = textArea.scrollHeight; // auto scroll
+    textArea.scrollTop = textArea.scrollHeight;  // auto scroll
   };
 
   startBtn.onclick = function() {
@@ -330,8 +332,9 @@ if (navigator.mediaDevices.getUserMedia) {
     };
   };
 
-  let onError = function(
-      err) { console.log('The following error occured: ' + err); };
+  let onError = function(err) {
+    console.log('The following error occured: ' + err);
+  };
 
   navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
 } else {
@@ -364,22 +367,22 @@ function toWav(samples) {
 
   // http://soundfile.sapp.org/doc/WaveFormat/
   //                   F F I R
-  view.setUint32(0, 0x46464952, true); // chunkID
-  view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
+  view.setUint32(0, 0x46464952, true);               // chunkID
+  view.setUint32(4, 36 + samples.length * 2, true);  // chunkSize
   //                   E V A W
-  view.setUint32(8, 0x45564157, true); // format
-  //
+  view.setUint32(8, 0x45564157, true);  // format
+  //
   //                   t m f
-  view.setUint32(12, 0x20746d66, true); // subchunk1ID
-  view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
-  view.setUint32(20, 1, true); // audioFormat, 1 for PCM
-  view.setUint16(22, 1, true); // numChannels: 1 channel
-  view.setUint32(24, expectedSampleRate, true); // sampleRate
-  view.setUint32(28, expectedSampleRate * 2, true); // byteRate
-  view.setUint16(32, 2, true); // blockAlign
-  view.setUint16(34, 16, true); // bitsPerSample
-  view.setUint32(36, 0x61746164, true); // Subchunk2ID
-  view.setUint32(40, samples.length * 2, true); // subchunk2Size
+  view.setUint32(12, 0x20746d66, true);              // subchunk1ID
+  view.setUint32(16, 16, true);                      // subchunk1Size, 16 for PCM
+  view.setUint32(20, 1, true);                       // audioFormat, 1 for PCM
+  view.setUint16(22, 1, true);                       // numChannels: 1 channel
+  view.setUint32(24, expectedSampleRate, true);      // sampleRate
+  view.setUint32(28, expectedSampleRate * 2, true);  // byteRate
+  view.setUint16(32, 2, true);                       // blockAlign
+  view.setUint16(34, 16, true);                      // bitsPerSample
+  view.setUint32(36, 0x61746164, true);              // Subchunk2ID
+  view.setUint32(40, samples.length * 2, true);      // subchunk2Size
 
   let offset = 44;
   for (let i = 0; i < samples.length; ++i) {

@@ -387,7 +390,7 @@ function toWav(samples) {
     offset += 2;
   }
 
-  return new Blob([ view ], {type : 'audio/wav'});
+  return new Blob([view], {type: 'audio/wav'});
 }
 
 // this function is copied from