Fangjun Kuang
Committed by GitHub

Add JavaScript API (WebAssembly) for FireRedAsr model. (#1874)

@@ -14,6 +14,7 @@ find dart-api-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.43/1\.10\ @@ -14,6 +14,7 @@ find dart-api-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.43/1\.10\
14 find flutter-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \; 14 find flutter-examples -name *.yaml -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \;
15 find flutter -name *.podspec -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \; 15 find flutter -name *.podspec -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \;
16 find nodejs-addon-examples -name package.json -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \; 16 find nodejs-addon-examples -name package.json -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \;
  17 +find nodejs-examples -name package.json -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \;
17 18
18 find harmony-os -name "README.md" -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \; 19 find harmony-os -name "README.md" -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \;
19 find harmony-os -name oh-package.json5 -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \; 20 find harmony-os -name oh-package.json5 -type f -exec sed -i.bak 's/1\.10\.43/1\.10\.44/g' {} \;
@@ -216,6 +216,21 @@ tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 @@ -216,6 +216,21 @@ tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
216 node ./test-offline-whisper.js 216 node ./test-offline-whisper.js
217 ``` 217 ```
218 218
  219 +## ./test-offline-fire-red-asr.js
  220 +
  221 +[./test-offline-fire-red-asr.js](./test-offline-fire-red-asr.js) demonstrates
  222 +how to decode a file with a FireRedAsr AED model.
  223 +
  224 +You can use the following command to run it:
  225 +
  226 +```bash
  227 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16.tar.bz2
  228 +tar xvf sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16.tar.bz2
  229 +rm sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16.tar.bz2
  230 +
  231 +node ./test-offline-fire-red-asr.js
  232 +```
  233 +
219 ## ./test-offline-moonshine.js 234 ## ./test-offline-moonshine.js
220 235
221 [./test-offline-moonshine.js](./test-offline-moonshine.js) demonstrates 236 [./test-offline-moonshine.js](./test-offline-moonshine.js) demonstrates
1 { 1 {
2 "dependencies": { 2 "dependencies": {
3 "naudiodon2": "^2.4.0", 3 "naudiodon2": "^2.4.0",
4 - "sherpa-onnx": "*", 4 + "sherpa-onnx": "^1.10.44",
5 "wav": "^1.0.2" 5 "wav": "^1.0.2"
6 } 6 }
7 } 7 }
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +//
  3 +const sherpa_onnx = require('sherpa-onnx');
  4 +
  5 +function createOfflineRecognizer() {
  6 + let modelConfig = {
  7 + fireRedAsr: {
  8 + encoder:
  9 + './sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/encoder.int8.onnx',
  10 + decoder:
  11 + './sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/decoder.int8.onnx',
  12 + },
  13 + tokens: './sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/tokens.txt',
  14 + debug: 1,
  15 + };
  16 +
  17 + let config = {
  18 + modelConfig: modelConfig,
  19 + };
  20 +
  21 + return sherpa_onnx.createOfflineRecognizer(config);
  22 +}
  23 +
  24 +recognizer = createOfflineRecognizer();
  25 +stream = recognizer.createStream();
  26 +
  27 +const waveFilename =
  28 + './sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav';
  29 +const wave = sherpa_onnx.readWave(waveFilename);
  30 +stream.acceptWaveform(wave.sampleRate, wave.samples);
  31 +
  32 +recognizer.decode(stream);
  33 +const text = recognizer.getResult(stream).text;
  34 +console.log(text);
  35 +
  36 +stream.free();
  37 +recognizer.free();
@@ -35,6 +35,10 @@ function freeConfig(config, Module) { @@ -35,6 +35,10 @@ function freeConfig(config, Module) {
35 freeConfig(config.whisper, Module) 35 freeConfig(config.whisper, Module)
36 } 36 }
37 37
  38 + if ('fireRedAsr' in config) {
  39 + freeConfig(config.fireRedAsr, Module)
  40 + }
  41 +
38 if ('moonshine' in config) { 42 if ('moonshine' in config) {
39 freeConfig(config.moonshine, Module) 43 freeConfig(config.moonshine, Module)
40 } 44 }
@@ -651,6 +655,35 @@ function initSherpaOnnxOfflineMoonshineModelConfig(config, Module) { @@ -651,6 +655,35 @@ function initSherpaOnnxOfflineMoonshineModelConfig(config, Module) {
651 } 655 }
652 } 656 }
653 657
  658 +function initSherpaOnnxOfflineFireRedAsrModelConfig(config, Module) {
  659 + const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
  660 + const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
  661 +
  662 + const n = encoderLen + decoderLen;
  663 + const buffer = Module._malloc(n);
  664 +
  665 + const len = 2 * 4; // 2 pointers
  666 + const ptr = Module._malloc(len);
  667 +
  668 + let offset = 0;
  669 + Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen);
  670 + offset += encoderLen;
  671 +
  672 + Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen);
  673 + offset += decoderLen;
  674 +
  675 + offset = 0;
  676 + Module.setValue(ptr, buffer + offset, 'i8*');
  677 + offset += encoderLen;
  678 +
  679 + Module.setValue(ptr + 4, buffer + offset, 'i8*');
  680 + offset += decoderLen;
  681 +
  682 + return {
  683 + buffer: buffer, ptr: ptr, len: len,
  684 + }
  685 +}
  686 +
654 function initSherpaOnnxOfflineTdnnModelConfig(config, Module) { 687 function initSherpaOnnxOfflineTdnnModelConfig(config, Module) {
655 const n = Module.lengthBytesUTF8(config.model || '') + 1; 688 const n = Module.lengthBytesUTF8(config.model || '') + 1;
656 const buffer = Module._malloc(n); 689 const buffer = Module._malloc(n);
@@ -755,6 +788,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { @@ -755,6 +788,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
755 }; 788 };
756 } 789 }
757 790
  791 + if (!('fireRedAsr' in config)) {
  792 + config.fireRedAsr = {
  793 + encoder: '',
  794 + decoder: '',
  795 + };
  796 + }
  797 +
758 if (!('tdnn' in config)) { 798 if (!('tdnn' in config)) {
759 config.tdnn = { 799 config.tdnn = {
760 model: '', 800 model: '',
@@ -789,8 +829,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { @@ -789,8 +829,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
789 const moonshine = 829 const moonshine =
790 initSherpaOnnxOfflineMoonshineModelConfig(config.moonshine, Module); 830 initSherpaOnnxOfflineMoonshineModelConfig(config.moonshine, Module);
791 831
  832 + const fireRedAsr =
  833 + initSherpaOnnxOfflineFireRedAsrModelConfig(config.fireRedAsr, Module);
  834 +
792 const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + 835 const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
793 - tdnn.len + 8 * 4 + senseVoice.len + moonshine.len; 836 + tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len;
794 837
795 const ptr = Module._malloc(len); 838 const ptr = Module._malloc(len);
796 839
@@ -884,11 +927,15 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { @@ -884,11 +927,15 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
884 offset += senseVoice.len; 927 offset += senseVoice.len;
885 928
886 Module._CopyHeap(moonshine.ptr, moonshine.len, ptr + offset); 929 Module._CopyHeap(moonshine.ptr, moonshine.len, ptr + offset);
  930 + offset += moonshine.len;
  931 +
  932 + Module._CopyHeap(fireRedAsr.ptr, fireRedAsr.len, ptr + offset);
  933 + offset += fireRedAsr.len;
887 934
888 return { 935 return {
889 buffer: buffer, ptr: ptr, len: len, transducer: transducer, 936 buffer: buffer, ptr: ptr, len: len, transducer: transducer,
890 paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, 937 paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
891 - senseVoice: senseVoice, moonshine: moonshine, 938 + senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr
892 } 939 }
893 } 940 }
894 941
@@ -15,6 +15,7 @@ static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); @@ -15,6 +15,7 @@ static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");
15 15
16 static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); 16 static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
17 static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); 17 static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
  18 +static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, "");
18 static_assert(sizeof(SherpaOnnxOfflineMoonshineModelConfig) == 4 * 4, ""); 19 static_assert(sizeof(SherpaOnnxOfflineMoonshineModelConfig) == 4 * 4, "");
19 static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, ""); 20 static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, "");
20 static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, ""); 21 static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, "");
@@ -27,7 +28,9 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == @@ -27,7 +28,9 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
27 sizeof(SherpaOnnxOfflineWhisperModelConfig) + 28 sizeof(SherpaOnnxOfflineWhisperModelConfig) +
28 sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 + 29 sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 +
29 sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) + 30 sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
30 - sizeof(SherpaOnnxOfflineMoonshineModelConfig), 31 + sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
  32 + sizeof(SherpaOnnxOfflineFireRedAsrModelConfig),
  33 +
31 ""); 34 "");
32 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); 35 static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
33 static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == 36 static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) ==
@@ -69,6 +72,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { @@ -69,6 +72,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
69 auto tdnn = &model_config->tdnn; 72 auto tdnn = &model_config->tdnn;
70 auto sense_voice = &model_config->sense_voice; 73 auto sense_voice = &model_config->sense_voice;
71 auto moonshine = &model_config->moonshine; 74 auto moonshine = &model_config->moonshine;
  75 + auto fire_red_asr = &model_config->fire_red_asr;
72 76
73 fprintf(stdout, "----------offline transducer model config----------\n"); 77 fprintf(stdout, "----------offline transducer model config----------\n");
74 fprintf(stdout, "encoder: %s\n", transducer->encoder); 78 fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -102,6 +106,10 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { @@ -102,6 +106,10 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
102 fprintf(stdout, "uncached_decoder: %s\n", moonshine->uncached_decoder); 106 fprintf(stdout, "uncached_decoder: %s\n", moonshine->uncached_decoder);
103 fprintf(stdout, "cached_decoder: %s\n", moonshine->cached_decoder); 107 fprintf(stdout, "cached_decoder: %s\n", moonshine->cached_decoder);
104 108
  109 + fprintf(stdout, "----------offline FireRedAsr model config----------\n");
  110 + fprintf(stdout, "encoder: %s\n", fire_red_asr->encoder);
  111 + fprintf(stdout, "decoder: %s\n", fire_red_asr->decoder);
  112 +
105 fprintf(stdout, "tokens: %s\n", model_config->tokens); 113 fprintf(stdout, "tokens: %s\n", model_config->tokens);
106 fprintf(stdout, "num_threads: %d\n", model_config->num_threads); 114 fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
107 fprintf(stdout, "provider: %s\n", model_config->provider); 115 fprintf(stdout, "provider: %s\n", model_config->provider);