Fangjun Kuang
Committed by GitHub

Add Javascript (node-addon) API for Dolphin CTC models (#2094)

@@ -10,6 +10,16 @@ arch=$(node -p "require('os').arch()") @@ -10,6 +10,16 @@ arch=$(node -p "require('os').arch()")
10 platform=$(node -p "require('os').platform()") 10 platform=$(node -p "require('os').platform()")
11 node_version=$(node -p "process.versions.node.split('.')[0]") 11 node_version=$(node -p "process.versions.node.split('.')[0]")
12 12
  13 +echo "----------non-streaming ASR dolphin CTC----------"
  14 +
  15 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  16 +tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  17 +rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  18 +
  19 +node ./test_asr_non_streaming_dolphin_ctc.js
  20 +
  21 +rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
  22 +
13 echo "----------non-streaming speech denoiser----------" 23 echo "----------non-streaming speech denoiser----------"
14 24
15 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx 25 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx
@@ -6,6 +6,7 @@ export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from @@ -6,6 +6,7 @@ export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from
6 export { Samples, 6 export { Samples,
7 OfflineStream, 7 OfflineStream,
8 FeatureConfig, 8 FeatureConfig,
  9 + OfflineDolphinModelConfig,
9 OfflineTransducerModelConfig, 10 OfflineTransducerModelConfig,
10 OfflineParaformerModelConfig, 11 OfflineParaformerModelConfig,
11 OfflineNemoEncDecCtcModelConfig, 12 OfflineNemoEncDecCtcModelConfig,
@@ -44,6 +44,22 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig( @@ -44,6 +44,22 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig(
44 return c; 44 return c;
45 } 45 }
46 46
// Reads the "dolphin" sub-object of the JS offline model config (if present)
// into the C API struct. Returns a zero-initialized config when the key is
// missing or is not an object.
//
// NOTE(review): the name has a typo — "Dolphinfig" should be
// "DolphinConfig"; rename together with the single call site in
// GetOfflineModelConfig().
  47 +static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinfig(
  48 + Napi::Object obj) {
  49 + SherpaOnnxOfflineDolphinModelConfig c;
  50 + memset(&c, 0, sizeof(c));
  51 +
// Absent or malformed "dolphin" field: leave every member zeroed so the
// caller treats the model as unconfigured.
  52 + if (!obj.Has("dolphin") || !obj.Get("dolphin").IsObject()) {
  53 + return c;
  54 + }
  55 +
  56 + Napi::Object o = obj.Get("dolphin").As<Napi::Object>();
  57 +
// Copies o's "model" string into c.model. The macro expands against the
// locals named `o` and `c`; the allocated string is released later with
// SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model).
  58 + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  59 +
  60 + return c;
  61 +}
  62 +
47 static SherpaOnnxOfflineNemoEncDecCtcModelConfig GetOfflineNeMoCtcModelConfig( 63 static SherpaOnnxOfflineNemoEncDecCtcModelConfig GetOfflineNeMoCtcModelConfig(
48 Napi::Object obj) { 64 Napi::Object obj) {
49 SherpaOnnxOfflineNemoEncDecCtcModelConfig c; 65 SherpaOnnxOfflineNemoEncDecCtcModelConfig c;
@@ -168,6 +184,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { @@ -168,6 +184,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
168 c.sense_voice = GetOfflineSenseVoiceModelConfig(o); 184 c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
169 c.moonshine = GetOfflineMoonshineModelConfig(o); 185 c.moonshine = GetOfflineMoonshineModelConfig(o);
170 c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o); 186 c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o);
  187 + c.dolphin = GetOfflineDolphinfig(o);
171 188
172 SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); 189 SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
173 SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); 190 SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -292,6 +309,8 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { @@ -292,6 +309,8 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
292 SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.encoder); 309 SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.encoder);
293 SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder); 310 SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder);
294 311
  312 + SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model);
  313 +
295 SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); 314 SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
296 SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); 315 SHERPA_ONNX_DELETE_C_STR(c.model_config.provider);
297 SHERPA_ONNX_DELETE_C_STR(c.model_config.model_type); 316 SHERPA_ONNX_DELETE_C_STR(c.model_config.model_type);
@@ -45,6 +45,10 @@ export class OfflineNemoEncDecCtcModelConfig { @@ -45,6 +45,10 @@ export class OfflineNemoEncDecCtcModelConfig {
45 public model: string = ''; 45 public model: string = '';
46 } 46 }
47 47
// Configuration for a non-streaming Dolphin CTC model.
  48 +export class OfflineDolphinModelConfig {
// Path to the Dolphin CTC ONNX model file (e.g. model.int8.onnx).
  49 + public model: string = '';
  50 +}
  51 +
48 export class OfflineWhisperModelConfig { 52 export class OfflineWhisperModelConfig {
49 public encoder: string = ''; 53 public encoder: string = '';
50 public decoder: string = ''; 54 public decoder: string = '';
@@ -86,6 +90,7 @@ export class OfflineModelConfig { @@ -86,6 +90,7 @@ export class OfflineModelConfig {
86 public telespeechCtc: string = ''; 90 public telespeechCtc: string = '';
87 public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig(); 91 public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig();
88 public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig(); 92 public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig();
  93 + public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig();
89 } 94 }
90 95
91 export class OfflineLMConfig { 96 export class OfflineLMConfig {
@@ -159,4 +164,4 @@ export class OfflineRecognizer { @@ -159,4 +164,4 @@ export class OfflineRecognizer {
159 164
160 return r; 165 return r;
161 } 166 }
162 -}  
  167 +}
@@ -122,6 +122,7 @@ The following tables list the examples in this folder. @@ -122,6 +122,7 @@ The following tables list the examples in this folder.
122 |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| 122 |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)|
123 |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| 123 |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)|
124 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| 124 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
  125 +|[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search|
125 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| 126 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
126 |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| 127 |[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
127 128
@@ -332,6 +333,16 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_v @@ -332,6 +333,16 @@ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_v
332 node ./test_vad_with_non_streaming_asr_whisper.js 333 node ./test_vad_with_non_streaming_asr_whisper.js
333 ``` 334 ```
334 335
  336 +### Non-streaming speech recognition with Dolphin CTC models
  337 +
  338 +```bash
  339 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  340 +tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  341 +rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
  342 +
  343 +node ./test_asr_non_streaming_dolphin_ctc.js
  344 +```
  345 +
335 ### Non-streaming speech recognition with NeMo CTC models 346 ### Non-streaming speech recognition with NeMo CTC models
336 347
337 ```bash 348 ```bash
// Copyright (c)  2025  Xiaomi Corporation
//
// Non-streaming speech recognition from a wave file with a Dolphin CTC model,
// reporting the decoding result and the real-time factor (RTF).
//
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const sherpa_onnx = require('sherpa-onnx-node');

const config = {
  'featConfig': {
    'sampleRate': 16000,
    'featureDim': 80,
  },
  'modelConfig': {
    'dolphin': {
      'model':
          './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx',
    },
    'tokens':
        './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
  }
};

const waveFilename =
    './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav';

const recognizer = new sherpa_onnx.OfflineRecognizer(config);
console.log('Started');

const start = Date.now();
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

recognizer.decode(stream);
// Fix: the original assigned to an undeclared `result`, creating an implicit
// global (and a ReferenceError under strict mode / ES modules).
const result = recognizer.getResult(stream);
const stop = Date.now();
console.log('Done');

// RTF = processing time / audio duration; < 1 means faster than real time.
const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));
console.log(waveFilename);
console.log('result\n', result);