Fangjun Kuang

Add audio tagging APIs for node-addon-api (#875)

@@ -18,7 +18,7 @@ fi
 SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
 echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"

-# SHERPA_ONNX_VERSION=1.0.21
+# SHERPA_ONNX_VERSION=1.0.22

 if [ -z $owner ]; then
   owner=k2-fsa
@@ -6,6 +6,22 @@ d=nodejs-addon-examples
 echo "dir: $d"
 cd $d

+echo "----------audio tagging----------"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
+tar xvf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
+rm sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
+
+node ./test_audio_tagging_zipformer.js
+rm -rf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
+tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
+rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
+
+node ./test_audio_tagging_ced.js
+rm -rf sherpa-onnx-ced-mini-audio-tagging-2024-04-19
+
 echo "----------speaker identification----------"
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

@@ -33,6 +33,11 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}

+      - name: Update pip
+        shell: bash
+        run: |
+          pip install -U pip
+
       - uses: actions/setup-node@v4
         with:
           registry-url: 'https://registry.npmjs.org'
@@ -55,7 +55,7 @@ jobs:

           SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)
           echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION"
-          # SHERPA_ONNX_VERSION=1.0.21
+          # SHERPA_ONNX_VERSION=1.0.22

           src_dir=.github/scripts/node-addon
           sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g $src_dir/package.json
@@ -27,7 +27,82 @@ export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PATH
 ```

-# Voice Activity detection (VAD)
+# Examples
+
+The following tables list the examples in this folder.
+
+## Voice activity detection (VAD)
+
+|File| Description|
+|---|---|
+|[./test_vad_microphone.js](./test_vad_microphone.js)| VAD with a microphone. It uses [silero-vad](https://github.com/snakers4/silero-vad)|
+
+## Speaker identification
+
+|File| Description|
+|---|---|
+|[./test_speaker_identification.js](./test_speaker_identification.js)| Speaker identification from a file|
+
+## Spoken language identification
+
+|File| Description|
+|---|---|
+|[./test_vad_spoken_language_identification_microphone.js](./test_vad_spoken_language_identification_microphone.js)|Spoken language identification from a microphone using a multi-lingual [Whisper](https://github.com/openai/whisper) model|
+
+## Audio tagging
+
+|File| Description|
+|---|---|
+|[./test_audio_tagging_zipformer.js](./test_audio_tagging_zipformer.js)| Audio tagging with a Zipformer model|
+|[./test_audio_tagging_ced.js](./test_audio_tagging_ced.js)| Audio tagging with a [CED](https://github.com/RicherMans/CED) model|
+
+## Streaming speech-to-text from files
+
+|File| Description|
+|---|---|
+|[./test_asr_streaming_transducer.js](./test_asr_streaming_transducer.js)| Streaming speech recognition from a file using a Zipformer transducer model|
+|[./test_asr_streaming_ctc.js](./test_asr_streaming_ctc.js)| Streaming speech recognition from a file using a Zipformer CTC model with greedy search|
+|[./test_asr_streaming_ctc_hlg.js](./test_asr_streaming_ctc_hlg.js)| Streaming speech recognition from a file using a Zipformer CTC model with HLG decoding|
+|[./test_asr_streaming_paraformer.js](./test_asr_streaming_paraformer.js)|Streaming speech recognition from a file using a [Paraformer](https://github.com/alibaba-damo-academy/FunASR) model|
+
+## Streaming speech-to-text from a microphone
+
+|File| Description|
+|---|---|
+|[./test_asr_streaming_transducer_microphone.js](./test_asr_streaming_transducer_microphone.js)| Streaming speech recognition from a microphone using a Zipformer transducer model|
+|[./test_asr_streaming_ctc_microphone.js](./test_asr_streaming_ctc_microphone.js)| Streaming speech recognition from a microphone using a Zipformer CTC model with greedy search|
+|[./test_asr_streaming_ctc_hlg_microphone.js](./test_asr_streaming_ctc_hlg_microphone.js)|Streaming speech recognition from a microphone using a Zipformer CTC model with HLG decoding|
+|[./test_asr_streaming_paraformer_microphone.js](./test_asr_streaming_paraformer_microphone.js)| Streaming speech recognition from a microphone using a [Paraformer](https://github.com/alibaba-damo-academy/FunASR) model|
+
+## Non-streaming speech-to-text from files
+
+|File| Description|
+|---|---|
+|[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model|
+|[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
+|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
+|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+
+## Non-streaming speech-to-text from a microphone with VAD
+
+|File| Description|
+|---|---|
+|[./test_vad_asr_non_streaming_transducer_microphone.js](./test_vad_asr_non_streaming_transducer_microphone.js)|VAD + non-streaming speech recognition from a microphone using a Zipformer transducer model|
+|[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
+|[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
+|[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+
+## Text-to-speech
+
+|File| Description|
+|---|---|
+|[./test_tts_non_streaming_vits_piper_en.js](./test_tts_non_streaming_vits_piper_en.js)| Text-to-speech with a [piper](https://github.com/rhasspy/piper) English model|
+|[./test_tts_non_streaming_vits_coqui_de.js](./test_tts_non_streaming_vits_coqui_de.js)| Text-to-speech with a [coqui](https://github.com/coqui-ai/TTS) German model|
+|[./test_tts_non_streaming_vits_zh_ll.js](./test_tts_non_streaming_vits_zh_ll.js)| Text-to-speech with a Chinese model using [cppjieba](https://github.com/yanyiwu/cppjieba)|
+|[./test_tts_non_streaming_vits_zh_aishell3.js](./test_tts_non_streaming_vits_zh_aishell3.js)| Text-to-speech with a Chinese TTS model|
+
+### Voice activity detection (VAD)

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
@@ -39,7 +114,27 @@ npm install naudiodon2
 node ./test_vad_microphone.js
 ```

-## Streaming speech recognition with Zipformer transducer
+### Audio tagging with Zipformer
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
+tar xvf sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
+rm sherpa-onnx-zipformer-small-audio-tagging-2024-04-15.tar.bz2
+
+node ./test_audio_tagging_zipformer.js
+```
+
+### Audio tagging with CED
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
+tar xvf sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
+rm sherpa-onnx-ced-mini-audio-tagging-2024-04-19.tar.bz2
+
+node ./test_audio_tagging_ced.js
+```
+
+### Streaming speech recognition with Zipformer transducer

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
@@ -54,7 +149,7 @@ npm install naudiodon2
 node ./test_asr_streaming_transducer_microphone.js
 ```

-## Streaming speech recognition with Zipformer CTC
+### Streaming speech recognition with Zipformer CTC

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
@@ -73,7 +168,7 @@ node ./test_asr_streaming_ctc_microphone.js
 node ./test_asr_streaming_ctc_hlg_microphone.js
 ```

-## Streaming speech recognition with Paraformer
+### Streaming speech recognition with Paraformer

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
@@ -88,7 +183,7 @@ npm install naudiodon2
 node ./test_asr_streaming_paraformer_microphone.js
 ```

-## Non-streaming speech recognition with Zipformer transducer
+### Non-streaming speech recognition with Zipformer transducer

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-04-01.tar.bz2
@@ -102,7 +197,7 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_transducer_microphone.js
 ```

-## Non-streaming speech recognition with Whisper
+### Non-streaming speech recognition with Whisper

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
@@ -116,7 +211,7 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_whisper_microphone.js
 ```

-## Non-streaming speech recognition with NeMo CTC models
+### Non-streaming speech recognition with NeMo CTC models

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
@@ -130,7 +225,7 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_nemo_ctc_microphone.js
 ```

-## Non-streaming speech recognition with Paraformer
+### Non-streaming speech recognition with Paraformer

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
@@ -144,7 +239,7 @@ npm install naudiodon2 @@ -144,7 +239,7 @@ npm install naudiodon2
144 node ./test_vad_asr_non_streaming_paraformer_microphone.js 239 node ./test_vad_asr_non_streaming_paraformer_microphone.js
145 ``` 240 ```
146 241
147 -## Text-to-speech with piper VITS models (TTS) 242 +### Text-to-speech with piper VITS models (TTS)
148 243
149 ```bash 244 ```bash
150 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2 245 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_GB-cori-medium.tar.bz2
@@ -154,7 +249,7 @@ rm vits-piper-en_GB-cori-medium.tar.bz2
 node ./test_tts_non_streaming_vits_piper_en.js
 ```

-## Text-to-speech with piper Coqui-ai/TTS models (TTS)
+### Text-to-speech with coqui-ai/TTS models (TTS)

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2
@@ -164,7 +259,7 @@ rm vits-coqui-de-css10.tar.bz2
 node ./test_tts_non_streaming_vits_coqui_de.js
 ```

-## Text-to-speech with vits Chinese models (1/2)
+### Text-to-speech with vits Chinese models (1/2)

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
@@ -174,7 +269,7 @@ rm sherpa-onnx-vits-zh-ll.tar.bz2
 node ./test_tts_non_streaming_vits_zh_ll.js
 ```

-## Text-to-speech with vits Chinese models (2/2)
+### Text-to-speech with vits Chinese models (2/2)

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
@@ -184,7 +279,7 @@ rm vits-icefall-zh-aishell3.tar.bz2
 node ./test_tts_non_streaming_vits_zh_aishell3.js
 ```

-## Spoken language identification with Whisper multi-lingual models
+### Spoken language identification with Whisper multi-lingual models

 ```bash
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
@@ -202,7 +297,7 @@ npm install naudiodon2
 node ./test_vad_spoken_language_identification_microphone.js
 ```

-## Speaker identification
+### Speaker identification

 You can find more models at
 <https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models>
@@ -0,0 +1,63 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Please download model files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
+function createAudioTagging() {
+  const config = {
+    model: {
+      ced: './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/model.int8.onnx',
+      numThreads: 1,
+      debug: true,
+    },
+    labels:
+        './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/class_labels_indices.csv',
+    topK: 5,
+  };
+  return new sherpa_onnx.AudioTagging(config);
+}
+
+const at = createAudioTagging();
+
+const testWaves = [
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/1.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/2.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/3.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/4.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/5.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/6.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/7.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/8.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/9.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/10.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/11.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/12.wav',
+  './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/13.wav',
+];
+
+console.log('------');
+
+for (let filename of testWaves) {
+  const start = performance.now();
+  const stream = at.createStream();
+  const wave = sherpa_onnx.readWave(filename);
+  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+  const events = at.compute(stream);
+  const stop = performance.now();
+
+  const elapsed_seconds = (stop - start) / 1000;
+  const duration = wave.samples.length / wave.sampleRate;
+  const real_time_factor = elapsed_seconds / duration;
+
+  console.log('input file:', filename);
+  console.log('Probability\t\tName');
+  for (let e of events) {
+    console.log(`${e.prob.toFixed(3)}\t\t\t${e.name}`);
+  }
+  console.log('Wave duration', duration.toFixed(3), 'seconds');
+  console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
+  console.log(
+      `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+      real_time_factor.toFixed(3));
+  console.log('------');
+}
@@ -0,0 +1,66 @@
+// Copyright (c) 2024 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Please download model files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
+function createAudioTagging() {
+  const config = {
+    model: {
+      zipformer: {
+        model:
+            './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/model.int8.onnx'
+      },
+      numThreads: 1,
+      debug: true,
+    },
+    labels:
+        './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/class_labels_indices.csv',
+    topK: 5,
+  };
+  return new sherpa_onnx.AudioTagging(config);
+}
+
+const at = createAudioTagging();
+
+const testWaves = [
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/1.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/2.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/3.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/4.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/5.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/6.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/7.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/8.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/9.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/10.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/11.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/12.wav',
+  './sherpa-onnx-zipformer-small-audio-tagging-2024-04-15/test_wavs/13.wav',
+];
+
+console.log('------');
+
+for (let filename of testWaves) {
+  const start = performance.now();
+  const stream = at.createStream();
+  const wave = sherpa_onnx.readWave(filename);
+  stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+  const events = at.compute(stream);
+  const stop = performance.now();
+
+  const elapsed_seconds = (stop - start) / 1000;
+  const duration = wave.samples.length / wave.sampleRate;
+  const real_time_factor = elapsed_seconds / duration;
+
+  console.log('input file:', filename);
+  console.log('Probability\t\tName');
+  for (let e of events) {
+    console.log(`${e.prob.toFixed(3)}\t\t\t${e.name}`);
+  }
+  console.log('Wave duration', duration.toFixed(3), 'seconds');
+  console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
+  console.log(
+      `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+      real_time_factor.toFixed(3));
+  console.log('------');
+}
@@ -18,6 +18,7 @@ add_definitions(-DNAPI_VERSION=3)
 include_directories(${CMAKE_JS_INC})

 set(srcs
+  src/audio-tagging.cc
   src/non-streaming-asr.cc
   src/non-streaming-tts.cc
   src/sherpa-onnx-node-addon-api.cc
@@ -0,0 +1,25 @@
+const addon = require('./addon.js');
+const non_streaming_asr = require('./non-streaming-asr.js');
+
+class AudioTagging {
+  constructor(config) {
+    this.handle = addon.createAudioTagging(config);
+    this.config = config;
+  }
+
+  createStream() {
+    return new non_streaming_asr.OfflineStream(
+        addon.audioTaggingCreateOfflineStream(this.handle));
+  }
+
+  /* Returns an array. Each element is
+   * an object {name: "xxx", prob: xxx, index: xxx}.
+   */
+  compute(stream, topK = -1) {
+    return addon.audioTaggingCompute(this.handle, stream.handle, topK);
+  }
+}
+
+module.exports = {
+  AudioTagging,
+}
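For orientation, here is a minimal usage sketch of the wrapper above, written against the `sherpa-onnx-node` package the same way as the test scripts in this PR. It assumes the CED model archive from the README has already been downloaded and unpacked next to the script:

```js
const sherpa_onnx = require('sherpa-onnx-node');

// Assumption: ./sherpa-onnx-ced-mini-audio-tagging-2024-04-19 exists locally
// (see the README section above for the download commands).
const at = new sherpa_onnx.AudioTagging({
  model: {
    ced: './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/model.int8.onnx',
    numThreads: 1,
  },
  labels:
      './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/class_labels_indices.csv',
  topK: 5,
});

const stream = at.createStream();
const wave = sherpa_onnx.readWave(
    './sherpa-onnx-ced-mini-audio-tagging-2024-04-19/test_wavs/1.wav');
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

// compute() returns [{name, prob, index}, ...]; topK = -1 keeps the value
// from the config.
for (const e of at.compute(stream)) {
  console.log(e.name, e.prob.toFixed(3));
}
```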
@@ -5,6 +5,7 @@ const non_streaming_tts = require('./non-streaming-tts.js');
 const vad = require('./vad.js');
 const slid = require('./spoken-language-identification.js');
 const sid = require('./speaker-identification.js');
+const at = require('./audio-tagging.js');

 module.exports = {
   OnlineRecognizer: streaming_asr.OnlineRecognizer,
@@ -18,4 +19,5 @@ module.exports = {
   SpokenLanguageIdentification: slid.SpokenLanguageIdentification,
   SpeakerEmbeddingExtractor: sid.SpeakerEmbeddingExtractor,
   SpeakerEmbeddingManager: sid.SpeakerEmbeddingManager,
+  AudioTagging: at.AudioTagging,
 }
@@ -0,0 +1,227 @@
+// scripts/node-addon-api/src/audio-tagging.cc
+//
+// Copyright (c) 2024 Xiaomi Corporation
+#include <sstream>
+
+#include "macros.h"  // NOLINT
+#include "napi.h"    // NOLINT
+#include "sherpa-onnx/c-api/c-api.h"
+
+static SherpaOnnxOfflineZipformerAudioTaggingModelConfig
+GetAudioTaggingZipformerModelConfig(Napi::Object obj) {
+  SherpaOnnxOfflineZipformerAudioTaggingModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("zipformer") || !obj.Get("zipformer").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("zipformer").As<Napi::Object>();
+
+  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
+
+  return c;
+}
+
+static SherpaOnnxAudioTaggingModelConfig GetAudioTaggingModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxAudioTaggingModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("model") || !obj.Get("model").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("model").As<Napi::Object>();
+  c.zipformer = GetAudioTaggingZipformerModelConfig(o);
+
+  SHERPA_ONNX_ASSIGN_ATTR_STR(ced, ced);
+
+  SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
+
+  if (o.Has("debug") &&
+      (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) {
+    if (o.Get("debug").IsBoolean()) {
+      c.debug = o.Get("debug").As<Napi::Boolean>().Value();
+    } else {
+      c.debug = o.Get("debug").As<Napi::Number>().Int32Value();
+    }
+  }
+  SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider);
+
+  return c;
+}
+
+static Napi::External<SherpaOnnxAudioTagging> CreateAudioTaggingWrapper(
+    const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsObject()) {
+    Napi::TypeError::New(env, "You should pass an object as the only argument.")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  Napi::Object o = info[0].As<Napi::Object>();
+
+  SherpaOnnxAudioTaggingConfig c;
+  memset(&c, 0, sizeof(c));
+  c.model = GetAudioTaggingModelConfig(o);
+
+  SHERPA_ONNX_ASSIGN_ATTR_STR(labels, labels);
+  SHERPA_ONNX_ASSIGN_ATTR_INT32(top_k, topK);
+
+  const SherpaOnnxAudioTagging *at = SherpaOnnxCreateAudioTagging(&c);
+
+  if (c.model.zipformer.model) {
+    delete[] c.model.zipformer.model;
+  }
+
+  if (c.model.ced) {
+    delete[] c.model.ced;
+  }
+
+  if (c.model.provider) {
+    delete[] c.model.provider;
+  }
+
+  if (c.labels) {
+    delete[] c.labels;
+  }
+
+  if (!at) {
+    Napi::TypeError::New(env, "Please check your config!")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  return Napi::External<SherpaOnnxAudioTagging>::New(
+      env, const_cast<SherpaOnnxAudioTagging *>(at),
+      [](Napi::Env env, SherpaOnnxAudioTagging *at) {
+        SherpaOnnxDestroyAudioTagging(at);
+      });
+}
+
+static Napi::External<SherpaOnnxOfflineStream>
+AudioTaggingCreateOfflineStreamWrapper(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(
+        env, "You should pass an audio tagging pointer as the only argument")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  SherpaOnnxAudioTagging *at =
+      info[0].As<Napi::External<SherpaOnnxAudioTagging>>().Data();
+
+  const SherpaOnnxOfflineStream *stream =
+      SherpaOnnxAudioTaggingCreateOfflineStream(at);
+
+  return Napi::External<SherpaOnnxOfflineStream>::New(
+      env, const_cast<SherpaOnnxOfflineStream *>(stream),
+      [](Napi::Env env, SherpaOnnxOfflineStream *stream) {
+        DestroyOfflineStream(stream);
+      });
+}
+
+static Napi::Object AudioTaggingComputeWrapper(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() != 3) {
+    std::ostringstream os;
+    os << "Expect 3 arguments. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(
+        env, "You should pass an audio tagging pointer as the first argument")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[1].IsExternal()) {
+    Napi::TypeError::New(
+        env, "You should pass an offline stream pointer as the second argument")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  if (!info[2].IsNumber()) {
+    Napi::TypeError::New(env,
+                         "You should pass an integer as the third argument")
+        .ThrowAsJavaScriptException();
+
+    return {};
+  }
+
+  SherpaOnnxAudioTagging *at =
+      info[0].As<Napi::External<SherpaOnnxAudioTagging>>().Data();
+
+  SherpaOnnxOfflineStream *stream =
+      info[1].As<Napi::External<SherpaOnnxOfflineStream>>().Data();
+
+  int32_t top_k = info[2].As<Napi::Number>().Int32Value();
+
+  const SherpaOnnxAudioEvent *const *events =
+      SherpaOnnxAudioTaggingCompute(at, stream, top_k);
+
+  // The returned array is NULL-terminated; count the events first.
+  auto p = events;
+  int32_t k = 0;
+  while (p && *p) {
+    ++k;
+    ++p;
+  }
+
+  Napi::Array ans = Napi::Array::New(env, k);
+  for (int32_t i = 0; i != k; ++i) {
+    Napi::Object obj = Napi::Object::New(env);
+    obj.Set(Napi::String::New(env, "name"),
+            Napi::String::New(env, events[i]->name));
+    obj.Set(Napi::String::New(env, "index"),
+            Napi::Number::New(env, events[i]->index));
+    obj.Set(Napi::String::New(env, "prob"),
+            Napi::Number::New(env, events[i]->prob));
+    ans[i] = obj;
+  }
+
+  SherpaOnnxAudioTaggingFreeResults(events);
+
+  return ans;
+}
+
+void InitAudioTagging(Napi::Env env, Napi::Object exports) {
+  exports.Set(Napi::String::New(env, "createAudioTagging"),
+              Napi::Function::New(env, CreateAudioTaggingWrapper));
+
+  exports.Set(Napi::String::New(env, "audioTaggingCreateOfflineStream"),
+              Napi::Function::New(env, AudioTaggingCreateOfflineStreamWrapper));
+
+  exports.Set(Napi::String::New(env, "audioTaggingCompute"),
+              Napi::Function::New(env, AudioTaggingComputeWrapper));
+}
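To summarize what the parser above accepts, here is a sketch of the full config shape in JavaScript. Field names are taken from the wrapper code above; the placeholder paths are illustrative, and the tests in this PR set exactly one of the two model fields:

```js
// Shape accepted by createAudioTagging(), per the N-API parsing code above.
// Use either model.zipformer.model or model.ced; debug may be a boolean
// or a number; provider and debug are optional.
const config = {
  model: {
    zipformer: {model: './path/to/zipformer/model.int8.onnx'},
    // ced: './path/to/ced/model.int8.onnx',
    numThreads: 1,
    debug: true,
    provider: 'cpu',
  },
  labels: './path/to/class_labels_indices.csv',
  topK: 5,
};
```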
@@ -19,6 +19,8 @@ void InitSpokenLanguageID(Napi::Env env, Napi::Object exports);

 void InitSpeakerID(Napi::Env env, Napi::Object exports);

+void InitAudioTagging(Napi::Env env, Napi::Object exports);
+
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitStreamingAsr(env, exports);
   InitNonStreamingAsr(env, exports);
@@ -28,6 +30,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
   InitWaveWriter(env, exports);
   InitSpokenLanguageID(env, exports);
   InitSpeakerID(env, exports);
+  InitAudioTagging(env, exports);

   return exports;
 }