Lovemefan
Committed by GitHub

add WebAssembly for Kws (#648)

@@ -23,6 +23,7 @@ option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF) @@ -23,6 +23,7 @@ option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF)
23 option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF) 23 option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF)
24 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) 24 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF)
25 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) 25 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF)
  26 +option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF)
26 option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) 27 option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF)
27 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) 28 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON)
28 option(SHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY "True to link libstdc++ statically. Used only when BUILD_SHARED_LIBS is OFF on Linux" ON) 29 option(SHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY "True to link libstdc++ statically. Used only when BUILD_SHARED_LIBS is OFF on Linux" ON)
@@ -135,6 +136,10 @@ if(SHERPA_ONNX_ENABLE_WASM) @@ -135,6 +136,10 @@ if(SHERPA_ONNX_ENABLE_WASM)
135 add_definitions(-DSHERPA_ONNX_ENABLE_WASM=1) 136 add_definitions(-DSHERPA_ONNX_ENABLE_WASM=1)
136 endif() 137 endif()
137 138
  139 +if(SHERPA_ONNX_ENABLE_WASM_KWS)
  140 + add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1)
  141 +endif()
  142 +
138 if(NOT CMAKE_CXX_STANDARD) 143 if(NOT CMAKE_CXX_STANDARD)
139 set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.") 144 set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
140 endif() 145 endif()
#!/usr/bin/env bash
#
# Configure, build and install the WebAssembly keyword-spotting (KWS) demo.
#
# The build happens in ./build-wasm-simd-kws and cmake is pointed at "..",
# so run this script from the directory that contains the top-level
# CMakeLists.txt.
#
# If the EMSCRIPTEN environment variable is empty, the emscripten directory
# is derived from the location of the emcc executable found in PATH.

# Stop at the first failing command so that a failed cmake/make is not
# followed by an install of stale or missing artifacts.
set -e

if [ -z "$EMSCRIPTEN" ]; then
  if ! command -v emcc &> /dev/null; then
    echo "Please install emscripten first"
    echo ""
    echo "You can use the following commands to install it:"
    echo ""
    echo "git clone https://github.com/emscripten-core/emsdk.git"
    echo "cd emsdk"
    echo "git pull"
    echo "./emsdk install latest"
    echo "./emsdk activate latest"
    echo "source ./emsdk_env.sh"
    exit 1
  else
    # Quote every expansion: the install path may contain spaces.
    EMSCRIPTEN=$(dirname "$(realpath "$(command -v emcc)")")
  fi
fi

export EMSCRIPTEN
echo "EMSCRIPTEN: $EMSCRIPTEN"
if [ ! -f "$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" ]; then
  echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake"
  echo "Please make sure you have installed emsdk correctly"
  exit 1
fi

mkdir -p build-wasm-simd-kws
pushd build-wasm-simd-kws

# wasm/kws/CMakeLists.txt refuses to configure unless this variable is set.
export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON

cmake \
  -DCMAKE_INSTALL_PREFIX=./install \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_TOOLCHAIN_FILE="$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" \
  \
  -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  -DBUILD_SHARED_LIBS=OFF \
  -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  -DSHERPA_ONNX_ENABLE_JNI=OFF \
  -DSHERPA_ONNX_ENABLE_C_API=ON \
  -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
  -DSHERPA_ONNX_ENABLE_GPU=OFF \
  -DSHERPA_ONNX_ENABLE_WASM=ON \
  -DSHERPA_ONNX_ENABLE_WASM_KWS=ON \
  -DSHERPA_ONNX_ENABLE_BINARY=OFF \
  -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \
  ..
make -j8
make install

ls -lh install/bin/wasm
@@ -481,7 +481,7 @@ SherpaOnnxKeywordSpotter* CreateKeywordSpotter( @@ -481,7 +481,7 @@ SherpaOnnxKeywordSpotter* CreateKeywordSpotter(
481 SherpaOnnxKeywordSpotter* spotter = new SherpaOnnxKeywordSpotter; 481 SherpaOnnxKeywordSpotter* spotter = new SherpaOnnxKeywordSpotter;
482 482
483 spotter->impl = 483 spotter->impl =
484 - std::make_unique<sherpa_onnx::KeywordSpotter>(spotter_config); 484 + std::make_unique<sherpa_onnx::KeywordSpotter>(spotter_config);
485 485
486 return spotter; 486 return spotter;
487 } 487 }
@@ -493,7 +493,7 @@ void DestroyKeywordSpotter(SherpaOnnxKeywordSpotter* spotter) { @@ -493,7 +493,7 @@ void DestroyKeywordSpotter(SherpaOnnxKeywordSpotter* spotter) {
493 SherpaOnnxOnlineStream* CreateKeywordStream( 493 SherpaOnnxOnlineStream* CreateKeywordStream(
494 const SherpaOnnxKeywordSpotter* spotter) { 494 const SherpaOnnxKeywordSpotter* spotter) {
495 SherpaOnnxOnlineStream* stream = 495 SherpaOnnxOnlineStream* stream =
496 - new SherpaOnnxOnlineStream(spotter->impl->CreateStream()); 496 + new SherpaOnnxOnlineStream(spotter->impl->CreateStream());
497 return stream; 497 return stream;
498 } 498 }
499 499
@@ -512,7 +512,7 @@ void DecodeMultipleKeywordStreams( @@ -512,7 +512,7 @@ void DecodeMultipleKeywordStreams(
512 int32_t n) { 512 int32_t n) {
513 std::vector<sherpa_onnx::OnlineStream*> ss(n); 513 std::vector<sherpa_onnx::OnlineStream*> ss(n);
514 for (int32_t i = 0; i != n; ++i) { 514 for (int32_t i = 0; i != n; ++i) {
515 - ss[i] = streams[i]->impl.get(); 515 + ss[i] = streams[i]->impl.get();
516 } 516 }
517 spotter->impl->DecodeStreams(ss.data(), n); 517 spotter->impl->DecodeStreams(ss.data(), n);
518 } 518 }
@@ -593,7 +593,6 @@ void DestroyKeywordResult(const SherpaOnnxKeywordResult *r) { @@ -593,7 +593,6 @@ void DestroyKeywordResult(const SherpaOnnxKeywordResult *r) {
593 } 593 }
594 } 594 }
595 595
596 -  
597 // ============================================================ 596 // ============================================================
598 // For VAD 597 // For VAD
599 // ============================================================ 598 // ============================================================
@@ -266,8 +266,14 @@ class KeywordSpotterTransducerImpl : public KeywordSpotterImpl { @@ -266,8 +266,14 @@ class KeywordSpotterTransducerImpl : public KeywordSpotterImpl {
266 } 266 }
267 267
268 void InitKeywords() { 268 void InitKeywords() {
  269 +#ifdef SHERPA_ONNX_ENABLE_WASM_KWS
  270 + // Because of the limitations of the wasm file system, when WASM KWS is
  271 + // enabled config_.keywords_file holds the keywords themselves rather than
  272 + // a file name, so it is parsed directly as a string of keywords.
  273 + std::istringstream is(config_.keywords_file);
  274 + InitKeywords(is);
  275 +#else
269 // each line in keywords_file contains space-separated words 276 // each line in keywords_file contains space-separated words
270 -  
271 std::ifstream is(config_.keywords_file); 277 std::ifstream is(config_.keywords_file);
272 if (!is) { 278 if (!is) {
273 SHERPA_ONNX_LOGE("Open keywords file failed: %s", 279 SHERPA_ONNX_LOGE("Open keywords file failed: %s",
@@ -275,6 +281,7 @@ class KeywordSpotterTransducerImpl : public KeywordSpotterImpl { @@ -275,6 +281,7 @@ class KeywordSpotterTransducerImpl : public KeywordSpotterImpl {
275 exit(-1); 281 exit(-1);
276 } 282 }
277 InitKeywords(is); 283 InitKeywords(is);
  284 +#endif
278 } 285 }
279 286
280 #if __ANDROID_API__ >= 9 287 #if __ANDROID_API__ >= 9
@@ -94,10 +94,17 @@ bool KeywordSpotterConfig::Validate() const { @@ -94,10 +94,17 @@ bool KeywordSpotterConfig::Validate() const {
94 SHERPA_ONNX_LOGE("Please provide --keywords-file."); 94 SHERPA_ONNX_LOGE("Please provide --keywords-file.");
95 return false; 95 return false;
96 } 96 }
  97 +
  98 +#ifndef SHERPA_ONNX_ENABLE_WASM_KWS
  99 + // This existence check is skipped for WASM KWS builds. Because of the
  100 + // limitations of the wasm file system (files are packaged into
  101 + // sherpa-onnx-wasm-kws-main.data), keywords_file is not opened as a file
  102 + // there; it is parsed directly as a string of keywords instead.
97 if (!std::ifstream(keywords_file.c_str()).good()) { 103 if (!std::ifstream(keywords_file.c_str()).good()) {
98 SHERPA_ONNX_LOGE("Keywords file %s does not exist.", keywords_file.c_str()); 104 SHERPA_ONNX_LOGE("Keywords file %s does not exist.", keywords_file.c_str());
99 return false; 105 return false;
100 } 106 }
  107 +#endif
101 108
102 return model_config.Validate(); 109 return model_config.Validate();
103 } 110 }
@@ -2,16 +2,14 @@ @@ -2,16 +2,14 @@
2 // 2 //
3 // Copyright (c) 2023-2024 Xiaomi Corporation 3 // Copyright (c) 2023-2024 Xiaomi Corporation
4 4
5 -#include "sherpa-onnx/csrc/transducer-keyword-decoder.h"  
6 -  
7 #include <algorithm> 5 #include <algorithm>
8 #include <cmath> 6 #include <cmath>
9 -#include <cstring>  
10 #include <utility> 7 #include <utility>
11 #include <vector> 8 #include <vector>
12 9
13 #include "sherpa-onnx/csrc/log.h" 10 #include "sherpa-onnx/csrc/log.h"
14 #include "sherpa-onnx/csrc/onnx-utils.h" 11 #include "sherpa-onnx/csrc/onnx-utils.h"
  12 +#include "sherpa-onnx/csrc/transducer-keyword-decoder.h"
15 13
16 namespace sherpa_onnx { 14 namespace sherpa_onnx {
17 15
@@ -6,6 +6,10 @@ if(SHERPA_ONNX_ENABLE_WASM_ASR) @@ -6,6 +6,10 @@ if(SHERPA_ONNX_ENABLE_WASM_ASR)
6 add_subdirectory(asr) 6 add_subdirectory(asr)
7 endif() 7 endif()
8 8
  9 +if(SHERPA_ONNX_ENABLE_WASM_KWS)
  10 + add_subdirectory(kws)
  11 +endif()
  12 +
9 if(SHERPA_ONNX_ENABLE_WASM_NODEJS) 13 if(SHERPA_ONNX_ENABLE_WASM_NODEJS)
10 add_subdirectory(nodejs) 14 add_subdirectory(nodejs)
11 endif() 15 endif()
# Build rules for the WebAssembly keyword-spotting (KWS) demo.

# build-wasm-simd-kws.sh exports this variable; any other entry point is
# rejected.
if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH})
  message(FATAL_ERROR "Please use ./build-wasm-simd-kws.sh to build for wasm KWS")
endif()

# The model files are preloaded into the generated .data file, so they must
# be present at configure time.
if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/decoder-epoch-12-avg-2-chunk-16-left-64.onnx")
  message(WARNING "${CMAKE_CURRENT_SOURCE_DIR}/assets/decoder-epoch-12-avg-2-chunk-16-left-64.onnx does not exist")
  message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue")
endif()

# C API functions that sherpa-onnx-kws.js calls through Module._<name>.
# DestroyOnlineStream is needed by Stream.free() to release a keyword stream.
set(exported_functions
  AcceptWaveform
  CreateKeywordSpotter
  DestroyKeywordSpotter
  CreateKeywordStream
  DestroyOnlineStream
  DecodeKeywordStream
  GetKeywordResult
  DestroyKeywordResult
  IsKeywordStreamReady
  InputFinished
)

# emscripten expects exported C symbols to be prefixed with an underscore.
set(mangled_exported_functions)
foreach(x IN LISTS exported_functions)
  list(APPEND mangled_exported_functions "_${x}")
endforeach()

list(JOIN mangled_exported_functions "," all_exported_functions)

include_directories(${CMAKE_SOURCE_DIR})
set(MY_FLAGS "-s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=512MB -s ALLOW_MEMORY_GROWTH=1")
string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ")
string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ")
string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ")
string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ")
message(STATUS "MY_FLAGS: ${MY_FLAGS}")

# CMAKE_<LANG>_FLAGS are passed to the compiler driver for both compiling and
# linking, so the emscripten link settings above reach the link step through
# these two variables; no separate linker-flag variable is set.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}")

add_executable(sherpa-onnx-wasm-kws-main sherpa-onnx-wasm-main-kws.cc)
target_link_libraries(sherpa-onnx-wasm-kws-main sherpa-onnx-c-api)
install(TARGETS sherpa-onnx-wasm-kws-main DESTINATION bin/wasm)

# Ship the hand-written page/scripts together with the generated loader,
# wasm binary and preloaded data file.
install(
  FILES
    "sherpa-onnx-kws.js"
    "app.js"
    "index.html"
    "$<TARGET_FILE_DIR:sherpa-onnx-wasm-kws-main>/sherpa-onnx-wasm-kws-main.js"
    "$<TARGET_FILE_DIR:sherpa-onnx-wasm-kws-main>/sherpa-onnx-wasm-kws-main.wasm"
    "$<TARGET_FILE_DIR:sherpa-onnx-wasm-kws-main>/sherpa-onnx-wasm-kws-main.data"
  DESTINATION
    bin/wasm
)
  1 +// This file copies and modifies code
  2 +// from https://mdn.github.io/web-dictaphone/scripts/app.js
  3 +// and https://gist.github.com/meziantou/edb7217fddfbb70e899e
  4 +
// Handles to the page elements this script drives.
const startBtn = document.getElementById('startBtn');
const stopBtn = document.getElementById('stopBtn');
const clearBtn = document.getElementById('clearBtn');
const hint = document.getElementById('hint');
const soundClips = document.getElementById('sound-clips');

let textArea = document.getElementById('results');

// Most recent non-empty detection and the serialized history of detections.
let lastResult = '';
let resultList = [];

// "Clear" forgets every collected result and redraws the text area.
clearBtn.onclick = () => {
  resultList = [];
  textArea.value = getDisplayResult();
  textArea.scrollTop = textArea.scrollHeight;  // auto scroll
};
  21 +
/**
 * Render the collected results as numbered lines ("<index>: <text>\n").
 *
 * Empty entries of resultList are skipped and do not consume an index.
 *
 * @return {string} The text to show in the results area.
 */
function getDisplayResult() {
  let ans = '';
  let i = 0;
  // for...of visits only the array elements; for...in would also walk any
  // enumerable property added to the array or its prototype.
  for (const entry of resultList) {
    if (entry === '') {
      continue;
    }

    ans += '' + i + ': ' + entry + '\n';
    i += 1;
  }

  return ans;
}
  36 +
  37 +
// Configuration object for the generated wasm loader script. It is assigned
// without a declaration on purpose, which makes it a global.
Module = {};

// Once the wasm runtime is ready: update the hint, enable "Start" and build
// the keyword spotter.
Module.onRuntimeInitialized = () => {
  console.log('inited!');
  hint.innerText = 'Model loaded! Please click start';

  startBtn.disabled = false;

  recognizer = createKws(Module);
  console.log('recognizer is created!', recognizer);
};

// Web Audio objects, created after microphone access is granted.
let audioCtx;
let mediaStream;

let expectedSampleRate = 16000;
let recordSampleRate;  // the sampleRate of the microphone
let recorder = null;   // the microphone
let leftchannel = [];  // TODO: Use a single channel

let recordingLength = 0;  // number of samples so far

// The keyword spotter and the stream that audio is fed into.
let recognizer = null;
let recognizer_stream = null;
  61 +
// Ask for the microphone and wire up recording + keyword spotting.
//
// navigator.mediaDevices can be undefined, so it is tested before its
// getUserMedia member; otherwise the feature test itself would throw instead
// of reaching the "not supported" branch.
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
  console.log('getUserMedia supported.');

  // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
  const constraints = {audio: true};

  // Called with the microphone MediaStream once access has been granted.
  let onSuccess = function(stream) {
    if (!audioCtx) {
      audioCtx = new AudioContext({sampleRate: 16000});
    }
    console.log(audioCtx);
    recordSampleRate = audioCtx.sampleRate;
    console.log('sample rate ' + recordSampleRate);

    // creates an audio node from the microphone incoming stream
    mediaStream = audioCtx.createMediaStreamSource(stream);
    console.log('media stream', mediaStream);

    // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
    // bufferSize: the onaudioprocess event is called when the buffer is full
    var bufferSize = 4096;
    var numberOfInputChannels = 1;
    var numberOfOutputChannels = 2;
    if (audioCtx.createScriptProcessor) {
      recorder = audioCtx.createScriptProcessor(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    } else {
      recorder = audioCtx.createJavaScriptNode(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    }
    console.log('recorder', recorder);

    // For every captured buffer: resample, feed the spotter, show any
    // detected keyword, and keep a 16-bit copy for the playback clip.
    recorder.onaudioprocess = function(e) {
      let samples = new Float32Array(e.inputBuffer.getChannelData(0));
      samples = downsampleBuffer(samples, expectedSampleRate);

      if (recognizer_stream == null) {
        recognizer_stream = recognizer.createStream();
      }

      recognizer_stream.acceptWaveform(expectedSampleRate, samples);
      while (recognizer.isReady(recognizer_stream)) {
        recognizer.decode(recognizer_stream);
      }

      let result = recognizer.getResult(recognizer_stream);
      console.log(result);

      if (result.keyword.length > 0) {
        lastResult = result;
        resultList.push(JSON.stringify(result));
      }

      textArea.value = getDisplayResult();
      textArea.scrollTop = textArea.scrollHeight;  // auto scroll

      // Clamp to [-1, 1] and convert to 16-bit PCM.
      let buf = new Int16Array(samples.length);
      for (var i = 0; i < samples.length; ++i) {
        let s = samples[i];
        if (s >= 1)
          s = 1;
        else if (s <= -1)
          s = -1;

        samples[i] = s;
        buf[i] = s * 32767;
      }

      leftchannel.push(buf);
      // Count the samples actually stored; after resampling this can differ
      // from bufferSize.
      recordingLength += samples.length;
    };

    startBtn.onclick = function() {
      mediaStream.connect(recorder);
      recorder.connect(audioCtx.destination);

      console.log('recorder started');

      stopBtn.disabled = false;
      startBtn.disabled = true;
    };

    stopBtn.onclick = function() {
      console.log('recorder stopped');

      // stopBtn recording
      recorder.disconnect(audioCtx.destination);
      mediaStream.disconnect(recorder);

      startBtn.style.background = '';
      startBtn.style.color = '';
      // mediaRecorder.requestData();

      stopBtn.disabled = true;
      startBtn.disabled = false;

      var clipName = new Date().toISOString();

      const clipContainer = document.createElement('article');
      const clipLabel = document.createElement('p');
      const audio = document.createElement('audio');
      const deleteButton = document.createElement('button');
      clipContainer.classList.add('clip');
      audio.setAttribute('controls', '');
      deleteButton.textContent = 'Delete';
      deleteButton.className = 'delete';

      clipLabel.textContent = clipName;

      clipContainer.appendChild(audio);

      clipContainer.appendChild(clipLabel);
      clipContainer.appendChild(deleteButton);
      soundClips.appendChild(clipContainer);

      audio.controls = true;
      let samples = flatten(leftchannel);
      const blob = toWav(samples);

      // The recorded chunks now live in the blob; start the next clip empty.
      leftchannel = [];
      recordingLength = 0;
      const audioURL = window.URL.createObjectURL(blob);
      audio.src = audioURL;
      console.log('recorder stopped');

      deleteButton.onclick = function(e) {
        let evtTgt = e.target;
        evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);
        // Release the blob that backed the removed clip.
        window.URL.revokeObjectURL(audioURL);
      };

      // Clicking the label lets the user rename the clip; cancelling the
      // prompt keeps the current name.
      clipLabel.onclick = function() {
        const existingName = clipLabel.textContent;
        const newClipName = prompt('Enter a new name for your sound clip?');
        if (newClipName === null) {
          clipLabel.textContent = existingName;
        } else {
          clipLabel.textContent = newClipName;
        }
      };
    };
  };

  let onError = function(err) {
    console.log('The following error occured: ' + err);
  };

  navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
} else {
  console.log('getUserMedia not supported on your browser!');
  alert('getUserMedia not supported on your browser!');
}
  214 +
  215 +
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
//
// Concatenate a list of Int16Array chunks into one Int16Array, keeping the
// chunk order.
function flatten(listOfSamples) {
  const total = listOfSamples.reduce((sum, chunk) => sum + chunk.length, 0);
  const ans = new Int16Array(total);

  let writeAt = 0;
  for (const chunk of listOfSamples) {
    ans.set(chunk, writeAt);
    writeAt += chunk.length;
  }
  return ans;
}
  232 +
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
//
// Wrap 16-bit mono PCM samples in a 44-byte WAV header and return the result
// as an 'audio/wav' Blob. The sample rate written to the header is
// expectedSampleRate.
function toWav(samples) {
  let buf = new ArrayBuffer(44 + samples.length * 2);
  var view = new DataView(buf);

  // http://soundfile.sapp.org/doc/WaveFormat/
  //                   F F I R
  view.setUint32(0, 0x46464952, true);               // chunkID
  view.setUint32(4, 36 + samples.length * 2, true);  // chunkSize
  //                   E V A W
  view.setUint32(8, 0x45564157, true);  // format
                                        //
  //                      t m f
  view.setUint32(12, 0x20746d66, true);          // subchunk1ID
  view.setUint32(16, 16, true);                  // subchunk1Size, 16 for PCM
  // audioFormat is a 16-bit field; a 32-bit store here would also write
  // the numChannels field that follows it.
  view.setUint16(20, 1, true);                   // audioFormat, 1 for PCM
  view.setUint16(22, 1, true);                   // numChannels: 1 channel
  view.setUint32(24, expectedSampleRate, true);  // sampleRate
  view.setUint32(28, expectedSampleRate * 2, true);  // byteRate
  view.setUint16(32, 2, true);                       // blockAlign
  view.setUint16(34, 16, true);                      // bitsPerSample
  view.setUint32(36, 0x61746164, true);              // Subchunk2ID
  view.setUint32(40, samples.length * 2, true);      // subchunk2Size

  let offset = 44;
  for (let i = 0; i < samples.length; ++i) {
    view.setInt16(offset, samples[i], true);
    offset += 2;
  }

  return new Blob([view], {type: 'audio/wav'});
}
  266 +
// this function is copied from
// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46
//
// Resample `buffer` from recordSampleRate to exportSampleRate by averaging
// the input samples that fall into each output slot. The input is returned
// unchanged when the two rates are equal.
function downsampleBuffer(buffer, exportSampleRate) {
  if (exportSampleRate === recordSampleRate) {
    return buffer;
  }
  const sampleRateRatio = recordSampleRate / exportSampleRate;
  const newLength = Math.round(buffer.length / sampleRateRatio);
  const result = new Float32Array(newLength);

  let offsetBuffer = 0;
  for (let offsetResult = 0; offsetResult < result.length; ++offsetResult) {
    const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
    let accum = 0;
    let count = 0;
    for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
      accum += buffer[i];
      count++;
    }

    if (count > 0) {
      result[offsetResult] = accum / count;
    } else if (buffer.length > 0) {
      // No input sample maps to this slot (ratio below 1, i.e. the source
      // rate is lower than the target rate). Reuse the nearest input sample
      // instead of storing 0 / 0 = NaN.
      result[offsetResult] = buffer[Math.min(offsetBuffer, buffer.length - 1)];
    }
    offsetBuffer = nextOffsetBuffer;
  }
  return result;
}
  1 +# Introduction
  2 +
  3 +Please refer to
  4 +https://www.modelscope.cn/models/pkufool/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/summary
  5 +to download a model.
  6 +
  7 +# Kws
  8 +
  9 +The following is an example:
  10 +```
  11 +cd sherpa-onnx/wasm/kws
  12 +git clone https://www.modelscope.cn/pkufool/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.git assets
  13 +```
  14 +
  15 +You should have the following files in `assets` before you can run
  16 +`build-wasm-simd-kws.sh`
  17 +
  18 +```
  19 +├── decoder-epoch-12-avg-2-chunk-16-left-64.onnx
  20 +├── encoder-epoch-12-avg-2-chunk-16-left-64.onnx
  21 +├── joiner-epoch-12-avg-2-chunk-16-left-64.onnx
  22 +├── keywords_raw.txt
  23 +├── keywords.txt
  24 +├── README.md
  25 +└── tokens.txt
  26 +
  27 +```
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width" />
  <title>Next-gen Kaldi WebAssembly with sherpa-onnx for kws</title>
  <style>
    h1,div {
      text-align: center;
    }
    textarea {
      width:100%;
    }
  </style>
</head>

<body>
  <h1>
    WebAssembly<br/>
    Kws Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a>
  </h1>
  <div>
    <span id="hint">Loading model ... ...</span>
    <br/>
    <br/>
    <button id="startBtn" disabled>Start</button>
    <button id="stopBtn" disabled>Stop</button>
    <button id="clearBtn">Clear</button>
    <br/>
    <br/>
    <textarea id="results" rows="10" readonly></textarea>
  </div>

  <section flex="1" overflow="auto" id="sound-clips">
  </section>

  <!-- Order matters: app.js assigns the global Module object and uses
       createKws() from sherpa-onnx-kws.js; the generated loader comes last. -->
  <script src="sherpa-onnx-kws.js"></script>
  <script src="app.js"></script>
  <script src="sherpa-onnx-wasm-kws-main.js"></script>
</body>

</html>
  1 +
  2 +
/**
 * Release the wasm heap memory owned by a config object produced by one of
 * the init*Config helpers.
 *
 * @param config {Object} Has a `ptr` field and optionally a `buffer` field,
 *                        both wasm heap addresses.
 * @param Module {Object} The wasm module; only its `_free` is used.
 */
function freeConfig(config, Module) {
  // `buffer` is optional; when present it is released before `ptr`.
  const owned = 'buffer' in config ? [config.buffer, config.ptr] : [config.ptr];
  for (const address of owned) {
    Module._free(address);
  }
}
  9 +
  10 +
/**
 * Build the native transducer model config (three consecutive 4-byte string
 * pointers: encoder, decoder, joiner) on the wasm heap.
 *
 * @param config {Object} Has string fields `encoder`, `decoder`, `joiner`.
 * @param Module {Object} The wasm module.
 * @return {Object} `buffer` (the NUL-terminated strings), `ptr` (the pointer
 *                  table) and `len` (size of the table in bytes). The caller
 *                  frees both allocations.
 */
function initSherpaOnnxOnlineTransducerModelConfig(config, Module) {
  const paths = [config.encoder, config.decoder, config.joiner];
  // +1 for the terminating NUL of each string
  const sizes = paths.map((path) => Module.lengthBytesUTF8(path) + 1);

  const buffer = Module._malloc(sizes.reduce((sum, size) => sum + size, 0));

  const len = paths.length * 4;  // one 4-byte pointer per string
  const ptr = Module._malloc(len);

  // First lay the strings out back to back ...
  const addresses = [];
  let cursor = buffer;
  paths.forEach((path, i) => {
    Module.stringToUTF8(path, cursor, sizes[i]);
    addresses.push(cursor);
    cursor += sizes[i];
  });

  // ... then record where each one starts.
  addresses.forEach((address, i) => {
    Module.setValue(ptr + 4 * i, address, 'i8*');
  });

  return {
    buffer: buffer, ptr: ptr, len: len,
  }
}
  45 +
/**
 * Build the native online model config on the wasm heap.
 *
 * Layout written here (4 bytes per field):
 *   transducer: encoder, decoder, joiner   (3 string pointers)
 *   paraformer: 2 pointers                 (unused, set to null)
 *   zipformer2 ctc: 1 pointer              (unused, set to null)
 *   tokens, numThreads, provider, debug, modelType
 *
 * Every string lives in one buffer owned by the returned object, so freeing
 * `buffer` and `ptr` releases everything this function allocated.
 *
 * @param config {Object} Has `transducer` ({encoder, decoder, joiner}),
 *                        `tokens`, `provider`, `modelType`, `numThreads`
 *                        and `debug`.
 * @param Module {Object} The wasm module.
 * @return {Object} `buffer`, `ptr` and `len`. The user should free the
 *                  returned pointers.
 */
function initModelConfig(config, Module) {
  const transducerLen = 3 * 4;
  const paraformerLen = 2 * 4;
  const ctcLen = 1 * 4;
  const len = transducerLen + paraformerLen + ctcLen + 5 * 4;

  const strings = [
    config.transducer.encoder,
    config.transducer.decoder,
    config.transducer.joiner,
    config.tokens,
    config.provider,
    config.modelType,
  ];
  // +1 for the terminating NUL of each string
  const sizes = strings.map((s) => Module.lengthBytesUTF8(s) + 1);
  const buffer = Module._malloc(sizes.reduce((sum, size) => sum + size, 0));

  // Write the strings back to back and remember where each one starts.
  const addresses = [];
  let cursor = buffer;
  strings.forEach((s, i) => {
    Module.stringToUTF8(s, cursor, sizes[i]);
    addresses.push(cursor);
    cursor += sizes[i];
  });
  const [encoder, decoder, joiner, tokens, provider, modelType] = addresses;

  const ptr = Module._malloc(len);
  let offset = 0;

  for (const address of [encoder, decoder, joiner]) {
    Module.setValue(ptr + offset, address, 'i8*');
    offset += 4;
  }

  // The paraformer and ctc slots are not used for keyword spotting. Null
  // them so the native side never sees whatever bytes malloc returned.
  for (; offset < transducerLen + paraformerLen + ctcLen; offset += 4) {
    Module.setValue(ptr + offset, 0, 'i8*');
  }

  Module.setValue(ptr + offset, tokens, 'i8*');
  offset += 4;

  Module.setValue(ptr + offset, config.numThreads, 'i32');
  offset += 4;

  Module.setValue(ptr + offset, provider, 'i8*');
  offset += 4;

  Module.setValue(ptr + offset, config.debug, 'i32');
  offset += 4;

  Module.setValue(ptr + offset, modelType, 'i8*');
  offset += 4;

  return {
    buffer: buffer, ptr: ptr, len: len,
  }
}
  95 +
/**
 * Build the native feature config (two consecutive 32-bit integers:
 * sampling rate, feature dimension) on the wasm heap.
 *
 * @param config {Object} Has `samplingRate` and `featureDim`.
 * @param Module {Object} The wasm module.
 * @return {Object} `ptr` (heap address) and `len` (8 bytes). The caller
 *                  frees `ptr`.
 */
function initFeatureExtractorConfig(config, Module) {
  const len = 4 * 2;
  const ptr = Module._malloc(len);

  const fields = [config.samplingRate, config.featureDim];
  fields.forEach((value, i) => {
    Module.setValue(ptr + 4 * i, value, 'i32');
  });

  return {ptr: ptr, len: len};
}
  104 +
/**
 * Build the native keyword spotter config on the wasm heap.
 *
 * Layout: feature config, model config, then five 4-byte fields
 * (maxActivePaths, numTrailingBlanks, keywordsScore, keywordsThreshold and
 * a pointer to the keywords string).
 *
 * @param config {Object} Has `featConfig`, `modelConfig`, `maxActivePaths`,
 *                        `numTrailingBlanks`, `keywordsScore`,
 *                        `keywordsThreshold` and `keywords`.
 * @param Module {Object} The wasm module.
 * @return {Object} `ptr`/`len` of the config, `buffer` holding the keywords
 *                  string, plus the nested `featConfig` and `modelConfig`
 *                  objects. Pass each of the three objects to freeConfig()
 *                  when the native config is no longer needed.
 */
function initKwsConfig(config, Module) {
  let featConfig =
      initFeatureExtractorConfig(config.featConfig, Module);

  let modelConfig = initModelConfig(config.modelConfig, Module);
  let numBytes =
      featConfig.len + modelConfig.len + 4 * 5;

  let ptr = Module._malloc(numBytes);
  let offset = 0;
  Module._CopyHeap(featConfig.ptr, featConfig.len, ptr + offset);
  offset += featConfig.len;

  Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset)
  offset += modelConfig.len;

  Module.setValue(ptr + offset, config.maxActivePaths, 'i32');
  offset += 4;

  Module.setValue(ptr + offset, config.numTrailingBlanks, 'i32');
  offset += 4;

  Module.setValue(ptr + offset, config.keywordsScore, 'float');
  offset += 4;

  Module.setValue(ptr + offset, config.keywordsThreshold, 'float');
  offset += 4;

  let keywordsLen = Module.lengthBytesUTF8(config.keywords) + 1;
  let keywordsBuffer = Module._malloc(keywordsLen);
  Module.stringToUTF8(config.keywords, keywordsBuffer, keywordsLen);
  Module.setValue(ptr + offset, keywordsBuffer, 'i8*');
  offset += 4;

  // Expose the keywords string as `buffer` so that freeConfig() releases it
  // together with `ptr`; otherwise it would leak.
  return {
    buffer: keywordsBuffer,
    ptr: ptr,
    len: numBytes,
    featConfig: featConfig,
    modelConfig: modelConfig
  }
}
  144 +
/**
 * A keyword-spotting stream: wraps the native stream handle together with a
 * reusable wasm heap buffer used to pass samples to the native side.
 */
class Stream {
  /**
   * @param handle {Number} Native stream handle.
   * @param Module {Object} The wasm module.
   */
  constructor(handle, Module) {
    this.handle = handle;
    this.pointer = null;  // heap buffer for samples, allocated on demand
    this.n = 0;           // capacity of `pointer`, in samples
    this.Module = Module;
  }

  /**
   * Destroy the native stream and release the sample buffer. Calling it
   * again is a no-op.
   *
   * DestroyOnlineStream has to be listed in the EXPORTED_FUNCTIONS of the
   * wasm build for this call to resolve.
   */
  free() {
    if (this.handle) {
      this.Module._DestroyOnlineStream(this.handle);
      this.handle = null;
      if (this.pointer) {
        this.Module._free(this.pointer);
      }
      this.pointer = null;
      this.n = 0;
    }
  }

  /**
   * @param sampleRate {Number}
   * @param samples {Float32Array} Containing samples in the range [-1, 1]
   */
  acceptWaveform(sampleRate, samples) {
    // Grow the heap buffer only when the incoming chunk does not fit.
    if (this.n < samples.length) {
      if (this.pointer) {
        this.Module._free(this.pointer);
      }
      this.pointer =
          this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT);
      this.n = samples.length
    }

    // HEAPF32 is indexed in floats, hence the division of the byte address.
    this.Module.HEAPF32.set(samples, this.pointer / samples.BYTES_PER_ELEMENT);
    this.Module._AcceptWaveform(
        this.handle, sampleRate, this.pointer, samples.length);
  }

  /** Tell the native stream that no more samples will arrive. */
  inputFinished() {
    // Go through this.Module: a bare _InputFinished only resolves when the
    // generated loader happens to define it as a global.
    this.Module._InputFinished(this.handle);
  }
};
  184 +
/**
 * JavaScript wrapper around the native keyword spotter.
 */
class Kws {
  /**
   * @param configObj {Object} Spotter configuration, see createKws().
   * @param Module {Object} The wasm module.
   */
  constructor(configObj, Module) {
    this.config = configObj;

    const nativeConfig = initKwsConfig(configObj, Module);
    const spotter = Module._CreateKeywordSpotter(nativeConfig.ptr);

    // The heap copies of the config are released once the spotter exists.
    const temporaries =
        [nativeConfig.featConfig, nativeConfig.modelConfig, nativeConfig];
    for (const temporary of temporaries) {
      freeConfig(temporary, Module);
    }

    this.handle = spotter;
    this.Module = Module;
  }

  /** Destroy the native spotter. */
  free() {
    this.Module._DestroyKeywordSpotter(this.handle);
    this.handle = 0
  }

  /** @return {Stream} A new stream bound to this spotter. */
  createStream() {
    const streamHandle = this.Module._CreateKeywordStream(this.handle);
    return new Stream(streamHandle, this.Module);
  }

  /** @return {boolean} True when the native side reports the stream ready. */
  isReady(stream) {
    const ready = this.Module._IsKeywordStreamReady(this.handle, stream.handle);
    return ready === 1;
  }

  /** Run one decoding step on the stream. */
  decode(stream) {
    return this.Module._DecodeKeywordStream(this.handle, stream.handle);
  }

  /**
   * @return {Object} The current result of the stream, parsed from the JSON
   *                  string carried by the native result.
   */
  getResult(stream) {
    const resultPtr = this.Module._GetKeywordResult(this.handle, stream.handle);
    // The JSON string pointer is read at byte offset 24 of the result.
    // NOTE(review): presumably the `json` field of the C result struct with
    // 4-byte pointers; confirm against c-api.h.
    const jsonAddress = this.Module.getValue(resultPtr + 24, 'i8*');
    const text = this.Module.UTF8ToString(jsonAddress);
    this.Module._DestroyKeywordResult(resultPtr);
    return JSON.parse(text);
  }
}
  227 +
/**
 * Create a keyword spotter.
 *
 * @param Module {Object} The wasm module.
 * @param myConfig {Object} Optional. When given (truthy) it replaces the
 *                          built-in default configuration entirely.
 * @return {Kws}
 */
function createKws(Module, myConfig) {
  // Defaults: model files preloaded next to the wasm binary and two sample
  // keywords.
  const defaultConfig = {
    featConfig: {
      samplingRate: 16000,
      featureDim: 80,
    },
    modelConfig: {
      transducer: {
        encoder: './encoder-epoch-12-avg-2-chunk-16-left-64.onnx',
        decoder: './decoder-epoch-12-avg-2-chunk-16-left-64.onnx',
        joiner: './joiner-epoch-12-avg-2-chunk-16-left-64.onnx',
      },
      tokens: './tokens.txt',
      provider: 'cpu',
      modelType: "",
      numThreads: 1,
      debug: 1
    },
    maxActivePaths: 4,
    numTrailingBlanks: 1,
    keywordsScore: 1.0,
    keywordsThreshold: 0.25,
    keywords: "x iǎo ài t óng x ué @小爱同学\n" +
        "j ūn g ē n iú b ī @军哥牛逼"
  };

  return new Kws(myConfig ? myConfig : defaultConfig, Module);
}
  264 +
  265 +if (typeof process == 'object' && typeof process.versions == 'object' &&
  266 + typeof process.versions.node == 'string') {
  267 + module.exports = {
  268 + createKws,
  269 + };
  270 +}
  1 +// wasm/sherpa-onnx-wasm-main-kws.cc
  2 +//
  3 +// Copyright (c) 2024 lovemefan
  4 +#include <stdio.h>
  5 +
  6 +#include <algorithm>
  7 +#include <memory>
  8 +
  9 +#include "sherpa-onnx/c-api/c-api.h"
  10 +
  11 +// see also
  12 +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html
  13 +
  14 +extern "C" {
  15 +
  16 +static_assert(sizeof(SherpaOnnxOnlineTransducerModelConfig) == 3 * 4, "");
  17 +static_assert(sizeof(SherpaOnnxOnlineParaformerModelConfig) == 2 * 4, "");
  18 +static_assert(sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) == 1 * 4, "");
  19 +static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
  20 + sizeof(SherpaOnnxOnlineTransducerModelConfig) +
  21 + sizeof(SherpaOnnxOnlineParaformerModelConfig) +
  22 + sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4,
  23 + "");
  24 +static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
  25 +static_assert(sizeof(SherpaOnnxKeywordSpotterConfig) ==
  26 + sizeof(SherpaOnnxFeatureConfig) +
  27 + sizeof(SherpaOnnxOnlineModelConfig) + 5 * 4,
  28 + "");
  29 +
// Copies num_bytes bytes starting at src to dst.
// sherpa-onnx-kws.js calls it to embed one already-built config struct inside
// a larger one on the wasm heap.
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
  std::copy_n(src, num_bytes, dst);
}
  33 +}
@@ -37,6 +37,14 @@ set(exported_functions @@ -37,6 +37,14 @@ set(exported_functions
37 DecodeMultipleOfflineStreams 37 DecodeMultipleOfflineStreams
38 GetOfflineStreamResult 38 GetOfflineStreamResult
39 DestroyOfflineRecognizerResult 39 DestroyOfflineRecognizerResult
  40 + # online kws
  41 + CreateKeywordSpotter
  42 + DestroyKeywordSpotter
  43 + CreateKeywordStream
  44 + DecodeKeywordStream
  45 + GetKeywordResult
  46 + DestroyKeywordResult
  47 + IsKeywordStreamReady
40 ) 48 )
41 49
42 50
@@ -69,6 +77,7 @@ install( @@ -69,6 +77,7 @@ install(
69 FILES 77 FILES
70 ${CMAKE_SOURCE_DIR}/wasm/asr/sherpa-onnx-asr.js 78 ${CMAKE_SOURCE_DIR}/wasm/asr/sherpa-onnx-asr.js
71 ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js 79 ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js
  80 + ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js
72 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js" 81 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.js"
73 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm" 82 "$<TARGET_FILE_DIR:sherpa-onnx-wasm-nodejs>/sherpa-onnx-wasm-nodejs.wasm"
74 DESTINATION 83 DESTINATION