Fangjun Kuang
Committed by GitHub

Add WebAssembly for ASR (#604)

# Build the sherpa-onnx WebAssembly (SIMD) streaming-ASR demo with an
# English zipformer model, upload the artifact, and publish the demo to
# ModelScope and Hugging Face Spaces.
name: wasm-simd-hf-space-en-asr-zipformer

on:
  release:
    types:
      - published

  workflow_dispatch:

concurrency:
  group: wasm-simd-hf-space-en-asr-zipformer-${{ github.ref }}
  cancel-in-progress: true

jobs:
  wasm-simd-hf-space-en-asr-zipformer:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install emsdk
        uses: mymindstorm/setup-emsdk@v14

      - name: View emsdk version
        shell: bash
        run: |
          emcc -v
          echo "--------------------"
          emcc --check

      # The wasm build embeds the model via --preload-file, so the model
      # files must be placed in wasm/asr/assets before building.
      - name: Download model files
        shell: bash
        run: |
          cd wasm/asr/assets
          ls -lh
          echo "----------"

          wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2
          tar xvf sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2
          rm sherpa-onnx-streaming-zipformer-en-2023-06-21.tar.bz2
          mv sherpa-onnx-streaming-zipformer-en-2023-06-21/encoder-epoch-99-avg-1.int8.onnx encoder.onnx
          mv sherpa-onnx-streaming-zipformer-en-2023-06-21/decoder-epoch-99-avg-1.onnx decoder.onnx
          mv sherpa-onnx-streaming-zipformer-en-2023-06-21/joiner-epoch-99-avg-1.onnx joiner.onnx
          mv sherpa-onnx-streaming-zipformer-en-2023-06-21/tokens.txt ./

          rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-21

          ls -lh

      - name: Build sherpa-onnx for WebAssembly (ASR)
        shell: bash
        run: |
          ./build-wasm-simd-asr.sh

      - name: collect files
        shell: bash
        run: |
          SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

          dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-en-asr-zipformer
          mv build-wasm-simd-asr/install/bin/wasm/asr $dst
          ls -lh $dst
          tar cjfv ${dst}.tar.bz2 ./${dst}

      - name: Upload wasm files
        uses: actions/upload-artifact@v4
        with:
          name: sherpa-onnx-wasm-simd-en-asr-zipformer
          path: ./sherpa-onnx-wasm-simd-*.tar.bz2

      # Retried because the remote git hosts are occasionally flaky.
      - name: Publish to ModelScope
        env:
          MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }}
        uses: nick-fields/retry@v2
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf ms
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en.git ms
            cd ms
            git fetch
            git pull
            git merge -m "merge remote" --ff origin main

            cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .

            git status
            git lfs track "*.data"
            git lfs track "*.wasm"
            ls -lh

            git add .
            git commit -m "update model"
            git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en.git

      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v2
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en huggingface
            cd huggingface
            git fetch
            git pull
            git merge -m "merge remote" --ff origin main

            cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .

            git status
            git lfs track "*.data"
            git lfs track "*.wasm"
            ls -lh

            git add .
            git commit -m "update model"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en main
# Build the sherpa-onnx WebAssembly (SIMD) streaming-ASR demo with a
# bilingual (zh-en) paraformer model and publish it to ModelScope and
# Hugging Face Spaces.
name: wasm-simd-hf-space-zh-en-asr-paraformer

on:
  release:
    types:
      - published

  workflow_dispatch:

concurrency:
  group: wasm-simd-hf-space-zh-en-asr-paraformer-${{ github.ref }}
  cancel-in-progress: true

jobs:
  wasm-simd-hf-space-zh-en-asr-paraformer:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install emsdk
        uses: mymindstorm/setup-emsdk@v14

      - name: View emsdk version
        shell: bash
        run: |
          emcc -v
          echo "--------------------"
          emcc --check

      - name: Download model files
        shell: bash
        run: |
          cd wasm/asr/assets
          ls -lh
          echo "----------"

          # -q keeps the large download out of the log, matching the
          # sibling wasm workflows.
          wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
          tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
          rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2

          mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx encoder.onnx
          mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx decoder.onnx
          mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt ./

          rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en

          ls -lh

          cd ../

          # Switch the demo page from the default zipformer (type = 0)
          # to paraformer (type = 1).
          sed -i.bak s/"type = 0"/"type = 1"/g ./sherpa-onnx.js
          sed -i.bak s/Zipformer/Paraformer/g ./index.html

          git diff

      - name: Build sherpa-onnx for WebAssembly (ASR)
        shell: bash
        run: |
          ./build-wasm-simd-asr.sh

      - name: collect files
        shell: bash
        run: |
          SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

          dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-zh-en-asr-paraformer
          mv build-wasm-simd-asr/install/bin/wasm/asr $dst
          ls -lh $dst
          tar cjfv ${dst}.tar.bz2 ./${dst}

      - name: Upload wasm files
        uses: actions/upload-artifact@v4
        with:
          name: sherpa-onnx-wasm-simd-zh-en-asr-paraformer
          path: ./sherpa-onnx-wasm-simd-*.tar.bz2

      - name: Publish to ModelScope
        env:
          MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }}
        uses: nick-fields/retry@v2
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf ms
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer.git ms
            cd ms
            git fetch
            git pull
            git merge -m "merge remote" --ff origin main

            cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .

            git status
            git lfs track "*.data"
            git lfs track "*.wasm"
            ls -lh

            git add .
            git commit -m "update model"
            git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer.git

      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v2
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer huggingface
            cd huggingface
            git fetch
            git pull
            git merge -m "merge remote" --ff origin main

            cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .

            git status
            git lfs track "*.data"
            git lfs track "*.wasm"
            ls -lh

            git add .
            git commit -m "update model"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer main
# Build the sherpa-onnx WebAssembly (SIMD) streaming-ASR demo with a
# bilingual (zh-en) zipformer model and publish it to ModelScope and
# Hugging Face Spaces.
name: wasm-simd-hf-space-zh-en-asr-zipformer

on:
  release:
    types:
      - published

  workflow_dispatch:

concurrency:
  group: wasm-simd-hf-space-zh-en-asr-zipformer-${{ github.ref }}
  cancel-in-progress: true

jobs:
  wasm-simd-hf-space-zh-en-asr-zipformer:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install emsdk
        uses: mymindstorm/setup-emsdk@v14

      - name: View emsdk version
        shell: bash
        run: |
          emcc -v
          echo "--------------------"
          emcc --check

      - name: Download model files
        shell: bash
        run: |
          cd wasm/asr/assets
          ls -lh
          echo "----------"
          wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
          tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
          rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
          mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx encoder.onnx
          mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx decoder.onnx
          mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx joiner.onnx
          mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ./
          rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/

          ls -lh

      - name: Build sherpa-onnx for WebAssembly (ASR)
        shell: bash
        run: |
          ./build-wasm-simd-asr.sh

      - name: collect files
        shell: bash
        run: |
          SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

          dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-zh-en-asr-zipformer
          mv build-wasm-simd-asr/install/bin/wasm/asr $dst
          ls -lh $dst
          tar cjfv ${dst}.tar.bz2 ./${dst}

      - name: Upload wasm files
        uses: actions/upload-artifact@v4
        with:
          name: sherpa-onnx-wasm-simd-zh-en-asr-zipformer
          path: ./sherpa-onnx-wasm-simd-*.tar.bz2

      - name: Publish to ModelScope
        env:
          MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }}
        uses: nick-fields/retry@v2
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf ms
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en.git ms
            cd ms
            git fetch
            git pull
            git merge -m "merge remote" --ff origin main

            cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .

            git status
            git lfs track "*.data"
            git lfs track "*.wasm"
            ls -lh

            git add .
            git commit -m "update model"
            git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en.git

      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v2
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2)

            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en huggingface
            cd huggingface
            git fetch
            git pull
            git merge -m "merge remote" --ff origin main

            cp -v ../sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-*/* .

            git status
            git lfs track "*.data"
            git lfs track "*.wasm"
            ls -lh

            git add .
            git commit -m "update model"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en main
@@ -81,3 +81,6 @@ vits-piper-en_US-amy-low @@ -81,3 +81,6 @@ vits-piper-en_US-amy-low
81 vits-piper-*-*-* 81 vits-piper-*-*-*
82 log 82 log
83 *.exe 83 *.exe
  84 +vits-piper-*
  85 +vits-coqui-*
  86 +vits-mms-*
@@ -22,6 +22,7 @@ option(SHERPA_ONNX_ENABLE_WEBSOCKET "Whether to build webscoket server/client" O @@ -22,6 +22,7 @@ option(SHERPA_ONNX_ENABLE_WEBSOCKET "Whether to build webscoket server/client" O
22 option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF) 22 option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF)
23 option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF) 23 option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF)
24 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) 24 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF)
  25 +option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF)
25 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) 26 option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON)
26 option(SHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY "True to link libstdc++ statically. Used only when BUILD_SHARED_LIBS is OFF on Linux" ON) 27 option(SHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY "True to link libstdc++ statically. Used only when BUILD_SHARED_LIBS is OFF on Linux" ON)
27 28
@@ -106,10 +107,17 @@ message(STATUS "SHERPA_ONNX_ENABLE_WEBSOCKET ${SHERPA_ONNX_ENABLE_WEBSOCKET}") @@ -106,10 +107,17 @@ message(STATUS "SHERPA_ONNX_ENABLE_WEBSOCKET ${SHERPA_ONNX_ENABLE_WEBSOCKET}")
106 message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}") 107 message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}")
107 message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}") 108 message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}")
108 message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") 109 message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}")
  110 +message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}")
109 111
110 if(SHERPA_ONNX_ENABLE_WASM_TTS) 112 if(SHERPA_ONNX_ENABLE_WASM_TTS)
111 if(NOT SHERPA_ONNX_ENABLE_WASM) 113 if(NOT SHERPA_ONNX_ENABLE_WASM)
112 - message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for tts") 114 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for TTS")
  115 + endif()
  116 +endif()
  117 +
  118 +if(SHERPA_ONNX_ENABLE_WASM_ASR)
  119 + if(NOT SHERPA_ONNX_ENABLE_WASM)
  120 + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for ASR")
113 endif() 121 endif()
114 endif() 122 endif()
115 123
#!/usr/bin/env bash
# Copyright (c) 2024 Xiaomi Corporation
#
# This script is to build sherpa-onnx for WebAssembly (ASR)
#
# It locates emsdk (via $EMSCRIPTEN or the emcc on PATH), then configures
# and builds into ./build-wasm-simd-asr with the ASR wasm target enabled.

set -ex

if [ -z "$EMSCRIPTEN" ]; then
  if ! command -v emcc &> /dev/null; then
    echo "Please install emscripten first"
    echo ""
    echo "You can use the following commands to install it:"
    echo ""
    echo "git clone https://github.com/emscripten-core/emsdk.git"
    echo "cd emsdk"
    echo "git pull"
    echo "./emsdk install latest"
    echo "./emsdk activate latest"
    echo "source ./emsdk_env.sh"
    exit 1
  else
    # Derive the emsdk root from the location of the emcc binary.
    EMSCRIPTEN=$(dirname $(realpath $(which emcc)))
  fi
fi

export EMSCRIPTEN=$EMSCRIPTEN
echo "EMSCRIPTEN: $EMSCRIPTEN"
# Quote the path so directories containing spaces do not break the test.
if [ ! -f "$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" ]; then
  echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake"
  echo "Please make sure you have installed emsdk correctly"
  exit 1
fi

mkdir -p build-wasm-simd-asr
pushd build-wasm-simd-asr

# Consumed by wasm/asr/CMakeLists.txt to reject direct cmake invocations.
export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON

cmake \
  -DCMAKE_INSTALL_PREFIX=./install \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_TOOLCHAIN_FILE="$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" \
  \
  -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
  -DSHERPA_ONNX_ENABLE_TESTS=OFF \
  -DSHERPA_ONNX_ENABLE_CHECK=OFF \
  -DBUILD_SHARED_LIBS=OFF \
  -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
  -DSHERPA_ONNX_ENABLE_JNI=OFF \
  -DSHERPA_ONNX_ENABLE_C_API=ON \
  -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \
  -DSHERPA_ONNX_ENABLE_GPU=OFF \
  -DSHERPA_ONNX_ENABLE_WASM=ON \
  -DSHERPA_ONNX_ENABLE_WASM_ASR=ON \
  -DSHERPA_ONNX_ENABLE_BINARY=OFF \
  -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \
  ..
make -j2
make install

ls -lh install/bin/wasm/asr
1 #!/usr/bin/env bash 1 #!/usr/bin/env bash
2 # Copyright (c) 2024 Xiaomi Corporation 2 # Copyright (c) 2024 Xiaomi Corporation
3 # 3 #
4 -# This script is to build sherpa-onnx for WebAssembly 4 +# This script is to build sherpa-onnx for WebAssembly (TTS)
5 5
6 set -ex 6 set -ex
7 7
@@ -37,7 +37,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] @@ -37,7 +37,7 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
37 37
38 and if you want to select card 3 and the device 0 on that card, please use: 38 and if you want to select card 3 and the device 0 on that card, please use:
39 39
40 - hw:3,0 40 + plughw:3,0
41 41
42 )"; 42 )";
43 43
@@ -107,11 +107,12 @@ void OnlineZipformerTransducerModel::InitEncoder(void *model_data, @@ -107,11 +107,12 @@ void OnlineZipformerTransducerModel::InitEncoder(void *model_data,
107 107
108 if (config_.debug) { 108 if (config_.debug) {
109 auto print = [](const std::vector<int32_t> &v, const char *name) { 109 auto print = [](const std::vector<int32_t> &v, const char *name) {
110 - fprintf(stderr, "%s: ", name); 110 + std::ostringstream os;
  111 + os << name << ": ";
111 for (auto i : v) { 112 for (auto i : v) {
112 - fprintf(stderr, "%d ", i); 113 + os << i << " ";
113 } 114 }
114 - fprintf(stderr, "\n"); 115 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
115 }; 116 };
116 print(encoder_dims_, "encoder_dims"); 117 print(encoder_dims_, "encoder_dims");
117 print(attention_dims_, "attention_dims"); 118 print(attention_dims_, "attention_dims");
@@ -282,11 +282,12 @@ class OnlineZipformer2CtcModel::Impl { @@ -282,11 +282,12 @@ class OnlineZipformer2CtcModel::Impl {
282 282
283 if (config_.debug) { 283 if (config_.debug) {
284 auto print = [](const std::vector<int32_t> &v, const char *name) { 284 auto print = [](const std::vector<int32_t> &v, const char *name) {
285 - fprintf(stderr, "%s: ", name); 285 + std::ostringstream os;
  286 + os << name << ": ";
286 for (auto i : v) { 287 for (auto i : v) {
287 - fprintf(stderr, "%d ", i); 288 + os << i << " ";
288 } 289 }
289 - fprintf(stderr, "\n"); 290 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
290 }; 291 };
291 print(encoder_dims_, "encoder_dims"); 292 print(encoder_dims_, "encoder_dims");
292 print(query_head_dims_, "query_head_dims"); 293 print(query_head_dims_, "query_head_dims");
@@ -111,11 +111,12 @@ void OnlineZipformer2TransducerModel::InitEncoder(void *model_data, @@ -111,11 +111,12 @@ void OnlineZipformer2TransducerModel::InitEncoder(void *model_data,
111 111
112 if (config_.debug) { 112 if (config_.debug) {
113 auto print = [](const std::vector<int32_t> &v, const char *name) { 113 auto print = [](const std::vector<int32_t> &v, const char *name) {
114 - fprintf(stderr, "%s: ", name); 114 + std::ostringstream os;
  115 + os << name << ": ";
115 for (auto i : v) { 116 for (auto i : v) {
116 - fprintf(stderr, "%d ", i); 117 + os << i << " ";
117 } 118 }
118 - fprintf(stderr, "\n"); 119 + SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
119 }; 120 };
120 print(encoder_dims_, "encoder_dims"); 121 print(encoder_dims_, "encoder_dims");
121 print(query_head_dims_, "query_head_dims"); 122 print(query_head_dims_, "query_head_dims");
@@ -54,10 +54,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] @@ -54,10 +54,6 @@ card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
54 54
55 and if you want to select card 3 and the device 0 on that card, please use: 55 and if you want to select card 3 and the device 0 on that card, please use:
56 56
57 - hw:3,0  
58 -  
59 -or  
60 -  
61 plughw:3,0 57 plughw:3,0
62 58
63 as the device_name. 59 as the device_name.
# Dispatch to the wasm demo that was enabled at configure time.
if(SHERPA_ONNX_ENABLE_WASM_TTS)
  add_subdirectory(tts)
endif()

if(SHERPA_ONNX_ENABLE_WASM_ASR)
  add_subdirectory(asr)
endif()
# Build the WebAssembly ASR demo. Must be driven by ./build-wasm-simd-asr.sh
# so that the required cache variables and environment are set up.
if(NOT $ENV{SHERPA_ONNX_IS_USING_BUILD_WASM_SH})
  message(FATAL_ERROR "Please use ./build-wasm-simd-asr.sh to build for wasm ASR")
endif()

# The model files are embedded via --preload-file; fail early if they are
# missing instead of producing a broken .data bundle.
if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets/encoder.onnx")
  message(FATAL_ERROR "Please read ${CMAKE_CURRENT_SOURCE_DIR}/assets/README.md before you continue")
endif()

# C symbols that JavaScript (sherpa-onnx.js) calls via ccall.
set(exported_functions
  MyPrint
  # online ASR
  AcceptWaveform
  CreateOnlineRecognizer
  CreateOnlineStream
  DecodeOnlineStream
  DestroyOnlineRecognizer
  DestroyOnlineRecognizerResult
  DestroyOnlineStream
  GetOnlineStreamResult
  InputFinished
  IsEndpoint
  IsOnlineStreamReady
  Reset
  #
)
# Emscripten exports C symbols with a leading underscore.
set(mangled_exported_functions)
foreach(x IN LISTS exported_functions)
  list(APPEND mangled_exported_functions "_${x}")
endforeach()
list(JOIN mangled_exported_functions "," all_exported_functions)

include_directories(${CMAKE_SOURCE_DIR})
set(MY_FLAGS " -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=512MB -s ALLOW_MEMORY_GROWTH=1")
string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") # 10MB
string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ")
string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ")
string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue'] ")

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}")
# Fixed typo: CMAKE_EXECUTBLE_LINKER_FLAGS is not a CMake variable; the
# canonical one (read at link time for executables) is CMAKE_EXE_LINKER_FLAGS.
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MY_FLAGS}")

if (NOT CMAKE_EXECUTABLE_SUFFIX STREQUAL ".js")
  message(FATAL_ERROR "The default suffix for building executables should be .js!")
endif()
# set(CMAKE_EXECUTABLE_SUFFIX ".html")

add_executable(sherpa-onnx-wasm-asr-main sherpa-onnx-wasm-asr-main.cc)
target_link_libraries(sherpa-onnx-wasm-asr-main sherpa-onnx-c-api)
install(TARGETS sherpa-onnx-wasm-asr-main DESTINATION bin/wasm/asr)

install(
  FILES
  "$<TARGET_FILE_DIR:sherpa-onnx-wasm-asr-main>/sherpa-onnx-wasm-asr-main.js"
  "index.html"
  "sherpa-onnx.js"
  "app.js"
  "$<TARGET_FILE_DIR:sherpa-onnx-wasm-asr-main>/sherpa-onnx-wasm-asr-main.wasm"
  "$<TARGET_FILE_DIR:sherpa-onnx-wasm-asr-main>/sherpa-onnx-wasm-asr-main.data"
  DESTINATION
  bin/wasm/asr
)
// This file copies and modifies code
// from https://mdn.github.io/web-dictaphone/scripts/app.js
// and https://gist.github.com/meziantou/edb7217fddfbb70e899e

// UI elements of the demo page (ids must match index.html).
const startBtn = document.getElementById('startBtn');
const stopBtn = document.getElementById('stopBtn');
const clearBtn = document.getElementById('clearBtn');
const hint = document.getElementById('hint');
const soundClips = document.getElementById('sound-clips');

let textArea = document.getElementById('results');

// Recognition state: the in-progress utterance and all finished utterances.
let lastResult = '';
let resultList = [];

// Clear all finalized results and refresh the display.
clearBtn.onclick = function() {
  resultList = [];
  textArea.value = getDisplayResult();
  textArea.scrollTop = textArea.scrollHeight;  // auto scroll
};
  21 +
// Render the recognition results for the text area: finalized utterances
// from resultList (empty entries skipped), numbered from 0, followed by
// the in-progress utterance in lastResult (if any), one per line.
function getDisplayResult() {
  let i = 0;
  let ans = '';
  for (let s in resultList) {
    if (resultList[s] == '') {
      continue;
    }

    ans += '' + i + ': ' + resultList[s] + '\n';
    i += 1;
  }

  if (lastResult.length > 0) {
    ans += '' + i + ': ' + lastResult + '\n';
  }
  return ans;
}
  39 +
  40 +
// Emscripten hooks: once the wasm runtime is ready, create the recognizer
// (createRecognizer is defined in sherpa-onnx.js) and enable the UI.
Module = {};
Module.onRuntimeInitialized = function() {
  console.log('inited!');
  hint.innerText = 'Model loaded! Please click start';

  startBtn.disabled = false;

  recognizer = createRecognizer();
  console.log('recognizer is created!', recognizer);
};

// Audio-capture state.
let audioCtx;
let mediaStream;

let expectedSampleRate = 16000;
let recordSampleRate;    // the sampleRate of the microphone
let recorder = null;     // the microphone
let leftchannel = [];    // TODO: Use a single channel

let recordingLength = 0;  // number of samples so far

let recognizer = null;
let recognizer_stream = null;
  64 +
// Microphone capture pipeline: grab audio via getUserMedia, feed it to the
// streaming recognizer, and keep the raw samples so a WAV clip can be
// produced when recording stops.
if (navigator.mediaDevices.getUserMedia) {
  console.log('getUserMedia supported.');

  // see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
  const constraints = {audio: true};

  let onSuccess = function(stream) {
    if (!audioCtx) {
      audioCtx = new AudioContext({sampleRate: 16000});
    }
    console.log(audioCtx);
    recordSampleRate = audioCtx.sampleRate;
    console.log('sample rate ' + recordSampleRate);

    // creates an audio node from the microphone incoming stream
    mediaStream = audioCtx.createMediaStreamSource(stream);
    console.log('media stream', mediaStream);

    // https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
    // bufferSize: the onaudioprocess event is called when the buffer is full
    var bufferSize = 4096;
    var numberOfInputChannels = 1;
    var numberOfOutputChannels = 2;
    if (audioCtx.createScriptProcessor) {
      recorder = audioCtx.createScriptProcessor(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    } else {
      // Fallback for very old browsers without createScriptProcessor.
      recorder = audioCtx.createJavaScriptNode(
          bufferSize, numberOfInputChannels, numberOfOutputChannels);
    }
    console.log('recorder', recorder);

    recorder.onaudioprocess = function(e) {
      let samples = new Float32Array(e.inputBuffer.getChannelData(0))
      samples = downsampleBuffer(samples, expectedSampleRate);

      // Lazily create the stream on the first audio callback.
      if (recognizer_stream == null) {
        recognizer_stream = recognizer.createStream();
      }

      recognizer_stream.acceptWaveform(expectedSampleRate, samples);
      while (recognizer.isReady(recognizer_stream)) {
        recognizer.decode(recognizer_stream);
      }

      let isEndpoint = recognizer.isEndpoint(recognizer_stream);
      let result = recognizer.getResult(recognizer_stream);


      if (result.length > 0 && lastResult != result) {
        lastResult = result;
      }

      // On an endpoint, finalize the current utterance and reset the stream.
      if (isEndpoint) {
        if (lastResult.length > 0) {
          resultList.push(lastResult);
          lastResult = '';
        }
        recognizer.reset(recognizer_stream);
      }

      textArea.value = getDisplayResult();
      textArea.scrollTop = textArea.scrollHeight;  // auto scroll

      // Convert float samples in [-1, 1] to 16-bit PCM for the WAV clip.
      let buf = new Int16Array(samples.length);
      for (var i = 0; i < samples.length; ++i) {
        let s = samples[i];
        if (s >= 1)
          s = 1;
        else if (s <= -1)
          s = -1;

        samples[i] = s;
        buf[i] = s * 32767;
      }

      leftchannel.push(buf);
      recordingLength += bufferSize;
    };

    startBtn.onclick = function() {
      mediaStream.connect(recorder);
      recorder.connect(audioCtx.destination);

      console.log('recorder started');

      stopBtn.disabled = false;
      startBtn.disabled = true;
    };

    stopBtn.onclick = function() {
      console.log('recorder stopped');

      // stopBtn recording
      recorder.disconnect(audioCtx.destination);
      mediaStream.disconnect(recorder);

      startBtn.style.background = '';
      startBtn.style.color = '';
      // mediaRecorder.requestData();

      stopBtn.disabled = true;
      startBtn.disabled = false;

      var clipName = new Date().toISOString();

      // Build a playable audio clip from the recorded samples.
      const clipContainer = document.createElement('article');
      const clipLabel = document.createElement('p');
      const audio = document.createElement('audio');
      const deleteButton = document.createElement('button');
      clipContainer.classList.add('clip');
      audio.setAttribute('controls', '');
      deleteButton.textContent = 'Delete';
      deleteButton.className = 'delete';

      clipLabel.textContent = clipName;

      clipContainer.appendChild(audio);

      clipContainer.appendChild(clipLabel);
      clipContainer.appendChild(deleteButton);
      soundClips.appendChild(clipContainer);

      audio.controls = true;
      let samples = flatten(leftchannel);
      const blob = toWav(samples);

      leftchannel = [];
      const audioURL = window.URL.createObjectURL(blob);
      audio.src = audioURL;
      console.log('recorder stopped');

      deleteButton.onclick = function(e) {
        let evtTgt = e.target;
        evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);
      };

      clipLabel.onclick = function() {
        const existingName = clipLabel.textContent;
        const newClipName = prompt('Enter a new name for your sound clip?');
        if (newClipName === null) {
          clipLabel.textContent = existingName;
        } else {
          clipLabel.textContent = newClipName;
        }
      };
    };
  };

  let onError = function(err) {
    // Fixed spelling of the log message ("occured" -> "occurred").
    console.log('The following error occurred: ' + err);
  };

  navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
} else {
  console.log('getUserMedia not supported on your browser!');
  alert('getUserMedia not supported on your browser!');
}
  223 +
  224 +
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
/**
 * Concatenate a list of Int16Array chunks into one contiguous Int16Array.
 * @param listOfSamples {Array<Int16Array>} recorded chunks, in order
 * @returns {Int16Array} all samples joined back to back
 */
function flatten(listOfSamples) {
  let total = 0;
  for (const chunk of listOfSamples) {
    total += chunk.length;
  }

  const merged = new Int16Array(total);
  let pos = 0;
  for (const chunk of listOfSamples) {
    merged.set(chunk, pos);
    pos += chunk.length;
  }
  return merged;
}
  241 +
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
/**
 * Wrap 16-bit PCM samples into a mono WAV container.
 * Header layout: http://soundfile.sapp.org/doc/WaveFormat/
 * NOTE(review): reads the file-level `expectedSampleRate` global.
 * @param samples {Int16Array}
 * @returns {Blob} a blob with MIME type 'audio/wav'
 */
function toWav(samples) {
  const dataBytes = samples.length * 2;  // 16 bit -> 2 bytes per sample
  const buf = new ArrayBuffer(44 + dataBytes);
  const view = new DataView(buf);

  // All multi-byte fields in a WAV header are little-endian.
  view.setUint32(0, 0x46464952, true);      // chunkID: 'RIFF' ("F F I R" LE)
  view.setUint32(4, 36 + dataBytes, true);  // chunkSize
  view.setUint32(8, 0x45564157, true);      // format: 'WAVE'
  view.setUint32(12, 0x20746d66, true);     // subchunk1ID: 'fmt '
  view.setUint32(16, 16, true);             // subchunk1Size, 16 for PCM
  view.setUint16(20, 1, true);              // audioFormat, 1 for PCM
  view.setUint16(22, 1, true);              // numChannels: 1 channel
  view.setUint32(24, expectedSampleRate, true);      // sampleRate
  view.setUint32(28, expectedSampleRate * 2, true);  // byteRate
  view.setUint16(32, 2, true);              // blockAlign: 1 ch * 2 bytes
  view.setUint16(34, 16, true);             // bitsPerSample
  view.setUint32(36, 0x61746164, true);     // subchunk2ID: 'data'
  view.setUint32(40, dataBytes, true);      // subchunk2Size

  // PCM payload follows the 44-byte header.
  let pos = 44;
  for (const s of samples) {
    view.setInt16(pos, s, true);
    pos += 2;
  }

  return new Blob([view], {type: 'audio/wav'});
}
  275 +
// this function is copied from
// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46
/**
 * Resample audio from `recordSampleRate` (file-level global) down to
 * `exportSampleRate` by averaging all input samples that fall into each
 * output slot.
 * @param buffer {Float32Array} input samples
 * @param exportSampleRate {Number} target sample rate
 * @returns {Float32Array} resampled audio (the input itself if rates match)
 */
function downsampleBuffer(buffer, exportSampleRate) {
  if (exportSampleRate === recordSampleRate) {
    return buffer;
  }

  const ratio = recordSampleRate / exportSampleRate;
  const out = new Float32Array(Math.round(buffer.length / ratio));

  let srcStart = 0;
  for (let dst = 0; dst < out.length; ++dst) {
    const srcEnd = Math.round((dst + 1) * ratio);

    // Average every input sample belonging to this output slot.
    let sum = 0;
    let count = 0;
    for (let i = srcStart; i < srcEnd && i < buffer.length; ++i) {
      sum += buffer[i];
      count++;
    }
    out[dst] = sum / count;

    srcStart = srcEnd;
  }
  return out;
}
  1 +# Introduction
  2 +
  3 +Please refer to
  4 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  5 +or
  6 +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
  7 +to download a model.
  8 +
  9 +# Streaming ASR
  10 +
  11 +## Transducer
  12 +```bash
  13 +cd sherpa-onnx/wasm/asr/assets
  14 +
  15 +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
  16 +tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
  17 +rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
  18 +
  19 +# Note it is not an error that we rename encoder.int8.onnx to encoder.onnx
  20 +
  21 +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx encoder.onnx
  22 +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx decoder.onnx
  23 +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx joiner.onnx
  24 +mv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ./
  25 +rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/
  26 +
  27 +cd ../../..
  28 +
  29 +./build-wasm-simd-asr.sh
  30 +```
  31 +
  32 +You should have the following files in `assets` before you can run
  33 +`build-wasm-simd-asr.sh`
  34 +
  35 +```
  36 +assets fangjun$ tree -L 1
  37 +.
  38 +├── README.md
  39 +├── decoder.onnx
  40 +├── encoder.onnx
  41 +├── joiner.onnx
  42 +└── tokens.txt
  43 +
  44 +0 directories, 5 files
  45 +```
  46 +
  47 +## Paraformer
  48 +
  49 +```
  50 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
  51 +tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
  52 +rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
  53 +
  54 +mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx encoder.onnx
  55 +mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx decoder.onnx
  56 +mv sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt ./
  57 +
  58 +rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
  59 +
  60 +cd ../
  61 +
  62 +sed -i.bak s/"type = 0"/"type = 1"/g ./sherpa-onnx.js
  63 +sed -i.bak s/Zipformer/Paraformer/g ./index.html
  64 +
  65 +cd ../..
  66 +
  67 +./build-wasm-simd-asr.sh
  68 +```
  69 +
  70 +You should have the following files in `assets` before you can run
  71 +`build-wasm-simd-asr.sh`
  72 +
  73 +```
  74 +assets fangjun$ tree -L 1
  75 +.
  76 +├── README.md
  77 +├── decoder.onnx
  78 +├── encoder.onnx
  79 +└── tokens.txt
  80 +
  81 +0 directories, 4 files
  82 +```
  1 +<html lang="en">
  2 +
  3 +<head>
  4 + <meta charset="utf-8">
  5 + <meta name="viewport" content="width=device-width" />
  <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Speech Recognition</title>
  7 + <style>
  8 + h1,div {
  9 + text-align: center;
  10 + }
  11 + textarea {
  12 + width:100%;
  13 + }
  14 + </style>
  15 +</head>
  16 +
  17 +<body>
  18 + <h1>
  19 + Next-gen Kaldi + WebAssembly<br/>
  20 + ASR Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a><br/>
  21 + (with Zipformer)
  22 + </h1>
  23 +
  24 + <div>
  25 + <span id="hint">Loading model ... ...</span>
  26 + <br/>
  27 + <br/>
  28 + <button id="startBtn" disabled>Start</button>
  29 + <button id="stopBtn" disabled>Stop</button>
  30 + <button id="clearBtn">Clear</button>
  31 + <br/>
  32 + <br/>
  33 + <textarea id="results" rows="10" readonly></textarea>
  34 + </div>
  35 +
  36 + <section flex="1" overflow="auto" id="sound-clips">
  37 + </section>
  38 +
  39 + <script src="sherpa-onnx.js"></script>
  40 + <script src="app.js"></script>
  41 + <script src="sherpa-onnx-wasm-asr-main.js"></script>
  42 +</body>
  1 +// wasm/sherpa-onnx-wasm-asr-main.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +#include <stdio.h>
  5 +
  6 +#include <algorithm>
  7 +#include <memory>
  8 +
  9 +#include "sherpa-onnx/c-api/c-api.h"
  10 +
  11 +// see also
  12 +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html
  13 +
  14 +extern "C" {
  15 +
// These asserts pin the exact byte layout of the C-API config structs.
// The JavaScript side (sherpa-onnx.js) assembles the structs by hand on
// the Emscripten heap using hard-coded 4-byte field offsets, so any
// change to the C structs must fail loudly here at compile time.
static_assert(sizeof(SherpaOnnxOnlineTransducerModelConfig) == 3 * 4, "");
static_assert(sizeof(SherpaOnnxOnlineParaformerModelConfig) == 2 * 4, "");
static_assert(sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) == 1 * 4, "");
// Model config = the three nested model configs plus 5 more 4-byte
// fields (tokens, num_threads, provider, debug, model_type).
static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
              sizeof(SherpaOnnxOnlineTransducerModelConfig) +
                  sizeof(SherpaOnnxOnlineParaformerModelConfig) +
                  sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4,
              "");
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
// Recognizer config = feat + model plus 8 more 4-byte fields.
static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) ==
              sizeof(SherpaOnnxFeatureConfig) +
                  sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4,
              "");
  29 +
  30 +void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
  31 + auto model_config = &config->model_config;
  32 + auto feat = &config->feat_config;
  33 + auto transducer_model_config = &model_config->transducer;
  34 + auto paraformer_model_config = &model_config->paraformer;
  35 + auto ctc_model_config = &model_config->zipformer2_ctc;
  36 +
  37 + fprintf(stdout, "----------online transducer model config----------\n");
  38 + fprintf(stdout, "encoder: %s\n", transducer_model_config->encoder);
  39 + fprintf(stdout, "decoder: %s\n", transducer_model_config->decoder);
  40 + fprintf(stdout, "joiner: %s\n", transducer_model_config->joiner);
  41 +
  42 + fprintf(stdout, "----------online parformer model config----------\n");
  43 + fprintf(stdout, "encoder: %s\n", paraformer_model_config->encoder);
  44 + fprintf(stdout, "decoder: %s\n", paraformer_model_config->decoder);
  45 +
  46 + fprintf(stdout, "----------online ctc model config----------\n");
  47 + fprintf(stdout, "model: %s\n", ctc_model_config->model);
  48 + fprintf(stdout, "tokens: %s\n", model_config->tokens);
  49 + fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
  50 + fprintf(stdout, "provider: %s\n", model_config->provider);
  51 + fprintf(stdout, "debug: %d\n", model_config->debug);
  52 + fprintf(stdout, "model type: %s\n", model_config->model_type);
  53 +
  54 + fprintf(stdout, "----------feat config----------\n");
  55 + fprintf(stdout, "sample rate: %d\n", feat->sample_rate);
  56 + fprintf(stdout, "feat dim: %d\n", feat->feature_dim);
  57 +
  58 + fprintf(stdout, "----------recognizer config----------\n");
  59 + fprintf(stdout, "decoding method: %s\n", config->decoding_method);
  60 + fprintf(stdout, "max active paths: %d\n", config->max_active_paths);
  61 + fprintf(stdout, "enable_endpoint: %d\n", config->enable_endpoint);
  62 + fprintf(stdout, "rule1_min_trailing_silence: %.2f\n",
  63 + config->rule1_min_trailing_silence);
  64 + fprintf(stdout, "rule2_min_trailing_silence: %.2f\n",
  65 + config->rule2_min_trailing_silence);
  66 + fprintf(stdout, "rule3_min_utterance_length: %.2f\n",
  67 + config->rule3_min_utterance_length);
  68 + fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file);
  69 + fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score);
  70 +}
  71 +
// Copy `num_bytes` bytes from `src` to `dst`.  Exported to the
// JavaScript side (as _CopyHeap) so nested config structs can be
// assembled in place on the Emscripten heap.
void CopyHeap(const char *src, int32_t num_bytes, char *dst) {
  for (int32_t i = 0; i != num_bytes; ++i) {
    dst[i] = src[i];
  }
}
  75 +}
/**
 * Recursively release the WASM-heap memory owned by a config object
 * returned from one of the initSherpaOnnx*Config() helpers.
 *
 * Each such object owns `ptr` (the serialized struct), optionally
 * `buffer` (the string pool its char* fields point into), and optionally
 * nested child config objects.
 * @param config {Object}
 */
function freeConfig(config) {
  if ('buffer' in config) {
    _free(config.buffer);
  }

  // Release nested configs before the struct that references them.
  for (const key of ['config', 'transducer', 'paraformer', 'ctc', 'feat',
                     'model']) {
    if (key in config) {
      freeConfig(config[key]);
    }
  }

  _free(config.ptr);
}
  32 +
  33 +// The user should free the returned pointers
// Serialize a transducer model config into a
// SherpaOnnxOnlineTransducerModelConfig struct (3 char* fields: encoder,
// decoder, joiner) on the WASM heap.  Returns {buffer, ptr, len} where
// `ptr` is the struct, `buffer` is the string pool its fields point into
// and `len` is the struct size in bytes.  Free with freeConfig().
function initSherpaOnnxOnlineTransducerModelConfig(config) {
  // +1 on every length for the trailing NUL byte.
  let encoderLen = lengthBytesUTF8(config.encoder) + 1;
  let decoderLen = lengthBytesUTF8(config.decoder) + 1;
  let joinerLen = lengthBytesUTF8(config.joiner) + 1;

  let n = encoderLen + decoderLen + joinerLen;

  let buffer = _malloc(n);

  let len = 3 * 4; // 3 pointers
  let ptr = _malloc(len);

  // Write the three strings back to back into the pool.
  let offset = 0;
  stringToUTF8(config.encoder, buffer + offset, encoderLen);
  offset += encoderLen;

  stringToUTF8(config.decoder, buffer + offset, decoderLen);
  offset += decoderLen;

  stringToUTF8(config.joiner, buffer + offset, joinerLen);

  // Point the struct fields at the strings just written.
  offset = 0;
  setValue(ptr, buffer + offset, 'i8*');
  offset += encoderLen;

  setValue(ptr + 4, buffer + offset, 'i8*');
  offset += decoderLen;

  setValue(ptr + 8, buffer + offset, 'i8*');

  return {
    buffer: buffer, ptr: ptr, len: len,
  }
}
  68 +
/**
 * Serialize a paraformer model config into a
 * SherpaOnnxOnlineParaformerModelConfig struct (2 char* fields: encoder,
 * decoder) on the WASM heap.  Free the result with freeConfig().
 * @returns {Object} {buffer, ptr, len}
 */
function initSherpaOnnxOnlineParaformerModelConfig(config) {
  const encoderBytes = lengthBytesUTF8(config.encoder) + 1;  // + NUL
  const decoderBytes = lengthBytesUTF8(config.decoder) + 1;  // + NUL

  // One string pool holding both strings back to back.
  const buffer = _malloc(encoderBytes + decoderBytes);

  const len = 2 * 4;  // two 32-bit pointers
  const ptr = _malloc(len);

  stringToUTF8(config.encoder, buffer, encoderBytes);
  stringToUTF8(config.decoder, buffer + encoderBytes, decoderBytes);

  setValue(ptr, buffer, 'i8*');                     // encoder
  setValue(ptr + 4, buffer + encoderBytes, 'i8*');  // decoder

  return {buffer: buffer, ptr: ptr, len: len};
}
  95 +
/**
 * Serialize a zipformer2 CTC model config into a
 * SherpaOnnxOnlineZipformer2CtcModelConfig struct (1 char* field) on the
 * WASM heap.  Free the result with freeConfig().
 * @returns {Object} {buffer, ptr, len}
 */
function initSherpaOnnxOnlineZipformer2CtcModelConfig(config) {
  const modelBytes = lengthBytesUTF8(config.model) + 1;  // + NUL
  const buffer = _malloc(modelBytes);

  const len = 1 * 4;  // one 32-bit pointer
  const ptr = _malloc(len);

  stringToUTF8(config.model, buffer, modelBytes);
  setValue(ptr, buffer, 'i8*');  // model

  return {buffer: buffer, ptr: ptr, len: len};
}
  111 +
// Serialize a full SherpaOnnxOnlineModelConfig struct on the WASM heap:
// the three nested model configs followed by tokens (char*), num_threads
// (i32), provider (char*), debug (i32) and model_type (char*).  The
// layout must match the static_asserts in sherpa-onnx-wasm-asr-main.cc.
// Free the result with freeConfig().
function initSherpaOnnxOnlineModelConfig(config) {
  let transducer = initSherpaOnnxOnlineTransducerModelConfig(config.transducer);
  let paraformer = initSherpaOnnxOnlineParaformerModelConfig(config.paraformer);
  let ctc = initSherpaOnnxOnlineZipformer2CtcModelConfig(config.zipformer2Ctc);

  let len = transducer.len + paraformer.len + ctc.len + 5 * 4;
  let ptr = _malloc(len);

  // Copy the nested structs into place, in declaration order.
  let offset = 0;
  _CopyHeap(transducer.ptr, transducer.len, ptr + offset);
  offset += transducer.len;

  _CopyHeap(paraformer.ptr, paraformer.len, ptr + offset);
  offset += paraformer.len;

  _CopyHeap(ctc.ptr, ctc.len, ptr + offset);
  offset += ctc.len;

  // One string pool for tokens, provider and modelType (+1 = NUL byte).
  let tokensLen = lengthBytesUTF8(config.tokens) + 1;
  let providerLen = lengthBytesUTF8(config.provider) + 1;
  let modelTypeLen = lengthBytesUTF8(config.modelType) + 1;
  let bufferLen = tokensLen + providerLen + modelTypeLen;
  let buffer = _malloc(bufferLen);

  offset = 0;
  stringToUTF8(config.tokens, buffer, tokensLen);
  offset += tokensLen;

  stringToUTF8(config.provider, buffer + offset, providerLen);
  offset += providerLen;

  stringToUTF8(config.modelType, buffer + offset, modelTypeLen);

  // The scalar/pointer fields come right after the nested structs.
  offset = transducer.len + paraformer.len + ctc.len;
  setValue(ptr + offset, buffer, 'i8*'); // tokens
  offset += 4;

  setValue(ptr + offset, config.numThreads, 'i32');
  offset += 4;

  setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider
  offset += 4;

  setValue(ptr + offset, config.debug, 'i32');
  offset += 4;

  setValue(ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType
  offset += 4;

  // Keep the child configs so freeConfig() can release them too.
  return {
    buffer: buffer, ptr: ptr, len: len, transducer: transducer,
    paraformer: paraformer, ctc: ctc
  }
}
  166 +
/**
 * Serialize a SherpaOnnxFeatureConfig struct (two i32 fields: sample
 * rate and feature dim) on the WASM heap.  Free with freeConfig().
 * @returns {Object} {ptr, len}
 */
function initSherpaOnnxFeatureConfig(config) {
  const len = 2 * 4;  // two 32-bit integers
  const ptr = _malloc(len);

  setValue(ptr, config.sampleRate, 'i32');
  setValue(ptr + 4, config.featureDim, 'i32');

  return {ptr: ptr, len: len};
}
  175 +
// Serialize a full SherpaOnnxOnlineRecognizerConfig struct on the WASM
// heap: feat config + model config followed by decoding_method (char*),
// max_active_paths (i32), enable_endpoint (i32), the three endpoint-rule
// floats, hotwords_file (char*) and hotwords_score (float).  The layout
// must match the static_asserts in sherpa-onnx-wasm-asr-main.cc.
// Free the result with freeConfig().
function initSherpaOnnxOnlineRecognizerConfig(config) {
  let feat = initSherpaOnnxFeatureConfig(config.featConfig);
  let model = initSherpaOnnxOnlineModelConfig(config.modelConfig);

  let len = feat.len + model.len + 8 * 4;
  let ptr = _malloc(len);

  // Copy the nested structs into place first.
  let offset = 0;
  _CopyHeap(feat.ptr, feat.len, ptr + offset);
  offset += feat.len;

  _CopyHeap(model.ptr, model.len, ptr + offset);
  offset += model.len;

  // One string pool for decodingMethod and hotwordsFile (+1 = NUL byte).
  let decodingMethodLen = lengthBytesUTF8(config.decodingMethod) + 1;
  let hotwordsFileLen = lengthBytesUTF8(config.hotwordsFile) + 1;
  let bufferLen = decodingMethodLen + hotwordsFileLen;
  let buffer = _malloc(bufferLen);

  offset = 0;
  stringToUTF8(config.decodingMethod, buffer, decodingMethodLen);
  offset += decodingMethodLen;

  stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen);

  // The scalar/pointer fields come right after the nested structs.
  offset = feat.len + model.len;
  setValue(ptr + offset, buffer, 'i8*'); // decoding method
  offset += 4;

  setValue(ptr + offset, config.maxActivePaths, 'i32');
  offset += 4;

  setValue(ptr + offset, config.enableEndpoint, 'i32');
  offset += 4;

  setValue(ptr + offset, config.rule1MinTrailingSilence, 'float');
  offset += 4;

  setValue(ptr + offset, config.rule2MinTrailingSilence, 'float');
  offset += 4;

  setValue(ptr + offset, config.rule3MinUtteranceLength, 'float');
  offset += 4;

  // hotwords file pointer, then its score.
  setValue(ptr + offset, buffer + decodingMethodLen, 'i8*');
  offset += 4;

  setValue(ptr + offset, config.hotwordsScore, 'float');
  offset += 4;

  // Keep the child configs so freeConfig() can release them too.
  return {
    buffer: buffer, ptr: ptr, len: len, feat: feat, model: model
  }
}
  230 +
  231 +
/**
 * Build an OnlineRecognizer for the model files shipped in ./assets.
 * The files are always named encoder.onnx / decoder.onnx / joiner.onnx /
 * tokens.txt regardless of model architecture; `type` selects which
 * nested config they are plugged into.
 * @returns {OnlineRecognizer}
 */
function createRecognizer() {
  const transducerConfig = {
    encoder: '',
    decoder: '',
    joiner: '',
  };

  const paraformerConfig = {
    encoder: '',
    decoder: '',
  };

  const ctcConfig = {
    model: '',
  };

  // 0 -> transducer, 1 -> paraformer, 2 -> zipformer2 CTC.
  // NOTE: the README rewrites "type = 0" with sed when packaging other
  // model types, so keep that exact text.
  let type = 0;

  if (type === 0) {
    // transducer
    transducerConfig.encoder = './encoder.onnx';
    transducerConfig.decoder = './decoder.onnx';
    transducerConfig.joiner = './joiner.onnx';
  } else if (type === 1) {
    // paraformer
    paraformerConfig.encoder = './encoder.onnx';
    paraformerConfig.decoder = './decoder.onnx';
  } else if (type === 2) {
    // ctc
    ctcConfig.model = './encoder.onnx';
  }

  const modelConfig = {
    transducer: transducerConfig,
    paraformer: paraformerConfig,
    zipformer2Ctc: ctcConfig,
    tokens: './tokens.txt',
    numThreads: 1,
    provider: 'cpu',
    debug: 1,
    modelType: '',
  };

  const recognizerConfig = {
    featConfig: {
      sampleRate: 16000,
      featureDim: 80,
    },
    modelConfig: modelConfig,
    decodingMethod: 'greedy_search',
    maxActivePaths: 4,
    enableEndpoint: 1,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
    hotwordsFile: '',
    hotwordsScore: 1.5,
  };

  return new OnlineRecognizer(recognizerConfig);
}
  300 +
/**
 * Thin wrapper around a native SherpaOnnxOnlineStream handle plus a
 * reusable WASM-heap scratch buffer for passing audio samples across.
 */
class OnlineStream {
  constructor(handle) {
    this.handle = handle;
    this.pointer = null;  // WASM-heap sample buffer
    this.n = 0;           // capacity of that buffer, in samples
  }

  /** Release the native stream and the sample buffer. Safe to call twice. */
  free() {
    if (!this.handle) {
      return;
    }
    _DestroyOnlineStream(this.handle);
    this.handle = null;
    _free(this.pointer);
    this.pointer = null;
    this.n = 0;
  }

  /**
   * Feed audio into the stream.
   * @param sampleRate {Number}
   * @param samples {Float32Array} Containing samples in the range [-1, 1]
   */
  acceptWaveform(sampleRate, samples) {
    // Grow the scratch buffer if needed; old contents need not survive.
    if (this.n < samples.length) {
      _free(this.pointer);
      this.pointer = _malloc(samples.length * samples.BYTES_PER_ELEMENT);
      this.n = samples.length;
    }

    Module.HEAPF32.set(samples, this.pointer / samples.BYTES_PER_ELEMENT);
    _AcceptWaveform(this.handle, sampleRate, this.pointer, samples.length);
  }

  /** Signal that no more audio will arrive for this stream. */
  inputFinished() {
    _InputFinished(this.handle);
  }
}
  337 +
/**
 * Wrapper around a native SherpaOnnxOnlineRecognizer handle created from
 * a plain JavaScript config object (see createRecognizer()).
 */
class OnlineRecognizer {
  constructor(configObj) {
    const config = initSherpaOnnxOnlineRecognizerConfig(configObj);
    const handle = _CreateOnlineRecognizer(config.ptr);

    // The native side copies what it needs, so the serialized config can
    // be released immediately.
    freeConfig(config);

    this.handle = handle;
  }

  /** Release the native recognizer. */
  free() {
    _DestroyOnlineRecognizer(this.handle);
    this.handle = 0;
  }

  /** @returns {OnlineStream} a new stream bound to this recognizer */
  createStream() {
    return new OnlineStream(_CreateOnlineStream(this.handle));
  }

  /** @returns {Boolean} true if the stream has enough data to decode */
  isReady(stream) {
    return _IsOnlineStreamReady(this.handle, stream.handle) == 1;
  }

  decode(stream) {
    return _DecodeOnlineStream(this.handle, stream.handle);
  }

  /** @returns {Boolean} true if an endpoint (e.g. silence) was detected */
  isEndpoint(stream) {
    return _IsEndpoint(this.handle, stream.handle) == 1;
  }

  reset(stream) {
    _Reset(this.handle, stream.handle);
  }

  /** @returns {String} the current recognition text for the stream */
  getResult(stream) {
    const r = _GetOnlineStreamResult(this.handle, stream.handle);
    const text = UTF8ToString(getValue(r, 'i8*'));
    _DestroyOnlineRecognizerResult(r);
    return text;
  }
}