Fangjun Kuang
Committed by GitHub

Add a byte-level BPE Chinese+English non-streaming zipformer model (#1645)

@@ -8,6 +8,27 @@ log() { @@ -8,6 +8,27 @@ log() {
8 echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" 8 echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
9 } 9 }
10 10
  11 +log "test offline zipformer (byte-level bpe, Chinese+English)"
  12 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-zh-en-2023-11-22.tar.bz2
  13 +tar xvf sherpa-onnx-zipformer-zh-en-2023-11-22.tar.bz2
  14 +rm sherpa-onnx-zipformer-zh-en-2023-11-22.tar.bz2
  15 +
  16 +repo=sherpa-onnx-zipformer-zh-en-2023-11-22
  17 +
  18 +./python-api-examples/offline-decode-files.py \
  19 + --tokens=$repo/tokens.txt \
  20 + --encoder=$repo/encoder-epoch-34-avg-19.int8.onnx \
  21 + --decoder=$repo/decoder-epoch-34-avg-19.onnx \
  22 + --joiner=$repo/joiner-epoch-34-avg-19.int8.onnx \
  23 + --num-threads=2 \
  24 + --decoding-method=greedy_search \
  25 + --debug=true \
  26 + $repo/test_wavs/0.wav \
  27 + $repo/test_wavs/1.wav \
  28 + $repo/test_wavs/2.wav
  29 +
  30 +rm -rf sherpa-onnx-zipformer-zh-en-2023-11-22
  31 +
11 log "test offline Moonshine" 32 log "test offline Moonshine"
12 33
13 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 34 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  1 +name: add-new-asr-models
  2 +
  3 +on:
  4 + # push:
  5 + # branches:
  6 + # - new-asr-models
  7 + workflow_dispatch:
  8 +
  9 +concurrency:
  10 + group: add-new-asr-models-${{ github.ref }}
  11 + cancel-in-progress: true
  12 +
  13 +jobs:
  14 + add-new-asr-models:
  15 + runs-on: ${{ matrix.os }}
  16 + name: New asr models
  17 + strategy:
  18 + fail-fast: false
  19 + matrix:
  20 + os: [ubuntu-latest]
  21 +
  22 + steps:
  23 + - uses: actions/checkout@v4
  24 + with:
  25 + fetch-depth: 0
  26 +
  27 + - name: Download icefall-asr-zipformer-multi-zh-en-2023-11-22
  28 + shell: bash
  29 + run: |
  30 + d=sherpa-onnx-zipformer-zh-en-2023-11-22
  31 + mkdir $d
  32 + pushd $d
  33 +
  34 + wget -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/data/lang_bbpe_2000/tokens.txt
  35 + wget -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/data/lang_bbpe_2000/bbpe.model
  36 + wget -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/exp/decoder-epoch-34-avg-19.onnx
  37 + wget -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/exp/encoder-epoch-34-avg-19.int8.onnx
  38 + wget -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/exp/encoder-epoch-34-avg-19.onnx
  39 + wget -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/exp/joiner-epoch-34-avg-19.int8.onnx
  40 + wget -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/exp/joiner-epoch-34-avg-19.onnx
  41 +
  42 + mkdir test_wavs
  43 + cd test_wavs
  44 + wget -O 0.wav -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/test_wavs/_1634_210_2577_1_1525157964032_3712259_29.wav
  45 + wget -O 1.wav -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/test_wavs/_1634_210_2577_1_1525157964032_3712259_55.wav
  46 +
  47 + wget -O 2.wav -q https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/resolve/main/test_wavs/_1634_210_2577_1_1525157964032_3712259_75.wav
  48 + popd
  49 + tar cvjf $d.tar.bz2 $d
  50 + ls -lh $d
  51 + rm -rf $d
  52 +
  53 + - name: Release
  54 + uses: svenstaro/upload-release-action@v2
  55 + with:
  56 + file_glob: true
  57 + file: ./*.tar.bz2
  58 + overwrite: true
  59 + repo_name: k2-fsa/sherpa-onnx
  60 + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
  61 + tag: asr-models
@@ -229,6 +229,18 @@ export function getOfflineModelConfig(type: number): OfflineModelConfig { @@ -229,6 +229,18 @@ export function getOfflineModelConfig(type: number): OfflineModelConfig {
229 229
230 break; 230 break;
231 } 231 }
  232 +
  233 + case 23: {
  234 + const modelDir = "sherpa-onnx-zipformer-zh-en-2023-11-22";
  235 + c.transducer.encoder = `${modelDir}/encoder-epoch-34-avg-19.int8.onnx`;
  236 + c.transducer.decoder = `${modelDir}/decoder-epoch-34-avg-19.onnx`;
  237 + c.transducer.joiner = `${modelDir}/joiner-epoch-34-avg-19.int8.onnx`;
  238 + c.tokens = `${modelDir}/tokens.txt`;
  239 + c.modelType = "transducer";
  240 +
  241 + break;
  242 + }
  243 +
232 default: { 244 default: {
233 console.log(`Please specify a supported type. Given type ${type}`); 245 console.log(`Please specify a supported type. Given type ${type}`);
234 } 246 }
@@ -423,6 +423,26 @@ def get_models(): @@ -423,6 +423,26 @@ def get_models():
423 popd 423 popd
424 """, 424 """,
425 ), 425 ),
  426 + Model(
  427 + model_name="sherpa-onnx-zipformer-zh-en-2023-11-22",
  428 + idx=23,
  429 + lang="zh_en",
  430 + lang2="Chinese,English",
  431 + short_name="zipformer",
  432 + cmd="""
  433 + pushd $model_name
  434 +
  435 + rm -rfv test_wavs
  436 +
  437 + rm -fv encoder-epoch-34-avg-19.onnx
  438 + rm -fv joiner-epoch-34-avg-19.onnx
  439 + rm -fv bbpe.model
  440 +
  441 + ls -lh
  442 +
  443 + popd
  444 + """,
  445 + ),
426 ] 446 ]
427 return models 447 return models
428 448
@@ -451,6 +451,19 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? { @@ -451,6 +451,19 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
451 tokens = "$modelDir/tokens.txt", 451 tokens = "$modelDir/tokens.txt",
452 ) 452 )
453 } 453 }
  454 +
  455 + 23 -> {
  456 + val modelDir = "sherpa-onnx-zipformer-zh-en-2023-11-22"
  457 + return OfflineModelConfig(
  458 + transducer = OfflineTransducerModelConfig(
  459 + encoder = "$modelDir/encoder-epoch-34-avg-19.int8.onnx",
  460 + decoder = "$modelDir/decoder-epoch-34-avg-19.onnx",
  461 + joiner = "$modelDir/joiner-epoch-34-avg-19.int8.onnx",
  462 + ),
  463 + tokens = "$modelDir/tokens.txt",
  464 + modelType = "transducer",
  465 + )
  466 + }
454 } 467 }
455 return null 468 return null
456 } 469 }