Add C++ support for non-streaming NeMo fast conformer hybrid transducer ctc (the ctc branch) (#848)

Fangjun Kuang · GitHub
Commit 5d8c35e44ea967f3a1763c7402099f886a26b1a9 5d8c35e4 1 parent 5ed3ec1c
.github/scripts/test-offline-ctc.sh
.github/scripts/test-spoken-language-identification.sh
.github/workflows/linux.yaml
.github/workflows/macos.yaml
sherpa-onnx/csrc/offline-ctc-model.cc
sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h
sherpa-onnx/csrc/offline-recognizer-impl.cc
sherpa-onnx/csrc/symbol-table.cc
--- a/.github/scripts/test-offline-ctc.sh
查看文件 @5d8c35e
+++ b/.github/scripts/test-offline-ctc.sh
查看文件 @5d8c35e
@@ -13,14 +13,111 @@ echo "PATH: $PATH"
 which $EXE
+log "-----------------------------------------------------------------"
+log "Run Nemo fast conformer hybrid transducer ctc models (CTC branch)"
+log "-----------------------------------------------------------------"
+
+url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2
+name=$(basename $url)
+curl -SL -O $url
+tar xvf $name
+rm $name
+repo=$(basename -s .tar.bz2 $name)
+ls -lh $repo
+
+log "test $repo"
+test_wavs=(
+de-german.wav
+es-spanish.wav
+hr-croatian.wav
+po-polish.wav
+uk-ukrainian.wav
+en-english.wav
+fr-french.wav
+it-italian.wav
+ru-russian.wav
+)
+for w in ${test_wavs[@]}; do
+  time $EXE \
+    --tokens=$repo/tokens.txt \
+    --nemo-ctc-model=$repo/model.onnx \
+    --debug=1 \
+    $repo/test_wavs/$w
+done
+
+rm -rf $repo
+
+url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-en-24500.tar.bz2
+name=$(basename $url)
+curl -SL -O $url
+tar xvf $name
+rm $name
+repo=$(basename -s .tar.bz2 $name)
+ls -lh $repo
+
+log "Test $repo"
+
+time $EXE \
+  --tokens=$repo/tokens.txt \
+  --nemo-ctc-model=$repo/model.onnx \
+  --debug=1 \
+  $repo/test_wavs/en-english.wav
+
+rm -rf $repo
+
+url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-es-1424.tar.bz2
+name=$(basename $url)
+curl -SL -O $url
+tar xvf $name
+rm $name
+repo=$(basename -s .tar.bz2 $name)
+ls -lh $repo
+
+log "test $repo"
+
+time $EXE \
+  --tokens=$repo/tokens.txt \
+  --nemo-ctc-model=$repo/model.onnx \
+  --debug=1 \
+  $repo/test_wavs/es-spanish.wav
+
+rm -rf $repo
+
+url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288.tar.bz2
+name=$(basename $url)
+curl -SL -O $url
+tar xvf $name
+rm $name
+repo=$(basename -s .tar.bz2 $name)
+ls -lh $repo
+
+log "Test $repo"
+
+test_wavs=(
+en-english.wav
+de-german.wav
+fr-french.wav
+es-spanish.wav
+)
+
+for w in ${test_wavs[@]}; do
+  time $EXE \
+    --tokens=$repo/tokens.txt \
+    --nemo-ctc-model=$repo/model.onnx \
+    --debug=1 \
+    $repo/test_wavs/$w
+done
+
+rm -rf $repo
+
 log "------------------------------------------------------------"
 log "Run Wenet models"
 log "------------------------------------------------------------"
 wenet_models=(
 sherpa-onnx-zh-wenet-aishell
-sherpa-onnx-zh-wenet-aishell2
+# sherpa-onnx-zh-wenet-aishell2
 # sherpa-onnx-zh-wenet-wenetspeech
-sherpa-onnx-zh-wenet-multi-cn
+# sherpa-onnx-zh-wenet-multi-cn
 sherpa-onnx-en-wenet-librispeech
 # sherpa-onnx-en-wenet-gigaspeech
 )
--- a/.github/scripts/test-spoken-language-identification.sh
查看文件 @5d8c35e
+++ b/.github/scripts/test-spoken-language-identification.sh
查看文件 @5d8c35e
@@ -62,6 +62,11 @@ for wav in ${waves[@]}; do
   ls -lh *.wav
 done
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/spoken-language-identification-test-wavs.tar.bz2
+tar xvf spoken-language-identification-test-wavs.tar.bz2
+rm spoken-language-identification-test-wavs.tar.bz2
+data=spoken-language-identification-test-wavs
+
 for name in ${names[@]}; do
   log "------------------------------------------------------------"
   log "Run $name"
@@ -85,14 +90,14 @@ for name in ${names[@]}; do
     time $EXE \
       --whisper-encoder=$repo/${name}-encoder.onnx \
       --whisper-decoder=$repo/${name}-decoder.onnx \
-      $wav
+      $data/$wav
     log "test int8 onnx"
     time $EXE \
       --whisper-encoder=$repo/${name}-encoder.int8.onnx \
       --whisper-decoder=$repo/${name}-decoder.int8.onnx \
-      $wav
+      $data/$wav
   done
   rm -rf $repo
 done
--- a/.github/workflows/linux.yaml
查看文件 @5d8c35e
+++ b/.github/workflows/linux.yaml
查看文件 @5d8c35e
@@ -128,13 +128,13 @@ jobs:
           name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
           path: install/*
-      - name: Test offline punctuation
+      - name: Test spoken language identification (C++ API)
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH
-          export EXE=sherpa-onnx-offline-punctuation
+          export EXE=sherpa-onnx-offline-language-identification
-          .github/scripts/test-offline-punctuation.sh
+          .github/scripts/test-spoken-language-identification.sh
       - name: Test C API
         shell: bash
@@ -147,13 +147,13 @@ jobs:
           .github/scripts/test-c-api.sh
-      - name: Test Audio tagging
+      - name: Test offline CTC
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH
-          export EXE=sherpa-onnx-offline-audio-tagging
+          export EXE=sherpa-onnx-offline
-          .github/scripts/test-audio-tagging.sh
+          .github/scripts/test-offline-ctc.sh
       - name: Test online CTC
         shell: bash
@@ -163,14 +163,21 @@ jobs:
           .github/scripts/test-online-ctc.sh
+      - name: Test offline punctuation
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-punctuation
+
+          .github/scripts/test-offline-punctuation.sh
-      - name: Test spoken language identification (C++ API)
+      - name: Test Audio tagging
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH
-          export EXE=sherpa-onnx-offline-language-identification
+          export EXE=sherpa-onnx-offline-audio-tagging
-          .github/scripts/test-spoken-language-identification.sh
+          .github/scripts/test-audio-tagging.sh
       - name: Test transducer kws
         shell: bash
@@ -180,7 +187,6 @@ jobs:
           .github/scripts/test-kws.sh
-
       - name: Test offline Whisper
         if: matrix.build_type != 'Debug'
         shell: bash
@@ -192,14 +198,6 @@ jobs:
           .github/scripts/test-offline-whisper.sh
-      - name: Test offline CTC
-        shell: bash
-        run: |
-          export PATH=$PWD/build/bin:$PATH
-          export EXE=sherpa-onnx-offline
-
-          .github/scripts/test-offline-ctc.sh
-
       - name: Test offline TTS
         if: matrix.with_tts == 'ON'
         shell: bash
--- a/.github/workflows/macos.yaml
查看文件 @5d8c35e
+++ b/.github/workflows/macos.yaml
查看文件 @5d8c35e
@@ -107,6 +107,14 @@ jobs:
           otool -L build/bin/sherpa-onnx
           otool -l build/bin/sherpa-onnx
+      - name: Test online CTC
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx
+
+          .github/scripts/test-online-ctc.sh
+
       - name: Test offline punctuation
         shell: bash
         run: |
@@ -150,14 +158,6 @@ jobs:
           .github/scripts/test-kws.sh
-      - name: Test online CTC
-        shell: bash
-        run: |
-          export PATH=$PWD/build/bin:$PATH
-          export EXE=sherpa-onnx
-
-          .github/scripts/test-online-ctc.sh
-
       - name: Test offline TTS
         if: matrix.with_tts == 'ON'
         shell: bash
--- a/sherpa-onnx/csrc/offline-ctc-model.cc
查看文件 @5d8c35e
+++ b/sherpa-onnx/csrc/offline-ctc-model.cc
查看文件 @5d8c35e
@@ -20,6 +20,7 @@ namespace {
 enum class ModelType {
   kEncDecCTCModelBPE,
+  kEncDecHybridRNNTCTCBPEModel,
   kTdnn,
   kZipformerCtc,
   kWenetCtc,
@@ -55,7 +56,10 @@ static ModelType GetModelType(char *model_data, size_t model_data_length,
         "No model_type in the metadata!\n"
         "If you are using models from NeMo, please refer to\n"
         "https://huggingface.co/csukuangfj/"
-        "sherpa-onnx-nemo-ctc-en-citrinet-512/blob/main/add-model-metadata.py"
+        "sherpa-onnx-nemo-ctc-en-citrinet-512/blob/main/add-model-metadata.py\n"
+        "or "
+        "https://github.com/k2-fsa/sherpa-onnx/tree/master/scripts/nemo/"
+        "fast-conformer-hybrid-transducer-ctc\n"
         "If you are using models from WeNet, please refer to\n"
         "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/"
         "run.sh\n"
@@ -66,6 +70,8 @@ static ModelType GetModelType(char *model_data, size_t model_data_length,
   if (model_type.get() == std::string("EncDecCTCModelBPE")) {
     return ModelType::kEncDecCTCModelBPE;
+  } else if (model_type.get() == std::string("EncDecHybridRNNTCTCBPEModel")) {
+    return ModelType::kEncDecHybridRNNTCTCBPEModel;
   } else if (model_type.get() == std::string("tdnn")) {
     return ModelType::kTdnn;
   } else if (model_type.get() == std::string("zipformer2_ctc")) {
@@ -106,6 +112,9 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
     case ModelType::kEncDecCTCModelBPE:
       return std::make_unique<OfflineNemoEncDecCtcModel>(config);
       break;
+    case ModelType::kEncDecHybridRNNTCTCBPEModel:
+      return std::make_unique<OfflineNemoEncDecHybridRNNTCTCBPEModel>(config);
+      break;
     case ModelType::kTdnn:
       return std::make_unique<OfflineTdnnCtcModel>(config);
       break;
@@ -153,6 +162,12 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
     case ModelType::kEncDecCTCModelBPE:
       return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config);
       break;
+    case ModelType::kEncDecHybridRNNTCTCBPEModel:
+      // Forward mgr exactly like the sibling cases of this overload.
+      // OfflineNemoEncDecHybridRNNTCTCBPEModel is an alias of
+      // OfflineNemoEncDecCtcModel, so the (mgr, config) constructor applies.
+      return std::make_unique<OfflineNemoEncDecHybridRNNTCTCBPEModel>(mgr,
+                                                                      config);
+      break;
     case ModelType::kTdnn:
       return std::make_unique<OfflineTdnnCtcModel>(mgr, config);
       break;
--- a/sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h
查看文件 @5d8c35e
+++ b/sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h
查看文件 @5d8c35e
@@ -81,6 +81,8 @@ class OfflineNemoEncDecCtcModel : public OfflineCtcModel {
   std::unique_ptr<Impl> impl_;
 };
+using OfflineNemoEncDecHybridRNNTCTCBPEModel = OfflineNemoEncDecCtcModel;
+
 }  // namespace sherpa_onnx
 #endif  // SHERPA_ONNX_CSRC_OFFLINE_NEMO_ENC_DEC_CTC_MODEL_H_
--- a/sherpa-onnx/csrc/offline-recognizer-impl.cc
查看文件 @5d8c35e
+++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc
查看文件 @5d8c35e
@@ -122,7 +122,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
     return std::make_unique<OfflineRecognizerParaformerImpl>(config);
   }
-  if (model_type == "EncDecCTCModelBPE" || model_type == "tdnn" ||
+  if (model_type == "EncDecCTCModelBPE" ||
+      model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" ||
       model_type == "zipformer2_ctc" || model_type == "wenet_ctc") {
     return std::make_unique<OfflineRecognizerCtcImpl>(config);
   }
@@ -137,6 +138,7 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
       " - Non-streaming transducer models from icefall\n"
       " - Non-streaming Paraformer models from FunASR\n"
       " - EncDecCTCModelBPE models from NeMo\n"
+      " - EncDecHybridRNNTCTCBPEModel models from NeMo\n"
       " - Whisper models\n"
       " - Tdnn models\n"
       " - Zipformer CTC models\n"
@@ -252,7 +254,8 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
     return std::make_unique<OfflineRecognizerParaformerImpl>(mgr, config);
   }
-  if (model_type == "EncDecCTCModelBPE" || model_type == "tdnn" ||
+  if (model_type == "EncDecCTCModelBPE" ||
+      model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" ||
       model_type == "zipformer2_ctc" || model_type == "wenet_ctc") {
     return std::make_unique<OfflineRecognizerCtcImpl>(mgr, config);
   }
@@ -267,6 +270,7 @@ std::unique_ptr<OfflineRecognizerImpl> OfflineRecognizerImpl::Create(
       " - Non-streaming transducer models from icefall\n"
       " - Non-streaming Paraformer models from FunASR\n"
       " - EncDecCTCModelBPE models from NeMo\n"
+      " - EncDecHybridRNNTCTCBPEModel models from NeMo\n"
       " - Whisper models\n"
       " - Tdnn models\n"
       " - Zipformer CTC models\n"
--- a/sherpa-onnx/csrc/symbol-table.cc
查看文件 @5d8c35e
+++ b/sherpa-onnx/csrc/symbol-table.cc
查看文件 @5d8c35e
@@ -67,9 +67,13 @@ void SymbolTable::Init(std::istream &is) {
     // the following check.
     //
     // Note: Only id2sym_ matters as we use it to convert ID to symbols.
+#if 0
+    // we disable the test here since for some multi-lingual BPE models
+    // from NeMo, the same symbol can appear multiple times with different IDs.
     if (sym != " ") {
       assert(sym2id_.count(sym) == 0);
     }
+#endif
     assert(id2sym_.count(id) == 0);