Fangjun Kuang
Committed by GitHub

Export models from https://github.com/voicekit-team/T-one to sherpa-onnx (#2571)

This PR exports models from the T-one repository (https://github.com/voicekit-team/T-one) to sherpa-onnx format, creating a complete export-and-test pipeline for a streaming CTC model for Russian speech recognition.

- Adds scripts to download, process, and test T-one models in sherpa-onnx format
- Creates GitHub workflow for automated model export and publishing
- Updates the kaldi-native-fbank dependency to version 1.22.1
.github/workflows/export-t-one-to-onnx.yaml (new file):

name: export-t-one-to-onnx

on:
  workflow_dispatch:

concurrency:
  group: export-t-one-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-t-one-to-onnx:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export t-one
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        shell: bash
        run: |
          pip install onnx==1.17.0 onnxruntime==1.17.1 soundfile librosa kaldi_native_fbank "numpy<2"

      - name: Run
        shell: bash
        run: |
          cd scripts/t-one

          wget https://raw.githubusercontent.com/voicekit-team/T-one/refs/heads/main/LICENSE
          ./run.sh

          d=sherpa-onnx-streaming-t-one-russian-2025-09-08
          mkdir $d
          cp -v ./tokens.txt $d
          cp -v ./model.onnx $d
          cp -v ./russian_test_short_from_t_one.wav $d/0.wav
          cp -v ./LICENSE $d
          cp -v ./README.md $d

          ls -lh $d

          tar cjfv $d.tar.bz2 $d

          ls -lh $d.tar.bz2

          mv $d.tar.bz2 ../..
          mv $d ../..

      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false

            m=sherpa-onnx-streaming-t-one-russian-2025-09-08

            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
            cd huggingface
            git fetch
            git pull
            echo "pwd: $PWD"
            ls -lh ../$m
            git lfs track "*.wav"

            rm -rf ./*

            cp -v ../$m/* ./

            git lfs track "*.onnx"
            git add .

            ls -lh

            git status

            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main || true

            cd ..

      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models
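The Release step attaches the packaged archive to the asr-models tag of k2-fsa/sherpa-onnx. As a minimal sketch (not part of this PR), the published archive can be fetched and unpacked like so, with the asset name taken from the workflow above:

import tarfile
import urllib.request

name = "sherpa-onnx-streaming-t-one-russian-2025-09-08"
url = (
    "https://github.com/k2-fsa/sherpa-onnx/releases/download/"
    f"asr-models/{name}.tar.bz2"
)

# Download the release asset and extract model.onnx, tokens.txt,
# 0.wav, LICENSE, and README.md into the current directory
urllib.request.urlretrieve(url, f"{name}.tar.bz2")
with tarfile.open(f"{name}.tar.bz2") as tar:
    tar.extractall(".")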
.gitignore:

@@ -148,3 +148,4 @@ voices.bin
 kitten-nano-en-v0_1-fp16
 *.egg-info
 *.jar
+vocab.json
cmake/kaldi-native-fbank.cmake:

 function(download_kaldi_native_fbank)
   include(FetchContent)

-  set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.3.tar.gz")
-  set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.3.tar.gz")
-  set(kaldi_native_fbank_HASH "SHA256=d409eddae5a46dc796f0841880f489ff0728b96ae26218702cd438c28667c70e")
+  set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.22.1.tar.gz")
+  set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.22.1.tar.gz")
+  set(kaldi_native_fbank_HASH "SHA256=b292ddd1fa121f28371d11c14dd016c59c54b3f0dbb2bb2cfdc82d562564d0f5")

   set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
   set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)

@@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
   # If you don't have access to the Internet,
   # please pre-download kaldi-native-fbank
   set(possible_file_locations
-    $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.3.tar.gz
-    ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.3.tar.gz
-    ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.3.tar.gz
-    /tmp/kaldi-native-fbank-1.21.3.tar.gz
-    /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.3.tar.gz
+    $ENV{HOME}/Downloads/kaldi-native-fbank-1.22.1.tar.gz
+    ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.22.1.tar.gz
+    ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.22.1.tar.gz
+    /tmp/kaldi-native-fbank-1.22.1.tar.gz
+    /star-fj/fangjun/download/github/kaldi-native-fbank-1.22.1.tar.gz
   )

   foreach(f IN LISTS possible_file_locations)
scripts/t-one/README.md (new file):

# Introduction

This folder contains scripts for exporting models from
https://github.com/voicekit-team/T-one
to sherpa-onnx.
scripts/t-one/add_meta_data.py (new file):

#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)


import onnx


def main():
    meta_data = {
        "model_type": "t-one",
        "language": "Russian",
        "version": 1,
        "maintainer": "k2-fsa",
        "sample_rate": 8000,
        "frame_length_ms": 300,  # chunk_duration_ms
        "state_dim": 219729,
        "comment": "This is a streaming CTC model for Russian with expected audio sample rate 8000",
        "url": "https://github.com/voicekit-team/T-one",
        "see_also": "https://huggingface.co/t-tech/T-one",
    }
    model = onnx.load("./model.onnx")

    # Remove any existing metadata entries before adding ours
    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)

    print("--------------------")
    print(model.metadata_props)

    onnx.save(model, "./model.onnx")


if __name__ == "__main__":
    main()
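The metadata can be read back through onnxruntime's model metadata API, which is how test.py below picks up the sample rate, chunk size, and state dimension; a quick sanity check, assuming model.onnx is in the current directory:

import onnxruntime as ort

session = ort.InferenceSession("./model.onnx", providers=["CPUExecutionProvider"])
meta = session.get_modelmeta().custom_metadata_map
# Expected output: 8000 300 219729
print(meta["sample_rate"], meta["frame_length_ms"], meta["state_dim"])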
scripts/t-one/generate_tokens.py (new file):

#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

import json


def main():
    with open("vocab.json") as f:
        token2id = json.load(f)

    with open("tokens.txt", "w", encoding="utf-8") as f:
        for s, i in token2id.items():
            # Map the word delimiter to a space and the CTC blank to <blk>
            if s == "|":
                s = " "
            if s == "[PAD]":
                s = "<blk>"

            f.write(f"{s} {i}\n")


if __name__ == "__main__":
    main()
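Each line of the resulting tokens.txt holds a token and its integer ID separated by a space. For a hypothetical vocab.json of {"[PAD]": 0, "|": 1, "а": 2}, the output would be:

<blk> 0
  1
а 2

Note that the space token's line begins with the space character itself, leaving only the ID after stripping; this is why load_tokens in test.py maps a one-field line to the space token.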
scripts/t-one/run.sh (new file):

#!/usr/bin/env bash

set -ex

if [ ! -f ./model.onnx ]; then
  curl -SL -O https://hf-mirror.com/t-tech/T-one/resolve/main/model.onnx
fi

if [ ! -f ./vocab.json ]; then
  curl -SL -O https://hf-mirror.com/t-tech/T-one/resolve/main/vocab.json
fi

if [ ! -f ./russian_test_short_from_t_one.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/russian_test_short_from_t_one.wav
fi

python3 ./add_meta_data.py

if [ ! -f ./tokens.txt ]; then
  python3 ./generate_tokens.py
fi

./test.py --model ./model.onnx --tokens ./tokens.txt --wave ./russian_test_short_from_t_one.wav
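Since hf-mirror.com mirrors huggingface.co, the same files can also be fetched with the huggingface_hub package if it is available; a hypothetical alternative to the curl commands above:

from huggingface_hub import hf_hub_download

# Download model.onnx and vocab.json from the upstream T-one repository
model_path = hf_hub_download(repo_id="t-tech/T-one", filename="model.onnx")
vocab_path = hf_hub_download(repo_id="t-tech/T-one", filename="vocab.json")
print(model_path, vocab_path)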
scripts/t-one/test.py (new file):

#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

import argparse
from typing import Tuple

import kaldi_native_fbank as knf
import numpy as np
import onnxruntime as ort
import soundfile as sf


def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to model.onnx",
    )

    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="Path to tokens.txt",
    )

    parser.add_argument(
        "--wave",
        type=str,
        required=True,
        help="The input wave to be recognized",
    )

    return parser.parse_args()


class OnnxModel:
    def __init__(self, filename):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        # Read back the metadata written by add_meta_data.py
        meta = self.model.get_modelmeta().custom_metadata_map

        self.frame_length_ms = int(meta["frame_length_ms"])
        self.sample_rate = int(meta["sample_rate"])
        self.state_dim = int(meta["state_dim"])

    def get_init_state(self, batch_size=1):
        # The streaming state is a single flat float16 vector per stream
        return np.zeros((batch_size, self.state_dim), dtype=np.float16)

    def __call__(self, x, state):
        """
        Args:
          x: (batch_size, num_samples, 1), int32
          state: (batch_size, 219729)
        Returns:
          log_probs: (batch_size, num_frames, vocab_size)
          next_state: (batch_size, 219729)
        """
        log_prob, next_state = self.model.run(
            [
                self.model.get_outputs()[0].name,
                self.model.get_outputs()[1].name,
            ],
            {
                self.model.get_inputs()[0].name: x,
                self.model.get_inputs()[1].name: state,
            },
        )
        return log_prob, next_state


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def load_tokens(filename):
    ans = dict()
    with open(filename, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) == 1:
                # Only the ID survives strip() for the space token's line
                ans[int(fields[0])] = " "
            else:
                ans[int(fields[1])] = fields[0]
    return ans


def compute_feat(
    samples,
    sample_rate,
    frame_length_ms: int,
):
    # frame_shift == frame_length, so each "frame" is a non-overlapping
    # chunk of raw audio samples (300 ms -> 2400 samples at 8 kHz)
    opts = knf.RawAudioSamplesOptions()
    opts.frame_opts.samp_freq = sample_rate
    opts.frame_opts.frame_length_ms = frame_length_ms
    opts.frame_opts.frame_shift_ms = frame_length_ms

    raw_audio_samples = knf.OnlineRawAudioSamples(opts)

    raw_audio_samples.accept_waveform(sample_rate, samples)
    raw_audio_samples.input_finished()

    features = []

    for i in range(raw_audio_samples.num_frames_ready):
        f = raw_audio_samples.get_frame(i)
        features.append(f)

    # Scale floats in [-1, 1] to int16 range; the model expects int32 samples
    return (np.array(features, dtype=np.float32) * 32768).astype(np.int32)


def main():
    args = get_args()
    print(vars(args))

    model = OnnxModel(filename=args.model)

    samples, sample_rate = load_audio(args.wave)
    if sample_rate != model.sample_rate:
        import librosa

        samples = librosa.resample(
            samples, orig_sr=sample_rate, target_sr=model.sample_rate
        )
        sample_rate = model.sample_rate

    # Pad 0.5 seconds of silence (4000 samples at 8 kHz)
    samples = np.pad(samples, (0, 4000))

    features = compute_feat(
        samples=samples,
        sample_rate=sample_rate,
        frame_length_ms=model.frame_length_ms,
    )
    print(features.shape)

    id2token = load_tokens(args.tokens)

    blank = -2
    for idx, token in id2token.items():
        if token == "<blk>":
            blank = idx

    # Run the model chunk by chunk, carrying the streaming state across calls
    state = model.get_init_state()
    token_id_list = []
    for f in features:
        log_probs, state = model(f[None, :, None], state)

        max_token_ids = log_probs[0].argmax(axis=-1).tolist()
        token_id_list += max_token_ids

    # CTC greedy decoding: drop blanks and collapse consecutive repeats.
    # A blank between two identical tokens resets `prev`, so genuinely
    # repeated characters are kept.
    unique_ids = []
    prev = -1
    for t in token_id_list:
        if t == blank:
            prev = t
            continue

        if t == prev:
            continue

        prev = t
        unique_ids.append(prev)
    text = "".join([id2token[i] for i in unique_ids])
    print(text)


if __name__ == "__main__":
    main()
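For reference, since frame_shift equals frame_length, compute_feat above amounts to slicing the padded waveform into non-overlapping 300 ms chunks and scaling them to int16 range. A minimal NumPy-only sketch of that step (edge-frame handling in kaldi_native_fbank may differ, so this is an approximation rather than a drop-in replacement):

import numpy as np

def chunk_samples(
    samples: np.ndarray, sample_rate: int = 8000, frame_length_ms: int = 300
) -> np.ndarray:
    # Split a float32 waveform in [-1, 1] into non-overlapping chunks
    # of raw samples and scale them to int16 range, as int32
    n = sample_rate * frame_length_ms // 1000  # 2400 samples per chunk
    num_chunks = len(samples) // n
    chunks = samples[: num_chunks * n].reshape(num_chunks, n)
    return (chunks * 32768).astype(np.int32)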