Fangjun Kuang
Committed by GitHub

Export models from https://github.com/voicekit-team/T-one to sherpa-onnx (#2571)

This PR exports models from the T-one repository (https://github.com/voicekit-team/T-one) to sherpa-onnx format, creating a complete pipeline for Russian speech recognition using streaming CTC models.

- Adds scripts to download, process, and test T-one models in sherpa-onnx format
- Creates GitHub workflow for automated model export and publishing
- Updates kaldi-native-fbank dependency to version 1.22.1
# Export models from https://github.com/voicekit-team/T-one to sherpa-onnx
# format, then publish the result to Hugging Face and a GitHub release.
name: export-t-one-to-onnx

on:
  workflow_dispatch:

concurrency:
  group: export-t-one-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-t-one-to-onnx:
    # Only run in the upstream repo or the maintainer's fork.
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export t-one
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        shell: bash
        run: |
          pip install onnx==1.17.0 onnxruntime==1.17.1 soundfile librosa kaldi_native_fbank "numpy<2"

      - name: Run
        shell: bash
        run: |
          cd scripts/t-one

          wget https://raw.githubusercontent.com/voicekit-team/T-one/refs/heads/main/LICENSE

          ./run.sh

          # Assemble the release directory expected by sherpa-onnx users.
          d=sherpa-onnx-streaming-t-one-russian-2025-09-08
          mkdir $d
          cp -v ./tokens.txt $d
          cp -v ./model.onnx $d
          cp -v ./russian_test_short_from_t_one.wav $d/0.wav
          cp -v ./LICENSE $d
          cp -v ./README.md $d
          ls -lh $d

          tar cjfv $d.tar.bz2 $d
          ls -lh $d.tar.bz2

          mv $d.tar.bz2 ../..
          mv $d ../..

      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false

            m=sherpa-onnx-streaming-t-one-russian-2025-09-08
            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
            cd huggingface
            git fetch
            git pull
            echo "pwd: $PWD"
            ls -lh ../$m

            git lfs track "*.wav"
            rm -rf ./*
            cp -v ../$m/* ./
            git lfs track "*.onnx"

            git add .
            ls -lh
            git status
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main || true
            cd ..

      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models
... ...
... ... @@ -148,3 +148,4 @@ voices.bin
kitten-nano-en-v0_1-fp16
*.egg-info
*.jar
vocab.json
... ...
function(download_kaldi_native_fbank)
include(FetchContent)
set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.3.tar.gz")
set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.3.tar.gz")
set(kaldi_native_fbank_HASH "SHA256=d409eddae5a46dc796f0841880f489ff0728b96ae26218702cd438c28667c70e")
set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.22.1.tar.gz")
set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.22.1.tar.gz")
set(kaldi_native_fbank_HASH "SHA256=b292ddd1fa121f28371d11c14dd016c59c54b3f0dbb2bb2cfdc82d562564d0f5")
set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
... ... @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
# If you don't have access to the Internet,
# please pre-download kaldi-native-fbank
set(possible_file_locations
$ENV{HOME}/Downloads/kaldi-native-fbank-1.21.3.tar.gz
${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.3.tar.gz
${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.3.tar.gz
/tmp/kaldi-native-fbank-1.21.3.tar.gz
/star-fj/fangjun/download/github/kaldi-native-fbank-1.21.3.tar.gz
$ENV{HOME}/Downloads/kaldi-native-fbank-1.22.1.tar.gz
${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.22.1.tar.gz
${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.22.1.tar.gz
/tmp/kaldi-native-fbank-1.22.1.tar.gz
/star-fj/fangjun/download/github/kaldi-native-fbank-1.22.1.tar.gz
)
foreach(f IN LISTS possible_file_locations)
... ...
# Introduction
This folder contains scripts for exporting models from
https://github.com/voicekit-team/T-one
to sherpa-onnx.
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import onnx
def main():
    """Write sherpa-onnx metadata into ``./model.onnx`` in place."""
    # Key/value pairs that sherpa-onnx reads at model-load time.
    info = {
        "model_type": "t-one",
        "language": "Russian",
        "version": 1,
        "maintainer": "k2-fsa",
        "sample_rate": 8000,
        "frame_length_ms": 300,  # chunk_duration_ms
        "state_dim": 219729,
        "comment": "This is a streaming CTC model for Russian with expected audio sample rate 8000",
        "url": "https://github.com/voicekit-team/T-one",
        "see_also": "https://huggingface.co/t-tech/T-one",
    }

    model = onnx.load("./model.onnx")

    # Clear any pre-existing metadata entries before adding ours,
    # so re-running this script does not accumulate duplicates.
    del model.metadata_props[:]

    for k, v in info.items():
        entry = model.metadata_props.add()
        entry.key = k
        entry.value = str(v)

    print("--------------------")
    print(model.metadata_props)

    onnx.save(model, "./model.onnx")


if __name__ == "__main__":
    main()
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import json
def main():
    """Convert a Hugging Face ``vocab.json`` into sherpa-onnx ``tokens.txt``.

    Each output line is ``<token> <id>``.  Two tokens are remapped:
    ``|`` (the word separator) becomes a literal space, and ``[PAD]``
    becomes ``<blk>`` (the CTC blank symbol expected by sherpa-onnx).
    """
    # Read with an explicit encoding: the vocabulary contains Cyrillic
    # characters and must not depend on the platform's locale default.
    with open("vocab.json", encoding="utf-8") as f:
        token2id = json.load(f)

    with open("tokens.txt", "w", encoding="utf-8") as f:
        for s, i in token2id.items():
            if s == "|":
                s = " "
            if s == "[PAD]":
                s = "<blk>"
            f.write(f"{s} {i}\n")


if __name__ == "__main__":
    main()
... ...
#!/usr/bin/env bash
# Download the T-one model assets (skipping files already present),
# embed sherpa-onnx metadata, generate tokens.txt, and run a quick
# recognition test.
set -ex

[ -f ./model.onnx ] || curl -SL -O https://hf-mirror.com/t-tech/T-one/resolve/main/model.onnx

[ -f ./vocab.json ] || curl -SL -O https://hf-mirror.com/t-tech/T-one/resolve/main/vocab.json

[ -f ./russian_test_short_from_t_one.wav ] || curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/russian_test_short_from_t_one.wav

# Writes metadata into model.onnx in place.
python3 ./add_meta_data.py

# tokens.txt is derived from vocab.json; generate it only once.
[ -f ./tokens.txt ] || python3 ./generate_tokens.py

./test.py --model ./model.onnx --tokens ./tokens.txt --wave ./russian_test_short_from_t_one.wav
... ...
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import argparse
from typing import Tuple
import kaldi_native_fbank as knf
import numpy as np
import onnxruntime as ort
import soundfile as sf
def get_args():
    """Parse and return the command-line options (--model, --tokens, --wave)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # (flag, help text) for the three required string options.
    options = (
        ("--model", "Path to model.onnx"),
        ("--tokens", "Path to tokens.txt"),
        ("--wave", "The input wave to be recognized"),
    )
    for flag, help_text in options:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    return parser.parse_args()
class OnnxModel:
    """Thin wrapper around the T-one streaming CTC ONNX model."""

    def __init__(self, filename):
        """Load *filename* with onnxruntime on CPU and read its metadata."""
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1
        self.session_opts = opts

        self.model = ort.InferenceSession(
            filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )

        # Metadata entries written into the model by add_meta_data.py.
        meta = self.model.get_modelmeta().custom_metadata_map
        self.frame_length_ms = int(meta["frame_length_ms"])
        self.sample_rate = int(meta["sample_rate"])
        self.state_dim = int(meta["state_dim"])

    def get_init_state(self, batch_size=1):
        """Return an all-zero initial state of shape (batch_size, state_dim)."""
        return np.zeros((batch_size, self.state_dim), dtype=np.float16)

    def __call__(self, x, state):
        """
        Args:
          x: (batch_size, num_samples, 1), int32
          state: (batch_size, 219729)
        Returns:
          log_probs: (batch_size, num_frames, vocab_size)
          next_state: (batch_size, 219729)
        """
        inputs = self.model.get_inputs()
        outputs = self.model.get_outputs()
        log_prob, next_state = self.model.run(
            [outputs[0].name, outputs[1].name],
            {inputs[0].name: x, inputs[1].name: state},
        )
        return log_prob, next_state
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read *filename* and return (mono float32 samples, sample rate)."""
    audio, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    # Keep only channel 0 and make it contiguous for downstream consumers.
    mono = np.ascontiguousarray(audio[:, 0])
    return mono, sample_rate
def load_tokens(filename):
    """Build an id -> token map from a sherpa-onnx ``tokens.txt`` file.

    A line with a single field after stripping is the space token: the
    token column was a literal space that ``strip``/``split`` removed.
    """
    id2token = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 1:
                # Only the id survived; the token itself was a space.
                id2token[int(parts[0])] = " "
            else:
                id2token[int(parts[1])] = parts[0]
    return id2token
def compute_feat(
    samples,
    sample_rate,
    frame_length_ms: int,
):
    """Chunk *samples* into raw-audio frames and return them as int32.

    Frame length and shift are both *frame_length_ms*, so consecutive
    frames do not overlap.
    """
    opts = knf.RawAudioSamplesOptions()
    opts.frame_opts.samp_freq = sample_rate
    opts.frame_opts.frame_length_ms = frame_length_ms
    opts.frame_opts.frame_shift_ms = frame_length_ms

    extractor = knf.OnlineRawAudioSamples(opts)
    extractor.accept_waveform(sample_rate, samples)
    extractor.input_finished()

    frames = [extractor.get_frame(i) for i in range(extractor.num_frames_ready)]

    # Scale float samples by 32768 to the 16-bit integer range the model expects.
    return (np.array(frames, dtype=np.float32) * 32768).astype(np.int32)
def main():
args = get_args()
print(vars(args))
model = OnnxModel(filename=args.model)
samples, sample_rate = load_audio(args.wave)
if sample_rate != model.sample_rate:
import librosa
samples = librosa.resample(
samples, orig_sr=sample_rate, target_sr=model.sample_rate
)
sample_rate = model.sample_rate
# Pad 0.5 seconds
samples = np.pad(samples, (0, 4000))
features = compute_feat(
samples=samples,
sample_rate=sample_rate,
frame_length_ms=model.frame_length_ms,
)
print(features.shape)
id2token = load_tokens(args.tokens)
blank = -2
for idx, token in id2token.items():
if token == "<blk>":
blank = idx
state = model.get_init_state()
token_id_list = []
for f in features:
log_probs, state = model(f[None, :, None], state)
max_token_ids = log_probs[0].argmax(axis=-1).tolist()
token_id_list += max_token_ids
unique_ids = []
prev = -1
for t in token_id_list:
if t == blank:
prev = t
continue
if t == prev:
continue
prev = t
unique_ids.append(prev)
text = "".join([id2token[i] for i in unique_ids])
print(text)
if __name__ == "__main__":
main()
... ...