Fangjun Kuang
Committed by GitHub

Add various language bindings for streaming T-one Russian ASR models (#2576)

This PR adds support for streaming T-one Russian ASR models across various language bindings in the sherpa-onnx library. The changes enable T-one CTC (Connectionist Temporal Classification) model integration by adding new configuration structures and example implementations.

- Introduces OnlineToneCtcModelConfig structures across all language bindings (C, C++, Swift, Java, Kotlin, Go, etc.)
- Adds T-one CTC model support to WASM implementations for both ASR and keyword spotting
- Provides comprehensive example implementations demonstrating T-one model usage in multiple programming languages
正在显示 62 个修改的文件，包含 1351 行增加、96 行删除
... ... @@ -4,6 +4,36 @@ set -ex
cd dart-api-examples
pushd streaming-asr
echo '----------streaming T-one ctc----------'
./run-t-one-ctc.sh
rm -rf sherpa-onnx-*
echo '----------streaming zipformer ctc HLG----------'
./run-zipformer-ctc-hlg.sh
rm -rf sherpa-onnx-*
echo '----------streaming zipformer ctc----------'
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-*
echo '----------streaming zipformer transducer----------'
./run-zipformer-transducer-itn.sh
./run-zipformer-transducer.sh
rm -f itn*
rm -rf sherpa-onnx-*
echo '----------streaming NeMo transducer----------'
./run-nemo-transducer.sh
rm -rf sherpa-onnx-*
echo '----------streaming paraformer----------'
./run-paraformer.sh
rm -rf sherpa-onnx-*
popd # streaming-asr
pushd tts
echo '----------matcha tts----------'
... ... @@ -167,29 +197,3 @@ popd
pushd keyword-spotter
./run-zh.sh
popd
pushd streaming-asr
echo '----------streaming zipformer ctc HLG----------'
./run-zipformer-ctc-hlg.sh
rm -rf sherpa-onnx-*
echo '----------streaming zipformer ctc----------'
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-*
echo '----------streaming zipformer transducer----------'
./run-zipformer-transducer-itn.sh
./run-zipformer-transducer.sh
rm -f itn*
rm -rf sherpa-onnx-*
echo '----------streaming NeMo transducer----------'
./run-nemo-transducer.sh
rm -rf sherpa-onnx-*
echo '----------streaming paraformer----------'
./run-paraformer.sh
rm -rf sherpa-onnx-*
popd # streaming-asr
... ...
... ... @@ -10,6 +10,17 @@ arch=$(node -p "require('os').arch()")
platform=$(node -p "require('os').platform()")
node_version=$(node -p "process.versions.node.split('.')[0]")
echo "----------streaming ASR T-one----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
node ./test_asr_streaming_t_one_ctc.js
rm -rf sherpa-onnx-streaming-t-one-russian-2025-09-08
echo "----------KittenTTS----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
tar xf kitten-nano-en-v0_1-fp16.tar.bz2
rm kitten-nano-en-v0_1-fp16.tar.bz2
... ...
... ... @@ -9,6 +9,13 @@ git status
ls -lh
ls -lh node_modules
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
node ./test-online-t-one-ctc.js
rm -rf sherpa-onnx-streaming-t-one-russian-2025-09-08
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-nano-en-v0_1-fp16.tar.bz2
tar xf kitten-nano-en-v0_1-fp16.tar.bz2
rm kitten-nano-en-v0_1-fp16.tar.bz2
... ...
... ... @@ -9,6 +9,9 @@ ls -lh
./run-test-version.sh
./run-decode-file-t-one-streaming.sh
rm -rf sherpa-onnx-streaming-*
./run-compute-speaker-embeddings.sh
rm -fv *.wav *.onnx
... ...
... ... @@ -75,6 +75,36 @@ jobs:
otool -L ./install/lib/libsherpa-onnx-c-api.dylib
fi
- name: Test T-one
shell: bash
run: |
name=streaming-t-one-ctc-c-api
gcc -o $name ./c-api-examples/$name.c \
-I ./build/install/include \
-L ./build/install/lib/ \
-l sherpa-onnx-c-api \
-l onnxruntime
ls -lh $name
if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
ldd ./$name
echo "----"
readelf -d ./$name
fi
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
./$name
rm $name
rm -rf sherpa-onnx-streaming-t-one-russian-2025-09-08
- name: Test KittenTTS
shell: bash
run: |
... ... @@ -530,7 +560,8 @@ jobs:
rm -rf sherpa-onnx-*
- name: Test ffmpeg
if: matrix.os == 'macos-latest'
# if: matrix.os == 'macos-latest'
if: false
shell: bash
run: |
brew install ffmpeg
... ...
... ... @@ -78,6 +78,40 @@ jobs:
otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
fi
- name: Test T-one
shell: bash
run: |
name=streaming-t-one-ctc-cxx-api
g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \
-I ./build/install/include \
-L ./build/install/lib/ \
-l sherpa-onnx-cxx-api \
-l sherpa-onnx-c-api \
-l onnxruntime
ls -lh $name
if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
ls -lh ./$name
ldd ./$name
echo "----"
readelf -d ./$name
fi
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
echo "---"
export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
./$name
rm -rf sherpa-onnx-streaming-t-one-russian-2025-09-08
rm -v ./$name
- name: Test KittenTTS
shell: bash
run: |
... ...
... ... @@ -126,6 +126,43 @@ jobs:
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
fi
- name: Run Pascal test (Streaming ASR)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd streaming-asr
./run-t-one-ctc.sh
rm -rf sherpa-onnx-*
echo "---"
./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
./run-nemo-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
if [[ ${{ matrix.os }} != 'windows-latest' ]]; then
./run-paraformer.sh
rm -rf sherpa-onnx-*
echo "---"
./run-zipformer-ctc.sh
echo "---"
./run-zipformer-ctc-hlg.sh
rm -rf sherpa-onnx-*
echo "---"
fi
ls -lh
popd
- name: Run Pascal test (VAD test)
shell: bash
run: |
... ... @@ -321,36 +358,3 @@ jobs:
echo "---"
ls -lh
popd
- name: Run Pascal test (Streaming ASR)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd streaming-asr
./run-zipformer-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
./run-nemo-transducer.sh
rm -rf sherpa-onnx-*
echo "---"
if [[ ${{ matrix.os }} != 'windows-latest' ]]; then
./run-paraformer.sh
rm -rf sherpa-onnx-*
echo "---"
./run-zipformer-ctc.sh
echo "---"
./run-zipformer-ctc-hlg.sh
rm -rf sherpa-onnx-*
echo "---"
fi
ls -lh
popd
... ...
... ... @@ -108,6 +108,13 @@ jobs:
cd ./java-api-examples
./run-version-test.sh
- name: Run java test (Streaming T-one)
shell: bash
run: |
cd ./java-api-examples
./run-streaming-decode-file-tone-ctc.sh
rm -rf sherpa-onnx-streaming-t-one-*
- name: Run java test (Nemo Canary)
shell: bash
run: |
... ...
... ... @@ -140,19 +140,6 @@ jobs:
name: ${{ matrix.os }}-libs
path: to-upload/
- name: Test non-streaming decoding files with NeMo Canary
shell: bash
run: |
cd scripts/go/_internal/non-streaming-canary-decode-files/
ls -lh
go mod tidy
cat go.mod
go build
ls -lh
./run.sh
rm -rf sherpa-onnx-nemo-*
- name: Test streaming decoding files
shell: bash
run: |
... ... @@ -163,6 +150,9 @@ jobs:
go build
ls -lh
echo "Test T-one CTC"
./run-t-one-ctc.sh
echo "Test zipformer2 CTC"
./run-zipformer2-ctc-with-hr.sh
./run-zipformer2-ctc.sh
... ... @@ -179,6 +169,21 @@ jobs:
./run-paraformer.sh
rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en
- name: Test non-streaming decoding files with NeMo Canary
shell: bash
run: |
cd scripts/go/_internal/non-streaming-canary-decode-files/
ls -lh
go mod tidy
cat go.mod
go build
ls -lh
./run.sh
rm -rf sherpa-onnx-nemo-*
- name: Test non-streaming decoding files
shell: bash
run: |
... ...
... ... @@ -150,3 +150,4 @@ kitten-nano-en-v0_1-fp16
*.jar
vocab.json
*.so
sherpa-onnx-streaming-t-one-russian-2025-09-08
... ...
... ... @@ -44,6 +44,9 @@ target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
add_executable(streaming-t-one-ctc-c-api streaming-t-one-ctc-c-api.c)
target_link_libraries(streaming-t-one-ctc-c-api sherpa-onnx-c-api)
add_executable(audio-tagging-c-api audio-tagging-c-api.c)
target_link_libraries(audio-tagging-c-api sherpa-onnx-c-api)
... ...
// c-api-examples/streaming-t-one-ctc-c-api.c
//
// Copyright (c) 2025 Xiaomi Corporation
//
// This file demonstrates how to use streaming T-one with sherpa-onnx's C
// API.
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
// tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
// rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
//
// clang-format on

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "sherpa-onnx/c-api/c-api.h"

int32_t main() {
  // All paths are relative to the extracted model directory; see the
  // download commands at the top of this file.
  const char *wav_filename =
      "sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav";
  const char *model =
      "sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx";
  const char *tokens =
      "sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt";
  const char *provider = "cpu";

  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  if (wave == NULL) {
    fprintf(stderr, "Failed to read %s\n", wav_filename);
    return -1;
  }

  // T-one CTC model config
  SherpaOnnxOnlineToneCtcModelConfig t_one_ctc;
  memset(&t_one_ctc, 0, sizeof(t_one_ctc));
  t_one_ctc.model = model;

  // Online (streaming) model config
  SherpaOnnxOnlineModelConfig online_model_config;
  memset(&online_model_config, 0, sizeof(online_model_config));
  online_model_config.debug = 1;
  online_model_config.num_threads = 1;
  online_model_config.provider = provider;
  online_model_config.tokens = tokens;
  online_model_config.t_one_ctc = t_one_ctc;

  // Recognizer config
  SherpaOnnxOnlineRecognizerConfig recognizer_config;
  memset(&recognizer_config, 0, sizeof(recognizer_config));
  recognizer_config.decoding_method = "greedy_search";
  recognizer_config.model_config = online_model_config;

  const SherpaOnnxOnlineRecognizer *recognizer =
      SherpaOnnxCreateOnlineRecognizer(&recognizer_config);
  if (recognizer == NULL) {
    fprintf(stderr, "Please check your config!\n");
    SherpaOnnxFreeWave(wave);
    return -1;
  }

  const SherpaOnnxOnlineStream *stream =
      SherpaOnnxCreateOnlineStream(recognizer);

  const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50);
  int32_t segment_id = 0;

// simulate streaming. You can choose an arbitrary N
#define N 3200

  fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
          wave->sample_rate, wave->num_samples,
          (float)wave->num_samples / wave->sample_rate);

  // Feed some leading silence before the real audio.
  float left_paddings[2400] = {0};  // 0.3 seconds at 8 kHz sample rate
  SherpaOnnxOnlineStreamAcceptWaveform(stream, wave->sample_rate, left_paddings,
                                       2400);

  int32_t k = 0;
  while (k < wave->num_samples) {
    // Feed at most N samples per iteration to simulate a live stream.
    int32_t start = k;
    int32_t end =
        (start + N > wave->num_samples) ? wave->num_samples : (start + N);
    k += N;

    SherpaOnnxOnlineStreamAcceptWaveform(stream, wave->sample_rate,
                                         wave->samples + start, end - start);
    // Decode whatever frames are available so far.
    while (SherpaOnnxIsOnlineStreamReady(recognizer, stream)) {
      SherpaOnnxDecodeOnlineStream(recognizer, stream);
    }

    const SherpaOnnxOnlineRecognizerResult *r =
        SherpaOnnxGetOnlineStreamResult(recognizer, stream);

    if (strlen(r->text)) {
      SherpaOnnxPrint(display, segment_id, r->text);
    }

    // An endpoint closes the current segment; reset the stream for the
    // next one.
    if (SherpaOnnxOnlineStreamIsEndpoint(recognizer, stream)) {
      if (strlen(r->text)) {
        ++segment_id;
      }
      SherpaOnnxOnlineStreamReset(recognizer, stream);
    }

    SherpaOnnxDestroyOnlineRecognizerResult(r);
  }

  // add some tail padding
  float tail_paddings[4800] = {0};  // 0.6 seconds at 8 kHz sample rate
  SherpaOnnxOnlineStreamAcceptWaveform(stream, wave->sample_rate, tail_paddings,
                                       4800);

  SherpaOnnxOnlineStreamInputFinished(stream);
  while (SherpaOnnxIsOnlineStreamReady(recognizer, stream)) {
    SherpaOnnxDecodeOnlineStream(recognizer, stream);
  }

  SherpaOnnxFreeWave(wave);

  // Print the final result of the last segment.
  const SherpaOnnxOnlineRecognizerResult *r =
      SherpaOnnxGetOnlineStreamResult(recognizer, stream);
  if (strlen(r->text)) {
    SherpaOnnxPrint(display, segment_id, r->text);
  }
  SherpaOnnxDestroyOnlineRecognizerResult(r);

  SherpaOnnxDestroyDisplay(display);
  SherpaOnnxDestroyOnlineStream(stream);
  SherpaOnnxDestroyOnlineRecognizer(recognizer);
  fprintf(stderr, "\n");

  return 0;
}
... ...
... ... @@ -15,6 +15,9 @@ target_link_libraries(kws-cxx-api sherpa-onnx-cxx-api)
add_executable(streaming-zipformer-rtf-cxx-api ./streaming-zipformer-rtf-cxx-api.cc)
target_link_libraries(streaming-zipformer-rtf-cxx-api sherpa-onnx-cxx-api)
add_executable(streaming-t-one-ctc-cxx-api streaming-t-one-ctc-cxx-api.cc)
target_link_libraries(streaming-t-one-ctc-cxx-api sherpa-onnx-cxx-api)
add_executable(whisper-cxx-api ./whisper-cxx-api.cc)
target_link_libraries(whisper-cxx-api sherpa-onnx-cxx-api)
... ...
// cxx-api-examples/streaming-t-one-ctc-cxx-api.cc
// Copyright (c) 2025 Xiaomi Corporation

//
// This file demonstrates how to use streaming T-one
// with sherpa-onnx's C++ API.
//
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
// tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
// rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
//
// clang-format on

#include <chrono>  // NOLINT
#include <iostream>
#include <string>
#include <vector>  // std::vector is used below for the padding buffers

#include "sherpa-onnx/c-api/cxx-api.h"

int32_t main() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  OnlineRecognizerConfig config;

  // Paths below are relative to the extracted model directory; see the
  // download commands at the top of this file.
  config.model_config.t_one_ctc.model =
      "sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx";
  config.model_config.tokens =
      "sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt";

  config.model_config.num_threads = 1;

  std::cout << "Loading model\n";
  OnlineRecognizer recognizer = OnlineRecognizer::Create(config);
  if (!recognizer.Get()) {
    std::cerr << "Please check your config\n";
    return -1;
  }
  std::cout << "Loading model done\n";

  std::string wave_filename =
      "sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav";

  Wave wave = ReadWave(wave_filename);
  if (wave.samples.empty()) {
    std::cerr << "Failed to read: '" << wave_filename << "'\n";
    return -1;
  }

  std::cout << "Start recognition\n";
  const auto begin = std::chrono::steady_clock::now();

  OnlineStream stream = recognizer.CreateStream();

  // Pad the utterance with silence on both sides (zero-initialized vectors).
  std::vector<float> left_padding(2400);  // 0.3 seconds at 8kHz
  std::vector<float> tail_padding(4800);  // 0.6 seconds at 8kHz

  stream.AcceptWaveform(wave.sample_rate, left_padding.data(),
                        left_padding.size());

  stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
                        wave.samples.size());

  stream.AcceptWaveform(wave.sample_rate, tail_padding.data(),
                        tail_padding.size());

  stream.InputFinished();
  while (recognizer.IsReady(&stream)) {
    recognizer.Decode(&stream);
  }
  OnlineRecognizerResult result = recognizer.GetResult(&stream);

  const auto end = std::chrono::steady_clock::now();
  const float elapsed_seconds =
      std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
          .count() /
      1000.;
  float duration = wave.samples.size() / static_cast<float>(wave.sample_rate);
  // RTF < 1 means the decode ran faster than real time.
  float rtf = elapsed_seconds / duration;

  std::cout << "text: " << result.text << "\n";
  printf("Number of threads: %d\n", config.model_config.num_threads);
  printf("Duration: %.3fs\n", duration);
  printf("Elapsed seconds: %.3fs\n", elapsed_seconds);
  printf("(Real time factor) RTF = %.3f / %.3f = %.3f\n", elapsed_seconds,
         duration, rtf);

  return 0;
}
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';
import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('model', help: 'Path to the model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  // All three options are required; print usage and exit otherwise.
  if (res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  // Configure the streaming T-one CTC model.
  final ctc = sherpa_onnx.OnlineToneCtcModelConfig(
    model: model,
  );

  final modelConfig = sherpa_onnx.OnlineModelConfig(
    toneCtc: ctc,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OnlineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OnlineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  final stream = recognizer.createStream();

  // 0.3 seconds of leading silence, assume sampleRate is 8kHz
  final leftPaddings = Float32List(2400);
  stream.acceptWaveform(
    samples: leftPaddings,
    sampleRate: waveData.sampleRate,
  );

  // simulate streaming. You can choose an arbitrary chunk size.
  // chunkSize of a single sample is also ok, i.e, chunkSize = 1
  // NOTE(review): 1600 samples is 0.2 seconds at the 8 kHz rate assumed by
  // the padding buffers above (the original comment said 0.1 s at 16 kHz).
  final chunkSize = 1600;
  final numChunks = waveData.samples.length ~/ chunkSize;

  var last = '';
  for (int i = 0; i != numChunks; ++i) {
    int start = i * chunkSize;
    stream.acceptWaveform(
      samples:
          Float32List.sublistView(waveData.samples, start, start + chunkSize),
      sampleRate: waveData.sampleRate,
    );
    while (recognizer.isReady(stream)) {
      recognizer.decode(stream);
    }

    // Print the partial result only when it changes and is non-empty.
    final result = recognizer.getResult(stream);
    if (result.text != last && result.text != '') {
      last = result.text;
      print(last);
    }
  }

  // 0.6 seconds of trailing silence, assume sampleRate is 8kHz
  final tailPaddings = Float32List(4800);
  stream.acceptWaveform(
    samples: tailPaddings,
    sampleRate: waveData.sampleRate,
  );

  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  // Final result after all audio (plus tail padding) has been consumed.
  final result = recognizer.getResult(stream);
  if (result.text != '') {
    print(result.text);
  }

  stream.free();
  recognizer.free();
}
... ...
#!/usr/bin/env bash

set -ex

dart pub get

model_dir=sherpa-onnx-streaming-t-one-russian-2025-09-08

# Download and unpack the pre-trained model only on the first run.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2
  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

dart run \
  ./bin/t-one-ctc.dart \
  --model ./$model_dir/model.onnx \
  --tokens ./$model_dir/tokens.txt \
  --input-wav ./$model_dir/0.wav
... ...
... ... @@ -38,6 +38,9 @@ class OnlineDecodeFiles
[Option("zipformer2-ctc", Required = false, HelpText = "Path to zipformer2 CTC onnx model")]
public string Zipformer2Ctc { get; set; } = string.Empty;
[Option("t-one-ctc", Required = false, HelpText = "Path to T-one CTC onnx model")]
public string ToneCtc { get; set; } = string.Empty;
[Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
public int NumThreads { get; set; } = 1;
... ... @@ -173,6 +176,7 @@ to download pre-trained streaming models.
config.ModelConfig.Paraformer.Decoder = options.ParaformerDecoder;
config.ModelConfig.Zipformer2Ctc.Model = options.Zipformer2Ctc;
config.ModelConfig.ToneCtc.Model = options.ToneCtc;
config.ModelConfig.Tokens = options.Tokens;
config.ModelConfig.Provider = options.Provider;
... ... @@ -203,10 +207,15 @@ to download pre-trained streaming models.
var s = recognizer.CreateStream();
var waveReader = new WaveReader(files[i]);
var leftPadding = new float[(int)(waveReader.SampleRate * 0.3)];
s.AcceptWaveform(waveReader.SampleRate, leftPadding);
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
var tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
var tailPadding = new float[(int)(waveReader.SampleRate * 0.6)];
s.AcceptWaveform(waveReader.SampleRate, tailPadding);
s.InputFinished();
streams.Add(s);
... ...
#!/usr/bin/env bash

set -ex

model_dir=sherpa-onnx-streaming-t-one-russian-2025-09-08

# Fetch the pre-trained model unless it is already unpacked.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2
  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

dotnet run -c Release \
  --tokens ./$model_dir/tokens.txt \
  --t-one-ctc ./$model_dir/model.onnx \
  --files ./$model_dir/0.wav
... ...
... ... @@ -107,12 +107,34 @@ class OnlineNemoCtcModelConfig {
final String model;
}
/// Configuration for the streaming T-one CTC model.
class OnlineToneCtcModelConfig {
  const OnlineToneCtcModelConfig({this.model = ''});

  /// Builds a config from a JSON map; a missing or null `model` key
  /// defaults to an empty string.
  factory OnlineToneCtcModelConfig.fromJson(Map<String, dynamic> json) {
    return OnlineToneCtcModelConfig(
      model: json['model'] as String? ?? '',
    );
  }

  @override
  String toString() {
    return 'OnlineToneCtcModelConfig(model: $model)';
  }

  /// Serializes this config to a JSON map.
  Map<String, dynamic> toJson() => {
        'model': model,
      };

  // Path to the T-one CTC onnx model; an empty string means unused.
  final String model;
}
class OnlineModelConfig {
const OnlineModelConfig({
this.transducer = const OnlineTransducerModelConfig(),
this.paraformer = const OnlineParaformerModelConfig(),
this.zipformer2Ctc = const OnlineZipformer2CtcModelConfig(),
this.nemoCtc = const OnlineNemoCtcModelConfig(),
this.toneCtc = const OnlineToneCtcModelConfig(),
required this.tokens,
this.numThreads = 1,
this.provider = 'cpu',
... ... @@ -132,6 +154,8 @@ class OnlineModelConfig {
json['zipformer2Ctc'] as Map<String, dynamic>? ?? const {}),
nemoCtc: OnlineNemoCtcModelConfig.fromJson(
json['nemoCtc'] as Map<String, dynamic>? ?? const {}),
toneCtc: OnlineToneCtcModelConfig.fromJson(
json['toneCtc'] as Map<String, dynamic>? ?? const {}),
tokens: json['tokens'] as String,
numThreads: json['numThreads'] as int? ?? 1,
provider: json['provider'] as String? ?? 'cpu',
... ... @@ -144,7 +168,7 @@ class OnlineModelConfig {
@override
String toString() {
return 'OnlineModelConfig(transducer: $transducer, paraformer: $paraformer, zipformer2Ctc: $zipformer2Ctc, nemoCtc: $nemoCtc, tokens: $tokens, numThreads: $numThreads, provider: $provider, debug: $debug, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab)';
return 'OnlineModelConfig(transducer: $transducer, paraformer: $paraformer, zipformer2Ctc: $zipformer2Ctc, nemoCtc: $nemoCtc, toneCtc: $toneCtc, tokens: $tokens, numThreads: $numThreads, provider: $provider, debug: $debug, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab)';
}
Map<String, dynamic> toJson() => {
... ... @@ -152,6 +176,7 @@ class OnlineModelConfig {
'paraformer': paraformer.toJson(),
'zipformer2Ctc': zipformer2Ctc.toJson(),
'nemoCtc': nemoCtc.toJson(),
'toneCtc': toneCtc.toJson(),
'tokens': tokens,
'numThreads': numThreads,
'provider': provider,
... ... @@ -165,6 +190,7 @@ class OnlineModelConfig {
final OnlineParaformerModelConfig paraformer;
final OnlineZipformer2CtcModelConfig zipformer2Ctc;
final OnlineNemoCtcModelConfig nemoCtc;
final OnlineToneCtcModelConfig toneCtc;
final String tokens;
... ... @@ -362,6 +388,9 @@ class OnlineRecognizer {
// nemoCtc
c.ref.model.nemoCtc.model = config.model.nemoCtc.model.toNativeUtf8();
// toneCtc
c.ref.model.toneCtc.model = config.model.toneCtc.model.toNativeUtf8();
c.ref.model.tokens = config.model.tokens.toNativeUtf8();
c.ref.model.numThreads = config.model.numThreads;
c.ref.model.provider = config.model.provider.toNativeUtf8();
... ... @@ -415,6 +444,7 @@ class OnlineRecognizer {
calloc.free(c.ref.model.modelType);
calloc.free(c.ref.model.provider);
calloc.free(c.ref.model.tokens);
calloc.free(c.ref.model.toneCtc.model);
calloc.free(c.ref.model.nemoCtc.model);
calloc.free(c.ref.model.zipformer2Ctc.model);
calloc.free(c.ref.model.paraformer.encoder);
... ...
... ... @@ -403,6 +403,10 @@ final class SherpaOnnxOnlineNemoCtcModelConfig extends Struct {
external Pointer<Utf8> model;
}
/// FFI mirror of the C struct `SherpaOnnxOnlineToneCtcModelConfig`.
final class SherpaOnnxOnlineToneCtcModelConfig extends Struct {
  // Path to the T-one CTC onnx model, as a C string.
  external Pointer<Utf8> model;
}
final class SherpaOnnxOnlineModelConfig extends Struct {
external SherpaOnnxOnlineTransducerModelConfig transducer;
external SherpaOnnxOnlineParaformerModelConfig paraformer;
... ... @@ -430,6 +434,8 @@ final class SherpaOnnxOnlineModelConfig extends Struct {
external int tokensBufSize;
external SherpaOnnxOnlineNemoCtcModelConfig nemoCtc;
external SherpaOnnxOnlineToneCtcModelConfig toneCtc;
}
final class SherpaOnnxOnlineCtcFstDecoderConfig extends Struct {
... ...
... ... @@ -27,6 +27,7 @@ func main() {
flag.StringVar(&config.ModelConfig.Paraformer.Encoder, "paraformer-encoder", "", "Path to the paraformer encoder model")
flag.StringVar(&config.ModelConfig.Paraformer.Decoder, "paraformer-decoder", "", "Path to the paraformer decoder model")
flag.StringVar(&config.ModelConfig.Zipformer2Ctc.Model, "zipformer2-ctc", "", "Path to the zipformer2 CTC model")
flag.StringVar(&config.ModelConfig.ToneCtc.Model, "t-one-ctc", "", "Path to the T-one CTC model")
flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file")
flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing")
flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message")
... ... @@ -59,9 +60,12 @@ func main() {
stream := sherpa.NewOnlineStream(recognizer)
defer sherpa.DeleteOnlineStream(stream)
leftPadding := make([]float32, int(float32(sampleRate)*0.3))
stream.AcceptWaveform(sampleRate, leftPadding)
stream.AcceptWaveform(sampleRate, samples)
tailPadding := make([]float32, int(float32(sampleRate)*0.3))
tailPadding := make([]float32, int(float32(sampleRate)*0.6))
stream.AcceptWaveform(sampleRate, tailPadding)
for recognizer.IsReady(stream) {
... ...
#!/usr/bin/env bash

set -ex

model_dir=sherpa-onnx-streaming-t-one-russian-2025-09-08

# Download and unpack the model on the first run only.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2
  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

go mod tidy
go build

./streaming-decode-files \
  --t-one-ctc ./$model_dir/model.onnx \
  --tokens ./$model_dir/tokens.txt \
  ./$model_dir/0.wav
... ...
... ... @@ -26,8 +26,9 @@ export { Samples,
export { OnlineStream,
OnlineNemoCtcModelConfig,
OnlineTransducerModelConfig,
OnlineParaformerModelConfig,
OnlineToneCtcModelConfig,
OnlineTransducerModelConfig,
OnlineZipformer2CtcModelConfig,
OnlineModelConfig,
OnlineCtcFstDecoderConfig,
... ...
... ... @@ -89,6 +89,22 @@ static SherpaOnnxOnlineNemoCtcModelConfig GetOnlineNemoCtcModelConfig(
return c;
}
// Reads obj["toneCtc"] into a SherpaOnnxOnlineToneCtcModelConfig.
// Returns a zero-initialized config when the attribute is absent or is
// not a JS object; otherwise copies the "model" string field via
// SHERPA_ONNX_ASSIGN_ATTR_STR (which allocates a C string the caller
// later frees with SHERPA_ONNX_DELETE_C_STR).
static SherpaOnnxOnlineToneCtcModelConfig GetOnlineToneCtcModelConfig(
    Napi::Object obj) {
  SherpaOnnxOnlineToneCtcModelConfig c;
  memset(&c, 0, sizeof(c));

  if (!obj.Has("toneCtc") || !obj.Get("toneCtc").IsObject()) {
    return c;
  }

  Napi::Object o = obj.Get("toneCtc").As<Napi::Object>();

  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);

  return c;
}
static SherpaOnnxOnlineParaformerModelConfig GetOnlineParaformerModelConfig(
Napi::Object obj) {
SherpaOnnxOnlineParaformerModelConfig c;
... ... @@ -120,6 +136,7 @@ SherpaOnnxOnlineModelConfig GetOnlineModelConfig(Napi::Object obj) {
c.paraformer = GetOnlineParaformerModelConfig(o);
c.zipformer2_ctc = GetOnlineZipformer2CtcModelConfig(o);
c.nemo_ctc = GetOnlineNemoCtcModelConfig(o);
c.t_one_ctc = GetOnlineToneCtcModelConfig(o);
SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
... ... @@ -265,6 +282,7 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
SHERPA_ONNX_DELETE_C_STR(c.model_config.paraformer.encoder);
SHERPA_ONNX_DELETE_C_STR(c.model_config.paraformer.decoder);
SHERPA_ONNX_DELETE_C_STR(c.model_config.t_one_ctc.model);
SHERPA_ONNX_DELETE_C_STR(c.model_config.nemo_ctc.model);
SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer2_ctc.model);
SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
... ...
... ... @@ -50,11 +50,16 @@ export class OnlineNemoCtcModelConfig {
public model: string = '';
}
/** Configuration for the streaming T-one CTC model. */
export class OnlineToneCtcModelConfig {
  /** Path to the T-one CTC onnx model; an empty string means unused. */
  public model: string = '';
}
export class OnlineModelConfig {
public transducer: OnlineTransducerModelConfig = new OnlineTransducerModelConfig();
public paraformer: OnlineParaformerModelConfig = new OnlineParaformerModelConfig();
public zipformer2_ctc: OnlineZipformer2CtcModelConfig = new OnlineZipformer2CtcModelConfig();
public nemo_ctc: OnlineNemoCtcModelConfig = new OnlineNemoCtcModelConfig();
public zipformer2Ctc: OnlineZipformer2CtcModelConfig = new OnlineZipformer2CtcModelConfig();
public nemoCtc: OnlineNemoCtcModelConfig = new OnlineNemoCtcModelConfig();
public toneCtc: OnlineToneCtcModelConfig = new OnlineToneCtcModelConfig();
public tokens: string = '';
public numThreads: number = 1;
public provider: string = 'cpu';
... ...
// Copyright 2024 Xiaomi Corporation

// This file shows how to use an online T-one CTC model, i.e.,
// streaming T-one CTC model, to decode files.
import com.k2fsa.sherpa.onnx.*;

public class StreamingDecodeFileToneCtc {
  public static void main(String[] args) {
    // Paths inside the downloaded model directory; see the accompanying
    // run script for the download commands.
    String model = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx";
    String tokens = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt";

    String waveFilename = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav";
    WaveReader reader = new WaveReader(waveFilename);

    // Only the T-one CTC model is configured; other model slots stay empty.
    OnlineToneCtcModelConfig ctc = OnlineToneCtcModelConfig.builder().setModel(model).build();

    OnlineModelConfig modelConfig =
        OnlineModelConfig.builder()
            .setToneCtc(ctc)
            .setTokens(tokens)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    OnlineRecognizerConfig config =
        OnlineRecognizerConfig.builder()
            .setOnlineModelConfig(modelConfig)
            .setDecodingMethod("greedy_search")
            .build();

    OnlineRecognizer recognizer = new OnlineRecognizer(config);
    OnlineStream stream = recognizer.createStream();

    // Pad the audio with silence on both sides: 0.3 s before, 0.6 s after.
    float[] leftPaddings = new float[(int) (0.3 * reader.getSampleRate())];
    stream.acceptWaveform(leftPaddings, reader.getSampleRate());

    stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());

    float[] tailPaddings = new float[(int) (0.6 * reader.getSampleRate())];
    stream.acceptWaveform(tailPaddings, reader.getSampleRate());

    // Decode everything that is available, then print the result.
    while (recognizer.isReady(stream)) {
      recognizer.decode(stream);
    }
    String text = recognizer.getResult(stream).getText();

    System.out.printf("filename:%s\nresult:%s\n", waveFilename, text);

    stream.release();
    recognizer.release();
  }
}
... ...
#!/usr/bin/env bash

set -ex

# Build the JNI shared library if it has not been built yet.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..
  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

model_dir=sherpa-onnx-streaming-t-one-russian-2025-09-08

# Download and unpack the T-one model on the first run only.
if [ ! -f ./$model_dir/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2
  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
fi

java \
  -Djava.library.path=$PWD/../build/lib \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  StreamingDecodeFileToneCtc.java
... ...
... ... @@ -72,6 +72,12 @@ function testSpeakerEmbeddingExtractor() {
function testOnlineAsr() {
if [ ! -f ./sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
fi
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
git lfs install
GIT_CLONE_PROTECTION_ACTIVE=false git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
... ...
... ... @@ -5,6 +5,7 @@ fun main() {
testOnlineAsr("zipformer2-ctc")
testOnlineAsr("ctc-hlg")
testOnlineAsr("nemo-ctc")
testOnlineAsr("tone-ctc")
}
fun testOnlineAsr(type: String) {
... ... @@ -54,6 +55,17 @@ fun testOnlineAsr(type: String) {
debug = false,
)
}
"tone-ctc" -> {
waveFilename = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav"
OnlineModelConfig(
toneCtc = OnlineToneCtcModelConfig(
model = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx",
),
tokens = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt",
numThreads = 1,
debug = false,
)
}
"ctc-hlg" -> {
waveFilename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/1.wav"
ctcFstDecoderConfig.graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst"
... ... @@ -95,12 +107,16 @@ fun testOnlineAsr(type: String) {
val sampleRate: Int = objArray[1] as Int
val stream = recognizer.createStream()
val leftPaddings = FloatArray((sampleRate * 0.3).toInt()) // 0.3 seconds
stream.acceptWaveform(leftPaddings, sampleRate = sampleRate)
stream.acceptWaveform(samples, sampleRate = sampleRate)
while (recognizer.isReady(stream)) {
recognizer.decode(stream)
}
val tailPaddings = FloatArray((sampleRate * 0.5).toInt()) // 0.5 seconds
val tailPaddings = FloatArray((sampleRate * 0.6).toInt()) // 0.6 seconds
stream.acceptWaveform(tailPaddings, sampleRate = sampleRate)
stream.inputFinished()
while (recognizer.isReady(stream)) {
... ...
... ... @@ -97,6 +97,7 @@ The following tables list the examples in this folder.
|File| Description|
|---|---|
|[./test_asr_streaming_t_one_ctc.js](./test_asr_streaming_t_one_ctc.js)| Streaming speech recognition from a file using a T-one CTC model|
|[./test_asr_streaming_transducer.js](./test_asr_streaming_transducer.js)| Streaming speech recognition from a file using a Zipformer transducer model|
|[./test_asr_streaming_transducer_with_hr.js](./test_asr_streaming_transducer_with_hr.js)| Streaming speech recognition from a file using a Zipformer transducer model with homophone replacer|
|[./test_asr_streaming_ctc.js](./test_asr_streaming_ctc.js)| Streaming speech recognition from a file using a Zipformer CTC model with greedy search|
... ... @@ -230,6 +231,16 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/hr-files/lex
node ./test_asr_streaming_transducer_with_hr.js
```
### Streaming speech recognition with T-one CTC
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
node ./test_asr_streaming_t_one_ctc.js
```
### Streaming speech recognition with Zipformer transducer
```bash
... ...
// Copyright (c) 2025 Xiaomi Corporation
//
// This script decodes a wave file with a streaming T-one CTC model
// using sherpa-onnx-node.
const sherpa_onnx = require('sherpa-onnx-node');

// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const config = {
  'modelConfig': {
    'toneCtc': {
      'model': './sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx',
    },
    'tokens': './sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
  }
};

const waveFilename = './sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav';

const recognizer = new sherpa_onnx.OnlineRecognizer(config);
console.log('Started');
let start = Date.now();
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave(waveFilename);

// Pad the audio with silence: 0.3 s before and 0.6 s after
// (the same amounts the other T-one examples in this PR use).
const leftPadding = new Float32Array(wave.sampleRate * 0.3);
stream.acceptWaveform({samples: leftPadding, sampleRate: wave.sampleRate});

stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

const tailPadding = new Float32Array(wave.sampleRate * 0.6);
stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});

while (recognizer.isReady(stream)) {
  recognizer.decode(stream);
}

// Fix: `result` was assigned without a declaration, creating an implicit
// global (a ReferenceError in strict mode / ES modules).
const result = recognizer.getResult(stream);
let stop = Date.now();
console.log('Done');

const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));
console.log(waveFilename);
console.log('result\n', result);
... ...
... ... @@ -393,6 +393,18 @@ rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
node ./test-online-paraformer-microphone-mic.js
```
## ./test-online-t-one-ctc.js
[./test-online-t-one-ctc.js](./test-online-t-one-ctc.js) demonstrates
how to decode a file using a streaming T-one CTC model.
You can use the following command to run it:
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
node ./test-online-t-one-ctc.js
```
## ./test-online-paraformer.js
[./test-online-paraformer.js](./test-online-paraformer.js) demonstrates
... ...
// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
//
// This script shows how to decode a file with a streaming T-one CTC
// model using the sherpa-onnx package.
//
// Fix: removed `require('fs')`, `require('stream').Readable` and
// `require('wav')` — none of them was used, and `wav` is a third-party
// module that would otherwise have to be installed for nothing.
const sherpa_onnx = require('sherpa-onnx');

// Build an online recognizer configured for the streaming T-one CTC
// model. Download the model from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
function createOnlineRecognizer() {
  let toneCtc = {
    model: './sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx',
  };

  let onlineModelConfig = {
    toneCtc: toneCtc,
    tokens: './sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt',
    numThreads: 1,
    provider: 'cpu',
    debug: 1,
  };

  let recognizerConfig = {
    modelConfig: onlineModelConfig,
    decodingMethod: 'greedy_search',
    maxActivePaths: 4,
    enableEndpoint: 1,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
  };

  return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
}

const recognizer = createOnlineRecognizer();
const stream = recognizer.createStream();

const waveFilename = './sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);

// Pad the audio with silence: 0.3 s before and 0.6 s after
// (the same amounts the other T-one examples use).
const leftPadding = new Float32Array(wave.sampleRate * 0.3);
const tailPadding = new Float32Array(wave.sampleRate * 0.6);

stream.acceptWaveform(wave.sampleRate, leftPadding);
stream.acceptWaveform(wave.sampleRate, wave.samples);
stream.acceptWaveform(wave.sampleRate, tailPadding);

while (recognizer.isReady(stream)) {
  recognizer.decode(stream);
}

const text = recognizer.getResult(stream).text;
console.log(text);

// Release native resources.
stream.free();
recognizer.free();
... ...
... ... @@ -4,3 +4,4 @@ paraformer
zipformer_ctc
zipformer_ctc_hlg
nemo_transducer
t_one_ctc
... ...
#!/usr/bin/env bash
# Build the sherpa-onnx C API library (if needed), download the streaming
# T-one Russian CTC model (if needed), then compile and run the Pascal
# example with Free Pascal (fpc).

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Build the C API shared library only when it does not exist yet.
if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
mkdir -p ../../build
pushd ../../build
cmake \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_PYTHON=OFF \
-DSHERPA_ONNX_ENABLE_TESTS=OFF \
-DSHERPA_ONNX_ENABLE_CHECK=OFF \
-DBUILD_SHARED_LIBS=ON \
-DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
..
cmake --build . --target install --config Release
ls -lh lib
popd
fi

# Download and unpack the pre-trained model only when it is missing.
if [ ! -f ./sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
fi

# -dSHERPA_ONNX_USE_SHARED_LIBS: link against the shared C API library.
# -Fu: unit search path; -Fl: library search path.
fpc \
-dSHERPA_ONNX_USE_SHARED_LIBS \
-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
-Fl$SHERPA_ONNX_DIR/build/install/lib \
./t_one_ctc.pas

# Make sure the shared libraries can be found at runtime.
export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH

./t_one_ctc
... ...
{ Copyright (c) 2025 Xiaomi Corporation }
{
This file shows how to use a streaming T-one CTC model
to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program t_one_ctc;

{$mode objfpc}

uses
sherpa_onnx,
DateUtils,
SysUtils;

var
Config: TSherpaOnnxOnlineRecognizerConfig;
Recognizer: TSherpaOnnxOnlineRecognizer;
Stream: TSherpaOnnxOnlineStream;
RecognitionResult: TSherpaOnnxOnlineRecognizerResult;
Wave: TSherpaOnnxWave;
WaveFilename: AnsiString;

{Zero-valued samples fed before/after the real audio.}
LeftPaddings: array of Single;
TailPaddings: array of Single;

{For measuring decoding speed (real-time factor).}
Start: TDateTime;
Stop: TDateTime;
Elapsed: Single;
Duration: Single;
RealTimeFactor: Single;

begin
Initialize(Config);

{Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
to download model files used in this file.}
Config.ModelConfig.ToneCtc.Model := './sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Config.ModelConfig.NumThreads := 1;
Config.ModelConfig.Debug := False;

WaveFilename := './sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav';

Wave := SherpaOnnxReadWave(WaveFilename);

Recognizer := TSherpaOnnxOnlineRecognizer.Create(Config);
{Start timing after recognizer creation so only decoding is measured.}
Start := Now;

Stream := Recognizer.CreateStream();

SetLength(LeftPaddings, Round(Wave.SampleRate * 0.3)); {0.3 seconds of padding}
Stream.AcceptWaveform(LeftPaddings, Wave.SampleRate);

Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);

SetLength(TailPaddings, Round(Wave.SampleRate * 0.6)); {0.6 seconds of padding}
Stream.AcceptWaveform(TailPaddings, Wave.SampleRate);

{Signal that no more audio is coming so buffered frames get flushed.}
Stream.InputFinished();

{Decode while the stream has enough buffered frames.}
while Recognizer.IsReady(Stream) do
Recognizer.Decode(Stream);

RecognitionResult := Recognizer.GetResult(Stream);
Stop := Now;

Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
Duration := Length(Wave.Samples) / Wave.SampleRate;
RealTimeFactor := Elapsed / Duration;

WriteLn(RecognitionResult.ToString);
WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
WriteLn(Format('Elapsed %.3f s', [Elapsed]));
WriteLn(Format('Wave duration %.3f s', [Duration]));
WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));

{Free resources to avoid memory leak.
Note: You don't need to invoke them for this simple script.
However, you have to invoke them in your own large/complex project.
}
FreeAndNil(Stream);
FreeAndNil(Recognizer);
end.
... ...
... ... @@ -469,6 +469,21 @@ def get_models():
popd
""",
),
Model(
model_name="sherpa-onnx-streaming-t-one-russian-2025-09-08",
idx=27,
lang="ru",
short_name="t_one_ctc_2025_09_08",
cmd="""
pushd $model_name
rm -v *.wav
ls -lh
popd
""",
),
]
return models
... ...
... ... @@ -25,6 +25,7 @@ namespace SherpaOnnx
TokensBuf = "";
TokensBufSize = 0;
NemoCtc = new OnlineNemoCtcModelConfig();
ToneCtc = new OnlineToneCtcModelConfig();
}
public OnlineTransducerModelConfig Transducer;
... ... @@ -58,6 +59,8 @@ namespace SherpaOnnx
public int TokensBufSize;
public OnlineNemoCtcModelConfig NemoCtc;
public OnlineToneCtcModelConfig ToneCtc;
}
}
... ...
/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)

using System.Runtime.InteropServices;

namespace SherpaOnnx
{
/// <summary>
/// Configuration for a streaming T-one CTC model.
/// The sequential layout mirrors SherpaOnnxOnlineToneCtcModelConfig in
/// the C API; do not reorder or rename the fields.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct OnlineToneCtcModelConfig
{
public OnlineToneCtcModelConfig()
{
Model = "";
}

/// <summary>Path to model.onnx of the streaming T-one CTC model.</summary>
[MarshalAs(UnmanagedType.LPStr)]
public string Model;
}
}
... ...
../../../../go-api-examples/non-streaming-tts/run-kitten-en.sh
\ No newline at end of file
... ...
../../../../go-api-examples/streaming-decode-files/run-t-one-ctc.sh
\ No newline at end of file
... ...
... ... @@ -81,6 +81,10 @@ type OnlineNemoCtcModelConfig struct {
Model string // Path to the onnx model
}
type OnlineToneCtcModelConfig struct {
Model string // Path to the onnx model
}
// Configuration for online/streaming models
//
// Please refer to
... ... @@ -92,6 +96,7 @@ type OnlineModelConfig struct {
Paraformer OnlineParaformerModelConfig
Zipformer2Ctc OnlineZipformer2CtcModelConfig
NemoCtc OnlineNemoCtcModelConfig
ToneCtc OnlineToneCtcModelConfig
Tokens string // Path to tokens.txt
NumThreads int // Number of threads to use for neural network computation
Provider string // Optional. Valid values are: cpu, cuda, coreml
... ... @@ -205,6 +210,9 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer {
c.model_config.nemo_ctc.model = C.CString(config.ModelConfig.NemoCtc.Model)
defer C.free(unsafe.Pointer(c.model_config.nemo_ctc.model))
c.model_config.t_one_ctc.model = C.CString(config.ModelConfig.ToneCtc.Model)
defer C.free(unsafe.Pointer(c.model_config.t_one_ctc.model))
c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
defer C.free(unsafe.Pointer(c.model_config.tokens))
... ...
... ... @@ -100,6 +100,9 @@ static sherpa_onnx::OnlineRecognizerConfig GetOnlineRecognizerConfig(
recognizer_config.model_config.nemo_ctc.model =
SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, "");
recognizer_config.model_config.t_one_ctc.model =
SHERPA_ONNX_OR(config->model_config.t_one_ctc.model, "");
recognizer_config.model_config.num_threads =
SHERPA_ONNX_OR(config->model_config.num_threads, 1);
recognizer_config.model_config.provider_config.provider =
... ... @@ -691,8 +694,7 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult(
if (!result.durations.empty() && result.durations.size() == r->count) {
r->durations = new float[r->count];
std::copy(result.durations.begin(), result.durations.end(),
r->durations);
std::copy(result.durations.begin(), result.durations.end(), r->durations);
} else {
r->durations = nullptr;
}
... ...
... ... @@ -104,6 +104,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineNemoCtcModelConfig {
const char *model;
} SherpaOnnxOnlineNemoCtcModelConfig;
/// Configuration for a streaming T-one CTC model.
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineToneCtcModelConfig {
/// Path to model.onnx
const char *model;
} SherpaOnnxOnlineToneCtcModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineModelConfig {
SherpaOnnxOnlineTransducerModelConfig transducer;
SherpaOnnxOnlineParaformerModelConfig paraformer;
... ... @@ -125,6 +129,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineModelConfig {
/// byte size excluding the trailing '\0'
int32_t tokens_buf_size;
SherpaOnnxOnlineNemoCtcModelConfig nemo_ctc;
SherpaOnnxOnlineToneCtcModelConfig t_one_ctc;
} SherpaOnnxOnlineModelConfig;
/// It expects 16 kHz 16-bit single channel wave format.
... ...
... ... @@ -70,6 +70,7 @@ OnlineRecognizer OnlineRecognizer::Create(
config.model_config.zipformer2_ctc.model.c_str();
c.model_config.nemo_ctc.model = config.model_config.nemo_ctc.model.c_str();
c.model_config.t_one_ctc.model = config.model_config.t_one_ctc.model.c_str();
c.model_config.tokens = config.model_config.tokens.c_str();
c.model_config.num_threads = config.model_config.num_threads;
... ...
... ... @@ -36,11 +36,16 @@ struct OnlineNemoCtcModelConfig {
std::string model;
};
// Configuration for a streaming T-one CTC model.
struct OnlineToneCtcModelConfig {
// Path to model.onnx
std::string model;
};
struct OnlineModelConfig {
OnlineTransducerModelConfig transducer;
OnlineParaformerModelConfig paraformer;
OnlineZipformer2CtcModelConfig zipformer2_ctc;
OnlineNemoCtcModelConfig nemo_ctc;
OnlineToneCtcModelConfig t_one_ctc;
std::string tokens;
int32_t num_threads = 1;
std::string provider = "cpu";
... ...
... ... @@ -19,6 +19,7 @@ java_files += HomophoneReplacerConfig.java
java_files += OnlineLMConfig.java
java_files += OnlineParaformerModelConfig.java
java_files += OnlineZipformer2CtcModelConfig.java
java_files += OnlineToneCtcModelConfig.java
java_files += OnlineNeMoCtcModelConfig.java
java_files += OnlineTransducerModelConfig.java
java_files += OnlineModelConfig.java
... ...
... ... @@ -237,6 +237,7 @@ public class LibraryUtils {
dir.deleteOnExit(); // schedule the directory itself
}
static boolean isAndroid() {
String vmName = System.getProperty("java.vm.name", "").toLowerCase(Locale.ROOT);
String specVendor = System.getProperty("java.specification.vendor", "");
return vmName.contains("dalvik") || vmName.contains("art") ||
... ...
... ... @@ -8,6 +8,7 @@ public class OnlineModelConfig {
private final OnlineParaformerModelConfig paraformer;
private final OnlineZipformer2CtcModelConfig zipformer2Ctc;
private final OnlineNeMoCtcModelConfig neMoCtc;
private final OnlineToneCtcModelConfig toneCtc;
private final String tokens;
private final int numThreads;
private final boolean debug;
... ... @@ -21,6 +22,7 @@ public class OnlineModelConfig {
this.paraformer = builder.paraformer;
this.zipformer2Ctc = builder.zipformer2Ctc;
this.neMoCtc = builder.neMoCtc;
this.toneCtc = builder.toneCtc;
this.tokens = builder.tokens;
this.numThreads = builder.numThreads;
this.debug = builder.debug;
... ... @@ -50,6 +52,10 @@ public class OnlineModelConfig {
return neMoCtc;
}
public OnlineToneCtcModelConfig getToneCtc() {
return toneCtc;
}
public String getTokens() {
return tokens;
}
... ... @@ -83,6 +89,7 @@ public class OnlineModelConfig {
private OnlineTransducerModelConfig transducer = OnlineTransducerModelConfig.builder().build();
private OnlineZipformer2CtcModelConfig zipformer2Ctc = OnlineZipformer2CtcModelConfig.builder().build();
private OnlineNeMoCtcModelConfig neMoCtc = OnlineNeMoCtcModelConfig.builder().build();
private OnlineToneCtcModelConfig toneCtc = OnlineToneCtcModelConfig.builder().build();
private String tokens = "";
private int numThreads = 1;
private boolean debug = true;
... ... @@ -115,6 +122,11 @@ public class OnlineModelConfig {
return this;
}
public Builder setToneCtc(OnlineToneCtcModelConfig toneCtc) {
this.toneCtc = toneCtc;
return this;
}
public Builder setTokens(String tokens) {
this.tokens = tokens;
return this;
... ...
... ... @@ -28,5 +28,4 @@ public class OnlineNeMoCtcModelConfig {
return this;
}
}
}
}
\ No newline at end of file
... ...
package com.k2fsa.sherpa.onnx;

/**
 * Configuration for a streaming T-one CTC model.
 *
 * <p>Note: the private field {@code model} is read from native code via
 * JNI ({@code GetFieldID} looks it up by name), so it must not be renamed.
 */
public class OnlineToneCtcModelConfig {
private final String model;

private OnlineToneCtcModelConfig(Builder builder) {
this.model = builder.model;
}

public static Builder builder() {
return new Builder();
}

/** Returns the path to model.onnx of the streaming T-one CTC model. */
public String getModel() {
return model;
}

/** Builder for {@link OnlineToneCtcModelConfig}. */
public static class Builder {
// An empty string means "no T-one CTC model configured".
private String model = "";

public OnlineToneCtcModelConfig build() {
return new OnlineToneCtcModelConfig(this);
}

/** Sets the path to model.onnx. */
public Builder setModel(String model) {
this.model = model;
return this;
}
}
}
... ...
... ... @@ -82,6 +82,18 @@ OnlineModelConfig GetOnlineModelConfig(JNIEnv *env, jclass model_config_cls,
ans.nemo_ctc.model = p;
env->ReleaseStringUTFChars(s, p);
// streaming T-one CTC
fid = env->GetFieldID(model_config_cls, "toneCtc",
"Lcom/k2fsa/sherpa/onnx/OnlineToneCtcModelConfig;");
jobject t_one_ctc_config = env->GetObjectField(model_config, fid);
jclass t_one_ctc_config_cls = env->GetObjectClass(t_one_ctc_config);
fid = env->GetFieldID(t_one_ctc_config_cls, "model", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(t_one_ctc_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.t_one_ctc.model = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(model_config_cls, "tokens", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(model_config, fid);
p = env->GetStringUTFChars(s, nullptr);
... ...
... ... @@ -33,11 +33,16 @@ data class OnlineNeMoCtcModelConfig(
var model: String = "",
)
/**
 * Configuration for a streaming T-one CTC model.
 *
 * [model] is the path to model.onnx. Note: the property is read from
 * native code via JNI by its name, so it must not be renamed.
 */
data class OnlineToneCtcModelConfig(
var model: String = "",
)
data class OnlineModelConfig(
var transducer: OnlineTransducerModelConfig = OnlineTransducerModelConfig(),
var paraformer: OnlineParaformerModelConfig = OnlineParaformerModelConfig(),
var zipformer2Ctc: OnlineZipformer2CtcModelConfig = OnlineZipformer2CtcModelConfig(),
var neMoCtc: OnlineNeMoCtcModelConfig = OnlineNeMoCtcModelConfig(),
var toneCtc: OnlineToneCtcModelConfig = OnlineToneCtcModelConfig(),
var tokens: String = "",
var numThreads: Int = 1,
var debug: Boolean = false,
... ... @@ -518,6 +523,16 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
)
}
27 -> {
val modelDir = "sherpa-onnx-streaming-t-one-russian-2025-09-08"
return OnlineModelConfig(
toneCtc = OnlineToneCtcModelConfig(
model = "$modelDir/model.onnx",
),
tokens = "$modelDir/tokens.txt",
)
}
1000 -> {
val modelDir = "sherpa-onnx-rk3588-streaming-zipformer-bilingual-zh-en-2023-02-20"
return OnlineModelConfig(
... ...
... ... @@ -182,6 +182,11 @@ type
function ToString: AnsiString;
end;
TSherpaOnnxOnlineToneCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
TSherpaOnnxOnlineModelConfig = record
Transducer: TSherpaOnnxOnlineTransducerModelConfig;
Paraformer: TSherpaOnnxOnlineParaformerModelConfig;
... ... @@ -196,6 +201,7 @@ type
TokensBuf: AnsiString;
TokensBufSize: Integer;
NemoCtc: TSherpaOnnxOnlineNemoCtcModelConfig;
ToneCtc: TSherpaOnnxOnlineToneCtcModelConfig;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
end;
... ... @@ -714,6 +720,10 @@ type
Model: PAnsiChar;
end;
SherpaOnnxOnlineToneCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOnlineModelConfig= record
Transducer: SherpaOnnxOnlineTransducerModelConfig;
Paraformer: SherpaOnnxOnlineParaformerModelConfig;
... ... @@ -728,6 +738,7 @@ type
TokensBuf: PAnsiChar;
TokensBufSize: cint32;
NemoCtc: SherpaOnnxOnlineNemoCtcModelConfig;
ToneCtc: SherpaOnnxOnlineToneCtcModelConfig;
end;
SherpaOnnxFeatureConfig = record
SampleRate: cint32;
... ... @@ -1350,6 +1361,12 @@ begin
[Self.Model]);
end;
{Returns a human-readable representation of this config, for logging.}
function TSherpaOnnxOnlineToneCtcModelConfig.ToString: AnsiString;
begin
Result := Format('TSherpaOnnxOnlineToneCtcModelConfig(Model := %s)',
[Self.Model]);
end;
function TSherpaOnnxOnlineModelConfig.ToString: AnsiString;
begin
Result := Format('TSherpaOnnxOnlineModelConfig(Transducer := %s, ' +
... ... @@ -1362,12 +1379,13 @@ begin
'ModelType := %s, ' +
'ModelingUnit := %s, ' +
'BpeVocab := %s, ' +
'NemoCtc := %s)',
'NemoCtc := %s, ' +
'ToneCtc := %s)',
[Self.Transducer.ToString, Self.Paraformer.ToString,
Self.Zipformer2Ctc.ToString, Self.Tokens,
Self.NumThreads, Self.Provider, Self.Debug.ToString,
Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
Self.NemoCtc.ToString
Self.NemoCtc.ToString, Self.ToneCtc.ToString
]);
end;
... ... @@ -1467,6 +1485,7 @@ begin
C.ModelConfig.Zipformer2Ctc.Model := PAnsiChar(Config.ModelConfig.Zipformer2Ctc.Model);
C.ModelConfig.NemoCtc.Model := PAnsiChar(Config.ModelConfig.NemoCtc.Model);
C.ModelConfig.ToneCtc.Model := PAnsiChar(Config.ModelConfig.ToneCtc.Model);
C.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens);
C.ModelConfig.NumThreads := Config.ModelConfig.NumThreads;
... ...
... ... @@ -22,3 +22,4 @@ zipformer-ctc-asr
dolphin-ctc-asr
tts-kitten-en
compute-speaker-embeddings
decode-file-t-one-streaming
... ...
... ... @@ -76,6 +76,14 @@ func sherpaOnnxOnlineNemoCtcModelConfig(
)
}
/// Return an instance of SherpaOnnxOnlineToneCtcModelConfig.
///
/// - Parameter model: Path to model.onnx of the streaming T-one CTC model.
func sherpaOnnxOnlineToneCtcModelConfig(
model: String = ""
) -> SherpaOnnxOnlineToneCtcModelConfig {
return SherpaOnnxOnlineToneCtcModelConfig(
model: toCPointer(model)
)
}
/// Return an instance of SherpaOnnxOnlineModelConfig.
///
/// Please refer to
... ... @@ -101,7 +109,8 @@ func sherpaOnnxOnlineModelConfig(
bpeVocab: String = "",
tokensBuf: String = "",
tokensBufSize: Int = 0,
nemoCtc: SherpaOnnxOnlineNemoCtcModelConfig = sherpaOnnxOnlineNemoCtcModelConfig()
nemoCtc: SherpaOnnxOnlineNemoCtcModelConfig = sherpaOnnxOnlineNemoCtcModelConfig(),
toneCtc: SherpaOnnxOnlineToneCtcModelConfig = sherpaOnnxOnlineToneCtcModelConfig()
) -> SherpaOnnxOnlineModelConfig {
return SherpaOnnxOnlineModelConfig(
transducer: transducer,
... ... @@ -116,7 +125,8 @@ func sherpaOnnxOnlineModelConfig(
bpe_vocab: toCPointer(bpeVocab),
tokens_buf: toCPointer(tokensBuf),
tokens_buf_size: Int32(tokensBufSize),
nemo_ctc: nemoCtc
nemo_ctc: nemoCtc,
t_one_ctc: toneCtc
)
}
... ...
import AVFoundation
extension AudioBuffer {
/// Copy this buffer's contents into a new [Float] array.
func array() -> [Float] {
return Array(UnsafeBufferPointer(self))
}
}
extension AVAudioPCMBuffer {
/// Return the samples of the buffer's first audio buffer as [Float]
/// (the example asserts the file is mono before calling this).
func array() -> [Float] {
return self.audioBufferList.pointee.mBuffers.array()
}
}
/// Decode a wave file with a streaming T-one CTC model.
///
/// Download the model files from
/// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
func run() {
  let filePath = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/0.wav"
  let model =
    "./sherpa-onnx-streaming-t-one-russian-2025-09-08/model.onnx"
  let tokens = "./sherpa-onnx-streaming-t-one-russian-2025-09-08/tokens.txt"
  let toneCtcConfig = sherpaOnnxOnlineToneCtcModelConfig(
    model: model)

  let modelConfig = sherpaOnnxOnlineModelConfig(
    tokens: tokens,
    toneCtc: toneCtcConfig
  )

  let featConfig = sherpaOnnxFeatureConfig(
    sampleRate: 8000,
    featureDim: 80
  )
  var config = sherpaOnnxOnlineRecognizerConfig(
    featConfig: featConfig,  // not used
    modelConfig: modelConfig
  )

  let recognizer = SherpaOnnxRecognizer(config: &config)

  let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
  let audioFile = try! AVAudioFile(forReading: fileURL as URL)

  let audioFormat = audioFile.processingFormat
  // The example expects 8 kHz, mono, float32 input.
  assert(audioFormat.sampleRate == 8000)
  assert(audioFormat.channelCount == 1)
  assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  let audioFrameCount = UInt32(audioFile.length)
  let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)

  try! audioFile.read(into: audioFileBuffer!)
  let array: [Float]! = audioFileBuffer?.array()

  // Pad the audio with 0.3 s of silence in front and 0.6 s at the end.
  // Fix: the padding lengths were hard-coded as 2400/4800 samples, which
  // silently assumed 8 kHz; derive them from the actual sample rate, as
  // the other language bindings do.
  let sampleRate = Int(audioFormat.sampleRate)
  let leftPadding = [Float](
    repeating: 0.0, count: Int((0.3 * audioFormat.sampleRate).rounded()))
  recognizer.acceptWaveform(samples: leftPadding, sampleRate: sampleRate)

  recognizer.acceptWaveform(samples: array, sampleRate: sampleRate)

  let tailPadding = [Float](
    repeating: 0.0, count: Int((0.6 * audioFormat.sampleRate).rounded()))
  recognizer.acceptWaveform(samples: tailPadding, sampleRate: sampleRate)

  // No more audio is coming; let the recognizer flush buffered frames.
  recognizer.inputFinished()
  while recognizer.isReady() {
    recognizer.decode()
  }

  let result = recognizer.getResult()
  print("\nresult is:\n\(result.text)")
  // Fix: this line previously repeated the label "result is" although it
  // prints the timestamps.
  print("\ntimestamps are:\n\(result.timestamps)")
}
/// Program entry point: runs the T-one CTC decoding example.
@main
struct App {
static func main() {
run()
}
}
... ...
#!/usr/bin/env bash
# Download the streaming T-one Russian CTC model (if needed), compile the
# Swift example against the pre-built macOS libraries, and run it.

set -ex

# The Swift example links against libraries produced by this build script.
if [ ! -d ../build-swift-macos ]; then
echo "Please run ../build-swift-macos.sh first!"
exit 1
fi

# Download and unpack the pre-trained model only when it is missing.
if [ ! -d ./sherpa-onnx-streaming-t-one-russian-2025-09-08 ]; then
echo "Downloading the pre-trained model for testing."

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
tar xvf sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
rm sherpa-onnx-streaming-t-one-russian-2025-09-08.tar.bz2
fi

# Compile only when the binary does not exist yet.
if [ ! -e ./decode-file-t-one-streaming ]; then
# Note: We use -lc++ to link against libc++ instead of libstdc++
swiftc \
-lc++ \
-I ../build-swift-macos/install/include \
-import-objc-header ./SherpaOnnx-Bridging-Header.h \
./decode-file-t-one-streaming.swift ./SherpaOnnx.swift \
-L ../build-swift-macos/install/lib/ \
-l sherpa-onnx \
-l onnxruntime \
-o decode-file-t-one-streaming

strip decode-file-t-one-streaming
else
echo "./decode-file-t-one-streaming exists - skip building"
fi

# Make sure the shared libraries can be found at runtime.
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH

./decode-file-t-one-streaming
... ...
... ... @@ -31,6 +31,10 @@ function freeConfig(config, Module) {
freeConfig(config.nemoCtc, Module)
}
if ('toneCtc' in config) {
freeConfig(config.toneCtc, Module)
}
if ('whisper' in config) {
freeConfig(config.whisper, Module)
}
... ... @@ -173,6 +177,22 @@ function initSherpaOnnxOnlineNemoCtcModelConfig(config, Module) {
}
}
// Serialize an OnlineToneCtcModelConfig into WASM heap memory.
//
// Layout of the C struct: a single char* pointing at the model path.
// Returns the string buffer, the struct pointer, and the struct size so
// the caller can free everything later.
function initSherpaOnnxOnlineToneCtcModelConfig(config, Module) {
  const modelPath = config.model || '';
  const pathBytes = Module.lengthBytesUTF8(modelPath) + 1;  // +1 for '\0'
  const pathBuf = Module._malloc(pathBytes);

  const structLen = 1 * 4;  // one 32-bit pointer
  const structPtr = Module._malloc(structLen);

  Module.stringToUTF8(modelPath, pathBuf, pathBytes);
  Module.setValue(structPtr, pathBuf, 'i8*');

  return {buffer: pathBuf, ptr: structPtr, len: structLen};
}
function initSherpaOnnxOnlineModelConfig(config, Module) {
if (!('transducer' in config)) {
config.transducer = {
... ... @@ -201,6 +221,12 @@ function initSherpaOnnxOnlineModelConfig(config, Module) {
};
}
if (!('toneCtc' in config)) {
config.toneCtc = {
model: '',
};
}
if (!('tokensBuf' in config)) {
config.tokensBuf = '';
}
... ... @@ -221,8 +247,11 @@ function initSherpaOnnxOnlineModelConfig(config, Module) {
const nemoCtc =
initSherpaOnnxOnlineNemoCtcModelConfig(config.nemoCtc, Module);
const len =
transducer.len + paraformer.len + zipformer2Ctc.len + 9 * 4 + nemoCtc.len;
const toneCtc =
initSherpaOnnxOnlineToneCtcModelConfig(config.toneCtc, Module);
const len = transducer.len + paraformer.len + zipformer2Ctc.len + 9 * 4 +
nemoCtc.len + toneCtc.len;
const ptr = Module._malloc(len);
... ... @@ -308,9 +337,13 @@ function initSherpaOnnxOnlineModelConfig(config, Module) {
Module._CopyHeap(nemoCtc.ptr, nemoCtc.len, ptr + offset);
offset += nemoCtc.len;
Module._CopyHeap(toneCtc.ptr, toneCtc.len, ptr + offset);
offset += toneCtc.len;
return {
buffer: buffer, ptr: ptr, len: len, transducer: transducer,
paraformer: paraformer, zipformer2Ctc: zipformer2Ctc, nemoCtc: nemoCtc
paraformer: paraformer, zipformer2Ctc: zipformer2Ctc, nemoCtc: nemoCtc,
toneCtc: toneCtc,
}
}
... ... @@ -519,6 +552,10 @@ function createOnlineRecognizer(Module, myConfig) {
model: '',
};
const onlineToneCtcModelConfig = {
model: '',
};
let type = 0;
switch (type) {
... ... @@ -541,6 +578,10 @@ function createOnlineRecognizer(Module, myConfig) {
// nemoCtc
onlineNemoCtcModelConfig.model = './nemo-ctc.onnx';
break;
case 4:
// toneCtc
onlineToneCtcModelConfig.model = './tone-ctc.onnx';
break;
}
... ... @@ -549,6 +590,7 @@ function createOnlineRecognizer(Module, myConfig) {
paraformer: onlineParaformerModelConfig,
zipformer2Ctc: onlineZipformer2CtcModelConfig,
nemoCtc: onlineNemoCtcModelConfig,
toneCtc: onlineToneCtcModelConfig,
tokens: './tokens.txt',
numThreads: 1,
provider: 'cpu',
... ... @@ -559,8 +601,8 @@ function createOnlineRecognizer(Module, myConfig) {
};
const featureConfig = {
sampleRate: 16000,
featureDim: 80,
sampleRate: 16000, // it is ignored when toneCtc is used
featureDim: 80, // it is ignored when toneCtc is used
};
let recognizerConfig = {
... ...
... ... @@ -21,7 +21,8 @@ static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
sizeof(SherpaOnnxOnlineTransducerModelConfig) +
sizeof(SherpaOnnxOnlineParaformerModelConfig) +
sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 9 * 4 +
sizeof(SherpaOnnxOnlineNemoCtcModelConfig),
sizeof(SherpaOnnxOnlineNemoCtcModelConfig) +
sizeof(SherpaOnnxOnlineToneCtcModelConfig),
"");
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
static_assert(sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) == 2 * 4, "");
... ... @@ -39,6 +40,7 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
auto paraformer_model_config = &model_config->paraformer;
auto ctc_model_config = &model_config->zipformer2_ctc;
auto nemo_ctc = &model_config->nemo_ctc;
auto t_one_ctc = &model_config->t_one_ctc;
fprintf(stdout, "----------online transducer model config----------\n");
fprintf(stdout, "encoder: %s\n", transducer_model_config->encoder);
... ... @@ -55,6 +57,9 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) {
fprintf(stdout, "----------online nemo ctc model config----------\n");
fprintf(stdout, "model: %s\n", nemo_ctc->model);
fprintf(stdout, "----------online t-one ctc model config----------\n");
fprintf(stdout, "model: %s\n", t_one_ctc->model);
fprintf(stdout, "tokens: %s\n", model_config->tokens);
fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
fprintf(stdout, "provider: %s\n", model_config->provider);
... ...
... ... @@ -75,9 +75,10 @@ function initModelConfig(config, Module) {
const paraformer_len = 2 * 4
const zipfomer2_ctc_len = 1 * 4
const nemo_ctc_len = 1 * 4
const t_one_ctc_len = 1 * 4
const len = transducer.len + paraformer_len + zipfomer2_ctc_len + 9 * 4 +
nemo_ctc_len;
nemo_ctc_len + t_one_ctc_len;
const ptr = Module._malloc(len);
Module.HEAPU8.fill(0, ptr, ptr + len);
... ... @@ -152,6 +153,7 @@ function initModelConfig(config, Module) {
Module.setValue(ptr + offset, config.tokensBufSize || 0, 'i32');
offset += 4;
// skip nemo_ctc and t_one_ctc
return {
buffer: buffer, ptr: ptr, len: len, transducer: transducer
... ...
... ... @@ -20,7 +20,8 @@ static_assert(sizeof(SherpaOnnxOnlineModelConfig) ==
sizeof(SherpaOnnxOnlineTransducerModelConfig) +
sizeof(SherpaOnnxOnlineParaformerModelConfig) +
sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 9 * 4 +
sizeof(SherpaOnnxOnlineNemoCtcModelConfig),
sizeof(SherpaOnnxOnlineNemoCtcModelConfig) +
sizeof(SherpaOnnxOnlineToneCtcModelConfig),
"");
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
static_assert(sizeof(SherpaOnnxKeywordSpotterConfig) ==
... ...