Fangjun Kuang
Committed by GitHub

Add Dart API for Dolphin CTC models (#2095)

@@ -61,6 +61,11 @@ echo '----------ced----------' @@ -61,6 +61,11 @@ echo '----------ced----------'
61 popd 61 popd
62 62
63 pushd vad-with-non-streaming-asr 63 pushd vad-with-non-streaming-asr
  64 +
  65 +echo '----------Dolphin CTC----------'
  66 +./run-dolphin-ctc.sh
  67 +rm -rf sherpa-onnx-*
  68 +
64 echo '----------TeleSpeech CTC----------' 69 echo '----------TeleSpeech CTC----------'
65 ./run-telespeech-ctc.sh 70 ./run-telespeech-ctc.sh
66 rm -rf sherpa-onnx-* 71 rm -rf sherpa-onnx-*
@@ -110,6 +115,10 @@ echo '----------NeMo transducer----------' @@ -110,6 +115,10 @@ echo '----------NeMo transducer----------'
110 ./run-nemo-transducer.sh 115 ./run-nemo-transducer.sh
111 rm -rf sherpa-onnx-* 116 rm -rf sherpa-onnx-*
112 117
  118 +echo '----------Dolphin CTC----------'
  119 +./run-dolphin-ctc.sh
  120 +rm -rf sherpa-onnx-*
  121 +
113 echo '----------NeMo CTC----------' 122 echo '----------NeMo CTC----------'
114 ./run-nemo-ctc.sh 123 ./run-nemo-ctc.sh
115 rm -rf sherpa-onnx-* 124 rm -rf sherpa-onnx-*
@@ -4,6 +4,7 @@ This folder contains examples for non-streaming ASR with Dart API. @@ -4,6 +4,7 @@ This folder contains examples for non-streaming ASR with Dart API.
4 4
5 | File | Description| 5 | File | Description|
6 |------|------------| 6 |------|------------|
  7 +|[./bin/dolphin-ctc.dart](./bin/dolphin-ctc.dart)| Use a [Dolphin](https://github.com/DataoceanAI/Dolphin) Ctc model for speech recognition. See [./run-dolphin-ctc.sh](./run-dolphin-ctc.sh)|
7 |[./bin/nemo-ctc.dart](./bin/nemo-ctc.dart)| Use a NeMo Ctc model for speech recognition. See [./run-nemo-ctc.sh](./run-nemo-ctc.sh)| 8 |[./bin/nemo-ctc.dart](./bin/nemo-ctc.dart)| Use a NeMo Ctc model for speech recognition. See [./run-nemo-ctc.sh](./run-nemo-ctc.sh)|
8 |[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)| 9 |[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)|
9 |[./bin/paraformer.dart](./bin/paraformer.dart)|Use a paraformer model for speech recognition. See [./run-paraformer.sh](./run-paraformer.sh)| 10 |[./bin/paraformer.dart](./bin/paraformer.dart)|Use a paraformer model for speech recognition. See [./run-paraformer.sh](./run-paraformer.sh)|
  1 +// Copyright (c) 2025 Xiaomi Corporation
  2 +import 'dart:io';
  3 +
  4 +import 'package:args/args.dart';
  5 +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
  6 +
  7 +import './init.dart';
  8 +
/// Recognizes a wave file with a Dolphin CTC model using the sherpa-onnx
/// offline (non-streaming) API.
///
/// Required options: --model, --tokens, --input-wav. Prints the usage and
/// exits with status 1 when any of them is missing.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('model', help: 'Path to the Dolphin CTC model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final opts = parser.parse(arguments);
  final hasAllOptions =
      ['model', 'tokens', 'input-wav'].every((name) => opts[name] != null);
  if (!hasAllOptions) {
    print(parser.usage);
    exit(1);
  }

  // Build the recognizer directly from the parsed options.
  final recognizer = sherpa_onnx.OfflineRecognizer(
    sherpa_onnx.OfflineRecognizerConfig(
      model: sherpa_onnx.OfflineModelConfig(
        dolphin: sherpa_onnx.OfflineDolphinModelConfig(
            model: opts['model'] as String),
        tokens: opts['tokens'] as String,
        debug: true,
        numThreads: 1,
      ),
    ),
  );

  final wave = sherpa_onnx.readWave(opts['input-wav'] as String);

  // Decode the whole file in a single stream and print the transcript.
  final stream = recognizer.createStream();
  stream.acceptWaveform(samples: wave.samples, sampleRate: wave.sampleRate);
  recognizer.decode(stream);
  print(recognizer.getResult(stream).text);

  stream.free();
  recognizer.free();
}
#!/usr/bin/env bash
# Downloads a Dolphin CTC model (if needed) and runs the Dart example on a
# bundled test wave.

set -ex

dart pub get

model_dir=sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02

# Fetch and unpack the model archive only when the model file is absent.
if [ ! -f ./$model_dir/model.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2
  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
  ls -lh $model_dir
fi

dart run \
  ./bin/dolphin-ctc.dart \
  --model ./$model_dir/model.int8.onnx \
  --tokens ./$model_dir/tokens.txt \
  --input-wav ./$model_dir/test_wavs/0.wav
  1 +// Copyright (c) 2024 Xiaomi Corporation
  2 +import 'dart:io';
  3 +import 'dart:typed_data';
  4 +
  5 +import 'package:args/args.dart';
  6 +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
  7 +
  8 +import './init.dart';
  9 +
/// Runs VAD + Dolphin CTC speech recognition over a wave file.
///
/// Required options: --silero-vad, --model, --tokens, --input-wav. The wave
/// must be 16 kHz. Each detected speech segment is decoded and printed as
/// "<start> -- <end> : <text>".
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('model', help: 'Path to the Dolphin CTC model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // create VAD
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
    maxSpeechDuration: 5.0,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // create offline recognizer
  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final dolphin = sherpa_onnx.OfflineDolphinModelConfig(model: model);

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    dolphin: dolphin,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // Decodes every queued speech segment in the VAD and prints
  // "<start> -- <end> : <text>" for each. Shared by the windowed feeding
  // loop and the post-flush drain below, which previously duplicated this
  // logic verbatim.
  void drainVad() {
    while (!vad.isEmpty()) {
      final samples = vad.front().samples;
      final startTime = vad.front().start.toDouble() / waveData.sampleRate;
      final endTime =
          startTime + samples.length.toDouble() / waveData.sampleRate;

      final stream = recognizer.createStream();
      stream.acceptWaveform(
          samples: samples, sampleRate: waveData.sampleRate);
      recognizer.decode(stream);

      final result = recognizer.getResult(stream);
      stream.free();
      print(
          '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');

      vad.pop();
    }
  }

  int numSamples = waveData.samples.length;
  int numIter = numSamples ~/ vadConfig.sileroVad.windowSize;

  // Feed the wave to the VAD one window at a time; decode segments as they
  // become available so memory use stays bounded.
  for (int i = 0; i != numIter; ++i) {
    int start = i * vadConfig.sileroVad.windowSize;
    vad.acceptWaveform(Float32List.sublistView(
        waveData.samples, start, start + vadConfig.sileroVad.windowSize));
    drainVad();
  }

  // Flush the trailing, partially filled window and decode what remains.
  vad.flush();
  drainVad();

  vad.free();

  recognizer.free();
}
#!/usr/bin/env bash
# Downloads a Dolphin CTC model, a test wave, and silero VAD (if needed),
# then runs the VAD + non-streaming ASR Dart example.

set -ex

dart pub get

model_dir=sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02

# Fetch and unpack the model archive only when the model file is absent.
if [ ! -f ./$model_dir/model.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$model_dir.tar.bz2
  tar xvf $model_dir.tar.bz2
  rm $model_dir.tar.bz2
  ls -lh $model_dir
fi

# Test wave for the VAD example.
if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

# Silero voice-activity-detection model.
if [ ! -f ./silero_vad.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

dart run \
  ./bin/dolphin-ctc.dart \
  --silero-vad ./silero_vad.onnx \
  --model ./$model_dir/model.int8.onnx \
  --tokens ./$model_dir/tokens.txt \
  --input-wav ./lei-jun-test.wav
@@ -82,6 +82,27 @@ class OfflineNemoEncDecCtcModelConfig { @@ -82,6 +82,27 @@ class OfflineNemoEncDecCtcModelConfig {
82 final String model; 82 final String model;
83 } 83 }
84 84
/// Configuration for an offline (non-streaming) Dolphin CTC model.
///
/// See https://github.com/DataoceanAI/Dolphin for the upstream project.
class OfflineDolphinModelConfig {
  const OfflineDolphinModelConfig({this.model = ''});

  /// Builds a config from a JSON map; a missing 'model' key maps to ''.
  factory OfflineDolphinModelConfig.fromJson(Map<String, dynamic> json) =>
      OfflineDolphinModelConfig(model: json['model'] as String? ?? '');

  @override
  String toString() => 'OfflineDolphinModelConfig(model: $model)';

  Map<String, dynamic> toJson() => {'model': model};

  /// Path to the Dolphin CTC onnx model file.
  final String model;
}
  105 +
85 class OfflineWhisperModelConfig { 106 class OfflineWhisperModelConfig {
86 const OfflineWhisperModelConfig( 107 const OfflineWhisperModelConfig(
87 {this.encoder = '', 108 {this.encoder = '',
@@ -265,6 +286,7 @@ class OfflineModelConfig { @@ -265,6 +286,7 @@ class OfflineModelConfig {
265 this.senseVoice = const OfflineSenseVoiceModelConfig(), 286 this.senseVoice = const OfflineSenseVoiceModelConfig(),
266 this.moonshine = const OfflineMoonshineModelConfig(), 287 this.moonshine = const OfflineMoonshineModelConfig(),
267 this.fireRedAsr = const OfflineFireRedAsrModelConfig(), 288 this.fireRedAsr = const OfflineFireRedAsrModelConfig(),
  289 + this.dolphin = const OfflineDolphinModelConfig(),
268 required this.tokens, 290 required this.tokens,
269 this.numThreads = 1, 291 this.numThreads = 1,
270 this.debug = true, 292 this.debug = true,
@@ -309,6 +331,10 @@ class OfflineModelConfig { @@ -309,6 +331,10 @@ class OfflineModelConfig {
309 ? OfflineFireRedAsrModelConfig.fromJson( 331 ? OfflineFireRedAsrModelConfig.fromJson(
310 json['fireRedAsr'] as Map<String, dynamic>) 332 json['fireRedAsr'] as Map<String, dynamic>)
311 : const OfflineFireRedAsrModelConfig(), 333 : const OfflineFireRedAsrModelConfig(),
  334 + dolphin: json['dolphin'] != null
  335 + ? OfflineDolphinModelConfig.fromJson(
  336 + json['dolphin'] as Map<String, dynamic>)
  337 + : const OfflineDolphinModelConfig(),
312 tokens: json['tokens'] as String, 338 tokens: json['tokens'] as String,
313 numThreads: json['numThreads'] as int? ?? 1, 339 numThreads: json['numThreads'] as int? ?? 1,
314 debug: json['debug'] as bool? ?? true, 340 debug: json['debug'] as bool? ?? true,
@@ -322,7 +348,7 @@ class OfflineModelConfig { @@ -322,7 +348,7 @@ class OfflineModelConfig {
322 348
323 @override 349 @override
324 String toString() { 350 String toString() {
325 - return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; 351 + return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
326 } 352 }
327 353
328 Map<String, dynamic> toJson() => { 354 Map<String, dynamic> toJson() => {
@@ -334,6 +360,7 @@ class OfflineModelConfig { @@ -334,6 +360,7 @@ class OfflineModelConfig {
334 'senseVoice': senseVoice.toJson(), 360 'senseVoice': senseVoice.toJson(),
335 'moonshine': moonshine.toJson(), 361 'moonshine': moonshine.toJson(),
336 'fireRedAsr': fireRedAsr.toJson(), 362 'fireRedAsr': fireRedAsr.toJson(),
  363 + 'dolphin': dolphin.toJson(),
337 'tokens': tokens, 364 'tokens': tokens,
338 'numThreads': numThreads, 365 'numThreads': numThreads,
339 'debug': debug, 366 'debug': debug,
@@ -352,6 +379,7 @@ class OfflineModelConfig { @@ -352,6 +379,7 @@ class OfflineModelConfig {
352 final OfflineSenseVoiceModelConfig senseVoice; 379 final OfflineSenseVoiceModelConfig senseVoice;
353 final OfflineMoonshineModelConfig moonshine; 380 final OfflineMoonshineModelConfig moonshine;
354 final OfflineFireRedAsrModelConfig fireRedAsr; 381 final OfflineFireRedAsrModelConfig fireRedAsr;
  382 + final OfflineDolphinModelConfig dolphin;
355 383
356 final String tokens; 384 final String tokens;
357 final int numThreads; 385 final int numThreads;
@@ -544,6 +572,8 @@ class OfflineRecognizer { @@ -544,6 +572,8 @@ class OfflineRecognizer {
544 c.ref.model.fireRedAsr.decoder = 572 c.ref.model.fireRedAsr.decoder =
545 config.model.fireRedAsr.decoder.toNativeUtf8(); 573 config.model.fireRedAsr.decoder.toNativeUtf8();
546 574
  575 + c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8();
  576 +
547 c.ref.model.tokens = config.model.tokens.toNativeUtf8(); 577 c.ref.model.tokens = config.model.tokens.toNativeUtf8();
548 578
549 c.ref.model.numThreads = config.model.numThreads; 579 c.ref.model.numThreads = config.model.numThreads;
@@ -581,6 +611,7 @@ class OfflineRecognizer { @@ -581,6 +611,7 @@ class OfflineRecognizer {
581 calloc.free(c.ref.model.modelType); 611 calloc.free(c.ref.model.modelType);
582 calloc.free(c.ref.model.provider); 612 calloc.free(c.ref.model.provider);
583 calloc.free(c.ref.model.tokens); 613 calloc.free(c.ref.model.tokens);
  614 + calloc.free(c.ref.model.dolphin.model);
584 calloc.free(c.ref.model.fireRedAsr.decoder); 615 calloc.free(c.ref.model.fireRedAsr.decoder);
585 calloc.free(c.ref.model.fireRedAsr.encoder); 616 calloc.free(c.ref.model.fireRedAsr.encoder);
586 calloc.free(c.ref.model.moonshine.cachedDecoder); 617 calloc.free(c.ref.model.moonshine.cachedDecoder);
@@ -261,6 +261,10 @@ final class SherpaOnnxOfflineNemoEncDecCtcModelConfig extends Struct { @@ -261,6 +261,10 @@ final class SherpaOnnxOfflineNemoEncDecCtcModelConfig extends Struct {
261 external Pointer<Utf8> model; 261 external Pointer<Utf8> model;
262 } 262 }
263 263
// FFI mirror of the sherpa-onnx C API's offline Dolphin model config
// struct; field order and types must match the native layout exactly.
final class SherpaOnnxOfflineDolphinModelConfig extends Struct {
  // Path to the Dolphin CTC onnx model file (NUL-terminated UTF-8 string
  // allocated on the native side; freed by the caller after use).
  external Pointer<Utf8> model;
}
  267 +
264 final class SherpaOnnxOfflineWhisperModelConfig extends Struct { 268 final class SherpaOnnxOfflineWhisperModelConfig extends Struct {
265 external Pointer<Utf8> encoder; 269 external Pointer<Utf8> encoder;
266 external Pointer<Utf8> decoder; 270 external Pointer<Utf8> decoder;
@@ -327,6 +331,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct { @@ -327,6 +331,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct {
327 external SherpaOnnxOfflineSenseVoiceModelConfig senseVoice; 331 external SherpaOnnxOfflineSenseVoiceModelConfig senseVoice;
328 external SherpaOnnxOfflineMoonshineModelConfig moonshine; 332 external SherpaOnnxOfflineMoonshineModelConfig moonshine;
329 external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; 333 external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr;
  334 + external SherpaOnnxOfflineDolphinModelConfig dolphin;
330 } 335 }
331 336
332 final class SherpaOnnxOfflineRecognizerConfig extends Struct { 337 final class SherpaOnnxOfflineRecognizerConfig extends Struct {