Kirill Bukaev
Committed by GitHub

Add Dart API for spoken language identification (#2596)

# Introduction
This example shows how to use the Dart API from sherpa-onnx for spoken language identification.
| File | Description|
|------|------------|
|[./bin/spoken_language_identification.dart](./bin/spoken_language_identification.dart)| Use a whisper model for spoken language identification. See also [./run-whisper.sh](./run-whisper.sh)|
... ...
include: package:lints/recommended.yaml
analyzer:
language:
strict-casts: true
strict-inference: true
strict-raw-types: true
linter:
rules:
- always_use_package_imports
- avoid_dynamic_calls
- cancel_subscriptions
- close_sinks
- unawaited_futures
- use_super_parameters
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:isolate';
import 'package:path/path.dart' as p;
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
/// Locates and loads the platform-specific sherpa-onnx native library.
///
/// Resolves the `sherpa_onnx_<platform>` package directory via the isolate's
/// package resolver and points the FFI bindings at the shipped shared
/// library. Exits the process when the package cannot be resolved.
///
/// Throws [UnsupportedError] on platforms other than macOS, Linux and
/// Windows.
Future<void> initSherpaOnnx() async {
  // Map the current OS to the name of the platform-specific package.
  final String platform;
  if (Platform.isMacOS) {
    platform = 'macos';
  } else if (Platform.isLinux) {
    platform = 'linux';
  } else if (Platform.isWindows) {
    platform = 'windows';
  } else {
    throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}');
  }

  // The file name is irrelevant; we only need the resolved package root.
  final uri = await Isolate.resolvePackageUri(
      Uri.parse('package:sherpa_onnx_$platform/any_path_is_ok_here.dart'));
  if (uri == null) {
    print('File not found');
    exit(1);
  }

  // Base directory holding the prebuilt shared libraries.
  var libPath = p.join(p.dirname(p.fromUri(uri)), '..', platform);

  if (platform == 'linux') {
    // On Linux the libraries are split per-architecture; Platform.version
    // embeds the host architecture string (e.g. "... on linux_arm64").
    final version = Platform.version;
    final arch =
        version.contains('arm64') || version.contains('aarch64')
            ? 'aarch64'
            : 'x64';
    libPath = p.join(libPath, arch);
  }

  sherpa_onnx.initBindings(libPath);
}
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';
/// Identifies the spoken language of a WAV file using a whisper model.
///
/// Usage:
///   dart run ./bin/spoken_language_identification.dart \
///     --encoder <encoder.onnx> --decoder <decoder.onnx> --wav <file.wav>
void main(List<String> arguments) async {
  // Load the platform-specific native library before touching any bindings.
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('encoder', help: 'Path to the whisper encoder model')
    ..addOption('decoder', help: 'Path to the whisper decoder model')
    ..addOption('tail-paddings',
        help: 'Tail paddings for the whisper model', defaultsTo: '0')
    ..addOption('wav', help: 'Path to test.wav for language identification')
    ..addFlag('help',
        abbr: 'h', help: 'Show this help message', negatable: false);

  // parse() throws FormatException on unknown or malformed options; show
  // the usage message instead of an uncaught-exception stack trace.
  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    print(e.message);
    print(parser.usage);
    exit(1);
  }

  if (res['help'] as bool) {
    print(parser.usage);
    exit(0);
  }

  if (res['encoder'] == null || res['decoder'] == null || res['wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  final encoder = res['encoder'] as String;
  final decoder = res['decoder'] as String;
  // Falls back to 0 when the supplied value is not a valid integer.
  final tailPaddings = int.tryParse(res['tail-paddings'] as String) ?? 0;
  final wav = res['wav'] as String;

  final whisperConfig = sherpa_onnx.SpokenLanguageIdentificationWhisperConfig(
    encoder: encoder,
    decoder: decoder,
    tailPaddings: tailPaddings,
  );

  final config = sherpa_onnx.SpokenLanguageIdentificationConfig(
    whisper: whisperConfig,
    numThreads: 1,
    debug: true,
    provider: 'cpu',
  );

  final slid = sherpa_onnx.SpokenLanguageIdentification(config);

  final waveData = sherpa_onnx.readWave(wav);
  final stream = slid.createStream();
  stream.acceptWaveform(
      samples: waveData.samples, sampleRate: waveData.sampleRate);

  final result = slid.compute(stream);
  print('File: $wav');
  print('Detected language: ${result.lang}');

  // Release the native stream and identifier handles.
  stream.free();
  slid.free();
}
... ...
name: spoken_language_identification
description: >
This example demonstrates how to use the Dart API for spoken language identification.
version: 1.0.0
environment:
sdk: ">=3.0.0 <4.0.0"
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.12.13
# sherpa_onnx:
# path: ../../flutter/sherpa_onnx
path: ^1.9.0
args: ^2.5.0
dev_dependencies:
lints: ^3.0.0
... ...
#!/usr/bin/env bash
# Downloads a whisper tiny model plus a few test WAV files and runs the
# Dart spoken language identification example on each file.
set -ex

dart pub get

# Download and unpack the whisper tiny model if it is not already present.
if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
  rm sherpa-onnx-whisper-tiny.tar.bz2
fi

# Download test WAV files
# Uncomment entries below to test more languages.
waves=(
  # ar-arabic.wav
  # bg-bulgarian.wav
  # cs-czech.wav
  # da-danish.wav
  # de-german.wav
  # el-greek.wav
  en-english.wav
  es-spanish.wav
  # fa-persian.wav
  # fi-finnish.wav
  # fr-french.wav
  # hi-hindi.wav
  # hr-croatian.wav
  # id-indonesian.wav
  # it-italian.wav
  # ja-japanese.wav
  # ko-korean.wav
  # nl-dutch.wav
  # no-norwegian.wav
  # pl-polish.wav
  # pt-portuguese.wav
  # ro-romanian.wav
  ru-russian.wav
  # sk-slovak.wav
  # sv-swedish.wav
  # ta-tamil.wav
  # tl-tagalog.wav
  # tr-turkish.wav
  # uk-ukrainian.wav
  zh-chinese.wav
)

# Quote the array expansion so file names are never word-split or globbed.
for wav in "${waves[@]}"; do
  if [ ! -f "./$wav" ]; then
    echo "Downloading $wav"
    curl -SL -O "https://hf-mirror.com/spaces/k2-fsa/spoken-language-identification/resolve/main/test_wavs/$wav"
  fi

  echo "Testing $wav"
  dart run \
    ./bin/spoken_language_identification.dart \
    --encoder ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx \
    --decoder ./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx \
    --wav "./$wav"

  echo "----------------------------------------"
done
... ...
... ... @@ -15,6 +15,7 @@ export 'src/online_punctuation.dart';
export 'src/online_recognizer.dart';
export 'src/online_stream.dart';
export 'src/speaker_identification.dart';
export 'src/spoken_language_identification.dart';
export 'src/tts.dart';
export 'src/vad.dart';
export 'src/version.dart';
... ...
... ... @@ -626,6 +626,32 @@ final class SherpaOnnxOfflineSpeakerDiarization extends Opaque {}
final class SherpaOnnxOfflineSpeakerDiarizationResult extends Opaque {}
/// FFI mirror of the C struct
/// `SherpaOnnxSpokenLanguageIdentificationWhisperConfig`.
///
/// Field order and widths must match the native header exactly.
final class SherpaOnnxSpokenLanguageIdentificationWhisperConfig extends Struct {
  // Path to the whisper encoder model (NUL-terminated C string).
  external Pointer<Utf8> encoder;

  // Path to the whisper decoder model (NUL-terminated C string).
  external Pointer<Utf8> decoder;

  // Tail paddings for the whisper model (see the example's CLI help).
  @Int32()
  external int tailPaddings;
}
/// FFI mirror of the C struct `SherpaOnnxSpokenLanguageIdentificationConfig`.
final class SherpaOnnxSpokenLanguageIdentificationConfig extends Struct {
  // Whisper model files; embedded by value, not by pointer.
  external SherpaOnnxSpokenLanguageIdentificationWhisperConfig whisper;

  @Int32()
  external int numThreads;

  // C-style boolean: the Dart wrapper writes 1 (true) or 0 (false).
  @Int32()
  external int debug;

  // Execution provider name, e.g. "cpu" (NUL-terminated C string).
  external Pointer<Utf8> provider;
}
/// FFI mirror of the native result struct; holds the detected language
/// as a C string owned by the native side.
final class SherpaOnnxSpokenLanguageIdentificationResult extends Struct {
  external Pointer<Utf8> lang;
}

/// Opaque handle to the native spoken language identification object.
final class SherpaOnnxSpokenLanguageIdentification extends Opaque {}

/// Opaque handle to the native offline speech denoiser object.
final class SherpaOnnxOfflineSpeechDenoiser extends Opaque {}
typedef SherpaOnnxCreateOfflineSpeechDenoiserNative
... ... @@ -661,6 +687,40 @@ typedef SherpaOnnxDestroyDenoisedAudioNative = Void Function(
typedef SherpaOnnxDestroyDenoisedAudio = void Function(
Pointer<SherpaOnnxDenoisedAudio>);
// C function signatures for spoken language identification.
//
// Each native symbol has a `...Native` typedef describing the C signature
// (used as the lookup type) and a Dart-side typedef used with asFunction().

// Creates the identifier from a config; returns a handle (or nullptr).
typedef SherpaOnnxCreateSpokenLanguageIdentificationNative
    = Pointer<SherpaOnnxSpokenLanguageIdentification> Function(
        Pointer<SherpaOnnxSpokenLanguageIdentificationConfig>);

typedef SherpaOnnxCreateSpokenLanguageIdentification
    = SherpaOnnxCreateSpokenLanguageIdentificationNative;

// Destroys a handle previously returned by the create function.
typedef SherpaOnnxDestroySpokenLanguageIdentificationNative = Void Function(
    Pointer<SherpaOnnxSpokenLanguageIdentification>);

typedef SherpaOnnxDestroySpokenLanguageIdentification = void Function(
    Pointer<SherpaOnnxSpokenLanguageIdentification>);

// Creates an offline stream that audio samples are fed into.
typedef SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative
    = Pointer<SherpaOnnxOfflineStream> Function(
        Pointer<SherpaOnnxSpokenLanguageIdentification>);

typedef SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream
    = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative;

// Runs identification on a stream; returns a result struct to be freed
// with the destroy-result function below.
typedef SherpaOnnxSpokenLanguageIdentificationComputeNative
    = Pointer<SherpaOnnxSpokenLanguageIdentificationResult> Function(
        Pointer<SherpaOnnxSpokenLanguageIdentification>,
        Pointer<SherpaOnnxOfflineStream>);

typedef SherpaOnnxSpokenLanguageIdentificationCompute
    = SherpaOnnxSpokenLanguageIdentificationComputeNative;

// Frees a result returned by the compute function.
typedef SherpaOnnxDestroySpokenLanguageIdentificationResultNative = Void
    Function(Pointer<SherpaOnnxSpokenLanguageIdentificationResult>);

typedef SherpaOnnxDestroySpokenLanguageIdentificationResult = void Function(
    Pointer<SherpaOnnxSpokenLanguageIdentificationResult>);
typedef SherpaOnnxCreateOfflineSpeakerDiarizationNative
= Pointer<SherpaOnnxOfflineSpeakerDiarization> Function(
Pointer<SherpaOnnxOfflineSpeakerDiarizationConfig>);
... ... @@ -1344,6 +1404,17 @@ class SherpaOnnxBindings {
static SherpaOnnxOfflineSpeechDenoiserRun? sherpaOnnxOfflineSpeechDenoiserRun;
static SherpaOnnxDestroyDenoisedAudio? sherpaOnnxDestroyDenoisedAudio;
// Function pointers for spoken language identification, resolved from the
// native library. They stay null until the bindings are initialized.
static SherpaOnnxCreateSpokenLanguageIdentification?
    sherpaOnnxCreateSpokenLanguageIdentification;

static SherpaOnnxDestroySpokenLanguageIdentification?
    sherpaOnnxDestroySpokenLanguageIdentification;

static SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream?
    sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream;

static SherpaOnnxSpokenLanguageIdentificationCompute?
    sherpaOnnxSpokenLanguageIdentificationCompute;

static SherpaOnnxDestroySpokenLanguageIdentificationResult?
    sherpaOnnxDestroySpokenLanguageIdentificationResult;
static SherpaOnnxCreateOfflineSpeakerDiarization?
sherpaOnnxCreateOfflineSpeakerDiarization;
static SherpaOnnxDestroyOfflineSpeakerDiarization?
... ... @@ -1574,6 +1645,41 @@ class SherpaOnnxBindings {
'SherpaOnnxDestroyDenoisedAudio')
.asFunction();
// Resolve the spoken-language-identification symbols from the dynamic
// library. `??=` keeps an already-resolved function pointer, so repeated
// initialization is a no-op for each symbol.
sherpaOnnxCreateSpokenLanguageIdentification ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxCreateSpokenLanguageIdentificationNative>>(
        'SherpaOnnxCreateSpokenLanguageIdentification')
    .asFunction();

sherpaOnnxDestroySpokenLanguageIdentification ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxDestroySpokenLanguageIdentificationNative>>(
        'SherpaOnnxDestroySpokenLanguageIdentification')
    .asFunction();

sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative>>(
        'SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream')
    .asFunction();

sherpaOnnxSpokenLanguageIdentificationCompute ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxSpokenLanguageIdentificationComputeNative>>(
        'SherpaOnnxSpokenLanguageIdentificationCompute')
    .asFunction();

sherpaOnnxDestroySpokenLanguageIdentificationResult ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxDestroySpokenLanguageIdentificationResultNative>>(
        'SherpaOnnxDestroySpokenLanguageIdentificationResult')
    .asFunction();
sherpaOnnxCreateOfflineSpeakerDiarization ??= dynamicLibrary
.lookup<
NativeFunction<
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:ffi';
import 'package:ffi/ffi.dart';
import './offline_stream.dart';
import './sherpa_onnx_bindings.dart';
import './utils.dart';
/// Whisper model files used for spoken language identification.
class SpokenLanguageIdentificationWhisperConfig {
  /// Path to the whisper encoder model.
  final String encoder;

  /// Path to the whisper decoder model.
  final String decoder;

  /// Tail paddings for the whisper model.
  final int tailPaddings;

  const SpokenLanguageIdentificationWhisperConfig({
    this.encoder = '',
    this.decoder = '',
    this.tailPaddings = 0,
  });

  /// Builds a config from a JSON map; missing keys fall back to defaults.
  factory SpokenLanguageIdentificationWhisperConfig.fromJson(
      Map<String, dynamic> json) {
    final encoderPath = json['encoder'] as String? ?? '';
    final decoderPath = json['decoder'] as String? ?? '';
    final paddings = json['tailPaddings'] as int? ?? 0;
    return SpokenLanguageIdentificationWhisperConfig(
      encoder: encoderPath,
      decoder: decoderPath,
      tailPaddings: paddings,
    );
  }

  /// Serializes this config to a JSON-compatible map.
  Map<String, dynamic> toJson() => <String, dynamic>{
        'encoder': encoder,
        'decoder': decoder,
        'tailPaddings': tailPaddings,
      };

  @override
  String toString() =>
      'SpokenLanguageIdentificationWhisperConfig(encoder: $encoder, decoder: $decoder, tailPaddings: $tailPaddings)';
}
/// Configuration for [SpokenLanguageIdentification].
class SpokenLanguageIdentificationConfig {
  /// The whisper model configuration.
  final SpokenLanguageIdentificationWhisperConfig whisper;

  /// Number of threads to use for computation.
  final int numThreads;

  /// Whether to enable debug output in the native library.
  final bool debug;

  /// Execution provider, e.g. 'cpu'.
  final String provider;

  const SpokenLanguageIdentificationConfig({
    this.whisper = const SpokenLanguageIdentificationWhisperConfig(),
    this.numThreads = 1,
    this.debug = false,
    this.provider = 'cpu',
  });

  /// Builds a config from a JSON map; missing keys fall back to defaults.
  factory SpokenLanguageIdentificationConfig.fromJson(
      Map<String, dynamic> json) {
    final rawWhisper = json['whisper'];
    final whisperConfig = rawWhisper == null
        ? const SpokenLanguageIdentificationWhisperConfig()
        : SpokenLanguageIdentificationWhisperConfig.fromJson(
            rawWhisper as Map<String, dynamic>);
    return SpokenLanguageIdentificationConfig(
      whisper: whisperConfig,
      numThreads: json['numThreads'] as int? ?? 1,
      debug: json['debug'] as bool? ?? false,
      provider: json['provider'] as String? ?? 'cpu',
    );
  }

  /// Serializes this config to a JSON-compatible map.
  Map<String, dynamic> toJson() => <String, dynamic>{
        'whisper': whisper.toJson(),
        'numThreads': numThreads,
        'debug': debug,
        'provider': provider,
      };

  @override
  String toString() =>
      'SpokenLanguageIdentificationConfig(whisper: $whisper, numThreads: $numThreads, debug: $debug, provider: $provider)';
}
/// Result of spoken language identification.
class SpokenLanguageIdentificationResult {
  /// The detected language code; empty when detection produced no result.
  final String lang;

  const SpokenLanguageIdentificationResult({
    required this.lang,
  });

  /// Builds a result from a JSON map; a missing key yields an empty string.
  factory SpokenLanguageIdentificationResult.fromJson(
          Map<String, dynamic> json) =>
      SpokenLanguageIdentificationResult(
        lang: json['lang'] as String? ?? '',
      );

  /// Serializes this result to a JSON-compatible map.
  Map<String, dynamic> toJson() => <String, dynamic>{'lang': lang};

  @override
  String toString() => 'SpokenLanguageIdentificationResult(lang: $lang)';
}
/// High-level wrapper around the native spoken language identification API.
///
/// Create an instance with the [SpokenLanguageIdentification] factory and
/// call [free] when done to release the native handle.
class SpokenLanguageIdentification {
  /// Wraps an existing native pointer; the caller remains responsible for
  /// calling [free].
  SpokenLanguageIdentification.fromPtr(
      {required this.ptr, required this.config});

  SpokenLanguageIdentification._({required this.ptr, required this.config});

  /// Releases the native handle and resets [ptr] to [nullptr].
  void free() {
    SherpaOnnxBindings.sherpaOnnxDestroySpokenLanguageIdentification?.call(ptr);
    ptr = nullptr;
  }

  /// The user is responsible to call the SpokenLanguageIdentification.free()
  /// method of the returned instance to avoid memory leak.
  factory SpokenLanguageIdentification(
      SpokenLanguageIdentificationConfig config) {
    // Convert the Dart config into a calloc-ed native struct; it must be
    // freed on every exit path below.
    final c = convertConfig(config);
    if (SherpaOnnxBindings.sherpaOnnxCreateSpokenLanguageIdentification ==
        null) {
      // Bindings not initialized (initBindings was never called).
      freeConfig(c);
      throw Exception("Please initialize sherpa-onnx first");
    }
    final ptr = SherpaOnnxBindings.sherpaOnnxCreateSpokenLanguageIdentification
            ?.call(c) ??
        nullptr;
    if (ptr == nullptr) {
      // Native creation failed (e.g. bad model paths in the config).
      freeConfig(c);
      throw Exception(
          "Failed to create spoken language identification. Please check your config");
    }
    // The native struct is no longer needed once the handle is created.
    freeConfig(c);
    return SpokenLanguageIdentification._(ptr: ptr, config: config);
  }

  /// Allocates and fills the native config struct from [config].
  ///
  /// The strings inside the returned struct are allocated with
  /// toNativeUtf8() and must be released via [freeConfig].
  static Pointer<SherpaOnnxSpokenLanguageIdentificationConfig> convertConfig(
      SpokenLanguageIdentificationConfig config) {
    final c = calloc<SherpaOnnxSpokenLanguageIdentificationConfig>();
    c.ref.whisper.encoder = config.whisper.encoder.toNativeUtf8();
    c.ref.whisper.decoder = config.whisper.decoder.toNativeUtf8();
    c.ref.whisper.tailPaddings = config.whisper.tailPaddings;
    c.ref.numThreads = config.numThreads;
    // The native side expects a C-style int flag, not a Dart bool.
    c.ref.debug = config.debug ? 1 : 0;
    c.ref.provider = config.provider.toNativeUtf8();
    return c;
  }

  /// Frees the strings inside the native config struct, then the struct.
  static void freeConfig(
      Pointer<SherpaOnnxSpokenLanguageIdentificationConfig> c) {
    malloc.free(c.ref.whisper.encoder);
    malloc.free(c.ref.whisper.decoder);
    malloc.free(c.ref.provider);
    malloc.free(c);
  }

  /// The user has to invoke stream.free() on the returned instance
  /// to avoid memory leak
  OfflineStream createStream() {
    final p = SherpaOnnxBindings
            .sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream
            ?.call(ptr) ??
        nullptr;
    return OfflineStream(ptr: p);
  }

  /// Runs language identification on [stream].
  ///
  /// Returns a result with an empty `lang` when the native call fails or
  /// the bindings are not initialized. The native result struct is freed
  /// before returning.
  SpokenLanguageIdentificationResult compute(OfflineStream stream) {
    final result = SherpaOnnxBindings
            .sherpaOnnxSpokenLanguageIdentificationCompute
            ?.call(ptr, stream.ptr) ??
        nullptr;
    if (result == nullptr) {
      return const SpokenLanguageIdentificationResult(lang: '');
    }
    // Copy the C string into a Dart string before freeing the native result.
    final lang = toDartString(result.ref.lang);
    SherpaOnnxBindings.sherpaOnnxDestroySpokenLanguageIdentificationResult
        ?.call(result);
    return SpokenLanguageIdentificationResult(lang: lang);
  }

  // Native handle; nullptr after free() has been called.
  Pointer<SherpaOnnxSpokenLanguageIdentification> ptr;

  // The config used to create this instance (kept for reference).
  SpokenLanguageIdentificationConfig config;
}
... ...