Add Dart API for spoken language identification (#2596)

Committed by GitHub

Showing 9 changed files with 506 additions and 0 deletions.
**README.md** (new file)

```md
# Introduction

This example shows how to use the Dart API from sherpa-onnx for spoken language identification.

| File | Description |
|------|-------------|
| [./bin/spoken_language_identification.dart](./bin/spoken_language_identification.dart) | Use a whisper model for spoken language identification. See also [./run-whisper.sh](./run-whisper.sh). |
```
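For orientation, here is a condensed sketch of what the example does (the model and WAV paths follow what [./run-whisper.sh](./run-whisper.sh) downloads; argument parsing and error handling are omitted, and the native library must already be loaded, e.g. via `initSherpaOnnx()` from `bin/init.dart`):

```dart
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

void main() {
  // Assumes sherpa_onnx.initBindings(...) has already loaded the
  // native library (see bin/init.dart below).
  final config = sherpa_onnx.SpokenLanguageIdentificationConfig(
    whisper: sherpa_onnx.SpokenLanguageIdentificationWhisperConfig(
      encoder: './sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx',
      decoder: './sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx',
    ),
  );

  final slid = sherpa_onnx.SpokenLanguageIdentification(config);
  final stream = slid.createStream();

  final wave = sherpa_onnx.readWave('./en-english.wav');
  stream.acceptWaveform(samples: wave.samples, sampleRate: wave.sampleRate);

  print(slid.compute(stream).lang); // a language code such as 'en'

  stream.free();
  slid.free();
}
```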
**analysis_options.yaml** (new file)

```yaml
include: package:lints/recommended.yaml

analyzer:
  language:
    strict-casts: true
    strict-inference: true
    strict-raw-types: true

linter:
  rules:
    - always_use_package_imports
    - avoid_dynamic_calls
    - cancel_subscriptions
    - close_sinks
    - unawaited_futures
    - use_super_parameters
```
**bin/init.dart** (new file)

```dart
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:isolate';
import 'package:path/path.dart' as p;
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

Future<void> initSherpaOnnx() async {
  String platform = '';

  if (Platform.isMacOS) {
    platform = 'macos';
  } else if (Platform.isLinux) {
    platform = 'linux';
  } else if (Platform.isWindows) {
    platform = 'windows';
  } else {
    throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}');
  }

  var uri = await Isolate.resolvePackageUri(
      Uri.parse('package:sherpa_onnx_$platform/any_path_is_ok_here.dart'));

  if (uri == null) {
    print('File not found');
    exit(1);
  }

  var libPath = p.join(p.dirname(p.fromUri(uri)), '..', platform);
  if (platform == 'linux') {
    final arch = Platform.version.contains('arm64') ||
            Platform.version.contains('aarch64')
        ? 'aarch64'
        : 'x64';
    libPath = p.join(p.dirname(p.fromUri(uri)), '..', platform, arch);
  }

  sherpa_onnx.initBindings(libPath);
}
```
**bin/spoken_language_identification.dart** (new file)

```dart
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';

void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('encoder', help: 'Path to the whisper encoder model')
    ..addOption('decoder', help: 'Path to the whisper decoder model')
    ..addOption('tail-paddings',
        help: 'Tail paddings for the whisper model', defaultsTo: '0')
    ..addOption('wav', help: 'Path to test.wav for language identification')
    ..addFlag('help',
        abbr: 'h', help: 'Show this help message', negatable: false);

  final res = parser.parse(arguments);
  if (res['help'] as bool) {
    print(parser.usage);
    exit(0);
  }

  if (res['encoder'] == null || res['decoder'] == null || res['wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  final encoder = res['encoder'] as String;
  final decoder = res['decoder'] as String;
  final tailPaddings = int.tryParse(res['tail-paddings'] as String) ?? 0;
  final wav = res['wav'] as String;

  final whisperConfig = sherpa_onnx.SpokenLanguageIdentificationWhisperConfig(
    encoder: encoder,
    decoder: decoder,
    tailPaddings: tailPaddings,
  );

  final config = sherpa_onnx.SpokenLanguageIdentificationConfig(
    whisper: whisperConfig,
    numThreads: 1,
    debug: true,
    provider: 'cpu',
  );

  final slid = sherpa_onnx.SpokenLanguageIdentification(config);

  final waveData = sherpa_onnx.readWave(wav);

  final stream = slid.createStream();
  stream.acceptWaveform(
      samples: waveData.samples, sampleRate: waveData.sampleRate);

  final result = slid.compute(stream);

  print('File: $wav');
  print('Detected language: ${result.lang}');

  stream.free();
  slid.free();
}
```
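A possible hardening of the example above (not part of this PR): since `free()` must be called explicitly on the native wrappers, guarding the calls with `try`/`finally` keeps resources released even when `compute()` throws. A minimal sketch, using the API exactly as defined in this PR:

```dart
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

/// Returns the detected language of [wavPath]; the stream is freed even if
/// compute() throws. The caller still owns (and must free) [slid].
String identifyLanguage(
    sherpa_onnx.SpokenLanguageIdentification slid, String wavPath) {
  final wave = sherpa_onnx.readWave(wavPath);
  final stream = slid.createStream();
  try {
    stream.acceptWaveform(samples: wave.samples, sampleRate: wave.sampleRate);
    return slid.compute(stream).lang;
  } finally {
    stream.free(); // the native offline stream is not garbage collected
  }
}
```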
**pubspec.yaml** (new file)

```yaml
name: spoken_language_identification

description: >
  This example demonstrates how to use the Dart API for spoken language
  identification.

version: 1.0.0

environment:
  sdk: ">=3.0.0 <4.0.0"

# Add regular dependencies here.
dependencies:
  sherpa_onnx: ^1.12.13
  # sherpa_onnx:
  #   path: ../../flutter/sherpa_onnx
  path: ^1.9.0
  args: ^2.5.0

dev_dependencies:
  lints: ^3.0.0
```
**run-whisper.sh** (new file)

```bash
#!/usr/bin/env bash

set -ex

dart pub get

if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
  rm sherpa-onnx-whisper-tiny.tar.bz2
fi

# Download test WAV files
waves=(
# ar-arabic.wav
# bg-bulgarian.wav
# cs-czech.wav
# da-danish.wav
# de-german.wav
# el-greek.wav
en-english.wav
es-spanish.wav
# fa-persian.wav
# fi-finnish.wav
# fr-french.wav
# hi-hindi.wav
# hr-croatian.wav
# id-indonesian.wav
# it-italian.wav
# ja-japanese.wav
# ko-korean.wav
# nl-dutch.wav
# no-norwegian.wav
# pl-polish.wav
# pt-portuguese.wav
# ro-romanian.wav
ru-russian.wav
# sk-slovak.wav
# sv-swedish.wav
# ta-tamil.wav
# tl-tagalog.wav
# tr-turkish.wav
# uk-ukrainian.wav
zh-chinese.wav
)

for wav in "${waves[@]}"; do
  if [ ! -f ./$wav ]; then
    echo "Downloading $wav"
    curl -SL -O https://hf-mirror.com/spaces/k2-fsa/spoken-language-identification/resolve/main/test_wavs/$wav
  fi

  echo "Testing $wav"
  dart run \
    ./bin/spoken_language_identification.dart \
    --encoder ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx \
    --decoder ./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx \
    --wav ./$wav

  echo "----------------------------------------"
done
```
**lib/sherpa_onnx.dart**

```diff
@@ -15,6 +15,7 @@ export 'src/online_punctuation.dart';
 export 'src/online_recognizer.dart';
 export 'src/online_stream.dart';
 export 'src/speaker_identification.dart';
+export 'src/spoken_language_identification.dart';
 export 'src/tts.dart';
 export 'src/vad.dart';
 export 'src/version.dart';
```
**lib/src/sherpa_onnx_bindings.dart**

```diff
@@ -626,6 +626,32 @@ final class SherpaOnnxOfflineSpeakerDiarization extends Opaque {}
 
 final class SherpaOnnxOfflineSpeakerDiarizationResult extends Opaque {}
 
+final class SherpaOnnxSpokenLanguageIdentificationWhisperConfig extends Struct {
+  external Pointer<Utf8> encoder;
+  external Pointer<Utf8> decoder;
+
+  @Int32()
+  external int tailPaddings;
+}
+
+final class SherpaOnnxSpokenLanguageIdentificationConfig extends Struct {
+  external SherpaOnnxSpokenLanguageIdentificationWhisperConfig whisper;
+
+  @Int32()
+  external int numThreads;
+
+  @Int32()
+  external int debug;
+
+  external Pointer<Utf8> provider;
+}
+
+final class SherpaOnnxSpokenLanguageIdentificationResult extends Struct {
+  external Pointer<Utf8> lang;
+}
+
+final class SherpaOnnxSpokenLanguageIdentification extends Opaque {}
+
 final class SherpaOnnxOfflineSpeechDenoiser extends Opaque {}
 
 typedef SherpaOnnxCreateOfflineSpeechDenoiserNative
@@ -661,6 +687,40 @@ typedef SherpaOnnxDestroyDenoisedAudioNative = Void Function(
 typedef SherpaOnnxDestroyDenoisedAudio = void Function(
     Pointer<SherpaOnnxDenoisedAudio>);
 
+typedef SherpaOnnxCreateSpokenLanguageIdentificationNative
+    = Pointer<SherpaOnnxSpokenLanguageIdentification> Function(
+        Pointer<SherpaOnnxSpokenLanguageIdentificationConfig>);
+
+typedef SherpaOnnxCreateSpokenLanguageIdentification
+    = SherpaOnnxCreateSpokenLanguageIdentificationNative;
+
+typedef SherpaOnnxDestroySpokenLanguageIdentificationNative = Void Function(
+    Pointer<SherpaOnnxSpokenLanguageIdentification>);
+
+typedef SherpaOnnxDestroySpokenLanguageIdentification = void Function(
+    Pointer<SherpaOnnxSpokenLanguageIdentification>);
+
+typedef SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative
+    = Pointer<SherpaOnnxOfflineStream> Function(
+        Pointer<SherpaOnnxSpokenLanguageIdentification>);
+
+typedef SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream
+    = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative;
+
+typedef SherpaOnnxSpokenLanguageIdentificationComputeNative
+    = Pointer<SherpaOnnxSpokenLanguageIdentificationResult> Function(
+        Pointer<SherpaOnnxSpokenLanguageIdentification>,
+        Pointer<SherpaOnnxOfflineStream>);
+
+typedef SherpaOnnxSpokenLanguageIdentificationCompute
+    = SherpaOnnxSpokenLanguageIdentificationComputeNative;
+
+typedef SherpaOnnxDestroySpokenLanguageIdentificationResultNative = Void
+    Function(Pointer<SherpaOnnxSpokenLanguageIdentificationResult>);
+
+typedef SherpaOnnxDestroySpokenLanguageIdentificationResult = void Function(
+    Pointer<SherpaOnnxSpokenLanguageIdentificationResult>);
+
 typedef SherpaOnnxCreateOfflineSpeakerDiarizationNative
     = Pointer<SherpaOnnxOfflineSpeakerDiarization> Function(
         Pointer<SherpaOnnxOfflineSpeakerDiarizationConfig>);
@@ -1344,6 +1404,17 @@ class SherpaOnnxBindings {
   static SherpaOnnxOfflineSpeechDenoiserRun? sherpaOnnxOfflineSpeechDenoiserRun;
   static SherpaOnnxDestroyDenoisedAudio? sherpaOnnxDestroyDenoisedAudio;
 
+  static SherpaOnnxCreateSpokenLanguageIdentification?
+      sherpaOnnxCreateSpokenLanguageIdentification;
+  static SherpaOnnxDestroySpokenLanguageIdentification?
+      sherpaOnnxDestroySpokenLanguageIdentification;
+  static SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream?
+      sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream;
+  static SherpaOnnxSpokenLanguageIdentificationCompute?
+      sherpaOnnxSpokenLanguageIdentificationCompute;
+  static SherpaOnnxDestroySpokenLanguageIdentificationResult?
+      sherpaOnnxDestroySpokenLanguageIdentificationResult;
+
   static SherpaOnnxCreateOfflineSpeakerDiarization?
       sherpaOnnxCreateOfflineSpeakerDiarization;
   static SherpaOnnxDestroyOfflineSpeakerDiarization?
@@ -1574,6 +1645,41 @@
             'SherpaOnnxDestroyDenoisedAudio')
         .asFunction();
 
+    sherpaOnnxCreateSpokenLanguageIdentification ??= dynamicLibrary
+        .lookup<
+                NativeFunction<
+                    SherpaOnnxCreateSpokenLanguageIdentificationNative>>(
+            'SherpaOnnxCreateSpokenLanguageIdentification')
+        .asFunction();
+
+    sherpaOnnxDestroySpokenLanguageIdentification ??= dynamicLibrary
+        .lookup<
+                NativeFunction<
+                    SherpaOnnxDestroySpokenLanguageIdentificationNative>>(
+            'SherpaOnnxDestroySpokenLanguageIdentification')
+        .asFunction();
+
+    sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream ??= dynamicLibrary
+        .lookup<
+                NativeFunction<
+                    SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative>>(
+            'SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream')
+        .asFunction();
+
+    sherpaOnnxSpokenLanguageIdentificationCompute ??= dynamicLibrary
+        .lookup<
+                NativeFunction<
+                    SherpaOnnxSpokenLanguageIdentificationComputeNative>>(
+            'SherpaOnnxSpokenLanguageIdentificationCompute')
+        .asFunction();
+
+    sherpaOnnxDestroySpokenLanguageIdentificationResult ??= dynamicLibrary
+        .lookup<
+                NativeFunction<
+                    SherpaOnnxDestroySpokenLanguageIdentificationResultNative>>(
+            'SherpaOnnxDestroySpokenLanguageIdentificationResult')
+        .asFunction();
+
     sherpaOnnxCreateOfflineSpeakerDiarization ??= dynamicLibrary
         .lookup<
             NativeFunction<
```
**lib/src/spoken_language_identification.dart** (new file)

```dart
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:ffi';

import 'package:ffi/ffi.dart';

import './offline_stream.dart';
import './sherpa_onnx_bindings.dart';
import './utils.dart';

class SpokenLanguageIdentificationWhisperConfig {
  const SpokenLanguageIdentificationWhisperConfig({
    this.encoder = '',
    this.decoder = '',
    this.tailPaddings = 0,
  });

  factory SpokenLanguageIdentificationWhisperConfig.fromJson(
      Map<String, dynamic> json) {
    return SpokenLanguageIdentificationWhisperConfig(
      encoder: json['encoder'] as String? ?? '',
      decoder: json['decoder'] as String? ?? '',
      tailPaddings: json['tailPaddings'] as int? ?? 0,
    );
  }

  @override
  String toString() {
    return 'SpokenLanguageIdentificationWhisperConfig(encoder: $encoder, decoder: $decoder, tailPaddings: $tailPaddings)';
  }

  Map<String, dynamic> toJson() => {
        'encoder': encoder,
        'decoder': decoder,
        'tailPaddings': tailPaddings,
      };

  final String encoder;
  final String decoder;
  final int tailPaddings;
}

class SpokenLanguageIdentificationConfig {
  const SpokenLanguageIdentificationConfig({
    this.whisper = const SpokenLanguageIdentificationWhisperConfig(),
    this.numThreads = 1,
    this.debug = false,
    this.provider = 'cpu',
  });

  factory SpokenLanguageIdentificationConfig.fromJson(
      Map<String, dynamic> json) {
    return SpokenLanguageIdentificationConfig(
      whisper: json['whisper'] != null
          ? SpokenLanguageIdentificationWhisperConfig.fromJson(
              json['whisper'] as Map<String, dynamic>)
          : const SpokenLanguageIdentificationWhisperConfig(),
      numThreads: json['numThreads'] as int? ?? 1,
      debug: json['debug'] as bool? ?? false,
      provider: json['provider'] as String? ?? 'cpu',
    );
  }

  @override
  String toString() {
    return 'SpokenLanguageIdentificationConfig(whisper: $whisper, numThreads: $numThreads, debug: $debug, provider: $provider)';
  }

  Map<String, dynamic> toJson() => {
        'whisper': whisper.toJson(),
        'numThreads': numThreads,
        'debug': debug,
        'provider': provider,
      };

  final SpokenLanguageIdentificationWhisperConfig whisper;
  final int numThreads;
  final bool debug;
  final String provider;
}

class SpokenLanguageIdentificationResult {
  const SpokenLanguageIdentificationResult({
    required this.lang,
  });

  factory SpokenLanguageIdentificationResult.fromJson(
      Map<String, dynamic> json) {
    return SpokenLanguageIdentificationResult(
      lang: json['lang'] as String? ?? '',
    );
  }

  @override
  String toString() {
    return 'SpokenLanguageIdentificationResult(lang: $lang)';
  }

  Map<String, dynamic> toJson() => {
        'lang': lang,
      };

  final String lang;
}

class SpokenLanguageIdentification {
  SpokenLanguageIdentification.fromPtr(
      {required this.ptr, required this.config});

  SpokenLanguageIdentification._({required this.ptr, required this.config});

  void free() {
    SherpaOnnxBindings.sherpaOnnxDestroySpokenLanguageIdentification?.call(ptr);
    ptr = nullptr;
  }

  /// The user is responsible for calling SpokenLanguageIdentification.free()
  /// on the returned instance to avoid a memory leak.
  factory SpokenLanguageIdentification(
      SpokenLanguageIdentificationConfig config) {
    final c = convertConfig(config);

    if (SherpaOnnxBindings.sherpaOnnxCreateSpokenLanguageIdentification ==
        null) {
      freeConfig(c);
      throw Exception("Please initialize sherpa-onnx first");
    }

    final ptr = SherpaOnnxBindings.sherpaOnnxCreateSpokenLanguageIdentification
            ?.call(c) ??
        nullptr;

    if (ptr == nullptr) {
      freeConfig(c);
      throw Exception(
          "Failed to create spoken language identification. Please check your config");
    }

    freeConfig(c);

    return SpokenLanguageIdentification._(ptr: ptr, config: config);
  }

  static Pointer<SherpaOnnxSpokenLanguageIdentificationConfig> convertConfig(
      SpokenLanguageIdentificationConfig config) {
    final c = calloc<SherpaOnnxSpokenLanguageIdentificationConfig>();

    c.ref.whisper.encoder = config.whisper.encoder.toNativeUtf8();
    c.ref.whisper.decoder = config.whisper.decoder.toNativeUtf8();
    c.ref.whisper.tailPaddings = config.whisper.tailPaddings;

    c.ref.numThreads = config.numThreads;
    c.ref.debug = config.debug ? 1 : 0;
    c.ref.provider = config.provider.toNativeUtf8();

    return c;
  }

  static void freeConfig(
      Pointer<SherpaOnnxSpokenLanguageIdentificationConfig> c) {
    malloc.free(c.ref.whisper.encoder);
    malloc.free(c.ref.whisper.decoder);
    malloc.free(c.ref.provider);
    malloc.free(c);
  }

  /// The user has to invoke stream.free() on the returned instance
  /// to avoid a memory leak.
  OfflineStream createStream() {
    final p = SherpaOnnxBindings
            .sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream
            ?.call(ptr) ??
        nullptr;
    return OfflineStream(ptr: p);
  }

  SpokenLanguageIdentificationResult compute(OfflineStream stream) {
    final result = SherpaOnnxBindings
            .sherpaOnnxSpokenLanguageIdentificationCompute
            ?.call(ptr, stream.ptr) ??
        nullptr;

    if (result == nullptr) {
      return const SpokenLanguageIdentificationResult(lang: '');
    }

    final lang = toDartString(result.ref.lang);

    SherpaOnnxBindings.sherpaOnnxDestroySpokenLanguageIdentificationResult
        ?.call(result);

    return SpokenLanguageIdentificationResult(lang: lang);
  }

  Pointer<SherpaOnnxSpokenLanguageIdentification> ptr;
  SpokenLanguageIdentificationConfig config;
}
```
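Since the config classes come with `fromJson`/`toJson`, a configuration can be round-tripped through a plain map, e.g. one read from an app settings file. A minimal sketch; the paths and values below are placeholders:

```dart
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

void main() {
  // Placeholder values; the keys match those consumed by fromJson().
  final json = <String, dynamic>{
    'whisper': <String, dynamic>{
      'encoder': './tiny-encoder.int8.onnx',
      'decoder': './tiny-decoder.int8.onnx',
      'tailPaddings': 300,
    },
    'numThreads': 2,
  };

  final config =
      sherpa_onnx.SpokenLanguageIdentificationConfig.fromJson(json);

  // Omitted keys fall back to the defaults (provider: 'cpu', debug: false).
  print(config);

  // toJson() emits the same key names that fromJson() consumes.
  assert(config.toJson()['numThreads'] == 2);
}
```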