Kirill Bukaev
Committed by GitHub

Add Dart API for spoken language identification (#2596)

# Introduction
This example shows how to use the Dart API from sherpa-onnx for spoken language identification.
| File | Description|
|------|------------|
|[./bin/spoken_language_identification.dart](./bin/spoken_language_identification.dart)| Use a whisper model for spoken language identification. See also [./run-whisper.sh](./run-whisper.sh)|
... ...
include: package:lints/recommended.yaml
analyzer:
language:
strict-casts: true
strict-inference: true
strict-raw-types: true
linter:
rules:
- always_use_package_imports
- avoid_dynamic_calls
- cancel_subscriptions
- close_sinks
- unawaited_futures
- use_super_parameters
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:isolate';
import 'package:path/path.dart' as p;
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
/// Locates and loads the platform-specific sherpa-onnx native library.
///
/// Resolves the `sherpa_onnx_<platform>` package directory via the isolate's
/// package resolver and points the FFI bindings at the shipped shared
/// library. Exits the process when the package cannot be resolved.
///
/// Throws [UnsupportedError] on platforms other than macOS, Linux and
/// Windows.
Future<void> initSherpaOnnx() async {
  // Map the current OS to the name of the platform-specific package.
  final String platform;
  if (Platform.isMacOS) {
    platform = 'macos';
  } else if (Platform.isLinux) {
    platform = 'linux';
  } else if (Platform.isWindows) {
    platform = 'windows';
  } else {
    throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}');
  }

  // The file name is irrelevant; we only need the resolved package root.
  final uri = await Isolate.resolvePackageUri(
      Uri.parse('package:sherpa_onnx_$platform/any_path_is_ok_here.dart'));
  if (uri == null) {
    print('File not found');
    exit(1);
  }

  // Base directory holding the prebuilt shared libraries.
  var libPath = p.join(p.dirname(p.fromUri(uri)), '..', platform);

  if (platform == 'linux') {
    // On Linux the libraries are split per-architecture; Platform.version
    // embeds the host architecture string (e.g. "... on linux_arm64").
    final version = Platform.version;
    final arch =
        version.contains('arm64') || version.contains('aarch64')
            ? 'aarch64'
            : 'x64';
    libPath = p.join(libPath, arch);
  }

  sherpa_onnx.initBindings(libPath);
}
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';
/// Identifies the spoken language of a WAV file using a whisper model.
///
/// Usage:
///   dart run ./bin/spoken_language_identification.dart \
///     --encoder <encoder.onnx> --decoder <decoder.onnx> --wav <file.wav>
void main(List<String> arguments) async {
  // Load the platform-specific native library before touching any bindings.
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('encoder', help: 'Path to the whisper encoder model')
    ..addOption('decoder', help: 'Path to the whisper decoder model')
    ..addOption('tail-paddings',
        help: 'Tail paddings for the whisper model', defaultsTo: '0')
    ..addOption('wav', help: 'Path to test.wav for language identification')
    ..addFlag('help',
        abbr: 'h', help: 'Show this help message', negatable: false);

  // parse() throws FormatException on unknown or malformed options; show
  // the usage message instead of an uncaught-exception stack trace.
  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    print(e.message);
    print(parser.usage);
    exit(1);
  }

  if (res['help'] as bool) {
    print(parser.usage);
    exit(0);
  }

  if (res['encoder'] == null || res['decoder'] == null || res['wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  final encoder = res['encoder'] as String;
  final decoder = res['decoder'] as String;
  // Falls back to 0 when the supplied value is not a valid integer.
  final tailPaddings = int.tryParse(res['tail-paddings'] as String) ?? 0;
  final wav = res['wav'] as String;

  final whisperConfig = sherpa_onnx.SpokenLanguageIdentificationWhisperConfig(
    encoder: encoder,
    decoder: decoder,
    tailPaddings: tailPaddings,
  );

  final config = sherpa_onnx.SpokenLanguageIdentificationConfig(
    whisper: whisperConfig,
    numThreads: 1,
    debug: true,
    provider: 'cpu',
  );

  final slid = sherpa_onnx.SpokenLanguageIdentification(config);

  final waveData = sherpa_onnx.readWave(wav);
  final stream = slid.createStream();
  stream.acceptWaveform(
      samples: waveData.samples, sampleRate: waveData.sampleRate);

  final result = slid.compute(stream);
  print('File: $wav');
  print('Detected language: ${result.lang}');

  // Release the native stream and identifier handles.
  stream.free();
  slid.free();
}
... ...
name: spoken_language_identification
description: >
This example demonstrates how to use the Dart API for spoken language identification.
version: 1.0.0
environment:
sdk: ">=3.0.0 <4.0.0"
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.12.13
# sherpa_onnx:
# path: ../../flutter/sherpa_onnx
path: ^1.9.0
args: ^2.5.0
dev_dependencies:
lints: ^3.0.0
... ...
#!/usr/bin/env bash
# Downloads a whisper tiny model plus a few test WAV files and runs the
# Dart spoken language identification example on each file.
set -ex

dart pub get

# Download and unpack the whisper tiny model if it is not already present.
if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
  rm sherpa-onnx-whisper-tiny.tar.bz2
fi

# Download test WAV files
# Uncomment entries below to test more languages.
waves=(
  # ar-arabic.wav
  # bg-bulgarian.wav
  # cs-czech.wav
  # da-danish.wav
  # de-german.wav
  # el-greek.wav
  en-english.wav
  es-spanish.wav
  # fa-persian.wav
  # fi-finnish.wav
  # fr-french.wav
  # hi-hindi.wav
  # hr-croatian.wav
  # id-indonesian.wav
  # it-italian.wav
  # ja-japanese.wav
  # ko-korean.wav
  # nl-dutch.wav
  # no-norwegian.wav
  # pl-polish.wav
  # pt-portuguese.wav
  # ro-romanian.wav
  ru-russian.wav
  # sk-slovak.wav
  # sv-swedish.wav
  # ta-tamil.wav
  # tl-tagalog.wav
  # tr-turkish.wav
  # uk-ukrainian.wav
  zh-chinese.wav
)

# Quote the array expansion so file names are never word-split or globbed.
for wav in "${waves[@]}"; do
  if [ ! -f "./$wav" ]; then
    echo "Downloading $wav"
    curl -SL -O "https://hf-mirror.com/spaces/k2-fsa/spoken-language-identification/resolve/main/test_wavs/$wav"
  fi

  echo "Testing $wav"
  dart run \
    ./bin/spoken_language_identification.dart \
    --encoder ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx \
    --decoder ./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx \
    --wav "./$wav"

  echo "----------------------------------------"
done
... ...
... ... @@ -15,6 +15,7 @@ export 'src/online_punctuation.dart';
export 'src/online_recognizer.dart';
export 'src/online_stream.dart';
export 'src/speaker_identification.dart';
export 'src/spoken_language_identification.dart';
export 'src/tts.dart';
export 'src/vad.dart';
export 'src/version.dart';
... ...
... ... @@ -626,6 +626,32 @@ final class SherpaOnnxOfflineSpeakerDiarization extends Opaque {}
final class SherpaOnnxOfflineSpeakerDiarizationResult extends Opaque {}
/// FFI mirror of the C struct
/// `SherpaOnnxSpokenLanguageIdentificationWhisperConfig`.
///
/// Field order and widths must match the native header exactly.
final class SherpaOnnxSpokenLanguageIdentificationWhisperConfig extends Struct {
  // Path to the whisper encoder model (NUL-terminated C string).
  external Pointer<Utf8> encoder;

  // Path to the whisper decoder model (NUL-terminated C string).
  external Pointer<Utf8> decoder;

  // Tail paddings for the whisper model (see the example's CLI help).
  @Int32()
  external int tailPaddings;
}
/// FFI mirror of the C struct `SherpaOnnxSpokenLanguageIdentificationConfig`.
final class SherpaOnnxSpokenLanguageIdentificationConfig extends Struct {
  // Whisper model files; embedded by value, not by pointer.
  external SherpaOnnxSpokenLanguageIdentificationWhisperConfig whisper;

  @Int32()
  external int numThreads;

  // C-style boolean: the Dart wrapper writes 1 (true) or 0 (false).
  @Int32()
  external int debug;

  // Execution provider name, e.g. "cpu" (NUL-terminated C string).
  external Pointer<Utf8> provider;
}
/// FFI mirror of the native result struct; holds the detected language
/// as a C string owned by the native side.
final class SherpaOnnxSpokenLanguageIdentificationResult extends Struct {
  external Pointer<Utf8> lang;
}

/// Opaque handle to the native spoken language identification object.
final class SherpaOnnxSpokenLanguageIdentification extends Opaque {}

/// Opaque handle to the native offline speech denoiser object.
final class SherpaOnnxOfflineSpeechDenoiser extends Opaque {}
typedef SherpaOnnxCreateOfflineSpeechDenoiserNative
... ... @@ -661,6 +687,40 @@ typedef SherpaOnnxDestroyDenoisedAudioNative = Void Function(
typedef SherpaOnnxDestroyDenoisedAudio = void Function(
Pointer<SherpaOnnxDenoisedAudio>);
// C function signatures for spoken language identification.
//
// Each native symbol has a `...Native` typedef describing the C signature
// (used as the lookup type) and a Dart-side typedef used with asFunction().

// Creates the identifier from a config; returns a handle (or nullptr).
typedef SherpaOnnxCreateSpokenLanguageIdentificationNative
    = Pointer<SherpaOnnxSpokenLanguageIdentification> Function(
        Pointer<SherpaOnnxSpokenLanguageIdentificationConfig>);

typedef SherpaOnnxCreateSpokenLanguageIdentification
    = SherpaOnnxCreateSpokenLanguageIdentificationNative;

// Destroys a handle previously returned by the create function.
typedef SherpaOnnxDestroySpokenLanguageIdentificationNative = Void Function(
    Pointer<SherpaOnnxSpokenLanguageIdentification>);

typedef SherpaOnnxDestroySpokenLanguageIdentification = void Function(
    Pointer<SherpaOnnxSpokenLanguageIdentification>);

// Creates an offline stream that audio samples are fed into.
typedef SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative
    = Pointer<SherpaOnnxOfflineStream> Function(
        Pointer<SherpaOnnxSpokenLanguageIdentification>);

typedef SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream
    = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative;

// Runs identification on a stream; returns a result struct to be freed
// with the destroy-result function below.
typedef SherpaOnnxSpokenLanguageIdentificationComputeNative
    = Pointer<SherpaOnnxSpokenLanguageIdentificationResult> Function(
        Pointer<SherpaOnnxSpokenLanguageIdentification>,
        Pointer<SherpaOnnxOfflineStream>);

typedef SherpaOnnxSpokenLanguageIdentificationCompute
    = SherpaOnnxSpokenLanguageIdentificationComputeNative;

// Frees a result returned by the compute function.
typedef SherpaOnnxDestroySpokenLanguageIdentificationResultNative = Void
    Function(Pointer<SherpaOnnxSpokenLanguageIdentificationResult>);

typedef SherpaOnnxDestroySpokenLanguageIdentificationResult = void Function(
    Pointer<SherpaOnnxSpokenLanguageIdentificationResult>);
typedef SherpaOnnxCreateOfflineSpeakerDiarizationNative
= Pointer<SherpaOnnxOfflineSpeakerDiarization> Function(
Pointer<SherpaOnnxOfflineSpeakerDiarizationConfig>);
... ... @@ -1344,6 +1404,17 @@ class SherpaOnnxBindings {
static SherpaOnnxOfflineSpeechDenoiserRun? sherpaOnnxOfflineSpeechDenoiserRun;
static SherpaOnnxDestroyDenoisedAudio? sherpaOnnxDestroyDenoisedAudio;
// Function pointers for spoken language identification, resolved from the
// native library. They stay null until the bindings are initialized.
static SherpaOnnxCreateSpokenLanguageIdentification?
    sherpaOnnxCreateSpokenLanguageIdentification;

static SherpaOnnxDestroySpokenLanguageIdentification?
    sherpaOnnxDestroySpokenLanguageIdentification;

static SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream?
    sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream;

static SherpaOnnxSpokenLanguageIdentificationCompute?
    sherpaOnnxSpokenLanguageIdentificationCompute;

static SherpaOnnxDestroySpokenLanguageIdentificationResult?
    sherpaOnnxDestroySpokenLanguageIdentificationResult;
static SherpaOnnxCreateOfflineSpeakerDiarization?
sherpaOnnxCreateOfflineSpeakerDiarization;
static SherpaOnnxDestroyOfflineSpeakerDiarization?
... ... @@ -1574,6 +1645,41 @@ class SherpaOnnxBindings {
'SherpaOnnxDestroyDenoisedAudio')
.asFunction();
// Resolve the spoken-language-identification symbols from the dynamic
// library. `??=` keeps an already-resolved function pointer, so repeated
// initialization is a no-op for each symbol.
sherpaOnnxCreateSpokenLanguageIdentification ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxCreateSpokenLanguageIdentificationNative>>(
        'SherpaOnnxCreateSpokenLanguageIdentification')
    .asFunction();

sherpaOnnxDestroySpokenLanguageIdentification ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxDestroySpokenLanguageIdentificationNative>>(
        'SherpaOnnxDestroySpokenLanguageIdentification')
    .asFunction();

sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxSpokenLanguageIdentificationCreateOfflineStreamNative>>(
        'SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream')
    .asFunction();

sherpaOnnxSpokenLanguageIdentificationCompute ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxSpokenLanguageIdentificationComputeNative>>(
        'SherpaOnnxSpokenLanguageIdentificationCompute')
    .asFunction();

sherpaOnnxDestroySpokenLanguageIdentificationResult ??= dynamicLibrary
    .lookup<
            NativeFunction<
                SherpaOnnxDestroySpokenLanguageIdentificationResultNative>>(
        'SherpaOnnxDestroySpokenLanguageIdentificationResult')
    .asFunction();
sherpaOnnxCreateOfflineSpeakerDiarization ??= dynamicLibrary
.lookup<
NativeFunction<
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:ffi';
import 'package:ffi/ffi.dart';
import './offline_stream.dart';
import './sherpa_onnx_bindings.dart';
import './utils.dart';
/// Whisper model files used for spoken language identification.
class SpokenLanguageIdentificationWhisperConfig {
  /// Path to the whisper encoder model.
  final String encoder;

  /// Path to the whisper decoder model.
  final String decoder;

  /// Tail paddings for the whisper model.
  final int tailPaddings;

  const SpokenLanguageIdentificationWhisperConfig({
    this.encoder = '',
    this.decoder = '',
    this.tailPaddings = 0,
  });

  /// Builds a config from a JSON map; missing keys fall back to defaults.
  factory SpokenLanguageIdentificationWhisperConfig.fromJson(
      Map<String, dynamic> json) {
    final encoderPath = json['encoder'] as String? ?? '';
    final decoderPath = json['decoder'] as String? ?? '';
    final paddings = json['tailPaddings'] as int? ?? 0;
    return SpokenLanguageIdentificationWhisperConfig(
      encoder: encoderPath,
      decoder: decoderPath,
      tailPaddings: paddings,
    );
  }

  /// Serializes this config to a JSON-compatible map.
  Map<String, dynamic> toJson() => <String, dynamic>{
        'encoder': encoder,
        'decoder': decoder,
        'tailPaddings': tailPaddings,
      };

  @override
  String toString() =>
      'SpokenLanguageIdentificationWhisperConfig(encoder: $encoder, decoder: $decoder, tailPaddings: $tailPaddings)';
}
/// Configuration for [SpokenLanguageIdentification].
class SpokenLanguageIdentificationConfig {
  /// The whisper model configuration.
  final SpokenLanguageIdentificationWhisperConfig whisper;

  /// Number of threads to use for computation.
  final int numThreads;

  /// Whether to enable debug output in the native library.
  final bool debug;

  /// Execution provider, e.g. 'cpu'.
  final String provider;

  const SpokenLanguageIdentificationConfig({
    this.whisper = const SpokenLanguageIdentificationWhisperConfig(),
    this.numThreads = 1,
    this.debug = false,
    this.provider = 'cpu',
  });

  /// Builds a config from a JSON map; missing keys fall back to defaults.
  factory SpokenLanguageIdentificationConfig.fromJson(
      Map<String, dynamic> json) {
    final rawWhisper = json['whisper'];
    final whisperConfig = rawWhisper == null
        ? const SpokenLanguageIdentificationWhisperConfig()
        : SpokenLanguageIdentificationWhisperConfig.fromJson(
            rawWhisper as Map<String, dynamic>);
    return SpokenLanguageIdentificationConfig(
      whisper: whisperConfig,
      numThreads: json['numThreads'] as int? ?? 1,
      debug: json['debug'] as bool? ?? false,
      provider: json['provider'] as String? ?? 'cpu',
    );
  }

  /// Serializes this config to a JSON-compatible map.
  Map<String, dynamic> toJson() => <String, dynamic>{
        'whisper': whisper.toJson(),
        'numThreads': numThreads,
        'debug': debug,
        'provider': provider,
      };

  @override
  String toString() =>
      'SpokenLanguageIdentificationConfig(whisper: $whisper, numThreads: $numThreads, debug: $debug, provider: $provider)';
}
/// Result of spoken language identification.
class SpokenLanguageIdentificationResult {
  /// The detected language code; empty when detection produced no result.
  final String lang;

  const SpokenLanguageIdentificationResult({
    required this.lang,
  });

  /// Builds a result from a JSON map; a missing key yields an empty string.
  factory SpokenLanguageIdentificationResult.fromJson(
          Map<String, dynamic> json) =>
      SpokenLanguageIdentificationResult(
        lang: json['lang'] as String? ?? '',
      );

  /// Serializes this result to a JSON-compatible map.
  Map<String, dynamic> toJson() => <String, dynamic>{'lang': lang};

  @override
  String toString() => 'SpokenLanguageIdentificationResult(lang: $lang)';
}
/// High-level wrapper around the native spoken language identification API.
///
/// Create an instance with the [SpokenLanguageIdentification] factory and
/// call [free] when done to release the native handle.
class SpokenLanguageIdentification {
  /// Wraps an existing native pointer; the caller remains responsible for
  /// calling [free].
  SpokenLanguageIdentification.fromPtr(
      {required this.ptr, required this.config});

  SpokenLanguageIdentification._({required this.ptr, required this.config});

  /// Releases the native handle and resets [ptr] to [nullptr].
  void free() {
    SherpaOnnxBindings.sherpaOnnxDestroySpokenLanguageIdentification?.call(ptr);
    ptr = nullptr;
  }

  /// The user is responsible to call the SpokenLanguageIdentification.free()
  /// method of the returned instance to avoid memory leak.
  factory SpokenLanguageIdentification(
      SpokenLanguageIdentificationConfig config) {
    // Convert the Dart config into a calloc-ed native struct; it must be
    // freed on every exit path below.
    final c = convertConfig(config);
    if (SherpaOnnxBindings.sherpaOnnxCreateSpokenLanguageIdentification ==
        null) {
      // Bindings not initialized (initBindings was never called).
      freeConfig(c);
      throw Exception("Please initialize sherpa-onnx first");
    }
    final ptr = SherpaOnnxBindings.sherpaOnnxCreateSpokenLanguageIdentification
            ?.call(c) ??
        nullptr;
    if (ptr == nullptr) {
      // Native creation failed (e.g. bad model paths in the config).
      freeConfig(c);
      throw Exception(
          "Failed to create spoken language identification. Please check your config");
    }
    // The native struct is no longer needed once the handle is created.
    freeConfig(c);
    return SpokenLanguageIdentification._(ptr: ptr, config: config);
  }

  /// Allocates and fills the native config struct from [config].
  ///
  /// The strings inside the returned struct are allocated with
  /// toNativeUtf8() and must be released via [freeConfig].
  static Pointer<SherpaOnnxSpokenLanguageIdentificationConfig> convertConfig(
      SpokenLanguageIdentificationConfig config) {
    final c = calloc<SherpaOnnxSpokenLanguageIdentificationConfig>();
    c.ref.whisper.encoder = config.whisper.encoder.toNativeUtf8();
    c.ref.whisper.decoder = config.whisper.decoder.toNativeUtf8();
    c.ref.whisper.tailPaddings = config.whisper.tailPaddings;
    c.ref.numThreads = config.numThreads;
    // The native side expects a C-style int flag, not a Dart bool.
    c.ref.debug = config.debug ? 1 : 0;
    c.ref.provider = config.provider.toNativeUtf8();
    return c;
  }

  /// Frees the strings inside the native config struct, then the struct.
  static void freeConfig(
      Pointer<SherpaOnnxSpokenLanguageIdentificationConfig> c) {
    malloc.free(c.ref.whisper.encoder);
    malloc.free(c.ref.whisper.decoder);
    malloc.free(c.ref.provider);
    malloc.free(c);
  }

  /// The user has to invoke stream.free() on the returned instance
  /// to avoid memory leak
  OfflineStream createStream() {
    final p = SherpaOnnxBindings
            .sherpaOnnxSpokenLanguageIdentificationCreateOfflineStream
            ?.call(ptr) ??
        nullptr;
    return OfflineStream(ptr: p);
  }

  /// Runs language identification on [stream].
  ///
  /// Returns a result with an empty `lang` when the native call fails or
  /// the bindings are not initialized. The native result struct is freed
  /// before returning.
  SpokenLanguageIdentificationResult compute(OfflineStream stream) {
    final result = SherpaOnnxBindings
            .sherpaOnnxSpokenLanguageIdentificationCompute
            ?.call(ptr, stream.ptr) ??
        nullptr;
    if (result == nullptr) {
      return const SpokenLanguageIdentificationResult(lang: '');
    }
    // Copy the C string into a Dart string before freeing the native result.
    final lang = toDartString(result.ref.lang);
    SherpaOnnxBindings.sherpaOnnxDestroySpokenLanguageIdentificationResult
        ?.call(result);
    return SpokenLanguageIdentificationResult(lang: lang);
  }

  // Native handle; nullptr after free() has been called.
  Pointer<SherpaOnnxSpokenLanguageIdentification> ptr;

  // The config used to create this instance (kept for reference).
  SpokenLanguageIdentificationConfig config;
}
... ...