继续操作前请注册或者登录。
Fangjun Kuang
Committed by GitHub

Add Dart API for ten-vad (#2386)

... ... @@ -4,6 +4,12 @@ set -ex
cd dart-api-examples
pushd vad
./run-ten-vad.sh
./run.sh
rm *.onnx
popd
pushd non-streaming-asr
echo '----------Zipformer CTC----------'
... ... @@ -186,9 +192,3 @@ echo '----------streaming paraformer----------'
rm -rf sherpa-onnx-*
popd # streaming-asr
pushd vad
./run.sh
rm *.onnx
popd
... ...
// Copyright (c) 2024 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';
import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';
/// Removes the non-speech parts of a wave file with the ten-vad model.
///
/// Requires the `--ten-vad`, `--input-wav` and `--output-wav` options; when
/// any of them is missing the usage text is printed and the process exits
/// with status 1. The input wave must be sampled at 16000 Hz, otherwise the
/// process also exits with status 1. Every speech segment reported by the
/// detector is appended, in order, to the output wave file.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('ten-vad', help: 'Path to ten-vad.onnx')
    ..addOption('input-wav', help: 'Path to input.wav')
    ..addOption('output-wav', help: 'Path to output.wav');

  final res = parser.parse(arguments);
  if (res['ten-vad'] == null ||
      res['input-wav'] == null ||
      res['output-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  final tenVad = res['ten-vad'] as String;
  final inputWav = res['input-wav'] as String;
  final outputWav = res['output-wav'] as String;

  final tenVadConfig = sherpa_onnx.TenVadModelConfig(
    model: tenVad,
    threshold: 0.25,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
    windowSize: 256,
  );

  final config = sherpa_onnx.VadModelConfig(
    tenVad: tenVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: config, bufferSizeInSeconds: 10);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  final windowSize = config.tenVad.windowSize;

  // Integer division: samples at the end of the file that do not fill a
  // whole window are not passed to the detector.
  final numIter = waveData.samples.length ~/ windowSize;

  final allSamples = <List<double>>[];

  // Moves every segment currently queued in the detector into allSamples.
  void drainSegments() {
    while (!vad.isEmpty()) {
      allSamples.add(vad.front().samples);
      vad.pop();
    }
  }

  try {
    for (var i = 0; i != numIter; ++i) {
      final start = i * windowSize;
      vad.acceptWaveform(Float32List.sublistView(
          waveData.samples, start, start + windowSize));

      if (vad.isDetected()) {
        drainSegments();
      }
    }

    // Flush, then collect whatever segments are queued afterwards.
    vad.flush();
    drainSegments();
  } finally {
    // Release the native detector even when processing throws.
    vad.free();
  }

  final s = Float32List.fromList(allSamples.expand((x) => x).toList());
  sherpa_onnx.writeWave(
      filename: outputWav, samples: s, sampleRate: waveData.sampleRate);

  print('Saved to $outputWav');
}
... ...
#!/usr/bin/env bash
#
# Runs the ten-vad Dart example: fetches the model and a test wave file when
# they are not already in the current directory, then writes a copy of the
# wave file with the non-speech parts removed.

set -ex

dart pub get

# Downloads the release asset named $1 into the current directory unless a
# file with that name already exists.
#
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page under the asset's name; without it the existence check
# above would accept that bogus file on every later run.
download_if_missing() {
  local name="$1"
  if [[ ! -f "./${name}" ]]; then
    curl --fail -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${name}"
  fi
}

download_if_missing ten-vad.onnx
download_if_missing lei-jun-test.wav

dart run \
  ./bin/ten-vad.dart \
  --ten-vad ./ten-vad.onnx \
  --input-wav ./lei-jun-test.wav \
  --output-wav ./lei-jun-test-no-silence.wav

ls -lh *.wav
... ...
... ... @@ -487,6 +487,25 @@ final class SherpaOnnxSileroVadModelConfig extends Struct {
external double maxSpeechDuration;
}
/// FFI struct holding the ten-vad model settings handed to the native side.
///
/// NOTE(review): the field order and the @Float/@Int32 annotations define the
/// native memory layout; presumably they must match the C-side struct
/// declaration field for field — confirm against the C header before
/// reordering, adding or retyping anything here.
final class SherpaOnnxTenVadModelConfig extends Struct {
// Pointer to a native UTF-8 string (presumably the model file path — confirm).
external Pointer<Utf8> model;
// Stored as a native float.
@Float()
external double threshold;
// Stored as a native float.
@Float()
external double minSilenceDuration;
// Stored as a native float.
@Float()
external double minSpeechDuration;
// Stored as a native 32-bit signed integer.
@Int32()
external int windowSize;
// Stored as a native float.
@Float()
external double maxSpeechDuration;
}
final class SherpaOnnxVadModelConfig extends Struct {
external SherpaOnnxSileroVadModelConfig sileroVad;
... ... @@ -500,6 +519,8 @@ final class SherpaOnnxVadModelConfig extends Struct {
@Int32()
external int debug;
external SherpaOnnxTenVadModelConfig tenVad;
}
final class SherpaOnnxSpeechSegment extends Struct {
... ...
... ... @@ -49,6 +49,50 @@ class SileroVadModelConfig {
final double maxSpeechDuration;
}
/// Immutable settings for the ten-vad voice activity detection model.
///
/// Instances can be converted to and from JSON-style maps with [toJson] and
/// [TenVadModelConfig.fromJson].
class TenVadModelConfig {
  /// Creates a configuration; every argument is optional and has a default.
  const TenVadModelConfig(
      {this.model = '',
      this.threshold = 0.5,
      this.minSilenceDuration = 0.5,
      this.minSpeechDuration = 0.25,
      this.windowSize = 256,
      this.maxSpeechDuration = 5.0});

  /// Builds a configuration from [json].
  ///
  /// A missing or `null` entry falls back to the same default the
  /// constructor uses. Every numeric entry is accepted as any [num]:
  /// floating-point fields take integers (`1` -> `1.0`) and `windowSize`
  /// takes an integral double such as `256.0`, which a JSON decoder may
  /// produce and which a plain `as int?` cast would reject.
  factory TenVadModelConfig.fromJson(Map<String, dynamic> json) {
    double? readDouble(String key) => (json[key] as num?)?.toDouble();

    return TenVadModelConfig(
      model: json['model'] as String? ?? '',
      threshold: readDouble('threshold') ?? 0.5,
      minSilenceDuration: readDouble('minSilenceDuration') ?? 0.5,
      minSpeechDuration: readDouble('minSpeechDuration') ?? 0.25,
      windowSize: (json['windowSize'] as num?)?.toInt() ?? 256,
      maxSpeechDuration: readDouble('maxSpeechDuration') ?? 5.0,
    );
  }

  @override
  String toString() {
    return 'TenVadModelConfig(model: $model, threshold: $threshold, '
        'minSilenceDuration: $minSilenceDuration, '
        'minSpeechDuration: $minSpeechDuration, '
        'windowSize: $windowSize, '
        'maxSpeechDuration: $maxSpeechDuration)';
  }

  /// Returns the fields as a map keyed by field name.
  Map<String, dynamic> toJson() => {
        'model': model,
        'threshold': threshold,
        'minSilenceDuration': minSilenceDuration,
        'minSpeechDuration': minSpeechDuration,
        'windowSize': windowSize,
        'maxSpeechDuration': maxSpeechDuration,
      };

  /// Model identifier string; defaults to the empty string.
  /// NOTE(review): presumably the path to the ten-vad ONNX file — confirm.
  final String model;

  /// Detection threshold; defaults to 0.5.
  final double threshold;

  /// Minimum silence duration; defaults to 0.5.
  /// NOTE(review): presumably in seconds — confirm.
  final double minSilenceDuration;

  /// Minimum speech duration; defaults to 0.25.
  /// NOTE(review): presumably in seconds — confirm.
  final double minSpeechDuration;

  /// Window size; defaults to 256.
  /// NOTE(review): presumably a sample count — confirm.
  final int windowSize;

  /// Maximum speech duration; defaults to 5.0.
  /// NOTE(review): presumably in seconds — confirm.
  final double maxSpeechDuration;
}
class VadModelConfig {
VadModelConfig({
this.sileroVad = const SileroVadModelConfig(),
... ... @@ -56,9 +100,11 @@ class VadModelConfig {
this.numThreads = 1,
this.provider = 'cpu',
this.debug = true,
this.tenVad = const TenVadModelConfig(),
});
final SileroVadModelConfig sileroVad;
final TenVadModelConfig tenVad;
final int sampleRate;
final int numThreads;
final String provider;
... ... @@ -68,6 +114,8 @@ class VadModelConfig {
return VadModelConfig(
sileroVad: SileroVadModelConfig.fromJson(
json['sileroVad'] as Map<String, dynamic>? ?? const {}),
tenVad: TenVadModelConfig.fromJson(
json['tenVad'] as Map<String, dynamic>? ?? const {}),
sampleRate: json['sampleRate'] as int? ?? 16000,
numThreads: json['numThreads'] as int? ?? 1,
provider: json['provider'] as String? ?? 'cpu',
... ... @@ -77,6 +125,7 @@ class VadModelConfig {
Map<String, dynamic> toJson() => {
'sileroVad': sileroVad.toJson(),
'tenVad': tenVad.toJson(),
'sampleRate': sampleRate,
'numThreads': numThreads,
'provider': provider,
... ... @@ -85,7 +134,7 @@ class VadModelConfig {
@override
String toString() {
return 'VadModelConfig(sileroVad: $sileroVad, sampleRate: $sampleRate, numThreads: $numThreads, provider: $provider, debug: $debug)';
return 'VadModelConfig(sileroVad: $sileroVad, tenVad: $tenVad, sampleRate: $sampleRate, numThreads: $numThreads, provider: $provider, debug: $debug)';
}
}
... ... @@ -168,8 +217,8 @@ class VoiceActivityDetector {
{required VadModelConfig config, required double bufferSizeInSeconds}) {
final c = calloc<SherpaOnnxVadModelConfig>();
final modelPtr = config.sileroVad.model.toNativeUtf8();
c.ref.sileroVad.model = modelPtr;
final sileroVadModelPtr = config.sileroVad.model.toNativeUtf8();
c.ref.sileroVad.model = sileroVadModelPtr;
c.ref.sileroVad.threshold = config.sileroVad.threshold;
c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
... ... @@ -177,6 +226,15 @@ class VoiceActivityDetector {
c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration;
final tenVadModelPtr = config.tenVad.model.toNativeUtf8();
c.ref.tenVad.model = tenVadModelPtr;
c.ref.tenVad.threshold = config.tenVad.threshold;
c.ref.tenVad.minSilenceDuration = config.tenVad.minSilenceDuration;
c.ref.tenVad.minSpeechDuration = config.tenVad.minSpeechDuration;
c.ref.tenVad.windowSize = config.tenVad.windowSize;
c.ref.tenVad.maxSpeechDuration = config.tenVad.maxSpeechDuration;
c.ref.sampleRate = config.sampleRate;
c.ref.numThreads = config.numThreads;
... ... @@ -190,7 +248,8 @@ class VoiceActivityDetector {
nullptr;
calloc.free(providerPtr);
calloc.free(modelPtr);
calloc.free(tenVadModelPtr);
calloc.free(sileroVadModelPtr);
calloc.free(c);
return VoiceActivityDetector._(ptr: ptr, config: config);
... ...