Fangjun Kuang
Committed by GitHub

ADD VAD+ASR example for dart with CircularBuffer. (#1293)

@@ -40,6 +40,7 @@ echo "----paraformer----"
 rm -rf sherpa-onnx-*

 echo "----SenseVoice zh----"
+./run-sense-voice-zh-2.sh
 ./run-sense-voice-zh.sh
 rm -rf sherpa-onnx-*

// Copyright (c) 2024 Xiaomi Corporation
//
// Different from ./sense-voice.dart, this file uses a CircularBuffer
import 'dart:io';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Transcribes every speech segment [vad] currently holds using [recognizer],
/// printing one `start -- end : text` line per segment and popping each
/// segment after it is decoded.
///
/// [sampleRate] is the sample rate of the audio that was fed to the VAD; it
/// is used to convert sample offsets into seconds.
void _decodeSpeechSegments(
    sherpa_onnx.VoiceActivityDetector vad,
    sherpa_onnx.OfflineRecognizer recognizer,
    int sampleRate) {
  while (!vad.isEmpty()) {
    final samples = vad.front().samples;
    final startTime = vad.front().start.toDouble() / sampleRate;
    final endTime = startTime + samples.length.toDouble() / sampleRate;

    final stream = recognizer.createStream();
    stream.acceptWaveform(samples: samples, sampleRate: sampleRate);
    recognizer.decode(stream);

    final result = recognizer.getResult(stream);
    // Streams wrap native resources and must be freed explicitly.
    stream.free();
    print(
        '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');

    vad.pop();
  }
}

/// VAD + non-streaming ASR example.
///
/// Reads a 16 kHz wave file, feeds it through a [sherpa_onnx.CircularBuffer]
/// in VAD-window-sized chunks, and transcribes each detected speech segment
/// with a SenseVoice offline recognizer.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('model', help: 'Path to the SenseVoice model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('language',
        help: 'auto, zh, en, ja, ko, yue, or leave it empty to use auto',
        defaultsTo: '')
    ..addOption('use-itn',
        help: 'true to use inverse text normalization', defaultsTo: 'false')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // create VAD
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // create SenseVoice
  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;
  final language = res['language'] as String;
  final useItn = (res['use-itn'] as String).toLowerCase() == 'true';

  final senseVoice = sherpa_onnx.OfflineSenseVoiceModelConfig(
      model: model, language: language, useInverseTextNormalization: useItn);

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    senseVoice: senseVoice,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // 30 seconds of audio at 16 kHz; the whole file is pushed at once and
  // drained below in VAD-window-sized chunks.
  final buffer = sherpa_onnx.CircularBuffer(capacity: 30 * 16000);
  buffer.push(waveData.samples);

  while (buffer.size > vadConfig.sileroVad.windowSize) {
    final samples =
        buffer.get(startIndex: buffer.head, n: vadConfig.sileroVad.windowSize);
    buffer.pop(vadConfig.sileroVad.windowSize);

    vad.acceptWaveform(samples);

    if (vad.isDetected()) {
      _decodeSpeechSegments(vad, recognizer, waveData.sampleRate);
    }
  }

  // Flush the VAD so a trailing segment that never hit a silence boundary
  // is still emitted, then decode whatever remains.
  vad.flush();

  _decodeSpeechSegments(vad, recognizer, waveData.sampleRate);

  // Free native resources; they are not garbage collected.
  buffer.free();
  vad.free();

  recognizer.free();
}
#!/usr/bin/env bash
# Fetches the SenseVoice model, the silero VAD model, and a test wave file
# (skipping anything already present), then runs the CircularBuffer-based
# VAD + ASR Dart example.

set -ex

dart pub get

# Download a single asset from the sherpa-onnx "asr-models" release.
fetch() {
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$1"
}

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  fetch sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

if [ ! -f ./lei-jun-test.wav ]; then
  fetch lei-jun-test.wav
fi

if [ ! -f ./silero_vad.onnx ]; then
  fetch silero_vad.onnx
fi

dart run \
  ./bin/sense-voice-2.dart \
  --silero-vad ./silero_vad.onnx \
  --model ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \
  --tokens ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \
  --use-itn true \
  --input-wav ./lei-jun-test.wav