// test-vad-microphone-offline-transducer.js
// Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
//
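// Real-time speech recognition from a microphone: silero-vad splits the
// audio into speech segments and a non-streaming zipformer transducer
// decodes each detected segment.
//
// Assumed setup (the model paths below are this example's defaults):
//   npm install sherpa-onnx naudiodon2
// with ./silero_vad.onnx and ./sherpa-onnx-zipformer-en-2023-06-26/
// present in the current directory.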
const sherpa_onnx = require('sherpa-onnx');
const portAudio = require('naudiodon2');
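// Print the available audio devices so you can pick a deviceId below.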
console.log(portAudio.getDevices());

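// Create a non-streaming (offline) recognizer. Each VAD-detected speech
// segment is decoded as one complete utterance using greedy search.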
function createOfflineRecognizer() {
  const featConfig = new sherpa_onnx.FeatureConfig();
  featConfig.sampleRate = 16000;
  featConfig.featureDim = 80;

  // Non-streaming (offline) transducer model configuration.
  const transducer = new sherpa_onnx.OfflineTransducerModelConfig();
  transducer.encoder =
      './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx';
  transducer.decoder =
      './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx';
  transducer.joiner =
      './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx';
  const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt';

  const modelConfig = new sherpa_onnx.OfflineModelConfig();
  modelConfig.transducer = transducer;
  modelConfig.tokens = tokens;
  modelConfig.modelType = 'transducer';

  const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig();
  recognizerConfig.featConfig = featConfig;
  recognizerConfig.modelConfig = modelConfig;
  recognizerConfig.decodingMethod = 'greedy_search';

  const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig);
  return recognizer;
}

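// Create a silero-vad based voice activity detector. At a 16 kHz sample
// rate, a window of 512 samples corresponds to 32 ms of audio per call.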
function createVad() {
  const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig();
  sileroVadModelConfig.model = './silero_vad.onnx';
  sileroVadModelConfig.minSpeechDuration = 0.3;   // seconds
  sileroVadModelConfig.minSilenceDuration = 0.3;  // seconds
  sileroVadModelConfig.windowSize = 512;

  const vadModelConfig = new sherpa_onnx.VadModelConfig();
  vadModelConfig.sileroVad = sileroVadModelConfig;
  vadModelConfig.sampleRate = 16000;

  const bufferSizeInSeconds = 60;
  const vad = new sherpa_onnx.VoiceActivityDetector(
      vadModelConfig, bufferSizeInSeconds);
  return vad;
}

const recognizer = createOfflineRecognizer();
const vad = createVad();

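// Microphone callbacks deliver chunks of arbitrary size, so stage the
// samples in a circular buffer and feed the VAD exactly windowSize
// samples at a time.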
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

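// Record from the default capture device: mono, 32-bit float samples at
// the VAD's sample rate, so no resampling is needed.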
const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected;
                         // set to false to only log the error instead.
    deviceId: -1,  // Use -1 or omit deviceId to select the default device.
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

let index = 0;
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  buffer.push(new Float32Array(data.buffer));
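  // Feed the VAD in fixed-size windows; any remainder stays in the buffer
  // until the next 'data' event.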
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

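  // Drain every speech segment the VAD has finalized and decode each one
  // as a complete utterance.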
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    const stream = recognizer.createStream();
    stream.acceptWaveform(
        recognizer.config.featConfig.sampleRate, segment.samples);
    recognizer.decode(stream);
    const r = recognizer.getResult(stream);
    stream.free();
    if (r.text.length > 0) {
      console.log(`${index}: ${r.text}`);
      index += 1;
    }
  }
});

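// Release the native resources held by the recognizer, VAD, and buffer
// when the audio stream closes.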
ai.on('close', () => {
  console.log('Free resources');
  recognizer.free();
  vad.free();
  buffer.free();
});

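// Start capturing; recognized text is printed as each speech segment ends.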
ai.start();
console.log('Started! Please speak');