// Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
//
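// Real-time speech recognition from a microphone with a streaming (online)
// Paraformer model. The model files are expected to be unpacked into
// ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/ (see the paths below).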
const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());

const sherpa_onnx = require('sherpa-onnx-node');

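// Creates an online (streaming) recognizer for the int8-quantized bilingual
// Chinese/English Paraformer model.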
function createOnlineRecognizer() {
  const config = {
    'featConfig': {
      'sampleRate': 16000,
      'featureDim': 80,
    },
    'modelConfig': {
      'paraformer': {
        'encoder':
            './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
        'decoder':
            './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
      },
      'tokens': './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
      'numThreads': 2,
      'provider': 'cpu',
      'debug': 1,
    },
    'decodingMethod': 'greedy_search',
    'maxActivePaths': 4,
    'enableEndpoint': true,
    'rule1MinTrailingSilence': 2.4,
    'rule2MinTrailingSilence': 1.2,
    'rule3MinUtteranceLength': 20
  };

  return new sherpa_onnx.OnlineRecognizer(config);
}

const recognizer = createOnlineRecognizer();
const stream = recognizer.createStream();

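// The last partial result that was printed and the index of the current
// utterance segment.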
let lastText = '';
let segmentIndex = 0;

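// Record from the default input device: mono, 32-bit float samples, at the
// sample rate the model expects.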
const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected;
                         // if set to false, the error is only logged.
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: recognizer.config.featConfig.sampleRate
  }
});

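// Pretty-printer for recognition results; the argument caps the number of
// words printed per line.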
const display = new sherpa_onnx.Display(50);

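// For every captured chunk: feed it to the stream, decode whatever is ready,
// and reprint the (possibly partial) result whenever it changes.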
ai.on('data', data => {
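  // naudiodon delivers a Buffer of raw bytes; view them as 32-bit floats.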
  const samples = new Float32Array(data.buffer);

  stream.acceptWaveform(
      {sampleRate: recognizer.config.featConfig.sampleRate, samples: samples});

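  // Decode as long as the stream has buffered enough features for another
  // step.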
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  const isEndpoint = recognizer.isEndpoint(stream);
  let text = recognizer.getResult(stream).text.toLowerCase();

  if (isEndpoint) {
    // For online paraformer models we have to manually pad the stream at
    // an endpoint so that the last word can be recognized.
    const tailPadding =
        new Float32Array(recognizer.config.featConfig.sampleRate * 0.4);
    stream.acceptWaveform({
      samples: tailPadding,
      sampleRate: recognizer.config.featConfig.sampleRate
    });
    while (recognizer.isReady(stream)) {
      recognizer.decode(stream);
    }
    text = recognizer.getResult(stream).text.toLowerCase();
  }

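  // Only reprint when the partial result has actually changed.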
  if (text.length > 0 && lastText !== text) {
    lastText = text;
    display.print(segmentIndex, lastText);
  }
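  // At an endpoint, finalize the current segment and reset the stream for
  // the next utterance.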
  if (isEndpoint) {
    if (text.length > 0) {
      lastText = text;
      segmentIndex += 1;
    }
    recognizer.reset(stream);
  }
});

ai.start();
console.log('Started! Please speak');