wanghsinche
Committed by GitHub

feat: add mic example for better compatibility (#1909)

Co-authored-by: wanghsinche <wanghsinche>
README.md
@@ -1,6 +1,8 @@
 # Introduction
 
 Note: You need `Node >= 18`.
+
+Note: For Mac M1 and other Apple silicon chips, please check the example `test-online-paraformer-microphone-mic.js`.
 
 This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
 
@@ -278,6 +280,26 @@ rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
 node ./test-online-paraformer-microphone.js
 ```
 
+
+## ./test-online-paraformer-microphone-mic.js
+
+[./test-online-paraformer-microphone-mic.js](./test-online-paraformer-microphone-mic.js)
+demonstrates how to do real-time speech recognition from a microphone
+with a streaming Paraformer model. In the code we use
+[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
+
+It uses [`mic`](https://www.npmjs.com/package/mic) for better compatibility; please check its npm page before running it.
+
+You can use the following commands to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+node ./test-online-paraformer-microphone-mic.js
+```
+
+
 ## ./test-online-paraformer.js
 [./test-online-paraformer.js](./test-online-paraformer.js) demonstrates
 how to decode a file using a streaming Paraformer model. In the code we use
package.json
@@ -1,5 +1,6 @@
 {
   "dependencies": {
+    "mic": "^2.1.2",
     "naudiodon2": "^2.4.0",
     "sherpa-onnx": "^1.10.45",
     "wav": "^1.0.2"
test-online-paraformer-microphone-mic.js (new file)
+// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang)
+const mic = require('mic');  // Uses `mic` for better compatibility; see https://www.npmjs.com/package/mic
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOnlineRecognizer() {
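+  // Int8-quantized streaming Paraformer; the model files come from the
+  // sherpa-onnx-streaming-paraformer-bilingual-zh-en archive downloaded in the README.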
+  let onlineParaformerModelConfig = {
+    encoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx',
+    decoder: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx',
+  };
+
+  let onlineModelConfig = {
+    paraformer: onlineParaformerModelConfig,
+    tokens: './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt',
+  };
+
+  let recognizerConfig = {
+    modelConfig: onlineModelConfig,
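+    // Endpointing: an utterance ends after 2.4 s of trailing silence when
+    // nothing has been decoded yet (rule1), after 1.2 s once something has
+    // been decoded (rule2), or when it reaches 20 s (rule3).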
+    enableEndpoint: 1,
+    rule1MinTrailingSilence: 2.4,
+    rule2MinTrailingSilence: 1.2,
+    rule3MinUtteranceLength: 20,
+  };
+
+  return sherpa_onnx.createOnlineRecognizer(recognizerConfig);
+}
+
+/**
+ * SpeechSession class: works as a session manager together with the
+ * formatOutput function.
+ * Sample output:
+=== Automated Speech Recognition ===
+Current Session #1
+Time: 8:44:46 PM
+------------------------
+Recognized Sentences:
+[8:44:43 PM] 1. it's so great three result is great great 她还支持中文
+[8:44:46 PM] 2. 很厉
+------------------------
+Recognizing: 真的很厉害太厉害
+
+*/
+class SpeechSession {
+  constructor() {
+    this.startTime = Date.now();
+    this.sentences = [];
+    this.currentText = '';
+    this.lastUpdateTime = Date.now();
+  }
+
+  addOrUpdateText(text) {
+    this.currentText = text;
+    this.lastUpdateTime = Date.now();
+  }
+
+  finalizeSentence() {
+    if (this.currentText.trim()) {
+      this.sentences.push({
+        text: this.currentText.trim(),
+        timestamp: new Date().toLocaleTimeString()
+      });
+    }
+    this.currentText = '';
+  }
+
+  shouldStartNewSession() {
+    return Date.now() - this.lastUpdateTime > 10000;  // 10 seconds of silence
+  }
+}
+
+function formatOutput() {
+  clearConsole();
+  console.log('\n=== Automated Speech Recognition ===');
+  console.log(`Current Session #${sessionCount}`);
+  console.log('Time:', new Date().toLocaleTimeString());
+  console.log('------------------------');
+
+  // Show the sentences finalized so far in this session
+  if (currentSession.sentences.length > 0) {
+    console.log('Recognized Sentences:');
+    currentSession.sentences.forEach((sentence, index) => {
+      console.log(`[${sentence.timestamp}] ${index + 1}. ${sentence.text}`);
+    });
+    console.log('------------------------');
+  }
+
+  // Show the text that is currently being recognized
+  if (currentSession.currentText) {
+    console.log('Recognizing:', currentSession.currentText);
+  }
+}
+
+
+const recognizer = createOnlineRecognizer();
+const stream = recognizer.createStream();
+let currentSession = new SpeechSession();
+let sessionCount = 1;
+
+function clearConsole() {
+  // ANSI escapes: ESC[2J clears the screen, ESC[0f moves the cursor home.
+  process.stdout.write('\x1B[2J\x1B[0f');
+}
+
+
+function exitHandler(options, exitCode) {
+  if (options.cleanup) {
+    console.log('\nCleaned up resources...');
+    micInstance.stop();
+    stream.free();
+    recognizer.free();
+  }
+  if (exitCode || exitCode === 0) console.log('Exit code:', exitCode);
+  if (options.exit) process.exit();
+}
+
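+// Ask `mic` for raw 16-bit signed mono PCM at the sample rate the recognizer expects.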
+const micInstance = mic({
+  rate: recognizer.config.featConfig.sampleRate,
+  channels: 1,
+  debug: false,  // disable debug output
+  device: 'default',
+  bitwidth: 16,
+  encoding: 'signed-integer',
+  exitOnSilence: 6,
+  fileType: 'raw'
+});
+
+const micInputStream = micInstance.getAudioStream();
+
+function startMic() {
+  return new Promise((resolve, reject) => {
+    micInputStream.once('startComplete', () => {
+      console.log('Microphone started.');
+      resolve();
+    });
+
+    micInputStream.once('error', (err) => {
+      console.error('Microphone start error:', err);
+      reject(err);
+    });
+
+    micInstance.start();
+  });
+}
+
+micInputStream.on('data', buffer => {
+  // A Node Buffer can be a view into a larger shared ArrayBuffer, so respect
+  // its byteOffset and length instead of reinterpreting the whole backing buffer.
+  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.length / 2);
+
+  // Convert 16-bit signed PCM to the float32 range [-1, 1) expected by sherpa-onnx.
+  const samples = new Float32Array(int16Array.length);
+  for (let i = 0; i < int16Array.length; i++) {
+    samples[i] = int16Array[i] / 32768.0;
+  }
+
+  stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples);
+
+  // Decode as long as enough features are buffered.
+  while (recognizer.isReady(stream)) {
+    recognizer.decode(stream);
+  }
+
+  const isEndpoint = recognizer.isEndpoint(stream);
+  const text = recognizer.getResult(stream).text;
+
+  if (text.length > 0) {
+    // Check whether a new session should start after a long silence
+    if (currentSession.shouldStartNewSession()) {
+      currentSession.finalizeSentence();
+      sessionCount++;
+      currentSession = new SpeechSession();
+    }
+
+    currentSession.addOrUpdateText(text);
+    formatOutput();
+  }
+
+  if (isEndpoint) {
+    if (text.length > 0) {
+      currentSession.finalizeSentence();
+      formatOutput();
+    }
+    recognizer.reset(stream);
+  }
+});
+
+micInputStream.on('error', err => {
+  console.error('Audio stream error:', err);
+});
+
+micInputStream.on('close', () => {
+  console.log('Microphone closed.');
+});
+
+process.on('exit', exitHandler.bind(null, {cleanup: true}));
+process.on('SIGINT', exitHandler.bind(null, {exit: true}));
+process.on('SIGUSR1', exitHandler.bind(null, {exit: true}));
+process.on('SIGUSR2', exitHandler.bind(null, {exit: true}));
+process.on('uncaughtException', exitHandler.bind(null, {exit: true}));
+
+async function main() {
+  try {
+    console.log('Starting ...');
+    await startMic();
+    console.log('Initialized, waiting for speech ...');
+    formatOutput();
+  } catch (err) {
+    console.error('Failed to initialize:', err);
+    process.exit(1);
+  }
+}
+
+main();