Add JavaScript API for SenseVoice (#1157)
Committed by GitHub
Showing 6 changed files with 210 additions and 1 deletion
@@ -20,6 +20,13 @@ if [[ $arch != "ia32" && $platform != "win32" ]]; then
   node ./test_asr_non_streaming_nemo_ctc.js
   rm -rf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k

+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+  node ./test_asr_non_streaming_sense_voice.js
+  rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+
   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
   tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
   rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
@@ -95,6 +95,7 @@ The following tables list the examples in this folder.
 |[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)|Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
 |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+|[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|

 ## Non-Streaming speech-to-text from a microphone with VAD

@@ -104,6 +105,7 @@ The following tables list the examples in this folder.
 |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
 |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
 |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
+|[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|

 ## Text-to-speech

@@ -252,6 +254,20 @@ npm install naudiodon2
 node ./test_vad_asr_non_streaming_paraformer_microphone.js
 ```

+### Non-streaming speech recognition with SenseVoice
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
+
+node ./test_asr_non_streaming_sense_voice.js
+
+# To run VAD + non-streaming ASR with SenseVoice using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_sense_voice_microphone.js
+```
+
 ### Text-to-speech with piper VITS models (TTS)

 ```bash
New file: test_asr_non_streaming_sense_voice.js

// Copyright (c) 2024 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');

// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const config = {
  'featConfig': {
    'sampleRate': 16000,
    'featureDim': 80,
  },
  'modelConfig': {
    'senseVoice': {
      'model':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
      'useInverseTextNormalization': 1,
    },
    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
  }
};

const waveFilename =
    './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';

const recognizer = new sherpa_onnx.OfflineRecognizer(config);
console.log('Started');
let start = Date.now();
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

recognizer.decode(stream);
const result = recognizer.getResult(stream);
let stop = Date.now();
console.log('Done');

const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));
console.log(waveFilename);
console.log('result\n', result);
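The `senseVoice` config above also accepts an optional `language` key; the N-API binding at the end of this diff copies it into the C struct's `language` field. A minimal sketch, assuming the value `'zh'` (accepted codes are not shown in this diff; the model name suggests zh/en/ja/ko/yue):

```js
// Variant of the config above with an explicit language hint.
// NOTE: 'zh' is an assumed value based on the languages in the model name;
// the binding only copies the string through to the C API.
const configWithLanguage = {
  'featConfig': {'sampleRate': 16000, 'featureDim': 80},
  'modelConfig': {
    'senseVoice': {
      'model':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
      'language': 'zh',  // assumption: omit it to leave detection to the model
      'useInverseTextNormalization': 1,
    },
    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
  }
};
```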
New file: test_vad_asr_non_streaming_sense_voice_microphone.js

// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
//
const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());

const sherpa_onnx = require('sherpa-onnx-node');

function createRecognizer() {
  // Please download test files from
  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  const config = {
    'featConfig': {
      'sampleRate': 16000,
      'featureDim': 80,
    },
    'modelConfig': {
      'senseVoice': {
        'model':
            './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
        'useInverseTextNormalization': 1,
      },
      'tokens':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
      'numThreads': 2,
      'provider': 'cpu',
      'debug': 1,
    }
  };

  return new sherpa_onnx.OfflineRecognizer(config);
}

function createVad() {
  // Please download silero_vad.onnx from
  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  const config = {
    sileroVad: {
      model: './silero_vad.onnx',
      threshold: 0.5,
      minSpeechDuration: 0.25,
      minSilenceDuration: 0.5,
      windowSize: 512,
    },
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  const bufferSizeInSeconds = 60;

  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
}

const recognizer = createRecognizer();
const vad = createVad();

const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected;
                         // if set to false, just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

let index = 0;
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  buffer.push(new Float32Array(data.buffer));
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    const stream = recognizer.createStream();
    stream.acceptWaveform({
      samples: segment.samples,
      sampleRate: recognizer.config.featConfig.sampleRate
    });
    recognizer.decode(stream);
    const r = recognizer.getResult(stream);
    if (r.text.length > 0) {
      const text = r.text.toLowerCase().trim();
      console.log(`${index}: ${text}`);

      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
              .split(' ')[0]}.wav`;
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});

      index += 1;
    }
  }
});

ai.start();
console.log('Started! Please speak');
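The example records from the default input device (`deviceId: -1`). To target a specific microphone, the `portAudio.getDevices()` call commented out near the top of the file can list the devices first. A minimal sketch; the field names on the returned objects follow naudiodon's PortAudio device info and are assumptions here:

```js
const portAudio = require('naudiodon2');

// List capture-capable devices; pass the chosen id as `deviceId`
// in the AudioIO inOptions above instead of -1.
for (const d of portAudio.getDevices()) {
  if (d.maxInputChannels > 0) {
    console.log(d.id, d.name, d.defaultSampleRate);
  }
}
```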
@@ -96,6 +96,24 @@ static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
   return c;
 }

+static SherpaOnnxOfflineSenseVoiceModelConfig GetOfflineSenseVoiceModelConfig(
+    Napi::Object obj) {
+  SherpaOnnxOfflineSenseVoiceModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  if (!obj.Has("senseVoice") || !obj.Get("senseVoice").IsObject()) {
+    return c;
+  }
+
+  Napi::Object o = obj.Get("senseVoice").As<Napi::Object>();
+
+  SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(language, language);
+  SHERPA_ONNX_ASSIGN_ATTR_INT32(use_itn, useInverseTextNormalization);
+
+  return c;
+}
+
 static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
   SherpaOnnxOfflineModelConfig c;
   memset(&c, 0, sizeof(c));
@@ -111,6 +129,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
   c.nemo_ctc = GetOfflineNeMoCtcModelConfig(o);
   c.whisper = GetOfflineWhisperModelConfig(o);
   c.tdnn = GetOfflineTdnnModelConfig(o);
+  c.sense_voice = GetOfflineSenseVoiceModelConfig(o);

   SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
   SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -225,6 +244,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
     delete[] c.model_config.tdnn.model;
   }

+  if (c.model_config.sense_voice.model) {
+    delete[] c.model_config.sense_voice.model;
+  }
+
+  if (c.model_config.sense_voice.language) {
+    delete[] c.model_config.sense_voice.language;
+  }
+
   if (c.model_config.tokens) {
     delete[] c.model_config.tokens;
   }