Fangjun Kuang
Committed by GitHub

Add alsa-based streaming ASR example for sense voice. (#2207)

... ... @@ -82,6 +82,8 @@ jobs:
make -j4 install
cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
rm -rf install/lib/pkgconfig
rm -fv install/lib/cargs.h
rm -fv install/lib/libcargs.so
... ... @@ -126,6 +128,8 @@ jobs:
make -j4 install
cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
rm -rf install/lib/pkgconfig
rm -fv install/lib/cargs.h
rm -fv install/lib/libcargs.so
... ... @@ -242,7 +246,7 @@ jobs:
file: sherpa-onnx-*linux-aarch64*.tar.bz2
# repo_name: k2-fsa/sherpa-onnx
# repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
# tag: v1.11.1
# tag: v1.11.5
- name: Test offline Moonshine
if: matrix.build_type != 'Debug'
... ...
... ... @@ -83,6 +83,8 @@ jobs:
make install
cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
ls -lh install/lib
rm -rf install/lib/pkgconfig
... ... @@ -164,7 +166,7 @@ jobs:
file: sherpa-onnx-*linux-aarch64*.tar.bz2
# repo_name: k2-fsa/sherpa-onnx
# repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
# tag: v1.10.42
# tag: v1.11.5
- name: Test offline Moonshine
if: matrix.build_type != 'Debug'
... ...
... ... @@ -168,7 +168,7 @@ jobs:
file: sherpa-onnx-*-android.tar.bz2
# repo_name: k2-fsa/sherpa-onnx
# repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
# tag: v1.11.3
# tag: v1.11.5
build-android-aar:
needs: [build-android-libs]
... ... @@ -297,7 +297,7 @@ jobs:
file: ./*.aar
# repo_name: k2-fsa/sherpa-onnx
# repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
# tag: v1.11.3
# tag: v1.11.5
- name: Release android aar
if: github.repository_owner == 'k2-fsa' && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
... ...
... ... @@ -47,6 +47,23 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
)
endif()
if(SHERPA_ONNX_HAS_ALSA)
  add_executable(sense-voice-simulate-streaming-alsa-cxx-api
    ./sense-voice-simulate-streaming-alsa-cxx-api.cc
    ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc
  )
  target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api
    sherpa-onnx-cxx-api
  )
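  # SHERPA_ONNX_ALSA_LIB_DIR lets the build link against a specific libasound,
  # e.g., one inside a cross-compile sysroot, instead of the host's copy.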
  if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
    target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
  else()
    target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound)
  endif()
endif()
add_executable(sense-voice-with-hr-cxx-api ./sense-voice-with-hr-cxx-api.cc)
target_link_libraries(sense-voice-with-hr-cxx-api sherpa-onnx-cxx-api)
... ...
// cxx-api-examples/sense-voice-simulate-streaming-alsa-cxx-api.cc
// Copyright (c) 2025 Xiaomi Corporation
//
// This file demonstrates how to use SenseVoice with sherpa-onnx's C++ API
// to simulate streaming speech recognition from a microphone.
//
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
//
// clang-format on
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <iostream>
#include <mutex>  // NOLINT
#include <queue>
#include <thread>
#include <vector>

#include "sherpa-display.h"  // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/alsa.h"

std::queue<std::vector<float>> samples_queue;
std::condition_variable condition_variable;
std::mutex mutex;
bool stop = false;
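// Audio flows through a producer/consumer pair: the recording thread pushes
// chunks into samples_queue and the main thread pops them for VAD and
// recognition. mutex and condition_variable guard the queue; the SIGINT
// handler below sets `stop` to shut both loops down.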
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}

static void RecordCallback(sherpa_onnx::Alsa *alsa) {
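  // Read roughly 100 ms of audio per iteration at the device's actual
  // sample rate and hand it to the main thread via samples_queue.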
  int32_t chunk = 0.1 * alsa->GetActualSampleRate();

  while (!stop) {
    std::vector<float> samples = alsa->Read(chunk);

    std::lock_guard<std::mutex> lock(mutex);
    samples_queue.emplace(std::move(samples));
    condition_variable.notify_one();
  }
}

static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  VadModelConfig config;
  config.silero_vad.model = "./silero_vad.onnx";
  config.silero_vad.threshold = 0.5;
  config.silero_vad.min_silence_duration = 0.1;
  config.silero_vad.min_speech_duration = 0.25;
  config.silero_vad.max_speech_duration = 8;
  config.sample_rate = 16000;
  config.debug = false;
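  // The second argument is the size of the VAD's internal buffer in seconds.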
  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
  if (!vad.Get()) {
    std::cerr << "Failed to create VAD. Please check your config\n";
    exit(-1);
  }

  return vad;
}

static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  OfflineRecognizerConfig config;
  config.model_config.sense_voice.model =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
  config.model_config.sense_voice.use_itn = false;
  config.model_config.sense_voice.language = "auto";
  config.model_config.tokens =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
  config.model_config.num_threads = 2;
  config.model_config.debug = false;

  std::cout << "Loading model\n";
  OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
  if (!recognizer.Get()) {
    std::cerr << "Please check your config\n";
    exit(-1);
  }
  std::cout << "Loading model done\n";

  return recognizer;
}

int32_t main(int32_t argc, const char *argv[]) {
  const char *kUsageMessage = R"usage(
Usage:

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2

./sense-voice-simulate-streaming-alsa-cxx-api device_name

The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and device 0 on that card, please use:

  plughw:3,0

as the device_name.
)usage";

  if (argc != 2) {
    fprintf(stderr, "%s\n", kUsageMessage);
    return -1;
  }

  signal(SIGINT, Handler);

  using namespace sherpa_onnx::cxx;  // NOLINT

  auto vad = CreateVad();
  auto recognizer = CreateOfflineRecognizer();

  int32_t expected_sample_rate = 16000;

  std::string device_name = argv[1];
  sherpa_onnx::Alsa alsa(device_name.c_str());
  fprintf(stderr, "Use recording device: %s\n", device_name.c_str());

  if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
            expected_sample_rate);
    exit(-1);
  }
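  // silero-vad consumes 512-sample (32 ms at 16 kHz) windows.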
  int32_t window_size = 512;  // samples, please don't change

  int32_t offset = 0;
  std::vector<float> buffer;
  bool speech_started = false;
  auto started_time = std::chrono::steady_clock::now();

  SherpaDisplay display;

  std::thread record_thread(RecordCallback, &alsa);
  std::cout << "Started! Please speak\n";

  while (!stop) {
    {
      std::unique_lock<std::mutex> lock(mutex);
      while (samples_queue.empty() && !stop) {
        condition_variable.wait(lock);
      }

      const auto &s = samples_queue.front();
      buffer.insert(buffer.end(), s.begin(), s.end());
      samples_queue.pop();
    }
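    // Feed the VAD complete 512-sample windows; any tail shorter than one
    // window stays in `buffer` until more audio arrives.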
    for (; offset + window_size < buffer.size(); offset += window_size) {
      vad.AcceptWaveform(buffer.data() + offset, window_size);
      if (!speech_started && vad.IsDetected()) {
        speech_started = true;
        started_time = std::chrono::steady_clock::now();
      }
    }
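    // While no speech has been detected, keep only the most recent 10
    // windows (~0.32 s) so the buffer cannot grow without bound.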
    if (!speech_started) {
      if (buffer.size() > 10 * window_size) {
        offset -= buffer.size() - 10 * window_size;
        buffer = {buffer.end() - 10 * window_size, buffer.end()};
      }
    }

    auto current_time = std::chrono::steady_clock::now();
    const float elapsed_seconds =
        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
                                                              started_time)
            .count() /
        1000.;
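    // Speech is in progress: re-decode the whole buffered utterance every
    // 0.2 s to show an updated partial result.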
    if (speech_started && elapsed_seconds > 0.2) {
      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(expected_sample_rate, buffer.data(),
                            buffer.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);
      display.UpdateText(result.text);
      display.Display();

      started_time = std::chrono::steady_clock::now();
    }
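    // The VAD has finalized one or more segments: decode each, finalize the
    // displayed sentence, and reset the state for the next utterance.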
    while (!vad.IsEmpty()) {
      auto segment = vad.Front();
      vad.Pop();

      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(expected_sample_rate, segment.samples.data(),
                            segment.samples.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);
      display.UpdateText(result.text);
      display.FinalizeCurrentSentence();
      display.Display();

      buffer.clear();
      offset = 0;
      speech_started = false;
    }
  }

  record_thread.join();

  return 0;
}
... ...
... ... @@ -45,10 +45,11 @@ class SherpaDisplay {
 private:
  static void ClearScreen() {
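    // On some toolchains system() is flagged by -Wunused-result; capturing
    // the return value and discarding it below silences the warning.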
#ifdef _MSC_VER
    system("cls");
    auto ret = system("cls");
#else
    system("clear");
    auto ret = system("clear");
#endif
    (void)ret;
  }

  static std::string GetCurrentDateTime() {
... ...