Fangjun Kuang
Committed by GitHub

Add C API for ten-vad (#2379)

... ... @@ -376,7 +376,7 @@ jobs:
name: matcha-tts-${{ matrix.os }}
path: ./generated-matcha-*.wav
- name: Test vad + Whisper tiny.en
- name: Test silero-vad + Whisper tiny.en
shell: bash
run: |
gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \
... ... @@ -403,7 +403,34 @@ jobs:
rm -rf *.onnx
rm *.wav
- name: Test vad + Moonshine
# Builds c-api-examples/vad-whisper-c-api.c against the installed C API
# (headers in build/install/include, libraries in build/install/lib) and runs
# it with ten-vad.onnx, Obama.wav and the Whisper tiny.en model downloaded
# into the working directory.
#
# The example picks ./silero_vad.onnx first and only falls back to
# ./ten-vad.onnx when the silero model is absent, so this step exercises the
# ten-vad path only if no silero_vad.onnx is left in the working directory
# (presumably guaranteed by the earlier step's `rm -rf *.onnx` -- confirm).
#
# LD_LIBRARY_PATH / DYLD_LIBRARY_PATH are extended with build/install/lib,
# the same directory passed to the linker via -L, before the binary is run.
# The trailing rm commands delete the downloaded models and wave files.
- name: Test ten-vad + Whisper tiny.en
shell: bash
run: |
gcc -o vad-whisper-c-api ./c-api-examples/vad-whisper-c-api.c \
-I ./build/install/include \
-L ./build/install/lib/ \
-l sherpa-onnx-c-api \
-l onnxruntime
# Now download models
#
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
rm sherpa-onnx-whisper-tiny.en.tar.bz2
export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
./vad-whisper-c-api
rm -rf sherpa-onnx-*
rm -rf *.onnx
rm *.wav
- name: Test silero-vad + Moonshine
shell: bash
run: |
gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \
... ... @@ -430,6 +457,33 @@ jobs:
rm -rf *.onnx
rm *.wav
# Builds c-api-examples/vad-moonshine-c-api.c against the installed C API
# (headers in build/install/include, libraries in build/install/lib) and runs
# it with ten-vad.onnx, Obama.wav and the Moonshine tiny-en int8 model
# downloaded into the working directory.
#
# The example picks ./silero_vad.onnx first and only falls back to
# ./ten-vad.onnx when the silero model is absent, so this step exercises the
# ten-vad path only if no silero_vad.onnx is left in the working directory
# (presumably guaranteed by the earlier step's `rm -rf *.onnx` -- confirm).
#
# LD_LIBRARY_PATH / DYLD_LIBRARY_PATH are extended with build/install/lib,
# the same directory passed to the linker via -L, before the binary is run.
# The trailing rm commands delete the downloaded models and wave files.
- name: Test ten-vad + Moonshine
shell: bash
run: |
gcc -o vad-moonshine-c-api ./c-api-examples/vad-moonshine-c-api.c \
-I ./build/install/include \
-L ./build/install/lib/ \
-l sherpa-onnx-c-api \
-l onnxruntime
# Now download models
#
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
./vad-moonshine-c-api
rm -rf sherpa-onnx-*
rm -rf *.onnx
rm *.wav
- name: Test Moonshine
shell: bash
run: |
... ... @@ -466,7 +520,7 @@ jobs:
./run.sh
rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
- name: Test vad + sense-voice
- name: Test silero-vad + sense-voice
shell: bash
run: |
gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \
... ... @@ -505,6 +559,45 @@ jobs:
rm -rf *.onnx
rm *.wav
# Builds c-api-examples/vad-sense-voice-c-api.c against the installed C API
# (headers in build/install/include, libraries in build/install/lib) and runs
# it with ten-vad.onnx, lei-jun-test.wav and the SenseVoice
# zh-en-ja-ko-yue-2024-07-17 model downloaded into the working directory.
#
# On the ubuntu-latest and ubuntu-22.04-arm runners the step additionally
# prints the binary's shared-library dependencies (ldd) and its dynamic
# section (readelf -d). The ls -lh calls only list the binary and the
# extracted model directory in the job log.
#
# The example picks ./silero_vad.onnx first and only falls back to
# ./ten-vad.onnx when the silero model is absent, so this step exercises the
# ten-vad path only if no silero_vad.onnx is left in the working directory
# (presumably guaranteed by the earlier step's `rm -rf *.onnx` -- confirm).
#
# LD_LIBRARY_PATH / DYLD_LIBRARY_PATH are extended with build/install/lib,
# the same directory passed to the linker via -L, before the binary is run.
# The trailing rm commands delete the downloaded models and wave files.
- name: Test ten-vad + sense-voice
shell: bash
run: |
gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \
-I ./build/install/include \
-L ./build/install/lib/ \
-l sherpa-onnx-c-api \
-l onnxruntime
ls -lh vad-sense-voice-c-api
if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
ldd ./vad-sense-voice-c-api
echo "----"
readelf -d ./vad-sense-voice-c-api
fi
# Now download models
#
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
echo "---"
ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs
export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
./vad-sense-voice-c-api
rm -rf sherpa-onnx-sense-voice-*
rm -rf *.onnx
rm *.wav
- name: Test sense-voice
shell: bash
run: |
... ...
... ... @@ -6,7 +6,12 @@
// This file demonstrates how to use VAD + Moonshine with sherpa-onnx's C API.
// clang-format off
//
// To use silero-vad:
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// To use ten-vad:
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
... ... @@ -23,7 +28,27 @@
int32_t main() {
const char *wav_filename = "./Obama.wav";
const char *vad_filename = "./silero_vad.onnx";
if (!SherpaOnnxFileExists(wav_filename)) {
fprintf(stderr, "Please download %s\n", wav_filename);
return -1;
}
const char *vad_filename;
int32_t use_silero_vad = 0;
int32_t use_ten_vad = 0;
if (SherpaOnnxFileExists("./silero_vad.onnx")) {
printf("Use silero-vad\n");
vad_filename = "./silero_vad.onnx";
use_silero_vad = 1;
} else if (SherpaOnnxFileExists("./ten-vad.onnx")) {
printf("Use ten-vad\n");
vad_filename = "./ten-vad.onnx";
use_ten_vad = 1;
} else {
fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");
return -1;
}
const char *preprocessor =
"./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx";
... ... @@ -76,12 +101,22 @@ int32_t main() {
SherpaOnnxVadModelConfig vadConfig;
memset(&vadConfig, 0, sizeof(vadConfig));
if (use_silero_vad) {
vadConfig.silero_vad.model = vad_filename;
vadConfig.silero_vad.threshold = 0.5;
vadConfig.silero_vad.threshold = 0.25;
vadConfig.silero_vad.min_silence_duration = 0.5;
vadConfig.silero_vad.min_speech_duration = 0.5;
vadConfig.silero_vad.max_speech_duration = 10;
vadConfig.silero_vad.window_size = 512;
} else if (use_ten_vad) {
vadConfig.ten_vad.model = vad_filename;
vadConfig.ten_vad.threshold = 0.25;
vadConfig.ten_vad.min_silence_duration = 0.5;
vadConfig.ten_vad.min_speech_duration = 0.5;
vadConfig.ten_vad.max_speech_duration = 10;
vadConfig.ten_vad.window_size = 256;
}
vadConfig.sample_rate = 16000;
vadConfig.num_threads = 1;
vadConfig.debug = 1;
... ... @@ -96,7 +131,9 @@ int32_t main() {
return -1;
}
int32_t window_size = vadConfig.silero_vad.window_size;
int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
: vadConfig.ten_vad.window_size;
int32_t i = 0;
int is_eof = 0;
... ...
... ... @@ -6,7 +6,12 @@
// This file demonstrates how to use VAD + SenseVoice with sherpa-onnx's C API.
// clang-format off
//
// To use silero-vad:
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// To use ten-vad:
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
... ... @@ -23,7 +28,28 @@
int32_t main() {
const char *wav_filename = "./lei-jun-test.wav";
const char *vad_filename = "./silero_vad.onnx";
if (!SherpaOnnxFileExists(wav_filename)) {
fprintf(stderr, "Please download %s\n", wav_filename);
return -1;
}
const char *vad_filename;
int32_t use_silero_vad = 0;
int32_t use_ten_vad = 0;
if (SherpaOnnxFileExists("./silero_vad.onnx")) {
printf("Use silero-vad\n");
vad_filename = "./silero_vad.onnx";
use_silero_vad = 1;
} else if (SherpaOnnxFileExists("./ten-vad.onnx")) {
printf("Use ten-vad\n");
vad_filename = "./ten-vad.onnx";
use_ten_vad = 1;
} else {
fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");
return -1;
}
const char *model_filename =
"./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
const char *tokens_filename =
... ... @@ -77,12 +103,23 @@ int32_t main() {
SherpaOnnxVadModelConfig vadConfig;
memset(&vadConfig, 0, sizeof(vadConfig));
if (use_silero_vad) {
vadConfig.silero_vad.model = vad_filename;
vadConfig.silero_vad.threshold = 0.5;
vadConfig.silero_vad.threshold = 0.25;
vadConfig.silero_vad.min_silence_duration = 0.5;
vadConfig.silero_vad.min_speech_duration = 0.5;
vadConfig.silero_vad.max_speech_duration = 5;
vadConfig.silero_vad.max_speech_duration = 10;
vadConfig.silero_vad.window_size = 512;
} else if (use_ten_vad) {
vadConfig.ten_vad.model = vad_filename;
vadConfig.ten_vad.threshold = 0.25;
vadConfig.ten_vad.min_silence_duration = 0.5;
vadConfig.ten_vad.min_speech_duration = 0.5;
vadConfig.ten_vad.max_speech_duration = 10;
vadConfig.ten_vad.window_size = 256;
}
vadConfig.sample_rate = 16000;
vadConfig.num_threads = 1;
vadConfig.debug = 1;
... ... @@ -97,7 +134,8 @@ int32_t main() {
return -1;
}
int32_t window_size = vadConfig.silero_vad.window_size;
int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
: vadConfig.ten_vad.window_size;
int32_t i = 0;
int is_eof = 0;
... ...
... ... @@ -8,7 +8,12 @@
//
// clang-format off
//
// To use silero-vad:
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// To use ten-vad:
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
... ... @@ -25,7 +30,28 @@
int32_t main() {
const char *wav_filename = "./Obama.wav";
const char *vad_filename = "./silero_vad.onnx";
if (!SherpaOnnxFileExists(wav_filename)) {
fprintf(stderr, "Please download %s\n", wav_filename);
return -1;
}
const char *vad_filename;
int32_t use_silero_vad = 0;
int32_t use_ten_vad = 0;
if (SherpaOnnxFileExists("./silero_vad.onnx")) {
printf("Use silero-vad\n");
vad_filename = "./silero_vad.onnx";
use_silero_vad = 1;
} else if (SherpaOnnxFileExists("./ten-vad.onnx")) {
printf("Use ten-vad\n");
vad_filename = "./ten-vad.onnx";
use_ten_vad = 1;
} else {
fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");
return -1;
}
const char *encoder = "sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx";
const char *decoder = "sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx";
... ... @@ -74,12 +100,23 @@ int32_t main() {
SherpaOnnxVadModelConfig vadConfig;
memset(&vadConfig, 0, sizeof(vadConfig));
if (use_silero_vad) {
vadConfig.silero_vad.model = vad_filename;
vadConfig.silero_vad.threshold = 0.5;
vadConfig.silero_vad.threshold = 0.25;
vadConfig.silero_vad.min_silence_duration = 0.5;
vadConfig.silero_vad.min_speech_duration = 0.5;
vadConfig.silero_vad.max_speech_duration = 10;
vadConfig.silero_vad.window_size = 512;
} else if (use_ten_vad) {
vadConfig.ten_vad.model = vad_filename;
vadConfig.ten_vad.threshold = 0.25;
vadConfig.ten_vad.min_silence_duration = 0.5;
vadConfig.ten_vad.min_speech_duration = 0.5;
vadConfig.ten_vad.max_speech_duration = 10;
vadConfig.ten_vad.window_size = 256;
}
vadConfig.sample_rate = 16000;
vadConfig.num_threads = 1;
vadConfig.debug = 1;
... ... @@ -94,7 +131,8 @@ int32_t main() {
return -1;
}
int32_t window_size = vadConfig.silero_vad.window_size;
int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
: vadConfig.ten_vad.window_size;
int32_t i = 0;
int is_eof = 0;
... ...
... ... @@ -1033,6 +1033,21 @@ sherpa_onnx::VadModelConfig GetVadModelConfig(
vad_config.silero_vad.max_speech_duration =
SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);
vad_config.ten_vad.model = SHERPA_ONNX_OR(config->ten_vad.model, "");
vad_config.ten_vad.threshold = SHERPA_ONNX_OR(config->ten_vad.threshold, 0.5);
vad_config.ten_vad.min_silence_duration =
SHERPA_ONNX_OR(config->ten_vad.min_silence_duration, 0.5);
vad_config.ten_vad.min_speech_duration =
SHERPA_ONNX_OR(config->ten_vad.min_speech_duration, 0.25);
vad_config.ten_vad.window_size =
SHERPA_ONNX_OR(config->ten_vad.window_size, 256);
vad_config.ten_vad.max_speech_duration =
SHERPA_ONNX_OR(config->ten_vad.max_speech_duration, 20);
vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
... ...
... ... @@ -71,6 +71,9 @@ SHERPA_ONNX_API const char *SherpaOnnxGetGitSha1();
// Example return value: "Fri Jun 20 11:22:52 2025"
SHERPA_ONNX_API const char *SherpaOnnxGetGitDate();
// return 1 if the given file exists; return 0 otherwise
SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
/// Please refer to
/// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
/// to download pre-trained models. That is, you can find encoder-xxx.onnx
... ... @@ -845,6 +848,30 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
float max_speech_duration;
} SherpaOnnxSileroVadModelConfig;
// Configuration of the ten-vad voice activity detection model for the C API.
//
// The struct is embedded as the `ten_vad` member of SherpaOnnxVadModelConfig.
// GetVadModelConfig() passes every field through SHERPA_ONNX_OR with these
// fallbacks: model "", threshold 0.5, min_silence_duration 0.5,
// min_speech_duration 0.25, window_size 256, max_speech_duration 20.
// NOTE(review): presumably the fallback is used when a field is left as
// 0/NULL (e.g. after memset) -- confirm against the SHERPA_ONNX_OR macro.
SHERPA_ONNX_API typedef struct SherpaOnnxTenVadModelConfig {
// Path to the ten-vad model
const char *model;
// threshold to classify a segment as speech
//
// If the predicted probability of a segment is larger than this
// value, then it is classified as speech.
float threshold;
// in seconds
float min_silence_duration;
// in seconds
float min_speech_duration;
// Window size used by the VAD; the bundled C examples set it to 256 for
// ten-vad (versus 512 for silero-vad).
// NOTE(review): presumably counted in audio samples -- confirm.
int32_t window_size;
// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After finishing detecting the segment,
// the threshold value is reset to its original value.
// NOTE(review): presumably in seconds, like the two durations above --
// confirm.
float max_speech_duration;
} SherpaOnnxTenVadModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
SherpaOnnxSileroVadModelConfig silero_vad;
... ... @@ -852,6 +879,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
int32_t num_threads;
const char *provider;
int32_t debug;
SherpaOnnxTenVadModelConfig ten_vad;
} SherpaOnnxVadModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxCircularBuffer
... ... @@ -1567,9 +1595,6 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
const SherpaOnnxLinearResampler *p);
// Return 1 if the file exists; return 0 if the file does not exist.
SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
// =========================================================================
// For offline speaker diarization (i.e., non-streaming speaker diarization)
// =========================================================================
... ...
... ... @@ -655,6 +655,13 @@ VoiceActivityDetector VoiceActivityDetector::Create(
c.silero_vad.window_size = config.silero_vad.window_size;
c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration;
c.ten_vad.model = config.ten_vad.model.c_str();
c.ten_vad.threshold = config.ten_vad.threshold;
c.ten_vad.min_silence_duration = config.ten_vad.min_silence_duration;
c.ten_vad.min_speech_duration = config.ten_vad.min_speech_duration;
c.ten_vad.window_size = config.ten_vad.window_size;
c.ten_vad.max_speech_duration = config.ten_vad.max_speech_duration;
c.sample_rate = config.sample_rate;
c.num_threads = config.num_threads;
c.provider = config.provider.c_str();
... ... @@ -758,4 +765,8 @@ std::string GetGitSha1() { return SherpaOnnxGetGitSha1(); }
std::string GetGitDate() { return SherpaOnnxGetGitDate(); }
// C++ convenience wrapper around SherpaOnnxFileExists() from the C API.
//
// @param filename Path to check.
// @return true when the C API reports a non-zero result for `filename`,
//         false when it reports 0.
bool FileExists(const std::string &filename) {
  const int32_t c_api_result = SherpaOnnxFileExists(filename.c_str());
  return c_api_result != 0;
}
} // namespace sherpa_onnx::cxx
... ...
... ... @@ -552,8 +552,18 @@ struct SileroVadModelConfig {
float max_speech_duration = 20;
};
// Settings for the ten-vad voice activity detection model (C++ API).
//
// VoiceActivityDetector::Create() copies each field into the `ten_vad`
// member of the C API's SherpaOnnxVadModelConfig.
struct TenVadModelConfig {
  // Path to the ten-vad model; empty unless set by the caller.
  std::string model;

  // The C API documents this as the probability cutoff above which a
  // segment is classified as speech.
  float threshold = 0.5f;

  // The C API documents both durations as being in seconds.
  float min_silence_duration = 0.5f;
  float min_speech_duration = 0.25f;

  // Window size handed to the VAD; 256 is also the C API fallback.
  int32_t window_size = 256;

  // Upper bound on the length of a single speech segment.
  float max_speech_duration = 20.0f;
};
struct VadModelConfig {
SileroVadModelConfig silero_vad;
TenVadModelConfig ten_vad;
int32_t sample_rate = 16000;
int32_t num_threads = 1;
... ... @@ -642,6 +652,7 @@ class SHERPA_ONNX_API LinearResampler
std::string GetVersionStr();
std::string GetGitSha1();
std::string GetGitDate();
bool FileExists(const std::string &filename);
} // namespace sherpa_onnx::cxx
... ...
... ... @@ -321,7 +321,7 @@ class TenVadModel::Impl {
// Element-wise log transform with a fixed offset.
//
// For every i in [0, n) this stores
//   out[i] = log(in[i] + 1e-10) - log(32768 * 32768)
// so `in` and `out` must each hold at least n floats.
// NOTE(review): judging by the name, `in` holds mel filterbank energies;
// the 1e-10 term presumably keeps logf() finite when in[i] is 0 -- confirm.
static void LogMel(const float *in, int32_t n, float *out) {
for (int32_t i = 0; i != n; ++i) {
// 20.79441541679836 is log(32768*32768)
// NOTE(review): the next statement uses the double literal 1e-10, which
// promotes the addition to double before logf() narrows it again; its result
// is immediately overwritten by the statement after it. It looks like the
// pre-change line of this hunk -- confirm.
out[i] = logf(in[i] + 1e-10) - 20.79441541679836f;
// The float literal 1e-10f keeps the whole expression in single precision.
out[i] = logf(in[i] + 1e-10f) - 20.79441541679836f;
}
}
... ...