xuning

feat(c-api): 增强语音识别功能并添加使用文档

1. 在speaker-identification-c-api.c中重构代码,支持命令行参数配置
2. 在vad-sense-voice-c-api.c中添加智能分段合并和文本差异检测功能
3. 新增README_usage.md详细说明使用方法和功能
4. 优化CMake脚本处理Windows系统信息获取

新增功能包括:
- 支持多说话人注册和识别
- 智能合并短语音片段
- 文本规范化比较
- 完整的API使用文档
- 更健壮的系统信息获取
# VAD + SenseVoice C API 使用说明
## 功能
这个程序实现了以下功能:
1. 使用VAD(语音活动检测)分割音频
2. 使用SenseVoice模型进行语音识别
3. 将转录结果保存到txt文件
4. 计算程序执行时的CPU消耗
## 所需文件
运行前需要下载以下文件:
### 1. 音频文件
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
```
### 2. VAD模型(二选一)
#### 选项1:Silero VAD
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
```
#### 选项2:Ten VAD
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
```
### 3. SenseVoice模型
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
```
## 编译方法
### Windows系统
1. 确保已安装GCC(如MinGW)或MSVC
2. 双击运行 `build_windows.bat`
3. 或使用命令行:
```bash
build_windows.bat
```
### Linux/macOS系统
```bash
make vad-sense-voice-c-api
```
## 运行程序
```bash
vad-sense-voice-c-api.exe # Windows(cmd/PowerShell;在 Git Bash 中可用 ./vad-sense-voice-c-api.exe)
./vad-sense-voice-c-api # Linux/macOS
```
## 输出文件
程序运行后会生成:
- `transcription_result.txt`:包含所有转录结果和性能统计
### 文件内容示例
```
音频转录结果:
================
片段 1 (0.000-2.500秒): 大家好,我是雷军
片段 2 (3.000-5.200秒): 今天很高兴见到大家
性能统计:
================
总执行时间: 1.234 秒
CPU时间: 0.987 秒
CPU使用率: 80.0%
转录片段数: 2
```
## 故障排除
### 编译错误
- **Windows**: 确保已安装MinGW或Visual Studio
- **Linux**: 确保已安装build-essential
- **macOS**: 确保已安装Xcode命令行工具
### 运行时错误
- 检查所有必需文件是否存在
- 确保模型文件路径正确
- 检查音频文件格式(需要16kHz采样率)
\ No newline at end of file
... ...
... ... @@ -2,24 +2,6 @@
//
// Copyright (c) 2024 Xiaomi Corporation
// We assume you have pre-downloaded the speaker embedding extractor model
// from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
//
// An example command to download
// "3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"
// is given below:
//
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
//
// clang-format on
//
// Also, please download the test wave files from
//
// https://github.com/csukuangfj/sr-data
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
... ... @@ -46,212 +28,166 @@ static const float *ComputeEmbedding(
exit(-1);
}
// we will free `v` outside of this function
const float *v =
SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream);
SherpaOnnxDestroyOnlineStream(stream);
SherpaOnnxFreeWave(wave);
// Remeber to free v to avoid memory leak
return v;
}
int32_t main() {
SherpaOnnxSpeakerEmbeddingExtractorConfig config;
memset(&config, 0, sizeof(config));
// please download the model from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
config.model = "./3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx";
config.num_threads = 1;
config.debug = 0;
config.provider = "cpu";
void PrintUsage(const char *program_name) {
fprintf(stderr, "Usage: %s <model_path> <threshold> <speaker1_name> <speaker1_wav1> [speaker1_wav2] [speaker1_wav3] <speaker2_name> <speaker2_wav1> [speaker2_wav2] [speaker2_wav3] <output_file> <test_wav1> <test_wav2> ...\n", program_name);
fprintf(stderr, "Example: %s 3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx 0.6 fangjun ./sr-data/enroll/fangjun-sr-1.wav ./sr-data/enroll/fangjun-sr-2.wav ./sr-data/enroll/fangjun-sr-3.wav leijun ./sr-data/enroll/leijun-sr-1.wav ./sr-data/enroll/leijun-sr-2.wav result.txt ./sr-data/test/fangjun-test-sr-1.wav ./sr-data/test/leijun-test-sr-1.wav ./sr-data/test/liudehua-test-sr-1.wav\n", program_name);
}
const SherpaOnnxSpeakerEmbeddingExtractor *ex =
SherpaOnnxCreateSpeakerEmbeddingExtractor(&config);
if (!ex) {
fprintf(stderr, "Failed to create speaker embedding extractor");
int32_t main(int32_t argc, char *argv[]) {
if (argc < 7) {
PrintUsage(argv[0]);
return -1;
}
int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex);
// Parse command line arguments
const char *model_path = argv[1];
float threshold = atof(argv[2]);
const SherpaOnnxSpeakerEmbeddingManager *manager =
SherpaOnnxCreateSpeakerEmbeddingManager(dim);
// Find the position of output file and test files
int32_t output_file_index = -1;
for (int32_t i = 3; i < argc; i++) {
if (strstr(argv[i], ".txt") != NULL) {
output_file_index = i;
break;
}
}
// Please download the test data from
// https://github.com/csukuangfj/sr-data
const char *spk1_1 = "./sr-data/enroll/fangjun-sr-1.wav";
const char *spk1_2 = "./sr-data/enroll/fangjun-sr-2.wav";
const char *spk1_3 = "./sr-data/enroll/fangjun-sr-3.wav";
if (output_file_index == -1 || output_file_index >= argc - 1) {
fprintf(stderr, "Output file not found or no test files provided\n");
PrintUsage(argv[0]);
return -1;
}
const char *spk2_1 = "./sr-data/enroll/leijun-sr-1.wav";
const char *spk2_2 = "./sr-data/enroll/leijun-sr-2.wav";
const char *output_file = argv[output_file_index];
int32_t num_test_files = argc - output_file_index - 1;
const char **test_files = (const char **)&argv[output_file_index + 1];
const float *spk1_vec[4] = {NULL};
spk1_vec[0] = ComputeEmbedding(ex, spk1_1);
spk1_vec[1] = ComputeEmbedding(ex, spk1_2);
spk1_vec[2] = ComputeEmbedding(ex, spk1_3);
// Parse speaker information
int32_t num_speakers = 0;
const char *speaker_names[10] = {NULL};
const char *speaker_files[10][4] = {NULL};
int32_t speaker_file_counts[10] = {0};
const float *spk2_vec[3] = {NULL};
spk2_vec[0] = ComputeEmbedding(ex, spk2_1);
spk2_vec[1] = ComputeEmbedding(ex, spk2_2);
int32_t current_index = 3;
while (current_index < output_file_index && num_speakers < 10) {
// Speaker name
speaker_names[num_speakers] = argv[current_index++];
if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "fangjun", spk1_vec)) {
fprintf(stderr, "Failed to register fangjun\n");
exit(-1);
// Speaker wave files
int32_t file_count = 0;
while (current_index < output_file_index &&
strstr(argv[current_index], ".wav") != NULL &&
file_count < 4) {
speaker_files[num_speakers][file_count++] = argv[current_index++];
}
if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "fangjun")) {
fprintf(stderr, "Failed to find fangjun\n");
exit(-1);
speaker_file_counts[num_speakers] = file_count;
num_speakers++;
}
if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "leijun", spk2_vec)) {
fprintf(stderr, "Failed to register leijun\n");
exit(-1);
// Open output file
FILE *fp = fopen(output_file, "w");
if (!fp) {
fprintf(stderr, "Failed to open output file: %s\n", output_file);
return -1;
}
if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "leijun")) {
fprintf(stderr, "Failed to find leijun\n");
exit(-1);
}
fprintf(fp, "Speaker Identification Results\n");
fprintf(fp, "Model: %s\n", model_path);
fprintf(fp, "Threshold: %.2f\n", threshold);
fprintf(fp, "========================================\n");
if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 2) {
fprintf(stderr, "There should be two speakers: fangjun and leijun\n");
exit(-1);
}
// Initialize speaker embedding extractor
SherpaOnnxSpeakerEmbeddingExtractorConfig config;
memset(&config, 0, sizeof(config));
config.model = model_path;
config.num_threads = 1;
config.debug = 0;
config.provider = "cpu";
const char *const *all_speakers =
SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager);
const char *const *p = all_speakers;
fprintf(stderr, "list of registered speakers\n-----\n");
while (p[0]) {
fprintf(stderr, "speaker: %s\n", p[0]);
++p;
const SherpaOnnxSpeakerEmbeddingExtractor *ex =
SherpaOnnxCreateSpeakerEmbeddingExtractor(&config);
if (!ex) {
fprintf(stderr, "Failed to create speaker embedding extractor");
fclose(fp);
return -1;
}
fprintf(stderr, "----\n");
SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers);
const char *test1 = "./sr-data/test/fangjun-test-sr-1.wav";
const char *test2 = "./sr-data/test/leijun-test-sr-1.wav";
const char *test3 = "./sr-data/test/liudehua-test-sr-1.wav";
const float *v1 = ComputeEmbedding(ex, test1);
const float *v2 = ComputeEmbedding(ex, test2);
const float *v3 = ComputeEmbedding(ex, test3);
int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex);
const SherpaOnnxSpeakerEmbeddingManager *manager =
SherpaOnnxCreateSpeakerEmbeddingManager(dim);
float threshold = 0.6;
// Register speakers
for (int32_t i = 0; i < num_speakers; i++) {
const float *embeddings[4] = {NULL};
int32_t count = speaker_file_counts[i];
const char *name1 =
SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold);
if (name1) {
fprintf(stderr, "%s: Found %s\n", test1, name1);
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1);
} else {
fprintf(stderr, "%s: Not found\n", test1);
for (int32_t j = 0; j < count; j++) {
embeddings[j] = ComputeEmbedding(ex, speaker_files[i][j]);
}
const char *name2 =
SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold);
if (name2) {
fprintf(stderr, "%s: Found %s\n", test2, name2);
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2);
} else {
fprintf(stderr, "%s: Not found\n", test2);
}
const char *name3 =
SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v3, threshold);
if (name3) {
fprintf(stderr, "%s: Found %s\n", test3, name3);
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name3);
} else {
fprintf(stderr, "%s: Not found\n", test3);
}
int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v1,
threshold);
if (ok) {
fprintf(stderr, "%s matches fangjun\n", test1);
} else {
fprintf(stderr, "%s does NOT match fangjun\n", test1);
if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, speaker_names[i], embeddings)) {
fprintf(stderr, "Failed to register %s\n", speaker_names[i]);
fprintf(fp, "Failed to register %s\n", speaker_names[i]);
fclose(fp);
exit(-1);
}
ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v2,
threshold);
if (ok) {
fprintf(stderr, "%s matches fangjun\n", test2);
} else {
fprintf(stderr, "%s does NOT match fangjun\n", test2);
for (int32_t j = 0; j < count; j++) {
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(embeddings[j]);
}
fprintf(stderr, "Removing fangjun\n");
if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "fangjun")) {
fprintf(stderr, "Failed to remove fangjun\n");
exit(-1);
fprintf(stderr, "Registered speaker: %s with %d wave files\n", speaker_names[i], count);
fprintf(fp, "Registered speaker: %s with %d wave files\n", speaker_names[i], count);
}
if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 1) {
fprintf(stderr, "There should be only 1 speaker left\n");
exit(-1);
}
fprintf(fp, "\nTest Results:\n");
fprintf(fp, "========================================\n");
name1 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold);
if (name1) {
fprintf(stderr, "%s: Found %s\n", test1, name1);
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1);
} else {
fprintf(stderr, "%s: Not found\n", test1);
}
// Process test files
for (int32_t i = 0; i < num_test_files; i++) {
const char *test_file = test_files[i];
const float *v = ComputeEmbedding(ex, test_file);
fprintf(stderr, "Removing leijun\n");
if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "leijun")) {
fprintf(stderr, "Failed to remove leijun\n");
exit(-1);
}
const char *name = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v, threshold);
if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 0) {
fprintf(stderr, "There should be only 1 speaker left\n");
exit(-1);
}
fprintf(stderr, "Testing %s: ", test_file);
fprintf(fp, "Test file: %s\n", test_file);
name2 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold);
if (name2) {
fprintf(stderr, "%s: Found %s\n", test2, name2);
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2);
if (name) {
fprintf(stderr, "Found %s\n", name);
fprintf(fp, " Result: Found speaker - %s\n", name);
SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name);
} else {
fprintf(stderr, "%s: Not found\n", test2);
fprintf(stderr, "Not found\n");
fprintf(fp, " Result: Speaker not found\n");
}
all_speakers = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager);
p = all_speakers;
fprintf(stderr, "list of registered speakers\n-----\n");
while (p[0]) {
fprintf(stderr, "speaker: %s\n", p[0]);
++p;
// Verify against all registered speakers
for (int32_t j = 0; j < num_speakers; j++) {
int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, speaker_names[j], v, threshold);
fprintf(fp, " Verify with %s: %s\n", speaker_names[j], ok ? "MATCH" : "NO MATCH");
}
fprintf(stderr, "----\n");
SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v1);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v2);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v3);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[0]);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[1]);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[2]);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[0]);
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[1]);
fprintf(fp, "\n");
SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v);
}
// Cleanup
SherpaOnnxDestroySpeakerEmbeddingManager(manager);
SherpaOnnxDestroySpeakerEmbeddingExtractor(ex);
fclose(fp);
fprintf(stderr, "Results saved to: %s\n", output_file);
return 0;
}
\ No newline at end of file
... ...
... ... @@ -23,10 +23,270 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <wchar.h>
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>
#include "sherpa-onnx/c-api/c-api.h"
// Copies `input` into `output` with ASCII punctuation and whitespace removed
// and ASCII letters converted to lowercase.
//
// Bytes >= 0x80 (i.e. every byte of a multi-byte UTF-8 character) are copied
// through unchanged. The original code also compared the single `char`
// input[i] against Unicode code points such as 0x3000-0x303F ("CJK
// punctuation") and 0xFF00+ ("fullwidth forms"), but a char can never hold a
// value that large, so those tests were always false; they have been removed
// as dead code. CJK punctuation is therefore NOT stripped — this matches the
// original behavior exactly.
//
// `output` must be at least strlen(input) + 1 bytes; no bound is checked.
void normalize_string(const char* input, char* output) {
  int i = 0, j = 0;
  while (input[i] != '\0') {
    unsigned char c = (unsigned char)input[i];
    // Keep anything that is not ASCII punctuation or whitespace.
    if (!ispunct(c) && !isspace(c)) {
      output[j++] = (char)tolower(c);
    }
    i++;
  }
  output[j] = '\0';
}
// Returns the lowercase form of the first byte of `str` that is not ASCII
// punctuation or whitespace, or '\0' if no such byte exists.
//
// As in normalize_string, the original's comparisons of a single char against
// multi-byte Unicode ranges (0x3000+, 0xFF00+) could never be true and were
// removed as dead code. Bytes >= 0x80 (UTF-8 lead/continuation bytes) are
// returned as-is, i.e. for a CJK character this yields its first UTF-8 byte.
char get_first_meaningful_char(const char* str) {
  for (int i = 0; str[i] != '\0'; i++) {
    unsigned char c = (unsigned char)str[i];
    if (!ispunct(c) && !isspace(c)) {
      return (char)tolower(c);
    }
  }
  return '\0';
}
// Returns 1 if str1 and str2 are identical after normalization (ASCII
// punctuation/whitespace stripped, letters lowercased), 0 otherwise.
//
// Uses heap buffers sized to the inputs: the original used fixed 1024-byte
// stack buffers, which normalize_string (no bounds checking) could overflow
// for longer transcripts. Falls back to a plain strcmp if allocation fails.
int are_strings_effectively_same(const char* str1, const char* str2) {
  char *norm1 = (char *)malloc(strlen(str1) + 1);
  char *norm2 = (char *)malloc(strlen(str2) + 1);
  int same;
  if (!norm1 || !norm2) {
    // Best-effort comparison on OOM; exact equality implies normalized
    // equality, so this can only under-report "same", never crash.
    same = strcmp(str1, str2) == 0;
  } else {
    normalize_string(str1, norm1);
    normalize_string(str2, norm2);
    same = strcmp(norm1, norm2) == 0;
  }
  free(norm1);
  free(norm2);
  return same;
}
// Returns true if `ch` is a CJK ideograph (or one of a few ideographic
// number/iteration marks). Table-driven over inclusive code-point ranges.
static bool is_cjk_ideograph(uint32_t ch)
{
    static const struct { uint32_t lo, hi; } kRanges[] = {
        {0x4E00, 0x9FFF},    // CJK Unified Ideographs
        {0x3400, 0x4DBF},    // CJK Extension A
        {0x20000, 0x2A6DF},  // CJK Extension B
        {0x2A700, 0x2B73F},  // CJK Extension C
        {0x2B740, 0x2B81F},  // CJK Extension D
        {0x2B820, 0x2CEAF},  // CJK Extension E
        {0x2CEB0, 0x2EBEF},  // CJK Extension F
        {0x3007, 0x3007},    // IDEOGRAPHIC NUMBER ZERO 〇
        {0x3021, 0x3029},    // Hangzhou numerals 〡-〩
        {0x3038, 0x303B},    // 〸〹〺 + iteration mark 〻
    };
    for (size_t i = 0; i < sizeof(kRanges) / sizeof(kRanges[0]); ++i) {
        if (ch >= kRanges[i].lo && ch <= kRanges[i].hi) {
            return true;
        }
    }
    return false;
}

// Decodes the UTF-8 character whose LAST byte is s[pos]. Stores the code
// point in *out_ch (U+FFFD for a malformed lead byte) and returns the
// character's length in bytes.
static int prev_utf8_char(const char *s, int pos, uint32_t *out_ch)
{
    int start = pos;
    // Back up over continuation bytes (10xxxxxx) to the lead byte.
    while (start > 0 && (s[start] & 0xC0) == 0x80)
        --start;
    const unsigned char *p = (const unsigned char *)&s[start];
    if ((*p & 0x80) == 0) {             // 1-byte (ASCII)
        *out_ch = *p;
    } else if ((*p & 0xE0) == 0xC0) {   // 2-byte sequence
        *out_ch = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
    } else if ((*p & 0xF0) == 0xE0) {   // 3-byte sequence
        *out_ch = ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
    } else if ((*p & 0xF8) == 0xF0) {   // 4-byte sequence
        *out_ch = ((p[0] & 0x07) << 18) | ((p[1] & 0x3F) << 12) |
                  ((p[2] & 0x3F) << 6) | (p[3] & 0x3F);
    } else {
        *out_ch = 0xFFFD;               // malformed lead byte: replacement char
    }
    return pos - start + 1;             // byte length of the character
}

// Collects the last `n` semantic units of `str` into `output`, oldest first,
// joined by single spaces. A unit is either one CJK ideograph / code point
// above U+FF00 (fullwidth forms etc.), or one whole run of ASCII letters (an
// English word). ASCII digits, punctuation and whitespace are skipped.
//
// Fixes vs. the original:
//  - The letter test `(ch | 32) - 'a' < 26` was a *signed* comparison that
//    was also true for spaces, digits and most ASCII punctuation (negative
//    difference), so whole phrases were glued into one "word". The unsigned
//    comparison accepts exactly a-z / A-Z.
//  - `n` is clamped to 256 — the original indexed units[unit_cnt] with
//    unit_cnt bounded only by `n`, overflowing the array for n > 256.
//  - `output` is no longer dereferenced when it is NULL.
//
// NOTE(review): `output` itself is not bounds-checked; callers must provide
// a buffer large enough for n units (call sites use 256 bytes).
void get_last_n_words(const char *str, int n, char *output)
{
    if (!output) {
        return;  // original wrote *output before checking it
    }
    output[0] = '\0';
    if (!str || n <= 0 || str[0] == '\0') {
        return;
    }

    // Units are gathered newest-first while scanning backwards.
    char units[256][256];
    int unit_cnt = 0;
    if (n > 256) {
        n = 256;  // clamp to the units[] capacity
    }

    int len = (int)strlen(str);
    int pos = len;  // scan starts just past the last byte

    while (pos > 0 && unit_cnt < n) {
        uint32_t ch;
        int char_len = prev_utf8_char(str, pos - 1, &ch);
        pos -= char_len;

        if (ch < 128 && (uint32_t)((ch | 32u) - 'a') < 26u) {
            // ===== English word: extend left to the start of the letter run =====
            int word_end = pos + char_len;
            int word_start = pos;
            while (word_start > 0) {
                uint32_t prev;
                int prev_len = prev_utf8_char(str, word_start - 1, &prev);
                if (prev < 128 && (uint32_t)((prev | 32u) - 'a') < 26u) {
                    word_start -= prev_len;
                } else {
                    break;
                }
            }
            int wlen = word_end - word_start;
            if (wlen >= (int)sizeof(units[unit_cnt])) {
                wlen = (int)sizeof(units[unit_cnt]) - 1;
            }
            memcpy(units[unit_cnt], str + word_start, wlen);
            units[unit_cnt][wlen] = '\0';
            ++unit_cnt;
            pos = word_start;  // resume scanning before the word
        } else if (is_cjk_ideograph(ch) || ch > 0xFF00) {
            // ===== single CJK ideograph or fullwidth character =====
            if (char_len >= (int)sizeof(units[unit_cnt])) {
                char_len = (int)sizeof(units[unit_cnt]) - 1;
            }
            memcpy(units[unit_cnt], str + pos, char_len);
            units[unit_cnt][char_len] = '\0';
            ++unit_cnt;
        }
        // Everything else (digits, punctuation, whitespace) is skipped.
    }

    // Emit in original (oldest-first) order.
    for (int i = unit_cnt - 1; i >= 0; --i) {
        if (i < unit_cnt - 1) {
            strcat(output, " ");
        }
        strcat(output, units[i]);
    }
}
// Finds where `anchor` ends inside `str`, comparing in normalized form
// (ASCII punctuation/whitespace stripped, lowercased), and returns a pointer
// just past the anchor's last byte in the ORIGINAL string. Returns `str`
// itself when the anchor is empty or cannot be found.
//
// NOTE(review): the fixed 1024-byte buffers are not bounds-checked;
// normalize_string will overflow them if `str` or `anchor` normalizes to
// 1024+ bytes — confirm callers cap their input length.
// NOTE(review): the mapping back into the original string counts bytes that
// survive normalization (anything that is not ASCII punct/space), which
// matches normalize_string's byte-level handling of UTF-8 input.
const char *find_anchor_end_position(const char *str, const char *anchor) {
  if (!anchor || !*anchor) return str;
  char normalized_str[1024] = {0};
  char normalized_anchor[1024] = {0};
  // Normalize both strings so the search ignores punctuation/spacing/case.
  normalize_string(str, normalized_str);
  normalize_string(anchor, normalized_anchor);
  // Locate the anchor within the normalized haystack.
  char *found = strstr(normalized_str, normalized_anchor);
  if (!found) return str;  // anchor absent: treat the whole string as new text
  // Offset (in normalized bytes) of the end of the anchor.
  int anchor_end_offset = found - normalized_str + strlen(normalized_anchor);
  // Walk the original string, counting only bytes that normalization keeps,
  // until the anchor has been consumed; `ptr` then points just past it.
  int normalized_count = 0;
  const char *ptr = str;
  while (*ptr != '\0' && normalized_count < anchor_end_offset) {
    if (!ispunct((unsigned char)*ptr) && !isspace((unsigned char)*ptr)) {
      normalized_count++;
    }
    ptr++;
  }
  return ptr;
}
// Advances past any leading ASCII punctuation and whitespace, returning a
// pointer to the first character of the next word (or to the terminating
// '\0' if the rest of the string is only punctuation/whitespace).
const char *find_next_word_start(const char *str) {
  const char *p = str;
  for (;;) {
    unsigned char c = (unsigned char)*p;
    if (c == '\0' || (!ispunct(c) && !isspace(c))) {
      return p;
    }
    ++p;
  }
}
// Returns a heap-allocated copy (caller must free) of the part of `str2`
// that follows its overlap with `str1`:
//   - "" when the two strings normalize to the same text,
//   - all of `str2` when no anchor can be extracted from `str1`,
//   - otherwise the text of `str2` after the last `num_anchor_words`
//     semantic units of `str1`, starting at the next word boundary.
//
// NOTE(review): strdup's result is returned unchecked — it is NULL on OOM;
// confirm callers tolerate that. The 256-byte anchor buffers assume
// get_last_n_words emits fewer than 256 bytes for num_anchor_words units.
char *get_difference_after_anchor(const char *str1, const char *str2, int num_anchor_words) {
  if (are_strings_effectively_same(str1, str2)) {
    return strdup("");
  }
  // Take the last few semantic units (CJK chars / English words) of str1
  // as the anchor text.
  char semantic_anchor[256] = {0};
  get_last_n_words(str1, num_anchor_words, semantic_anchor);
  if (strlen(semantic_anchor) == 0) {
    return strdup(str2);
  }
  // Key step: normalize the semantic anchor once more at character level so
  // it matches what find_anchor_end_position searches for.
  char normalized_anchor[256] = {0};
  normalize_string(semantic_anchor, normalized_anchor);
  // Locate the anchor in str2 and skip to the first word after it.
  const char *anchor_end = find_anchor_end_position(str2, normalized_anchor);
  const char *next_word_start = find_next_word_start(anchor_end);
  return strdup(next_word_start);
}
// Holds a deep copy of a decoded speech segment (PCM samples + recognized
// text) so that a later short segment can be merged with it and re-decoded.
typedef struct {
  float* samples;  // owned: heap copy of the segment's samples (freed here)
  int32_t n;       // number of samples in `samples`
  int32_t start;   // segment start offset, in samples
  char* text;      // owned: heap copy of the recognition result text
} PreviousSegment;

// Releases a PreviousSegment and the buffers it owns. Safe to call with NULL.
void free_previous_segment(PreviousSegment* seg) {
  if (seg) {
    // free(NULL) is a no-op, so the original's per-field NULL guards
    // (`if (seg->samples) free(...)`) were redundant and are removed.
    free(seg->samples);
    free(seg->text);
    free(seg);
  }
}
// Deep-copies a VAD speech segment and its recognized text into a
// heap-allocated PreviousSegment. Returns NULL if any allocation fails;
// partially-built state is released before returning, so nothing leaks.
PreviousSegment* copy_segment(const SherpaOnnxSpeechSegment* segment, const char* text) {
  PreviousSegment* copy = (PreviousSegment*)malloc(sizeof(PreviousSegment));
  if (!copy) {
    return NULL;
  }

  copy->n = segment->n;
  copy->start = segment->start;
  copy->text = NULL;
  copy->samples = (float*)malloc(segment->n * sizeof(float));

  if (copy->samples) {
    memcpy(copy->samples, segment->samples, segment->n * sizeof(float));
    copy->text = strdup(text);
  }

  // Either allocation failing aborts the copy; free(NULL) is harmless.
  if (!copy->samples || !copy->text) {
    free(copy->samples);
    free(copy);
    return NULL;
  }
  return copy;
}
int32_t main() {
setlocale(LC_ALL, ""); // Set locale for wide character handling
const char *wav_filename = "./lei-jun-test.wav";
if (!SherpaOnnxFileExists(wav_filename)) {
fprintf(stderr, "Please download %s\n", wav_filename);
... ... @@ -51,7 +311,7 @@ int32_t main() {
}
const char *model_filename =
"./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
"./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx";
const char *tokens_filename =
"./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
const char *language = "auto";
... ... @@ -107,9 +367,9 @@ int32_t main() {
if (use_silero_vad) {
vadConfig.silero_vad.model = vad_filename;
vadConfig.silero_vad.threshold = 0.25;
vadConfig.silero_vad.min_silence_duration = 0.5;
vadConfig.silero_vad.min_speech_duration = 0.5;
vadConfig.silero_vad.max_speech_duration = 10;
vadConfig.silero_vad.min_silence_duration = 1.5;
vadConfig.silero_vad.min_speech_duration = 0.3;
vadConfig.silero_vad.max_speech_duration = 20;
vadConfig.silero_vad.window_size = 512;
} else if (use_ten_vad) {
vadConfig.ten_vad.model = vad_filename;
... ... @@ -139,6 +399,9 @@ int32_t main() {
int32_t i = 0;
int is_eof = 0;
// Variables to store previous segment information
PreviousSegment *prev_segment = NULL;
while (!is_eof) {
if (i + window_size < wave->num_samples) {
SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i,
... ... @@ -152,32 +415,91 @@ int32_t main() {
const SherpaOnnxSpeechSegment *segment =
SherpaOnnxVoiceActivityDetectorFront(vad);
float duration = segment->n / 16000.0f;
// Process the current segment
const SherpaOnnxOfflineStream *stream =
SherpaOnnxCreateOfflineStream(recognizer);
SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate,
segment->samples, segment->n);
SherpaOnnxDecodeOfflineStream(recognizer, stream);
const SherpaOnnxOfflineRecognizerResult *result =
SherpaOnnxGetOfflineStreamResult(stream);
float start = segment->start / 16000.0f;
float duration = segment->n / 16000.0f;
float stop = start + duration;
if (duration < 1.5f && prev_segment != NULL) {
// Current segment is less than 1 second and we have a previous segment
// Merge with previous segment
// Create merged samples
int32_t merged_n = prev_segment->n + segment->n;
float *merged_samples = (float*)malloc(merged_n * sizeof(float));
memcpy(merged_samples, prev_segment->samples, prev_segment->n * sizeof(float));
memcpy(merged_samples + prev_segment->n, segment->samples, segment->n * sizeof(float));
// Create stream for merged segment
const SherpaOnnxOfflineStream *merged_stream =
SherpaOnnxCreateOfflineStream(recognizer);
SherpaOnnxAcceptWaveformOffline(merged_stream, wave->sample_rate,
merged_samples, merged_n);
SherpaOnnxDecodeOfflineStream(recognizer, merged_stream);
const SherpaOnnxOfflineRecognizerResult *merged_result =
SherpaOnnxGetOfflineStreamResult(merged_stream);
// Get the meaningful difference starting from first character
char *diff_text = get_difference_after_anchor(prev_segment->text, merged_result->text, 3);
if (strlen(diff_text) == 0) {
fprintf(stderr, "%.3f -- %.3f: %s (short segment, no meaningful difference)\n",
start, stop, merged_result->text);
} else {
fprintf(stderr, "%.3f -- %.3f: %s (short segment, meaningful diff: %s)\n",
start, stop, merged_result->text, diff_text);
}
// Don't update prev_segment for short segments (requirement 1)
// Only update if the current segment is >= 1 second
SherpaOnnxDestroyOfflineRecognizerResult(merged_result);
SherpaOnnxDestroyOfflineStream(merged_stream);
free(merged_samples);
free(diff_text);
} else {
// Normal processing for segments >= 1 second
fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text);
// Store current segment and result only if duration >= 1 second (requirement 1)
if (duration >= 1.5f) {
if (prev_segment != NULL) {
free_previous_segment(prev_segment);
}
prev_segment = copy_segment(segment, result->text);
} else {
// Short segment, don't store as previous
if (prev_segment != NULL) {
free_previous_segment(prev_segment);
prev_segment = NULL;
}
}
}
SherpaOnnxDestroyOfflineRecognizerResult(result);
SherpaOnnxDestroyOfflineStream(stream);
SherpaOnnxDestroySpeechSegment(segment);
SherpaOnnxVoiceActivityDetectorPop(vad);
}
i += window_size;
}
// Clean up
if (prev_segment != NULL) {
free_previous_segment(prev_segment);
}
SherpaOnnxDestroyOfflineRecognizer(recognizer);
SherpaOnnxDestroyVoiceActivityDetector(vad);
SherpaOnnxFreeWave(wave);
... ...
... ... @@ -63,8 +63,17 @@ elseif(WIN32)
# Now SHERPA_ONNX_OS_TWO_LINES contains something like
# Caption Version
# Microsoft Windows 10 Pro 10.0.18362
string(REPLACE "\n" ";" SHERPA_ONNX_OS_LIST ${SHERPA_ONNX_OS_TWO_LINES})
if(SHERPA_ONNX_OS_TWO_LINES)
string(REPLACE "\n" ";" SHERPA_ONNX_OS_LIST "${SHERPA_ONNX_OS_TWO_LINES}")
list(LENGTH SHERPA_ONNX_OS_LIST _list_length)
if(_list_length GREATER 1)
list(GET SHERPA_ONNX_OS_LIST 1 SHERPA_ONNX_OS)
else()
set(SHERPA_ONNX_OS "Windows")
endif()
else()
set(SHERPA_ONNX_OS "Windows")
endif()
else()
set(SHERPA_ONNX_OS "Unknown")
endif()
... ...