feat(c-api): enhance speech recognition examples and add usage documentation
1. Refactor speaker-identification-c-api.c so its configuration comes from command-line arguments
2. Add smart merging of short segments and text-difference detection to vad-sense-voice-c-api.c
3. Add README_usage.md with detailed usage and feature documentation
4. Make the Windows system-information lookup in the CMake script more robust

New features include:
- Multi-speaker enrollment and identification
- Smart merging of short speech segments
- Normalized text comparison
- Complete C API usage documentation
- More robust system-information detection
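For quick reference, the refactored speaker-identification example now takes all of its configuration on the command line. The invocation below mirrors the usage example embedded in the new `PrintUsage()` string; the binary name `speaker-identification-c-api` is assumed, and the sr-data paths are the sample files, so substitute your own model, speaker names and wave files:

```bash
./speaker-identification-c-api \
  3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx 0.6 \
  fangjun ./sr-data/enroll/fangjun-sr-1.wav ./sr-data/enroll/fangjun-sr-2.wav ./sr-data/enroll/fangjun-sr-3.wav \
  leijun ./sr-data/enroll/leijun-sr-1.wav ./sr-data/enroll/leijun-sr-2.wav \
  result.txt \
  ./sr-data/test/fangjun-test-sr-1.wav ./sr-data/test/leijun-test-sr-1.wav ./sr-data/test/liudehua-test-sr-1.wav
```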
Showing 4 changed files with 684 additions and 331 deletions.
c-api-examples/README_usage.md
0 → 100644
# VAD + SenseVoice C API Usage

## Features
This program does the following:
1. Splits the audio with VAD (voice activity detection)
2. Runs speech recognition on each segment with the SenseVoice model
3. Saves the transcription results to a txt file
4. Reports the CPU cost of the run

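If you want to see the shape of the pipeline before reading the full source, the sketch below shows how the example feeds audio to the VAD window by window and decodes each detected segment with SenseVoice. It is a trimmed illustration, not the shipped program: it hard-codes the Silero VAD and the file names used in this README, and it omits error handling and the CPU/transcription-file bookkeeping. See vad-sense-voice-c-api.c for the complete version.

```c
// Trimmed sketch of the VAD + SenseVoice pipeline used by this example.
#include <stdio.h>
#include <string.h>

#include "sherpa-onnx/c-api/c-api.h"

int32_t main() {
  const SherpaOnnxWave *wave = SherpaOnnxReadWave("./lei-jun-test.wav");

  // SenseVoice recognizer configuration
  SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config;
  memset(&sense_voice_config, 0, sizeof(sense_voice_config));
  sense_voice_config.model =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx";
  sense_voice_config.language = "auto";
  sense_voice_config.use_itn = 1;

  SherpaOnnxOfflineModelConfig model_config;
  memset(&model_config, 0, sizeof(model_config));
  model_config.num_threads = 1;
  model_config.provider = "cpu";
  model_config.tokens =
      "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
  model_config.sense_voice = sense_voice_config;

  SherpaOnnxOfflineRecognizerConfig recognizer_config;
  memset(&recognizer_config, 0, sizeof(recognizer_config));
  recognizer_config.decoding_method = "greedy_search";
  recognizer_config.model_config = model_config;

  const SherpaOnnxOfflineRecognizer *recognizer =
      SherpaOnnxCreateOfflineRecognizer(&recognizer_config);

  // Silero VAD configuration (16 kHz input, 512-sample windows)
  SherpaOnnxVadModelConfig vad_config;
  memset(&vad_config, 0, sizeof(vad_config));
  vad_config.silero_vad.model = "./silero_vad.onnx";
  vad_config.silero_vad.threshold = 0.25;
  vad_config.silero_vad.min_silence_duration = 0.5;
  vad_config.silero_vad.min_speech_duration = 0.5;
  vad_config.silero_vad.max_speech_duration = 10;
  vad_config.silero_vad.window_size = 512;
  vad_config.sample_rate = 16000;
  vad_config.num_threads = 1;

  const SherpaOnnxVoiceActivityDetector *vad =
      SherpaOnnxCreateVoiceActivityDetector(&vad_config, 30);

  int32_t window_size = vad_config.silero_vad.window_size;
  int32_t i = 0;
  int32_t is_eof = 0;

  while (!is_eof) {
    // Feed one window of samples to the VAD; flush when the file is done.
    if (i + window_size < wave->num_samples) {
      SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i,
                                                    window_size);
    } else {
      SherpaOnnxVoiceActivityDetectorFlush(vad);
      is_eof = 1;
    }

    // Decode every speech segment the VAD has produced so far.
    while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {
      const SherpaOnnxSpeechSegment *segment =
          SherpaOnnxVoiceActivityDetectorFront(vad);

      const SherpaOnnxOfflineStream *stream =
          SherpaOnnxCreateOfflineStream(recognizer);
      SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate,
                                      segment->samples, segment->n);
      SherpaOnnxDecodeOfflineStream(recognizer, stream);

      const SherpaOnnxOfflineRecognizerResult *result =
          SherpaOnnxGetOfflineStreamResult(stream);
      float start = segment->start / 16000.0f;
      float stop = start + segment->n / 16000.0f;
      printf("%.3f -- %.3f: %s\n", start, stop, result->text);

      SherpaOnnxDestroyOfflineRecognizerResult(result);
      SherpaOnnxDestroyOfflineStream(stream);
      SherpaOnnxDestroySpeechSegment(segment);
      SherpaOnnxVoiceActivityDetectorPop(vad);
    }
    i += window_size;
  }

  SherpaOnnxDestroyOfflineRecognizer(recognizer);
  SherpaOnnxDestroyVoiceActivityDetector(vad);
  SherpaOnnxFreeWave(wave);
  return 0;
}
```
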
## Required files
Download the following before running:

### 1. Audio file
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
```

### 2. VAD model (choose one)
#### Option 1: Silero VAD
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
```

#### Option 2: TEN VAD
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
```

### 3. SenseVoice model
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
```

## Building

### Windows
1. Make sure GCC (e.g. MinGW) or MSVC is installed
2. Double-click `build_windows.bat`
3. Or run it from the command line:
```bash
build_windows.bat
```

### Linux/macOS
```bash
make vad-sense-voice-c-api
```

## Running
```bash
./vad-sense-voice-c-api.exe   # Windows
./vad-sense-voice-c-api       # Linux/macOS
```

## Output
After the program finishes it writes:
- `transcription_result.txt`: all transcription results plus performance statistics

### Example file content
```
音频转录结果:
================
片段 1 (0.000-2.500秒): 大家好,我是雷军
片段 2 (3.000-5.200秒): 今天很高兴见到大家

性能统计:
================
总执行时间: 1.234 秒
CPU时间: 0.987 秒
CPU使用率: 80.0%
转录片段数: 2
```

## Troubleshooting

### Build errors
- **Windows**: make sure MinGW or Visual Studio is installed
- **Linux**: make sure build-essential is installed
- **macOS**: make sure the Xcode command line tools are installed

### Runtime errors
- Check that all required files exist
- Make sure the model file paths are correct
- Check the audio file format (a 16 kHz sample rate is required)
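
If a recording is not already 16 kHz, you can resample it first, for example with ffmpeg (assuming ffmpeg is installed; sox works equally well). Converting to mono at the same time is the safest choice:

```bash
ffmpeg -i input.wav -ar 16000 -ac 1 output.wav
```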
c-api-examples/speaker-identification-c-api.c
@@ -2,24 +2,6 @@
| 2 | // | 2 | // |
| 3 | // Copyright (c) 2024 Xiaomi Corporation | 3 | // Copyright (c) 2024 Xiaomi Corporation |
| 4 | 4 | ||
| 5 | -// We assume you have pre-downloaded the speaker embedding extractor model | ||
| 6 | -// from | ||
| 7 | -// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 8 | -// | ||
| 9 | -// An example command to download | ||
| 10 | -// "3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx" | ||
| 11 | -// is given below: | ||
| 12 | -// | ||
| 13 | -// clang-format off | ||
| 14 | -// | ||
| 15 | -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx | ||
| 16 | -// | ||
| 17 | -// clang-format on | ||
| 18 | -// | ||
| 19 | -// Also, please download the test wave files from | ||
| 20 | -// | ||
| 21 | -// https://github.com/csukuangfj/sr-data | ||
| 22 | - | ||
| 23 | #include <stdio.h> | 5 | #include <stdio.h> |
| 24 | #include <stdlib.h> | 6 | #include <stdlib.h> |
| 25 | #include <string.h> | 7 | #include <string.h> |
@@ -46,26 +28,88 @@ static const float *ComputeEmbedding(
| 46 | exit(-1); | 28 | exit(-1); |
| 47 | } | 29 | } |
| 48 | 30 | ||
| 49 | - // we will free `v` outside of this function | ||
| 50 | const float *v = | 31 | const float *v = |
| 51 | SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream); | 32 | SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream); |
| 52 | 33 | ||
| 53 | SherpaOnnxDestroyOnlineStream(stream); | 34 | SherpaOnnxDestroyOnlineStream(stream); |
| 54 | SherpaOnnxFreeWave(wave); | 35 | SherpaOnnxFreeWave(wave); |
| 55 | 36 | ||
| 56 | - // Remeber to free v to avoid memory leak | ||
| 57 | return v; | 37 | return v; |
| 58 | } | 38 | } |
| 59 | 39 | ||
| 60 | -int32_t main() { | ||
| 61 | - SherpaOnnxSpeakerEmbeddingExtractorConfig config; | 40 | +void PrintUsage(const char *program_name) { |
| 41 | + fprintf(stderr, "Usage: %s <model_path> <threshold> <speaker1_name> <speaker1_wav1> [speaker1_wav2] [speaker1_wav3] <speaker2_name> <speaker2_wav1> [speaker2_wav2] [speaker2_wav3] <output_file> <test_wav1> <test_wav2> ...\n", program_name); | ||
| 42 | + fprintf(stderr, "Example: %s 3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx 0.6 fangjun ./sr-data/enroll/fangjun-sr-1.wav ./sr-data/enroll/fangjun-sr-2.wav ./sr-data/enroll/fangjun-sr-3.wav leijun ./sr-data/enroll/leijun-sr-1.wav ./sr-data/enroll/leijun-sr-2.wav result.txt ./sr-data/test/fangjun-test-sr-1.wav ./sr-data/test/leijun-test-sr-1.wav ./sr-data/test/liudehua-test-sr-1.wav\n", program_name); | ||
| 43 | +} | ||
| 62 | 44 | ||
| 63 | - memset(&config, 0, sizeof(config)); | 45 | +int32_t main(int32_t argc, char *argv[]) { |
| 46 | + if (argc < 7) { | ||
| 47 | + PrintUsage(argv[0]); | ||
| 48 | + return -1; | ||
| 49 | + } | ||
| 64 | 50 | ||
| 65 | - // please download the model from | ||
| 66 | - // https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 67 | - config.model = "./3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"; | 51 | + // Parse command line arguments |
| 52 | + const char *model_path = argv[1]; | ||
| 53 | + float threshold = atof(argv[2]); | ||
| 54 | + | ||
| 55 | + // Find the position of output file and test files | ||
| 56 | + int32_t output_file_index = -1; | ||
| 57 | + for (int32_t i = 3; i < argc; i++) { | ||
| 58 | + if (strstr(argv[i], ".txt") != NULL) { | ||
| 59 | + output_file_index = i; | ||
| 60 | + break; | ||
| 61 | + } | ||
| 62 | + } | ||
| 63 | + | ||
| 64 | + if (output_file_index == -1 || output_file_index >= argc - 1) { | ||
| 65 | + fprintf(stderr, "Output file not found or no test files provided\n"); | ||
| 66 | + PrintUsage(argv[0]); | ||
| 67 | + return -1; | ||
| 68 | + } | ||
| 69 | + | ||
| 70 | + const char *output_file = argv[output_file_index]; | ||
| 71 | + int32_t num_test_files = argc - output_file_index - 1; | ||
| 72 | + const char **test_files = (const char **)&argv[output_file_index + 1]; | ||
| 73 | + | ||
| 74 | + // Parse speaker information | ||
| 75 | + int32_t num_speakers = 0; | ||
| 76 | + const char *speaker_names[10] = {NULL}; | ||
| 77 | + const char *speaker_files[10][4] = {NULL}; | ||
| 78 | + int32_t speaker_file_counts[10] = {0}; | ||
| 79 | + | ||
| 80 | + int32_t current_index = 3; | ||
| 81 | + while (current_index < output_file_index && num_speakers < 10) { | ||
| 82 | + // Speaker name | ||
| 83 | + speaker_names[num_speakers] = argv[current_index++]; | ||
| 84 | + | ||
| 85 | + // Speaker wave files | ||
| 86 | + int32_t file_count = 0; | ||
| 87 | + while (current_index < output_file_index && | ||
| 88 | + strstr(argv[current_index], ".wav") != NULL && | ||
| 89 | + file_count < 4) { | ||
| 90 | + speaker_files[num_speakers][file_count++] = argv[current_index++]; | ||
| 91 | + } | ||
| 92 | + | ||
| 93 | + speaker_file_counts[num_speakers] = file_count; | ||
| 94 | + num_speakers++; | ||
| 95 | + } | ||
| 96 | + | ||
| 97 | + // Open output file | ||
| 98 | + FILE *fp = fopen(output_file, "w"); | ||
| 99 | + if (!fp) { | ||
| 100 | + fprintf(stderr, "Failed to open output file: %s\n", output_file); | ||
| 101 | + return -1; | ||
| 102 | + } | ||
| 103 | + | ||
| 104 | + fprintf(fp, "Speaker Identification Results\n"); | ||
| 105 | + fprintf(fp, "Model: %s\n", model_path); | ||
| 106 | + fprintf(fp, "Threshold: %.2f\n", threshold); | ||
| 107 | + fprintf(fp, "========================================\n"); | ||
| 68 | 108 | ||
| 109 | + // Initialize speaker embedding extractor | ||
| 110 | + SherpaOnnxSpeakerEmbeddingExtractorConfig config; | ||
| 111 | + memset(&config, 0, sizeof(config)); | ||
| 112 | + config.model = model_path; | ||
| 69 | config.num_threads = 1; | 113 | config.num_threads = 1; |
| 70 | config.debug = 0; | 114 | config.debug = 0; |
| 71 | config.provider = "cpu"; | 115 | config.provider = "cpu"; |
@@ -74,184 +118,76 @@ int32_t main() {
| 74 | SherpaOnnxCreateSpeakerEmbeddingExtractor(&config); | 118 | SherpaOnnxCreateSpeakerEmbeddingExtractor(&config); |
| 75 | if (!ex) { | 119 | if (!ex) { |
| 76 | fprintf(stderr, "Failed to create speaker embedding extractor"); | 120 | fprintf(stderr, "Failed to create speaker embedding extractor"); |
| 121 | + fclose(fp); | ||
| 77 | return -1; | 122 | return -1; |
| 78 | } | 123 | } |
| 79 | 124 | ||
| 80 | int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex); | 125 | int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex); |
| 81 | - | ||
| 82 | const SherpaOnnxSpeakerEmbeddingManager *manager = | 126 | const SherpaOnnxSpeakerEmbeddingManager *manager = |
| 83 | SherpaOnnxCreateSpeakerEmbeddingManager(dim); | 127 | SherpaOnnxCreateSpeakerEmbeddingManager(dim); |
| 84 | 128 | ||
| 85 | - // Please download the test data from | ||
| 86 | - // https://github.com/csukuangfj/sr-data | ||
| 87 | - const char *spk1_1 = "./sr-data/enroll/fangjun-sr-1.wav"; | ||
| 88 | - const char *spk1_2 = "./sr-data/enroll/fangjun-sr-2.wav"; | ||
| 89 | - const char *spk1_3 = "./sr-data/enroll/fangjun-sr-3.wav"; | ||
| 90 | - | ||
| 91 | - const char *spk2_1 = "./sr-data/enroll/leijun-sr-1.wav"; | ||
| 92 | - const char *spk2_2 = "./sr-data/enroll/leijun-sr-2.wav"; | ||
| 93 | - | ||
| 94 | - const float *spk1_vec[4] = {NULL}; | ||
| 95 | - spk1_vec[0] = ComputeEmbedding(ex, spk1_1); | ||
| 96 | - spk1_vec[1] = ComputeEmbedding(ex, spk1_2); | ||
| 97 | - spk1_vec[2] = ComputeEmbedding(ex, spk1_3); | ||
| 98 | - | ||
| 99 | - const float *spk2_vec[3] = {NULL}; | ||
| 100 | - spk2_vec[0] = ComputeEmbedding(ex, spk2_1); | ||
| 101 | - spk2_vec[1] = ComputeEmbedding(ex, spk2_2); | ||
| 102 | - | ||
| 103 | - if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "fangjun", spk1_vec)) { | ||
| 104 | - fprintf(stderr, "Failed to register fangjun\n"); | ||
| 105 | - exit(-1); | ||
| 106 | - } | ||
| 107 | - | ||
| 108 | - if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "fangjun")) { | ||
| 109 | - fprintf(stderr, "Failed to find fangjun\n"); | ||
| 110 | - exit(-1); | ||
| 111 | - } | ||
| 112 | - | ||
| 113 | - if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "leijun", spk2_vec)) { | ||
| 114 | - fprintf(stderr, "Failed to register leijun\n"); | ||
| 115 | - exit(-1); | ||
| 116 | - } | ||
| 117 | - | ||
| 118 | - if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "leijun")) { | ||
| 119 | - fprintf(stderr, "Failed to find leijun\n"); | ||
| 120 | - exit(-1); | ||
| 121 | - } | ||
| 122 | - | ||
| 123 | - if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 2) { | ||
| 124 | - fprintf(stderr, "There should be two speakers: fangjun and leijun\n"); | ||
| 125 | - exit(-1); | ||
| 126 | - } | ||
| 127 | - | ||
| 128 | - const char *const *all_speakers = | ||
| 129 | - SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager); | ||
| 130 | - const char *const *p = all_speakers; | ||
| 131 | - fprintf(stderr, "list of registered speakers\n-----\n"); | ||
| 132 | - while (p[0]) { | ||
| 133 | - fprintf(stderr, "speaker: %s\n", p[0]); | ||
| 134 | - ++p; | ||
| 135 | - } | ||
| 136 | - fprintf(stderr, "----\n"); | ||
| 137 | - | ||
| 138 | - SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers); | ||
| 139 | - | ||
| 140 | - const char *test1 = "./sr-data/test/fangjun-test-sr-1.wav"; | ||
| 141 | - const char *test2 = "./sr-data/test/leijun-test-sr-1.wav"; | ||
| 142 | - const char *test3 = "./sr-data/test/liudehua-test-sr-1.wav"; | ||
| 143 | - | ||
| 144 | - const float *v1 = ComputeEmbedding(ex, test1); | ||
| 145 | - const float *v2 = ComputeEmbedding(ex, test2); | ||
| 146 | - const float *v3 = ComputeEmbedding(ex, test3); | ||
| 147 | - | ||
| 148 | - float threshold = 0.6; | ||
| 149 | - | ||
| 150 | - const char *name1 = | ||
| 151 | - SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold); | ||
| 152 | - if (name1) { | ||
| 153 | - fprintf(stderr, "%s: Found %s\n", test1, name1); | ||
| 154 | - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1); | ||
| 155 | - } else { | ||
| 156 | - fprintf(stderr, "%s: Not found\n", test1); | ||
| 157 | - } | ||
| 158 | - | ||
| 159 | - const char *name2 = | ||
| 160 | - SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold); | ||
| 161 | - if (name2) { | ||
| 162 | - fprintf(stderr, "%s: Found %s\n", test2, name2); | ||
| 163 | - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2); | ||
| 164 | - } else { | ||
| 165 | - fprintf(stderr, "%s: Not found\n", test2); | ||
| 166 | - } | ||
| 167 | - | ||
| 168 | - const char *name3 = | ||
| 169 | - SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v3, threshold); | ||
| 170 | - if (name3) { | ||
| 171 | - fprintf(stderr, "%s: Found %s\n", test3, name3); | ||
| 172 | - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name3); | ||
| 173 | - } else { | ||
| 174 | - fprintf(stderr, "%s: Not found\n", test3); | ||
| 175 | - } | ||
| 176 | - | ||
| 177 | - int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v1, | ||
| 178 | - threshold); | ||
| 179 | - if (ok) { | ||
| 180 | - fprintf(stderr, "%s matches fangjun\n", test1); | ||
| 181 | - } else { | ||
| 182 | - fprintf(stderr, "%s does NOT match fangjun\n", test1); | ||
| 183 | - } | ||
| 184 | - | ||
| 185 | - ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v2, | ||
| 186 | - threshold); | ||
| 187 | - if (ok) { | ||
| 188 | - fprintf(stderr, "%s matches fangjun\n", test2); | ||
| 189 | - } else { | ||
| 190 | - fprintf(stderr, "%s does NOT match fangjun\n", test2); | ||
| 191 | - } | ||
| 192 | - | ||
| 193 | - fprintf(stderr, "Removing fangjun\n"); | ||
| 194 | - if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "fangjun")) { | ||
| 195 | - fprintf(stderr, "Failed to remove fangjun\n"); | ||
| 196 | - exit(-1); | ||
| 197 | - } | ||
| 198 | - | ||
| 199 | - if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 1) { | ||
| 200 | - fprintf(stderr, "There should be only 1 speaker left\n"); | ||
| 201 | - exit(-1); | ||
| 202 | - } | ||
| 203 | - | ||
| 204 | - name1 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold); | ||
| 205 | - if (name1) { | ||
| 206 | - fprintf(stderr, "%s: Found %s\n", test1, name1); | ||
| 207 | - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1); | ||
| 208 | - } else { | ||
| 209 | - fprintf(stderr, "%s: Not found\n", test1); | ||
| 210 | - } | ||
| 211 | - | ||
| 212 | - fprintf(stderr, "Removing leijun\n"); | ||
| 213 | - if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "leijun")) { | ||
| 214 | - fprintf(stderr, "Failed to remove leijun\n"); | ||
| 215 | - exit(-1); | ||
| 216 | - } | ||
| 217 | - | ||
| 218 | - if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 0) { | ||
| 219 | - fprintf(stderr, "There should be only 1 speaker left\n"); | ||
| 220 | - exit(-1); | ||
| 221 | - } | ||
| 222 | - | ||
| 223 | - name2 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold); | ||
| 224 | - if (name2) { | ||
| 225 | - fprintf(stderr, "%s: Found %s\n", test2, name2); | ||
| 226 | - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2); | ||
| 227 | - } else { | ||
| 228 | - fprintf(stderr, "%s: Not found\n", test2); | ||
| 229 | - } | ||
| 230 | - | ||
| 231 | - all_speakers = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager); | ||
| 232 | - | ||
| 233 | - p = all_speakers; | ||
| 234 | - fprintf(stderr, "list of registered speakers\n-----\n"); | ||
| 235 | - while (p[0]) { | ||
| 236 | - fprintf(stderr, "speaker: %s\n", p[0]); | ||
| 237 | - ++p; | ||
| 238 | - } | ||
| 239 | - fprintf(stderr, "----\n"); | ||
| 240 | - | ||
| 241 | - SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers); | ||
| 242 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v1); | ||
| 243 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v2); | ||
| 244 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v3); | ||
| 245 | - | ||
| 246 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[0]); | ||
| 247 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[1]); | ||
| 248 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[2]); | ||
| 249 | - | ||
| 250 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[0]); | ||
| 251 | - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[1]); | ||
| 252 | - | 129 | + // Register speakers |
| 130 | + for (int32_t i = 0; i < num_speakers; i++) { | ||
| 131 | + const float *embeddings[4] = {NULL}; | ||
| 132 | + int32_t count = speaker_file_counts[i]; | ||
| 133 | + | ||
| 134 | + for (int32_t j = 0; j < count; j++) { | ||
| 135 | + embeddings[j] = ComputeEmbedding(ex, speaker_files[i][j]); | ||
| 136 | + } | ||
| 137 | + | ||
| 138 | + if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, speaker_names[i], embeddings)) { | ||
| 139 | + fprintf(stderr, "Failed to register %s\n", speaker_names[i]); | ||
| 140 | + fprintf(fp, "Failed to register %s\n", speaker_names[i]); | ||
| 141 | + fclose(fp); | ||
| 142 | + exit(-1); | ||
| 143 | + } | ||
| 144 | + | ||
| 145 | + for (int32_t j = 0; j < count; j++) { | ||
| 146 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(embeddings[j]); | ||
| 147 | + } | ||
| 148 | + | ||
| 149 | + fprintf(stderr, "Registered speaker: %s with %d wave files\n", speaker_names[i], count); | ||
| 150 | + fprintf(fp, "Registered speaker: %s with %d wave files\n", speaker_names[i], count); | ||
| 151 | + } | ||
| 152 | + | ||
| 153 | + fprintf(fp, "\nTest Results:\n"); | ||
| 154 | + fprintf(fp, "========================================\n"); | ||
| 155 | + | ||
| 156 | + // Process test files | ||
| 157 | + for (int32_t i = 0; i < num_test_files; i++) { | ||
| 158 | + const char *test_file = test_files[i]; | ||
| 159 | + const float *v = ComputeEmbedding(ex, test_file); | ||
| 160 | + | ||
| 161 | + const char *name = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v, threshold); | ||
| 162 | + | ||
| 163 | + fprintf(stderr, "Testing %s: ", test_file); | ||
| 164 | + fprintf(fp, "Test file: %s\n", test_file); | ||
| 165 | + | ||
| 166 | + if (name) { | ||
| 167 | + fprintf(stderr, "Found %s\n", name); | ||
| 168 | + fprintf(fp, " Result: Found speaker - %s\n", name); | ||
| 169 | + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name); | ||
| 170 | + } else { | ||
| 171 | + fprintf(stderr, "Not found\n"); | ||
| 172 | + fprintf(fp, " Result: Speaker not found\n"); | ||
| 173 | + } | ||
| 174 | + | ||
| 175 | + // Verify against all registered speakers | ||
| 176 | + for (int32_t j = 0; j < num_speakers; j++) { | ||
| 177 | + int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, speaker_names[j], v, threshold); | ||
| 178 | + fprintf(fp, " Verify with %s: %s\n", speaker_names[j], ok ? "MATCH" : "NO MATCH"); | ||
| 179 | + } | ||
| 180 | + | ||
| 181 | + fprintf(fp, "\n"); | ||
| 182 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v); | ||
| 183 | + } | ||
| 184 | + | ||
| 185 | + // Cleanup | ||
| 253 | SherpaOnnxDestroySpeakerEmbeddingManager(manager); | 186 | SherpaOnnxDestroySpeakerEmbeddingManager(manager); |
| 254 | SherpaOnnxDestroySpeakerEmbeddingExtractor(ex); | 187 | SherpaOnnxDestroySpeakerEmbeddingExtractor(ex); |
| 188 | + fclose(fp); | ||
| 189 | + | ||
| 190 | + fprintf(stderr, "Results saved to: %s\n", output_file); | ||
| 255 | 191 | ||
| 256 | return 0; | 192 | return 0; |
| 257 | -} | 193 | +} |
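Pieced together from the fprintf(fp, ...) calls above, the result file written by the refactored example has roughly this shape; the speaker names come from the sample invocation and the MATCH/NO MATCH outcomes are illustrative, not actual results:

```
Speaker Identification Results
Model: 3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx
Threshold: 0.60
========================================
Registered speaker: fangjun with 3 wave files
Registered speaker: leijun with 2 wave files

Test Results:
========================================
Test file: ./sr-data/test/fangjun-test-sr-1.wav
  Result: Found speaker - fangjun
  Verify with fangjun: MATCH
  Verify with leijun: NO MATCH

Test file: ./sr-data/test/liudehua-test-sr-1.wav
  Result: Speaker not found
  Verify with fangjun: NO MATCH
  Verify with leijun: NO MATCH
```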
c-api-examples/vad-sense-voice-c-api.c
@@ -23,164 +23,486 @@
| 23 | #include <stdio.h> | 23 | #include <stdio.h> |
| 24 | #include <stdlib.h> | 24 | #include <stdlib.h> |
| 25 | #include <string.h> | 25 | #include <string.h> |
| 26 | +#include <ctype.h> | ||
| 27 | +#include <wchar.h> | ||
| 28 | +#include <locale.h> | ||
| 29 | +#include <stdbool.h> | ||
| 30 | +#include <stdint.h> | ||
| 26 | 31 | ||
| 27 | #include "sherpa-onnx/c-api/c-api.h" | 32 | #include "sherpa-onnx/c-api/c-api.h" |
| 28 | 33 | ||
| 29 | -int32_t main() { | ||
| 30 | - const char *wav_filename = "./lei-jun-test.wav"; | ||
| 31 | - if (!SherpaOnnxFileExists(wav_filename)) { | ||
| 32 | - fprintf(stderr, "Please download %s\n", wav_filename); | ||
| 33 | - return -1; | ||
| 34 | - } | 34 | +// Function to normalize string: remove punctuation and spaces, convert to lowercase |
| 35 | +void normalize_string(const char* input, char* output) { | ||
| 36 | + int i = 0, j = 0; | ||
| 37 | + while (input[i] != '\0') { | ||
| 38 | + // Skip punctuation characters and spaces (both English and Chinese) | ||
| 39 | + if (!ispunct((unsigned char)input[i]) && | ||
| 40 | + !isspace((unsigned char)input[i]) && | ||
| 41 | + !(input[i] >= 0x3000 && input[i] <= 0x303F) && // CJK punctuation | ||
| 42 | + !(input[i] >= 0xFF00 && input[i] <= 0xFF0F) && // Fullwidth forms | ||
| 43 | + !(input[i] >= 0xFF1A && input[i] <= 0xFF20) && // Fullwidth forms | ||
| 44 | + !(input[i] >= 0xFF3B && input[i] <= 0xFF40) && // Fullwidth forms | ||
| 45 | + !(input[i] >= 0xFF5B && input[i] <= 0xFF65)) { // Fullwidth forms | ||
| 46 | + | ||
| 47 | + // Convert to lowercase and add to output | ||
| 48 | + output[j++] = tolower((unsigned char)input[i]); | ||
| 49 | + } | ||
| 50 | + i++; | ||
| 51 | + } | ||
| 52 | + output[j] = '\0'; | ||
| 53 | +} | ||
| 35 | 54 | ||
| 36 | - const char *vad_filename; | ||
| 37 | - int32_t use_silero_vad = 0; | ||
| 38 | - int32_t use_ten_vad = 0; | ||
| 39 | - | ||
| 40 | - if (SherpaOnnxFileExists("./silero_vad.onnx")) { | ||
| 41 | - printf("Use silero-vad\n"); | ||
| 42 | - vad_filename = "./silero_vad.onnx"; | ||
| 43 | - use_silero_vad = 1; | ||
| 44 | - } else if (SherpaOnnxFileExists("./ten-vad.onnx")) { | ||
| 45 | - printf("Use ten-vad\n"); | ||
| 46 | - vad_filename = "./ten-vad.onnx"; | ||
| 47 | - use_ten_vad = 1; | ||
| 48 | - } else { | ||
| 49 | - fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n"); | ||
| 50 | - return -1; | ||
| 51 | - } | 55 | +// Function to get the first meaningful character (non-punctuation, non-space) |
| 56 | +char get_first_meaningful_char(const char* str) { | ||
| 57 | + int i = 0; | ||
| 58 | + while (str[i] != '\0') { | ||
| 59 | + if (!ispunct((unsigned char)str[i]) && | ||
| 60 | + !isspace((unsigned char)str[i]) && | ||
| 61 | + !(str[i] >= 0x3000 && str[i] <= 0x303F) && | ||
| 62 | + !(str[i] >= 0xFF00 && str[i] <= 0xFF0F) && | ||
| 63 | + !(str[i] >= 0xFF1A && str[i] <= 0xFF20) && | ||
| 64 | + !(str[i] >= 0xFF3B && str[i] <= 0xFF40) && | ||
| 65 | + !(str[i] >= 0xFF5B && str[i] <= 0xFF65)) { | ||
| 66 | + return tolower((unsigned char)str[i]); | ||
| 67 | + } | ||
| 68 | + i++; | ||
| 69 | + } | ||
| 70 | + return '\0'; | ||
| 71 | +} | ||
| 52 | 72 | ||
| 53 | - const char *model_filename = | ||
| 54 | - "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; | ||
| 55 | - const char *tokens_filename = | ||
| 56 | - "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; | ||
| 57 | - const char *language = "auto"; | ||
| 58 | - const char *provider = "cpu"; | ||
| 59 | - int32_t use_inverse_text_normalization = 1; | ||
| 60 | - | ||
| 61 | - const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); | ||
| 62 | - if (wave == NULL) { | ||
| 63 | - fprintf(stderr, "Failed to read %s\n", wav_filename); | ||
| 64 | - return -1; | ||
| 65 | - } | 73 | +// Function to check if two strings are effectively the same after normalization |
| 74 | +int are_strings_effectively_same(const char* str1, const char* str2) { | ||
| 75 | + char norm1[1024], norm2[1024]; | ||
| 76 | + normalize_string(str1, norm1); | ||
| 77 | + normalize_string(str2, norm2); | ||
| 78 | + return strcmp(norm1, norm2) == 0; | ||
| 79 | +} | ||
| 66 | 80 | ||
| 67 | - if (wave->sample_rate != 16000) { | ||
| 68 | - fprintf(stderr, "Expect the sample rate to be 16000. Given: %d\n", | ||
| 69 | - wave->sample_rate); | ||
| 70 | - SherpaOnnxFreeWave(wave); | ||
| 71 | - return -1; | ||
| 72 | - } | 81 | +// 判断是否为 CJK 统一表意字符 |
| 82 | +static bool is_cjk_ideograph(uint32_t ch) | ||
| 83 | +{ | ||
| 84 | + return (ch >= 0x4E00 && ch <= 0x9FFF) || // CJK Unified Ideographs | ||
| 85 | + (ch >= 0x3400 && ch <= 0x4DBF) || // CJK Extension A | ||
| 86 | + (ch >= 0x20000 && ch <= 0x2A6DF) || // CJK Extension B | ||
| 87 | + (ch >= 0x2A700 && ch <= 0x2B73F) || // CJK Extension C | ||
| 88 | + (ch >= 0x2B740 && ch <= 0x2B81F) || // CJK Extension D | ||
| 89 | + (ch >= 0x2B820 && ch <= 0x2CEAF) || // CJK Extension E | ||
| 90 | + (ch >= 0x2CEB0 && ch <= 0x2EBEF) || // CJK Extension F | ||
| 91 | + (ch >= 0x3007 && ch <= 0x3007) || // 〇 | ||
| 92 | + (ch >= 0x3021 && ch <= 0x3029) || // 〡〢〣〤〥〦〧〨〩 | ||
| 93 | + (ch >= 0x3038 && ch <= 0x303B); // 〸〹〺〻〼 | ||
| 94 | +} | ||
| 73 | 95 | ||
| 74 | - SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config; | ||
| 75 | - memset(&sense_voice_config, 0, sizeof(sense_voice_config)); | ||
| 76 | - sense_voice_config.model = model_filename; | ||
| 77 | - sense_voice_config.language = language; | ||
| 78 | - sense_voice_config.use_itn = use_inverse_text_normalization; | ||
| 79 | - | ||
| 80 | - // Offline model config | ||
| 81 | - SherpaOnnxOfflineModelConfig offline_model_config; | ||
| 82 | - memset(&offline_model_config, 0, sizeof(offline_model_config)); | ||
| 83 | - offline_model_config.debug = 0; | ||
| 84 | - offline_model_config.num_threads = 1; | ||
| 85 | - offline_model_config.provider = provider; | ||
| 86 | - offline_model_config.tokens = tokens_filename; | ||
| 87 | - offline_model_config.sense_voice = sense_voice_config; | ||
| 88 | - | ||
| 89 | - // Recognizer config | ||
| 90 | - SherpaOnnxOfflineRecognizerConfig recognizer_config; | ||
| 91 | - memset(&recognizer_config, 0, sizeof(recognizer_config)); | ||
| 92 | - recognizer_config.decoding_method = "greedy_search"; | ||
| 93 | - recognizer_config.model_config = offline_model_config; | ||
| 94 | - | ||
| 95 | - const SherpaOnnxOfflineRecognizer *recognizer = | ||
| 96 | - SherpaOnnxCreateOfflineRecognizer(&recognizer_config); | ||
| 97 | - | ||
| 98 | - if (recognizer == NULL) { | ||
| 99 | - fprintf(stderr, "Please check your recognizer config!\n"); | ||
| 100 | - SherpaOnnxFreeWave(wave); | ||
| 101 | - return -1; | ||
| 102 | - } | 96 | +// 反向解码一个 UTF-8 字符,返回其长度(字节)和码点 |
| 97 | +static int prev_utf8_char(const char *s, int pos, uint32_t *out_ch) | ||
| 98 | +{ | ||
| 99 | + int start = pos; | ||
| 100 | + // 找到当前字符起始字节 | ||
| 101 | + while (start > 0 && (s[start] & 0xC0) == 0x80) | ||
| 102 | + --start; | ||
| 103 | + // 解码 | ||
| 104 | + const unsigned char *p = (const unsigned char *)&s[start]; | ||
| 105 | + if ((*p & 0x80) == 0) { // 1-byte | ||
| 106 | + *out_ch = *p; | ||
| 107 | + } else if ((*p & 0xE0) == 0xC0) { // 2-byte | ||
| 108 | + *out_ch = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F); | ||
| 109 | + } else if ((*p & 0xF0) == 0xE0) { // 3-byte | ||
| 110 | + *out_ch = ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F); | ||
| 111 | + } else if ((*p & 0xF8) == 0xF0) { // 4-byte | ||
| 112 | + *out_ch = ((p[0] & 0x07) << 18) | ((p[1] & 0x3F) << 12) | | ||
| 113 | + ((p[2] & 0x3F) << 6) | (p[3] & 0x3F); | ||
| 114 | + } else { | ||
| 115 | + *out_ch = 0xFFFD; // 非法序列,用替换字符 | ||
| 116 | + } | ||
| 117 | + return pos - start + 1; // 返回字节长度 | ||
| 118 | +} | ||
| 103 | 119 | ||
| 104 | - SherpaOnnxVadModelConfig vadConfig; | ||
| 105 | - memset(&vadConfig, 0, sizeof(vadConfig)); | ||
| 106 | - | ||
| 107 | - if (use_silero_vad) { | ||
| 108 | - vadConfig.silero_vad.model = vad_filename; | ||
| 109 | - vadConfig.silero_vad.threshold = 0.25; | ||
| 110 | - vadConfig.silero_vad.min_silence_duration = 0.5; | ||
| 111 | - vadConfig.silero_vad.min_speech_duration = 0.5; | ||
| 112 | - vadConfig.silero_vad.max_speech_duration = 10; | ||
| 113 | - vadConfig.silero_vad.window_size = 512; | ||
| 114 | - } else if (use_ten_vad) { | ||
| 115 | - vadConfig.ten_vad.model = vad_filename; | ||
| 116 | - vadConfig.ten_vad.threshold = 0.25; | ||
| 117 | - vadConfig.ten_vad.min_silence_duration = 0.5; | ||
| 118 | - vadConfig.ten_vad.min_speech_duration = 0.5; | ||
| 119 | - vadConfig.ten_vad.max_speech_duration = 10; | ||
| 120 | - vadConfig.ten_vad.window_size = 256; | ||
| 121 | - } | 120 | +// 新实现:按“中日文单字 / 英文整词”取最后 n 个语义单元 |
| 121 | +void get_last_n_words(const char *str, int n, char *output) | ||
| 122 | +{ | ||
| 123 | + if (!str || !output || n <= 0) { | ||
| 124 | + *output = '\0'; | ||
| 125 | + return; | ||
| 126 | + } | ||
| 122 | 127 | ||
| 123 | - vadConfig.sample_rate = 16000; | ||
| 124 | - vadConfig.num_threads = 1; | ||
| 125 | - vadConfig.debug = 1; | 128 | + int len = strlen(str); |
| 129 | + if (len == 0) { | ||
| 130 | + *output = '\0'; | ||
| 131 | + return; | ||
| 132 | + } | ||
| 126 | 133 | ||
| 127 | - const SherpaOnnxVoiceActivityDetector *vad = | ||
| 128 | - SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); | 134 | + // 用来存反向收集到的单元 |
| 135 | + char units[256][256]; | ||
| 136 | + int unit_cnt = 0; | ||
| 137 | + | ||
| 138 | + int pos = len; // 从 '\0' 前一个位置开始 | ||
| 139 | + while (pos > 0 && unit_cnt < n) { | ||
| 140 | + uint32_t ch; | ||
| 141 | + int char_len = prev_utf8_char(str, pos - 1, &ch); | ||
| 142 | + pos -= char_len; | ||
| 143 | + | ||
| 144 | + if (ch < 128 && ((ch | 32) - 'a' < 26)) { | ||
| 145 | + // ===== 英文单词 ===== | ||
| 146 | + int word_end = pos + char_len; | ||
| 147 | + int word_start = pos; | ||
| 148 | + // 向前找单词起始 | ||
| 149 | + while (word_start > 0) { | ||
| 150 | + uint32_t tmp; | ||
| 151 | + int tmp_len = prev_utf8_char(str, word_start - 1, &tmp); | ||
| 152 | + if (tmp < 128 && ((tmp | 32) - 'a' < 26)) | ||
| 153 | + word_start -= tmp_len; | ||
| 154 | + else | ||
| 155 | + break; | ||
| 156 | + } | ||
| 157 | + // 拷贝整个单词 | ||
| 158 | + int wlen = word_end - word_start; | ||
| 159 | + if (wlen >= (int)sizeof(units[unit_cnt])) wlen = sizeof(units[unit_cnt]) - 1; | ||
| 160 | + memcpy(units[unit_cnt], str + word_start, wlen); | ||
| 161 | + units[unit_cnt][wlen] = '\0'; | ||
| 162 | + ++unit_cnt; | ||
| 163 | + pos = word_start; // 继续向前扫描 | ||
| 164 | + } else if (is_cjk_ideograph(ch) || ch > 0xFF00) { | ||
| 165 | + // ===== CJK 或全角符号 ===== | ||
| 166 | + if (char_len >= (int)sizeof(units[unit_cnt])) char_len = sizeof(units[unit_cnt]) - 1; | ||
| 167 | + memcpy(units[unit_cnt], str + pos, char_len); | ||
| 168 | + units[unit_cnt][char_len] = '\0'; | ||
| 169 | + ++unit_cnt; | ||
| 170 | + } | ||
| 171 | + // 其他标点/空格直接跳过 | ||
| 172 | + } | ||
| 129 | 173 | ||
| 130 | - if (vad == NULL) { | ||
| 131 | - fprintf(stderr, "Please check your recognizer config!\n"); | ||
| 132 | - SherpaOnnxFreeWave(wave); | ||
| 133 | - SherpaOnnxDestroyOfflineRecognizer(recognizer); | ||
| 134 | - return -1; | 174 | + // 反向拼回 output |
| 175 | + output[0] = '\0'; | ||
| 176 | + for (int i = unit_cnt - 1; i >= 0; --i) { | ||
| 177 | + if (i < unit_cnt - 1) strcat(output, " "); | ||
| 178 | + strcat(output, units[i]); | ||
| 179 | + } | ||
| 180 | +} | ||
| 181 | + | ||
| 182 | +// 在第二个字符串中查找锚点文本的位置 | ||
| 183 | +const char *find_anchor_end_position(const char *str, const char *anchor) { | ||
| 184 | + if (!anchor || !*anchor) return str; | ||
| 185 | + | ||
| 186 | + char normalized_str[1024] = {0}; | ||
| 187 | + char normalized_anchor[1024] = {0}; | ||
| 188 | + | ||
| 189 | + // 规范化两个字符串 | ||
| 190 | + normalize_string(str, normalized_str); | ||
| 191 | + normalize_string(anchor, normalized_anchor); | ||
| 192 | + | ||
| 193 | + // 在规范化后的字符串中查找锚点 | ||
| 194 | + char *found = strstr(normalized_str, normalized_anchor); | ||
| 195 | + if (!found) return str; // 如果找不到锚点,返回整个字符串 | ||
| 196 | + | ||
| 197 | + // 计算锚点的结束位置 | ||
| 198 | + int anchor_end_offset = found - normalized_str + strlen(normalized_anchor); | ||
| 199 | + | ||
| 200 | + // 计算在原始字符串中的对应位置 | ||
| 201 | + int normalized_count = 0; | ||
| 202 | + const char *ptr = str; | ||
| 203 | + | ||
| 204 | + while (*ptr != '\0' && normalized_count < anchor_end_offset) { | ||
| 205 | + if (!ispunct((unsigned char)*ptr) && !isspace((unsigned char)*ptr)) { | ||
| 206 | + normalized_count++; | ||
| 207 | + } | ||
| 208 | + ptr++; | ||
| 135 | } | 209 | } |
| 210 | + | ||
| 211 | + return ptr; | ||
| 212 | +} | ||
| 213 | +// 找到下一个单词的开始位置 | ||
| 214 | +const char *find_next_word_start(const char *str) { | ||
| 215 | + // 跳过所有标点和空格 | ||
| 216 | + while (*str != '\0' && | ||
| 217 | + (ispunct((unsigned char)*str) || isspace((unsigned char)*str))) { | ||
| 218 | + str++; | ||
| 219 | + } | ||
| 220 | + return str; | ||
| 221 | +} | ||
| 136 | 222 | ||
| 137 | - int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size | ||
| 138 | - : vadConfig.ten_vad.window_size; | ||
| 139 | - int32_t i = 0; | ||
| 140 | - int is_eof = 0; | 223 | +// 获取基于锚点的差异文本(从锚点后的第一个完整单词开始) |
| 224 | +char *get_difference_after_anchor(const char *str1, const char *str2, int num_anchor_words) { | ||
| 225 | + if (are_strings_effectively_same(str1, str2)) { | ||
| 226 | + return strdup(""); | ||
| 227 | + } | ||
| 141 | 228 | ||
| 142 | - while (!is_eof) { | ||
| 143 | - if (i + window_size < wave->num_samples) { | ||
| 144 | - SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, | ||
| 145 | - window_size); | ||
| 146 | - } else { | ||
| 147 | - SherpaOnnxVoiceActivityDetectorFlush(vad); | ||
| 148 | - is_eof = 1; | 229 | + // 获取语义单元级的锚点文本 |
| 230 | + char semantic_anchor[256] = {0}; | ||
| 231 | + get_last_n_words(str1, num_anchor_words, semantic_anchor); | ||
| 232 | + | ||
| 233 | + if (strlen(semantic_anchor) == 0) { | ||
| 234 | + return strdup(str2); | ||
| 149 | } | 235 | } |
| 150 | 236 | ||
| 151 | - while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { | ||
| 152 | - const SherpaOnnxSpeechSegment *segment = | ||
| 153 | - SherpaOnnxVoiceActivityDetectorFront(vad); | 237 | + // 关键:对语义锚点再做一次字符级规范化,匹配 find_anchor_end_position 的行为 |
| 238 | + char normalized_anchor[256] = {0}; | ||
| 239 | + normalize_string(semantic_anchor, normalized_anchor); | ||
| 154 | 240 | ||
| 155 | - const SherpaOnnxOfflineStream *stream = | ||
| 156 | - SherpaOnnxCreateOfflineStream(recognizer); | 241 | + // 使用规范化后的锚点查找位置 |
| 242 | + const char *anchor_end = find_anchor_end_position(str2, normalized_anchor); | ||
| 243 | + const char *next_word_start = find_next_word_start(anchor_end); | ||
| 157 | 244 | ||
| 158 | - SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, | ||
| 159 | - segment->samples, segment->n); | 245 | + return strdup(next_word_start); |
| 246 | +} | ||
| 160 | 247 | ||
| 161 | - SherpaOnnxDecodeOfflineStream(recognizer, stream); | 248 | +// Structure to store previous segment information |
| 249 | +typedef struct { | ||
| 250 | + float* samples; | ||
| 251 | + int32_t n; | ||
| 252 | + int32_t start; | ||
| 253 | + char* text; | ||
| 254 | +} PreviousSegment; | ||
| 255 | + | ||
| 256 | +void free_previous_segment(PreviousSegment* seg) { | ||
| 257 | + if (seg) { | ||
| 258 | + if (seg->samples) free(seg->samples); | ||
| 259 | + if (seg->text) free(seg->text); | ||
| 260 | + free(seg); | ||
| 261 | + } | ||
| 262 | +} | ||
| 162 | 263 | ||
| 163 | - const SherpaOnnxOfflineRecognizerResult *result = | ||
| 164 | - SherpaOnnxGetOfflineStreamResult(stream); | 264 | +PreviousSegment* copy_segment(const SherpaOnnxSpeechSegment* segment, const char* text) { |
| 265 | + PreviousSegment* prev = (PreviousSegment*)malloc(sizeof(PreviousSegment)); | ||
| 266 | + if (!prev) return NULL; | ||
| 267 | + | ||
| 268 | + prev->n = segment->n; | ||
| 269 | + prev->start = segment->start; | ||
| 270 | + prev->samples = (float*)malloc(segment->n * sizeof(float)); | ||
| 271 | + if (!prev->samples) { | ||
| 272 | + free(prev); | ||
| 273 | + return NULL; | ||
| 274 | + } | ||
| 275 | + memcpy(prev->samples, segment->samples, segment->n * sizeof(float)); | ||
| 276 | + | ||
| 277 | + prev->text = strdup(text); | ||
| 278 | + if (!prev->text) { | ||
| 279 | + free(prev->samples); | ||
| 280 | + free(prev); | ||
| 281 | + return NULL; | ||
| 282 | + } | ||
| 283 | + | ||
| 284 | + return prev; | ||
| 285 | +} | ||
| 165 | 286 | ||
| 166 | - float start = segment->start / 16000.0f; | ||
| 167 | - float duration = segment->n / 16000.0f; | ||
| 168 | - float stop = start + duration; | 287 | +int32_t main() { |
| 288 | + setlocale(LC_ALL, ""); // Set locale for wide character handling | ||
| 169 | 289 | ||
| 170 | - fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); | 290 | + const char *wav_filename = "./lei-jun-test.wav"; |
| 291 | + if (!SherpaOnnxFileExists(wav_filename)) { | ||
| 292 | + fprintf(stderr, "Please download %s\n", wav_filename); | ||
| 293 | + return -1; | ||
| 294 | + } | ||
| 171 | 295 | ||
| 172 | - SherpaOnnxDestroyOfflineRecognizerResult(result); | ||
| 173 | - SherpaOnnxDestroyOfflineStream(stream); | 296 | + const char *vad_filename; |
| 297 | + int32_t use_silero_vad = 0; | ||
| 298 | + int32_t use_ten_vad = 0; | ||
| 299 | + | ||
| 300 | + if (SherpaOnnxFileExists("./silero_vad.onnx")) { | ||
| 301 | + printf("Use silero-vad\n"); | ||
| 302 | + vad_filename = "./silero_vad.onnx"; | ||
| 303 | + use_silero_vad = 1; | ||
| 304 | + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) { | ||
| 305 | + printf("Use ten-vad\n"); | ||
| 306 | + vad_filename = "./ten-vad.onnx"; | ||
| 307 | + use_ten_vad = 1; | ||
| 308 | + } else { | ||
| 309 | + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n"); | ||
| 310 | + return -1; | ||
| 311 | + } | ||
| 174 | 312 | ||
| 175 | - SherpaOnnxDestroySpeechSegment(segment); | ||
| 176 | - SherpaOnnxVoiceActivityDetectorPop(vad); | 313 | + const char *model_filename = |
| 314 | + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx"; | ||
| 315 | + const char *tokens_filename = | ||
| 316 | + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; | ||
| 317 | + const char *language = "auto"; | ||
| 318 | + const char *provider = "cpu"; | ||
| 319 | + int32_t use_inverse_text_normalization = 1; | ||
| 320 | + | ||
| 321 | + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); | ||
| 322 | + if (wave == NULL) { | ||
| 323 | + fprintf(stderr, "Failed to read %s\n", wav_filename); | ||
| 324 | + return -1; | ||
| 177 | } | 325 | } |
| 178 | - i += window_size; | ||
| 179 | - } | ||
| 180 | 326 | ||
| 181 | - SherpaOnnxDestroyOfflineRecognizer(recognizer); | ||
| 182 | - SherpaOnnxDestroyVoiceActivityDetector(vad); | ||
| 183 | - SherpaOnnxFreeWave(wave); | 327 | + if (wave->sample_rate != 16000) { |
| 328 | + fprintf(stderr, "Expect the sample rate to be 16000. Given: %d\n", | ||
| 329 | + wave->sample_rate); | ||
| 330 | + SherpaOnnxFreeWave(wave); | ||
| 331 | + return -1; | ||
| 332 | + } | ||
| 333 | + | ||
| 334 | + SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config; | ||
| 335 | + memset(&sense_voice_config, 0, sizeof(sense_voice_config)); | ||
| 336 | + sense_voice_config.model = model_filename; | ||
| 337 | + sense_voice_config.language = language; | ||
| 338 | + sense_voice_config.use_itn = use_inverse_text_normalization; | ||
| 339 | + | ||
| 340 | + // Offline model config | ||
| 341 | + SherpaOnnxOfflineModelConfig offline_model_config; | ||
| 342 | + memset(&offline_model_config, 0, sizeof(offline_model_config)); | ||
| 343 | + offline_model_config.debug = 0; | ||
| 344 | + offline_model_config.num_threads = 1; | ||
| 345 | + offline_model_config.provider = provider; | ||
| 346 | + offline_model_config.tokens = tokens_filename; | ||
| 347 | + offline_model_config.sense_voice = sense_voice_config; | ||
| 348 | + | ||
| 349 | + // Recognizer config | ||
| 350 | + SherpaOnnxOfflineRecognizerConfig recognizer_config; | ||
| 351 | + memset(&recognizer_config, 0, sizeof(recognizer_config)); | ||
| 352 | + recognizer_config.decoding_method = "greedy_search"; | ||
| 353 | + recognizer_config.model_config = offline_model_config; | ||
| 354 | + | ||
| 355 | + const SherpaOnnxOfflineRecognizer *recognizer = | ||
| 356 | + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); | ||
| 357 | + | ||
| 358 | + if (recognizer == NULL) { | ||
| 359 | + fprintf(stderr, "Please check your recognizer config!\n"); | ||
| 360 | + SherpaOnnxFreeWave(wave); | ||
| 361 | + return -1; | ||
| 362 | + } | ||
| 363 | + | ||
| 364 | + SherpaOnnxVadModelConfig vadConfig; | ||
| 365 | + memset(&vadConfig, 0, sizeof(vadConfig)); | ||
| 366 | + | ||
| 367 | + if (use_silero_vad) { | ||
| 368 | + vadConfig.silero_vad.model = vad_filename; | ||
| 369 | + vadConfig.silero_vad.threshold = 0.25; | ||
| 370 | + vadConfig.silero_vad.min_silence_duration = 1.5; | ||
| 371 | + vadConfig.silero_vad.min_speech_duration = 0.3; | ||
| 372 | + vadConfig.silero_vad.max_speech_duration = 20; | ||
| 373 | + vadConfig.silero_vad.window_size = 512; | ||
| 374 | + } else if (use_ten_vad) { | ||
| 375 | + vadConfig.ten_vad.model = vad_filename; | ||
| 376 | + vadConfig.ten_vad.threshold = 0.25; | ||
| 377 | + vadConfig.ten_vad.min_silence_duration = 0.5; | ||
| 378 | + vadConfig.ten_vad.min_speech_duration = 0.5; | ||
| 379 | + vadConfig.ten_vad.max_speech_duration = 10; | ||
| 380 | + vadConfig.ten_vad.window_size = 256; | ||
| 381 | + } | ||
| 382 | + | ||
| 383 | + vadConfig.sample_rate = 16000; | ||
| 384 | + vadConfig.num_threads = 1; | ||
| 385 | + vadConfig.debug = 1; | ||
| 386 | + | ||
| 387 | + const SherpaOnnxVoiceActivityDetector *vad = | ||
| 388 | + SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); | ||
| 389 | + | ||
| 390 | + if (vad == NULL) { | ||
| 391 | + fprintf(stderr, "Please check your recognizer config!\n"); | ||
| 392 | + SherpaOnnxFreeWave(wave); | ||
| 393 | + SherpaOnnxDestroyOfflineRecognizer(recognizer); | ||
| 394 | + return -1; | ||
| 395 | + } | ||
| 396 | + | ||
| 397 | + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size | ||
| 398 | + : vadConfig.ten_vad.window_size; | ||
| 399 | + int32_t i = 0; | ||
| 400 | + int is_eof = 0; | ||
| 401 | + | ||
| 402 | + // Variables to store previous segment information | ||
| 403 | + PreviousSegment *prev_segment = NULL; | ||
| 404 | + | ||
| 405 | + while (!is_eof) { | ||
| 406 | + if (i + window_size < wave->num_samples) { | ||
| 407 | + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, | ||
| 408 | + window_size); | ||
| 409 | + } else { | ||
| 410 | + SherpaOnnxVoiceActivityDetectorFlush(vad); | ||
| 411 | + is_eof = 1; | ||
| 412 | + } | ||
| 413 | + | ||
| 414 | + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { | ||
| 415 | + const SherpaOnnxSpeechSegment *segment = | ||
| 416 | + SherpaOnnxVoiceActivityDetectorFront(vad); | ||
| 417 | + | ||
| 418 | + float duration = segment->n / 16000.0f; | ||
| 419 | + | ||
| 420 | + // Process the current segment | ||
| 421 | + const SherpaOnnxOfflineStream *stream = | ||
| 422 | + SherpaOnnxCreateOfflineStream(recognizer); | ||
| 423 | + | ||
| 424 | + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, | ||
| 425 | + segment->samples, segment->n); | ||
| 426 | + SherpaOnnxDecodeOfflineStream(recognizer, stream); | ||
| 427 | + const SherpaOnnxOfflineRecognizerResult *result = | ||
| 428 | + SherpaOnnxGetOfflineStreamResult(stream); | ||
| 429 | + | ||
| 430 | + float start = segment->start / 16000.0f; | ||
| 431 | + float stop = start + duration; | ||
| 432 | + | ||
| 433 | + if (duration < 1.5f && prev_segment != NULL) { | ||
| 434 | + // Current segment is shorter than 1.5 seconds and we have a previous segment | ||
| 435 | + // Merge with previous segment | ||
| 436 | + | ||
| 437 | + // Create merged samples | ||
| 438 | + int32_t merged_n = prev_segment->n + segment->n; | ||
| 439 | + float *merged_samples = (float*)malloc(merged_n * sizeof(float)); | ||
| 440 | + memcpy(merged_samples, prev_segment->samples, prev_segment->n * sizeof(float)); | ||
| 441 | + memcpy(merged_samples + prev_segment->n, segment->samples, segment->n * sizeof(float)); | ||
| 442 | + | ||
| 443 | + // Create stream for merged segment | ||
| 444 | + const SherpaOnnxOfflineStream *merged_stream = | ||
| 445 | + SherpaOnnxCreateOfflineStream(recognizer); | ||
| 446 | + SherpaOnnxAcceptWaveformOffline(merged_stream, wave->sample_rate, | ||
| 447 | + merged_samples, merged_n); | ||
| 448 | + SherpaOnnxDecodeOfflineStream(recognizer, merged_stream); | ||
| 449 | + const SherpaOnnxOfflineRecognizerResult *merged_result = | ||
| 450 | + SherpaOnnxGetOfflineStreamResult(merged_stream); | ||
| 451 | + | ||
| 452 | + // Get the meaningful difference starting from first character | ||
| 453 | + char *diff_text = get_difference_after_anchor(prev_segment->text, merged_result->text, 3); | ||
| 454 | + | ||
| 455 | + if (strlen(diff_text) == 0) { | ||
| 456 | + fprintf(stderr, "%.3f -- %.3f: %s (short segment, no meaningful difference)\n", | ||
| 457 | + start, stop, merged_result->text); | ||
| 458 | + } else { | ||
| 459 | + fprintf(stderr, "%.3f -- %.3f: %s (short segment, meaningful diff: %s)\n", | ||
| 460 | + start, stop, merged_result->text, diff_text); | ||
| 461 | + } | ||
| 462 | + | ||
| 463 | + // Don't update prev_segment for short segments (requirement 1) | ||
| 464 | + // Only update if the current segment is >= 1 second | ||
| 465 | + | ||
| 466 | + SherpaOnnxDestroyOfflineRecognizerResult(merged_result); | ||
| 467 | + SherpaOnnxDestroyOfflineStream(merged_stream); | ||
| 468 | + free(merged_samples); | ||
| 469 | + free(diff_text); | ||
| 470 | + | ||
| 471 | + } else { | ||
| 472 | + // Normal processing for segments >= 1.5 seconds | ||
| 473 | + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); | ||
| 474 | + | ||
| 475 | + // Store current segment and result only if duration >= 1.5 seconds (requirement 1) | ||
| 476 | + if (duration >= 1.5f) { | ||
| 477 | + if (prev_segment != NULL) { | ||
| 478 | + free_previous_segment(prev_segment); | ||
| 479 | + } | ||
| 480 | + prev_segment = copy_segment(segment, result->text); | ||
| 481 | + } else { | ||
| 482 | + // Short segment, don't store as previous | ||
| 483 | + if (prev_segment != NULL) { | ||
| 484 | + free_previous_segment(prev_segment); | ||
| 485 | + prev_segment = NULL; | ||
| 486 | + } | ||
| 487 | + } | ||
| 488 | + } | ||
| 489 | + | ||
| 490 | + SherpaOnnxDestroyOfflineRecognizerResult(result); | ||
| 491 | + SherpaOnnxDestroyOfflineStream(stream); | ||
| 492 | + SherpaOnnxDestroySpeechSegment(segment); | ||
| 493 | + SherpaOnnxVoiceActivityDetectorPop(vad); | ||
| 494 | + } | ||
| 495 | + i += window_size; | ||
| 496 | + } | ||
| 497 | + | ||
| 498 | + // Clean up | ||
| 499 | + if (prev_segment != NULL) { | ||
| 500 | + free_previous_segment(prev_segment); | ||
| 501 | + } | ||
| 502 | + | ||
| 503 | + SherpaOnnxDestroyOfflineRecognizer(recognizer); | ||
| 504 | + SherpaOnnxDestroyVoiceActivityDetector(vad); | ||
| 505 | + SherpaOnnxFreeWave(wave); | ||
| 184 | 506 | ||
| 185 | - return 0; | 507 | + return 0; |
| 186 | } | 508 | } |
CMake script (Windows system information detection)
@@ -63,8 +63,17 @@ elseif(WIN32)
| 63 | # Now SHERPA_ONNX_OS_TWO_LINES contains something like | 63 | # Now SHERPA_ONNX_OS_TWO_LINES contains something like |
| 64 | # Caption Version | 64 | # Caption Version |
| 65 | # Microsoft Windows 10 Pro 10.0.18362 | 65 | # Microsoft Windows 10 Pro 10.0.18362 |
| 66 | - string(REPLACE "\n" ";" SHERPA_ONNX_OS_LIST ${SHERPA_ONNX_OS_TWO_LINES}) | ||
| 67 | - list(GET SHERPA_ONNX_OS_LIST 1 SHERPA_ONNX_OS) | 66 | + if(SHERPA_ONNX_OS_TWO_LINES) |
| 67 | + string(REPLACE "\n" ";" SHERPA_ONNX_OS_LIST "${SHERPA_ONNX_OS_TWO_LINES}") | ||
| 68 | + list(LENGTH SHERPA_ONNX_OS_LIST _list_length) | ||
| 69 | + if(_list_length GREATER 1) | ||
| 70 | + list(GET SHERPA_ONNX_OS_LIST 1 SHERPA_ONNX_OS) | ||
| 71 | + else() | ||
| 72 | + set(SHERPA_ONNX_OS "Windows") | ||
| 73 | + endif() | ||
| 74 | + else() | ||
| 75 | + set(SHERPA_ONNX_OS "Windows") | ||
| 76 | + endif() | ||
| 68 | else() | 77 | else() |
| 69 | set(SHERPA_ONNX_OS "Unknown") | 78 | set(SHERPA_ONNX_OS "Unknown") |
| 70 | endif() | 79 | endif() |
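For context: SHERPA_ONNX_OS_TWO_LINES normally comes from querying wmic, which is deprecated and can be missing or return nothing on newer Windows installations, which is exactly the case the added fallback covers. The query itself is not part of this hunk; it typically looks something like the hypothetical sketch below (not the repository's exact command):

```cmake
# Hypothetical sketch: query Caption and Version via wmic.
# If wmic is unavailable the variable stays empty and the guarded code
# above falls back to setting SHERPA_ONNX_OS to "Windows".
execute_process(
  COMMAND wmic os get caption,version
  OUTPUT_VARIABLE SHERPA_ONNX_OS_TWO_LINES
  OUTPUT_STRIP_TRAILING_WHITESPACE
)
```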