xuning

feat(c-api): enhance speech recognition features and add usage documentation

1. Refactor speaker-identification-c-api.c to support configuration via command-line arguments
2. Add smart segment merging and text-difference detection to vad-sense-voice-c-api.c
3. Add README_usage.md with detailed usage instructions and feature descriptions
4. Make the CMake script's retrieval of Windows system information more robust

New features include:
- Multi-speaker registration and identification (example invocation below)
- Smart merging of short speech segments
- Normalized text comparison
- Complete API usage documentation
- More robust system information retrieval
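For reference, the refactored speaker-identification-c-api is now driven entirely by command-line arguments. The sketch below mirrors the example printed by the program's own PrintUsage; the binary name (./speaker-identification-c-api) is assumed from the source file name, and the model/data paths are those used in that example:

```bash
# Register two speakers from enrollment wavs, then identify each test wav
# against them; results are written to result.txt.
./speaker-identification-c-api \
  3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx \
  0.6 \
  fangjun ./sr-data/enroll/fangjun-sr-1.wav ./sr-data/enroll/fangjun-sr-2.wav ./sr-data/enroll/fangjun-sr-3.wav \
  leijun ./sr-data/enroll/leijun-sr-1.wav ./sr-data/enroll/leijun-sr-2.wav \
  result.txt \
  ./sr-data/test/fangjun-test-sr-1.wav ./sr-data/test/leijun-test-sr-1.wav ./sr-data/test/liudehua-test-sr-1.wav
```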
  1 +# VAD + SenseVoice C API Usage Guide
  2 +
  3 +## Features
  4 +This program does the following:
  5 +1. Splits the audio with VAD (voice activity detection)
  6 +2. Runs speech recognition with the SenseVoice model
  7 +3. Saves the transcription results to a txt file
  8 +4. Reports the CPU consumption of the run
  9 +
  10 +## Required Files
  11 +Download the following files before running:
  12 +
  13 +### 1. Audio file
  14 +```bash
  15 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
  16 +```
  17 +
  18 +### 2. VAD model (choose one)
  19 +#### Option 1: Silero VAD
  20 +```bash
  21 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  22 +```
  23 +
  24 +#### Option 2: Ten VAD
  25 +```bash
  26 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
  27 +```
  28 +
  29 +### 3. SenseVoice model
  30 +```bash
  31 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  32 +tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  33 +rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  34 +```
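After extraction, the program looks for the model files at the paths hard-coded in vad-sense-voice-c-api.c. The listing below is a sketch of the assumed directory layout; verify it against what the archive actually contains:

```bash
ls sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/
# Expected (assumed) entries used by the program:
#   model.onnx    # SenseVoice model referenced by the C code
#   tokens.txt    # token list referenced by the C code
```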
  35 +
  36 +## Building
  37 +
  38 +### Windows
  39 +1. Make sure GCC (e.g. MinGW) or MSVC is installed
  40 +2. Double-click `build_windows.bat` to run it
  41 +3. Or run it from the command line:
  42 +```bash
  43 +build_windows.bat
  44 +```
  45 +
  46 +### Linux/macOS
  47 +```bash
  48 +make vad-sense-voice-c-api
  49 +```
  50 +
  51 +## Running the Program
  52 +```bash
  53 +./vad-sense-voice-c-api.exe # Windows
  54 +./vad-sense-voice-c-api # Linux/macOS
  55 +```
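The per-segment transcriptions are printed to stderr as they are decoded. If you also want to keep that console output, an optional redirect works; `vad-console.log` below is just an illustrative file name:

```bash
./vad-sense-voice-c-api 2> vad-console.log   # Linux/macOS
```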
  56 +
  57 +## Output Files
  58 +After running, the program produces:
  59 +- `transcription_result.txt`: all transcription results plus performance statistics
  60 +
  61 +### Example file contents
  62 +```
  63 +Audio transcription results:
  64 +================
  65 +Segment 1 (0.000-2.500 s): 大家好,我是雷军
  66 +Segment 2 (3.000-5.200 s): 今天很高兴见到大家
  67 +
  68 +Performance statistics:
  69 +================
  70 +Total execution time: 1.234 s
  71 +CPU time: 0.987 s
  72 +CPU usage: 80.0%
  73 +Number of transcribed segments: 2
  74 +```
  75 +
  76 +## Troubleshooting
  77 +
  78 +### Build errors
  79 +- **Windows**: make sure MinGW or Visual Studio is installed
  80 +- **Linux**: make sure build-essential is installed
  81 +- **macOS**: make sure the Xcode Command Line Tools are installed
  82 +
  83 +### Runtime errors
  84 +- Check that all required files exist (a quick check sketch follows below)
  85 +- Make sure the model file paths are correct
  86 +- Check the audio file format (a 16 kHz sample rate is required)
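Before running, it can help to confirm that every required file from the sections above is present. A minimal sketch (file names taken from this README; adjust the VAD model name if you use ten-vad.onnx):

```bash
# Check that the audio file, a VAD model, and the SenseVoice model files exist.
for f in lei-jun-test.wav \
         silero_vad.onnx \
         sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \
         sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt; do
  [ -f "$f" ] && echo "OK      $f" || echo "MISSING $f"
done
```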
@@ -2,24 +2,6 @@
2 // 2 //
3 // Copyright (c) 2024 Xiaomi Corporation 3 // Copyright (c) 2024 Xiaomi Corporation
4 4
5 -// We assume you have pre-downloaded the speaker embedding extractor model  
6 -// from  
7 -// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models  
8 -//  
9 -// An example command to download  
10 -// "3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"  
11 -// is given below:  
12 -//  
13 -// clang-format off  
14 -//  
15 -// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx  
16 -//  
17 -// clang-format on  
18 -//  
19 -// Also, please download the test wave files from  
20 -//  
21 -// https://github.com/csukuangfj/sr-data  
22 -  
23 #include <stdio.h> 5 #include <stdio.h>
24 #include <stdlib.h> 6 #include <stdlib.h>
25 #include <string.h> 7 #include <string.h>
@@ -46,26 +28,88 @@ static const float *ComputeEmbedding(
46 exit(-1); 28 exit(-1);
47 } 29 }
48 30
49 - // we will free `v` outside of this function  
50 const float *v = 31 const float *v =
51 SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream); 32 SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream);
52 33
53 SherpaOnnxDestroyOnlineStream(stream); 34 SherpaOnnxDestroyOnlineStream(stream);
54 SherpaOnnxFreeWave(wave); 35 SherpaOnnxFreeWave(wave);
55 36
56 - // Remeber to free v to avoid memory leak  
57 return v; 37 return v;
58 } 38 }
59 39
60 -int32_t main() {  
61 - SherpaOnnxSpeakerEmbeddingExtractorConfig config; 40 +void PrintUsage(const char *program_name) {
  41 + fprintf(stderr, "Usage: %s <model_path> <threshold> <speaker1_name> <speaker1_wav1> [speaker1_wav2] [speaker1_wav3] <speaker2_name> <speaker2_wav1> [speaker2_wav2] [speaker2_wav3] <output_file> <test_wav1> <test_wav2> ...\n", program_name);
  42 + fprintf(stderr, "Example: %s 3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx 0.6 fangjun ./sr-data/enroll/fangjun-sr-1.wav ./sr-data/enroll/fangjun-sr-2.wav ./sr-data/enroll/fangjun-sr-3.wav leijun ./sr-data/enroll/leijun-sr-1.wav ./sr-data/enroll/leijun-sr-2.wav result.txt ./sr-data/test/fangjun-test-sr-1.wav ./sr-data/test/leijun-test-sr-1.wav ./sr-data/test/liudehua-test-sr-1.wav\n", program_name);
  43 +}
62 44
63 - memset(&config, 0, sizeof(config)); 45 +int32_t main(int32_t argc, char *argv[]) {
  46 + if (argc < 7) {
  47 + PrintUsage(argv[0]);
  48 + return -1;
  49 + }
64 50
65 - // please download the model from  
66 - // https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models  
67 - config.model = "./3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"; 51 + // Parse command line arguments
  52 + const char *model_path = argv[1];
  53 + float threshold = atof(argv[2]);
  54 +
  55 + // Find the position of output file and test files
  56 + int32_t output_file_index = -1;
  57 + for (int32_t i = 3; i < argc; i++) {
  58 + if (strstr(argv[i], ".txt") != NULL) {
  59 + output_file_index = i;
  60 + break;
  61 + }
  62 + }
  63 +
  64 + if (output_file_index == -1 || output_file_index >= argc - 1) {
  65 + fprintf(stderr, "Output file not found or no test files provided\n");
  66 + PrintUsage(argv[0]);
  67 + return -1;
  68 + }
  69 +
  70 + const char *output_file = argv[output_file_index];
  71 + int32_t num_test_files = argc - output_file_index - 1;
  72 + const char **test_files = (const char **)&argv[output_file_index + 1];
  73 +
  74 + // Parse speaker information
  75 + int32_t num_speakers = 0;
  76 + const char *speaker_names[10] = {NULL};
  77 + const char *speaker_files[10][4] = {NULL};
  78 + int32_t speaker_file_counts[10] = {0};
  79 +
  80 + int32_t current_index = 3;
  81 + while (current_index < output_file_index && num_speakers < 10) {
  82 + // Speaker name
  83 + speaker_names[num_speakers] = argv[current_index++];
  84 +
  85 + // Speaker wave files
  86 + int32_t file_count = 0;
  87 + while (current_index < output_file_index &&
  88 + strstr(argv[current_index], ".wav") != NULL &&
  89 + file_count < 4) {
  90 + speaker_files[num_speakers][file_count++] = argv[current_index++];
  91 + }
  92 +
  93 + speaker_file_counts[num_speakers] = file_count;
  94 + num_speakers++;
  95 + }
  96 +
  97 + // Open output file
  98 + FILE *fp = fopen(output_file, "w");
  99 + if (!fp) {
  100 + fprintf(stderr, "Failed to open output file: %s\n", output_file);
  101 + return -1;
  102 + }
  103 +
  104 + fprintf(fp, "Speaker Identification Results\n");
  105 + fprintf(fp, "Model: %s\n", model_path);
  106 + fprintf(fp, "Threshold: %.2f\n", threshold);
  107 + fprintf(fp, "========================================\n");
68 108
  109 + // Initialize speaker embedding extractor
  110 + SherpaOnnxSpeakerEmbeddingExtractorConfig config;
  111 + memset(&config, 0, sizeof(config));
  112 + config.model = model_path;
69 config.num_threads = 1; 113 config.num_threads = 1;
70 config.debug = 0; 114 config.debug = 0;
71 config.provider = "cpu"; 115 config.provider = "cpu";
@@ -74,184 +118,76 @@ int32_t main() {
74 SherpaOnnxCreateSpeakerEmbeddingExtractor(&config); 118 SherpaOnnxCreateSpeakerEmbeddingExtractor(&config);
75 if (!ex) { 119 if (!ex) {
76 fprintf(stderr, "Failed to create speaker embedding extractor"); 120 fprintf(stderr, "Failed to create speaker embedding extractor");
  121 + fclose(fp);
77 return -1; 122 return -1;
78 } 123 }
79 124
80 int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex); 125 int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex);
81 -  
82 const SherpaOnnxSpeakerEmbeddingManager *manager = 126 const SherpaOnnxSpeakerEmbeddingManager *manager =
83 SherpaOnnxCreateSpeakerEmbeddingManager(dim); 127 SherpaOnnxCreateSpeakerEmbeddingManager(dim);
84 128
85 - // Please download the test data from  
86 - // https://github.com/csukuangfj/sr-data  
87 - const char *spk1_1 = "./sr-data/enroll/fangjun-sr-1.wav";  
88 - const char *spk1_2 = "./sr-data/enroll/fangjun-sr-2.wav";  
89 - const char *spk1_3 = "./sr-data/enroll/fangjun-sr-3.wav";  
90 -  
91 - const char *spk2_1 = "./sr-data/enroll/leijun-sr-1.wav";  
92 - const char *spk2_2 = "./sr-data/enroll/leijun-sr-2.wav";  
93 -  
94 - const float *spk1_vec[4] = {NULL};  
95 - spk1_vec[0] = ComputeEmbedding(ex, spk1_1);  
96 - spk1_vec[1] = ComputeEmbedding(ex, spk1_2);  
97 - spk1_vec[2] = ComputeEmbedding(ex, spk1_3);  
98 -  
99 - const float *spk2_vec[3] = {NULL};  
100 - spk2_vec[0] = ComputeEmbedding(ex, spk2_1);  
101 - spk2_vec[1] = ComputeEmbedding(ex, spk2_2);  
102 -  
103 - if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "fangjun", spk1_vec)) {  
104 - fprintf(stderr, "Failed to register fangjun\n");  
105 - exit(-1);  
106 - }  
107 -  
108 - if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "fangjun")) {  
109 - fprintf(stderr, "Failed to find fangjun\n");  
110 - exit(-1);  
111 - }  
112 -  
113 - if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "leijun", spk2_vec)) {  
114 - fprintf(stderr, "Failed to register leijun\n");  
115 - exit(-1);  
116 - }  
117 -  
118 - if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "leijun")) {  
119 - fprintf(stderr, "Failed to find leijun\n");  
120 - exit(-1);  
121 - }  
122 -  
123 - if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 2) {  
124 - fprintf(stderr, "There should be two speakers: fangjun and leijun\n");  
125 - exit(-1);  
126 - }  
127 -  
128 - const char *const *all_speakers =  
129 - SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager);  
130 - const char *const *p = all_speakers;  
131 - fprintf(stderr, "list of registered speakers\n-----\n");  
132 - while (p[0]) {  
133 - fprintf(stderr, "speaker: %s\n", p[0]);  
134 - ++p;  
135 - }  
136 - fprintf(stderr, "----\n");  
137 -  
138 - SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers);  
139 -  
140 - const char *test1 = "./sr-data/test/fangjun-test-sr-1.wav";  
141 - const char *test2 = "./sr-data/test/leijun-test-sr-1.wav";  
142 - const char *test3 = "./sr-data/test/liudehua-test-sr-1.wav";  
143 -  
144 - const float *v1 = ComputeEmbedding(ex, test1);  
145 - const float *v2 = ComputeEmbedding(ex, test2);  
146 - const float *v3 = ComputeEmbedding(ex, test3);  
147 -  
148 - float threshold = 0.6;  
149 -  
150 - const char *name1 =  
151 - SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold);  
152 - if (name1) {  
153 - fprintf(stderr, "%s: Found %s\n", test1, name1);  
154 - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1);  
155 - } else {  
156 - fprintf(stderr, "%s: Not found\n", test1);  
157 - }  
158 -  
159 - const char *name2 =  
160 - SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold);  
161 - if (name2) {  
162 - fprintf(stderr, "%s: Found %s\n", test2, name2);  
163 - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2);  
164 - } else {  
165 - fprintf(stderr, "%s: Not found\n", test2);  
166 - }  
167 -  
168 - const char *name3 =  
169 - SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v3, threshold);  
170 - if (name3) {  
171 - fprintf(stderr, "%s: Found %s\n", test3, name3);  
172 - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name3);  
173 - } else {  
174 - fprintf(stderr, "%s: Not found\n", test3);  
175 - }  
176 -  
177 - int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v1,  
178 - threshold);  
179 - if (ok) {  
180 - fprintf(stderr, "%s matches fangjun\n", test1);  
181 - } else {  
182 - fprintf(stderr, "%s does NOT match fangjun\n", test1);  
183 - }  
184 -  
185 - ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v2,  
186 - threshold);  
187 - if (ok) {  
188 - fprintf(stderr, "%s matches fangjun\n", test2);  
189 - } else {  
190 - fprintf(stderr, "%s does NOT match fangjun\n", test2);  
191 - }  
192 -  
193 - fprintf(stderr, "Removing fangjun\n");  
194 - if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "fangjun")) {  
195 - fprintf(stderr, "Failed to remove fangjun\n");  
196 - exit(-1);  
197 - }  
198 -  
199 - if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 1) {  
200 - fprintf(stderr, "There should be only 1 speaker left\n");  
201 - exit(-1);  
202 - }  
203 -  
204 - name1 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold);  
205 - if (name1) {  
206 - fprintf(stderr, "%s: Found %s\n", test1, name1);  
207 - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1);  
208 - } else {  
209 - fprintf(stderr, "%s: Not found\n", test1);  
210 - }  
211 -  
212 - fprintf(stderr, "Removing leijun\n");  
213 - if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "leijun")) {  
214 - fprintf(stderr, "Failed to remove leijun\n");  
215 - exit(-1);  
216 - }  
217 -  
218 - if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 0) {  
219 - fprintf(stderr, "There should be only 1 speaker left\n");  
220 - exit(-1);  
221 - }  
222 -  
223 - name2 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold);  
224 - if (name2) {  
225 - fprintf(stderr, "%s: Found %s\n", test2, name2);  
226 - SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2);  
227 - } else {  
228 - fprintf(stderr, "%s: Not found\n", test2);  
229 - }  
230 -  
231 - all_speakers = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager);  
232 -  
233 - p = all_speakers;  
234 - fprintf(stderr, "list of registered speakers\n-----\n");  
235 - while (p[0]) {  
236 - fprintf(stderr, "speaker: %s\n", p[0]);  
237 - ++p;  
238 - }  
239 - fprintf(stderr, "----\n");  
240 -  
241 - SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers);  
242 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v1);  
243 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v2);  
244 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v3);  
245 -  
246 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[0]);  
247 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[1]);  
248 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[2]);  
249 -  
250 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[0]);  
251 - SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[1]);  
252 - 129 + // Register speakers
  130 + for (int32_t i = 0; i < num_speakers; i++) {
  131 + const float *embeddings[4] = {NULL};
  132 + int32_t count = speaker_file_counts[i];
  133 +
  134 + for (int32_t j = 0; j < count; j++) {
  135 + embeddings[j] = ComputeEmbedding(ex, speaker_files[i][j]);
  136 + }
  137 +
  138 + if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, speaker_names[i], embeddings)) {
  139 + fprintf(stderr, "Failed to register %s\n", speaker_names[i]);
  140 + fprintf(fp, "Failed to register %s\n", speaker_names[i]);
  141 + fclose(fp);
  142 + exit(-1);
  143 + }
  144 +
  145 + for (int32_t j = 0; j < count; j++) {
  146 + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(embeddings[j]);
  147 + }
  148 +
  149 + fprintf(stderr, "Registered speaker: %s with %d wave files\n", speaker_names[i], count);
  150 + fprintf(fp, "Registered speaker: %s with %d wave files\n", speaker_names[i], count);
  151 + }
  152 +
  153 + fprintf(fp, "\nTest Results:\n");
  154 + fprintf(fp, "========================================\n");
  155 +
  156 + // Process test files
  157 + for (int32_t i = 0; i < num_test_files; i++) {
  158 + const char *test_file = test_files[i];
  159 + const float *v = ComputeEmbedding(ex, test_file);
  160 +
  161 + const char *name = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v, threshold);
  162 +
  163 + fprintf(stderr, "Testing %s: ", test_file);
  164 + fprintf(fp, "Test file: %s\n", test_file);
  165 +
  166 + if (name) {
  167 + fprintf(stderr, "Found %s\n", name);
  168 + fprintf(fp, " Result: Found speaker - %s\n", name);
  169 + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name);
  170 + } else {
  171 + fprintf(stderr, "Not found\n");
  172 + fprintf(fp, " Result: Speaker not found\n");
  173 + }
  174 +
  175 + // Verify against all registered speakers
  176 + for (int32_t j = 0; j < num_speakers; j++) {
  177 + int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, speaker_names[j], v, threshold);
  178 + fprintf(fp, " Verify with %s: %s\n", speaker_names[j], ok ? "MATCH" : "NO MATCH");
  179 + }
  180 +
  181 + fprintf(fp, "\n");
  182 + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v);
  183 + }
  184 +
  185 + // Cleanup
253 SherpaOnnxDestroySpeakerEmbeddingManager(manager); 186 SherpaOnnxDestroySpeakerEmbeddingManager(manager);
254 SherpaOnnxDestroySpeakerEmbeddingExtractor(ex); 187 SherpaOnnxDestroySpeakerEmbeddingExtractor(ex);
  188 + fclose(fp);
  189 +
  190 + fprintf(stderr, "Results saved to: %s\n", output_file);
255 191
256 return 0; 192 return 0;
257 -} 193 +}
@@ -23,164 +23,486 @@
23 #include <stdio.h> 23 #include <stdio.h>
24 #include <stdlib.h> 24 #include <stdlib.h>
25 #include <string.h> 25 #include <string.h>
  26 +#include <ctype.h>
  27 +#include <wchar.h>
  28 +#include <locale.h>
  29 +#include <stdbool.h>
  30 +#include <stdint.h>
26 31
27 #include "sherpa-onnx/c-api/c-api.h" 32 #include "sherpa-onnx/c-api/c-api.h"
28 33
29 -int32_t main() {  
30 - const char *wav_filename = "./lei-jun-test.wav";  
31 - if (!SherpaOnnxFileExists(wav_filename)) {  
32 - fprintf(stderr, "Please download %s\n", wav_filename);  
33 - return -1;  
34 - } 34 +// Function to normalize string: remove punctuation and spaces, convert to lowercase
  35 +void normalize_string(const char* input, char* output) {
  36 + int i = 0, j = 0;
  37 + while (input[i] != '\0') {
  38 + // Skip punctuation characters and spaces (both English and Chinese)
  39 + if (!ispunct((unsigned char)input[i]) &&
  40 + !isspace((unsigned char)input[i]) &&
  41 + !(input[i] >= 0x3000 && input[i] <= 0x303F) && // CJK punctuation
  42 + !(input[i] >= 0xFF00 && input[i] <= 0xFF0F) && // Fullwidth forms
  43 + !(input[i] >= 0xFF1A && input[i] <= 0xFF20) && // Fullwidth forms
  44 + !(input[i] >= 0xFF3B && input[i] <= 0xFF40) && // Fullwidth forms
  45 + !(input[i] >= 0xFF5B && input[i] <= 0xFF65)) { // Fullwidth forms
  46 +
  47 + // Convert to lowercase and add to output
  48 + output[j++] = tolower((unsigned char)input[i]);
  49 + }
  50 + i++;
  51 + }
  52 + output[j] = '\0';
  53 +}
35 54
36 - const char *vad_filename;  
37 - int32_t use_silero_vad = 0;  
38 - int32_t use_ten_vad = 0;  
39 -  
40 - if (SherpaOnnxFileExists("./silero_vad.onnx")) {  
41 - printf("Use silero-vad\n");  
42 - vad_filename = "./silero_vad.onnx";  
43 - use_silero_vad = 1;  
44 - } else if (SherpaOnnxFileExists("./ten-vad.onnx")) {  
45 - printf("Use ten-vad\n");  
46 - vad_filename = "./ten-vad.onnx";  
47 - use_ten_vad = 1;  
48 - } else {  
49 - fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");  
50 - return -1;  
51 - } 55 +// Function to get the first meaningful character (non-punctuation, non-space)
  56 +char get_first_meaningful_char(const char* str) {
  57 + int i = 0;
  58 + while (str[i] != '\0') {
  59 + if (!ispunct((unsigned char)str[i]) &&
  60 + !isspace((unsigned char)str[i]) &&
  61 + !(str[i] >= 0x3000 && str[i] <= 0x303F) &&
  62 + !(str[i] >= 0xFF00 && str[i] <= 0xFF0F) &&
  63 + !(str[i] >= 0xFF1A && str[i] <= 0xFF20) &&
  64 + !(str[i] >= 0xFF3B && str[i] <= 0xFF40) &&
  65 + !(str[i] >= 0xFF5B && str[i] <= 0xFF65)) {
  66 + return tolower((unsigned char)str[i]);
  67 + }
  68 + i++;
  69 + }
  70 + return '\0';
  71 +}
52 72
53 - const char *model_filename =  
54 - "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";  
55 - const char *tokens_filename =  
56 - "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";  
57 - const char *language = "auto";  
58 - const char *provider = "cpu";  
59 - int32_t use_inverse_text_normalization = 1;  
60 -  
61 - const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);  
62 - if (wave == NULL) {  
63 - fprintf(stderr, "Failed to read %s\n", wav_filename);  
64 - return -1;  
65 - } 73 +// Function to check if two strings are effectively the same after normalization
  74 +int are_strings_effectively_same(const char* str1, const char* str2) {
  75 + char norm1[1024], norm2[1024];
  76 + normalize_string(str1, norm1);
  77 + normalize_string(str2, norm2);
  78 + return strcmp(norm1, norm2) == 0;
  79 +}
66 80
67 - if (wave->sample_rate != 16000) {  
68 - fprintf(stderr, "Expect the sample rate to be 16000. Given: %d\n",  
69 - wave->sample_rate);  
70 - SherpaOnnxFreeWave(wave);  
71 - return -1;  
72 - } 81 +// Check whether a code point is a CJK unified ideograph
  82 +static bool is_cjk_ideograph(uint32_t ch)
  83 +{
  84 + return (ch >= 0x4E00 && ch <= 0x9FFF) || // CJK Unified Ideographs
  85 + (ch >= 0x3400 && ch <= 0x4DBF) || // CJK Extension A
  86 + (ch >= 0x20000 && ch <= 0x2A6DF) || // CJK Extension B
  87 + (ch >= 0x2A700 && ch <= 0x2B73F) || // CJK Extension C
  88 + (ch >= 0x2B740 && ch <= 0x2B81F) || // CJK Extension D
  89 + (ch >= 0x2B820 && ch <= 0x2CEAF) || // CJK Extension E
  90 + (ch >= 0x2CEB0 && ch <= 0x2EBEF) || // CJK Extension F
  91 + (ch >= 0x3007 && ch <= 0x3007) || // 〇
  92 + (ch >= 0x3021 && ch <= 0x3029) || // 〡〢〣〤〥〦〧〨〩
  93 + (ch >= 0x3038 && ch <= 0x303B); // 〸〹〺〻〼
  94 +}
73 95
74 - SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config;  
75 - memset(&sense_voice_config, 0, sizeof(sense_voice_config));  
76 - sense_voice_config.model = model_filename;  
77 - sense_voice_config.language = language;  
78 - sense_voice_config.use_itn = use_inverse_text_normalization;  
79 -  
80 - // Offline model config  
81 - SherpaOnnxOfflineModelConfig offline_model_config;  
82 - memset(&offline_model_config, 0, sizeof(offline_model_config));  
83 - offline_model_config.debug = 0;  
84 - offline_model_config.num_threads = 1;  
85 - offline_model_config.provider = provider;  
86 - offline_model_config.tokens = tokens_filename;  
87 - offline_model_config.sense_voice = sense_voice_config;  
88 -  
89 - // Recognizer config  
90 - SherpaOnnxOfflineRecognizerConfig recognizer_config;  
91 - memset(&recognizer_config, 0, sizeof(recognizer_config));  
92 - recognizer_config.decoding_method = "greedy_search";  
93 - recognizer_config.model_config = offline_model_config;  
94 -  
95 - const SherpaOnnxOfflineRecognizer *recognizer =  
96 - SherpaOnnxCreateOfflineRecognizer(&recognizer_config);  
97 -  
98 - if (recognizer == NULL) {  
99 - fprintf(stderr, "Please check your recognizer config!\n");  
100 - SherpaOnnxFreeWave(wave);  
101 - return -1;  
102 - } 96 +// Decode one UTF-8 character backwards; return its byte length and code point
  97 +static int prev_utf8_char(const char *s, int pos, uint32_t *out_ch)
  98 +{
  99 + int start = pos;
  100 + // Find the starting byte of the current character
  101 + while (start > 0 && (s[start] & 0xC0) == 0x80)
  102 + --start;
  103 + // Decode the code point
  104 + const unsigned char *p = (const unsigned char *)&s[start];
  105 + if ((*p & 0x80) == 0) { // 1-byte
  106 + *out_ch = *p;
  107 + } else if ((*p & 0xE0) == 0xC0) { // 2-byte
  108 + *out_ch = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
  109 + } else if ((*p & 0xF0) == 0xE0) { // 3-byte
  110 + *out_ch = ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
  111 + } else if ((*p & 0xF8) == 0xF0) { // 4-byte
  112 + *out_ch = ((p[0] & 0x07) << 18) | ((p[1] & 0x3F) << 12) |
  113 + ((p[2] & 0x3F) << 6) | (p[3] & 0x3F);
  114 + } else {
  115 + *out_ch = 0xFFFD; // invalid sequence: use the replacement character
  116 + }
  117 + return pos - start + 1; // return the length in bytes
  118 +}
103 119
104 - SherpaOnnxVadModelConfig vadConfig;  
105 - memset(&vadConfig, 0, sizeof(vadConfig));  
106 -  
107 - if (use_silero_vad) {  
108 - vadConfig.silero_vad.model = vad_filename;  
109 - vadConfig.silero_vad.threshold = 0.25;  
110 - vadConfig.silero_vad.min_silence_duration = 0.5;  
111 - vadConfig.silero_vad.min_speech_duration = 0.5;  
112 - vadConfig.silero_vad.max_speech_duration = 10;  
113 - vadConfig.silero_vad.window_size = 512;  
114 - } else if (use_ten_vad) {  
115 - vadConfig.ten_vad.model = vad_filename;  
116 - vadConfig.ten_vad.threshold = 0.25;  
117 - vadConfig.ten_vad.min_silence_duration = 0.5;  
118 - vadConfig.ten_vad.min_speech_duration = 0.5;  
119 - vadConfig.ten_vad.max_speech_duration = 10;  
120 - vadConfig.ten_vad.window_size = 256;  
121 - } 120 +// New implementation: take the last n semantic units ("single CJK characters / whole English words")
  121 +void get_last_n_words(const char *str, int n, char *output)
  122 +{
  123 + if (!str || !output || n <= 0) {
  124 + *output = '\0';
  125 + return;
  126 + }
122 127
123 - vadConfig.sample_rate = 16000;  
124 - vadConfig.num_threads = 1;  
125 - vadConfig.debug = 1; 128 + int len = strlen(str);
  129 + if (len == 0) {
  130 + *output = '\0';
  131 + return;
  132 + }
126 133
127 - const SherpaOnnxVoiceActivityDetector *vad =  
128 - SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); 134 + // Buffer for units collected while scanning backwards
  135 + char units[256][256];
  136 + int unit_cnt = 0;
  137 +
  138 + int pos = len; // start from the position just before '\0'
  139 + while (pos > 0 && unit_cnt < n) {
  140 + uint32_t ch;
  141 + int char_len = prev_utf8_char(str, pos - 1, &ch);
  142 + pos -= char_len;
  143 +
  144 + if (ch < 128 && ((ch | 32) - 'a' < 26)) {
  145 + // ===== English word =====
  146 + int word_end = pos + char_len;
  147 + int word_start = pos;
  148 + // Scan backwards to find the start of the word
  149 + while (word_start > 0) {
  150 + uint32_t tmp;
  151 + int tmp_len = prev_utf8_char(str, word_start - 1, &tmp);
  152 + if (tmp < 128 && ((tmp | 32) - 'a' < 26))
  153 + word_start -= tmp_len;
  154 + else
  155 + break;
  156 + }
  157 + // Copy the whole word
  158 + int wlen = word_end - word_start;
  159 + if (wlen >= (int)sizeof(units[unit_cnt])) wlen = sizeof(units[unit_cnt]) - 1;
  160 + memcpy(units[unit_cnt], str + word_start, wlen);
  161 + units[unit_cnt][wlen] = '\0';
  162 + ++unit_cnt;
  163 + pos = word_start; // continue scanning backwards
  164 + } else if (is_cjk_ideograph(ch) || ch > 0xFF00) {
  165 + // ===== CJK character or fullwidth symbol =====
  166 + if (char_len >= (int)sizeof(units[unit_cnt])) char_len = sizeof(units[unit_cnt]) - 1;
  167 + memcpy(units[unit_cnt], str + pos, char_len);
  168 + units[unit_cnt][char_len] = '\0';
  169 + ++unit_cnt;
  170 + }
  171 + // Skip any other punctuation/whitespace
  172 + }
129 173
130 - if (vad == NULL) {  
131 - fprintf(stderr, "Please check your recognizer config!\n");  
132 - SherpaOnnxFreeWave(wave);  
133 - SherpaOnnxDestroyOfflineRecognizer(recognizer);  
134 - return -1; 174 + // Join the units back into output in forward order
  175 + output[0] = '\0';
  176 + for (int i = unit_cnt - 1; i >= 0; --i) {
  177 + if (i < unit_cnt - 1) strcat(output, " ");
  178 + strcat(output, units[i]);
  179 + }
  180 +}
  181 +
  182 +// Find the position of the anchor text within the second string
  183 +const char *find_anchor_end_position(const char *str, const char *anchor) {
  184 + if (!anchor || !*anchor) return str;
  185 +
  186 + char normalized_str[1024] = {0};
  187 + char normalized_anchor[1024] = {0};
  188 +
  189 + // Normalize both strings
  190 + normalize_string(str, normalized_str);
  191 + normalize_string(anchor, normalized_anchor);
  192 +
  193 + // Look for the anchor in the normalized string
  194 + char *found = strstr(normalized_str, normalized_anchor);
  195 + if (!found) return str; // anchor not found: return the whole string
  196 +
  197 + // Compute the end position of the anchor
  198 + int anchor_end_offset = found - normalized_str + strlen(normalized_anchor);
  199 +
  200 + // Map that position back into the original string
  201 + int normalized_count = 0;
  202 + const char *ptr = str;
  203 +
  204 + while (*ptr != '\0' && normalized_count < anchor_end_offset) {
  205 + if (!ispunct((unsigned char)*ptr) && !isspace((unsigned char)*ptr)) {
  206 + normalized_count++;
  207 + }
  208 + ptr++;
135 } 209 }
  210 +
  211 + return ptr;
  212 +}
  213 +// Find the start of the next word
  214 +const char *find_next_word_start(const char *str) {
  215 + // Skip all punctuation and whitespace
  216 + while (*str != '\0' &&
  217 + (ispunct((unsigned char)*str) || isspace((unsigned char)*str))) {
  218 + str++;
  219 + }
  220 + return str;
  221 +}
136 222
137 - int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size  
138 - : vadConfig.ten_vad.window_size;  
139 - int32_t i = 0;  
140 - int is_eof = 0; 223 +// Get the anchor-based difference text (starting from the first complete word after the anchor)
  224 +char *get_difference_after_anchor(const char *str1, const char *str2, int num_anchor_words) {
  225 + if (are_strings_effectively_same(str1, str2)) {
  226 + return strdup("");
  227 + }
141 228
142 - while (!is_eof) {  
143 - if (i + window_size < wave->num_samples) {  
144 - SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i,  
145 - window_size);  
146 - } else {  
147 - SherpaOnnxVoiceActivityDetectorFlush(vad);  
148 - is_eof = 1; 229 + // Get the anchor text at the semantic-unit level
  230 + char semantic_anchor[256] = {0};
  231 + get_last_n_words(str1, num_anchor_words, semantic_anchor);
  232 +
  233 + if (strlen(semantic_anchor) == 0) {
  234 + return strdup(str2);
149 } 235 }
150 236
151 - while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {  
152 - const SherpaOnnxSpeechSegment *segment =  
153 - SherpaOnnxVoiceActivityDetectorFront(vad); 237 + // Key step: normalize the semantic anchor at the character level again to match the behavior of find_anchor_end_position
  238 + char normalized_anchor[256] = {0};
  239 + normalize_string(semantic_anchor, normalized_anchor);
154 240
155 - const SherpaOnnxOfflineStream *stream =  
156 - SherpaOnnxCreateOfflineStream(recognizer); 241 + // Use the normalized anchor to locate the position
  242 + const char *anchor_end = find_anchor_end_position(str2, normalized_anchor);
  243 + const char *next_word_start = find_next_word_start(anchor_end);
157 244
158 - SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate,  
159 - segment->samples, segment->n); 245 + return strdup(next_word_start);
  246 +}
160 247
161 - SherpaOnnxDecodeOfflineStream(recognizer, stream); 248 +// Structure to store previous segment information
  249 +typedef struct {
  250 + float* samples;
  251 + int32_t n;
  252 + int32_t start;
  253 + char* text;
  254 +} PreviousSegment;
  255 +
  256 +void free_previous_segment(PreviousSegment* seg) {
  257 + if (seg) {
  258 + if (seg->samples) free(seg->samples);
  259 + if (seg->text) free(seg->text);
  260 + free(seg);
  261 + }
  262 +}
162 263
163 - const SherpaOnnxOfflineRecognizerResult *result =  
164 - SherpaOnnxGetOfflineStreamResult(stream); 264 +PreviousSegment* copy_segment(const SherpaOnnxSpeechSegment* segment, const char* text) {
  265 + PreviousSegment* prev = (PreviousSegment*)malloc(sizeof(PreviousSegment));
  266 + if (!prev) return NULL;
  267 +
  268 + prev->n = segment->n;
  269 + prev->start = segment->start;
  270 + prev->samples = (float*)malloc(segment->n * sizeof(float));
  271 + if (!prev->samples) {
  272 + free(prev);
  273 + return NULL;
  274 + }
  275 + memcpy(prev->samples, segment->samples, segment->n * sizeof(float));
  276 +
  277 + prev->text = strdup(text);
  278 + if (!prev->text) {
  279 + free(prev->samples);
  280 + free(prev);
  281 + return NULL;
  282 + }
  283 +
  284 + return prev;
  285 +}
165 286
166 - float start = segment->start / 16000.0f;  
167 - float duration = segment->n / 16000.0f;  
168 - float stop = start + duration; 287 +int32_t main() {
  288 + setlocale(LC_ALL, ""); // Set locale for wide character handling
169 289
170 - fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); 290 + const char *wav_filename = "./lei-jun-test.wav";
  291 + if (!SherpaOnnxFileExists(wav_filename)) {
  292 + fprintf(stderr, "Please download %s\n", wav_filename);
  293 + return -1;
  294 + }
171 295
172 - SherpaOnnxDestroyOfflineRecognizerResult(result);  
173 - SherpaOnnxDestroyOfflineStream(stream); 296 + const char *vad_filename;
  297 + int32_t use_silero_vad = 0;
  298 + int32_t use_ten_vad = 0;
  299 +
  300 + if (SherpaOnnxFileExists("./silero_vad.onnx")) {
  301 + printf("Use silero-vad\n");
  302 + vad_filename = "./silero_vad.onnx";
  303 + use_silero_vad = 1;
  304 + } else if (SherpaOnnxFileExists("./ten-vad.onnx")) {
  305 + printf("Use ten-vad\n");
  306 + vad_filename = "./ten-vad.onnx";
  307 + use_ten_vad = 1;
  308 + } else {
  309 + fprintf(stderr, "Please provide either silero_vad.onnx or ten-vad.onnx\n");
  310 + return -1;
  311 + }
174 312
175 - SherpaOnnxDestroySpeechSegment(segment);  
176 - SherpaOnnxVoiceActivityDetectorPop(vad); 313 + const char *model_filename =
  314 + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx";
  315 + const char *tokens_filename =
  316 + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
  317 + const char *language = "auto";
  318 + const char *provider = "cpu";
  319 + int32_t use_inverse_text_normalization = 1;
  320 +
  321 + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  322 + if (wave == NULL) {
  323 + fprintf(stderr, "Failed to read %s\n", wav_filename);
  324 + return -1;
177 } 325 }
178 - i += window_size;  
179 - }  
180 326
181 - SherpaOnnxDestroyOfflineRecognizer(recognizer);  
182 - SherpaOnnxDestroyVoiceActivityDetector(vad);  
183 - SherpaOnnxFreeWave(wave); 327 + if (wave->sample_rate != 16000) {
  328 + fprintf(stderr, "Expect the sample rate to be 16000. Given: %d\n",
  329 + wave->sample_rate);
  330 + SherpaOnnxFreeWave(wave);
  331 + return -1;
  332 + }
  333 +
  334 + SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config;
  335 + memset(&sense_voice_config, 0, sizeof(sense_voice_config));
  336 + sense_voice_config.model = model_filename;
  337 + sense_voice_config.language = language;
  338 + sense_voice_config.use_itn = use_inverse_text_normalization;
  339 +
  340 + // Offline model config
  341 + SherpaOnnxOfflineModelConfig offline_model_config;
  342 + memset(&offline_model_config, 0, sizeof(offline_model_config));
  343 + offline_model_config.debug = 0;
  344 + offline_model_config.num_threads = 1;
  345 + offline_model_config.provider = provider;
  346 + offline_model_config.tokens = tokens_filename;
  347 + offline_model_config.sense_voice = sense_voice_config;
  348 +
  349 + // Recognizer config
  350 + SherpaOnnxOfflineRecognizerConfig recognizer_config;
  351 + memset(&recognizer_config, 0, sizeof(recognizer_config));
  352 + recognizer_config.decoding_method = "greedy_search";
  353 + recognizer_config.model_config = offline_model_config;
  354 +
  355 + const SherpaOnnxOfflineRecognizer *recognizer =
  356 + SherpaOnnxCreateOfflineRecognizer(&recognizer_config);
  357 +
  358 + if (recognizer == NULL) {
  359 + fprintf(stderr, "Please check your recognizer config!\n");
  360 + SherpaOnnxFreeWave(wave);
  361 + return -1;
  362 + }
  363 +
  364 + SherpaOnnxVadModelConfig vadConfig;
  365 + memset(&vadConfig, 0, sizeof(vadConfig));
  366 +
  367 + if (use_silero_vad) {
  368 + vadConfig.silero_vad.model = vad_filename;
  369 + vadConfig.silero_vad.threshold = 0.25;
  370 + vadConfig.silero_vad.min_silence_duration = 1.5;
  371 + vadConfig.silero_vad.min_speech_duration = 0.3;
  372 + vadConfig.silero_vad.max_speech_duration = 20;
  373 + vadConfig.silero_vad.window_size = 512;
  374 + } else if (use_ten_vad) {
  375 + vadConfig.ten_vad.model = vad_filename;
  376 + vadConfig.ten_vad.threshold = 0.25;
  377 + vadConfig.ten_vad.min_silence_duration = 0.5;
  378 + vadConfig.ten_vad.min_speech_duration = 0.5;
  379 + vadConfig.ten_vad.max_speech_duration = 10;
  380 + vadConfig.ten_vad.window_size = 256;
  381 + }
  382 +
  383 + vadConfig.sample_rate = 16000;
  384 + vadConfig.num_threads = 1;
  385 + vadConfig.debug = 1;
  386 +
  387 + const SherpaOnnxVoiceActivityDetector *vad =
  388 + SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30);
  389 +
  390 + if (vad == NULL) {
  391 + fprintf(stderr, "Please check your recognizer config!\n");
  392 + SherpaOnnxFreeWave(wave);
  393 + SherpaOnnxDestroyOfflineRecognizer(recognizer);
  394 + return -1;
  395 + }
  396 +
  397 + int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
  398 + : vadConfig.ten_vad.window_size;
  399 + int32_t i = 0;
  400 + int is_eof = 0;
  401 +
  402 + // Variables to store previous segment information
  403 + PreviousSegment *prev_segment = NULL;
  404 +
  405 + while (!is_eof) {
  406 + if (i + window_size < wave->num_samples) {
  407 + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i,
  408 + window_size);
  409 + } else {
  410 + SherpaOnnxVoiceActivityDetectorFlush(vad);
  411 + is_eof = 1;
  412 + }
  413 +
  414 + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {
  415 + const SherpaOnnxSpeechSegment *segment =
  416 + SherpaOnnxVoiceActivityDetectorFront(vad);
  417 +
  418 + float duration = segment->n / 16000.0f;
  419 +
  420 + // Process the current segment
  421 + const SherpaOnnxOfflineStream *stream =
  422 + SherpaOnnxCreateOfflineStream(recognizer);
  423 +
  424 + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate,
  425 + segment->samples, segment->n);
  426 + SherpaOnnxDecodeOfflineStream(recognizer, stream);
  427 + const SherpaOnnxOfflineRecognizerResult *result =
  428 + SherpaOnnxGetOfflineStreamResult(stream);
  429 +
  430 + float start = segment->start / 16000.0f;
  431 + float stop = start + duration;
  432 +
  433 + if (duration < 1.5f && prev_segment != NULL) {
  434 + // Current segment is shorter than 1.5 seconds and we have a previous segment
  435 + // Merge with previous segment
  436 +
  437 + // Create merged samples
  438 + int32_t merged_n = prev_segment->n + segment->n;
  439 + float *merged_samples = (float*)malloc(merged_n * sizeof(float));
  440 + memcpy(merged_samples, prev_segment->samples, prev_segment->n * sizeof(float));
  441 + memcpy(merged_samples + prev_segment->n, segment->samples, segment->n * sizeof(float));
  442 +
  443 + // Create stream for merged segment
  444 + const SherpaOnnxOfflineStream *merged_stream =
  445 + SherpaOnnxCreateOfflineStream(recognizer);
  446 + SherpaOnnxAcceptWaveformOffline(merged_stream, wave->sample_rate,
  447 + merged_samples, merged_n);
  448 + SherpaOnnxDecodeOfflineStream(recognizer, merged_stream);
  449 + const SherpaOnnxOfflineRecognizerResult *merged_result =
  450 + SherpaOnnxGetOfflineStreamResult(merged_stream);
  451 +
  452 + // Get the meaningful difference after the anchor (last 3 units of the previous text)
  453 + char *diff_text = get_difference_after_anchor(prev_segment->text, merged_result->text, 3);
  454 +
  455 + if (strlen(diff_text) == 0) {
  456 + fprintf(stderr, "%.3f -- %.3f: %s (short segment, no meaningful difference)\n",
  457 + start, stop, merged_result->text);
  458 + } else {
  459 + fprintf(stderr, "%.3f -- %.3f: %s (short segment, meaningful diff: %s)\n",
  460 + start, stop, merged_result->text, diff_text);
  461 + }
  462 +
  463 + // Don't update prev_segment for short segments (requirement 1)
  464 + // Only update if the current segment is >= 1.5 seconds
  465 +
  466 + SherpaOnnxDestroyOfflineRecognizerResult(merged_result);
  467 + SherpaOnnxDestroyOfflineStream(merged_stream);
  468 + free(merged_samples);
  469 + free(diff_text);
  470 +
  471 + } else {
  472 + // Normal processing (segment >= 1.5 seconds, or no previous segment to merge with)
  473 + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text);
  474 +
  475 + // Store the current segment and result only if its duration is >= 1.5 seconds (requirement 1)
  476 + if (duration >= 1.5f) {
  477 + if (prev_segment != NULL) {
  478 + free_previous_segment(prev_segment);
  479 + }
  480 + prev_segment = copy_segment(segment, result->text);
  481 + } else {
  482 + // Short segment, don't store as previous
  483 + if (prev_segment != NULL) {
  484 + free_previous_segment(prev_segment);
  485 + prev_segment = NULL;
  486 + }
  487 + }
  488 + }
  489 +
  490 + SherpaOnnxDestroyOfflineRecognizerResult(result);
  491 + SherpaOnnxDestroyOfflineStream(stream);
  492 + SherpaOnnxDestroySpeechSegment(segment);
  493 + SherpaOnnxVoiceActivityDetectorPop(vad);
  494 + }
  495 + i += window_size;
  496 + }
  497 +
  498 + // Clean up
  499 + if (prev_segment != NULL) {
  500 + free_previous_segment(prev_segment);
  501 + }
  502 +
  503 + SherpaOnnxDestroyOfflineRecognizer(recognizer);
  504 + SherpaOnnxDestroyVoiceActivityDetector(vad);
  505 + SherpaOnnxFreeWave(wave);
184 506
185 - return 0; 507 + return 0;
186 } 508 }
@@ -63,8 +63,17 @@ elseif(WIN32)
63 # Now SHERPA_ONNX_OS_TWO_LINES contains something like 63 # Now SHERPA_ONNX_OS_TWO_LINES contains something like
64 # Caption Version 64 # Caption Version
65 # Microsoft Windows 10 Pro 10.0.18362 65 # Microsoft Windows 10 Pro 10.0.18362
66 - string(REPLACE "\n" ";" SHERPA_ONNX_OS_LIST ${SHERPA_ONNX_OS_TWO_LINES})  
67 - list(GET SHERPA_ONNX_OS_LIST 1 SHERPA_ONNX_OS) 66 + if(SHERPA_ONNX_OS_TWO_LINES)
  67 + string(REPLACE "\n" ";" SHERPA_ONNX_OS_LIST "${SHERPA_ONNX_OS_TWO_LINES}")
  68 + list(LENGTH SHERPA_ONNX_OS_LIST _list_length)
  69 + if(_list_length GREATER 1)
  70 + list(GET SHERPA_ONNX_OS_LIST 1 SHERPA_ONNX_OS)
  71 + else()
  72 + set(SHERPA_ONNX_OS "Windows")
  73 + endif()
  74 + else()
  75 + set(SHERPA_ONNX_OS "Windows")
  76 + endif()
68 else() 77 else()
69 set(SHERPA_ONNX_OS "Unknown") 78 set(SHERPA_ONNX_OS "Unknown")
70 endif() 79 endif()