xuning

Trying to build a static library. The change below adds a vad-sense-voice-lib target (linked with -static) to CMakeLists.txt, along with the new vad-sense-voice-lib.c source.

CMakeLists.txt after the change:

include(cargs)

include_directories(${PROJECT_SOURCE_DIR})

add_executable(vad-sense-voice-lib vad-sense-voice-lib.c)
target_link_libraries(vad-sense-voice-lib sherpa-onnx-c-api)
set_target_properties(vad-sense-voice-lib PROPERTIES LINK_FLAGS "-static")

add_executable(decode-file-c-api decode-file-c-api.c)
target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
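
Note: set_target_properties(... LINK_FLAGS "-static") on an add_executable target yields a statically linked executable rather than a library archive, and as excerpted vad-sense-voice-lib.c defines no main, so this target may not link until one is added (a sketch of such a main follows the source below). If the eventual goal is a reusable .a, a minimal sketch might look like this; the vad-sense-voice target name is my own placeholder, and whether -static links cleanly also depends on sherpa-onnx and onnxruntime having been built as static libraries:

# Hypothetical alternative: build the translation unit as a static library
add_library(vad-sense-voice STATIC vad-sense-voice-lib.c)
target_link_libraries(vad-sense-voice PUBLIC sherpa-onnx-c-api)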
vad-sense-voice-lib.c:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <wchar.h>
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>

#include "sherpa-onnx/c-api/c-api.h"
#include "vad-sense-voice-lib.h"

// Structure to hold transcription results
typedef struct {
  float start_time;  // Start time in seconds
  float end_time;    // End time in seconds
  char *text;        // Transcription text
} TranscriptionResult;

// Structure to store the previous segment's audio and text
typedef struct {
  float *samples;
  int32_t n;
  int32_t start;
  char *text;
} PreviousSegment;

// Decode one UTF-8 character forwards; returns its length in bytes.
// Assumes well-formed UTF-8 input.
static int next_utf8_char(const char *s, uint32_t *out_ch) {
  const unsigned char *p = (const unsigned char *)s;
  if ((p[0] & 0x80) == 0) {
    *out_ch = p[0];
    return 1;
  } else if ((p[0] & 0xE0) == 0xC0) {
    *out_ch = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
    return 2;
  } else if ((p[0] & 0xF0) == 0xE0) {
    *out_ch = ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
    return 3;
  } else if ((p[0] & 0xF8) == 0xF0) {
    *out_ch = ((p[0] & 0x07) << 18) | ((p[1] & 0x3F) << 12) |
              ((p[2] & 0x3F) << 6) | (p[3] & 0x3F);
    return 4;
  }
  *out_ch = 0xFFFD;
  return 1;
}

// True for code points treated as punctuation/whitespace: ASCII punctuation
// and spaces, CJK symbols (U+3000..U+303F), and the fullwidth-forms blocks
// (U+FF00..U+FF0F, U+FF1A..U+FF20, U+FF3B..U+FF40, U+FF5B..U+FF65). These
// ranges must be tested on decoded code points: a single char never reaches
// 0x3000, so byte-wise comparisons against them can never match.
static bool is_separator_cp(uint32_t ch) {
  if (ch < 0x80) {
    return ispunct((int)ch) || isspace((int)ch);
  }
  return (ch >= 0x3000 && ch <= 0x303F) ||
         (ch >= 0xFF00 && ch <= 0xFF0F) ||
         (ch >= 0xFF1A && ch <= 0xFF20) ||
         (ch >= 0xFF3B && ch <= 0xFF40) ||
         (ch >= 0xFF5B && ch <= 0xFF65);
}

// Normalize a string: drop punctuation and spaces, lowercase ASCII letters.
// The output buffer must be at least as large as the input.
void normalize_string(const char *input, char *output) {
  int i = 0, j = 0;
  while (input[i] != '\0') {
    uint32_t ch;
    int len = next_utf8_char(input + i, &ch);
    if (!is_separator_cp(ch)) {
      if (len == 1) {
        output[j++] = (char)tolower((unsigned char)input[i]);
      } else {
        memcpy(output + j, input + i, len);
        j += len;
      }
    }
    i += len;
  }
  output[j] = '\0';
}

// Get the first byte of the first meaningful (non-separator) character
char get_first_meaningful_char(const char *str) {
  int i = 0;
  while (str[i] != '\0') {
    uint32_t ch;
    int len = next_utf8_char(str + i, &ch);
    if (!is_separator_cp(ch)) {
      return (ch < 0x80) ? (char)tolower((unsigned char)str[i]) : str[i];
    }
    i += len;
  }
  return '\0';
}

// Check if two strings are effectively the same after normalization
int are_strings_effectively_same(const char *str1, const char *str2) {
  char norm1[1024], norm2[1024];
  normalize_string(str1, norm1);
  normalize_string(str2, norm2);
  return strcmp(norm1, norm2) == 0;
}

// Check if a code point is a CJK ideograph
static bool is_cjk_ideograph(uint32_t ch) {
  return (ch >= 0x4E00 && ch <= 0x9FFF) ||    // CJK Unified Ideographs
         (ch >= 0x3400 && ch <= 0x4DBF) ||    // Extension A
         (ch >= 0x20000 && ch <= 0x2A6DF) ||  // Extension B
         (ch >= 0x2A700 && ch <= 0x2B73F) ||  // Extension C
         (ch >= 0x2B740 && ch <= 0x2B81F) ||  // Extension D
         (ch >= 0x2B820 && ch <= 0x2CEAF) ||  // Extension E
         (ch >= 0x2CEB0 && ch <= 0x2EBEF) ||  // Extension F
         ch == 0x3007 ||                      // Ideographic number zero
         (ch >= 0x3021 && ch <= 0x3029) ||    // Hangzhou numerals
         (ch >= 0x3038 && ch <= 0x303B);      // Hangzhou tens, iteration mark
}

// Decode the UTF-8 character whose last byte is at position pos, scanning
// backwards over continuation bytes; returns the character's length in bytes.
static int prev_utf8_char(const char *s, int pos, uint32_t *out_ch) {
  int start = pos;
  while (start > 0 && (s[start] & 0xC0) == 0x80) {
    --start;
  }
  const unsigned char *p = (const unsigned char *)&s[start];
  if ((*p & 0x80) == 0) {
    *out_ch = *p;
  } else if ((*p & 0xE0) == 0xC0) {
    *out_ch = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
  } else if ((*p & 0xF0) == 0xE0) {
    *out_ch = ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
  } else if ((*p & 0xF8) == 0xF0) {
    *out_ch = ((p[0] & 0x07) << 18) | ((p[1] & 0x3F) << 12) |
              ((p[2] & 0x3F) << 6) | (p[3] & 0x3F);
  } else {
    *out_ch = 0xFFFD;
  }
  return pos - start + 1;
}
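
/*
 * Illustration (hypothetical input, assumes well-formed UTF-8): for
 * s = "a\xE4\xB8\xAD" ("a" followed by U+4E2D), prev_utf8_char(s, 3, &ch)
 * backs up from the trailing byte s[3] over the continuation bytes to the
 * lead byte s[1], decodes ch = 0x4E2D, and returns 3, the byte length.
 */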

// Get the last n words (single CJK characters or whole English words)
void get_last_n_words(const char *str, int n, char *output) {
  if (!str || !output || n <= 0) {
    *output = '\0';
    return;
  }
  int len = strlen(str);
  if (len == 0) {
    *output = '\0';
    return;
  }
  char units[256][256];
  int unit_cnt = 0;
  int pos = len;
  while (pos > 0 && unit_cnt < n) {
    uint32_t ch;
    int char_len = prev_utf8_char(str, pos - 1, &ch);
    pos -= char_len;
    // ((ch | 32) - 'a') < 26 maps A-Z/a-z onto 0..25, i.e. an ASCII letter.
    if (ch < 128 && ((ch | 32) - 'a' < 26)) {
      // Extend backwards to the start of the English word.
      int word_end = pos + char_len;
      int word_start = pos;
      while (word_start > 0) {
        uint32_t tmp;
        int tmp_len = prev_utf8_char(str, word_start - 1, &tmp);
        if (tmp < 128 && ((tmp | 32) - 'a' < 26)) {
          word_start -= tmp_len;
        } else {
          break;
        }
      }
      int wlen = word_end - word_start;
      if (wlen >= (int)sizeof(units[unit_cnt])) wlen = sizeof(units[unit_cnt]) - 1;
      memcpy(units[unit_cnt], str + word_start, wlen);
      units[unit_cnt][wlen] = '\0';
      ++unit_cnt;
      pos = word_start;
    } else if (is_cjk_ideograph(ch) || ch > 0xFF00) {
      // Each CJK (or fullwidth) character counts as a word of its own.
      if (char_len >= (int)sizeof(units[unit_cnt])) char_len = sizeof(units[unit_cnt]) - 1;
      memcpy(units[unit_cnt], str + pos, char_len);
      units[unit_cnt][char_len] = '\0';
      ++unit_cnt;
    }
  }
  // Units were collected back-to-front; emit them in original order.
  output[0] = '\0';
  for (int i = unit_cnt - 1; i >= 0; --i) {
    if (i < unit_cnt - 1) strcat(output, " ");
    strcat(output, units[i]);
  }
}
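
/*
 * Illustration (hypothetical input): get_last_n_words("hello 世界", 3, out)
 * collects, scanning backwards, "界", "世", and the whole word "hello", then
 * emits them in original order, so out becomes "hello 世 界". The spaces
 * between CJK units are harmless: normalize_string strips them again before
 * the anchor is searched for.
 */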

// Find the position in str just past the anchor text
const char *find_anchor_end_position(const char *str, const char *anchor) {
  if (!anchor || !*anchor) return str;
  char normalized_str[1024] = {0};
  char normalized_anchor[1024] = {0};
  normalize_string(str, normalized_str);
  normalize_string(anchor, normalized_anchor);
  char *found = strstr(normalized_str, normalized_anchor);
  if (!found) return str;
  // Byte offset, within the normalized string, where the anchor ends.
  int anchor_end_offset = (int)(found - normalized_str) + (int)strlen(normalized_anchor);
  // Walk the original string, counting only the bytes that survive
  // normalization, until the anchor has been consumed. The walk must use the
  // same character-level test as normalize_string, or the two byte counts
  // drift apart on CJK punctuation.
  int normalized_count = 0;
  const char *ptr = str;
  while (*ptr != '\0' && normalized_count < anchor_end_offset) {
    uint32_t ch;
    int len = next_utf8_char(ptr, &ch);
    if (!is_separator_cp(ch)) {
      normalized_count += len;
    }
    ptr += len;
  }
  return ptr;
}

// Find the start of the next word (skip separators)
const char *find_next_word_start(const char *str) {
  while (*str != '\0') {
    uint32_t ch;
    int len = next_utf8_char(str, &ch);
    if (!is_separator_cp(ch)) break;
    str += len;
  }
  return str;
}

// Get the difference after the anchor text
char *get_difference_after_anchor(const char *str1, const char *str2, int num_anchor_words) {
  if (are_strings_effectively_same(str1, str2)) {
    return strdup("");
  }
  char semantic_anchor[256] = {0};
  get_last_n_words(str1, num_anchor_words, semantic_anchor);
  if (strlen(semantic_anchor) == 0) {
    return strdup(str2);
  }
  char normalized_anchor[256] = {0};
  normalize_string(semantic_anchor, normalized_anchor);
  const char *anchor_end = find_anchor_end_position(str2, normalized_anchor);
  const char *next_word_start = find_next_word_start(anchor_end);
  return strdup(next_word_start);
}
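
/*
 * Illustration of the anchor-diff logic (hypothetical inputs):
 *   str1 = "so we went to the"                 (previous segment's text)
 *   str2 = "so we went to the store yesterday" (merged re-decode)
 * With num_anchor_words = 3 the anchor is "went to the"; after normalization
 * it is located inside str2 and the text following it, "store yesterday", is
 * returned. If str1 and str2 normalize to the same string, an empty string
 * is returned instead.
 */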

// Free a previous segment
void free_previous_segment(PreviousSegment *seg) {
  if (seg) {
    free(seg->samples);
    free(seg->text);
    free(seg);
  }
}

// Deep-copy a speech segment together with its transcription text
PreviousSegment *copy_segment(const SherpaOnnxSpeechSegment *segment, const char *text) {
  PreviousSegment *prev = (PreviousSegment *)malloc(sizeof(PreviousSegment));
  if (!prev) return NULL;
  prev->n = segment->n;
  prev->start = segment->start;
  prev->samples = (float *)malloc(segment->n * sizeof(float));
  if (!prev->samples) {
    free(prev);
    return NULL;
  }
  memcpy(prev->samples, segment->samples, segment->n * sizeof(float));
  prev->text = strdup(text);
  if (!prev->text) {
    free(prev->samples);
    free(prev);
    return NULL;
  }
  return prev;
}

// Free transcription results
void free_transcription_results(TranscriptionResult *results, int32_t count) {
  if (results) {
    for (int32_t i = 0; i < count; i++) {
      free(results[i].text);
    }
    free(results);
  }
}

// Main library function: run VAD over a 16 kHz mono WAV file and transcribe
// each detected speech segment with SenseVoice
TranscriptionResult *process_audio_file(const char *wav_filename, const char *vad_model_path,
                                        const char *sense_voice_model_path, const char *tokens_path,
                                        int32_t *result_count) {
  setlocale(LC_ALL, "");
  *result_count = 0;

  // Validate input files
  if (!SherpaOnnxFileExists(wav_filename)) {
    fprintf(stderr, "Audio file %s does not exist\n", wav_filename);
    return NULL;
  }
  if (!SherpaOnnxFileExists(vad_model_path)) {
    fprintf(stderr, "VAD model %s does not exist\n", vad_model_path);
    return NULL;
  }
  if (!SherpaOnnxFileExists(sense_voice_model_path)) {
    fprintf(stderr, "SenseVoice model %s does not exist\n", sense_voice_model_path);
    return NULL;
  }
  if (!SherpaOnnxFileExists(tokens_path)) {
    fprintf(stderr, "Tokens file %s does not exist\n", tokens_path);
    return NULL;
  }

  // Read WAV file
  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  if (wave == NULL) {
    fprintf(stderr, "Failed to read %s\n", wav_filename);
    return NULL;
  }
  if (wave->sample_rate != 16000) {
    fprintf(stderr, "Expected sample rate 16000, got %d\n", wave->sample_rate);
    SherpaOnnxFreeWave(wave);
    return NULL;
  }

  // Initialize SenseVoice model config
  SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config;
  memset(&sense_voice_config, 0, sizeof(sense_voice_config));
  sense_voice_config.model = sense_voice_model_path;
  sense_voice_config.language = "auto";
  sense_voice_config.use_itn = 1;

  // Initialize offline model config
  SherpaOnnxOfflineModelConfig offline_model_config;
  memset(&offline_model_config, 0, sizeof(offline_model_config));
  offline_model_config.debug = 0;
  offline_model_config.num_threads = 1;
  offline_model_config.provider = "cpu";
  offline_model_config.tokens = tokens_path;
  offline_model_config.sense_voice = sense_voice_config;

  // Initialize recognizer config
  SherpaOnnxOfflineRecognizerConfig recognizer_config;
  memset(&recognizer_config, 0, sizeof(recognizer_config));
  recognizer_config.decoding_method = "greedy_search";
  recognizer_config.model_config = offline_model_config;

  // Create recognizer
  const SherpaOnnxOfflineRecognizer *recognizer =
      SherpaOnnxCreateOfflineRecognizer(&recognizer_config);
  if (recognizer == NULL) {
    fprintf(stderr, "Failed to create recognizer\n");
    SherpaOnnxFreeWave(wave);
    return NULL;
  }

  // Initialize VAD config; pick the model type from the file name
  SherpaOnnxVadModelConfig vadConfig;
  memset(&vadConfig, 0, sizeof(vadConfig));
  int32_t use_silero_vad = strstr(vad_model_path, "silero_vad.onnx") != NULL;
  int32_t use_ten_vad = strstr(vad_model_path, "ten-vad.onnx") != NULL;

  if (use_silero_vad) {
    vadConfig.silero_vad.model = vad_model_path;
    vadConfig.silero_vad.threshold = 0.25;
    vadConfig.silero_vad.min_silence_duration = 1.5;
    vadConfig.silero_vad.min_speech_duration = 0.3;
    vadConfig.silero_vad.max_speech_duration = 20;
    vadConfig.silero_vad.window_size = 512;
  } else if (use_ten_vad) {
    vadConfig.ten_vad.model = vad_model_path;
    vadConfig.ten_vad.threshold = 0.25;
    vadConfig.ten_vad.min_silence_duration = 0.5;
    vadConfig.ten_vad.min_speech_duration = 0.5;
    vadConfig.ten_vad.max_speech_duration = 10;
    vadConfig.ten_vad.window_size = 256;
  } else {
    fprintf(stderr, "Unsupported VAD model: %s\n", vad_model_path);
    SherpaOnnxDestroyOfflineRecognizer(recognizer);
    SherpaOnnxFreeWave(wave);
    return NULL;
  }
  vadConfig.sample_rate = 16000;
  vadConfig.num_threads = 1;
  vadConfig.debug = 1;

  // Create VAD (the second argument is the internal buffer size in seconds)
  const SherpaOnnxVoiceActivityDetector *vad =
      SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30);
  if (vad == NULL) {
    fprintf(stderr, "Failed to create VAD\n");
    SherpaOnnxDestroyOfflineRecognizer(recognizer);
    SherpaOnnxFreeWave(wave);
    return NULL;
  }

  // Results array, grown geometrically as segments arrive
  TranscriptionResult *results = NULL;
  int32_t results_capacity = 0;
  int32_t results_count = 0;

  int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
                                       : vadConfig.ten_vad.window_size;
  int32_t i = 0;
  int is_eof = 0;
  PreviousSegment *prev_segment = NULL;

  // Process the audio window by window
  while (!is_eof) {
    if (i + window_size < wave->num_samples) {
      SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, window_size);
    } else {
      SherpaOnnxVoiceActivityDetectorFlush(vad);
      is_eof = 1;
    }

    while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {
      const SherpaOnnxSpeechSegment *segment =
          SherpaOnnxVoiceActivityDetectorFront(vad);
      float duration = segment->n / 16000.0f;

      // Create a stream for the current segment
      const SherpaOnnxOfflineStream *stream =
          SherpaOnnxCreateOfflineStream(recognizer);
      SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate,
                                      segment->samples, segment->n);
      SherpaOnnxDecodeOfflineStream(recognizer, stream);
      const SherpaOnnxOfflineRecognizerResult *result =
          SherpaOnnxGetOfflineStreamResult(stream);

      float start = segment->start / 16000.0f;
      float stop = start + duration;

      // Grow the results array if necessary
      if (results_count >= results_capacity) {
        results_capacity = results_capacity ? results_capacity * 2 : 10;
        TranscriptionResult *new_results = (TranscriptionResult *)realloc(
            results, results_capacity * sizeof(TranscriptionResult));
        if (!new_results) {
          free_transcription_results(results, results_count);
          free_previous_segment(prev_segment);
          SherpaOnnxDestroyOfflineRecognizerResult(result);
          SherpaOnnxDestroyOfflineStream(stream);
          SherpaOnnxDestroySpeechSegment(segment);
          SherpaOnnxVoiceActivityDetectorPop(vad);
          SherpaOnnxDestroyOfflineRecognizer(recognizer);
          SherpaOnnxDestroyVoiceActivityDetector(vad);
          SherpaOnnxFreeWave(wave);
          return NULL;
        }
        results = new_results;
      }

      if (duration < 1.5f && prev_segment != NULL) {
        // Short segment right after a long one: re-decode the previous and
        // current audio together, then keep only the text that follows the
        // previous segment's tail (three anchor words).
        int32_t merged_n = prev_segment->n + segment->n;
        float *merged_samples = (float *)malloc(merged_n * sizeof(float));
        if (!merged_samples) {
          free_transcription_results(results, results_count);
          free_previous_segment(prev_segment);
          SherpaOnnxDestroyOfflineRecognizerResult(result);
          SherpaOnnxDestroyOfflineStream(stream);
          SherpaOnnxDestroySpeechSegment(segment);
          SherpaOnnxVoiceActivityDetectorPop(vad);
          SherpaOnnxDestroyOfflineRecognizer(recognizer);
          SherpaOnnxDestroyVoiceActivityDetector(vad);
          SherpaOnnxFreeWave(wave);
          return NULL;
        }
        memcpy(merged_samples, prev_segment->samples, prev_segment->n * sizeof(float));
        memcpy(merged_samples + prev_segment->n, segment->samples, segment->n * sizeof(float));

        const SherpaOnnxOfflineStream *merged_stream =
            SherpaOnnxCreateOfflineStream(recognizer);
        SherpaOnnxAcceptWaveformOffline(merged_stream, wave->sample_rate,
                                        merged_samples, merged_n);
        SherpaOnnxDecodeOfflineStream(recognizer, merged_stream);
        const SherpaOnnxOfflineRecognizerResult *merged_result =
            SherpaOnnxGetOfflineStreamResult(merged_stream);

        char *diff_text = get_difference_after_anchor(prev_segment->text, merged_result->text, 3);
        results[results_count].start_time = start;
        results[results_count].end_time = stop;
        results[results_count].text =
            strdup((diff_text == NULL || diff_text[0] == '\0') ? "Umm" : diff_text);

        SherpaOnnxDestroyOfflineRecognizerResult(merged_result);
        SherpaOnnxDestroyOfflineStream(merged_stream);
        free(merged_samples);
        free(diff_text);
      } else {
        // Normal segment: take the transcription as-is
        results[results_count].start_time = start;
        results[results_count].end_time = stop;
        results[results_count].text = strdup(result->text);
      }

      if (!results[results_count].text) {
        free_transcription_results(results, results_count);
        free_previous_segment(prev_segment);
        SherpaOnnxDestroyOfflineRecognizerResult(result);
        SherpaOnnxDestroyOfflineStream(stream);
        SherpaOnnxDestroySpeechSegment(segment);
        SherpaOnnxVoiceActivityDetectorPop(vad);
        SherpaOnnxDestroyOfflineRecognizer(recognizer);
        SherpaOnnxDestroyVoiceActivityDetector(vad);
        SherpaOnnxFreeWave(wave);
        return NULL;
      }

      results_count++;

      // Only segments of at least 1.5 s become the merge context for the
      // next segment; after a short segment the context is cleared.
      if (duration >= 1.5f) {
        if (prev_segment) free_previous_segment(prev_segment);
        prev_segment = copy_segment(segment, result->text);
      } else {
        if (prev_segment) {
          free_previous_segment(prev_segment);
          prev_segment = NULL;
        }
      }

      SherpaOnnxDestroyOfflineRecognizerResult(result);
      SherpaOnnxDestroyOfflineStream(stream);
      SherpaOnnxDestroySpeechSegment(segment);
      SherpaOnnxVoiceActivityDetectorPop(vad);
    }
    i += window_size;
  }

  // Clean up
  if (prev_segment) free_previous_segment(prev_segment);
  SherpaOnnxDestroyOfflineRecognizer(recognizer);
  SherpaOnnxDestroyVoiceActivityDetector(vad);
  SherpaOnnxFreeWave(wave);

  *result_count = results_count;
  return results;
}
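
A minimal caller sketch, not part of the library itself: it assumes vad-sense-voice-lib.h declares TranscriptionResult, process_audio_file, and free_transcription_results exactly as defined above, and a main like this is what the add_executable target would need in order to link.

#include <stdio.h>
#include <stdint.h>

#include "vad-sense-voice-lib.h"

int main(int argc, char *argv[]) {
  if (argc != 5) {
    fprintf(stderr, "Usage: %s audio.wav vad.onnx sense-voice.onnx tokens.txt\n",
            argv[0]);
    return 1;
  }
  int32_t n = 0;
  TranscriptionResult *results =
      process_audio_file(argv[1], argv[2], argv[3], argv[4], &n);
  if (!results) return 1;
  // Print each segment as "start -- end: text"
  for (int32_t i = 0; i < n; i++) {
    printf("%.3f -- %.3f: %s\n", results[i].start_time, results[i].end_time,
           results[i].text);
  }
  free_transcription_results(results, n);
  return 0;
}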