尝试构建静态库

xuning
Commit ea72e0487d17c13009f73bd8d7a1217761081a29 ea72e048 1 parent 267f5b2f
c-api-examples/CMakeLists.txt
c-api-examples/vad-sense-voice-lib.c
--- a/c-api-examples/CMakeLists.txt
查看文件 @ea72e04
+++ b/c-api-examples/CMakeLists.txt
查看文件 @ea72e04
 include(cargs)
 
 include_directories(${PROJECT_SOURCE_DIR})
+ 
+ 
+ # vad-sense-voice-lib.c only exports process_audio_file() and defines no
+ # main(), so it cannot be linked as an executable. Build it as a static
+ # library instead. LINK_FLAGS is not applied to static archives, so the
+ # former "-static" link flag is dropped.
+ add_library(vad-sense-voice-lib STATIC vad-sense-voice-lib.c)
+ target_link_libraries(vad-sense-voice-lib sherpa-onnx-c-api)
+ 
 add_executable(decode-file-c-api decode-file-c-api.c)
 target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
 
--- a/c-api-examples/vad-sense-voice-lib.c 0 → 100644
查看文件 @ea72e04
+++ b/c-api-examples/vad-sense-voice-lib.c 0 → 100644
查看文件 @ea72e04
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <ctype.h>
+ #include <wchar.h>
+ #include <locale.h>
+ #include <stdbool.h>
+ #include <stdint.h>
+ 
+ #include "sherpa-onnx/c-api/c-api.h"
+ #include "vad-sense-voice-lib.h"
+ // One transcribed speech segment, as returned in the array produced by
+ // process_audio_file(). `text` is heap-allocated (strdup) and is released
+ // by free_transcription_results().
+ typedef struct {
+     float start_time; // Start time in seconds
+     float end_time;   // End time in seconds
+     char* text;       // Transcription text
+ } TranscriptionResult;
+ 
+ // Private copy of the most recent speech segment, kept so that a following
+ // short segment can be re-decoded together with it. Built by copy_segment()
+ // and released by free_previous_segment(); both `samples` and `text` are
+ // owned by this struct.
+ typedef struct {
+     float* samples; // Heap copy of the segment's audio samples
+     int32_t n;      // Number of entries in `samples`
+     int32_t start;  // Copied from the segment's `start` field
+     char* text;     // Heap copy (strdup) of the text recognized for the segment
+ } PreviousSegment;
+ 
+ // Normalize a string for loose comparison.
+ //
+ // Copies `input` into `output` one byte at a time, dropping every byte that
+ // ispunct() or isspace() reports as punctuation/whitespace and passing the
+ // remaining bytes through tolower().
+ //
+ // The text is handled as raw bytes: a single char can never hold a value
+ // such as 0x3000 or 0xFF00, so multi-byte UTF-8 punctuation (CJK or
+ // full-width forms) is left in place. Stripping it would require decoding
+ // code points and also changing find_anchor_end_position(), which maps
+ // normalized offsets back to the original string byte by byte.
+ //
+ // @param input   NUL-terminated source string; NULL is treated as "".
+ // @param output  Destination buffer with room for at least
+ //                strlen(input) + 1 bytes; nothing is written if it is NULL.
+ void normalize_string(const char* input, char* output) {
+     if (!output) {
+         return;
+     }
+     size_t out_len = 0;
+     if (input) {
+         for (const unsigned char *p = (const unsigned char *)input; *p != '\0'; ++p) {
+             if (ispunct(*p) || isspace(*p)) {
+                 continue;
+             }
+             output[out_len++] = (char)tolower(*p);
+         }
+     }
+     output[out_len] = '\0';
+ }
+ 
+ // Return the first byte of `str` that is neither punctuation nor whitespace
+ // (per ispunct()/isspace()), converted with tolower().
+ //
+ // The scan is byte-wise: a char can never equal a code point such as 0x3000,
+ // so multi-byte UTF-8 punctuation is not skipped; for such text the result is
+ // the lead byte of the first multi-byte character.
+ //
+ // @param str  NUL-terminated string; NULL is treated as "".
+ // @return     The lowercased byte, or '\0' if no such byte exists.
+ char get_first_meaningful_char(const char* str) {
+     if (!str) {
+         return '\0';
+     }
+     for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; ++p) {
+         if (!ispunct(*p) && !isspace(*p)) {
+             return (char)tolower(*p);
+         }
+     }
+     return '\0';
+ }
+ 
+ // Check whether two strings are equal once normalize_string() has removed
+ // punctuation/whitespace and lowercased them.
+ //
+ // The scratch buffers are sized from the inputs (normalization never makes
+ // a string longer), so arbitrarily long text cannot overflow them.
+ //
+ // @return 1 if the normalized strings match, 0 otherwise. A NULL argument
+ //         only matches another NULL. Returns 0 if memory cannot be
+ //         allocated.
+ int are_strings_effectively_same(const char* str1, const char* str2) {
+     if (!str1 || !str2) {
+         return str1 == str2;
+     }
+     char *norm1 = malloc(strlen(str1) + 1);
+     char *norm2 = malloc(strlen(str2) + 1);
+     int same = 0;
+     if (norm1 && norm2) {
+         normalize_string(str1, norm1);
+         normalize_string(str2, norm2);
+         same = strcmp(norm1, norm2) == 0;
+     }
+     free(norm1);
+     free(norm2);
+     return same;
+ }
+ 
+ // Report whether code point `ch` falls in one of the ranges this file treats
+ // as a CJK ideograph: the unified ideograph blocks plus a few ideographic
+ // numerals/marks in the U+3000 block.
+ static bool is_cjk_ideograph(uint32_t ch) {
+     static const struct {
+         uint32_t first;
+         uint32_t last;
+     } ranges[] = {
+         {0x4E00, 0x9FFF},
+         {0x3400, 0x4DBF},
+         {0x20000, 0x2A6DF},
+         {0x2A700, 0x2B73F},
+         {0x2B740, 0x2B81F},
+         {0x2B820, 0x2CEAF},
+         {0x2CEB0, 0x2EBEF},
+         {0x3007, 0x3007},
+         {0x3021, 0x3029},
+         {0x3038, 0x303B},
+     };
+     for (size_t k = 0; k < sizeof ranges / sizeof ranges[0]; ++k) {
+         if (ch >= ranges[k].first && ch <= ranges[k].last) {
+             return true;
+         }
+     }
+     return false;
+ }
+ 
+ // Decode the UTF-8 character whose LAST byte is s[pos].
+ //
+ // Walks backwards over continuation bytes (10xxxxxx) to the lead byte and
+ // decodes the sequence. The decode only happens when the number of bytes
+ // walked matches the length announced by the lead byte; for truncated or
+ // otherwise malformed input *out_ch is set to U+FFFD instead, so bytes
+ // beyond s[pos] are never read.
+ //
+ // @param s       UTF-8 text.
+ // @param pos     Index of the last byte of the character; must be >= 0 and
+ //                inside the string.
+ // @param out_ch  Receives the decoded code point (or 0xFFFD).
+ // @return        Number of bytes stepped over, i.e. how far the caller
+ //                should move back to reach the start of this character.
+ static int prev_utf8_char(const char *s, int pos, uint32_t *out_ch) {
+     const unsigned char *bytes = (const unsigned char *)s;
+     int start = pos;
+     while (start > 0 && (bytes[start] & 0xC0) == 0x80) {
+         --start;
+     }
+     const unsigned char *p = bytes + start;
+     const int consumed = pos - start + 1;
+     // Sequence length announced by the lead byte; 0 means "not a lead byte".
+     int expected = 0;
+     if ((p[0] & 0x80) == 0) {
+         expected = 1;
+     } else if ((p[0] & 0xE0) == 0xC0) {
+         expected = 2;
+     } else if ((p[0] & 0xF0) == 0xE0) {
+         expected = 3;
+     } else if ((p[0] & 0xF8) == 0xF0) {
+         expected = 4;
+     }
+     if (expected != consumed) {
+         *out_ch = 0xFFFD;
+         return consumed;
+     }
+     // Every byte after the lead was verified to be a continuation byte by
+     // the backwards walk above, so the indexed reads below stay within
+     // s[start..pos].
+     switch (expected) {
+     case 1:
+         *out_ch = p[0];
+         break;
+     case 2:
+         *out_ch = (uint32_t)(((p[0] & 0x1F) << 6) | (p[1] & 0x3F));
+         break;
+     case 3:
+         *out_ch = (uint32_t)(((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) |
+                              (p[2] & 0x3F));
+         break;
+     default:
+         *out_ch = (uint32_t)(((p[0] & 0x07) << 18) | ((p[1] & 0x3F) << 12) |
+                              ((p[2] & 0x3F) << 6) | (p[3] & 0x3F));
+         break;
+     }
+     return consumed;
+ }
+ 
+ // True for the ASCII letters A-Z and a-z.
+ static bool is_ascii_letter(uint32_t ch) {
+     return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
+ }
+ 
+ // Collect the last `n` "words" of `str` into `output`, separated by single
+ // spaces and in their original order.
+ //
+ // A word is either a maximal run of ASCII letters, or a single character
+ // that is_cjk_ideograph() accepts or whose code point is above U+FF00.
+ // Everything else (digits, punctuation, whitespace, other scripts) is
+ // skipped and does not count towards `n`.
+ //
+ // Only offsets into `str` are recorded while scanning backwards, so no large
+ // scratch buffer is needed on the stack.
+ //
+ // @param str     NUL-terminated UTF-8 text; NULL or "" yields "".
+ // @param n       Maximum number of words; values <= 0 yield "", values above
+ //                256 are clamped to 256.
+ // @param output  Destination buffer; nothing is written if it is NULL. Each
+ //                word contributes at most 255 bytes, so n * 256 bytes is
+ //                always sufficient.
+ void get_last_n_words(const char *str, int n, char *output) {
+     enum { MAX_UNITS = 256, MAX_UNIT_BYTES = 255 };
+     if (!output) {
+         return;
+     }
+     output[0] = '\0';
+     if (!str || n <= 0) {
+         return;
+     }
+     if (n > MAX_UNITS) {
+         n = MAX_UNITS;
+     }
+     int unit_start[MAX_UNITS];
+     int unit_len[MAX_UNITS];
+     int unit_cnt = 0;
+     int pos = (int)strlen(str);
+     while (pos > 0 && unit_cnt < n) {
+         uint32_t ch;
+         int char_len = prev_utf8_char(str, pos - 1, &ch);
+         pos -= char_len;
+         if (is_ascii_letter(ch)) {
+             // Extend the word backwards over all preceding ASCII letters.
+             int word_end = pos + char_len;
+             int word_start = pos;
+             while (word_start > 0) {
+                 uint32_t tmp;
+                 int tmp_len = prev_utf8_char(str, word_start - 1, &tmp);
+                 if (!is_ascii_letter(tmp)) {
+                     break;
+                 }
+                 word_start -= tmp_len;
+             }
+             int wlen = word_end - word_start;
+             if (wlen > MAX_UNIT_BYTES) {
+                 wlen = MAX_UNIT_BYTES; // keep only the leading bytes of an over-long word
+             }
+             unit_start[unit_cnt] = word_start;
+             unit_len[unit_cnt] = wlen;
+             ++unit_cnt;
+             pos = word_start;
+         } else if (is_cjk_ideograph(ch) || ch > 0xFF00) {
+             if (char_len > MAX_UNIT_BYTES) {
+                 char_len = MAX_UNIT_BYTES;
+             }
+             unit_start[unit_cnt] = pos;
+             unit_len[unit_cnt] = char_len;
+             ++unit_cnt;
+         }
+     }
+     // Units were gathered back-to-front; emit them in reading order.
+     char *out = output;
+     for (int i = unit_cnt - 1; i >= 0; --i) {
+         if (i < unit_cnt - 1) {
+             *out++ = ' ';
+         }
+         memcpy(out, str + unit_start[i], (size_t)unit_len[i]);
+         out += unit_len[i];
+     }
+     *out = '\0';
+ }
+ 
+ // Locate where `anchor` ends inside `str`, comparing both after
+ // normalize_string() (punctuation/whitespace removed, lowercased).
+ //
+ // The anchor is searched in the normalized copy of `str`; the offset just
+ // past the first match is then mapped back onto the original text by
+ // counting the bytes that normalization keeps (those that are neither
+ // ispunct() nor isspace()).
+ //
+ // The scratch buffers are sized from the inputs, so long text cannot
+ // overflow them.
+ //
+ // @return Pointer into `str` just past the anchor. Returns `str` itself when
+ //         the anchor is NULL/empty or not found, when `str` is NULL, or when
+ //         memory cannot be allocated.
+ const char *find_anchor_end_position(const char *str, const char *anchor) {
+     if (!str || !anchor || !*anchor) {
+         return str;
+     }
+     char *normalized_str = malloc(strlen(str) + 1);
+     char *normalized_anchor = malloc(strlen(anchor) + 1);
+     const char *end = str;
+     if (normalized_str && normalized_anchor) {
+         normalize_string(str, normalized_str);
+         normalize_string(anchor, normalized_anchor);
+         const char *found = strstr(normalized_str, normalized_anchor);
+         if (found) {
+             size_t anchor_end_offset =
+                 (size_t)(found - normalized_str) + strlen(normalized_anchor);
+             size_t kept = 0;
+             while (*end != '\0' && kept < anchor_end_offset) {
+                 if (!ispunct((unsigned char)*end) && !isspace((unsigned char)*end)) {
+                     kept++;
+                 }
+                 end++;
+             }
+         }
+     }
+     free(normalized_str);
+     free(normalized_anchor);
+     return end;
+ }
+ 
+ // Skip leading punctuation and whitespace bytes (per ispunct()/isspace()).
+ // Returns a pointer to the first byte that is neither, or to the
+ // terminating NUL if the rest of the string consists only of such bytes.
+ const char *find_next_word_start(const char *str) {
+     const char *cursor = str;
+     for (;;) {
+         const unsigned char c = (unsigned char)*cursor;
+         if (c == '\0') {
+             break;
+         }
+         if (!ispunct(c) && !isspace(c)) {
+             break;
+         }
+         ++cursor;
+     }
+     return cursor;
+ }
+ 
+ // Return the part of `str2` that follows the tail of `str1`.
+ //
+ // The last `num_anchor_words` words of `str1` (see get_last_n_words()) form
+ // an anchor. The anchor is located in `str2` and the text after it, with
+ // leading punctuation/whitespace skipped, is returned.
+ //
+ //  - If both strings are effectively the same, "" is returned.
+ //  - If `str1` yields no anchor words, a copy of `str2` is returned.
+ //  - If the anchor is not found, find_anchor_end_position() reports the
+ //    start of `str2`, so `str2` minus leading punctuation is returned.
+ //
+ // The anchor buffer is heap-allocated with 256 bytes per requested word,
+ // which covers the 255-byte cap get_last_n_words() applies to each word
+ // plus its separator/terminator. The anchor is passed on without a separate
+ // normalization step because find_anchor_end_position() normalizes it.
+ //
+ // @return Newly allocated string the caller must free(), or NULL if an
+ //         argument is NULL or memory cannot be allocated.
+ char *get_difference_after_anchor(const char *str1, const char *str2, int num_anchor_words) {
+     enum { MAX_ANCHOR_WORDS = 256, BYTES_PER_WORD = 256 };
+     if (!str1 || !str2) {
+         return NULL;
+     }
+     if (are_strings_effectively_same(str1, str2)) {
+         return strdup("");
+     }
+     if (num_anchor_words > MAX_ANCHOR_WORDS) {
+         num_anchor_words = MAX_ANCHOR_WORDS;
+     }
+     size_t words = num_anchor_words > 0 ? (size_t)num_anchor_words : 1;
+     char *semantic_anchor = malloc(words * BYTES_PER_WORD);
+     if (!semantic_anchor) {
+         return NULL;
+     }
+     semantic_anchor[0] = '\0';
+     get_last_n_words(str1, num_anchor_words, semantic_anchor);
+     char *difference;
+     if (semantic_anchor[0] == '\0') {
+         difference = strdup(str2);
+     } else {
+         const char *anchor_end = find_anchor_end_position(str2, semantic_anchor);
+         difference = strdup(find_next_word_start(anchor_end));
+     }
+     free(semantic_anchor);
+     return difference;
+ }
+ 
+ // Release a PreviousSegment together with the sample buffer and text it
+ // owns. Passing NULL is a no-op.
+ void free_previous_segment(PreviousSegment* seg) {
+     if (seg == NULL) {
+         return;
+     }
+     // free(NULL) is a no-op, so the members need no individual checks.
+     free(seg->samples);
+     free(seg->text);
+     free(seg);
+ }
+ 
+ // Make an independent heap copy of a VAD speech segment and its recognized
+ // text. The sample data and the text are both duplicated.
+ //
+ // @return A new PreviousSegment to be released with free_previous_segment(),
+ //         or NULL if any allocation fails (nothing is leaked in that case).
+ PreviousSegment* copy_segment(const SherpaOnnxSpeechSegment* segment, const char* text) {
+     PreviousSegment *copy = (PreviousSegment *)malloc(sizeof(PreviousSegment));
+     if (copy == NULL) {
+         return NULL;
+     }
+     const size_t sample_bytes = segment->n * sizeof(float);
+     copy->n = segment->n;
+     copy->start = segment->start;
+     copy->text = NULL;
+     copy->samples = (float *)malloc(sample_bytes);
+     if (copy->samples != NULL) {
+         memcpy(copy->samples, segment->samples, sample_bytes);
+         copy->text = strdup(text);
+     }
+     if (copy->samples == NULL || copy->text == NULL) {
+         // Either allocation failed: undo whatever was acquired.
+         free(copy->samples);
+         free(copy);
+         return NULL;
+     }
+     return copy;
+ }
+ 
+ // Release an array of `count` transcription results, including the text
+ // owned by each entry. Passing NULL for `results` is a no-op.
+ void free_transcription_results(TranscriptionResult* results, int32_t count) {
+     if (results == NULL) {
+         return;
+     }
+     int32_t idx = 0;
+     while (idx < count) {
+         free(results[idx].text); // free(NULL) is harmless
+         ++idx;
+     }
+     free(results);
+ }
+ 
+ // Make sure index `count` of the result array is writable, growing the
+ // array (10 entries initially, then doubling) when it is full.
+ // Returns false, leaving *results and *capacity untouched, if realloc fails.
+ static bool reserve_result_slot(TranscriptionResult **results, int32_t *capacity,
+                                 int32_t count) {
+     if (count < *capacity) {
+         return true;
+     }
+     int32_t new_capacity = *capacity ? *capacity * 2 : 10;
+     TranscriptionResult *grown =
+         realloc(*results, (size_t)new_capacity * sizeof(TranscriptionResult));
+     if (!grown) {
+         return false;
+     }
+     *results = grown;
+     *capacity = new_capacity;
+     return true;
+ }
+ 
+ // Decode the previous segment's audio followed by `segment`'s audio as one
+ // utterance and return only the text that comes after the last three words
+ // of the previous segment's text ("Umm" if nothing new was recognized).
+ // Returns a malloc'd string, or NULL if an allocation fails.
+ static char *transcribe_merged_tail(const SherpaOnnxOfflineRecognizer *recognizer,
+                                     int32_t sample_rate, const PreviousSegment *prev,
+                                     const SherpaOnnxSpeechSegment *segment) {
+     int32_t merged_n = prev->n + segment->n;
+     float *merged_samples = malloc((size_t)merged_n * sizeof(float));
+     if (!merged_samples) {
+         return NULL;
+     }
+     memcpy(merged_samples, prev->samples, (size_t)prev->n * sizeof(float));
+     memcpy(merged_samples + prev->n, segment->samples, (size_t)segment->n * sizeof(float));
+ 
+     const SherpaOnnxOfflineStream *merged_stream =
+         SherpaOnnxCreateOfflineStream(recognizer);
+     SherpaOnnxAcceptWaveformOffline(merged_stream, sample_rate, merged_samples, merged_n);
+     SherpaOnnxDecodeOfflineStream(recognizer, merged_stream);
+     const SherpaOnnxOfflineRecognizerResult *merged_result =
+         SherpaOnnxGetOfflineStreamResult(merged_stream);
+ 
+     char *text = get_difference_after_anchor(prev->text, merged_result->text, 3);
+     if (text && text[0] == '\0') {
+         free(text);
+         text = strdup("Umm");
+     }
+ 
+     SherpaOnnxDestroyOfflineRecognizerResult(merged_result);
+     SherpaOnnxDestroyOfflineStream(merged_stream);
+     free(merged_samples);
+     return text;
+ }
+ 
+ // Main library function: run VAD over a 16 kHz WAV file and transcribe each
+ // detected speech segment with a SenseVoice offline recognizer.
+ //
+ // A segment shorter than 1.5 s that follows a segment of at least 1.5 s is
+ // re-decoded together with that previous segment's audio, and only the newly
+ // recognized tail is reported for it. After any short segment the remembered
+ // previous segment is cleared.
+ //
+ // Side effect: calls setlocale(LC_ALL, ""), which changes the process-wide
+ // locale.
+ //
+ // @param wav_filename            Input WAV file; must have a 16000 Hz sample rate.
+ // @param vad_model_path          Path containing "silero_vad.onnx" or
+ //                                "ten-vad.onnx"; selects the VAD settings.
+ // @param sense_voice_model_path  SenseVoice model file.
+ // @param tokens_path             Tokens file for the recognizer.
+ // @param result_count            Receives the number of returned entries;
+ //                                set to 0 on failure.
+ // @return Array of *result_count entries that the caller releases with
+ //         free_transcription_results(), or NULL on error (a message is
+ //         printed to stderr for file/model problems). NULL with
+ //         *result_count == 0 is also returned when no speech segment was
+ //         produced. Returns NULL immediately if any argument is NULL.
+ TranscriptionResult* process_audio_file(const char* wav_filename, const char* vad_model_path, 
+                                       const char* sense_voice_model_path, const char* tokens_path,
+                                       int32_t* result_count) {
+     if (!result_count) {
+         return NULL;
+     }
+     setlocale(LC_ALL, "");
+     *result_count = 0;
+     if (!wav_filename || !vad_model_path || !sense_voice_model_path || !tokens_path) {
+         return NULL;
+     }
+ 
+     // Validate input files
+     if (!SherpaOnnxFileExists(wav_filename)) {
+         fprintf(stderr, "Audio file %s does not exist\n", wav_filename);
+         return NULL;
+     }
+     if (!SherpaOnnxFileExists(vad_model_path)) {
+         fprintf(stderr, "VAD model %s does not exist\n", vad_model_path);
+         return NULL;
+     }
+     if (!SherpaOnnxFileExists(sense_voice_model_path)) {
+         fprintf(stderr, "SenseVoice model %s does not exist\n", sense_voice_model_path);
+         return NULL;
+     }
+     if (!SherpaOnnxFileExists(tokens_path)) {
+         fprintf(stderr, "Tokens file %s does not exist\n", tokens_path);
+         return NULL;
+     }
+ 
+     // Everything released at the `cleanup` label is declared before the
+     // first goto so that it always holds a defined value there.
+     TranscriptionResult *results = NULL;
+     int32_t results_capacity = 0;
+     int32_t results_count = 0;
+     PreviousSegment *prev_segment = NULL;
+     const SherpaOnnxOfflineRecognizer *recognizer = NULL;
+     const SherpaOnnxVoiceActivityDetector *vad = NULL;
+     bool failed = true;
+ 
+     // Read WAV file
+     const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+     if (wave == NULL) {
+         fprintf(stderr, "Failed to read %s\n", wav_filename);
+         return NULL;
+     }
+     if (wave->sample_rate != 16000) {
+         fprintf(stderr, "Expect sample rate 16000, got %d\n", wave->sample_rate);
+         goto cleanup;
+     }
+ 
+     // Initialize SenseVoice model config
+     SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config;
+     memset(&sense_voice_config, 0, sizeof(sense_voice_config));
+     sense_voice_config.model = sense_voice_model_path;
+     sense_voice_config.language = "auto";
+     sense_voice_config.use_itn = 1;
+ 
+     // Initialize offline model config
+     SherpaOnnxOfflineModelConfig offline_model_config;
+     memset(&offline_model_config, 0, sizeof(offline_model_config));
+     offline_model_config.debug = 0;
+     offline_model_config.num_threads = 1;
+     offline_model_config.provider = "cpu";
+     offline_model_config.tokens = tokens_path;
+     offline_model_config.sense_voice = sense_voice_config;
+ 
+     // Initialize recognizer config
+     SherpaOnnxOfflineRecognizerConfig recognizer_config;
+     memset(&recognizer_config, 0, sizeof(recognizer_config));
+     recognizer_config.decoding_method = "greedy_search";
+     recognizer_config.model_config = offline_model_config;
+ 
+     // Create recognizer
+     recognizer = SherpaOnnxCreateOfflineRecognizer(&recognizer_config);
+     if (recognizer == NULL) {
+         fprintf(stderr, "Failed to create recognizer\n");
+         goto cleanup;
+     }
+ 
+     // Initialize VAD config
+     SherpaOnnxVadModelConfig vadConfig;
+     memset(&vadConfig, 0, sizeof(vadConfig));
+     int32_t use_silero_vad = strstr(vad_model_path, "silero_vad.onnx") != NULL;
+     int32_t use_ten_vad = strstr(vad_model_path, "ten-vad.onnx") != NULL;
+ 
+     if (use_silero_vad) {
+         vadConfig.silero_vad.model = vad_model_path;
+         vadConfig.silero_vad.threshold = 0.25;
+         vadConfig.silero_vad.min_silence_duration = 1.5;
+         vadConfig.silero_vad.min_speech_duration = 0.3;
+         vadConfig.silero_vad.max_speech_duration = 20;
+         vadConfig.silero_vad.window_size = 512;
+     } else if (use_ten_vad) {
+         vadConfig.ten_vad.model = vad_model_path;
+         vadConfig.ten_vad.threshold = 0.25;
+         vadConfig.ten_vad.min_silence_duration = 0.5;
+         vadConfig.ten_vad.min_speech_duration = 0.5;
+         vadConfig.ten_vad.max_speech_duration = 10;
+         vadConfig.ten_vad.window_size = 256;
+     } else {
+         fprintf(stderr, "Unsupported VAD model: %s\n", vad_model_path);
+         goto cleanup;
+     }
+     vadConfig.sample_rate = 16000;
+     vadConfig.num_threads = 1;
+     vadConfig.debug = 1;
+ 
+     // Create VAD
+     vad = SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30);
+     if (vad == NULL) {
+         fprintf(stderr, "Failed to create VAD\n");
+         goto cleanup;
+     }
+ 
+     int32_t window_size = use_silero_vad ? vadConfig.silero_vad.window_size
+                                         : vadConfig.ten_vad.window_size;
+     int32_t i = 0;
+     int is_eof = 0;
+ 
+     // Process audio: feed one window per iteration, flushing the VAD once
+     // fewer than window_size + 1 samples remain.
+     while (!is_eof) {
+         if (i + window_size < wave->num_samples) {
+             SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, window_size);
+         } else {
+             SherpaOnnxVoiceActivityDetectorFlush(vad);
+             is_eof = 1;
+         }
+ 
+         while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {
+             const SherpaOnnxSpeechSegment *segment =
+                 SherpaOnnxVoiceActivityDetectorFront(vad);
+             float duration = segment->n / 16000.0f;
+ 
+             // Create stream for current segment
+             const SherpaOnnxOfflineStream *stream =
+                 SherpaOnnxCreateOfflineStream(recognizer);
+             SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate,
+                                           segment->samples, segment->n);
+             SherpaOnnxDecodeOfflineStream(recognizer, stream);
+             const SherpaOnnxOfflineRecognizerResult *result =
+                 SherpaOnnxGetOfflineStreamResult(stream);
+ 
+             float start = segment->start / 16000.0f;
+             float stop = start + duration;
+ 
+             // `text` stays NULL if growing the array or producing the
+             // transcription fails; that is the single failure signal below.
+             char *text = NULL;
+             if (reserve_result_slot(&results, &results_capacity, results_count)) {
+                 if (duration < 1.5f && prev_segment != NULL) {
+                     // Merge with previous segment
+                     text = transcribe_merged_tail(recognizer, wave->sample_rate,
+                                                   prev_segment, segment);
+                 } else {
+                     // Normal segment
+                     text = strdup(result->text);
+                 }
+             }
+ 
+             if (text != NULL) {
+                 results[results_count].start_time = start;
+                 results[results_count].end_time = stop;
+                 results[results_count].text = text;
+                 results_count++;
+ 
+                 // Remember this segment only if it lasted >= 1.5 seconds;
+                 // otherwise forget the previous one.
+                 free_previous_segment(prev_segment);
+                 prev_segment = duration >= 1.5f ? copy_segment(segment, result->text) : NULL;
+             }
+ 
+             SherpaOnnxDestroyOfflineRecognizerResult(result);
+             SherpaOnnxDestroyOfflineStream(stream);
+             SherpaOnnxDestroySpeechSegment(segment);
+             SherpaOnnxVoiceActivityDetectorPop(vad);
+ 
+             if (text == NULL) {
+                 goto cleanup;
+             }
+         }
+         i += window_size;
+     }
+     failed = false;
+ 
+ cleanup:
+     if (failed) {
+         free_transcription_results(results, results_count);
+         results = NULL;
+         results_count = 0;
+     }
+     free_previous_segment(prev_segment);
+     if (recognizer) {
+         SherpaOnnxDestroyOfflineRecognizer(recognizer);
+     }
+     if (vad) {
+         SherpaOnnxDestroyVoiceActivityDetector(vad);
+     }
+     SherpaOnnxFreeWave(wave);
+ 
+     *result_count = results_count;
+     return results;
+ }
\ No newline at end of file