Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
xuning
2025-09-30 12:52:06 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
ea72e0487d17c13009f73bd8d7a1217761081a29
ea72e048
1 parent
267f5b2f
尝试构建静态库
隐藏空白字符变更
内嵌
并排对比
正在显示
2 个修改的文件
包含
486 行增加
和
0 行删除
c-api-examples/CMakeLists.txt
c-api-examples/vad-sense-voice-lib.c
c-api-examples/CMakeLists.txt
查看文件 @
ea72e04
include(cargs)

include_directories(${PROJECT_SOURCE_DIR})

# vad-sense-voice-lib.c exposes process_audio_file() and defines no main(),
# so it cannot be linked as an executable. Build it as a static library.
# LINK_FLAGS "-static" is not applicable to static-library targets and is
# therefore no longer set here.
add_library(vad-sense-voice-lib STATIC vad-sense-voice-lib.c)
target_link_libraries(vad-sense-voice-lib sherpa-onnx-c-api)

add_executable(decode-file-c-api decode-file-c-api.c)
target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
...
...
c-api-examples/vad-sense-voice-lib.c
0 → 100644
查看文件 @
ea72e04
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <wchar.h>
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>
#include "sherpa-onnx/c-api/c-api.h"
#include "vad-sense-voice-lib.h"
/**
 * One transcribed span of audio, as returned by process_audio_file().
 *
 * `text` is heap-allocated; free_transcription_results() releases it.
 */
typedef struct {
  float start_time; /* where the span begins, in seconds */
  float end_time;   /* where the span ends, in seconds */
  char *text;       /* transcription text for the span */
} TranscriptionResult;
/**
 * Snapshot of the most recent speech segment, kept so that a following short
 * segment can be re-decoded together with it.
 *
 * Both pointers are owned by the struct; free_previous_segment() releases them.
 */
typedef struct {
  float *samples; /* private copy of the segment's audio samples */
  int32_t n;      /* number of entries in `samples` */
  int32_t start;  /* `start` value copied from the VAD segment */
  char *text;     /* private copy of the text recognized for the segment */
} PreviousSegment;
/*
 * Normalize `input` into `output`: ASCII punctuation and whitespace are
 * dropped and ASCII letters are lowercased.
 *
 * Bytes >= 0x80 (i.e. the bytes of UTF-8 multi-byte sequences) are copied
 * unchanged and never handed to <ctype.h>, so the result does not depend on
 * the current locale for non-ASCII text. Note that this means multi-byte
 * (e.g. CJK / full-width) punctuation is NOT removed: the previous code
 * compared a single `char` against values such as 0x3000, which can never
 * match, so those checks were dead and have been dropped. Keeping the filter
 * strictly byte-wise also keeps byte counts compatible with callers that map
 * normalized offsets back onto the original string.
 *
 * input  - NUL-terminated string; NULL is treated as the empty string.
 * output - destination buffer; must hold at least strlen(input) + 1 bytes
 *          (the result is never longer than the input). NULL is ignored.
 */
void normalize_string(const char *input, char *output) {
  if (output == NULL) {
    return;
  }
  if (input == NULL) {
    output[0] = '\0';
    return;
  }

  size_t out_len = 0;
  for (const unsigned char *p = (const unsigned char *)input; *p != '\0'; ++p) {
    const unsigned char c = *p;
    if (c >= 0x80) {
      /* Part of a UTF-8 multi-byte sequence: keep verbatim. */
      output[out_len++] = (char)c;
      continue;
    }
    if (ispunct(c) || isspace(c)) {
      continue;
    }
    output[out_len++] = (char)tolower(c);
  }
  output[out_len] = '\0';
}
/*
 * Return the first byte of `str` that is neither ASCII punctuation nor ASCII
 * whitespace. ASCII letters are returned lowercased.
 *
 * A byte >= 0x80 (part of a UTF-8 multi-byte sequence) counts as meaningful
 * and is returned unchanged without going through <ctype.h>, so the result
 * does not depend on the current locale. The previous comparisons of a single
 * `char` against values such as 0x3000 could never be true and were removed.
 *
 * Returns '\0' when `str` is NULL, empty, or contains only ASCII punctuation
 * and whitespace.
 */
char get_first_meaningful_char(const char *str) {
  if (str == NULL) {
    return '\0';
  }
  for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; ++p) {
    const unsigned char c = *p;
    if (c >= 0x80) {
      return (char)c;
    }
    if (!ispunct(c) && !isspace(c)) {
      return (char)tolower(c);
    }
  }
  return '\0';
}
/*
 * Compare two strings after normalize_string() has been applied to each.
 *
 * The scratch buffers are heap-allocated from the input lengths; the previous
 * fixed 1024-byte stack buffers overflowed for longer inputs because
 * normalize_string() takes no size limit.
 *
 * Returns 1 when the normalized forms are identical, 0 otherwise. If either
 * argument is NULL the result is 1 only when both are NULL. Returns 0 when
 * the scratch buffers cannot be allocated.
 */
int are_strings_effectively_same(const char *str1, const char *str2) {
  if (str1 == NULL || str2 == NULL) {
    return str1 == str2;
  }

  /* A normalized string is never longer than its source. */
  char *norm1 = malloc(strlen(str1) + 1);
  char *norm2 = malloc(strlen(str2) + 1);
  int same = 0;

  if (norm1 != NULL && norm2 != NULL) {
    normalize_string(str1, norm1);
    normalize_string(str2, norm2);
    same = (strcmp(norm1, norm2) == 0);
  }

  free(norm1);
  free(norm2);
  return same;
}
/*
 * Report whether code point `ch` falls inside one of the ranges this file
 * treats as CJK ideographs. All bounds are inclusive.
 */
static bool is_cjk_ideograph(uint32_t ch) {
  static const struct {
    uint32_t first;
    uint32_t last;
  } ranges[] = {
      {0x4E00, 0x9FFF},   {0x3400, 0x4DBF},   {0x20000, 0x2A6DF},
      {0x2A700, 0x2B73F}, {0x2B740, 0x2B81F}, {0x2B820, 0x2CEAF},
      {0x2CEB0, 0x2EBEF}, {0x3007, 0x3007},   {0x3021, 0x3029},
      {0x3038, 0x303B},
  };
  const unsigned range_count = (unsigned)(sizeof ranges / sizeof ranges[0]);

  for (unsigned k = 0; k < range_count; ++k) {
    if (ch >= ranges[k].first && ch <= ranges[k].last) {
      return true;
    }
  }
  return false;
}
/*
 * Decode the UTF-8 character that ends at byte index `pos` of `s`.
 *
 * The function walks backwards from `pos` over continuation bytes
 * (10xxxxxx) to find the candidate lead byte, then decodes forward.
 *
 * s      - byte string; indices 0..pos must be readable.
 * pos    - index of the LAST byte of the character to decode.
 * out_ch - receives the decoded code point, or U+FFFD when the bytes do not
 *          form exactly one well-shaped sequence (lone continuation bytes,
 *          invalid lead byte, or a lead byte whose announced length differs
 *          from the number of bytes actually present up to `pos`).
 *
 * Returns the number of bytes consumed (pos - start + 1), always >= 1.
 *
 * Only bytes inside [start, pos] are ever read. The earlier implementation
 * read as many trailing bytes as the lead byte announced, which could run
 * past `pos` (and past the terminator) on truncated input.
 */
static int prev_utf8_char(const char *s, int pos, uint32_t *out_ch) {
  int start = pos;
  while (start > 0 && (s[start] & 0xC0) == 0x80) {
    --start;
  }

  const unsigned char *p = (const unsigned char *)&s[start];
  const int consumed = pos - start + 1;

  /* Sequence length announced by the lead byte; 0 means "not a lead byte". */
  int announced;
  if ((p[0] & 0x80) == 0) {
    announced = 1;
  } else if ((p[0] & 0xE0) == 0xC0) {
    announced = 2;
  } else if ((p[0] & 0xF0) == 0xE0) {
    announced = 3;
  } else if ((p[0] & 0xF8) == 0xF0) {
    announced = 4;
  } else {
    announced = 0;
  }

  if (announced != consumed) {
    *out_ch = 0xFFFD;
    return consumed;
  }

  switch (announced) {
    case 1:
      *out_ch = p[0];
      break;
    case 2:
      *out_ch = ((uint32_t)(p[0] & 0x1F) << 6) | (uint32_t)(p[1] & 0x3F);
      break;
    case 3:
      *out_ch = ((uint32_t)(p[0] & 0x0F) << 12) |
                ((uint32_t)(p[1] & 0x3F) << 6) | (uint32_t)(p[2] & 0x3F);
      break;
    default: /* 4 */
      *out_ch = ((uint32_t)(p[0] & 0x07) << 18) |
                ((uint32_t)(p[1] & 0x3F) << 12) |
                ((uint32_t)(p[2] & 0x3F) << 6) | (uint32_t)(p[3] & 0x3F);
      break;
  }
  return consumed;
}
/*
 * Collect the last `n` "words" of the UTF-8 string `str` and write them to
 * `output` in their original order, separated by single spaces.
 *
 * A word is either
 *   - a maximal run of ASCII letters (A-Z, a-z), or
 *   - a single character for which is_cjk_ideograph() is true or whose code
 *     point is greater than 0xFF00.
 * Every other character (digits, punctuation, whitespace, ...) is skipped.
 *
 * str    - NUL-terminated input; NULL or "" yields an empty result.
 * n      - number of words wanted; values <= 0 yield an empty result and
 *          values above 256 are clamped to 256.
 * output - destination buffer; NULL is ignored. Each word contributes at
 *          most 255 bytes (longer runs are truncated to their first 255
 *          bytes), so n * 256 bytes is always sufficient; the result is also
 *          never longer than 2 * strlen(str) bytes plus the terminator.
 *
 * Fixes relative to the previous version: `output` is no longer dereferenced
 * when it is NULL, `n` can no longer index past the unit table, and the
 * 64 KiB stack array of copied words is replaced by offset/length records.
 */
void get_last_n_words(const char *str, int n, char *output) {
  enum { MAX_UNITS = 256, MAX_UNIT_BYTES = 255 };

  if (output == NULL) {
    return;
  }
  output[0] = '\0';
  if (str == NULL || n <= 0) {
    return;
  }
  if (n > MAX_UNITS) {
    n = MAX_UNITS;
  }

  int unit_start[MAX_UNITS];
  int unit_len[MAX_UNITS];
  int unit_cnt = 0;

  /* Scan backwards from the end of the string, one character at a time. */
  int pos = (int)strlen(str);
  while (pos > 0 && unit_cnt < n) {
    uint32_t ch;
    int char_len = prev_utf8_char(str, pos - 1, &ch);
    pos -= char_len;

    const int is_letter =
        (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');

    if (is_letter) {
      /* Extend the word leftwards over the whole run of ASCII letters. */
      const int word_end = pos + char_len;
      int word_start = pos;
      while (word_start > 0) {
        uint32_t prev_ch;
        int prev_len = prev_utf8_char(str, word_start - 1, &prev_ch);
        if (!((prev_ch >= 'A' && prev_ch <= 'Z') ||
              (prev_ch >= 'a' && prev_ch <= 'z'))) {
          break;
        }
        word_start -= prev_len;
      }

      int wlen = word_end - word_start;
      if (wlen > MAX_UNIT_BYTES) {
        wlen = MAX_UNIT_BYTES;
      }
      unit_start[unit_cnt] = word_start;
      unit_len[unit_cnt] = wlen;
      ++unit_cnt;
      pos = word_start;
    } else if (is_cjk_ideograph(ch) || ch > 0xFF00) {
      if (char_len > MAX_UNIT_BYTES) {
        char_len = MAX_UNIT_BYTES;
      }
      unit_start[unit_cnt] = pos;
      unit_len[unit_cnt] = char_len;
      ++unit_cnt;
    }
  }

  /* Units were collected last-to-first; emit them first-to-last. */
  char *w = output;
  for (int i = unit_cnt - 1; i >= 0; --i) {
    if (i < unit_cnt - 1) {
      *w++ = ' ';
    }
    memcpy(w, str + unit_start[i], (size_t)unit_len[i]);
    w += unit_len[i];
  }
  *w = '\0';
}
/*
 * Locate `anchor` inside `str`, ignoring punctuation, whitespace and ASCII
 * case (both are passed through normalize_string() first), and return a
 * pointer into `str` just past the last byte that belongs to the match.
 *
 * Returns `str` itself when `anchor` is NULL or empty, when `str` is NULL,
 * when the anchor does not occur, or when scratch memory cannot be allocated.
 *
 * The scratch buffers are sized from the inputs; the previous fixed
 * 1024-byte stack buffers overflowed on longer strings.
 */
const char *find_anchor_end_position(const char *str, const char *anchor) {
  if (str == NULL || anchor == NULL || *anchor == '\0') {
    return str;
  }

  /* A normalized string is never longer than its source. */
  char *normalized_str = malloc(strlen(str) + 1);
  char *normalized_anchor = malloc(strlen(anchor) + 1);
  const char *end = str;

  if (normalized_str != NULL && normalized_anchor != NULL) {
    normalize_string(str, normalized_str);
    normalize_string(anchor, normalized_anchor);

    const char *found = strstr(normalized_str, normalized_anchor);
    if (found != NULL) {
      /* Number of normalized bytes up to and including the match. */
      const size_t target =
          (size_t)(found - normalized_str) + strlen(normalized_anchor);

      /*
       * Walk the original string, counting only the bytes that survive
       * normalization, until `target` of them have been passed. This filter
       * must stay in step with the one in normalize_string().
       */
      size_t kept = 0;
      while (*end != '\0' && kept < target) {
        const unsigned char c = (unsigned char)*end;
        if (!ispunct(c) && !isspace(c)) {
          ++kept;
        }
        ++end;
      }
    }
  }

  free(normalized_str);
  free(normalized_anchor);
  return end;
}
/*
 * Skip any leading punctuation and whitespace (as classified by <ctype.h>)
 * and return a pointer to the first remaining byte of `str`, or to its
 * terminating NUL if nothing else is left.
 */
const char *find_next_word_start(const char *str) {
  const char *cursor = str;
  for (;;) {
    const unsigned char c = (unsigned char)*cursor;
    if (c == '\0') {
      break;
    }
    if (!ispunct(c) && !isspace(c)) {
      break;
    }
    ++cursor;
  }
  return cursor;
}
/*
 * Return the part of `str2` that follows the tail of `str1`.
 *
 * The last `num_anchor_words` words of `str1` (see get_last_n_words()) are
 * used as an anchor; the anchor is searched for in `str2` and everything
 * after it, minus leading punctuation/whitespace, is returned.
 *
 *   - If the two strings are equal after normalization, "" is returned.
 *   - If `str1` yields no anchor words, a copy of `str2` is returned.
 *   - If the anchor is not found in `str2`, `str2` with leading
 *     punctuation/whitespace skipped is returned.
 *
 * NULL arguments are treated as empty strings.
 *
 * The result is allocated with strdup() and must be released with free().
 * It is NULL only if that allocation fails.
 *
 * The anchor buffers are sized from `str1`; the previous 256-byte stack
 * buffers could be overrun because get_last_n_words() may emit up to 255
 * bytes per word.
 */
char *get_difference_after_anchor(const char *str1, const char *str2,
                                  int num_anchor_words) {
  if (str1 == NULL) {
    str1 = "";
  }
  if (str2 == NULL) {
    str2 = "";
  }

  if (are_strings_effectively_same(str1, str2)) {
    return strdup("");
  }

  /*
   * The anchor consists of non-overlapping pieces of str1 plus one space
   * between pieces, so it never exceeds 2 * strlen(str1) bytes.
   */
  const size_t anchor_cap = 2 * strlen(str1) + 2;
  char *semantic_anchor = malloc(anchor_cap);
  char *normalized_anchor = malloc(anchor_cap);
  char *difference = NULL;

  if (semantic_anchor == NULL || normalized_anchor == NULL) {
    /* Best effort without scratch memory: behave as if no anchor exists. */
    difference = strdup(str2);
  } else {
    semantic_anchor[0] = '\0';
    get_last_n_words(str1, num_anchor_words, semantic_anchor);

    if (semantic_anchor[0] == '\0') {
      difference = strdup(str2);
    } else {
      normalize_string(semantic_anchor, normalized_anchor);
      const char *anchor_end =
          find_anchor_end_position(str2, normalized_anchor);
      difference = strdup(find_next_word_start(anchor_end));
    }
  }

  free(semantic_anchor);
  free(normalized_anchor);
  return difference;
}
/*
 * Release a PreviousSegment together with the sample buffer and text it owns.
 * Passing NULL is allowed and does nothing.
 */
void free_previous_segment(PreviousSegment *seg) {
  if (seg == NULL) {
    return;
  }
  /* free(NULL) is a no-op, so the members need no individual checks. */
  free(seg->samples);
  free(seg->text);
  free(seg);
}
/*
 * Build a heap-allocated PreviousSegment holding private copies of the
 * samples of `segment` and of `text`.
 *
 * Returns the new object, or NULL if any allocation fails (nothing is leaked
 * in that case). The caller releases it with free_previous_segment().
 */
PreviousSegment *copy_segment(const SherpaOnnxSpeechSegment *segment,
                              const char *text) {
  PreviousSegment *copy = (PreviousSegment *)malloc(sizeof(PreviousSegment));
  if (copy == NULL) {
    return NULL;
  }

  copy->n = segment->n;
  copy->start = segment->start;

  copy->samples = (float *)malloc(segment->n * sizeof(float));
  if (copy->samples != NULL) {
    memcpy(copy->samples, segment->samples, segment->n * sizeof(float));

    copy->text = strdup(text);
    if (copy->text != NULL) {
      return copy;
    }
    /* Text duplication failed: undo the sample allocation. */
    free(copy->samples);
  }

  free(copy);
  return NULL;
}
/*
 * Release an array of `count` TranscriptionResult entries, including the
 * text owned by each entry. A NULL `results` pointer is ignored.
 */
void free_transcription_results(TranscriptionResult *results, int32_t count) {
  if (results == NULL) {
    return;
  }

  int32_t idx = 0;
  while (idx < count) {
    /* free(NULL) is a no-op, so entries without text are fine. */
    free(results[idx].text);
    ++idx;
  }
  free(results);
}
// Main library function
TranscriptionResult
*
process_audio_file
(
const
char
*
wav_filename
,
const
char
*
vad_model_path
,
const
char
*
sense_voice_model_path
,
const
char
*
tokens_path
,
int32_t
*
result_count
)
{
setlocale
(
LC_ALL
,
""
);
*
result_count
=
0
;
// Validate input files
if
(
!
SherpaOnnxFileExists
(
wav_filename
))
{
fprintf
(
stderr
,
"Audio file %s does not exist
\n
"
,
wav_filename
);
return
NULL
;
}
if
(
!
SherpaOnnxFileExists
(
vad_model_path
))
{
fprintf
(
stderr
,
"VAD model %s does not exist
\n
"
,
vad_model_path
);
return
NULL
;
}
if
(
!
SherpaOnnxFileExists
(
sense_voice_model_path
))
{
fprintf
(
stderr
,
"SenseVoice model %s does not exist
\n
"
,
sense_voice_model_path
);
return
NULL
;
}
if
(
!
SherpaOnnxFileExists
(
tokens_path
))
{
fprintf
(
stderr
,
"Tokens file %s does not exist
\n
"
,
tokens_path
);
return
NULL
;
}
// Read WAV file
const
SherpaOnnxWave
*
wave
=
SherpaOnnxReadWave
(
wav_filename
);
if
(
wave
==
NULL
)
{
fprintf
(
stderr
,
"Failed to read %s
\n
"
,
wav_filename
);
return
NULL
;
}
if
(
wave
->
sample_rate
!=
16000
)
{
fprintf
(
stderr
,
"Expect sample rate 16000, got %d
\n
"
,
wave
->
sample_rate
);
SherpaOnnxFreeWave
(
wave
);
return
NULL
;
}
// Initialize SenseVoice model config
SherpaOnnxOfflineSenseVoiceModelConfig
sense_voice_config
;
memset
(
&
sense_voice_config
,
0
,
sizeof
(
sense_voice_config
));
sense_voice_config
.
model
=
sense_voice_model_path
;
sense_voice_config
.
language
=
"auto"
;
sense_voice_config
.
use_itn
=
1
;
// Initialize offline model config
SherpaOnnxOfflineModelConfig
offline_model_config
;
memset
(
&
offline_model_config
,
0
,
sizeof
(
offline_model_config
));
offline_model_config
.
debug
=
0
;
offline_model_config
.
num_threads
=
1
;
offline_model_config
.
provider
=
"cpu"
;
offline_model_config
.
tokens
=
tokens_path
;
offline_model_config
.
sense_voice
=
sense_voice_config
;
// Initialize recognizer config
SherpaOnnxOfflineRecognizerConfig
recognizer_config
;
memset
(
&
recognizer_config
,
0
,
sizeof
(
recognizer_config
));
recognizer_config
.
decoding_method
=
"greedy_search"
;
recognizer_config
.
model_config
=
offline_model_config
;
// Create recognizer
const
SherpaOnnxOfflineRecognizer
*
recognizer
=
SherpaOnnxCreateOfflineRecognizer
(
&
recognizer_config
);
if
(
recognizer
==
NULL
)
{
fprintf
(
stderr
,
"Failed to create recognizer
\n
"
);
SherpaOnnxFreeWave
(
wave
);
return
NULL
;
}
// Initialize VAD config
SherpaOnnxVadModelConfig
vadConfig
;
memset
(
&
vadConfig
,
0
,
sizeof
(
vadConfig
));
int32_t
use_silero_vad
=
strstr
(
vad_model_path
,
"silero_vad.onnx"
)
!=
NULL
;
int32_t
use_ten_vad
=
strstr
(
vad_model_path
,
"ten-vad.onnx"
)
!=
NULL
;
if
(
use_silero_vad
)
{
vadConfig
.
silero_vad
.
model
=
vad_model_path
;
vadConfig
.
silero_vad
.
threshold
=
0
.
25
;
vadConfig
.
silero_vad
.
min_silence_duration
=
1
.
5
;
vadConfig
.
silero_vad
.
min_speech_duration
=
0
.
3
;
vadConfig
.
silero_vad
.
max_speech_duration
=
20
;
vadConfig
.
silero_vad
.
window_size
=
512
;
}
else
if
(
use_ten_vad
)
{
vadConfig
.
ten_vad
.
model
=
vad_model_path
;
vadConfig
.
ten_vad
.
threshold
=
0
.
25
;
vadConfig
.
ten_vad
.
min_silence_duration
=
0
.
5
;
vadConfig
.
ten_vad
.
min_speech_duration
=
0
.
5
;
vadConfig
.
ten_vad
.
max_speech_duration
=
10
;
vadConfig
.
ten_vad
.
window_size
=
256
;
}
else
{
fprintf
(
stderr
,
"Unsupported VAD model: %s
\n
"
,
vad_model_path
);
SherpaOnnxDestroyOfflineRecognizer
(
recognizer
);
SherpaOnnxFreeWave
(
wave
);
return
NULL
;
}
vadConfig
.
sample_rate
=
16000
;
vadConfig
.
num_threads
=
1
;
vadConfig
.
debug
=
1
;
// Create VAD
const
SherpaOnnxVoiceActivityDetector
*
vad
=
SherpaOnnxCreateVoiceActivityDetector
(
&
vadConfig
,
30
);
if
(
vad
==
NULL
)
{
fprintf
(
stderr
,
"Failed to create VAD
\n
"
);
SherpaOnnxDestroyOfflineRecognizer
(
recognizer
);
SherpaOnnxFreeWave
(
wave
);
return
NULL
;
}
// Initialize result array
TranscriptionResult
*
results
=
NULL
;
int32_t
results_capacity
=
0
;
int32_t
results_count
=
0
;
int32_t
window_size
=
use_silero_vad
?
vadConfig
.
silero_vad
.
window_size
:
vadConfig
.
ten_vad
.
window_size
;
int32_t
i
=
0
;
int
is_eof
=
0
;
PreviousSegment
*
prev_segment
=
NULL
;
// Process audio
while
(
!
is_eof
)
{
if
(
i
+
window_size
<
wave
->
num_samples
)
{
SherpaOnnxVoiceActivityDetectorAcceptWaveform
(
vad
,
wave
->
samples
+
i
,
window_size
);
}
else
{
SherpaOnnxVoiceActivityDetectorFlush
(
vad
);
is_eof
=
1
;
}
while
(
!
SherpaOnnxVoiceActivityDetectorEmpty
(
vad
))
{
const
SherpaOnnxSpeechSegment
*
segment
=
SherpaOnnxVoiceActivityDetectorFront
(
vad
);
float
duration
=
segment
->
n
/
16000
.
0
f
;
// Create stream for current segment
const
SherpaOnnxOfflineStream
*
stream
=
SherpaOnnxCreateOfflineStream
(
recognizer
);
SherpaOnnxAcceptWaveformOffline
(
stream
,
wave
->
sample_rate
,
segment
->
samples
,
segment
->
n
);
SherpaOnnxDecodeOfflineStream
(
recognizer
,
stream
);
const
SherpaOnnxOfflineRecognizerResult
*
result
=
SherpaOnnxGetOfflineStreamResult
(
stream
);
float
start
=
segment
->
start
/
16000
.
0
f
;
float
stop
=
start
+
duration
;
// Resize results array if necessary
if
(
results_count
>=
results_capacity
)
{
results_capacity
=
results_capacity
?
results_capacity
*
2
:
10
;
TranscriptionResult
*
new_results
=
(
TranscriptionResult
*
)
realloc
(
results
,
results_capacity
*
sizeof
(
TranscriptionResult
));
if
(
!
new_results
)
{
free_transcription_results
(
results
,
results_count
);
free_previous_segment
(
prev_segment
);
SherpaOnnxDestroyOfflineRecognizerResult
(
result
);
SherpaOnnxDestroyOfflineStream
(
stream
);
SherpaOnnxDestroySpeechSegment
(
segment
);
SherpaOnnxVoiceActivityDetectorPop
(
vad
);
SherpaOnnxDestroyOfflineRecognizer
(
recognizer
);
SherpaOnnxDestroyVoiceActivityDetector
(
vad
);
SherpaOnnxFreeWave
(
wave
);
return
NULL
;
}
results
=
new_results
;
}
if
(
duration
<
1
.
5
f
&&
prev_segment
!=
NULL
)
{
// Merge with previous segment
int32_t
merged_n
=
prev_segment
->
n
+
segment
->
n
;
float
*
merged_samples
=
(
float
*
)
malloc
(
merged_n
*
sizeof
(
float
));
memcpy
(
merged_samples
,
prev_segment
->
samples
,
prev_segment
->
n
*
sizeof
(
float
));
memcpy
(
merged_samples
+
prev_segment
->
n
,
segment
->
samples
,
segment
->
n
*
sizeof
(
float
));
const
SherpaOnnxOfflineStream
*
merged_stream
=
SherpaOnnxCreateOfflineStream
(
recognizer
);
SherpaOnnxAcceptWaveformOffline
(
merged_stream
,
wave
->
sample_rate
,
merged_samples
,
merged_n
);
SherpaOnnxDecodeOfflineStream
(
recognizer
,
merged_stream
);
const
SherpaOnnxOfflineRecognizerResult
*
merged_result
=
SherpaOnnxGetOfflineStreamResult
(
merged_stream
);
char
*
diff_text
=
get_difference_after_anchor
(
prev_segment
->
text
,
merged_result
->
text
,
3
);
results
[
results_count
].
start_time
=
start
;
results
[
results_count
].
end_time
=
stop
;
results
[
results_count
].
text
=
strdup
(
strlen
(
diff_text
)
==
0
?
"Umm"
:
diff_text
);
SherpaOnnxDestroyOfflineRecognizerResult
(
merged_result
);
SherpaOnnxDestroyOfflineStream
(
merged_stream
);
free
(
merged_samples
);
free
(
diff_text
);
}
else
{
// Normal segment
results
[
results_count
].
start_time
=
start
;
results
[
results_count
].
end_time
=
stop
;
results
[
results_count
].
text
=
strdup
(
result
->
text
);
}
if
(
!
results
[
results_count
].
text
)
{
free_transcription_results
(
results
,
results_count
);
free_previous_segment
(
prev_segment
);
SherpaOnnxDestroyOfflineRecognizerResult
(
result
);
SherpaOnnxDestroyOfflineStream
(
stream
);
SherpaOnnxDestroySpeechSegment
(
segment
);
SherpaOnnxVoiceActivityDetectorPop
(
vad
);
SherpaOnnxDestroyOfflineRecognizer
(
recognizer
);
SherpaOnnxDestroyVoiceActivityDetector
(
vad
);
SherpaOnnxFreeWave
(
wave
);
return
NULL
;
}
results_count
++
;
// Update previous segment if duration >= 1.5 seconds
if
(
duration
>=
1
.
5
f
)
{
if
(
prev_segment
)
free_previous_segment
(
prev_segment
);
prev_segment
=
copy_segment
(
segment
,
result
->
text
);
}
else
{
if
(
prev_segment
)
{
free_previous_segment
(
prev_segment
);
prev_segment
=
NULL
;
}
}
SherpaOnnxDestroyOfflineRecognizerResult
(
result
);
SherpaOnnxDestroyOfflineStream
(
stream
);
SherpaOnnxDestroySpeechSegment
(
segment
);
SherpaOnnxVoiceActivityDetectorPop
(
vad
);
}
i
+=
window_size
;
}
// Clean up
if
(
prev_segment
)
free_previous_segment
(
prev_segment
);
SherpaOnnxDestroyOfflineRecognizer
(
recognizer
);
SherpaOnnxDestroyVoiceActivityDetector
(
vad
);
SherpaOnnxFreeWave
(
wave
);
*
result_count
=
results_count
;
return
results
;
}
\ No newline at end of file
...
...
请
注册
或
登录
后发表评论