Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-10-25 11:49:27 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-10-25 11:49:27 +0800
Commit
6e5efa48c553ab5a10a10d06faf7aba7226b6737
6e5efa48
1 parent
1249710e
Fix splitting utf8 string into words (#385)
隐藏空白字符变更
内嵌
并排对比
正在显示
3 个修改的文件
包含
26 行增加
和
90 行删除
CMakeLists.txt
cmake/utfcpp.cmake
sherpa-onnx/csrc/text-utils.cc
CMakeLists.txt
查看文件 @
6e5efa4
cmake_minimum_required
(
VERSION 3.13 FATAL_ERROR
)
project
(
sherpa-onnx
)
set
(
SHERPA_ONNX_VERSION
"1.8.
4
"
)
set
(
SHERPA_ONNX_VERSION
"1.8.
5
"
)
# Disable warning about
#
...
...
@@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET)
include
(
asio
)
endif
()
include
(
utfcpp
)
add_subdirectory
(
sherpa-onnx
)
if
(
SHERPA_ONNX_ENABLE_C_API
)
...
...
cmake/utfcpp.cmake
已删除
100644 → 0
查看文件 @
1249710
function
(
download_utfcpp
)
include
(
FetchContent
)
set
(
utfcpp_URL
"https://github.com/nemtrif/utfcpp/archive/refs/tags/v3.2.5.tar.gz"
)
set
(
utfcpp_URL2
"https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/utfcpp-3.2.5.tar.gz"
)
set
(
utfcpp_HASH
"SHA256=14fd1b3c466814cb4c40771b7f207b61d2c7a0aa6a5e620ca05c00df27f25afd"
)
# If you don't have access to the Internet,
# please pre-download utfcpp
set
(
possible_file_locations
$ENV{HOME}/Downloads/utfcpp-3.2.5.tar.gz
${
PROJECT_SOURCE_DIR
}
/utfcpp-3.2.5.tar.gz
${
PROJECT_BINARY_DIR
}
/utfcpp-3.2.5.tar.gz
/tmp/utfcpp-3.2.5.tar.gz
/star-fj/fangjun/download/github/utfcpp-3.2.5.tar.gz
)
foreach
(
f IN LISTS possible_file_locations
)
if
(
EXISTS
${
f
}
)
set
(
utfcpp_URL
"
${
f
}
"
)
file
(
TO_CMAKE_PATH
"
${
utfcpp_URL
}
"
utfcpp_URL
)
message
(
STATUS
"Found local downloaded utfcpp:
${
utfcpp_URL
}
"
)
set
(
utfcpp_URL2
)
break
()
endif
()
endforeach
()
FetchContent_Declare
(
utfcpp
URL
${
utfcpp_URL
}
${
utfcpp_URL2
}
URL_HASH
${
utfcpp_HASH
}
)
FetchContent_GetProperties
(
utfcpp
)
if
(
NOT utfcpp_POPULATED
)
message
(
STATUS
"Downloading utfcpp from
${
utfcpp_URL
}
"
)
FetchContent_Populate
(
utfcpp
)
endif
()
message
(
STATUS
"utfcpp is downloaded to
${
utfcpp_SOURCE_DIR
}
"
)
# add_subdirectory(${utfcpp_SOURCE_DIR} ${utfcpp_BINARY_DIR} EXCLUDE_FROM_ALL)
include_directories
(
${
utfcpp_SOURCE_DIR
}
)
endfunction
()
download_utfcpp
()
sherpa-onnx/csrc/text-utils.cc
查看文件 @
6e5efa4
...
...
@@ -16,7 +16,7 @@
#include <utility>
#include <vector>
#include "s
ource/utf8
.h"
#include "s
herpa-onnx/csrc/macros
.h"
// This file is copied/modified from
// https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc
...
...
@@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
std
::
vector
<
double
>
*
out
);
std
::
vector
<
std
::
string
>
SplitUtf8
(
const
std
::
string
&
text
)
{
char
*
begin
=
const_cast
<
char
*>
(
text
.
c_str
());
char
*
end
=
begin
+
text
.
size
();
const
uint8_t
*
begin
=
reinterpret_cast
<
const
uint8_t
*>
(
text
.
c_str
());
const
uint8_t
*
end
=
begin
+
text
.
size
();
std
::
vector
<
std
::
string
>
ans
;
std
::
string
buf
;
while
(
begin
<
end
)
{
uint32_t
code
=
utf8
::
next
(
begin
,
end
);
auto
start
=
begin
;
while
(
start
<
end
)
{
uint8_t
c
=
*
start
;
uint8_t
i
=
0x80
;
int32_t
num_bytes
=
0
;
// 1. is punctuation
if
(
std
::
ispunct
(
code
))
{
if
(
!
buf
.
empty
())
{
ans
.
push_back
(
std
::
move
(
buf
));
}
char
s
[
5
]
=
{
0
};
utf8
::
append
(
code
,
s
);
ans
.
push_back
(
s
);
continue
;
}
// 2. is space
if
(
std
::
isspace
(
code
))
{
if
(
!
buf
.
empty
())
{
ans
.
push_back
(
std
::
move
(
buf
));
}
continue
;
}
// 3. is alpha
if
(
std
::
isalpha
(
code
))
{
buf
.
push_back
(
code
);
continue
;
// see
// https://en.wikipedia.org/wiki/UTF-8
for
(;
c
&
i
;
i
>>=
1
)
{
++
num_bytes
;
}
if
(
!
buf
.
empty
())
{
ans
.
push_back
(
std
::
move
(
buf
));
if
(
num_bytes
==
0
)
{
// this is an ascii
ans
.
emplace_back
(
reinterpret_cast
<
const
char
*>
(
start
),
1
);
++
start
;
}
else
if
(
2
<=
num_bytes
&&
num_bytes
<=
4
)
{
ans
.
emplace_back
(
reinterpret_cast
<
const
char
*>
(
start
),
num_bytes
);
start
+=
num_bytes
;
}
else
{
SHERPA_ONNX_LOGE
(
"Invalid byte at position: %d"
,
static_cast
<
int32_t
>
(
start
-
begin
));
// skip this byte
++
start
;
}
// for others
char
s
[
5
]
=
{
0
};
utf8
::
append
(
code
,
s
);
ans
.
push_back
(
s
);
}
if
(
!
buf
.
empty
())
{
ans
.
push_back
(
std
::
move
(
buf
));
}
return
ans
;
}
}
// namespace sherpa_onnx
...
...
请
注册
或
登录
后发表评论