Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-10-25 14:55:27 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-10-25 14:55:27 +0800
Commit
29a5d06691fe9d9e8cd12896d425ccbf91b73408
29a5d066
1 parent
6e5efa48
Fix utf8 splitting for English (#386)
隐藏空白字符变更
内嵌
并排对比
正在显示
1 个修改的文件
包含
54 行增加
和
1 行删除
sherpa-onnx/csrc/text-utils.cc
sherpa-onnx/csrc/text-utils.cc
查看文件 @
29a5d06
...
...
@@ -162,10 +162,63 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
bool
omit_empty_strings
,
std
::
vector
<
double
>
*
out
);
// Merges runs of consecutive single-byte entries of `words` into one string
// per run, so that a word which was split into individual characters (see the
// caller, SplitUtf8()) is restored to a single entry.
//
// Rules applied to each entry, in order:
//  - An entry longer than one byte (presumably a non-ASCII UTF-8 character),
//    or a single-byte punctuation/whitespace character, terminates the
//    pending run. The run, if any, is emitted first; then the entry itself is
//    emitted unless its first byte is whitespace, in which case it is dropped.
//  - Any other single-byte entry joins (or starts) the pending run.
//  - An empty entry is logged and skipped; it does not terminate the run.
//
// @param words  Entries to merge.
// @return The merged entries, in their original order.
static std::vector<std::string> MergeCharactersIntoWords(
    const std::vector<std::string> &words) {
  // The <cctype> classifiers have undefined behavior when given a negative
  // value other than EOF. The first byte of a multi-byte entry is >= 0x80,
  // which is negative when `char` is signed, so always go through
  // unsigned char before classifying.
  auto is_space = [](char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  auto is_punct = [](char c) {
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
  };

  std::vector<std::string> ans;

  int32_t n = static_cast<int32_t>(words.size());
  int32_t i = 0;

  // Index of the first entry of the pending run, or -1 if there is no run.
  int32_t prev = -1;

  while (i < n) {
    const auto &w = words[i];

    if (w.size() > 1 ||
        (w.size() == 1 && (is_punct(w[0]) || is_space(w[0])))) {
      if (prev != -1) {
        // Flush the pending run words[prev, i) as a single merged entry.
        std::string t;
        for (; prev < i; ++prev) {
          t.append(words[prev]);
        }
        prev = -1;
        ans.push_back(std::move(t));
      }

      // Whitespace only acts as a separator; it is never emitted.
      if (!is_space(w[0])) {
        ans.push_back(w);
      }

      ++i;
      continue;
    }

    if (w.size() == 1) {
      if (prev == -1) {
        prev = i;
      }
      ++i;
      continue;
    }

    // Only empty entries reach this point.
    SHERPA_ONNX_LOGE("Ignore %s", w.c_str());
    ++i;
  }

  // Flush a run that extends to the end of the input.
  if (prev != -1) {
    std::string t;
    for (; prev < i; ++prev) {
      t.append(words[prev]);
    }
    ans.push_back(std::move(t));
  }

  return ans;
}
std
::
vector
<
std
::
string
>
SplitUtf8
(
const
std
::
string
&
text
)
{
const
uint8_t
*
begin
=
reinterpret_cast
<
const
uint8_t
*>
(
text
.
c_str
());
const
uint8_t
*
end
=
begin
+
text
.
size
();
// Note that English words are split into single characters.
// We need to invoke MergeCharactersIntoWords() to merge them
std
::
vector
<
std
::
string
>
ans
;
auto
start
=
begin
;
...
...
@@ -195,7 +248,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
}
}
return
ans
;
return
MergeCharactersIntoWords
(
ans
)
;
}
}
// namespace sherpa_onnx
...
...
请
注册
或
登录
后发表评论