Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2024-12-25 19:32:13 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2024-12-25 19:32:13 +0800
Commit
b6f0f5fc2eb422d47546f388443c1725b85c728a
b6f0f5fc
1 parent
08d77133
Support removing invalid utf-8 sequences. (#1648)
隐藏空白字符变更
内嵌
并排对比
正在显示
6 个修改的文件
包含
164 行增加
和
0 行删除
sherpa-onnx/csrc/CMakeLists.txt
sherpa-onnx/csrc/offline-recognizer-impl.cc
sherpa-onnx/csrc/online-recognizer-impl.cc
sherpa-onnx/csrc/text-utils-test.cc
sherpa-onnx/csrc/text-utils.cc
sherpa-onnx/csrc/text-utils.h
sherpa-onnx/csrc/CMakeLists.txt
查看文件 @
b6f0f5f
...
...
@@ -545,6 +545,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
pad-sequence-test.cc
slice-test.cc
stack-test.cc
text-utils-test.cc
text2token-test.cc
transpose-test.cc
unbind-test.cc
...
...
sherpa-onnx/csrc/offline-recognizer-impl.cc
查看文件 @
b6f0f5f
...
...
@@ -488,6 +488,8 @@ OfflineRecognizerImpl::OfflineRecognizerImpl(
std
::
string
OfflineRecognizerImpl
::
ApplyInverseTextNormalization
(
std
::
string
text
)
const
{
text
=
RemoveInvalidUtf8Sequences
(
text
);
if
(
!
itn_list_
.
empty
())
{
for
(
const
auto
&
tn
:
itn_list_
)
{
text
=
tn
->
Normalize
(
text
);
...
...
sherpa-onnx/csrc/online-recognizer-impl.cc
查看文件 @
b6f0f5f
...
...
@@ -194,6 +194,8 @@ OnlineRecognizerImpl::OnlineRecognizerImpl(Manager *mgr,
std
::
string
OnlineRecognizerImpl
::
ApplyInverseTextNormalization
(
std
::
string
text
)
const
{
text
=
RemoveInvalidUtf8Sequences
(
text
);
if
(
!
itn_list_
.
empty
())
{
for
(
const
auto
&
tn
:
itn_list_
)
{
text
=
tn
->
Normalize
(
text
);
...
...
sherpa-onnx/csrc/text-utils-test.cc
0 → 100644
查看文件 @
b6f0f5f
// sherpa-onnx/csrc/text-utils-test.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include "sherpa-onnx/csrc/text-utils.h"
#include "gtest/gtest.h"
namespace sherpa_onnx {

// Corrupts one byte of a 3-byte, a 2-byte and a 4-byte UTF-8 character in
// turn and checks that RemoveInvalidUtf8Sequences() drops exactly the bytes
// of the damaged character while leaving every other byte untouched.
TEST(RemoveInvalidUtf8Sequences, Case1) {
  const std::vector<uint8_t> v = {
      0xe4, 0xbb, 0x8a,                                // 今 (index 0..2)
      0xe5, 0xa4, 0xa9,                                // 天 (index 3..5)
      'i',  's',  ' ',  'M', 'o', 'd', 'a', 'y', ',',  // "is Moday,"
      ' ',  'w',  'i',  'e', ' ', 'h', 'e', 'i',       // " wie hei"
      0xc3, 0x9f,                                      // ß (index 23..24)
      'e',  'n',  ' ',  'S', 'i', 'e',                 // "en Sie"
      0xf0, 0x9d, 0x84, 0x81,  // a 4-byte character (index 31..34)
  };

  // Case 1: break the 3-byte character at the very beginning.
  std::vector<uint8_t> v0 = v;
  v0[1] = 0xc0;  // make the first 3 bytes an invalid utf8 character

  std::string s0{v0.begin(), v0.end()};
  EXPECT_EQ(s0.size(), v0.size());

  auto s = RemoveInvalidUtf8Sequences(s0);  // should remove 今

  // 0xe4 has no valid continuation, 0xc0 is never a valid lead byte and
  // 0x8a is a stray continuation byte, so all three bytes are dropped.
  EXPECT_EQ(s.size() + 3, v.size());
  EXPECT_EQ(s, std::string(v.begin() + 3, v.end()));

  // Case 2: break the 2-byte character in the middle.
  v0 = v;
  // v0[23] == 0xc3
  // v0[24] == 0x9f
  v0[23] = 0xc1;  // 0xc1 is never a valid lead byte
  s0 = {v0.begin(), v0.end()};

  s = RemoveInvalidUtf8Sequences(s0);  // should remove ß

  EXPECT_EQ(s.size() + 2, v.size());
  {
    std::string expected(v.begin(), v.begin() + 23);
    expected.append(v.begin() + 25, v.end());
    EXPECT_EQ(s, expected);
  }

  // Case 3: break the 4-byte character at the end.
  v0 = v;
  // v0[31] = 0xf0;
  // v0[32] = 0x9d;
  // v0[33] = 0x84;
  // v0[34] = 0x81;
  v0[31] = 0xf5;  // lead bytes above 0xf4 are not well-formed
  s0 = {v0.begin(), v0.end()};

  s = RemoveInvalidUtf8Sequences(s0);  // should remove the last 4 bytes

  EXPECT_EQ(s.size() + 4, v.size());
  EXPECT_EQ(s, std::string(v.begin(), v.begin() + 31));
}

}  // namespace sherpa_onnx
...
...
sherpa-onnx/csrc/text-utils.cc
查看文件 @
b6f0f5f
...
...
@@ -396,4 +396,110 @@ void ToLowerCase(std::string *in_out) {
[](
unsigned
char
c
)
{
return
std
::
tolower
(
c
);
});
}
// Returns true if and only if `x` lies inside the closed interval
// [low, high]; both end points are included.
static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
  if (x < low) {
    return false;
  }

  return !(high < x);
}
/*
Please see
https://stackoverflow.com/questions/6555015/check-for-invalid-utf8

Table 3-7. Well-Formed UTF-8 Byte Sequences

Code Points          First Byte  Second Byte  Third Byte  Fourth Byte
U+0000..U+007F       00..7F
U+0080..U+07FF       C2..DF      80..BF
U+0800..U+0FFF       E0          A0..BF       80..BF
U+1000..U+CFFF       E1..EC      80..BF       80..BF
U+D000..U+D7FF       ED          80..9F       80..BF
U+E000..U+FFFF       EE..EF      80..BF       80..BF
U+10000..U+3FFFF     F0          90..BF       80..BF      80..BF
U+40000..U+FFFFF     F1..F3      80..BF       80..BF      80..BF
U+100000..U+10FFFF   F4          80..8F       80..BF      80..BF
*/
// Returns a copy of `text` that keeps only well-formed UTF-8 sequences.
//
// The input is scanned from left to right. At each position the lead byte is
// matched against the rows of "Table 3-7. Well-Formed UTF-8 Byte Sequences".
// If a complete, well-formed sequence starts there, it is copied verbatim to
// the output; otherwise the single byte at that position is skipped and
// scanning resumes at the next byte.
//
// @param text            Bytes to filter; it is not modified.
// @param show_debug_msg  If true, every skipped byte is reported through
//                        SHERPA_ONNX_LOGE together with its position.
// @return The filtered string.
std::string RemoveInvalidUtf8Sequences(const std::string &text,
                                       bool show_debug_msg /*= false*/) {
  // One entry per multi-byte row of Table 3-7. Only the first and the second
  // byte have row-specific ranges; every later byte must be in 80..BF.
  struct Row {
    uint8_t lead_low;
    uint8_t lead_high;
    uint8_t second_low;
    uint8_t second_high;
    int32_t length;  // total number of bytes in the sequence
  };

  static const Row kRows[] = {
      {0xc2, 0xdf, 0x80, 0xbf, 2},  // U+0080..U+07FF
      {0xe0, 0xe0, 0xa0, 0xbf, 3},  // U+0800..U+0FFF
      {0xe1, 0xec, 0x80, 0xbf, 3},  // U+1000..U+CFFF
      {0xed, 0xed, 0x80, 0x9f, 3},  // U+D000..U+D7FF
      {0xee, 0xef, 0x80, 0xbf, 3},  // U+E000..U+FFFF
      {0xf0, 0xf0, 0x90, 0xbf, 4},  // U+10000..U+3FFFF
      {0xf1, 0xf3, 0x80, 0xbf, 4},  // U+40000..U+FFFFF
      {0xf4, 0xf4, 0x80, 0x8f, 4},  // U+100000..U+10FFFF
  };

  const int32_t num_bytes = static_cast<int32_t>(text.size());
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(text.data());

  std::string ans;
  ans.reserve(num_bytes);

  for (int32_t pos = 0; pos < num_bytes;) {
    const uint8_t lead = bytes[pos];

    // Length of the well-formed sequence starting at pos; 0 means "none".
    int32_t valid_len = 0;

    if (lead <= 0x7f) {
      valid_len = 1;  // plain ASCII
    } else {
      for (const Row &row : kRows) {
        if (!InRange(lead, row.lead_low, row.lead_high)) {
          continue;
        }

        // The lead-byte ranges are disjoint, so this is the only row that
        // can match. Check that the sequence is complete and well formed.
        bool well_formed =
            pos + row.length <= num_bytes &&
            InRange(bytes[pos + 1], row.second_low, row.second_high);

        for (int32_t k = 2; well_formed && k < row.length; ++k) {
          well_formed = InRange(bytes[pos + k], 0x80, 0xbf);
        }

        if (well_formed) {
          valid_len = row.length;
        }

        break;
      }
    }

    if (valid_len > 0) {
      ans.append(text, pos, valid_len);
      pos += valid_len;
      continue;
    }

    if (show_debug_msg) {
      SHERPA_ONNX_LOGE("Ignore invalid utf8 sequence at pos: %d, value: %02x",
                       pos, lead);
    }

    pos += 1;
  }

  return ans;
}
}
// namespace sherpa_onnx
...
...
sherpa-onnx/csrc/text-utils.h
查看文件 @
b6f0f5f
...
...
@@ -124,6 +124,9 @@ std::vector<std::string> SplitUtf8(const std::string &text);
std
::
string
ToLowerCase
(
const
std
::
string
&
s
);
void
ToLowerCase
(
std
::
string
*
in_out
);
// Returns a copy of `text` from which every byte that does not belong to a
// well-formed UTF-8 byte sequence has been removed. Well-formed sequences
// are copied unchanged and in their original order.
//
// @param text            The input bytes; it is not modified.
// @param show_debug_msg  If true, the position and the value of each removed
//                        byte are logged.
// @return The filtered string.
std
::
string
RemoveInvalidUtf8Sequences
(
const
std
::
string
&
text
,
bool
show_debug_msg
=
false
);
}
// namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
...
...
请
注册
或
登录
后发表评论