text-utils-test.cc
4.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// sherpa-onnx/csrc/text-utils-test.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include "sherpa-onnx/csrc/text-utils.h"
#include <regex> // NOLINT
#include <sstream>
#include "gtest/gtest.h"
namespace sherpa_onnx {
// Smoke test for ToLowerCase on text containing non-ASCII letters (German
// umlauts, sharp s and the euro sign). Nothing is asserted: the input and the
// converted text are written to stdout, one per line, for manual inspection.
TEST(ToLowerCase, WideString) {
  const std::string original =
      "Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€";

  const auto lowered = ToLowerCase(original);

  std::cout << original << "\n" << lowered << "\n";
}
// Starts from a buffer `v` of valid UTF-8 and, in three independent scenarios,
// corrupts one multi-byte character (3-byte, 2-byte, then 4-byte). After each
// corruption the sanitized string is expected to be shorter than `v` by
// exactly the byte length of the damaged character.
TEST(RemoveInvalidUtf8Sequences, Case1) {
  std::vector<uint8_t> v = {
      0xe4, 0xbb, 0x8a,                              // 今
      0xe5, 0xa4, 0xa9,                              // 天
      'i', 's', ' ', 'M', 'o', 'd', 'a', 'y', ',',   // "is Moday," (sic)
      ' ', 'w', 'i', 'e', ' ', 'h', 'e', 'i', 0xc3,  // " wie hei" + 1st byte of ß
      0x9f, 'e', 'n', ' ', 'S', 'i', 'e',            // 2nd byte of ß + "en Sie"
      0xf0, 0x9d, 0x84, 0x81};                       // one 4-byte character

  // Scenario 1: damage the 3-byte character at the start of the buffer.
  std::vector<uint8_t> v0 = v;
  v0[1] = 0xc0;  // make the first 3 bytes an invalid utf8 character
  std::string s0{v0.begin(), v0.end()};
  EXPECT_EQ(s0.size(), v0.size());

  auto s = RemoveInvalidUtf8Sequences(s0);  // should remove 今
  // Previously this result was computed but never checked.
  EXPECT_EQ(s.size() + 3, v.size());

  // Scenario 2: damage the 2-byte character ß.
  v0 = v;
  // v0[23] == 0xc3
  // v0[24] == 0x9f
  v0[23] = 0xc1;
  s0 = {v0.begin(), v0.end()};

  s = RemoveInvalidUtf8Sequences(s0);  // should remove ß
  EXPECT_EQ(s.size() + 2, v.size());

  // Scenario 3: damage the lead byte of the trailing 4-byte character.
  v0 = v;
  // v0[31] == 0xf0
  // v0[32] == 0x9d
  // v0[33] == 0x84
  // v0[34] == 0x81
  v0[31] = 0xf5;
  s0 = {v0.begin(), v0.end()};

  s = RemoveInvalidUtf8Sequences(s0);  // should remove all 4 trailing bytes
  EXPECT_EQ(s.size() + 4, v.size());
}
// Further tests for RemoveInvalidUtf8Sequences
// Well-formed input (ASCII plus one emoji) must be returned exactly as given.
TEST(RemoveInvalidUtf8Sequences, ValidUtf8StringPassesUnchanged) {
  const std::string well_formed = "Valid UTF-8 🌍";

  const auto sanitized = RemoveInvalidUtf8Sequences(well_formed);

  EXPECT_EQ(sanitized, well_formed);
}
// A lone 0xFF byte sitting between two spaces is dropped. Both surrounding
// spaces are ordinary ASCII and stay, so the expected result contains two
// consecutive spaces.
TEST(RemoveInvalidUtf8Sequences, SingleInvalidByteReplaced) {
  std::string input = "Invalid \xFF UTF-8";
  // Spelled as two adjacent literals so the double space is explicit and
  // cannot be silently collapsed by an editor or formatter.
  std::string expected =
      "Invalid "
      " UTF-8";
  EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected);
}
// Input ends in the middle of a multi-byte sequence; the dangling bytes are
// expected to be stripped, leaving only the ASCII prefix.
TEST(RemoveInvalidUtf8Sequences, TruncatedUtf8SequenceReplaced) {
  const std::string truncated = "Broken \xE2\x82";  // sequence cut short
  const std::string ascii_prefix = "Broken ";

  const auto cleaned = RemoveInvalidUtf8Sequences(truncated);

  EXPECT_EQ(cleaned, ascii_prefix);
}
// Several invalid bytes in a row after an ASCII prefix: all of them are
// expected to disappear while the prefix is kept.
TEST(RemoveInvalidUtf8Sequences, MultipleInvalidBytes) {
  const std::string garbage_tail = "Test \xC0\xC0\xF8\xA0";
  const std::string ascii_prefix = "Test ";

  const auto cleaned = RemoveInvalidUtf8Sequences(garbage_tail);

  EXPECT_EQ(cleaned, ascii_prefix);
}
// Two-byte input: a space followed by a 0xC4 byte with nothing after it.
// Only the space is expected to remain.
TEST(RemoveInvalidUtf8Sequences, BreakingCase_SpaceFollowedByInvalidByte) {
  const std::string space_then_c4 = "\x20\xC4";
  const std::string only_space = " ";

  const auto result = RemoveInvalidUtf8Sequences(space_then_c4);

  EXPECT_EQ(result, only_space);
}
// Two emoji back to back after an ASCII word: the input is expected to be
// returned unchanged.
TEST(RemoveInvalidUtf8Sequences, ValidUtf8WithEdgeCaseCharacters) {
  const std::string emoji_text = "Edge 🏆💯";

  const auto sanitized = RemoveInvalidUtf8Sequences(emoji_text);

  EXPECT_EQ(sanitized, emoji_text);
}
// Valid multi-byte characters (given as hex escapes) followed by a trailing
// 0xFF: the valid characters and the spaces between them are kept, and only
// the final 0xFF is expected to be removed.
TEST(RemoveInvalidUtf8Sequences, MixedValidAndInvalidBytes) {
  const std::string mixed = "Mix \xE2\x82\xAC \xF0\x9F\x98\x81 \xFF";
  const std::string valid_part = "Mix € 😁 ";  // keeps the space before 0xFF

  const auto cleaned = RemoveInvalidUtf8Sequences(mixed);

  EXPECT_EQ(cleaned, valid_part);
}
// Input is 0x20 (space) then 0xC4 at the very end of the string; the result
// is expected to be the single space.
TEST(RemoveInvalidUtf8Sequences, SpaceFollowedByInvalidByte) {
  const std::string raw = "\x20\xC4";
  const std::string kept = " ";

  const auto actual = RemoveInvalidUtf8Sequences(raw);

  EXPECT_EQ(actual, kept);
}
// A 0xC4 byte followed by a space instead of a continuation byte is dropped.
// The spaces on both sides of it are ordinary ASCII and stay, so the expected
// result has two consecutive spaces between the words.
TEST(RemoveInvalidUtf8Sequences, RemoveTruncatedC4) {
  std::string input = "Hello \xc4 world";  // Invalid `0xC4`
  // Spelled as two adjacent literals so the double space is explicit and
  // cannot be silently collapsed by an editor or formatter.
  std::string expected =
      "Hello "
      " world";  // `0xC4` should be removed
  EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected);
}
// Space followed by a trailing `0xc4` (written with a lower-case hex escape):
// the `0xc4` is expected to be removed and the space kept.
TEST(RemoveInvalidUtf8Sequences, SpaceFollowedByInvalidByte_Breaking) {
  const std::string source = "\x20\xc4";
  const std::string wanted = " ";

  const auto got = RemoveInvalidUtf8Sequences(source);

  EXPECT_EQ(got, wanted);
}
// Same space-then-`0xc4` input as the neighbouring tests. Before asserting,
// it dumps each byte of the sanitized string to stdout as `\xNN` so a failure
// shows exactly which bytes survived.
TEST(RemoveInvalidUtf8Sequences, DebugSpaceFollowedByInvalidByte) {
  const std::string input = "\x20\xc4";
  const std::string output = RemoveInvalidUtf8Sequences(input);

  // Hex dump of the result, one byte per "\xNN " token, ended by a newline.
  std::cout << "Processed string: ";
  for (const unsigned char byte : output) {
    printf("\\x%02x ", byte);
  }
  std::cout << std::endl;

  // Only the space is expected to be left.
  EXPECT_EQ(output, " ");
}
} // namespace sherpa_onnx