regex-lang-test.cc
2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// sherpa-onnx/csrc/regex-lang-test.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include <regex> // NOLINT
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/text-utils.cc"
namespace sherpa_onnx {
// Runs the regular expression `expr` over `text` and checks that the
// successive matches are exactly the strings in `expected`, in order.
//
// Both arguments are converted with ToWideString() so that matching is done
// on wide characters via std::wregex; each match is converted back with
// ToString() before it is compared.
//
// @param expr      Regular expression, as a narrow string.
// @param text      Text to search, as a narrow string.
// @param expected  The matches that must be produced, in order of occurrence.
//
// Every match is also printed to stdout, one per line.
static void TestLang(const std::string &expr, const std::string &text,
                     const std::vector<std::string> &expected) {
  const std::wstring wide_text = ToWideString(text);
  const std::wregex pattern(ToWideString(expr));

  const std::wsregex_iterator last;
  std::size_t num_matches = 0;

  for (std::wsregex_iterator it(wide_text.begin(), wide_text.end(), pattern);
       it != last; ++it) {
    const std::string matched = ToString(it->str());
    std::cout << matched << "\n";

    // Only index into `expected` while in range. A surplus match must show up
    // as a test failure (through the count check below), not as an
    // out-of-bounds read.
    if (num_matches < expected.size()) {
      EXPECT_EQ(matched, expected[num_matches]);
    }

    ++num_matches;
  }

  // Catches both too few and too many matches.
  EXPECT_EQ(num_matches, expected.size());
}
// German: the character-class pattern must pull the three German runs out of
// a text that also contains CJK characters.
TEST(German, Case1) {
  std::cout << "----------Test German----------";

  // Character set: see https://character-table.netlify.app/german/
  const std::string pattern =
      "([\\u0020-\\u005f\\u0061-\\u007d"
      "\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb"
      "\\u00c4\\u00d6\\u00dc\\u00df\\u00e4\\u00f6\\u00fc"
      "\\u2010-\\u2011\\u2013-\\u2014"
      "\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)";

  const std::string input =
      "开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€";

  // The CJK characters are outside the class, so they split the input into
  // separate matches.
  const std::vector<std::string> want = {
      "Übeltäter übergibt Ärzten ",
      "öfters äußerst ätzende Öle",
      "3€",
  };

  TestLang(pattern, input, want);
}
// French: the character-class pattern must pull the four French runs out of
// a text in which each run is preceded by a CJK character.
TEST(French, Case1) {
  const std::string pattern =
      "([\\u0020-\\u005f\\u0061-\\u007a\\u007c"
      "\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-\\u00b3\\u00bb"
      "\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-\\u00cf"
      "\\u00d4\\u00d9\\u00db-\\u00dc"
      "\\u00e0\\u00e2\\u00e6-\\u00eb\\u00ee-\\u00ef"
      "\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff"
      "\\u0152-\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49"
      "\\u2010-\\u2011\\u2013-\\u2014\\u2019\\u201c-\\u201d"
      "\\u2020-\\u2021\\u2026\\u202f-\\u2030\\u20ac\\u2212]+)";

  const std::string input =
      "L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon";

  // The CJK characters are outside the class and therefore act as separators.
  const std::vector<std::string> want = {
      "L'été, ",
      "avec son ciel bleuâtre, ",
      "est un moment où, ",
      "Noël, maçon",
  };

  TestLang(pattern, input, want);
}
// English: the character-class pattern must pull the two English runs out of
// a text in which each run is preceded by a CJK character.
TEST(English, Case1) {
  // Character set: https://character-table.netlify.app/english/
  const std::string pattern =
      "([\\u0020-\\u005f\\u0061-\\u007a\\u007c"
      "\\u00a0\\u00a7\\u00a9"
      "\\u2010-\\u2011\\u2013-\\u2014"
      "\\u2018-\\u2019\\u201c-\\u201d"
      "\\u2020-\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)";

  const std::string input = "一how are you doing? 二Thank you!";

  const std::vector<std::string> want = {
      "how are you doing? ",
      "Thank you!",
  };

  TestLang(pattern, input, want);
}
} // namespace sherpa_onnx