cppjieba-test.cc
5.0 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
// sherpa-onnx/csrc/cppjieba-test.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include <iostream>
#include <regex> // NOLINT
#include <string>
#include <vector>
#include "cppjieba/Jieba.hpp"
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
// Dictionary files used by the cppjieba tests in this file. Please download
// them from
// https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
// The paths are relative, so the ./dict directory has to be reachable from
// the directory the test binary is run in. The tests return early when
// kDictPath does not exist.
constexpr const char *kDictPath = "./dict/jieba.dict.utf8";
constexpr const char *kHmmPath = "./dict/hmm_model.utf8";
constexpr const char *kUserDictPath = "./dict/user.dict.utf8";
constexpr const char *kIdfPath = "./dict/idf.utf8";
constexpr const char *kStopWordPath = "./dict/stop_words.utf8";
// Demo-style walk through the cppjieba APIs used here: Cut, CutForSearch,
// InsertUserWord and CutForSearch with word offsets. The test only prints the
// segmentation results; it contains no assertions. The block comments show
// sample output for manual comparison.
TEST(CppJieBa, Case1) {
  // The dictionary files have to be downloaded separately. When they are
  // missing, log it and return early instead of failing.
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                        kStopWordPath);

  // Prints the tokens on a single line, separated by "/".
  auto print_tokens = [](const std::vector<std::string> &tokens) {
    std::cout << limonp::Join(tokens.begin(), tokens.end(), "/") << std::endl;
  };

  std::vector<std::string> tokens;

  // 1. Plain segmentation with the HMM flag set.
  const std::string first_sentence = "他来到了网易杭研大厦";
  std::cout << first_sentence << std::endl;
  std::cout << "[demo] Cut With HMM" << std::endl;
  jieba.Cut(first_sentence, tokens, true);
  print_tokens(tokens);
  /*
  他来到了网易杭研大厦
  [demo] Cut With HMM
  他/来到/了/网易/杭研/大厦
  */

  // 2. Search-oriented segmentation.
  const std::string second_sentence =
      "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
  std::cout << second_sentence << std::endl;
  std::cout << "[demo] CutForSearch" << std::endl;
  jieba.CutForSearch(second_sentence, tokens);
  print_tokens(tokens);
  /*
  小明硕士毕业于中国科学院计算所,后在日本京都大学深造
  [demo] CutForSearch
  小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
  */

  // 3. Segment the same phrase before and after registering it as a user
  //    word.
  std::cout << "[demo] Insert User Word" << std::endl;
  jieba.Cut("男默女泪", tokens);
  print_tokens(tokens);
  jieba.InsertUserWord("男默女泪");
  jieba.Cut("男默女泪", tokens);
  print_tokens(tokens);
  /*
  [demo] Insert User Word
  男默/女泪
  男默女泪
  */

  // 4. Search-oriented segmentation of the second sentence again, this time
  //    collecting cppjieba::Word entries, which are printed with their
  //    offsets.
  std::cout << "[demo] CutForSearch Word With Offset" << std::endl;
  std::vector<cppjieba::Word> tokens_with_offset;
  jieba.CutForSearch(second_sentence, tokens_with_offset, true);
  std::cout << tokens_with_offset << std::endl;
  /*
  [demo] CutForSearch Word With Offset
  [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业",
  "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21},
  {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word":
  "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算",
  "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45},
  {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本",
  "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66},
  {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
  */

  // See more tests at
  // https://github.com/yanyiwu/cppjieba/blob/master/test/demo.cpp
}
// Segments a long sentence, then a punctuation-heavy sentence after its
// punctuation has been rewritten with std::regex_replace, and prints the
// resulting words joined by "_". The test contains no assertions.
TEST(CppJieBa, Case2) {
  // The dictionary files have to be downloaded separately. When they are
  // missing, log it and return early instead of failing.
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                        kStopWordPath);

  std::string s =
      "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如"
      "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感"
      "受着生命的奇迹与温柔";

  std::vector<std::string> words;
  const bool is_hmm = true;

  jieba.Cut(s, words, is_hmm);
  // Use limonp::Join, as Case1 does, instead of a hand-written join loop.
  std::cout << limonp::Join(words.begin(), words.end(), "_") << "\n";
  /*
  当_夜幕降临_,_星光点点_,_伴随_着_微风_拂面_,
  _我_在_静谧_中_感受_着_时光_的_流转_,
  _思念_如_涟漪_荡漾_,_梦境_如_画卷_展开_,_我_与_自然_融为一体_,
  _沉静_在_这_片_宁静_的_美丽_之中_,_感受_着_生命_的_奇迹_与_温柔
  */

  s = "这里有:红的、绿的、蓝的;各种各样的颜色都有!你想要什么呢?测试.";

  // Punctuation rewrite rules, applied one after another in this order; each
  // rule operates on the output of the previous one.
  // NOTE(review): as written here, the "[?]" and "[!]" rules replace each
  // character with itself. Presumably the replacements were meant to be the
  // full-width forms - confirm against the upstream file.
  struct PunctRule {
    const char *pattern;
    const char *replacement;
  };
  const PunctRule rules[] = {
      {":|、|;", ","},
      {"[.]", "。"},
      {"[?]", "?"},
      {"[!]", "!"},
  };

  std::string s2 = s;
  for (const auto &rule : rules) {
    s2 = std::regex_replace(s2, std::regex(rule.pattern), rule.replacement);
  }

  std::cout << s << "\n" << s2 << "\n";

  words.clear();
  jieba.Cut(s2, words, is_hmm);
  std::cout << limonp::Join(words.begin(), words.end(), "_") << "\n";
}
} // namespace sherpa_onnx