text-utils.h
6.0 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// sherpa-onnx/csrc/text-utils.h
//
// Copyright 2009-2011 Saarland University; Microsoft Corporation
// Copyright 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_TEXT_UTILS_H_
#define SHERPA_ONNX_CSRC_TEXT_UTILS_H_
#include <errno.h>
#include <stdlib.h>
#include <limits>
#include <string>
#include <type_traits>
#include <vector>
#ifdef _MSC_VER
#define SHERPA_ONNX_STRTOLL(cur_cstr, end_cstr) \
_strtoi64(cur_cstr, end_cstr, 10);
#else
#define SHERPA_ONNX_STRTOLL(cur_cstr, end_cstr) strtoll(cur_cstr, end_cstr, 10);
#endif
// This file is copied/modified from
// https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.h
namespace sherpa_onnx {
/// Converts a string into an integer via strtoll and returns false if there was
/// any kind of problem (i.e. the string was not an integer or contained extra
/// non-whitespace junk, or the integer was too large to fit into the type it is
/// being converted into). Only sets *out if everything was OK and it returns
/// true.
template <class Int>
bool ConvertStringToInteger(const std::string &str, Int *out) {
// copied from kaldi/src/util/text-util.h
static_assert(std::is_integral<Int>::value, "");
const char *this_str = str.c_str();
char *end = nullptr;
errno = 0;
int64_t i = SHERPA_ONNX_STRTOLL(this_str, &end);
if (end != this_str) {
while (isspace(*end)) ++end;
}
if (end == this_str || *end != '\0' || errno != 0) return false;
Int iInt = static_cast<Int>(i);
if (static_cast<int64_t>(iInt) != i ||
(i < 0 && !std::numeric_limits<Int>::is_signed)) {
return false;
}
*out = iInt;
return true;
}
/// Split a string using any of the single character delimiters.
/// If omit_empty_strings == true, the output will contain any
/// nonempty strings after splitting on any of the
/// characters in the delimiter. If omit_empty_strings == false,
/// the output will contain n+1 strings if there are n characters
/// in the set "delim" within the input string. In this case
/// the empty string is split to a single empty string.
void SplitStringToVector(const std::string &full, const char *delim,
bool omit_empty_strings,
std::vector<std::string> *out);
/**
\brief Split a string (e.g. 1:2:3) into a vector of integers.
\param [in] delim String containing a list of characters, any of which
is allowed as a delimiter.
\param [in] omit_empty_strings If true, empty strings between delimiters are
allowed and will not produce an output integer; if false,
instances of characters in 'delim' that are consecutive or
at the start or end of the string would be an error.
You'll normally want this to be true if 'delim' consists
of spaces, and false otherwise.
\param [out] out The output list of integers.
*/
template <class I>
bool SplitStringToIntegers(const std::string &full, const char *delim,
bool omit_empty_strings, // typically false [but
// should probably be true
// if "delim" is spaces].
std::vector<I> *out) {
static_assert(std::is_integral<I>::value, "");
if (*(full.c_str()) == '\0') {
out->clear();
return true;
}
std::vector<std::string> split;
SplitStringToVector(full, delim, omit_empty_strings, &split);
out->resize(split.size());
for (size_t i = 0; i < split.size(); i++) {
const char *this_str = split[i].c_str();
char *end = NULL;
int64_t j = 0;
j = SHERPA_ONNX_STRTOLL(this_str, &end);
if (end == this_str || *end != '\0') {
out->clear();
return false;
} else {
I jI = static_cast<I>(j);
if (static_cast<int64_t>(jI) != j) {
// output type cannot fit this integer.
out->clear();
return false;
}
(*out)[i] = jI;
}
}
return true;
}
// This is defined for F = float and double.
template <class F>
bool SplitStringToFloats(const std::string &full, const char *delim,
bool omit_empty_strings, // typically false
std::vector<F> *out);
// This is defined for F = float and double.
template <typename T>
bool ConvertStringToReal(const std::string &str, T *out);
std::vector<std::string> SplitUtf8(const std::string &text);
std::string ToLowerCase(const std::string &s);
void ToLowerCase(std::string *in_out);
std::wstring ToLowerCase(const std::wstring &s);
std::string RemoveInvalidUtf8Sequences(const std::string &text,
bool show_debug_msg = false);
// Return true if text contains valid utf8 sequence.
// Return false otherwise
bool IsUtf8(const std::string &text);
// Return true if text contains valid gb2312 encoded sequence
// Return false otherwise
bool IsGB2312(const std::string &text);
#if defined(_WIN32)
std::string Gb2312ToUtf8(const std::string &text);
#endif
std::wstring ToWideString(const std::string &s);
std::string ToString(const std::wstring &s);
bool EndsWith(const std::string &haystack, const std::string &needle);
std::vector<std::string> SplitString(const std::string &s, int32_t chunk_size);
// Converts a UTF-8 std::string to a UTF-32 std::u32string
std::u32string Utf8ToUtf32(const std::string &str);
// Converts a UTF-32 std::u32string to a UTF-8 std::string
std::string Utf32ToUtf8(const std::u32string &str);
// Helper: Convert ASCII chars in a std::string to uppercase (leaves non-ASCII
// unchanged)
std::string ToUpperAscii(const std::string &str);
// Helper: Convert ASCII chars in a std::string to lowercase (leaves non-ASCII
// unchanged)
std::string ToLowerAscii(const std::string &str);
// Detect if a codepoint is a CJK character
bool IsCJK(char32_t cp);
bool ContainsCJK(const std::string &text);
bool ContainsCJK(const std::u32string &text);
bool StringToBool(const std::string &s);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_