text2token.py
#!/usr/bin/env python3
"""
This script encodes the texts (given line by line through ``--text``) to
tokens and writes the results to the file given by ``--output``.

Usage:

If the tokens-type is bpe:

  python3 ./text2token.py \
    --text texts.txt \
    --tokens tokens.txt \
    --tokens-type bpe \
    --bpe-model bpe.model \
    --output hotwords.txt

If the tokens-type is cjkchar:

  python3 ./text2token.py \
    --text texts.txt \
    --tokens tokens.txt \
    --tokens-type cjkchar \
    --output hotwords.txt

If the tokens-type is cjkchar+bpe:

  python3 ./text2token.py \
    --text texts.txt \
    --tokens tokens.txt \
    --tokens-type cjkchar+bpe \
    --bpe-model bpe.model \
    --output hotwords.txt
"""
import argparse

from sherpa_onnx import text2token


def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--text",
        type=str,
        required=True,
        help="""Path to the input texts.
Each line contains one phrase; it may also carry some extra items, for
example, the boosting score (starting with :), the triggering threshold
(starting with #, used only in the keyword spotting task) and the original
phrase (starting with @). Note: the extra items are kept unchanged in the
output.

example input 1 (tokens-type = ppinyin):

小爱同学 :2.0 #0.6 @小爱同学
你好问问 :3.5 @你好问问
小艺小艺 #0.6 @小艺小艺

example output 1:

x iǎo ài t óng x ué :2.0 #0.6 @小爱同学
n ǐ h ǎo w èn w èn :3.5 @你好问问
x iǎo y ì x iǎo y ì #0.6 @小艺小艺

example input 2 (tokens-type = bpe):

HELLO WORLD :1.5 #0.4
HI GOOGLE :2.0 #0.8
HEY SIRI #0.35

example output 2:

▁HE LL O ▁WORLD :1.5 #0.4
▁HI ▁GO O G LE :2.0 #0.8
▁HE Y ▁S I RI #0.35
""",
    )

    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="The path to tokens.txt.",
    )

    parser.add_argument(
        "--tokens-type",
        type=str,
        required=True,
        choices=["cjkchar", "bpe", "cjkchar+bpe", "fpinyin", "ppinyin"],
        help="""The type of modeling units; one of cjkchar, bpe, cjkchar+bpe,
fpinyin or ppinyin. fpinyin means full pinyin: each CJK character is mapped
to one pinyin (with tone). ppinyin means partial pinyin: each pinyin is
split into an initial and a final.""",
    )
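
    # Illustrative comparison of the two pinyin modes (assumed output shapes;
    # the exact tokens depend on tokens.txt): for 小爱同学, fpinyin would
    # yield whole syllables such as "xiǎo ài tóng xué", whereas ppinyin
    # splits each syllable into initial and final, "x iǎo ài t óng x ué"
    # (cf. example output 1 in the --text help above).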

    parser.add_argument(
        "--bpe-model",
        type=str,
        help="Path to bpe.model. Required only when tokens-type is bpe or cjkchar+bpe.",
    )

    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Path to the file the encoded tokens will be written to.",
    )

    return parser.parse_args()


def main():
    args = get_args()

    texts = []
    # Extra items attached to each phrase: boosting score (starts with :),
    # triggering threshold (starts with #) and original keyword (starts
    # with @).
    extra_info = []
    with open(args.text, "r", encoding="utf8") as f:
        for line in f:
            extra = []
            text = []
            toks = line.strip().split()
            for tok in toks:
                # Set the extra items aside; everything else is part of the
                # phrase to be encoded.
                if tok[0] in (":", "#", "@"):
                    extra.append(tok)
                else:
                    text.append(tok)
            texts.append(" ".join(text))
            extra_info.append(extra)
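
    # The extra items were stripped above, so only the bare phrases go
    # through the tokenizer.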
    encoded_texts = text2token(
        texts,
        tokens=args.tokens,
        tokens_type=args.tokens_type,
        bpe_model=args.bpe_model,
    )
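    # text2token returns one list of token strings per input phrase, in the
    # same order as `texts`.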

    with open(args.output, "w", encoding="utf8") as f:
        for i, txt in enumerate(encoded_texts):
            # Re-attach the extra items after the encoded tokens.
            txt += extra_info[i]
            f.write(" ".join(txt) + "\n")


if __name__ == "__main__":
    main()
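
# How the generated hotwords file is typically consumed (a minimal sketch,
# assuming a transducer model; the model file names are placeholders).
# Hotword boosting requires decoding_method="modified_beam_search":
#
#   import sherpa_onnx
#
#   recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
#       tokens="tokens.txt",
#       encoder="encoder.onnx",
#       decoder="decoder.onnx",
#       joiner="joiner.onnx",
#       decoding_method="modified_beam_search",
#       hotwords_file="hotwords.txt",
#       hotwords_score=1.5,
#   )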