text2token.py
2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
"""
This script encodes the texts (given line by line through ``--text``) into
tokens and writes the results to the file given by ``--output``.
Usage:
If the tokens_type is bpe:
python3 ./text2token.py \
--text texts.txt \
--tokens tokens.txt \
--tokens-type bpe \
--bpe-model bpe.model \
--output hotwords.txt
If the tokens_type is cjkchar:
python3 ./text2token.py \
--text texts.txt \
--tokens tokens.txt \
--tokens-type cjkchar \
--output hotwords.txt
If the tokens_type is cjkchar+bpe:
python3 ./text2token.py \
--text texts.txt \
--tokens tokens.txt \
--tokens-type cjkchar+bpe \
--bpe-model bpe.model \
--output hotwords.txt
"""
import argparse
from sherpa_onnx import text2token
def get_args():
    """Parse the command-line options of this script.

    Every option takes a string value. ``--text``, ``--tokens``,
    ``--tokens-type`` and ``--output`` are mandatory; ``--bpe-model`` is
    optional and is ``None`` when it is not given.

    Returns:
      The ``argparse.Namespace`` with the attributes ``text``, ``tokens``,
      ``tokens_type``, ``bpe_model`` and ``output``.
    """
    # (flag, extra keyword arguments) in the order the options are registered.
    option_specs = (
        (
            "--text",
            {"required": True, "help": "Path to the input texts"},
        ),
        (
            "--tokens",
            {"required": True, "help": "The path to tokens.txt."},
        ),
        (
            "--tokens-type",
            {
                "required": True,
                "help": "The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
            },
        ),
        (
            "--bpe-model",
            {
                "help": "The path to bpe.model. Only required when tokens-type is bpe or cjkchar+bpe.",
            },
        ),
        (
            "--output",
            {
                "required": True,
                "help": "Path where the encoded tokens will be written to.",
            },
        ),
    )

    parser = argparse.ArgumentParser()
    for flag, extra in option_specs:
        parser.add_argument(flag, type=str, **extra)

    return parser.parse_args()
def main():
    """Encode each line of the ``--text`` file and write it to ``--output``.

    The input file is read as UTF-8, one text per line, with leading and
    trailing whitespace stripped from every line. All texts are passed to
    ``text2token`` together with the ``--tokens``, ``--tokens-type`` and
    ``--bpe-model`` options. Each encoded entry is then written to the
    output file (UTF-8) as one line with its items joined by single spaces.
    """
    args = get_args()

    # One entry per input line, surrounding whitespace removed.
    with open(args.text, "r", encoding="utf8") as fin:
        texts = [raw_line.strip() for raw_line in fin]

    encoded_texts = text2token(
        texts,
        tokens=args.tokens,
        tokens_type=args.tokens_type,
        bpe_model=args.bpe_model,
    )

    with open(args.output, "w", encoding="utf8") as fout:
        fout.writelines(" ".join(pieces) + "\n" for pieces in encoded_texts)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()