#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

import re
from typing import List, Tuple

from misaki import zh
from misaki.token import MToken
from misaki.zh_frontend import ZH_MAP
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
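
# misaki provides the Chinese G2P frontend used by Kokoro TTS; pypinyin
# supplies the per-character and per-phrase pinyin dictionaries iterated below.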

# Custom pronunciations for heteronyms that pypinyin may get wrong by default.
user_dict = {
    "还田": [["huan2"], ["tian2"]],
    "行长": [["hang2"], ["zhang3"]],
    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}

# Register the entries with pypinyin and keep them in phrases_dict so that
# generate_chinese_lexicon() below also emits them as lexicon entries.
load_phrases_dict(user_dict)
phrases_dict.phrases_dict.update(user_dict)
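
# Illustrative check (hypothetical usage, not executed by this script):
#   >>> from pypinyin import lazy_pinyin, Style
#   >>> lazy_pinyin("银行行长", style=Style.TONE3)
#   ['yin2', 'hang2', 'hang2', 'zhang3']
# i.e. the heteronym 行 now follows the custom entry registered above.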

def process_text(self, text, with_erhua=True):
    """
    This function is modified from
    https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155

    Note that we have removed jieba.posseg.lcut(), so the whole input text
    is handled as a single segment.

    It is intended to be called as process_text(g2p.frontend, text), i.e.,
    with the misaki ZH frontend passed explicitly as ``self``.
    """
    seg_cut = [(text, "v")]
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)

    tokens = []
    initials = []
    finals = []
    # pypinyin, g2pM
    for word, pos in seg_cut:
        if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
            pos = "X"
        elif pos != "x" and word in self.punc:
            pos = "x"
        tk = MToken(text=word, tag=pos, whitespace="")
        if pos in ("x", "eng"):
            if not word.isspace():
                if pos == "x" and word in self.punc:
                    tk.phonemes = word
                    tokens.append(tk)
                elif tokens:
                    tokens[-1].whitespace += word
            continue
        elif (
            tokens
            and tokens[-1].tag not in ("x", "eng")
            and not tokens[-1].whitespace
        ):
            tokens[-1].whitespace = "/"
        # g2p
        sub_initials, sub_finals = self._get_initials_finals(word)
        # tone sandhi
        sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
        # er hua
        if with_erhua:
            sub_initials, sub_finals = self._merge_erhua(
                sub_initials, sub_finals, word, pos
            )
        initials.append(sub_initials)
        finals.append(sub_finals)
        # assert len(sub_initials) == len(sub_finals) == len(word)
        # sum(iterable[, start])
        # initials = sum(initials, [])
        # finals = sum(finals, [])
        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            # replace punctuation by ` `
            # if c and c in self.punc:
            #     phones.append(c)
            if v and (v not in self.punc or v != c):  # and v not in self.rhy_phns:
                phones.append(v)
        phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
        phones = re.sub(r"(?=\d)", "_", phones).split("_")
        tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
        tokens.append(tk)

    result = "".join(
        (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
        for tk in tokens
    )
    return result, tokens
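
# Illustrative usage (mirrors the call in generate_chinese_lexicon() below):
#   g2p = zh.ZHG2P(version="1.1")
#   phonemes, tokens = process_text(g2p.frontend, "你好")
# `phonemes` is a string over the phoneme alphabet defined by ZH_MAP.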

def generate_chinese_lexicon():
    word_dict = pinyin_dict.pinyin_dict
    phrases = phrases_dict.phrases_dict
    g2p = zh.ZHG2P(version="1.1")

    lexicon = []

    # Single characters: pinyin_dict is keyed by Unicode code point.
    for key in word_dict:
        if not (0x4E00 <= key <= 0x9FFF):
            # Skip anything outside the CJK Unified Ideographs block
            continue
        w = chr(key)
        tokens: str = process_text(g2p.frontend, w)[0]
        lexicon.append((w, tokens))

    # Multi-character phrases, including the custom entries registered above.
    for key in phrases:
        tokens: str = process_text(g2p.frontend, key)[0]
        lexicon.append((key, tokens))

    return lexicon

def save(filename: str, lexicon: List[Tuple[str, str]]):
    with open(filename, "w", encoding="utf-8") as f:
        for word, phones in lexicon:
            # Each phoneme is a single character after the ZH_MAP lookup,
            # so split the phoneme string into space-separated symbols.
            tokens = " ".join(list(phones))
            f.write(f"{word} {tokens}\n")
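
# Each line of the output file has the form (exact symbols depend on ZH_MAP):
#   <word> <phoneme> <phoneme> ...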

def main():
    lexicon = generate_chinese_lexicon()
    save("lexicon-zh.txt", lexicon)


if __name__ == "__main__":
    main()