generate_lexicon_en.py
1.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import json
from typing import List, Tuple
def generate_english_lexicon(kind: str) -> List[Tuple[str, str]]:
    """Build a lowercase word -> pronunciation lexicon for one English variant.

    Reads ``./{kind}_gold.json`` and ``./{kind}_silver.json`` relative to the
    current working directory. Each JSON value is either a pronunciation
    string, or a dict whose ``"DEFAULT"`` entry is the pronunciation to use.

    All words are lowercased. Priority, highest first: the built-in
    user-defined entries, then the gold file, then the silver file.

    Args:
      kind: Which variant to build; must be ``"us"`` or ``"gb"``.

    Returns:
      A list of ``(word, pronunciation)`` pairs. The user-defined entries
      come first, followed by the entries taken from the JSON files.

    Raises:
      AssertionError: If ``kind`` is not supported, or if a JSON value is
        neither a string nor a dict containing ``"DEFAULT"`` (only when
        assertions are enabled).
    """
    assert kind in ("us", "gb"), kind

    # If you want to add new words, please add them to
    # the user_defined dict.
    user_defined = {
        "Kokoro": "kˈOkəɹO",
        "Misaki": "misˈɑki",
    }
    user_defined_lower = {k.lower(): v for k, v in user_defined.items()}

    with open(f"./{kind}_gold.json", encoding="utf-8") as f:
        gold = json.load(f)

    with open(f"./{kind}_silver.json", encoding="utf-8") as f:
        silver = json.load(f)

    # Words in the gold file have a higher priority than those in the silver
    # file, so gold is unpacked after silver below.
    english = {**silver, **gold}

    lexicon = dict()
    # Lowercased words whose stored pronunciation came from the gold file.
    gold_lower = set()
    for k, v in english.items():
        k_lower = k.lower()

        if k_lower in user_defined_lower:
            print(f"{k} already exist in the user defined dict. Skip adding")
            continue

        # Validate and extract before any priority decision so that malformed
        # entries are still reported even if they would end up being skipped.
        if isinstance(v, str):
            phones = v
        else:
            assert isinstance(v, dict), (k, v)
            assert "DEFAULT" in v, (k, v)
            phones = v["DEFAULT"]

        is_gold = k in gold
        if not is_gold and k_lower in gold_lower:
            # The merge above only resolves clashes between identical keys.
            # A silver-only key that differs from a gold key just by case
            # (e.g. "Apple" vs "apple") would otherwise overwrite the gold
            # pronunciation once both are lowercased.
            continue

        lexicon[k_lower] = phones
        if is_gold:
            gold_lower.add(k_lower)

    return list(user_defined_lower.items()) + list(lexicon.items())
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write a lexicon to ``filename`` as UTF-8 text, one entry per line.

    Each line is the word, a space, and then the pronunciation with a single
    space inserted between every character, e.g. ``("hi", "hˈI")`` becomes
    ``hi h ˈ I``.

    Args:
      filename: Path of the file to create or overwrite.
      lexicon: ``(word, pronunciation)`` pairs, written in the given order.
    """
    with open(filename, "w", encoding="utf-8") as out:
        # Joining the string directly splits it into its individual
        # characters, so every character becomes its own token.
        out.writelines(
            f"{entry} {' '.join(symbols)}\n" for entry, symbols in lexicon
        )
def main():
    """Generate the US and GB English lexicons and write them to disk.

    The outputs are ``lexicon-us-en.txt`` and ``lexicon-gb-en.txt`` in the
    current working directory.
    """
    targets = {
        "us": "lexicon-us-en.txt",
        "gb": "lexicon-gb-en.txt",
    }

    # Build every lexicon before writing anything, so that a failure while
    # generating one variant leaves no output file behind.
    built = [
        (path, generate_english_lexicon(variant))
        for variant, path in targets.items()
    ]

    for path, entries in built:
        save(path, entries)
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()