#!/usr/bin/env python3
# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)

import re
from typing import List, Tuple

from misaki import zh
from misaki.token import MToken
from misaki.zh_frontend import ZH_MAP
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict

user_dict = {
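    # Pinyin overrides for a few polyphonic words/phrases so they are read as
    # intended, e.g. 行长 as hang2 zhang3 ("bank president").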
    "还田": [["huan2"], ["tian2"]],
    "行长": [["hang2"], ["zhang3"]],
    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}

load_phrases_dict(user_dict)

# Also update pypinyin's phrase table directly so that the custom phrases are
# included when generate_chinese_lexicon() iterates over it below.
phrases_dict.phrases_dict.update(user_dict)


def process_text(self, text, with_erhua=True):
    """
    This function is modified from
    https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155

    Note that we have removed jieba.posseg.lcut().
    """
    seg_cut = [(text, "v")]
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    tokens = []
    initials = []
    finals = []
    # pypinyin, g2pM
    for word, pos in seg_cut:
        if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
            pos = "X"
        elif pos != "x" and word in self.punc:
            pos = "x"
        tk = MToken(text=word, tag=pos, whitespace="")
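        # Punctuation and English tokens are passed through: punctuation becomes
        # its own phoneme, bare whitespace is appended to the previous token's
        # whitespace, and consecutive Chinese tokens are separated with "/".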
        if pos in ("x", "eng"):
            if not word.isspace():
                if pos == "x" and word in self.punc:
                    tk.phonemes = word
                tokens.append(tk)
            elif tokens:
                tokens[-1].whitespace += word
            continue
        elif (
            tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace
        ):
            tokens[-1].whitespace = "/"

        # g2p
        sub_initials, sub_finals = self._get_initials_finals(word)
        # tone sandhi
        sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
        # er hua
        if with_erhua:
            sub_initials, sub_finals = self._merge_erhua(
                sub_initials, sub_finals, word, pos
            )

        initials.append(sub_initials)
        finals.append(sub_finals)
        # assert len(sub_initials) == len(sub_finals) == len(word)

        # sum(iterable[, start])
        # initials = sum(initials, [])
        # finals = sum(finals, [])

        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            # replace punctuation by ` `
            # if c and c in self.punc:
            #     phones.append(c)
            if v and (v not in self.punc or v != c):  # and v not in self.rhy_phns:
                phones.append(v)
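        # Join the units with "_", normalize the erhua marker "R", insert "_"
        # before each tone digit, then map every unit to its phoneme symbol via
        # ZH_MAP (unknown units fall back to self.unk).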
        phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
        phones = re.sub(r"(?=\d)", "_", phones).split("_")
        tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
        tokens.append(tk)

    result = "".join(
        (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
        for tk in tokens
    )

    return result, tokens


def generate_chinese_lexicon():
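    """Build a Chinese lexicon as a list of (word, phonemes) pairs.

    It covers every single character in pypinyin's pinyin_dict that lies in
    the CJK Unified Ideographs block, plus every phrase in pypinyin's
    phrases_dict (including the custom entries added above).
    """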
    word_dict = pinyin_dict.pinyin_dict
    phrases = phrases_dict.phrases_dict

    g2p = zh.ZHG2P(version="1.1")

    lexicon = []
    for key in word_dict:
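        # Skip code points outside the CJK Unified Ideographs block.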
        if not (0x4E00 <= key <= 0x9FFF):
            continue
        w = chr(key)
        phones: str = process_text(g2p.frontend, w)[0]
        lexicon.append((w, phones))

    for key in phrases:
        phones: str = process_text(g2p.frontend, key)[0]
        lexicon.append((key, phones))
    return lexicon


def save(filename: str, lexicon: List[Tuple[str, str]]):
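    """Write the lexicon to ``filename``, one entry per line.

    Each line has the form ``<word> <p1> <p2> ...``, where the tokens are the
    individual characters of the misaki phoneme string (assumption: the
    downstream consumer treats every phoneme as a single character).
    """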
    with open(filename, "w", encoding="utf-8") as f:
        for word, phones in lexicon:
            tokens = " ".join(list(phones))
            f.write(f"{word} {tokens}\n")


def main():
    lexicon = generate_chinese_lexicon()

    save("lexicon-zh.txt", lexicon)


if __name__ == "__main__":
    main()