add_meta_data.py
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

import argparse
import json
from pathlib import Path

import numpy as np
import onnx


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", type=str, required=True, help="input and output onnx model"
    )
    parser.add_argument(
        "--voices", type=str, required=True, help="Path to voices.json"
    )
    return parser.parse_args()


def load_voices(filename):
    """Load voices.json and convert each speaker's style vectors to float32."""
    with open(filename) as f:
        voices = json.load(f)
    for key in voices:
        voices[key] = np.array(voices[key], dtype=np.float32)
    return voices


def get_vocab():
    """Build the token -> id map used by the model."""
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»“” '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

    return {s: i for i, s in enumerate(symbols)}


def generate_tokens():
    """Write the token -> id map to ./tokens.txt, one "token id" pair per line."""
    token2id = get_vocab()
    with open("tokens.txt", "w", encoding="utf-8") as f:
        for s, i in token2id.items():
            f.write(f"{s} {i}\n")


def main():
    args = get_args()
    print(args.model, args.voices)

    model = onnx.load(args.model)
    voices = load_voices(args.voices)

    if Path("./tokens.txt").is_file():
        print("./tokens.txt exists, skip generating it")
    else:
        generate_tokens()

    keys = list(voices.keys())
    print(",".join(keys))

    if Path("./voices.bin").is_file():
        print("./voices.bin exists, skip generating it")
    else:
        # Concatenate the raw float32 style vectors of all speakers,
        # in the order given by `keys`
        with open("voices.bin", "wb") as f:
            for k in keys:
                f.write(voices[k].tobytes())

    # Build "name->id" and "id->name" mappings as comma-separated strings
    speaker2id_str = ""
    id2speaker_str = ""
    sep = ""
    for i, s in enumerate(keys):
        speaker2id_str += f"{sep}{s}->{i}"
        id2speaker_str += f"{sep}{i}->{s}"
        sep = ","

    meta_data = {
        "model_type": "kokoro",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 1,
        "voice": "en-us",
        "style_dim": ",".join(map(str, voices[keys[0]].shape)),
        "n_speakers": len(keys),
        "speaker2id": speaker2id_str,
        "id2speaker": id2speaker_str,
        "speaker_names": ",".join(keys),
        "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
        "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
        "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
        "maintainer": "k2-fsa",
    }

    print(model.metadata_props)

    # Remove any existing metadata entries before adding the new ones
    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)

    print("--------------------")
    print(model.metadata_props)

    onnx.save(model, args.model)

    print(f"Please see {args.model}, ./voices.bin, and ./tokens.txt")


if __name__ == "__main__":
    main()
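
After running the script (for example, python3 ./add_meta_data.py --model kokoro.onnx --voices voices.json, where the file names are only placeholders for whatever you pass on the command line), the result can be sanity-checked by reloading the model and the generated voices.bin. A minimal sketch under those assumptions:

import numpy as np
import onnx

# Assumption: the model passed via --model was kokoro.onnx
model = onnx.load("kokoro.onnx")
meta = {p.key: p.value for p in model.metadata_props}

# metadata_props stores every value as a string
assert meta["model_type"] == "kokoro"
assert meta["sample_rate"] == "24000"

# voices.bin holds the float32 style vectors of all speakers concatenated
# in the order of speaker_names; style_dim gives the per-speaker shape
names = meta["speaker_names"].split(",")
shape = tuple(int(d) for d in meta["style_dim"].split(","))
styles = np.fromfile("voices.bin", dtype=np.float32).reshape(len(names), *shape)
print(f"{len(names)} speakers, per-speaker style shape {shape}")

Storing the per-speaker shape in style_dim is what lets a consumer recover the individual style vectors from the flat voices.bin without reloading the much larger voices.json.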