offline-tts-zipvoice-model-config.cc
4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// sherpa-onnx/csrc/offline-tts-zipvoice-model-config.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include "sherpa-onnx/csrc/offline-tts-zipvoice-model-config.h"
#include <vector>
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
// Registers every ZipVoice command-line option with `po`.
//
// Each option is bound to the member of this config that has the matching
// name, so parsing the command line fills in the config in place. All option
// names carry the "zipvoice-" prefix.
//
// @param po  Option parser that receives the registrations.
void OfflineTtsZipvoiceModelConfig::Register(ParseOptions *po) {
  // Thin forwarding helper: `auto *` lets the same call site bind members of
  // different types while the parser still picks its own overload.
  const auto add = [po](const char *name, auto *value, const char *help) {
    po->Register(name, value, help);
  };

  // Files and directories.
  add("zipvoice-tokens", &tokens, "Path to tokens.txt for ZipVoice models");
  add("zipvoice-data-dir", &data_dir,
      "Path to the directory containing dict for espeak-ng.");
  add("zipvoice-pinyin-dict", &pinyin_dict,
      "Path to the pinyin dictionary for cppinyin (i.e converting "
      "Chinese into phones).");
  add("zipvoice-text-model", &text_model, "Path to zipvoice text model");
  add("zipvoice-flow-matching-model", &flow_matching_model,
      "Path to zipvoice flow-matching model");
  add("zipvoice-vocoder", &vocoder, "Path to zipvoice vocoder");

  // Numeric inference parameters.
  add("zipvoice-feat-scale", &feat_scale,
      "Feature scale for ZipVoice (default: 0.1)");
  add("zipvoice-t-shift", &t_shift,
      "Shift t to smaller ones if t_shift < 1.0 (default: 0.5)");
  add("zipvoice-target-rms", &target_rms,
      "Target speech normalization rms value for ZipVoice (default: 0.1)");
  add("zipvoice-guidance-scale", &guidance_scale,
      "The scale of classifier-free guidance during inference for ZipVoice "
      "(default: 1.0)");
}
// Checks whether this configuration is usable.
//
// Checks are run in the order below; the first failure is reported through
// SHERPA_ONNX_LOGE and ends validation.
//   - tokens, text_model, flow_matching_model and vocoder must each be
//     non-empty and name an existing file.
//   - data_dir is optional; when set, it must contain the files phontab,
//     phonindex, phondata and intonations.
//   - pinyin_dict is optional; when set, the file must exist.
//   - feat_scale, target_rms and guidance_scale must be > 0, and t_shift
//     must be >= 0. NaN is rejected for all four.
//
// @return true if every check passes, false otherwise.
bool OfflineTtsZipvoiceModelConfig::Validate() const {
  if (tokens.empty()) {
    SHERPA_ONNX_LOGE("Please provide --zipvoice-tokens");
    return false;
  }

  if (!FileExists(tokens)) {
    SHERPA_ONNX_LOGE("--zipvoice-tokens: '%s' does not exist", tokens.c_str());
    return false;
  }

  if (text_model.empty()) {
    SHERPA_ONNX_LOGE("Please provide --zipvoice-text-model");
    return false;
  }

  if (!FileExists(text_model)) {
    SHERPA_ONNX_LOGE("--zipvoice-text-model: '%s' does not exist",
                     text_model.c_str());
    return false;
  }

  if (flow_matching_model.empty()) {
    SHERPA_ONNX_LOGE("Please provide --zipvoice-flow-matching-model");
    return false;
  }

  if (!FileExists(flow_matching_model)) {
    SHERPA_ONNX_LOGE("--zipvoice-flow-matching-model: '%s' does not exist",
                     flow_matching_model.c_str());
    return false;
  }

  if (vocoder.empty()) {
    SHERPA_ONNX_LOGE("Please provide --zipvoice-vocoder");
    return false;
  }

  if (!FileExists(vocoder)) {
    SHERPA_ONNX_LOGE("--zipvoice-vocoder: '%s' does not exist",
                     vocoder.c_str());
    return false;
  }

  if (!data_dir.empty()) {
    // Constant table: no need to build a vector of strings on every call.
    static const char *const kRequiredFiles[] = {
        "phontab",
        "phonindex",
        "phondata",
        "intonations",
    };

    for (const char *f : kRequiredFiles) {
      if (!FileExists(data_dir + "/" + f)) {
        SHERPA_ONNX_LOGE(
            "'%s/%s' does not exist. Please check zipvoice-data-dir",
            data_dir.c_str(), f);
        return false;
      }
    }
  }

  if (!pinyin_dict.empty() && !FileExists(pinyin_dict)) {
    SHERPA_ONNX_LOGE("--zipvoice-pinyin-dict: '%s' does not exist",
                     pinyin_dict.c_str());
    return false;
  }

  // The range checks below are written as !(x > 0) / !(x >= 0) rather than
  // x <= 0 / x < 0: every ordered comparison with NaN is false, so the
  // negated form is the one that also rejects NaN.
  if (!(feat_scale > 0)) {
    SHERPA_ONNX_LOGE("--zipvoice-feat-scale must be positive. Given: %f",
                     feat_scale);
    return false;
  }

  if (!(t_shift >= 0)) {
    SHERPA_ONNX_LOGE("--zipvoice-t-shift must be non-negative. Given: %f",
                     t_shift);
    return false;
  }

  if (!(target_rms > 0)) {
    SHERPA_ONNX_LOGE("--zipvoice-target-rms must be positive. Given: %f",
                     target_rms);
    return false;
  }

  if (!(guidance_scale > 0)) {
    SHERPA_ONNX_LOGE("--zipvoice-guidance-scale must be positive. Given: %f",
                     guidance_scale);
    return false;
  }

  return true;
}
// Renders the configuration as a single-line, human-readable string of the
// form `OfflineTtsZipvoiceModelConfig(name=value, ...)`. String-valued fields
// are wrapped in double quotes; numeric fields are streamed as-is.
//
// @return The formatted description of this config.
std::string OfflineTtsZipvoiceModelConfig::ToString() const {
  std::ostringstream out;

  out << "OfflineTtsZipvoiceModelConfig("
      << "tokens=\"" << tokens << "\", "
      << "text_model=\"" << text_model << "\", "
      << "flow_matching_model=\"" << flow_matching_model << "\", "
      << "vocoder=\"" << vocoder << "\", "
      << "data_dir=\"" << data_dir << "\", "
      << "pinyin_dict=\"" << pinyin_dict << "\", "
      << "feat_scale=" << feat_scale << ", "
      << "t_shift=" << t_shift << ", "
      << "target_rms=" << target_rms << ", "
      << "guidance_scale=" << guidance_scale << ")";

  return out.str();
}
} // namespace sherpa_onnx