Committed by
GitHub
Support scaling the duration of a pause in TTS. (#1820)
正在显示
24 个修改的文件
包含
171 行增加
和
19 行删除
| @@ -116,7 +116,7 @@ int32_t main() { | @@ -116,7 +116,7 @@ int32_t main() { | ||
| 116 | keywords_spotter_config.keywords_buf = keywords_buf; | 116 | keywords_spotter_config.keywords_buf = keywords_buf; |
| 117 | keywords_spotter_config.keywords_buf_size = keywords_buf_size; | 117 | keywords_spotter_config.keywords_buf_size = keywords_buf_size; |
| 118 | 118 | ||
| 119 | - SherpaOnnxKeywordSpotter *keywords_spotter = | 119 | + const SherpaOnnxKeywordSpotter *keywords_spotter = |
| 120 | SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config); | 120 | SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config); |
| 121 | 121 | ||
| 122 | free((void *)tokens_buf); | 122 | free((void *)tokens_buf); |
| @@ -130,7 +130,7 @@ int32_t main() { | @@ -130,7 +130,7 @@ int32_t main() { | ||
| 130 | return -1; | 130 | return -1; |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | - SherpaOnnxOnlineStream *stream = | 133 | + const SherpaOnnxOnlineStream *stream = |
| 134 | SherpaOnnxCreateKeywordStream(keywords_spotter); | 134 | SherpaOnnxCreateKeywordStream(keywords_spotter); |
| 135 | 135 | ||
| 136 | const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50); | 136 | const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50); |
| @@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct { | @@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct { | ||
| 180 | external int maxNumSenetences; | 180 | external int maxNumSenetences; |
| 181 | 181 | ||
| 182 | external Pointer<Utf8> ruleFars; | 182 | external Pointer<Utf8> ruleFars; |
| 183 | + | ||
| 184 | + @Float() | ||
| 185 | + external double silenceScale; | ||
| 183 | } | 186 | } |
| 184 | 187 | ||
| 185 | final class SherpaOnnxGeneratedAudio extends Struct { | 188 | final class SherpaOnnxGeneratedAudio extends Struct { |
| @@ -114,17 +114,19 @@ class OfflineTtsConfig { | @@ -114,17 +114,19 @@ class OfflineTtsConfig { | ||
| 114 | this.ruleFsts = '', | 114 | this.ruleFsts = '', |
| 115 | this.maxNumSenetences = 1, | 115 | this.maxNumSenetences = 1, |
| 116 | this.ruleFars = '', | 116 | this.ruleFars = '', |
| 117 | + this.silenceScale = 0.2, | ||
| 117 | }); | 118 | }); |
| 118 | 119 | ||
| 119 | @override | 120 | @override |
| 120 | String toString() { | 121 | String toString() { |
| 121 | - return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars)'; | 122 | + return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars, silenceScale: $silenceScale)'; |
| 122 | } | 123 | } |
| 123 | 124 | ||
| 124 | final OfflineTtsModelConfig model; | 125 | final OfflineTtsModelConfig model; |
| 125 | final String ruleFsts; | 126 | final String ruleFsts; |
| 126 | final int maxNumSenetences; | 127 | final int maxNumSenetences; |
| 127 | final String ruleFars; | 128 | final String ruleFars; |
| 129 | + final double silenceScale; | ||
| 128 | } | 130 | } |
| 129 | 131 | ||
| 130 | class GeneratedAudio { | 132 | class GeneratedAudio { |
| @@ -180,6 +182,7 @@ class OfflineTts { | @@ -180,6 +182,7 @@ class OfflineTts { | ||
| 180 | c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); | 182 | c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); |
| 181 | c.ref.maxNumSenetences = config.maxNumSenetences; | 183 | c.ref.maxNumSenetences = config.maxNumSenetences; |
| 182 | c.ref.ruleFars = config.ruleFars.toNativeUtf8(); | 184 | c.ref.ruleFars = config.ruleFars.toNativeUtf8(); |
| 185 | + c.ref.silenceScale = config.silenceScale; | ||
| 183 | 186 | ||
| 184 | final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr; | 187 | final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr; |
| 185 | 188 |
| @@ -146,6 +146,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( | @@ -146,6 +146,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( | ||
| 146 | SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); | 146 | SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); |
| 147 | SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences); | 147 | SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences); |
| 148 | SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); | 148 | SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); |
| 149 | + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(silence_scale, silenceScale); | ||
| 149 | 150 | ||
| 150 | #if __OHOS__ | 151 | #if __OHOS__ |
| 151 | std::unique_ptr<NativeResourceManager, | 152 | std::unique_ptr<NativeResourceManager, |
| @@ -52,6 +52,7 @@ export class OfflineTtsConfig { | @@ -52,6 +52,7 @@ export class OfflineTtsConfig { | ||
| 52 | public ruleFsts: string = ''; | 52 | public ruleFsts: string = ''; |
| 53 | public ruleFars: string = ''; | 53 | public ruleFars: string = ''; |
| 54 | public maxNumSentences: number = 1; | 54 | public maxNumSentences: number = 1; |
| 55 | + public silenceScale: number = 0.2; | ||
| 55 | } | 56 | } |
| 56 | 57 | ||
| 57 | export class TtsOutput { | 58 | export class TtsOutput { |
| @@ -98,4 +99,4 @@ export class OfflineTts { | @@ -98,4 +99,4 @@ export class OfflineTts { | ||
| 98 | generateAsync(input: TtsInput): Promise<TtsOutput> { | 99 | generateAsync(input: TtsInput): Promise<TtsOutput> { |
| 99 | return offlineTtsGenerateAsync(this.handle, input); | 100 | return offlineTtsGenerateAsync(this.handle, input); |
| 100 | } | 101 | } |
| 101 | -} | ||
| 102 | +} |
| @@ -13,6 +13,7 @@ namespace SherpaOnnx | @@ -13,6 +13,7 @@ namespace SherpaOnnx | ||
| 13 | RuleFsts = ""; | 13 | RuleFsts = ""; |
| 14 | MaxNumSentences = 1; | 14 | MaxNumSentences = 1; |
| 15 | RuleFars = ""; | 15 | RuleFars = ""; |
| 16 | + SilenceScale = 0.2F; | ||
| 16 | } | 17 | } |
| 17 | public OfflineTtsModelConfig Model; | 18 | public OfflineTtsModelConfig Model; |
| 18 | 19 | ||
| @@ -23,6 +24,7 @@ namespace SherpaOnnx | @@ -23,6 +24,7 @@ namespace SherpaOnnx | ||
| 23 | 24 | ||
| 24 | [MarshalAs(UnmanagedType.LPStr)] | 25 | [MarshalAs(UnmanagedType.LPStr)] |
| 25 | public string RuleFars; | 26 | public string RuleFars; |
| 26 | - } | ||
| 27 | 27 | ||
| 28 | -} | ||
| 28 | + public float SilenceScale; | ||
| 29 | + } | ||
| 30 | +} |
| @@ -712,6 +712,7 @@ type OfflineTtsConfig struct { | @@ -712,6 +712,7 @@ type OfflineTtsConfig struct { | ||
| 712 | RuleFsts string | 712 | RuleFsts string |
| 713 | RuleFars string | 713 | RuleFars string |
| 714 | MaxNumSentences int | 714 | MaxNumSentences int |
| 715 | + SilenceScale float32 | ||
| 715 | } | 716 | } |
| 716 | 717 | ||
| 717 | type GeneratedAudio struct { | 718 | type GeneratedAudio struct { |
| @@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { | @@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { | ||
| 744 | defer C.free(unsafe.Pointer(c.rule_fars)) | 745 | defer C.free(unsafe.Pointer(c.rule_fars)) |
| 745 | 746 | ||
| 746 | c.max_num_sentences = C.int(config.MaxNumSentences) | 747 | c.max_num_sentences = C.int(config.MaxNumSentences) |
| 748 | + c.silence_scale = C.float(config.SilenceScale) | ||
| 747 | 749 | ||
| 748 | // vits | 750 | // vits |
| 749 | c.model.vits.model = C.CString(config.Model.Vits.Model) | 751 | c.model.vits.model = C.CString(config.Model.Vits.Model) |
| @@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | @@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( | ||
| 1135 | tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); | 1135 | tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); |
| 1136 | tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); | 1136 | tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); |
| 1137 | tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1); | 1137 | tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1); |
| 1138 | + tts_config.silence_scale = SHERPA_ONNX_OR(config->silence_scale, 0.2); | ||
| 1138 | 1139 | ||
| 1139 | if (tts_config.model.debug) { | 1140 | if (tts_config.model.debug) { |
| 1140 | #if __OHOS__ | 1141 | #if __OHOS__ |
| @@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { | @@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { | ||
| 944 | const char *rule_fsts; | 944 | const char *rule_fsts; |
| 945 | int32_t max_num_sentences; | 945 | int32_t max_num_sentences; |
| 946 | const char *rule_fars; | 946 | const char *rule_fars; |
| 947 | + float silence_scale; | ||
| 947 | } SherpaOnnxOfflineTtsConfig; | 948 | } SherpaOnnxOfflineTtsConfig; |
| 948 | 949 | ||
| 949 | SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { | 950 | SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { |
| @@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { | @@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { | ||
| 352 | 352 | ||
| 353 | c.rule_fsts = config.rule_fsts.c_str(); | 353 | c.rule_fsts = config.rule_fsts.c_str(); |
| 354 | c.max_num_sentences = config.max_num_sentences; | 354 | c.max_num_sentences = config.max_num_sentences; |
| 355 | + c.silence_scale = config.silence_scale; | ||
| 355 | c.rule_fars = config.rule_fars.c_str(); | 356 | c.rule_fars = config.rule_fars.c_str(); |
| 356 | 357 | ||
| 357 | auto p = SherpaOnnxCreateOfflineTts(&c); | 358 | auto p = SherpaOnnxCreateOfflineTts(&c); |
| @@ -363,6 +363,7 @@ struct OfflineTtsConfig { | @@ -363,6 +363,7 @@ struct OfflineTtsConfig { | ||
| 363 | std::string rule_fsts; | 363 | std::string rule_fsts; |
| 364 | std::string rule_fars; | 364 | std::string rule_fars; |
| 365 | int32_t max_num_sentences = 1; | 365 | int32_t max_num_sentences = 1; |
| 366 | + float silence_scale = 0.2; | ||
| 366 | }; | 367 | }; |
| 367 | 368 | ||
| 368 | struct GeneratedAudio { | 369 | struct GeneratedAudio { |
| @@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | @@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { | ||
| 420 | GeneratedAudio ans; | 420 | GeneratedAudio ans; |
| 421 | ans.sample_rate = model_->GetMetaData().sample_rate; | 421 | ans.sample_rate = model_->GetMetaData().sample_rate; |
| 422 | ans.samples = std::vector<float>(p, p + total); | 422 | ans.samples = std::vector<float>(p, p + total); |
| 423 | + | ||
| 424 | + float silence_scale = config_.silence_scale; | ||
| 425 | + if (silence_scale != 1) { | ||
| 426 | + ans = ans.ScaleSilence(silence_scale); | ||
| 427 | + } | ||
| 428 | + | ||
| 423 | return ans; | 429 | return ans; |
| 424 | } | 430 | } |
| 425 | 431 |
| @@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | @@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | ||
| 398 | GeneratedAudio ans; | 398 | GeneratedAudio ans; |
| 399 | ans.sample_rate = model_->GetMetaData().sample_rate; | 399 | ans.sample_rate = model_->GetMetaData().sample_rate; |
| 400 | ans.samples = std::vector<float>(p, p + total); | 400 | ans.samples = std::vector<float>(p, p + total); |
| 401 | + | ||
| 402 | + float silence_scale = config_.silence_scale; | ||
| 403 | + if (silence_scale != 1) { | ||
| 404 | + ans = ans.ScaleSilence(silence_scale); | ||
| 405 | + } | ||
| 406 | + | ||
| 401 | return ans; | 407 | return ans; |
| 402 | } | 408 | } |
| 403 | 409 |
| @@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 485 | GeneratedAudio ans; | 485 | GeneratedAudio ans; |
| 486 | ans.sample_rate = model_->GetMetaData().sample_rate; | 486 | ans.sample_rate = model_->GetMetaData().sample_rate; |
| 487 | ans.samples = std::vector<float>(p, p + total); | 487 | ans.samples = std::vector<float>(p, p + total); |
| 488 | + | ||
| 489 | + float silence_scale = config_.silence_scale; | ||
| 490 | + if (silence_scale != 1) { | ||
| 491 | + ans = ans.ScaleSilence(silence_scale); | ||
| 492 | + } | ||
| 493 | + | ||
| 488 | return ans; | 494 | return ans; |
| 489 | } | 495 | } |
| 490 | 496 |
| @@ -4,6 +4,7 @@ | @@ -4,6 +4,7 @@ | ||
| 4 | 4 | ||
| 5 | #include "sherpa-onnx/csrc/offline-tts.h" | 5 | #include "sherpa-onnx/csrc/offline-tts.h" |
| 6 | 6 | ||
| 7 | +#include <cmath> | ||
| 7 | #include <string> | 8 | #include <string> |
| 8 | #include <utility> | 9 | #include <utility> |
| 9 | 10 | ||
| @@ -23,6 +24,72 @@ | @@ -23,6 +24,72 @@ | ||
| 23 | 24 | ||
| 24 | namespace sherpa_onnx { | 25 | namespace sherpa_onnx { |
| 25 | 26 | ||
| 27 | +struct SilenceInterval { | ||
| 28 | + int32_t start; | ||
| 29 | + int32_t end; | ||
| 30 | +}; | ||
| 31 | + | ||
| 32 | +GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const { | ||
| 33 | + if (scale == 1) { | ||
| 34 | + return *this; | ||
| 35 | + } | ||
| 36 | + // if the interval is larger than 0.6 second, then we assume it is a pause | ||
| 37 | + int32_t threshold = static_cast<int32_t>(sample_rate * 0.6); | ||
| 38 | + | ||
| 39 | + std::vector<SilenceInterval> intervals; | ||
| 40 | + int32_t num_samples = static_cast<int32_t>(samples.size()); | ||
| 41 | + | ||
| 42 | + int32_t last = -1; | ||
| 43 | + int32_t i; | ||
| 44 | + for (i = 0; i != num_samples; ++i) { | ||
| 45 | + if (fabs(samples[i]) <= 0.01) { | ||
| 46 | + if (last == -1) { | ||
| 47 | + last = i; | ||
| 48 | + } | ||
| 49 | + continue; | ||
| 50 | + } | ||
| 51 | + | ||
| 52 | + if (last != -1 && i - last < threshold) { | ||
| 53 | + last = -1; | ||
| 54 | + continue; | ||
| 55 | + } | ||
| 56 | + | ||
| 57 | + if (last != -1) { | ||
| 58 | + intervals.push_back({last, i}); | ||
| 59 | + last = -1; | ||
| 60 | + } | ||
| 61 | + } | ||
| 62 | + | ||
| 63 | + if (last != -1 && num_samples - last > threshold) { | ||
| 64 | + intervals.push_back({last, num_samples}); | ||
| 65 | + } | ||
| 66 | + | ||
| 67 | + if (intervals.empty()) { | ||
| 68 | + return *this; | ||
| 69 | + } | ||
| 70 | + | ||
| 71 | + GeneratedAudio ans; | ||
| 72 | + ans.sample_rate = sample_rate; | ||
| 73 | + ans.samples.reserve(samples.size()); | ||
| 74 | + | ||
| 75 | + i = 0; | ||
| 76 | + for (const auto &interval : intervals) { | ||
| 77 | + ans.samples.insert(ans.samples.end(), samples.begin() + i, | ||
| 78 | + samples.begin() + interval.start); | ||
| 79 | + i = interval.end; | ||
| 80 | + int32_t n = static_cast<int32_t>((interval.end - interval.start) * scale); | ||
| 81 | + | ||
| 82 | + ans.samples.insert(ans.samples.end(), samples.begin() + interval.start, | ||
| 83 | + samples.begin() + interval.start + n); | ||
| 84 | + } | ||
| 85 | + | ||
| 86 | + if (i < num_samples) { | ||
| 87 | + ans.samples.insert(ans.samples.end(), samples.begin() + i, samples.end()); | ||
| 88 | + } | ||
| 89 | + | ||
| 90 | + return ans; | ||
| 91 | +} | ||
| 92 | + | ||
| 26 | void OfflineTtsConfig::Register(ParseOptions *po) { | 93 | void OfflineTtsConfig::Register(ParseOptions *po) { |
| 27 | model.Register(po); | 94 | model.Register(po); |
| 28 | 95 | ||
| @@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) { | @@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) { | ||
| 44 | "Maximum number of sentences that we process at a time. " | 111 | "Maximum number of sentences that we process at a time. " |
| 45 | "This is to avoid OOM for very long input text. " | 112 | "This is to avoid OOM for very long input text. " |
| 46 | "If you set it to -1, then we process all sentences in a single batch."); | 113 | "If you set it to -1, then we process all sentences in a single batch."); |
| 114 | + | ||
| 115 | + po->Register("tts-silence-scale", &silence_scale, | ||
| 116 | + "Duration of the pause is scaled by this number. So a smaller " | ||
| 117 | + "value leads to a shorter pause."); | ||
| 47 | } | 118 | } |
| 48 | 119 | ||
| 49 | bool OfflineTtsConfig::Validate() const { | 120 | bool OfflineTtsConfig::Validate() const { |
| @@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const { | @@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const { | ||
| 69 | } | 140 | } |
| 70 | } | 141 | } |
| 71 | 142 | ||
| 143 | + if (silence_scale < 0.001) { | ||
| 144 | + SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale); | ||
| 145 | + return false; | ||
| 146 | + } | ||
| 147 | + | ||
| 72 | return model.Validate(); | 148 | return model.Validate(); |
| 73 | } | 149 | } |
| 74 | 150 | ||
| @@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const { | @@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const { | ||
| 79 | os << "model=" << model.ToString() << ", "; | 155 | os << "model=" << model.ToString() << ", "; |
| 80 | os << "rule_fsts=\"" << rule_fsts << "\", "; | 156 | os << "rule_fsts=\"" << rule_fsts << "\", "; |
| 81 | os << "rule_fars=\"" << rule_fars << "\", "; | 157 | os << "rule_fars=\"" << rule_fars << "\", "; |
| 82 | - os << "max_num_sentences=" << max_num_sentences << ")"; | 158 | + os << "max_num_sentences=" << max_num_sentences << ", "; |
| 159 | + os << "silence_scale=" << silence_scale << ")"; | ||
| 83 | 160 | ||
| 84 | return os.str(); | 161 | return os.str(); |
| 85 | } | 162 | } |
| @@ -32,14 +32,20 @@ struct OfflineTtsConfig { | @@ -32,14 +32,20 @@ struct OfflineTtsConfig { | ||
| 32 | // If you set it to -1, then we process all sentences in a single batch. | 32 | // If you set it to -1, then we process all sentences in a single batch. |
| 33 | int32_t max_num_sentences = 1; | 33 | int32_t max_num_sentences = 1; |
| 34 | 34 | ||
| 35 | + // A silence interval is a run of audio samples with values close to 0. | ||
| 36 | + // | ||
| 37 | + // Each pause (a sufficiently long silence interval) is resized so that | ||
| 37 | + // its new duration is old_duration * silence_scale. | ||
| 38 | + float silence_scale = 0.2; | ||
| 39 | + | ||
| 35 | OfflineTtsConfig() = default; | 40 | OfflineTtsConfig() = default; |
| 36 | OfflineTtsConfig(const OfflineTtsModelConfig &model, | 41 | OfflineTtsConfig(const OfflineTtsModelConfig &model, |
| 37 | const std::string &rule_fsts, const std::string &rule_fars, | 42 | const std::string &rule_fsts, const std::string &rule_fars, |
| 38 | - int32_t max_num_sentences) | 43 | + int32_t max_num_sentences, float silence_scale) |
| 39 | : model(model), | 44 | : model(model), |
| 40 | rule_fsts(rule_fsts), | 45 | rule_fsts(rule_fsts), |
| 41 | rule_fars(rule_fars), | 46 | rule_fars(rule_fars), |
| 42 | - max_num_sentences(max_num_sentences) {} | 47 | + max_num_sentences(max_num_sentences), |
| 48 | + silence_scale(silence_scale) {} | ||
| 43 | 49 | ||
| 44 | void Register(ParseOptions *po); | 50 | void Register(ParseOptions *po); |
| 45 | bool Validate() const; | 51 | bool Validate() const; |
| @@ -50,6 +56,11 @@ struct OfflineTtsConfig { | @@ -50,6 +56,11 @@ struct OfflineTtsConfig { | ||
| 50 | struct GeneratedAudio { | 56 | struct GeneratedAudio { |
| 51 | std::vector<float> samples; | 57 | std::vector<float> samples; |
| 52 | int32_t sample_rate; | 58 | int32_t sample_rate; |
| 59 | + | ||
| 60 | + // Silence means pause here. | ||
| 61 | + // If scale > 1, then it increases the duration of a pause | ||
| 62 | + // If scale < 1, then it reduces the duration of a pause | ||
| 63 | + GeneratedAudio ScaleSilence(float scale) const; | ||
| 53 | }; | 64 | }; |
| 54 | 65 | ||
| 55 | class OfflineTtsImpl; | 66 | class OfflineTtsImpl; |
| @@ -7,12 +7,14 @@ public class OfflineTtsConfig { | @@ -7,12 +7,14 @@ public class OfflineTtsConfig { | ||
| 7 | private final String ruleFsts; | 7 | private final String ruleFsts; |
| 8 | private final String ruleFars; | 8 | private final String ruleFars; |
| 9 | private final int maxNumSentences; | 9 | private final int maxNumSentences; |
| 10 | + private final float silenceScale; | ||
| 10 | 11 | ||
| 11 | private OfflineTtsConfig(Builder builder) { | 12 | private OfflineTtsConfig(Builder builder) { |
| 12 | this.model = builder.model; | 13 | this.model = builder.model; |
| 13 | this.ruleFsts = builder.ruleFsts; | 14 | this.ruleFsts = builder.ruleFsts; |
| 14 | this.ruleFars = builder.ruleFars; | 15 | this.ruleFars = builder.ruleFars; |
| 15 | this.maxNumSentences = builder.maxNumSentences; | 16 | this.maxNumSentences = builder.maxNumSentences; |
| 17 | + this.silenceScale = builder.silenceScale; | ||
| 16 | } | 18 | } |
| 17 | 19 | ||
| 18 | public static Builder builder() { | 20 | public static Builder builder() { |
| @@ -35,11 +37,16 @@ public class OfflineTtsConfig { | @@ -35,11 +37,16 @@ public class OfflineTtsConfig { | ||
| 35 | return maxNumSentences; | 37 | return maxNumSentences; |
| 36 | } | 38 | } |
| 37 | 39 | ||
| 40 | + public float getSilenceScale() { | ||
| 41 | + return silenceScale; | ||
| 42 | + } | ||
| 43 | + | ||
| 38 | public static class Builder { | 44 | public static class Builder { |
| 39 | private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build(); | 45 | private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build(); |
| 40 | private String ruleFsts = ""; | 46 | private String ruleFsts = ""; |
| 41 | private String ruleFars = ""; | 47 | private String ruleFars = ""; |
| 42 | private int maxNumSentences = 1; | 48 | private int maxNumSentences = 1; |
| 49 | + private float silenceScale = 0.2f; | ||
| 43 | 50 | ||
| 44 | public OfflineTtsConfig build() { | 51 | public OfflineTtsConfig build() { |
| 45 | return new OfflineTtsConfig(this); | 52 | return new OfflineTtsConfig(this); |
| @@ -64,5 +71,10 @@ public class OfflineTtsConfig { | @@ -64,5 +71,10 @@ public class OfflineTtsConfig { | ||
| 64 | this.maxNumSentences = maxNumSentences; | 71 | this.maxNumSentences = maxNumSentences; |
| 65 | return this; | 72 | return this; |
| 66 | } | 73 | } |
| 74 | + | ||
| 75 | + public Builder setSilenceScale(float silenceScale) { | ||
| 76 | + this.silenceScale = silenceScale; | ||
| 77 | + return this; | ||
| 78 | + } | ||
| 67 | } | 79 | } |
| 68 | } | 80 | } |
| @@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | @@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | ||
| 187 | fid = env->GetFieldID(cls, "maxNumSentences", "I"); | 187 | fid = env->GetFieldID(cls, "maxNumSentences", "I"); |
| 188 | ans.max_num_sentences = env->GetIntField(config, fid); | 188 | ans.max_num_sentences = env->GetIntField(config, fid); |
| 189 | 189 | ||
| 190 | + fid = env->GetFieldID(cls, "silenceScale", "F"); | ||
| 191 | + ans.silence_scale = env->GetFloatField(config, fid); | ||
| 192 | + | ||
| 190 | return ans; | 193 | return ans; |
| 191 | } | 194 | } |
| 192 | 195 |
| @@ -49,6 +49,7 @@ data class OfflineTtsConfig( | @@ -49,6 +49,7 @@ data class OfflineTtsConfig( | ||
| 49 | var ruleFsts: String = "", | 49 | var ruleFsts: String = "", |
| 50 | var ruleFars: String = "", | 50 | var ruleFars: String = "", |
| 51 | var maxNumSentences: Int = 1, | 51 | var maxNumSentences: Int = 1, |
| 52 | + var silenceScale: Float = 0.2f, | ||
| 52 | ) | 53 | ) |
| 53 | 54 | ||
| 54 | class GeneratedAudio( | 55 | class GeneratedAudio( |
| @@ -106,6 +106,7 @@ type | @@ -106,6 +106,7 @@ type | ||
| 106 | RuleFsts: AnsiString; | 106 | RuleFsts: AnsiString; |
| 107 | MaxNumSentences: Integer; | 107 | MaxNumSentences: Integer; |
| 108 | RuleFars: AnsiString; | 108 | RuleFars: AnsiString; |
| 109 | + SilenceScale: Single; | ||
| 109 | 110 | ||
| 110 | function ToString: AnsiString; | 111 | function ToString: AnsiString; |
| 111 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); | 112 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); |
| @@ -777,6 +778,7 @@ type | @@ -777,6 +778,7 @@ type | ||
| 777 | RuleFsts: PAnsiChar; | 778 | RuleFsts: PAnsiChar; |
| 778 | MaxNumSentences: cint32; | 779 | MaxNumSentences: cint32; |
| 779 | RuleFars: PAnsiChar; | 780 | RuleFars: PAnsiChar; |
| 781 | + SilenceScale: cfloat; | ||
| 780 | end; | 782 | end; |
| 781 | 783 | ||
| 782 | PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig; | 784 | PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig; |
| @@ -1976,15 +1978,17 @@ begin | @@ -1976,15 +1978,17 @@ begin | ||
| 1976 | 'Model := %s, ' + | 1978 | 'Model := %s, ' + |
| 1977 | 'RuleFsts := %s, ' + | 1979 | 'RuleFsts := %s, ' + |
| 1978 | 'MaxNumSentences := %d, ' + | 1980 | 'MaxNumSentences := %d, ' + |
| 1979 | - 'RuleFars := %s' + | 1981 | + 'RuleFars := %s, ' + |
| 1982 | + 'SilenceScale := %f' + | ||
| 1980 | ')', | 1983 | ')', |
| 1981 | - [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars | ||
| 1982 | - ]); | 1984 | + [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars, |
| 1985 | + Self.SilenceScale]); | ||
| 1983 | end; | 1986 | end; |
| 1984 | 1987 | ||
| 1985 | class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); | 1988 | class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); |
| 1986 | begin | 1989 | begin |
| 1987 | Dest.MaxNumSentences := 1; | 1990 | Dest.MaxNumSentences := 1; |
| 1991 | + Dest.SilenceScale := 0.2; | ||
| 1988 | end; | 1992 | end; |
| 1989 | 1993 | ||
| 1990 | constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig); | 1994 | constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig); |
| @@ -2027,6 +2031,7 @@ begin | @@ -2027,6 +2031,7 @@ begin | ||
| 2027 | C.RuleFsts := PAnsiChar(Config.RuleFsts); | 2031 | C.RuleFsts := PAnsiChar(Config.RuleFsts); |
| 2028 | C.MaxNumSentences := Config.MaxNumSentences; | 2032 | C.MaxNumSentences := Config.MaxNumSentences; |
| 2029 | C.RuleFars := PAnsiChar(Config.RuleFars); | 2033 | C.RuleFars := PAnsiChar(Config.RuleFars); |
| 2034 | + C.SilenceScale := Config.SilenceScale; | ||
| 2030 | 2035 | ||
| 2031 | Self.Handle := SherpaOnnxCreateOfflineTts(@C); | 2036 | Self.Handle := SherpaOnnxCreateOfflineTts(@C); |
| 2032 | 2037 |
| @@ -32,13 +32,15 @@ static void PybindOfflineTtsConfig(py::module *m) { | @@ -32,13 +32,15 @@ static void PybindOfflineTtsConfig(py::module *m) { | ||
| 32 | py::class_<PyClass>(*m, "OfflineTtsConfig") | 32 | py::class_<PyClass>(*m, "OfflineTtsConfig") |
| 33 | .def(py::init<>()) | 33 | .def(py::init<>()) |
| 34 | .def(py::init<const OfflineTtsModelConfig &, const std::string &, | 34 | .def(py::init<const OfflineTtsModelConfig &, const std::string &, |
| 35 | - const std::string &, int32_t>(), | 35 | + const std::string &, int32_t, float>(), |
| 36 | py::arg("model"), py::arg("rule_fsts") = "", | 36 | py::arg("model"), py::arg("rule_fsts") = "", |
| 37 | - py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2) | 37 | + py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2, |
| 38 | + py::arg("silence_scale") = 0.2) | ||
| 38 | .def_readwrite("model", &PyClass::model) | 39 | .def_readwrite("model", &PyClass::model) |
| 39 | .def_readwrite("rule_fsts", &PyClass::rule_fsts) | 40 | .def_readwrite("rule_fsts", &PyClass::rule_fsts) |
| 40 | .def_readwrite("rule_fars", &PyClass::rule_fars) | 41 | .def_readwrite("rule_fars", &PyClass::rule_fars) |
| 41 | .def_readwrite("max_num_sentences", &PyClass::max_num_sentences) | 42 | .def_readwrite("max_num_sentences", &PyClass::max_num_sentences) |
| 43 | + .def_readwrite("silence_scale", &PyClass::silence_scale) | ||
| 42 | .def("validate", &PyClass::Validate) | 44 | .def("validate", &PyClass::Validate) |
| 43 | .def("__str__", &PyClass::ToString); | 45 | .def("__str__", &PyClass::ToString); |
| 44 | } | 46 | } |
| @@ -804,13 +804,15 @@ func sherpaOnnxOfflineTtsConfig( | @@ -804,13 +804,15 @@ func sherpaOnnxOfflineTtsConfig( | ||
| 804 | model: SherpaOnnxOfflineTtsModelConfig, | 804 | model: SherpaOnnxOfflineTtsModelConfig, |
| 805 | ruleFsts: String = "", | 805 | ruleFsts: String = "", |
| 806 | ruleFars: String = "", | 806 | ruleFars: String = "", |
| 807 | - maxNumSentences: Int = 1 | 807 | + maxNumSentences: Int = 1, |
| 808 | + silenceScale: Float = 0.2 | ||
| 808 | ) -> SherpaOnnxOfflineTtsConfig { | 809 | ) -> SherpaOnnxOfflineTtsConfig { |
| 809 | return SherpaOnnxOfflineTtsConfig( | 810 | return SherpaOnnxOfflineTtsConfig( |
| 810 | model: model, | 811 | model: model, |
| 811 | rule_fsts: toCPointer(ruleFsts), | 812 | rule_fsts: toCPointer(ruleFsts), |
| 812 | max_num_sentences: Int32(maxNumSentences), | 813 | max_num_sentences: Int32(maxNumSentences), |
| 813 | - rule_fars: toCPointer(ruleFars) | 814 | + rule_fars: toCPointer(ruleFars), |
| 815 | + silence_scale: silenceScale | ||
| 814 | ) | 816 | ) |
| 815 | } | 817 | } |
| 816 | 818 |
| @@ -21,7 +21,7 @@ function freeConfig(config, Module) { | @@ -21,7 +21,7 @@ function freeConfig(config, Module) { | ||
| 21 | 21 | ||
| 22 | // The user should free the returned pointers | 22 | // The user should free the returned pointers |
| 23 | function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | 23 | function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { |
| 24 | - const modelLen = Module.lengthBytesUTF8(config.model || '')+ 1; | 24 | + const modelLen = Module.lengthBytesUTF8(config.model || '') + 1; |
| 25 | const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; | 25 | const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; |
| 26 | const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; | 26 | const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; |
| 27 | const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; | 27 | const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; |
| @@ -282,7 +282,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -282,7 +282,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 282 | function initSherpaOnnxOfflineTtsConfig(config, Module) { | 282 | function initSherpaOnnxOfflineTtsConfig(config, Module) { |
| 283 | const modelConfig = | 283 | const modelConfig = |
| 284 | initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); | 284 | initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); |
| 285 | - const len = modelConfig.len + 3 * 4; | 285 | + const len = modelConfig.len + 4 * 4; |
| 286 | const ptr = Module._malloc(len); | 286 | const ptr = Module._malloc(len); |
| 287 | 287 | ||
| 288 | let offset = 0; | 288 | let offset = 0; |
| @@ -303,6 +303,10 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) { | @@ -303,6 +303,10 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) { | ||
| 303 | offset += 4; | 303 | offset += 4; |
| 304 | 304 | ||
| 305 | Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*'); | 305 | Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*'); |
| 306 | + offset += 4; | ||
| 307 | + | ||
| 308 | + Module.setValue(ptr + offset, config.silenceScale || 0.2, 'float'); | ||
| 309 | + offset += 4; | ||
| 306 | 310 | ||
| 307 | return { | 311 | return { |
| 308 | buffer: buffer, ptr: ptr, len: len, config: modelConfig, | 312 | buffer: buffer, ptr: ptr, len: len, config: modelConfig, |
| @@ -22,7 +22,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == | @@ -22,7 +22,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == | ||
| 22 | sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, | 22 | sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, |
| 23 | ""); | 23 | ""); |
| 24 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == | 24 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == |
| 25 | - sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, | 25 | + sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4, |
| 26 | ""); | 26 | ""); |
| 27 | 27 | ||
| 28 | void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | 28 | void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { |
| @@ -68,6 +68,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -68,6 +68,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 68 | fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); | 68 | fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); |
| 69 | fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars); | 69 | fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars); |
| 70 | fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); | 70 | fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); |
| 71 | + fprintf(stdout, "silence scale: %.3f\n", tts_config->silence_scale); | ||
| 71 | } | 72 | } |
| 72 | 73 | ||
| 73 | void CopyHeap(const char *src, int32_t num_bytes, char *dst) { | 74 | void CopyHeap(const char *src, int32_t num_bytes, char *dst) { |
-
请 注册 或 登录 后发表评论