Fangjun Kuang
Committed by GitHub

Support scaling the duration of a pause in TTS. (#1820)

@@ -116,7 +116,7 @@ int32_t main() { @@ -116,7 +116,7 @@ int32_t main() {
116 keywords_spotter_config.keywords_buf = keywords_buf; 116 keywords_spotter_config.keywords_buf = keywords_buf;
117 keywords_spotter_config.keywords_buf_size = keywords_buf_size; 117 keywords_spotter_config.keywords_buf_size = keywords_buf_size;
118 118
119 - SherpaOnnxKeywordSpotter *keywords_spotter = 119 + const SherpaOnnxKeywordSpotter *keywords_spotter =
120 SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config); 120 SherpaOnnxCreateKeywordSpotter(&keywords_spotter_config);
121 121
122 free((void *)tokens_buf); 122 free((void *)tokens_buf);
@@ -130,7 +130,7 @@ int32_t main() { @@ -130,7 +130,7 @@ int32_t main() {
130 return -1; 130 return -1;
131 } 131 }
132 132
133 - SherpaOnnxOnlineStream *stream = 133 + const SherpaOnnxOnlineStream *stream =
134 SherpaOnnxCreateKeywordStream(keywords_spotter); 134 SherpaOnnxCreateKeywordStream(keywords_spotter);
135 135
136 const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50); 136 const SherpaOnnxDisplay *display = SherpaOnnxCreateDisplay(50);
@@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct { @@ -180,6 +180,9 @@ final class SherpaOnnxOfflineTtsConfig extends Struct {
180 external int maxNumSenetences; 180 external int maxNumSenetences;
181 181
182 external Pointer<Utf8> ruleFars; 182 external Pointer<Utf8> ruleFars;
  183 +
  184 + @Float()
  185 + external double silenceScale;
183 } 186 }
184 187
185 final class SherpaOnnxGeneratedAudio extends Struct { 188 final class SherpaOnnxGeneratedAudio extends Struct {
@@ -114,17 +114,19 @@ class OfflineTtsConfig { @@ -114,17 +114,19 @@ class OfflineTtsConfig {
114 this.ruleFsts = '', 114 this.ruleFsts = '',
115 this.maxNumSenetences = 1, 115 this.maxNumSenetences = 1,
116 this.ruleFars = '', 116 this.ruleFars = '',
  117 + this.silenceScale = 0.2,
117 }); 118 });
118 119
119 @override 120 @override
120 String toString() { 121 String toString() {
121 - return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars)'; 122 + return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars, silenceScale: $silenceScale)';
122 } 123 }
123 124
124 final OfflineTtsModelConfig model; 125 final OfflineTtsModelConfig model;
125 final String ruleFsts; 126 final String ruleFsts;
126 final int maxNumSenetences; 127 final int maxNumSenetences;
127 final String ruleFars; 128 final String ruleFars;
  129 + final double silenceScale;
128 } 130 }
129 131
130 class GeneratedAudio { 132 class GeneratedAudio {
@@ -180,6 +182,7 @@ class OfflineTts { @@ -180,6 +182,7 @@ class OfflineTts {
180 c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); 182 c.ref.ruleFsts = config.ruleFsts.toNativeUtf8();
181 c.ref.maxNumSenetences = config.maxNumSenetences; 183 c.ref.maxNumSenetences = config.maxNumSenetences;
182 c.ref.ruleFars = config.ruleFars.toNativeUtf8(); 184 c.ref.ruleFars = config.ruleFars.toNativeUtf8();
  185 + c.ref.silenceScale = config.silenceScale;
183 186
184 final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr; 187 final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr;
185 188
@@ -146,6 +146,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper( @@ -146,6 +146,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
146 SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); 146 SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts);
147 SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences); 147 SHERPA_ONNX_ASSIGN_ATTR_INT32(max_num_sentences, maxNumSentences);
148 SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); 148 SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars);
  149 + SHERPA_ONNX_ASSIGN_ATTR_STR(silence_scale, silenceScale);
149 150
150 #if __OHOS__ 151 #if __OHOS__
151 std::unique_ptr<NativeResourceManager, 152 std::unique_ptr<NativeResourceManager,
@@ -52,6 +52,7 @@ export class OfflineTtsConfig { @@ -52,6 +52,7 @@ export class OfflineTtsConfig {
52 public ruleFsts: string = ''; 52 public ruleFsts: string = '';
53 public ruleFars: string = ''; 53 public ruleFars: string = '';
54 public maxNumSentences: number = 1; 54 public maxNumSentences: number = 1;
  55 + public silenceScale: number = 0.2;
55 } 56 }
56 57
57 export class TtsOutput { 58 export class TtsOutput {
@@ -98,4 +99,4 @@ export class OfflineTts { @@ -98,4 +99,4 @@ export class OfflineTts {
98 generateAsync(input: TtsInput): Promise<TtsOutput> { 99 generateAsync(input: TtsInput): Promise<TtsOutput> {
99 return offlineTtsGenerateAsync(this.handle, input); 100 return offlineTtsGenerateAsync(this.handle, input);
100 } 101 }
101 -}  
  102 +}
@@ -13,6 +13,7 @@ namespace SherpaOnnx @@ -13,6 +13,7 @@ namespace SherpaOnnx
13 RuleFsts = ""; 13 RuleFsts = "";
14 MaxNumSentences = 1; 14 MaxNumSentences = 1;
15 RuleFars = ""; 15 RuleFars = "";
  16 + SilenceScale = 0.2F;
16 } 17 }
17 public OfflineTtsModelConfig Model; 18 public OfflineTtsModelConfig Model;
18 19
@@ -23,6 +24,7 @@ namespace SherpaOnnx @@ -23,6 +24,7 @@ namespace SherpaOnnx
23 24
24 [MarshalAs(UnmanagedType.LPStr)] 25 [MarshalAs(UnmanagedType.LPStr)]
25 public string RuleFars; 26 public string RuleFars;
26 - }  
27 27
28 -}  
  28 + public float SilenceScale;
  29 + }
  30 +}
@@ -712,6 +712,7 @@ type OfflineTtsConfig struct { @@ -712,6 +712,7 @@ type OfflineTtsConfig struct {
712 RuleFsts string 712 RuleFsts string
713 RuleFars string 713 RuleFars string
714 MaxNumSentences int 714 MaxNumSentences int
  715 + SilenceScale float32
715 } 716 }
716 717
717 type GeneratedAudio struct { 718 type GeneratedAudio struct {
@@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { @@ -744,6 +745,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
744 defer C.free(unsafe.Pointer(c.rule_fars)) 745 defer C.free(unsafe.Pointer(c.rule_fars))
745 746
746 c.max_num_sentences = C.int(config.MaxNumSentences) 747 c.max_num_sentences = C.int(config.MaxNumSentences)
  748 + c.silence_scale = C.float(config.SilenceScale)
747 749
748 // vits 750 // vits
749 c.model.vits.model = C.CString(config.Model.Vits.Model) 751 c.model.vits.model = C.CString(config.Model.Vits.Model)
@@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig( @@ -1135,6 +1135,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
1135 tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); 1135 tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
1136 tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); 1136 tts_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, "");
1137 tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1); 1137 tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 1);
  1138 + tts_config.silence_scale = SHERPA_ONNX_OR(config->silence_scale, 0.2);
1138 1139
1139 if (tts_config.model.debug) { 1140 if (tts_config.model.debug) {
1140 #if __OHOS__ 1141 #if __OHOS__
@@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { @@ -944,6 +944,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
944 const char *rule_fsts; 944 const char *rule_fsts;
945 int32_t max_num_sentences; 945 int32_t max_num_sentences;
946 const char *rule_fars; 946 const char *rule_fars;
  947 + float silence_scale;
947 } SherpaOnnxOfflineTtsConfig; 948 } SherpaOnnxOfflineTtsConfig;
948 949
949 SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { 950 SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {
@@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) { @@ -352,6 +352,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
352 352
353 c.rule_fsts = config.rule_fsts.c_str(); 353 c.rule_fsts = config.rule_fsts.c_str();
354 c.max_num_sentences = config.max_num_sentences; 354 c.max_num_sentences = config.max_num_sentences;
  355 + c.silence_scale = config.silence_scale;
355 c.rule_fars = config.rule_fars.c_str(); 356 c.rule_fars = config.rule_fars.c_str();
356 357
357 auto p = SherpaOnnxCreateOfflineTts(&c); 358 auto p = SherpaOnnxCreateOfflineTts(&c);
@@ -363,6 +363,7 @@ struct OfflineTtsConfig { @@ -363,6 +363,7 @@ struct OfflineTtsConfig {
363 std::string rule_fsts; 363 std::string rule_fsts;
364 std::string rule_fars; 364 std::string rule_fars;
365 int32_t max_num_sentences = 1; 365 int32_t max_num_sentences = 1;
  366 + float silence_scale = 0.2;
366 }; 367 };
367 368
368 struct GeneratedAudio { 369 struct GeneratedAudio {
@@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { @@ -420,6 +420,12 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
420 GeneratedAudio ans; 420 GeneratedAudio ans;
421 ans.sample_rate = model_->GetMetaData().sample_rate; 421 ans.sample_rate = model_->GetMetaData().sample_rate;
422 ans.samples = std::vector<float>(p, p + total); 422 ans.samples = std::vector<float>(p, p + total);
  423 +
  424 + float silence_scale = config_.silence_scale;
  425 + if (silence_scale != 1) {
  426 + ans = ans.ScaleSilence(silence_scale);
  427 + }
  428 +
423 return ans; 429 return ans;
424 } 430 }
425 431
@@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { @@ -398,6 +398,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
398 GeneratedAudio ans; 398 GeneratedAudio ans;
399 ans.sample_rate = model_->GetMetaData().sample_rate; 399 ans.sample_rate = model_->GetMetaData().sample_rate;
400 ans.samples = std::vector<float>(p, p + total); 400 ans.samples = std::vector<float>(p, p + total);
  401 +
  402 + float silence_scale = config_.silence_scale;
  403 + if (silence_scale != 1) {
  404 + ans = ans.ScaleSilence(silence_scale);
  405 + }
  406 +
401 return ans; 407 return ans;
402 } 408 }
403 409
@@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -485,6 +485,12 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
485 GeneratedAudio ans; 485 GeneratedAudio ans;
486 ans.sample_rate = model_->GetMetaData().sample_rate; 486 ans.sample_rate = model_->GetMetaData().sample_rate;
487 ans.samples = std::vector<float>(p, p + total); 487 ans.samples = std::vector<float>(p, p + total);
  488 +
  489 + float silence_scale = config_.silence_scale;
  490 + if (silence_scale != 1) {
  491 + ans = ans.ScaleSilence(silence_scale);
  492 + }
  493 +
488 return ans; 494 return ans;
489 } 495 }
490 496
@@ -4,6 +4,7 @@ @@ -4,6 +4,7 @@
4 4
5 #include "sherpa-onnx/csrc/offline-tts.h" 5 #include "sherpa-onnx/csrc/offline-tts.h"
6 6
  7 +#include <cmath>
7 #include <string> 8 #include <string>
8 #include <utility> 9 #include <utility>
9 10
@@ -23,6 +24,72 @@ @@ -23,6 +24,72 @@
23 24
24 namespace sherpa_onnx { 25 namespace sherpa_onnx {
25 26
  27 +struct SilenceInterval {
  28 + int32_t start;
  29 + int32_t end;
  30 +};
  31 +
  32 +GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
  33 + if (scale == 1) {
  34 + return *this;
  35 + }
  36 + // if the interval is larger than 0.6 second, then we assume it is a pause
  37 + int32_t threshold = static_cast<int32_t>(sample_rate * 0.6);
  38 +
  39 + std::vector<SilenceInterval> intervals;
  40 + int32_t num_samples = static_cast<int32_t>(samples.size());
  41 +
  42 + int32_t last = -1;
  43 + int32_t i;
  44 + for (i = 0; i != num_samples; ++i) {
  45 + if (fabs(samples[i]) <= 0.01) {
  46 + if (last == -1) {
  47 + last = i;
  48 + }
  49 + continue;
  50 + }
  51 +
  52 + if (last != -1 && i - last < threshold) {
  53 + last = -1;
  54 + continue;
  55 + }
  56 +
  57 + if (last != -1) {
  58 + intervals.push_back({last, i});
  59 + last = -1;
  60 + }
  61 + }
  62 +
  63 + if (last != -1 && num_samples - last > threshold) {
  64 + intervals.push_back({last, num_samples});
  65 + }
  66 +
  67 + if (intervals.empty()) {
  68 + return *this;
  69 + }
  70 +
  71 + GeneratedAudio ans;
  72 + ans.sample_rate = sample_rate;
  73 + ans.samples.reserve(samples.size());
  74 +
  75 + i = 0;
  76 + for (const auto &interval : intervals) {
  77 + ans.samples.insert(ans.samples.end(), samples.begin() + i,
  78 + samples.begin() + interval.start);
  79 + i = interval.end;
  80 + int32_t n = static_cast<int32_t>((interval.end - interval.start) * scale);
  81 +
  82 + ans.samples.insert(ans.samples.end(), samples.begin() + interval.start,
  83 + samples.begin() + interval.start + n);
  84 + }
  85 +
  86 + if (i < num_samples) {
  87 + ans.samples.insert(ans.samples.end(), samples.begin() + i, samples.end());
  88 + }
  89 +
  90 + return ans;
  91 +}
  92 +
26 void OfflineTtsConfig::Register(ParseOptions *po) { 93 void OfflineTtsConfig::Register(ParseOptions *po) {
27 model.Register(po); 94 model.Register(po);
28 95
@@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) { @@ -44,6 +111,10 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
44 "Maximum number of sentences that we process at a time. " 111 "Maximum number of sentences that we process at a time. "
45 "This is to avoid OOM for very long input text. " 112 "This is to avoid OOM for very long input text. "
46 "If you set it to -1, then we process all sentences in a single batch."); 113 "If you set it to -1, then we process all sentences in a single batch.");
  114 +
  115 + po->Register("tts-silence-scale", &silence_scale,
  116 + "Duration of the pause is scaled by this number. So a smaller "
  117 + "value leads to a shorter pause.");
47 } 118 }
48 119
49 bool OfflineTtsConfig::Validate() const { 120 bool OfflineTtsConfig::Validate() const {
@@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const { @@ -69,6 +140,11 @@ bool OfflineTtsConfig::Validate() const {
69 } 140 }
70 } 141 }
71 142
  143 + if (silence_scale < 0.001) {
  144 + SHERPA_ONNX_LOGE("--tts-silence-scale '%.3f' is too small", silence_scale);
  145 + return false;
  146 + }
  147 +
72 return model.Validate(); 148 return model.Validate();
73 } 149 }
74 150
@@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const { @@ -79,7 +155,8 @@ std::string OfflineTtsConfig::ToString() const {
79 os << "model=" << model.ToString() << ", "; 155 os << "model=" << model.ToString() << ", ";
80 os << "rule_fsts=\"" << rule_fsts << "\", "; 156 os << "rule_fsts=\"" << rule_fsts << "\", ";
81 os << "rule_fars=\"" << rule_fars << "\", "; 157 os << "rule_fars=\"" << rule_fars << "\", ";
82 - os << "max_num_sentences=" << max_num_sentences << ")"; 158 + os << "max_num_sentences=" << max_num_sentences << ", ";
  159 + os << "silence_scale=" << silence_scale << ")";
83 160
84 return os.str(); 161 return os.str();
85 } 162 }
@@ -32,14 +32,20 @@ struct OfflineTtsConfig { @@ -32,14 +32,20 @@ struct OfflineTtsConfig {
32 // If you set it to -1, then we process all sentences in a single batch. 32 // If you set it to -1, then we process all sentences in a single batch.
33 int32_t max_num_sentences = 1; 33 int32_t max_num_sentences = 1;
34 34
  35 + // A silence interval is a run of audio samples whose values are close to 0.
  36 + //
  37 + // The duration of the new interval is old_duration * silence_scale.
  38 + float silence_scale = 0.2;
  39 +
35 OfflineTtsConfig() = default; 40 OfflineTtsConfig() = default;
36 OfflineTtsConfig(const OfflineTtsModelConfig &model, 41 OfflineTtsConfig(const OfflineTtsModelConfig &model,
37 const std::string &rule_fsts, const std::string &rule_fars, 42 const std::string &rule_fsts, const std::string &rule_fars,
38 - int32_t max_num_sentences) 43 + int32_t max_num_sentences, float silence_scale)
39 : model(model), 44 : model(model),
40 rule_fsts(rule_fsts), 45 rule_fsts(rule_fsts),
41 rule_fars(rule_fars), 46 rule_fars(rule_fars),
42 - max_num_sentences(max_num_sentences) {} 47 + max_num_sentences(max_num_sentences),
  48 + silence_scale(silence_scale) {}
43 49
44 void Register(ParseOptions *po); 50 void Register(ParseOptions *po);
45 bool Validate() const; 51 bool Validate() const;
@@ -50,6 +56,11 @@ struct OfflineTtsConfig { @@ -50,6 +56,11 @@ struct OfflineTtsConfig {
50 struct GeneratedAudio { 56 struct GeneratedAudio {
51 std::vector<float> samples; 57 std::vector<float> samples;
52 int32_t sample_rate; 58 int32_t sample_rate;
  59 +
  60 + // Silence means pause here.
  61 + // If scale > 1, then it increases the duration of a pause
  62 + // If scale < 1, then it reduces the duration of a pause
  63 + GeneratedAudio ScaleSilence(float scale) const;
53 }; 64 };
54 65
55 class OfflineTtsImpl; 66 class OfflineTtsImpl;
@@ -7,12 +7,14 @@ public class OfflineTtsConfig { @@ -7,12 +7,14 @@ public class OfflineTtsConfig {
7 private final String ruleFsts; 7 private final String ruleFsts;
8 private final String ruleFars; 8 private final String ruleFars;
9 private final int maxNumSentences; 9 private final int maxNumSentences;
  10 + private final float silenceScale;
10 11
11 private OfflineTtsConfig(Builder builder) { 12 private OfflineTtsConfig(Builder builder) {
12 this.model = builder.model; 13 this.model = builder.model;
13 this.ruleFsts = builder.ruleFsts; 14 this.ruleFsts = builder.ruleFsts;
14 this.ruleFars = builder.ruleFars; 15 this.ruleFars = builder.ruleFars;
15 this.maxNumSentences = builder.maxNumSentences; 16 this.maxNumSentences = builder.maxNumSentences;
  17 + this.silenceScale = builder.silenceScale;
16 } 18 }
17 19
18 public static Builder builder() { 20 public static Builder builder() {
@@ -35,11 +37,16 @@ public class OfflineTtsConfig { @@ -35,11 +37,16 @@ public class OfflineTtsConfig {
35 return maxNumSentences; 37 return maxNumSentences;
36 } 38 }
37 39
  40 + public float getSilenceScale() {
  41 + return silenceScale;
  42 + }
  43 +
38 public static class Builder { 44 public static class Builder {
39 private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build(); 45 private OfflineTtsModelConfig model = OfflineTtsModelConfig.builder().build();
40 private String ruleFsts = ""; 46 private String ruleFsts = "";
41 private String ruleFars = ""; 47 private String ruleFars = "";
42 private int maxNumSentences = 1; 48 private int maxNumSentences = 1;
  49 + private float silenceScale = 0.2f;
43 50
44 public OfflineTtsConfig build() { 51 public OfflineTtsConfig build() {
45 return new OfflineTtsConfig(this); 52 return new OfflineTtsConfig(this);
@@ -64,5 +71,10 @@ public class OfflineTtsConfig { @@ -64,5 +71,10 @@ public class OfflineTtsConfig {
64 this.maxNumSentences = maxNumSentences; 71 this.maxNumSentences = maxNumSentences;
65 return this; 72 return this;
66 } 73 }
  74 +
  75 + public Builder setSilenceScale(float silenceScale) {
  76 + this.silenceScale = silenceScale;
  77 + return this;
  78 + }
67 } 79 }
68 } 80 }
@@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { @@ -187,6 +187,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
187 fid = env->GetFieldID(cls, "maxNumSentences", "I"); 187 fid = env->GetFieldID(cls, "maxNumSentences", "I");
188 ans.max_num_sentences = env->GetIntField(config, fid); 188 ans.max_num_sentences = env->GetIntField(config, fid);
189 189
  190 + fid = env->GetFieldID(cls, "silenceScale", "F");
  191 + ans.silence_scale = env->GetFloatField(config, fid);
  192 +
190 return ans; 193 return ans;
191 } 194 }
192 195
@@ -49,6 +49,7 @@ data class OfflineTtsConfig( @@ -49,6 +49,7 @@ data class OfflineTtsConfig(
49 var ruleFsts: String = "", 49 var ruleFsts: String = "",
50 var ruleFars: String = "", 50 var ruleFars: String = "",
51 var maxNumSentences: Int = 1, 51 var maxNumSentences: Int = 1,
  52 + var silenceScale: Float = 0.2f,
52 ) 53 )
53 54
54 class GeneratedAudio( 55 class GeneratedAudio(
@@ -106,6 +106,7 @@ type @@ -106,6 +106,7 @@ type
106 RuleFsts: AnsiString; 106 RuleFsts: AnsiString;
107 MaxNumSentences: Integer; 107 MaxNumSentences: Integer;
108 RuleFars: AnsiString; 108 RuleFars: AnsiString;
  109 + SilenceScale: Single;
109 110
110 function ToString: AnsiString; 111 function ToString: AnsiString;
111 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); 112 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
@@ -777,6 +778,7 @@ type @@ -777,6 +778,7 @@ type
777 RuleFsts: PAnsiChar; 778 RuleFsts: PAnsiChar;
778 MaxNumSentences: cint32; 779 MaxNumSentences: cint32;
779 RuleFars: PAnsiChar; 780 RuleFars: PAnsiChar;
  781 + SilenceScale: cfloat;
780 end; 782 end;
781 783
782 PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig; 784 PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
@@ -1976,15 +1978,17 @@ begin @@ -1976,15 +1978,17 @@ begin
1976 'Model := %s, ' + 1978 'Model := %s, ' +
1977 'RuleFsts := %s, ' + 1979 'RuleFsts := %s, ' +
1978 'MaxNumSentences := %d, ' + 1980 'MaxNumSentences := %d, ' +
1979 - 'RuleFars := %s' + 1981 + 'RuleFars := %s, ' +
  1982 + 'SilenceScale := %f' +
1980 ')', 1983 ')',
1981 - [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars  
1982 - ]); 1984 + [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars,
  1985 + Self.SilenceScale]);
1983 end; 1986 end;
1984 1987
1985 class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); 1988 class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
1986 begin 1989 begin
1987 Dest.MaxNumSentences := 1; 1990 Dest.MaxNumSentences := 1;
  1991 + Dest.SilenceScale := 0.2;
1988 end; 1992 end;
1989 1993
1990 constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig); 1994 constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
@@ -2027,6 +2031,7 @@ begin @@ -2027,6 +2031,7 @@ begin
2027 C.RuleFsts := PAnsiChar(Config.RuleFsts); 2031 C.RuleFsts := PAnsiChar(Config.RuleFsts);
2028 C.MaxNumSentences := Config.MaxNumSentences; 2032 C.MaxNumSentences := Config.MaxNumSentences;
2029 C.RuleFars := PAnsiChar(Config.RuleFars); 2033 C.RuleFars := PAnsiChar(Config.RuleFars);
  2034 + C.SilenceScale := Config.SilenceScale;
2030 2035
2031 Self.Handle := SherpaOnnxCreateOfflineTts(@C); 2036 Self.Handle := SherpaOnnxCreateOfflineTts(@C);
2032 2037
@@ -32,13 +32,15 @@ static void PybindOfflineTtsConfig(py::module *m) { @@ -32,13 +32,15 @@ static void PybindOfflineTtsConfig(py::module *m) {
32 py::class_<PyClass>(*m, "OfflineTtsConfig") 32 py::class_<PyClass>(*m, "OfflineTtsConfig")
33 .def(py::init<>()) 33 .def(py::init<>())
34 .def(py::init<const OfflineTtsModelConfig &, const std::string &, 34 .def(py::init<const OfflineTtsModelConfig &, const std::string &,
35 - const std::string &, int32_t>(), 35 + const std::string &, int32_t, float>(),
36 py::arg("model"), py::arg("rule_fsts") = "", 36 py::arg("model"), py::arg("rule_fsts") = "",
37 - py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2) 37 + py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2,
  38 + py::arg("silence_scale") = 0.2)
38 .def_readwrite("model", &PyClass::model) 39 .def_readwrite("model", &PyClass::model)
39 .def_readwrite("rule_fsts", &PyClass::rule_fsts) 40 .def_readwrite("rule_fsts", &PyClass::rule_fsts)
40 .def_readwrite("rule_fars", &PyClass::rule_fars) 41 .def_readwrite("rule_fars", &PyClass::rule_fars)
41 .def_readwrite("max_num_sentences", &PyClass::max_num_sentences) 42 .def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
  43 + .def_readwrite("silence_scale", &PyClass::silence_scale)
42 .def("validate", &PyClass::Validate) 44 .def("validate", &PyClass::Validate)
43 .def("__str__", &PyClass::ToString); 45 .def("__str__", &PyClass::ToString);
44 } 46 }
@@ -804,13 +804,15 @@ func sherpaOnnxOfflineTtsConfig( @@ -804,13 +804,15 @@ func sherpaOnnxOfflineTtsConfig(
804 model: SherpaOnnxOfflineTtsModelConfig, 804 model: SherpaOnnxOfflineTtsModelConfig,
805 ruleFsts: String = "", 805 ruleFsts: String = "",
806 ruleFars: String = "", 806 ruleFars: String = "",
807 - maxNumSentences: Int = 1 807 + maxNumSentences: Int = 1,
  808 + silenceScale: Float = 0.2
808 ) -> SherpaOnnxOfflineTtsConfig { 809 ) -> SherpaOnnxOfflineTtsConfig {
809 return SherpaOnnxOfflineTtsConfig( 810 return SherpaOnnxOfflineTtsConfig(
810 model: model, 811 model: model,
811 rule_fsts: toCPointer(ruleFsts), 812 rule_fsts: toCPointer(ruleFsts),
812 max_num_sentences: Int32(maxNumSentences), 813 max_num_sentences: Int32(maxNumSentences),
813 - rule_fars: toCPointer(ruleFars) 814 + rule_fars: toCPointer(ruleFars),
  815 + silence_scale: silenceScale
814 ) 816 )
815 } 817 }
816 818
@@ -21,7 +21,7 @@ function freeConfig(config, Module) { @@ -21,7 +21,7 @@ function freeConfig(config, Module) {
21 21
22 // The user should free the returned pointers 22 // The user should free the returned pointers
23 function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { 23 function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
24 - const modelLen = Module.lengthBytesUTF8(config.model || '')+ 1; 24 + const modelLen = Module.lengthBytesUTF8(config.model || '') + 1;
25 const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; 25 const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
26 const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; 26 const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
27 const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; 27 const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
@@ -282,7 +282,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -282,7 +282,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
282 function initSherpaOnnxOfflineTtsConfig(config, Module) { 282 function initSherpaOnnxOfflineTtsConfig(config, Module) {
283 const modelConfig = 283 const modelConfig =
284 initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); 284 initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module);
285 - const len = modelConfig.len + 3 * 4; 285 + const len = modelConfig.len + 4 * 4;
286 const ptr = Module._malloc(len); 286 const ptr = Module._malloc(len);
287 287
288 let offset = 0; 288 let offset = 0;
@@ -303,6 +303,10 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) { @@ -303,6 +303,10 @@ function initSherpaOnnxOfflineTtsConfig(config, Module) {
303 offset += 4; 303 offset += 4;
304 304
305 Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*'); 305 Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*');
  306 + offset += 4;
  307 +
  308 + Module.setValue(ptr + offset, config.silenceScale || 0.2, 'float');
  309 + offset += 4;
306 310
307 return { 311 return {
308 buffer: buffer, ptr: ptr, len: len, config: modelConfig, 312 buffer: buffer, ptr: ptr, len: len, config: modelConfig,
@@ -22,7 +22,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == @@ -22,7 +22,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
22 sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, 22 sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
23 ""); 23 "");
24 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == 24 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
25 - sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, 25 + sizeof(SherpaOnnxOfflineTtsModelConfig) + 4 * 4,
26 ""); 26 "");
27 27
28 void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { 28 void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
@@ -68,6 +68,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -68,6 +68,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
68 fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts); 68 fprintf(stdout, "rule_fsts: %s\n", tts_config->rule_fsts);
69 fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars); 69 fprintf(stdout, "rule_fars: %s\n", tts_config->rule_fars);
70 fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences); 70 fprintf(stdout, "max num sentences: %d\n", tts_config->max_num_sentences);
  71 + fprintf(stdout, "silence scale: %.3f\n", tts_config->silence_scale);
71 } 72 }
72 73
73 void CopyHeap(const char *src, int32_t num_bytes, char *dst) { 74 void CopyHeap(const char *src, int32_t num_bytes, char *dst) {