Fangjun Kuang
Committed by GitHub

Add APIs about max speech duration in VAD for various programming languages (#1349)

正在显示 31 个修改的文件，包含 88 行增加和 9 行删除
@@ -93,6 +93,8 @@ jobs: @@ -93,6 +93,8 @@ jobs:
93 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface 93 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
94 94
95 cd huggingface 95 cd huggingface
  96 + git fetch
  97 + git pull
96 mkdir -p windows-for-dotnet 98 mkdir -p windows-for-dotnet
97 99
98 cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet 100 cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet
@@ -32,6 +32,7 @@ void main(List<String> arguments) async { @@ -32,6 +32,7 @@ void main(List<String> arguments) async {
32 model: sileroVad, 32 model: sileroVad,
33 minSilenceDuration: 0.25, 33 minSilenceDuration: 0.25,
34 minSpeechDuration: 0.5, 34 minSpeechDuration: 0.5,
  35 + maxSpeechDuration: 5.0,
35 ); 36 );
36 37
37 final vadConfig = sherpa_onnx.VadModelConfig( 38 final vadConfig = sherpa_onnx.VadModelConfig(
@@ -38,6 +38,7 @@ void main(List<String> arguments) async { @@ -38,6 +38,7 @@ void main(List<String> arguments) async {
38 model: sileroVad, 38 model: sileroVad,
39 minSilenceDuration: 0.25, 39 minSilenceDuration: 0.25,
40 minSpeechDuration: 0.5, 40 minSpeechDuration: 0.5,
  41 + maxSpeechDuration: 5.0,
41 ); 42 );
42 43
43 final vadConfig = sherpa_onnx.VadModelConfig( 44 final vadConfig = sherpa_onnx.VadModelConfig(
@@ -37,6 +37,7 @@ void main(List<String> arguments) async { @@ -37,6 +37,7 @@ void main(List<String> arguments) async {
37 model: sileroVad, 37 model: sileroVad,
38 minSilenceDuration: 0.25, 38 minSilenceDuration: 0.25,
39 minSpeechDuration: 0.5, 39 minSpeechDuration: 0.5,
  40 + maxSpeechDuration: 5.0,
40 ); 41 );
41 42
42 final vadConfig = sherpa_onnx.VadModelConfig( 43 final vadConfig = sherpa_onnx.VadModelConfig(
@@ -33,6 +33,7 @@ void main(List<String> arguments) async { @@ -33,6 +33,7 @@ void main(List<String> arguments) async {
33 model: sileroVad, 33 model: sileroVad,
34 minSilenceDuration: 0.25, 34 minSilenceDuration: 0.25,
35 minSpeechDuration: 0.5, 35 minSpeechDuration: 0.5,
  36 + maxSpeechDuration: 5.0,
36 ); 37 );
37 38
38 final vadConfig = sherpa_onnx.VadModelConfig( 39 final vadConfig = sherpa_onnx.VadModelConfig(
@@ -34,6 +34,7 @@ void main(List<String> arguments) async { @@ -34,6 +34,7 @@ void main(List<String> arguments) async {
34 model: sileroVad, 34 model: sileroVad,
35 minSilenceDuration: 0.25, 35 minSilenceDuration: 0.25,
36 minSpeechDuration: 0.5, 36 minSpeechDuration: 0.5,
  37 + maxSpeechDuration: 5.0,
37 ); 38 );
38 39
39 final vadConfig = sherpa_onnx.VadModelConfig( 40 final vadConfig = sherpa_onnx.VadModelConfig(
@@ -37,6 +37,7 @@ void main(List<String> arguments) async { @@ -37,6 +37,7 @@ void main(List<String> arguments) async {
37 model: sileroVad, 37 model: sileroVad,
38 minSilenceDuration: 0.25, 38 minSilenceDuration: 0.25,
39 minSpeechDuration: 0.5, 39 minSpeechDuration: 0.5,
  40 + maxSpeechDuration: 5.0,
40 ); 41 );
41 42
42 final vadConfig = sherpa_onnx.VadModelConfig( 43 final vadConfig = sherpa_onnx.VadModelConfig(
@@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct { @@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct {
301 301
302 @Int32() 302 @Int32()
303 external int windowSize; 303 external int windowSize;
  304 +
  305 + @Float()
  306 + external double maxSpeechDuration;
304 } 307 }
305 308
306 final class SherpaOnnxVadModelConfig extends Struct { 309 final class SherpaOnnxVadModelConfig extends Struct {
@@ -11,11 +11,12 @@ class SileroVadModelConfig { @@ -11,11 +11,12 @@ class SileroVadModelConfig {
11 this.threshold = 0.5, 11 this.threshold = 0.5,
12 this.minSilenceDuration = 0.5, 12 this.minSilenceDuration = 0.5,
13 this.minSpeechDuration = 0.25, 13 this.minSpeechDuration = 0.25,
14 - this.windowSize = 512}); 14 + this.windowSize = 512,
  15 + this.maxSpeechDuration = 5.0});
15 16
16 @override 17 @override
17 String toString() { 18 String toString() {
18 - return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)'; 19 + return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)';
19 } 20 }
20 21
21 final String model; 22 final String model;
@@ -23,6 +24,7 @@ class SileroVadModelConfig { @@ -23,6 +24,7 @@ class SileroVadModelConfig {
23 final double minSilenceDuration; 24 final double minSilenceDuration;
24 final double minSpeechDuration; 25 final double minSpeechDuration;
25 final int windowSize; 26 final int windowSize;
  27 + final double maxSpeechDuration;
26 } 28 }
27 29
28 class VadModelConfig { 30 class VadModelConfig {
@@ -127,6 +129,7 @@ class VoiceActivityDetector { @@ -127,6 +129,7 @@ class VoiceActivityDetector {
127 c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration; 129 c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
128 c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration; 130 c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration;
129 c.ref.sileroVad.windowSize = config.sileroVad.windowSize; 131 c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
  132 + c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration;
130 133
131 c.ref.sampleRate = config.sampleRate; 134 c.ref.sampleRate = config.sampleRate;
132 c.ref.numThreads = config.numThreads; 135 c.ref.numThreads = config.numThreads;
@@ -22,6 +22,7 @@ func main() { @@ -22,6 +22,7 @@ func main() {
22 config.SileroVad.MinSilenceDuration = 0.5 22 config.SileroVad.MinSilenceDuration = 0.5
23 config.SileroVad.MinSpeechDuration = 0.25 23 config.SileroVad.MinSpeechDuration = 0.25
24 config.SileroVad.WindowSize = 512 24 config.SileroVad.WindowSize = 512
  25 + config.SileroVad.MaxSpeechDuration = 5.0
25 config.SampleRate = 16000 26 config.SampleRate = 16000
26 config.NumThreads = 1 27 config.NumThreads = 1
27 config.Provider = "cpu" 28 config.Provider = "cpu"
@@ -22,6 +22,7 @@ func main() { @@ -22,6 +22,7 @@ func main() {
22 config.SileroVad.MinSilenceDuration = 0.5 22 config.SileroVad.MinSilenceDuration = 0.5
23 config.SileroVad.MinSpeechDuration = 0.25 23 config.SileroVad.MinSpeechDuration = 0.25
24 config.SileroVad.WindowSize = 512 24 config.SileroVad.WindowSize = 512
  25 + config.SileroVad.MaxSpeechDuration = 5.0
25 config.SampleRate = 16000 26 config.SampleRate = 16000
26 config.NumThreads = 1 27 config.NumThreads = 1
27 config.Provider = "cpu" 28 config.Provider = "cpu"
@@ -18,6 +18,7 @@ public class VadNonStreamingParaformer { @@ -18,6 +18,7 @@ public class VadNonStreamingParaformer {
18 .setMinSilenceDuration(0.25f) 18 .setMinSilenceDuration(0.25f)
19 .setMinSpeechDuration(0.5f) 19 .setMinSpeechDuration(0.5f)
20 .setWindowSize(512) 20 .setWindowSize(512)
  21 + .setMaxSpeechDuration(5.0f)
21 .build(); 22 .build();
22 23
23 VadModelConfig config = 24 VadModelConfig config =
@@ -18,6 +18,7 @@ public class VadNonStreamingSenseVoice { @@ -18,6 +18,7 @@ public class VadNonStreamingSenseVoice {
18 .setMinSilenceDuration(0.25f) 18 .setMinSilenceDuration(0.25f)
19 .setMinSpeechDuration(0.5f) 19 .setMinSpeechDuration(0.5f)
20 .setWindowSize(512) 20 .setWindowSize(512)
  21 + .setMaxSpeechDuration(5.0f)
21 .build(); 22 .build();
22 23
23 VadModelConfig config = 24 VadModelConfig config =
@@ -19,6 +19,7 @@ public class VadRemoveSilence { @@ -19,6 +19,7 @@ public class VadRemoveSilence {
19 .setMinSilenceDuration(0.25f) 19 .setMinSilenceDuration(0.25f)
20 .setMinSpeechDuration(0.5f) 20 .setMinSpeechDuration(0.5f)
21 .setWindowSize(512) 21 .setWindowSize(512)
  22 + .setMaxSpeechDuration(5.0f)
22 .build(); 23 .build();
23 24
24 VadModelConfig config = 25 VadModelConfig config =
@@ -48,8 +48,9 @@ begin @@ -48,8 +48,9 @@ begin
48 WindowSize := 512; {Please don't change it unless you know the details} 48 WindowSize := 512; {Please don't change it unless you know the details}
49 49
50 Config.SileroVad.Model := VadFilename; 50 Config.SileroVad.Model := VadFilename;
51 - Config.SileroVad.MinSpeechDuration := 0.5; 51 + Config.SileroVad.MinSpeechDuration := 0.25;
52 Config.SileroVad.MinSilenceDuration := 0.5; 52 Config.SileroVad.MinSilenceDuration := 0.5;
  53 + Config.SileroVad.MaxSpeechDuration := 5.0;
53 Config.SileroVad.Threshold := 0.5; 54 Config.SileroVad.Threshold := 0.5;
54 Config.SileroVad.WindowSize := WindowSize; 55 Config.SileroVad.WindowSize := WindowSize;
55 Config.NumThreads:= 2; 56 Config.NumThreads:= 2;
@@ -34,6 +34,7 @@ function createVad() { @@ -34,6 +34,7 @@ function createVad() {
34 threshold: 0.5, 34 threshold: 0.5,
35 minSpeechDuration: 0.25, 35 minSpeechDuration: 0.25,
36 minSilenceDuration: 0.5, 36 minSilenceDuration: 0.5,
  37 + maxSpeechDuration: 5,
37 windowSize: 512, 38 windowSize: 512,
38 }, 39 },
39 sampleRate: 16000, 40 sampleRate: 16000,
@@ -29,6 +29,7 @@ function createVad() { @@ -29,6 +29,7 @@ function createVad() {
29 threshold: 0.5, 29 threshold: 0.5,
30 minSpeechDuration: 0.25, 30 minSpeechDuration: 0.25,
31 minSilenceDuration: 0.5, 31 minSilenceDuration: 0.5,
  32 + maxSpeechDuration: 5,
32 windowSize: 512, 33 windowSize: 512,
33 }, 34 },
34 sampleRate: 16000, 35 sampleRate: 16000,
@@ -90,6 +90,15 @@ def main(): @@ -90,6 +90,15 @@ def main():
90 90
91 config = sherpa_onnx.VadModelConfig() 91 config = sherpa_onnx.VadModelConfig()
92 config.silero_vad.model = args.silero_vad_model 92 config.silero_vad.model = args.silero_vad_model
  93 + config.silero_vad.threshold = 0.5
  94 + config.silero_vad.min_silence_duration = 0.25 # seconds
  95 + config.silero_vad.min_speech_duration = 0.25 # seconds
  96 +
  97 + # If the current segment is larger than this value, then it increases
  98 + # the threshold to 0.9 internally. After detecting this segment,
  99 + # it resets the threshold to its original value.
  100 + config.silero_vad.max_speech_duration = 5 # seconds
  101 +
93 config.sample_rate = sample_rate 102 config.sample_rate = sample_rate
94 103
95 window_size = config.silero_vad.window_size 104 window_size = config.silero_vad.window_size
@@ -14,6 +14,7 @@ namespace SherpaOnnx @@ -14,6 +14,7 @@ namespace SherpaOnnx
14 MinSilenceDuration = 0.5F; 14 MinSilenceDuration = 0.5F;
15 MinSpeechDuration = 0.25F; 15 MinSpeechDuration = 0.25F;
16 WindowSize = 512; 16 WindowSize = 512;
  17 + MaxSpeechDuration = 5.0F;
17 } 18 }
18 19
19 [MarshalAs(UnmanagedType.LPStr)] 20 [MarshalAs(UnmanagedType.LPStr)]
@@ -26,5 +27,7 @@ namespace SherpaOnnx @@ -26,5 +27,7 @@ namespace SherpaOnnx
26 public float MinSpeechDuration; 27 public float MinSpeechDuration;
27 28
28 public int WindowSize; 29 public int WindowSize;
  30 +
  31 + public float MaxSpeechDuration;
29 } 32 }
30 } 33 }
@@ -771,6 +771,7 @@ type SileroVadModelConfig struct { @@ -771,6 +771,7 @@ type SileroVadModelConfig struct {
771 MinSilenceDuration float32 771 MinSilenceDuration float32
772 MinSpeechDuration float32 772 MinSpeechDuration float32
773 WindowSize int 773 WindowSize int
  774 + MaxSpeechDuration float32
774 } 775 }
775 776
776 type VadModelConfig struct { 777 type VadModelConfig struct {
@@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3 @@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3
849 c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration) 850 c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
850 c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration) 851 c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
851 c.silero_vad.window_size = C.int(config.SileroVad.WindowSize) 852 c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
  853 + c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)
852 854
853 c.sample_rate = C.int(config.SampleRate) 855 c.sample_rate = C.int(config.SampleRate)
854 c.num_threads = C.int(config.NumThreads) 856 c.num_threads = C.int(config.NumThreads)
@@ -39,6 +39,9 @@ config = { @@ -39,6 +39,9 @@ config = {
39 sileroVad: { 39 sileroVad: {
40 model: "./silero_vad.onnx", 40 model: "./silero_vad.onnx",
41 threshold: 0.5, 41 threshold: 0.5,
  42 + minSilenceDuration: 0.5,
  43 + minSpeechDuration: 0.25,
  44 + maxSpeechDuration: 5,
42 } 45 }
43 } 46 }
44 */ 47 */
@@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig( @@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
279 SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration); 279 SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
280 SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration); 280 SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
281 SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize); 281 SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
  282 + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);
282 283
283 return c; 284 return c;
284 } 285 }
@@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector( @@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
907 vad_config.silero_vad.window_size = 907 vad_config.silero_vad.window_size =
908 SHERPA_ONNX_OR(config->silero_vad.window_size, 512); 908 SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
909 909
  910 + vad_config.silero_vad.max_speech_duration =
  911 + SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);
  912 +
910 vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); 913 vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
911 vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); 914 vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
912 vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); 915 vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
@@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { @@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
746 float min_speech_duration; 746 float min_speech_duration;
747 747
748 int window_size; 748 int window_size;
  749 +
  750 + // If a speech segment is longer than this value, then we increase
  751 + // the threshold to 0.9. After finishing detecting the segment,
  752 + // the threshold value is reset to its original value.
  753 + float max_speech_duration;
749 } SherpaOnnxSileroVadModelConfig; 754 } SherpaOnnxSileroVadModelConfig;
750 755
751 SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { 756 SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
@@ -8,6 +8,7 @@ public class SileroVadModelConfig { @@ -8,6 +8,7 @@ public class SileroVadModelConfig {
8 private final float minSilenceDuration; 8 private final float minSilenceDuration;
9 private final float minSpeechDuration; 9 private final float minSpeechDuration;
10 private final int windowSize; 10 private final int windowSize;
  11 + private final float maxSpeechDuration;
11 12
12 private SileroVadModelConfig(Builder builder) { 13 private SileroVadModelConfig(Builder builder) {
13 this.model = builder.model; 14 this.model = builder.model;
@@ -15,6 +16,7 @@ public class SileroVadModelConfig { @@ -15,6 +16,7 @@ public class SileroVadModelConfig {
15 this.minSilenceDuration = builder.minSilenceDuration; 16 this.minSilenceDuration = builder.minSilenceDuration;
16 this.minSpeechDuration = builder.minSpeechDuration; 17 this.minSpeechDuration = builder.minSpeechDuration;
17 this.windowSize = builder.windowSize; 18 this.windowSize = builder.windowSize;
  19 + this.maxSpeechDuration = builder.maxSpeechDuration;
18 } 20 }
19 21
20 public static Builder builder() { 22 public static Builder builder() {
@@ -41,12 +43,17 @@ public class SileroVadModelConfig { @@ -41,12 +43,17 @@ public class SileroVadModelConfig {
41 return windowSize; 43 return windowSize;
42 } 44 }
43 45
  46 + public float getMaxSpeechDuration() {
  47 + return maxSpeechDuration;
  48 + }
  49 +
44 public static class Builder { 50 public static class Builder {
45 private String model = ""; 51 private String model = "";
46 private float threshold = 0.5f; 52 private float threshold = 0.5f;
47 private float minSilenceDuration = 0.25f; 53 private float minSilenceDuration = 0.25f;
48 private float minSpeechDuration = 0.5f; 54 private float minSpeechDuration = 0.5f;
49 private int windowSize = 512; 55 private int windowSize = 512;
  56 + private float maxSpeechDuration = 5.0f;
50 57
51 public SileroVadModelConfig build() { 58 public SileroVadModelConfig build() {
52 return new SileroVadModelConfig(this); 59 return new SileroVadModelConfig(this);
@@ -77,5 +84,10 @@ public class SileroVadModelConfig { @@ -77,5 +84,10 @@ public class SileroVadModelConfig {
77 this.windowSize = windowSize; 84 this.windowSize = windowSize;
78 return this; 85 return this;
79 } 86 }
  87 +
  88 + public Builder setMaxSpeechDuration(float maxSpeechDuration) {
  89 + this.maxSpeechDuration = maxSpeechDuration;
  90 + return this;
  91 + }
80 } 92 }
81 } 93 }
@@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) { @@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
40 fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I"); 40 fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
41 ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid); 41 ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);
42 42
  43 + fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F");
  44 + ans.silero_vad.max_speech_duration =
  45 + env->GetFloatField(silero_vad_config, fid);
  46 +
43 fid = env->GetFieldID(cls, "sampleRate", "I"); 47 fid = env->GetFieldID(cls, "sampleRate", "I");
44 ans.sample_rate = env->GetIntField(config, fid); 48 ans.sample_rate = env->GetIntField(config, fid);
45 49
@@ -9,6 +9,7 @@ data class SileroVadModelConfig( @@ -9,6 +9,7 @@ data class SileroVadModelConfig(
9 var minSilenceDuration: Float = 0.25F, 9 var minSilenceDuration: Float = 0.25F,
10 var minSpeechDuration: Float = 0.25F, 10 var minSpeechDuration: Float = 0.25F,
11 var windowSize: Int = 512, 11 var windowSize: Int = 512,
  12 + var maxSpeechDuration: Float = 5.0F,
12 ) 13 )
13 14
14 data class VadModelConfig( 15 data class VadModelConfig(
@@ -341,6 +341,7 @@ type @@ -341,6 +341,7 @@ type
341 MinSilenceDuration: Single; 341 MinSilenceDuration: Single;
342 MinSpeechDuration: Single; 342 MinSpeechDuration: Single;
343 WindowSize: Integer; 343 WindowSize: Integer;
  344 + MaxSpeechDuration: Single;
344 function ToString: AnsiString; 345 function ToString: AnsiString;
345 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); 346 class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
346 end; 347 end;
@@ -594,6 +595,7 @@ type @@ -594,6 +595,7 @@ type
594 MinSilenceDuration: cfloat; 595 MinSilenceDuration: cfloat;
595 MinSpeechDuration: cfloat; 596 MinSpeechDuration: cfloat;
596 WindowSize: cint32; 597 WindowSize: cint32;
  598 + MaxSpeechDuration: cfloat;
597 end; 599 end;
598 SherpaOnnxVadModelConfig = record 600 SherpaOnnxVadModelConfig = record
599 SileroVad: SherpaOnnxSileroVadModelConfig; 601 SileroVad: SherpaOnnxSileroVadModelConfig;
@@ -1402,10 +1404,11 @@ begin @@ -1402,10 +1404,11 @@ begin
1402 'Threshold := %.2f, ' + 1404 'Threshold := %.2f, ' +
1403 'MinSilenceDuration := %.2f, ' + 1405 'MinSilenceDuration := %.2f, ' +
1404 'MinSpeechDuration := %.2f, ' + 1406 'MinSpeechDuration := %.2f, ' +
1405 - 'WindowSize := %d' + 1407 + 'WindowSize := %d, ' +
  1408 + 'MaxSpeechDuration := %.2f' +
1406 ')', 1409 ')',
1407 [Self.Model, Self.Threshold, Self.MinSilenceDuration, 1410 [Self.Model, Self.Threshold, Self.MinSilenceDuration,
1408 - Self.MinSpeechDuration, Self.WindowSize 1411 + Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
1409 ]); 1412 ]);
1410 end; 1413 end;
1411 1414
@@ -1415,6 +1418,7 @@ begin @@ -1415,6 +1418,7 @@ begin
1415 Dest.MinSilenceDuration := 0.5; 1418 Dest.MinSilenceDuration := 0.5;
1416 Dest.MinSpeechDuration := 0.25; 1419 Dest.MinSpeechDuration := 0.25;
1417 Dest.WindowSize := 512; 1420 Dest.WindowSize := 512;
  1421 + Dest.MaxSpeechDuration := 5.0;
1418 end; 1422 end;
1419 1423
1420 function TSherpaOnnxVadModelConfig.ToString: AnsiString; 1424 function TSherpaOnnxVadModelConfig.ToString: AnsiString;
@@ -1569,6 +1573,7 @@ begin @@ -1569,6 +1573,7 @@ begin
1569 C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration; 1573 C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
1570 C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration; 1574 C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
1571 C.SileroVad.WindowSize := Config.SileroVad.WindowSize; 1575 C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
  1576 + C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
1572 1577
1573 C.SampleRate := Config.SampleRate; 1578 C.SampleRate := Config.SampleRate;
1574 C.NumThreads := Config.NumThreads; 1579 C.NumThreads := Config.NumThreads;
@@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig( @@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig(
550 threshold: Float = 0.5, 550 threshold: Float = 0.5,
551 minSilenceDuration: Float = 0.25, 551 minSilenceDuration: Float = 0.25,
552 minSpeechDuration: Float = 0.5, 552 minSpeechDuration: Float = 0.5,
553 - windowSize: Int = 512 553 + windowSize: Int = 512,
  554 + maxSpeechDuration: Float = 5.0
554 ) -> SherpaOnnxSileroVadModelConfig { 555 ) -> SherpaOnnxSileroVadModelConfig {
555 return SherpaOnnxSileroVadModelConfig( 556 return SherpaOnnxSileroVadModelConfig(
556 model: toCPointer(model), 557 model: toCPointer(model),
557 threshold: threshold, 558 threshold: threshold,
558 min_silence_duration: minSilenceDuration, 559 min_silence_duration: minSilenceDuration,
559 min_speech_duration: minSpeechDuration, 560 min_speech_duration: minSpeechDuration,
560 - window_size: Int32(windowSize) 561 + window_size: Int32(windowSize),
  562 + max_speech_duration: maxSpeechDuration
561 ) 563 )
562 } 564 }
563 565
@@ -19,7 +19,7 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { @@ -19,7 +19,7 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) {
19 19
20 const buffer = Module._malloc(n); 20 const buffer = Module._malloc(n);
21 21
22 - const len = 5 * 4; 22 + const len = 6 * 4;
23 const ptr = Module._malloc(len); 23 const ptr = Module._malloc(len);
24 24
25 Module.stringToUTF8(config.model || '', buffer, modelLen); 25 Module.stringToUTF8(config.model || '', buffer, modelLen);
@@ -40,6 +40,9 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { @@ -40,6 +40,9 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) {
40 Module.setValue(ptr + offset, config.windowSize || 512, 'i32'); 40 Module.setValue(ptr + offset, config.windowSize || 512, 'i32');
41 offset += 4; 41 offset += 4;
42 42
  43 + Module.setValue(ptr + offset, config.maxSpeechDuration || 20, 'float');
  44 + offset += 4;
  45 +
43 return { 46 return {
44 buffer: buffer, ptr: ptr, len: len, 47 buffer: buffer, ptr: ptr, len: len,
45 } 48 }
@@ -53,6 +56,7 @@ function initSherpaOnnxVadModelConfig(config, Module) { @@ -53,6 +56,7 @@ function initSherpaOnnxVadModelConfig(config, Module) {
53 minSilenceDuration: 0.50, 56 minSilenceDuration: 0.50,
54 minSpeechDuration: 0.25, 57 minSpeechDuration: 0.25,
55 windowSize: 512, 58 windowSize: 512,
  59 + maxSpeechDuration: 20,
56 }; 60 };
57 } 61 }
58 62
@@ -93,6 +97,7 @@ function createVad(Module, myConfig) { @@ -93,6 +97,7 @@ function createVad(Module, myConfig) {
93 threshold: 0.50, 97 threshold: 0.50,
94 minSilenceDuration: 0.50, 98 minSilenceDuration: 0.50,
95 minSpeechDuration: 0.25, 99 minSpeechDuration: 0.25,
  100 + maxSpeechDuration: 20,
96 windowSize: 512, 101 windowSize: 512,
97 }; 102 };
98 103
@@ -13,7 +13,7 @@ @@ -13,7 +13,7 @@
13 13
14 extern "C" { 14 extern "C" {
15 15
16 -static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, ""); 16 +static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 6 * 4, "");
17 17
18 static_assert(sizeof(SherpaOnnxVadModelConfig) == 18 static_assert(sizeof(SherpaOnnxVadModelConfig) ==
19 sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4, 19 sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4,
@@ -29,6 +29,8 @@ void MyPrint(SherpaOnnxVadModelConfig *config) { @@ -29,6 +29,8 @@ void MyPrint(SherpaOnnxVadModelConfig *config) {
29 fprintf(stdout, "min_speech_duration: %.3f\n", 29 fprintf(stdout, "min_speech_duration: %.3f\n",
30 silero_vad->min_speech_duration); 30 silero_vad->min_speech_duration);
31 fprintf(stdout, "window_size: %d\n", silero_vad->window_size); 31 fprintf(stdout, "window_size: %d\n", silero_vad->window_size);
  32 + fprintf(stdout, "max_speech_duration: %.3f\n",
  33 + silero_vad->max_speech_duration);
32 34
33 fprintf(stdout, "----------config----------\n"); 35 fprintf(stdout, "----------config----------\n");
34 36