Committed by
GitHub
Add APIs about max speech duration in VAD for various programming languages (#1349)
正在显示
31 个修改的文件
包含
88 行增加
和
9 行删除
| @@ -93,6 +93,8 @@ jobs: | @@ -93,6 +93,8 @@ jobs: | ||
| 93 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface | 93 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface |
| 94 | 94 | ||
| 95 | cd huggingface | 95 | cd huggingface |
| 96 | + git fetch | ||
| 97 | + git pull | ||
| 96 | mkdir -p windows-for-dotnet | 98 | mkdir -p windows-for-dotnet |
| 97 | 99 | ||
| 98 | cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet | 100 | cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet |
| @@ -32,6 +32,7 @@ void main(List<String> arguments) async { | @@ -32,6 +32,7 @@ void main(List<String> arguments) async { | ||
| 32 | model: sileroVad, | 32 | model: sileroVad, |
| 33 | minSilenceDuration: 0.25, | 33 | minSilenceDuration: 0.25, |
| 34 | minSpeechDuration: 0.5, | 34 | minSpeechDuration: 0.5, |
| 35 | + maxSpeechDuration: 5.0, | ||
| 35 | ); | 36 | ); |
| 36 | 37 | ||
| 37 | final vadConfig = sherpa_onnx.VadModelConfig( | 38 | final vadConfig = sherpa_onnx.VadModelConfig( |
| @@ -38,6 +38,7 @@ void main(List<String> arguments) async { | @@ -38,6 +38,7 @@ void main(List<String> arguments) async { | ||
| 38 | model: sileroVad, | 38 | model: sileroVad, |
| 39 | minSilenceDuration: 0.25, | 39 | minSilenceDuration: 0.25, |
| 40 | minSpeechDuration: 0.5, | 40 | minSpeechDuration: 0.5, |
| 41 | + maxSpeechDuration: 5.0, | ||
| 41 | ); | 42 | ); |
| 42 | 43 | ||
| 43 | final vadConfig = sherpa_onnx.VadModelConfig( | 44 | final vadConfig = sherpa_onnx.VadModelConfig( |
| @@ -37,6 +37,7 @@ void main(List<String> arguments) async { | @@ -37,6 +37,7 @@ void main(List<String> arguments) async { | ||
| 37 | model: sileroVad, | 37 | model: sileroVad, |
| 38 | minSilenceDuration: 0.25, | 38 | minSilenceDuration: 0.25, |
| 39 | minSpeechDuration: 0.5, | 39 | minSpeechDuration: 0.5, |
| 40 | + maxSpeechDuration: 5.0, | ||
| 40 | ); | 41 | ); |
| 41 | 42 | ||
| 42 | final vadConfig = sherpa_onnx.VadModelConfig( | 43 | final vadConfig = sherpa_onnx.VadModelConfig( |
| @@ -33,6 +33,7 @@ void main(List<String> arguments) async { | @@ -33,6 +33,7 @@ void main(List<String> arguments) async { | ||
| 33 | model: sileroVad, | 33 | model: sileroVad, |
| 34 | minSilenceDuration: 0.25, | 34 | minSilenceDuration: 0.25, |
| 35 | minSpeechDuration: 0.5, | 35 | minSpeechDuration: 0.5, |
| 36 | + maxSpeechDuration: 5.0, | ||
| 36 | ); | 37 | ); |
| 37 | 38 | ||
| 38 | final vadConfig = sherpa_onnx.VadModelConfig( | 39 | final vadConfig = sherpa_onnx.VadModelConfig( |
| @@ -34,6 +34,7 @@ void main(List<String> arguments) async { | @@ -34,6 +34,7 @@ void main(List<String> arguments) async { | ||
| 34 | model: sileroVad, | 34 | model: sileroVad, |
| 35 | minSilenceDuration: 0.25, | 35 | minSilenceDuration: 0.25, |
| 36 | minSpeechDuration: 0.5, | 36 | minSpeechDuration: 0.5, |
| 37 | + maxSpeechDuration: 5.0, | ||
| 37 | ); | 38 | ); |
| 38 | 39 | ||
| 39 | final vadConfig = sherpa_onnx.VadModelConfig( | 40 | final vadConfig = sherpa_onnx.VadModelConfig( |
| @@ -37,6 +37,7 @@ void main(List<String> arguments) async { | @@ -37,6 +37,7 @@ void main(List<String> arguments) async { | ||
| 37 | model: sileroVad, | 37 | model: sileroVad, |
| 38 | minSilenceDuration: 0.25, | 38 | minSilenceDuration: 0.25, |
| 39 | minSpeechDuration: 0.5, | 39 | minSpeechDuration: 0.5, |
| 40 | + maxSpeechDuration: 5.0, | ||
| 40 | ); | 41 | ); |
| 41 | 42 | ||
| 42 | final vadConfig = sherpa_onnx.VadModelConfig( | 43 | final vadConfig = sherpa_onnx.VadModelConfig( |
| @@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct { | @@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct { | ||
| 301 | 301 | ||
| 302 | @Int32() | 302 | @Int32() |
| 303 | external int windowSize; | 303 | external int windowSize; |
| 304 | + | ||
| 305 | + @Float() | ||
| 306 | + external double maxSpeechDuration; | ||
| 304 | } | 307 | } |
| 305 | 308 | ||
| 306 | final class SherpaOnnxVadModelConfig extends Struct { | 309 | final class SherpaOnnxVadModelConfig extends Struct { |
| @@ -11,11 +11,12 @@ class SileroVadModelConfig { | @@ -11,11 +11,12 @@ class SileroVadModelConfig { | ||
| 11 | this.threshold = 0.5, | 11 | this.threshold = 0.5, |
| 12 | this.minSilenceDuration = 0.5, | 12 | this.minSilenceDuration = 0.5, |
| 13 | this.minSpeechDuration = 0.25, | 13 | this.minSpeechDuration = 0.25, |
| 14 | - this.windowSize = 512}); | 14 | + this.windowSize = 512, |
| 15 | + this.maxSpeechDuration = 5.0}); | ||
| 15 | 16 | ||
| 16 | @override | 17 | @override |
| 17 | String toString() { | 18 | String toString() { |
| 18 | - return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)'; | 19 | + return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)'; |
| 19 | } | 20 | } |
| 20 | 21 | ||
| 21 | final String model; | 22 | final String model; |
| @@ -23,6 +24,7 @@ class SileroVadModelConfig { | @@ -23,6 +24,7 @@ class SileroVadModelConfig { | ||
| 23 | final double minSilenceDuration; | 24 | final double minSilenceDuration; |
| 24 | final double minSpeechDuration; | 25 | final double minSpeechDuration; |
| 25 | final int windowSize; | 26 | final int windowSize; |
| 27 | + final double maxSpeechDuration; | ||
| 26 | } | 28 | } |
| 27 | 29 | ||
| 28 | class VadModelConfig { | 30 | class VadModelConfig { |
| @@ -127,6 +129,7 @@ class VoiceActivityDetector { | @@ -127,6 +129,7 @@ class VoiceActivityDetector { | ||
| 127 | c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration; | 129 | c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration; |
| 128 | c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration; | 130 | c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration; |
| 129 | c.ref.sileroVad.windowSize = config.sileroVad.windowSize; | 131 | c.ref.sileroVad.windowSize = config.sileroVad.windowSize; |
| 132 | + c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration; | ||
| 130 | 133 | ||
| 131 | c.ref.sampleRate = config.sampleRate; | 134 | c.ref.sampleRate = config.sampleRate; |
| 132 | c.ref.numThreads = config.numThreads; | 135 | c.ref.numThreads = config.numThreads; |
| @@ -22,6 +22,7 @@ func main() { | @@ -22,6 +22,7 @@ func main() { | ||
| 22 | config.SileroVad.MinSilenceDuration = 0.5 | 22 | config.SileroVad.MinSilenceDuration = 0.5 |
| 23 | config.SileroVad.MinSpeechDuration = 0.25 | 23 | config.SileroVad.MinSpeechDuration = 0.25 |
| 24 | config.SileroVad.WindowSize = 512 | 24 | config.SileroVad.WindowSize = 512 |
| 25 | + config.SileroVad.MaxSpeechDuration = 5.0 | ||
| 25 | config.SampleRate = 16000 | 26 | config.SampleRate = 16000 |
| 26 | config.NumThreads = 1 | 27 | config.NumThreads = 1 |
| 27 | config.Provider = "cpu" | 28 | config.Provider = "cpu" |
| @@ -22,6 +22,7 @@ func main() { | @@ -22,6 +22,7 @@ func main() { | ||
| 22 | config.SileroVad.MinSilenceDuration = 0.5 | 22 | config.SileroVad.MinSilenceDuration = 0.5 |
| 23 | config.SileroVad.MinSpeechDuration = 0.25 | 23 | config.SileroVad.MinSpeechDuration = 0.25 |
| 24 | config.SileroVad.WindowSize = 512 | 24 | config.SileroVad.WindowSize = 512 |
| 25 | + config.SileroVad.MaxSpeechDuration = 5.0 | ||
| 25 | config.SampleRate = 16000 | 26 | config.SampleRate = 16000 |
| 26 | config.NumThreads = 1 | 27 | config.NumThreads = 1 |
| 27 | config.Provider = "cpu" | 28 | config.Provider = "cpu" |
| @@ -18,6 +18,7 @@ public class VadNonStreamingParaformer { | @@ -18,6 +18,7 @@ public class VadNonStreamingParaformer { | ||
| 18 | .setMinSilenceDuration(0.25f) | 18 | .setMinSilenceDuration(0.25f) |
| 19 | .setMinSpeechDuration(0.5f) | 19 | .setMinSpeechDuration(0.5f) |
| 20 | .setWindowSize(512) | 20 | .setWindowSize(512) |
| 21 | + .setMaxSpeechDuration(5.0f) | ||
| 21 | .build(); | 22 | .build(); |
| 22 | 23 | ||
| 23 | VadModelConfig config = | 24 | VadModelConfig config = |
| @@ -18,6 +18,7 @@ public class VadNonStreamingSenseVoice { | @@ -18,6 +18,7 @@ public class VadNonStreamingSenseVoice { | ||
| 18 | .setMinSilenceDuration(0.25f) | 18 | .setMinSilenceDuration(0.25f) |
| 19 | .setMinSpeechDuration(0.5f) | 19 | .setMinSpeechDuration(0.5f) |
| 20 | .setWindowSize(512) | 20 | .setWindowSize(512) |
| 21 | + .setMaxSpeechDuration(5.0f) | ||
| 21 | .build(); | 22 | .build(); |
| 22 | 23 | ||
| 23 | VadModelConfig config = | 24 | VadModelConfig config = |
| @@ -19,6 +19,7 @@ public class VadRemoveSilence { | @@ -19,6 +19,7 @@ public class VadRemoveSilence { | ||
| 19 | .setMinSilenceDuration(0.25f) | 19 | .setMinSilenceDuration(0.25f) |
| 20 | .setMinSpeechDuration(0.5f) | 20 | .setMinSpeechDuration(0.5f) |
| 21 | .setWindowSize(512) | 21 | .setWindowSize(512) |
| 22 | + .setMaxSpeechDuration(5.0f) | ||
| 22 | .build(); | 23 | .build(); |
| 23 | 24 | ||
| 24 | VadModelConfig config = | 25 | VadModelConfig config = |
| @@ -48,8 +48,9 @@ begin | @@ -48,8 +48,9 @@ begin | ||
| 48 | WindowSize := 512; {Please don't change it unless you know the details} | 48 | WindowSize := 512; {Please don't change it unless you know the details} |
| 49 | 49 | ||
| 50 | Config.SileroVad.Model := VadFilename; | 50 | Config.SileroVad.Model := VadFilename; |
| 51 | - Config.SileroVad.MinSpeechDuration := 0.5; | 51 | + Config.SileroVad.MinSpeechDuration := 0.25; |
| 52 | Config.SileroVad.MinSilenceDuration := 0.5; | 52 | Config.SileroVad.MinSilenceDuration := 0.5; |
| 53 | + Config.SileroVad.MaxSpeechDuration := 5.0; | ||
| 53 | Config.SileroVad.Threshold := 0.5; | 54 | Config.SileroVad.Threshold := 0.5; |
| 54 | Config.SileroVad.WindowSize := WindowSize; | 55 | Config.SileroVad.WindowSize := WindowSize; |
| 55 | Config.NumThreads:= 2; | 56 | Config.NumThreads:= 2; |
| @@ -34,6 +34,7 @@ function createVad() { | @@ -34,6 +34,7 @@ function createVad() { | ||
| 34 | threshold: 0.5, | 34 | threshold: 0.5, |
| 35 | minSpeechDuration: 0.25, | 35 | minSpeechDuration: 0.25, |
| 36 | minSilenceDuration: 0.5, | 36 | minSilenceDuration: 0.5, |
| 37 | + maxSpeechDuration: 5, | ||
| 37 | windowSize: 512, | 38 | windowSize: 512, |
| 38 | }, | 39 | }, |
| 39 | sampleRate: 16000, | 40 | sampleRate: 16000, |
| @@ -29,6 +29,7 @@ function createVad() { | @@ -29,6 +29,7 @@ function createVad() { | ||
| 29 | threshold: 0.5, | 29 | threshold: 0.5, |
| 30 | minSpeechDuration: 0.25, | 30 | minSpeechDuration: 0.25, |
| 31 | minSilenceDuration: 0.5, | 31 | minSilenceDuration: 0.5, |
| 32 | + maxSpeechDuration: 5, | ||
| 32 | windowSize: 512, | 33 | windowSize: 512, |
| 33 | }, | 34 | }, |
| 34 | sampleRate: 16000, | 35 | sampleRate: 16000, |
| @@ -90,6 +90,15 @@ def main(): | @@ -90,6 +90,15 @@ def main(): | ||
| 90 | 90 | ||
| 91 | config = sherpa_onnx.VadModelConfig() | 91 | config = sherpa_onnx.VadModelConfig() |
| 92 | config.silero_vad.model = args.silero_vad_model | 92 | config.silero_vad.model = args.silero_vad_model |
| 93 | + config.silero_vad.threshold = 0.5 | ||
| 94 | + config.silero_vad.min_silence_duration = 0.25 # seconds | ||
| 95 | + config.silero_vad.min_speech_duration = 0.25 # seconds | ||
| 96 | + | ||
| 97 | + # If the current speech segment is longer than this value (in seconds), | ||
| 98 | + # the threshold is increased to 0.9 internally. After this segment has | ||
| 99 | + # been detected, the threshold is reset to its original value. | ||
| 100 | + config.silero_vad.max_speech_duration = 5 # seconds | ||
| 101 | + | ||
| 93 | config.sample_rate = sample_rate | 102 | config.sample_rate = sample_rate |
| 94 | 103 | ||
| 95 | window_size = config.silero_vad.window_size | 104 | window_size = config.silero_vad.window_size |
| @@ -14,6 +14,7 @@ namespace SherpaOnnx | @@ -14,6 +14,7 @@ namespace SherpaOnnx | ||
| 14 | MinSilenceDuration = 0.5F; | 14 | MinSilenceDuration = 0.5F; |
| 15 | MinSpeechDuration = 0.25F; | 15 | MinSpeechDuration = 0.25F; |
| 16 | WindowSize = 512; | 16 | WindowSize = 512; |
| 17 | + MaxSpeechDuration = 5.0F; | ||
| 17 | } | 18 | } |
| 18 | 19 | ||
| 19 | [MarshalAs(UnmanagedType.LPStr)] | 20 | [MarshalAs(UnmanagedType.LPStr)] |
| @@ -26,5 +27,7 @@ namespace SherpaOnnx | @@ -26,5 +27,7 @@ namespace SherpaOnnx | ||
| 26 | public float MinSpeechDuration; | 27 | public float MinSpeechDuration; |
| 27 | 28 | ||
| 28 | public int WindowSize; | 29 | public int WindowSize; |
| 30 | + | ||
| 31 | + public float MaxSpeechDuration; | ||
| 29 | } | 32 | } |
| 30 | } | 33 | } |
| @@ -771,6 +771,7 @@ type SileroVadModelConfig struct { | @@ -771,6 +771,7 @@ type SileroVadModelConfig struct { | ||
| 771 | MinSilenceDuration float32 | 771 | MinSilenceDuration float32 |
| 772 | MinSpeechDuration float32 | 772 | MinSpeechDuration float32 |
| 773 | WindowSize int | 773 | WindowSize int |
| 774 | + MaxSpeechDuration float32 | ||
| 774 | } | 775 | } |
| 775 | 776 | ||
| 776 | type VadModelConfig struct { | 777 | type VadModelConfig struct { |
| @@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3 | @@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3 | ||
| 849 | c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration) | 850 | c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration) |
| 850 | c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration) | 851 | c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration) |
| 851 | c.silero_vad.window_size = C.int(config.SileroVad.WindowSize) | 852 | c.silero_vad.window_size = C.int(config.SileroVad.WindowSize) |
| 853 | + c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration) | ||
| 852 | 854 | ||
| 853 | c.sample_rate = C.int(config.SampleRate) | 855 | c.sample_rate = C.int(config.SampleRate) |
| 854 | c.num_threads = C.int(config.NumThreads) | 856 | c.num_threads = C.int(config.NumThreads) |
| @@ -39,6 +39,9 @@ config = { | @@ -39,6 +39,9 @@ config = { | ||
| 39 | sileroVad: { | 39 | sileroVad: { |
| 40 | model: "./silero_vad.onnx", | 40 | model: "./silero_vad.onnx", |
| 41 | threshold: 0.5, | 41 | threshold: 0.5, |
| 42 | + minSilenceDuration: 0.5, | ||
| 43 | + minSpeechDuration: 0.25, | ||
| 44 | + maxSpeechDuration: 5, | ||
| 42 | } | 45 | } |
| 43 | } | 46 | } |
| 44 | */ | 47 | */ |
| @@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig( | @@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig( | ||
| 279 | SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration); | 279 | SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration); |
| 280 | SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration); | 280 | SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration); |
| 281 | SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize); | 281 | SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize); |
| 282 | + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration); | ||
| 282 | 283 | ||
| 283 | return c; | 284 | return c; |
| 284 | } | 285 | } |
| @@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector( | @@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector( | ||
| 907 | vad_config.silero_vad.window_size = | 907 | vad_config.silero_vad.window_size = |
| 908 | SHERPA_ONNX_OR(config->silero_vad.window_size, 512); | 908 | SHERPA_ONNX_OR(config->silero_vad.window_size, 512); |
| 909 | 909 | ||
| 910 | + vad_config.silero_vad.max_speech_duration = | ||
| 911 | + SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20); | ||
| 912 | + | ||
| 910 | vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); | 913 | vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000); |
| 911 | vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); | 914 | vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); |
| 912 | vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); | 915 | vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu"); |
| @@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { | @@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig { | ||
| 746 | float min_speech_duration; | 746 | float min_speech_duration; |
| 747 | 747 | ||
| 748 | int window_size; | 748 | int window_size; |
| 749 | + | ||
| 750 | + // If a speech segment is longer than this value, then we increase | ||
| 751 | + // the threshold to 0.9. After finishing detecting the segment, | ||
| 752 | + // the threshold value is reset to its original value. | ||
| 753 | + float max_speech_duration; | ||
| 749 | } SherpaOnnxSileroVadModelConfig; | 754 | } SherpaOnnxSileroVadModelConfig; |
| 750 | 755 | ||
| 751 | SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { | 756 | SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig { |
| @@ -8,6 +8,7 @@ public class SileroVadModelConfig { | @@ -8,6 +8,7 @@ public class SileroVadModelConfig { | ||
| 8 | private final float minSilenceDuration; | 8 | private final float minSilenceDuration; |
| 9 | private final float minSpeechDuration; | 9 | private final float minSpeechDuration; |
| 10 | private final int windowSize; | 10 | private final int windowSize; |
| 11 | + private final float maxSpeechDuration; | ||
| 11 | 12 | ||
| 12 | private SileroVadModelConfig(Builder builder) { | 13 | private SileroVadModelConfig(Builder builder) { |
| 13 | this.model = builder.model; | 14 | this.model = builder.model; |
| @@ -15,6 +16,7 @@ public class SileroVadModelConfig { | @@ -15,6 +16,7 @@ public class SileroVadModelConfig { | ||
| 15 | this.minSilenceDuration = builder.minSilenceDuration; | 16 | this.minSilenceDuration = builder.minSilenceDuration; |
| 16 | this.minSpeechDuration = builder.minSpeechDuration; | 17 | this.minSpeechDuration = builder.minSpeechDuration; |
| 17 | this.windowSize = builder.windowSize; | 18 | this.windowSize = builder.windowSize; |
| 19 | + this.maxSpeechDuration = builder.maxSpeechDuration; | ||
| 18 | } | 20 | } |
| 19 | 21 | ||
| 20 | public static Builder builder() { | 22 | public static Builder builder() { |
| @@ -41,12 +43,17 @@ public class SileroVadModelConfig { | @@ -41,12 +43,17 @@ public class SileroVadModelConfig { | ||
| 41 | return windowSize; | 43 | return windowSize; |
| 42 | } | 44 | } |
| 43 | 45 | ||
| 46 | + public float getMaxSpeechDuration() { | ||
| 47 | + return maxSpeechDuration; | ||
| 48 | + } | ||
| 49 | + | ||
| 44 | public static class Builder { | 50 | public static class Builder { |
| 45 | private String model = ""; | 51 | private String model = ""; |
| 46 | private float threshold = 0.5f; | 52 | private float threshold = 0.5f; |
| 47 | private float minSilenceDuration = 0.25f; | 53 | private float minSilenceDuration = 0.25f; |
| 48 | private float minSpeechDuration = 0.5f; | 54 | private float minSpeechDuration = 0.5f; |
| 49 | private int windowSize = 512; | 55 | private int windowSize = 512; |
| 56 | + private float maxSpeechDuration = 5.0f; | ||
| 50 | 57 | ||
| 51 | public SileroVadModelConfig build() { | 58 | public SileroVadModelConfig build() { |
| 52 | return new SileroVadModelConfig(this); | 59 | return new SileroVadModelConfig(this); |
| @@ -77,5 +84,10 @@ public class SileroVadModelConfig { | @@ -77,5 +84,10 @@ public class SileroVadModelConfig { | ||
| 77 | this.windowSize = windowSize; | 84 | this.windowSize = windowSize; |
| 78 | return this; | 85 | return this; |
| 79 | } | 86 | } |
| 87 | + | ||
| 88 | + public Builder setMaxSpeechDuration(float maxSpeechDuration) { | ||
| 89 | + this.maxSpeechDuration = maxSpeechDuration; | ||
| 90 | + return this; | ||
| 91 | + } | ||
| 80 | } | 92 | } |
| 81 | } | 93 | } |
| @@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) { | @@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) { | ||
| 40 | fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I"); | 40 | fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I"); |
| 41 | ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid); | 41 | ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid); |
| 42 | 42 | ||
| 43 | + fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F"); | ||
| 44 | + ans.silero_vad.max_speech_duration = | ||
| 45 | + env->GetFloatField(silero_vad_config, fid); | ||
| 46 | + | ||
| 43 | fid = env->GetFieldID(cls, "sampleRate", "I"); | 47 | fid = env->GetFieldID(cls, "sampleRate", "I"); |
| 44 | ans.sample_rate = env->GetIntField(config, fid); | 48 | ans.sample_rate = env->GetIntField(config, fid); |
| 45 | 49 |
| @@ -9,6 +9,7 @@ data class SileroVadModelConfig( | @@ -9,6 +9,7 @@ data class SileroVadModelConfig( | ||
| 9 | var minSilenceDuration: Float = 0.25F, | 9 | var minSilenceDuration: Float = 0.25F, |
| 10 | var minSpeechDuration: Float = 0.25F, | 10 | var minSpeechDuration: Float = 0.25F, |
| 11 | var windowSize: Int = 512, | 11 | var windowSize: Int = 512, |
| 12 | + var maxSpeechDuration: Float = 5.0F, | ||
| 12 | ) | 13 | ) |
| 13 | 14 | ||
| 14 | data class VadModelConfig( | 15 | data class VadModelConfig( |
| @@ -341,6 +341,7 @@ type | @@ -341,6 +341,7 @@ type | ||
| 341 | MinSilenceDuration: Single; | 341 | MinSilenceDuration: Single; |
| 342 | MinSpeechDuration: Single; | 342 | MinSpeechDuration: Single; |
| 343 | WindowSize: Integer; | 343 | WindowSize: Integer; |
| 344 | + MaxSpeechDuration: Single; | ||
| 344 | function ToString: AnsiString; | 345 | function ToString: AnsiString; |
| 345 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); | 346 | class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); |
| 346 | end; | 347 | end; |
| @@ -594,6 +595,7 @@ type | @@ -594,6 +595,7 @@ type | ||
| 594 | MinSilenceDuration: cfloat; | 595 | MinSilenceDuration: cfloat; |
| 595 | MinSpeechDuration: cfloat; | 596 | MinSpeechDuration: cfloat; |
| 596 | WindowSize: cint32; | 597 | WindowSize: cint32; |
| 598 | + MaxSpeechDuration: cfloat; | ||
| 597 | end; | 599 | end; |
| 598 | SherpaOnnxVadModelConfig = record | 600 | SherpaOnnxVadModelConfig = record |
| 599 | SileroVad: SherpaOnnxSileroVadModelConfig; | 601 | SileroVad: SherpaOnnxSileroVadModelConfig; |
| @@ -1402,10 +1404,11 @@ begin | @@ -1402,10 +1404,11 @@ begin | ||
| 1402 | 'Threshold := %.2f, ' + | 1404 | 'Threshold := %.2f, ' + |
| 1403 | 'MinSilenceDuration := %.2f, ' + | 1405 | 'MinSilenceDuration := %.2f, ' + |
| 1404 | 'MinSpeechDuration := %.2f, ' + | 1406 | 'MinSpeechDuration := %.2f, ' + |
| 1405 | - 'WindowSize := %d' + | 1407 | + 'WindowSize := %d, ' + |
| 1408 | + 'MaxSpeechDuration := %.2f' + | ||
| 1406 | ')', | 1409 | ')', |
| 1407 | [Self.Model, Self.Threshold, Self.MinSilenceDuration, | 1410 | [Self.Model, Self.Threshold, Self.MinSilenceDuration, |
| 1408 | - Self.MinSpeechDuration, Self.WindowSize | 1411 | + Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration |
| 1409 | ]); | 1412 | ]); |
| 1410 | end; | 1413 | end; |
| 1411 | 1414 | ||
| @@ -1415,6 +1418,7 @@ begin | @@ -1415,6 +1418,7 @@ begin | ||
| 1415 | Dest.MinSilenceDuration := 0.5; | 1418 | Dest.MinSilenceDuration := 0.5; |
| 1416 | Dest.MinSpeechDuration := 0.25; | 1419 | Dest.MinSpeechDuration := 0.25; |
| 1417 | Dest.WindowSize := 512; | 1420 | Dest.WindowSize := 512; |
| 1421 | + Dest.MaxSpeechDuration := 5.0; | ||
| 1418 | end; | 1422 | end; |
| 1419 | 1423 | ||
| 1420 | function TSherpaOnnxVadModelConfig.ToString: AnsiString; | 1424 | function TSherpaOnnxVadModelConfig.ToString: AnsiString; |
| @@ -1569,6 +1573,7 @@ begin | @@ -1569,6 +1573,7 @@ begin | ||
| 1569 | C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration; | 1573 | C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration; |
| 1570 | C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration; | 1574 | C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration; |
| 1571 | C.SileroVad.WindowSize := Config.SileroVad.WindowSize; | 1575 | C.SileroVad.WindowSize := Config.SileroVad.WindowSize; |
| 1576 | + C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration; | ||
| 1572 | 1577 | ||
| 1573 | C.SampleRate := Config.SampleRate; | 1578 | C.SampleRate := Config.SampleRate; |
| 1574 | C.NumThreads := Config.NumThreads; | 1579 | C.NumThreads := Config.NumThreads; |
| @@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig( | @@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig( | ||
| 550 | threshold: Float = 0.5, | 550 | threshold: Float = 0.5, |
| 551 | minSilenceDuration: Float = 0.25, | 551 | minSilenceDuration: Float = 0.25, |
| 552 | minSpeechDuration: Float = 0.5, | 552 | minSpeechDuration: Float = 0.5, |
| 553 | - windowSize: Int = 512 | 553 | + windowSize: Int = 512, |
| 554 | + maxSpeechDuration: Float = 5.0 | ||
| 554 | ) -> SherpaOnnxSileroVadModelConfig { | 555 | ) -> SherpaOnnxSileroVadModelConfig { |
| 555 | return SherpaOnnxSileroVadModelConfig( | 556 | return SherpaOnnxSileroVadModelConfig( |
| 556 | model: toCPointer(model), | 557 | model: toCPointer(model), |
| 557 | threshold: threshold, | 558 | threshold: threshold, |
| 558 | min_silence_duration: minSilenceDuration, | 559 | min_silence_duration: minSilenceDuration, |
| 559 | min_speech_duration: minSpeechDuration, | 560 | min_speech_duration: minSpeechDuration, |
| 560 | - window_size: Int32(windowSize) | 561 | + window_size: Int32(windowSize), |
| 562 | + max_speech_duration: maxSpeechDuration | ||
| 561 | ) | 563 | ) |
| 562 | } | 564 | } |
| 563 | 565 |
| @@ -19,7 +19,7 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { | @@ -19,7 +19,7 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { | ||
| 19 | 19 | ||
| 20 | const buffer = Module._malloc(n); | 20 | const buffer = Module._malloc(n); |
| 21 | 21 | ||
| 22 | - const len = 5 * 4; | 22 | + const len = 6 * 4; |
| 23 | const ptr = Module._malloc(len); | 23 | const ptr = Module._malloc(len); |
| 24 | 24 | ||
| 25 | Module.stringToUTF8(config.model || '', buffer, modelLen); | 25 | Module.stringToUTF8(config.model || '', buffer, modelLen); |
| @@ -40,6 +40,9 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { | @@ -40,6 +40,9 @@ function initSherpaOnnxSileroVadModelConfig(config, Module) { | ||
| 40 | Module.setValue(ptr + offset, config.windowSize || 512, 'i32'); | 40 | Module.setValue(ptr + offset, config.windowSize || 512, 'i32'); |
| 41 | offset += 4; | 41 | offset += 4; |
| 42 | 42 | ||
| 43 | + Module.setValue(ptr + offset, config.maxSpeechDuration || 20, 'float'); | ||
| 44 | + offset += 4; | ||
| 45 | + | ||
| 43 | return { | 46 | return { |
| 44 | buffer: buffer, ptr: ptr, len: len, | 47 | buffer: buffer, ptr: ptr, len: len, |
| 45 | } | 48 | } |
| @@ -53,6 +56,7 @@ function initSherpaOnnxVadModelConfig(config, Module) { | @@ -53,6 +56,7 @@ function initSherpaOnnxVadModelConfig(config, Module) { | ||
| 53 | minSilenceDuration: 0.50, | 56 | minSilenceDuration: 0.50, |
| 54 | minSpeechDuration: 0.25, | 57 | minSpeechDuration: 0.25, |
| 55 | windowSize: 512, | 58 | windowSize: 512, |
| 59 | + maxSpeechDuration: 20, | ||
| 56 | }; | 60 | }; |
| 57 | } | 61 | } |
| 58 | 62 | ||
| @@ -93,6 +97,7 @@ function createVad(Module, myConfig) { | @@ -93,6 +97,7 @@ function createVad(Module, myConfig) { | ||
| 93 | threshold: 0.50, | 97 | threshold: 0.50, |
| 94 | minSilenceDuration: 0.50, | 98 | minSilenceDuration: 0.50, |
| 95 | minSpeechDuration: 0.25, | 99 | minSpeechDuration: 0.25, |
| 100 | + maxSpeechDuration: 20, | ||
| 96 | windowSize: 512, | 101 | windowSize: 512, |
| 97 | }; | 102 | }; |
| 98 | 103 |
| @@ -13,7 +13,7 @@ | @@ -13,7 +13,7 @@ | ||
| 13 | 13 | ||
| 14 | extern "C" { | 14 | extern "C" { |
| 15 | 15 | ||
| 16 | -static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, ""); | 16 | +static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 6 * 4, ""); |
| 17 | 17 | ||
| 18 | static_assert(sizeof(SherpaOnnxVadModelConfig) == | 18 | static_assert(sizeof(SherpaOnnxVadModelConfig) == |
| 19 | sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4, | 19 | sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4, |
| @@ -29,6 +29,8 @@ void MyPrint(SherpaOnnxVadModelConfig *config) { | @@ -29,6 +29,8 @@ void MyPrint(SherpaOnnxVadModelConfig *config) { | ||
| 29 | fprintf(stdout, "min_speech_duration: %.3f\n", | 29 | fprintf(stdout, "min_speech_duration: %.3f\n", |
| 30 | silero_vad->min_speech_duration); | 30 | silero_vad->min_speech_duration); |
| 31 | fprintf(stdout, "window_size: %d\n", silero_vad->window_size); | 31 | fprintf(stdout, "window_size: %d\n", silero_vad->window_size); |
| 32 | + fprintf(stdout, "max_speech_duration: %.3f\n", | ||
| 33 | + silero_vad->max_speech_duration); | ||
| 32 | 34 | ||
| 33 | fprintf(stdout, "----------config----------\n"); | 35 | fprintf(stdout, "----------config----------\n"); |
| 34 | 36 |
-
请 注册 或 登录 后发表评论