Add Flush to VAD so that the last segment can be detected. (#1099)
Committed by GitHub

Showing 35 changed files with 237 additions and 29 deletions.
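This commit adds a Flush() method to the voice activity detector in the C++ core and exposes it in every binding touched below (C API, Dart, C#, Go, Java, Kotlin/JNI, Node.js, Python, Swift), so that a segment still pending when the input ends (one not yet followed by enough silence) can still be retrieved. Here is a minimal sketch of the intended usage in Python: the flush()/empty()/front/pop loop is taken from the Python example changed in this commit, while the setup calls (VadModelConfig, VoiceActivityDetector, accept_waveform) follow the existing sherpa-onnx Python examples and are assumptions here; model and file paths are placeholders.

```python
# Minimal sketch: drain the VAD as usual, then flush it at end of input so the
# trailing segment is emitted.  Assumes a 16 kHz mono wave file.
import numpy as np
import sherpa_onnx
import soundfile as sf

config = sherpa_onnx.VadModelConfig()
config.silero_vad.model = "./silero_vad.onnx"  # placeholder model path
config.sample_rate = 16000

vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)

samples, sample_rate = sf.read("./input.wav", dtype="float32")
window_size = config.silero_vad.window_size

speech_samples = []
for i in range(0, len(samples), window_size):
    vad.accept_waveform(samples[i : i + window_size])
    while not vad.empty():
        speech_samples.extend(vad.front.samples)
        vad.pop()

# New in this commit: without flush(), a segment that is still "open" when the
# file ends would never be emitted, because no trailing silence arrives.
vad.flush()
while not vad.empty():
    speech_samples.extend(vad.front.samples)
    vad.pop()

sf.write("./output.wav", np.array(speech_samples, dtype=np.float32),
         samplerate=sample_rate)
```

The Dart, C#, and Java examples updated below follow the same pattern: flush, then drain the remaining segments before freeing the detector.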
@@ -52,11 +52,6 @@ jobs:
           cmake --build . --target install --config Release
           rm -rf install/pkgconfig

-      - uses: actions/upload-artifact@v4
-        with:
-          name: windows-${{ matrix.arch }}
-          path: ./build/install/lib/
-
       - name: Create tar file
         shell: bash
         run: |

@@ -72,6 +67,11 @@ jobs:
           ls -lh *.tar.bz2
           mv *.tar.bz2 ../

+      - uses: actions/upload-artifact@v4
+        with:
+          name: windows-${{ matrix.arch }}
+          path: ./*.tar.bz2
+
       # https://huggingface.co/docs/hub/spaces-github-actions
       - name: Publish to huggingface
         if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')

@@ -88,7 +88,9 @@ jobs:

           rm -rf huggingface
           export GIT_CLONE_PROTECTION_ACTIVE=false
-          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
+          export GIT_LFS_SKIP_SMUDGE=1
+
+          git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface

           cd huggingface
           mkdir -p windows-for-dotnet
@@ -10,8 +10,8 @@ project(sherpa-onnx)
 # Remember to update
 #   ./nodejs-addon-examples
 #   ./dart-api-examples/
-#   ./sherpa-onnx/flutter/CHANGELOG.md
-set(SHERPA_ONNX_VERSION "1.10.11")
+#   ./CHANGELOG.md
+set(SHERPA_ONNX_VERSION "1.10.12")

 # Disable warning about
 #
@@ -93,6 +93,28 @@ void main(List<String> arguments) async {
     }
   }

+  vad.flush();
+  while (!vad.isEmpty()) {
+    final stream = recognizer.createStream();
+    final segment = vad.front();
+    stream.acceptWaveform(
+        samples: segment.samples, sampleRate: waveData.sampleRate);
+    recognizer.decode(stream);
+
+    final result = recognizer.getResult(stream);
+
+    final startTime = segment.start * 1.0 / waveData.sampleRate;
+    final duration = segment.samples.length * 1.0 / waveData.sampleRate;
+    final stopTime = startTime + duration;
+    if (result.text != '') {
+      print(
+          '${startTime.toStringAsPrecision(4)} -- ${stopTime.toStringAsPrecision(4)}: ${result.text}');
+    }
+
+    stream.free();
+    vad.pop();
+  }
+
   vad.free();
   recognizer.free();
 }
@@ -65,6 +65,12 @@ void main(List<String> arguments) async {
     }
   }

+  vad.flush();
+  while (!vad.isEmpty()) {
+    allSamples.add(vad.front().samples);
+    vad.pop();
+  }
+
   vad.free();

   final s = Float32List.fromList(allSamples.expand((x) => x).toList());
@@ -57,6 +57,26 @@ class VadNonStreamingAsrParaformer
       }
     }
+
+    vad.Flush();
+
+    while (!vad.IsEmpty()) {
+      SpeechSegment segment = vad.Front();
+      float startTime = segment.Start / (float)sampleRate;
+      float duration = segment.Samples.Length / (float)sampleRate;
+
+      OfflineStream stream = recognizer.CreateStream();
+      stream.AcceptWaveform(sampleRate, segment.Samples);
+      recognizer.Decode(stream);
+      String text = stream.Result.Text;
+
+      if (!String.IsNullOrEmpty(text)) {
+        Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
+            String.Format("{0:0.00}", startTime+duration), text);
+      }
+
+      vad.Pop();
+    }
   }
 }

@@ -5,7 +5,7 @@ description: >

 publish_to: 'none'

-version: 1.10.11
+version: 1.10.12

 topics:
   - speech-recognition
@@ -30,7 +30,7 @@ dependencies:
   record: ^5.1.0
   url_launcher: ^6.2.6

-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   # sherpa_onnx:
   #   path: ../../flutter/sherpa_onnx

@@ -17,7 +17,7 @@ dependencies:
   cupertino_icons: ^1.0.6
   path_provider: ^2.1.3
   path: ^1.9.0
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
   url_launcher: ^6.2.6
   audioplayers: ^5.0.0

@@ -491,6 +491,12 @@ typedef SherpaOnnxVoiceActivityDetectorResetNative = Void Function(
 typedef SherpaOnnxVoiceActivityDetectorReset = void Function(
     Pointer<SherpaOnnxVoiceActivityDetector>);

+typedef SherpaOnnxVoiceActivityDetectorFlushNative = Void Function(
+    Pointer<SherpaOnnxVoiceActivityDetector>);
+
+typedef SherpaOnnxVoiceActivityDetectorFlush = void Function(
+    Pointer<SherpaOnnxVoiceActivityDetector>);
+
 typedef SherpaOnnxVoiceActivityDetectorFrontNative
     = Pointer<SherpaOnnxSpeechSegment> Function(
         Pointer<SherpaOnnxVoiceActivityDetector>);

@@ -779,6 +785,8 @@ class SherpaOnnxBindings {

   static SherpaOnnxVoiceActivityDetectorReset? voiceActivityDetectorReset;

+  static SherpaOnnxVoiceActivityDetectorFlush? voiceActivityDetectorFlush;
+
   static SherpaOnnxCreateCircularBuffer? createCircularBuffer;

   static SherpaOnnxDestroyCircularBuffer? destroyCircularBuffer;

@@ -1036,6 +1044,11 @@ class SherpaOnnxBindings {
             'SherpaOnnxVoiceActivityDetectorReset')
         .asFunction();

+    voiceActivityDetectorFlush ??= dynamicLibrary
+        .lookup<NativeFunction<SherpaOnnxVoiceActivityDetectorFlushNative>>(
+            'SherpaOnnxVoiceActivityDetectorFlush')
+        .asFunction();
+
     createCircularBuffer ??= dynamicLibrary
         .lookup<NativeFunction<SherpaOnnxCreateCircularBufferNative>>(
             'SherpaOnnxCreateCircularBuffer')

@@ -207,6 +207,10 @@ class VoiceActivityDetector {
     SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr);
   }

+  void flush() {
+    SherpaOnnxBindings.voiceActivityDetectorFlush?.call(ptr);
+  }
+
   Pointer<SherpaOnnxVoiceActivityDetector> ptr;
   final VadModelConfig config;
 }
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection

 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
-version: 1.10.11
+version: 1.10.12

 homepage: https://github.com/k2-fsa/sherpa-onnx

@@ -30,19 +30,19 @@ dependencies:
   flutter:
     sdk: flutter

-  sherpa_onnx_android: ^1.10.11
+  sherpa_onnx_android: ^1.10.12
   #   path: ../sherpa_onnx_android

-  sherpa_onnx_macos: ^1.10.11
+  sherpa_onnx_macos: ^1.10.12
   #   path: ../sherpa_onnx_macos

-  sherpa_onnx_linux: ^1.10.11
+  sherpa_onnx_linux: ^1.10.12
   #   path: ../sherpa_onnx_linux
   #
-  sherpa_onnx_windows: ^1.10.11
+  sherpa_onnx_windows: ^1.10.12
   #   path: ../sherpa_onnx_windows

-  sherpa_onnx_ios: ^1.10.11
+  sherpa_onnx_ios: ^1.10.12
   # sherpa_onnx_ios:
   #   path: ../sherpa_onnx_ios

@@ -7,7 +7,7 @@
 # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
 Pod::Spec.new do |s|
   s.name = 'sherpa_onnx_ios'
-  s.version = '1.10.11'
+  s.version = '1.10.12'
   s.summary = 'A new Flutter FFI plugin project.'
   s.description = <<-DESC
 A new Flutter FFI plugin project.
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
   s.name = 'sherpa_onnx_macos'
-  s.version = '1.10.11'
+  s.version = '1.10.12'
   s.summary = 'sherpa-onnx Flutter FFI plugin project.'
   s.description = <<-DESC
 sherpa-onnx Flutter FFI plugin project.
@@ -98,6 +98,25 @@ public class VadNonStreamingParaformer {
       }
     }

+    vad.flush();
+    while (!vad.empty()) {
+      SpeechSegment segment = vad.front();
+      float startTime = segment.getStart() / 16000.0f;
+      float duration = segment.getSamples().length / 16000.0f;
+
+      OfflineStream stream = recognizer.createStream();
+      stream.acceptWaveform(segment.getSamples(), 16000);
+      recognizer.decode(stream);
+      String text = recognizer.getResult(stream).getText();
+      stream.release();
+
+      if (!text.isEmpty()) {
+        System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
+      }
+
+      vad.pop();
+    }
+
     vad.release();
     recognizer.release();
   }
@@ -59,6 +59,16 @@ public class VadRemoveSilence {
       }
     }

+    vad.flush();
+    while (!vad.empty()) {
+
+      // if you want to get the starting time of this segment, you can use
+      /* float startTime = vad.front().getStart() / 16000.0f; */
+
+      segments.add(vad.front().getSamples());
+      vad.pop();
+    }
+
     // get total number of samples
     int n = 0;
     for (float[] s : segments) {
@@ -105,6 +105,12 @@ def main():
             speech_samples.extend(vad.front.samples)
             vad.pop()

+    vad.flush()
+
+    while not vad.empty():
+        speech_samples.extend(vad.front.samples)
+        vad.pop()
+
     speech_samples = np.array(speech_samples, dtype=np.float32)

     sf.write(args.output, speech_samples, samplerate=sample_rate)
@@ -17,7 +17,7 @@ topics:
   - voice-activity-detection

 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
-version: 1.10.6
+version: 1.10.12

 homepage: https://github.com/k2-fsa/sherpa-onnx

@@ -53,6 +53,11 @@ namespace SherpaOnnx
       SherpaOnnxVoiceActivityDetectorReset(_handle.Handle);
     }

+    public void Flush()
+    {
+      SherpaOnnxVoiceActivityDetectorFlush(_handle.Handle);
+    }
+
     public void Dispose()
     {
       Cleanup();

@@ -106,5 +111,7 @@ namespace SherpaOnnx
     [DllImport(Dll.Filename)]
     private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle);

+    [DllImport(Dll.Filename)]
+    private static extern void SherpaOnnxVoiceActivityDetectorFlush(IntPtr handle);
   }
 }
@@ -856,6 +856,10 @@ func (vad *VoiceActivityDetector) Reset() {
     C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
 }

+func (vad *VoiceActivityDetector) Flush() {
+    C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl)
+}
+
 // Spoken language identification

 type SpokenLanguageIdentificationWhisperConfig struct {
@@ -29,7 +29,7 @@ class CircularBuffer {
   }

   reset() {
-    return addon.circularBufferReset(this.handle);
+    addon.circularBufferReset(this.handle);
   }
 }

@@ -79,7 +79,11 @@ config = {
   }

   reset() {
-    return addon.VoiceActivityDetectorResetWrapper(this.handle);
+    addon.VoiceActivityDetectorResetWrapper(this.handle);
+  }
+
+  flush() {
+    addon.VoiceActivityDetectorFlushWrapper(this.handle);
   }

@@ -590,6 +590,31 @@ static void VoiceActivityDetectorResetWrapper(const Napi::CallbackInfo &info) {
   SherpaOnnxVoiceActivityDetectorReset(vad);
 }

+static void VoiceActivityDetectorFlushWrapper(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+
+  if (info.Length() != 1) {
+    std::ostringstream os;
+    os << "Expect only 1 argument. Given: " << info.Length();
+
+    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  if (!info[0].IsExternal()) {
+    Napi::TypeError::New(env, "Argument 0 should be a VAD pointer.")
+        .ThrowAsJavaScriptException();
+
+    return;
+  }
+
+  SherpaOnnxVoiceActivityDetector *vad =
+      info[0].As<Napi::External<SherpaOnnxVoiceActivityDetector>>().Data();
+
+  SherpaOnnxVoiceActivityDetectorFlush(vad);
+}
+
 void InitVad(Napi::Env env, Napi::Object exports) {
   exports.Set(Napi::String::New(env, "createCircularBuffer"),
               Napi::Function::New(env, CreateCircularBufferWrapper));

@@ -636,4 +661,7 @@ void InitVad(Napi::Env env, Napi::Object exports) {

   exports.Set(Napi::String::New(env, "voiceActivityDetectorReset"),
               Napi::Function::New(env, VoiceActivityDetectorResetWrapper));
+
+  exports.Set(Napi::String::New(env, "voiceActivityDetectorFlush"),
+              Napi::Function::New(env, VoiceActivityDetectorFlushWrapper));
 }
@@ -876,6 +876,10 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
   p->impl->Reset();
 }

+void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) {
+  p->impl->Flush();
+}
+
 #if SHERPA_ONNX_ENABLE_TTS == 1
 struct SherpaOnnxOfflineTts {
   std::unique_ptr<sherpa_onnx::OfflineTts> impl;

@@ -815,6 +815,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
     SherpaOnnxVoiceActivityDetector *p);

+SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush(
+    SherpaOnnxVoiceActivityDetector *p);
+
 // ============================================================
 // For offline Text-to-Speech (i.e., non-streaming TTS)
 // ============================================================
@@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl {
     start_ = -1;
   }

+  void Flush() {
+    if (start_ == -1 || buffer_.Size() == 0) {
+      return;
+    }
+
+    int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
+    if (end <= start_) {
+      return;
+    }
+
+    std::vector<float> s = buffer_.Get(start_, end - start_);
+
+    SpeechSegment segment;
+
+    segment.start = start_;
+    segment.samples = std::move(s);
+
+    segments_.push(std::move(segment));
+
+    buffer_.Pop(end - buffer_.Head());
+    start_ = -1;
+  }
+
   bool IsSpeechDetected() const { return start_ != -1; }

   const VadModelConfig &GetConfig() const { return config_; }

@@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const {
   return impl_->Front();
 }

-void VoiceActivityDetector::Reset() { impl_->Reset(); }
+void VoiceActivityDetector::Reset() const { impl_->Reset(); }
+
+void VoiceActivityDetector::Flush() const { impl_->Flush(); }

 bool VoiceActivityDetector::IsSpeechDetected() const {
   return impl_->IsSpeechDetected();

@@ -41,7 +41,11 @@ class VoiceActivityDetector {

   bool IsSpeechDetected() const;

-  void Reset();
+  void Reset() const;
+
+  // At the end of the utterance, you can invoke this method so that
+  // the last speech segment can be detected.
+  void Flush() const;

   const VadModelConfig &GetConfig() const;

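The header comment above states when to call Flush(); the Impl::Flush() hunk earlier shows what it does: take everything from the pending segment start up to the buffer tail minus the minimum-silence window, queue that as a final segment, pop the buffer, and clear the start marker. A rough restatement of that bookkeeping in Python, with illustrative names that are not part of the sherpa-onnx API:

```python
# Illustrative restatement of VoiceActivityDetector::Impl::Flush() from the
# C++ hunk above.  `head` is the absolute index of the first sample still in
# the circular buffer, `buffer` its contents, `start` the absolute index where
# the pending speech segment began (-1 if none).  Names are illustrative only.
def flush(start, head, buffer, min_silence_samples, segments):
    if start == -1 or len(buffer) == 0:
        return start, head, buffer

    tail = head + len(buffer)
    end = tail - min_silence_samples   # drop the silence tail kept as look-ahead
    if end <= start:
        return start, head, buffer

    segments.append((start, buffer[start - head:end - head]))  # queue final segment
    buffer = buffer[end - head:]       # Pop(end - Head()): discard up to `end`
    return -1, end, buffer             # start_ is reset; head advances to `end`
```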
@@ -46,6 +46,10 @@ public class Vad {
     reset(this.ptr);
   }

+  public void flush() {
+    flush(this.ptr);
+  }
+
   public SpeechSegment front() {
     Object[] arr = front(this.ptr);
     int start = (int) arr[0];

@@ -75,4 +79,6 @@ public class Vad {
   private native boolean isSpeechDetected(long ptr);

   private native void reset(long ptr);
+
+  private native void flush(long ptr);
 }
@@ -173,3 +173,11 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_reset(JNIEnv * /*env*/,
   auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
   model->Reset();
 }
+
+SHERPA_ONNX_EXTERN_C
+JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/,
+                                                            jobject /*obj*/,
+                                                            jlong ptr) {
+  auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
+  model->Flush();
+}
@@ -52,6 +52,8 @@ class Vad(

     fun reset() = reset(ptr)

+    fun flush() = flush(ptr)
+
     private external fun delete(ptr: Long)

     private external fun newFromAsset(

@@ -70,6 +72,7 @@ class Vad(
     private external fun front(ptr: Long): Array<Any>
     private external fun isSpeechDetected(ptr: Long): Boolean
     private external fun reset(ptr: Long)
+    private external fun flush(ptr: Long)

     companion object {
         init {
@@ -38,6 +38,7 @@ void PybindVoiceActivityDetector(py::module *m) {
       .def("is_speech_detected", &PyClass::IsSpeechDetected,
           py::call_guard<py::gil_scoped_release>())
       .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
+      .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
       .def_property_readonly("front", &PyClass::Front);
 }

@@ -633,6 +633,10 @@ class SherpaOnnxVoiceActivityDetectorWrapper {
   func reset() {
     SherpaOnnxVoiceActivityDetectorReset(vad)
   }
+
+  func flush() {
+    SherpaOnnxVoiceActivityDetectorFlush(vad)
+  }
 }

 // offline tts