Fangjun Kuang
Committed by GitHub

Add Flush to VAD so that the last segment can be detected. (#1099)

... ... @@ -52,11 +52,6 @@ jobs:
cmake --build . --target install --config Release
rm -rf install/pkgconfig
- uses: actions/upload-artifact@v4
with:
name: windows-${{ matrix.arch }}
path: ./build/install/lib/
- name: Create tar file
shell: bash
run: |
... ... @@ -72,6 +67,11 @@ jobs:
ls -lh *.tar.bz2
mv *.tar.bz2 ../
- uses: actions/upload-artifact@v4
with:
name: windows-${{ matrix.arch }}
path: ./*.tar.bz2
# https://huggingface.co/docs/hub/spaces-github-actions
- name: Publish to huggingface
if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
... ... @@ -88,7 +88,9 @@ jobs:
rm -rf huggingface
export GIT_CLONE_PROTECTION_ACTIVE=false
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
export GIT_LFS_SKIP_SMUDGE=1
git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
cd huggingface
mkdir -p windows-for-dotnet
... ...
## 1.10.12
* Add Flush to VAD so that the last speech segment can be detected. See also
https://github.com/k2-fsa/sherpa-onnx/discussions/1077#discussioncomment-9979740
## 1.10.11
* Support the iOS platform for Flutter.
... ...
... ... @@ -10,8 +10,8 @@ project(sherpa-onnx)
# Remember to update
# ./nodejs-addon-examples
# ./dart-api-examples/
# ./sherpa-onnx/flutter/CHANGELOG.md
set(SHERPA_ONNX_VERSION "1.10.11")
# ./CHANGELOG.md
set(SHERPA_ONNX_VERSION "1.10.12")
# Disable warning about
#
... ...
... ... @@ -93,6 +93,28 @@ void main(List<String> arguments) async {
}
}
vad.flush();
while (!vad.isEmpty()) {
final stream = recognizer.createStream();
final segment = vad.front();
stream.acceptWaveform(
samples: segment.samples, sampleRate: waveData.sampleRate);
recognizer.decode(stream);
final result = recognizer.getResult(stream);
final startTime = segment.start * 1.0 / waveData.sampleRate;
final duration = segment.samples.length * 1.0 / waveData.sampleRate;
final stopTime = startTime + duration;
if (result.text != '') {
print(
'${startTime.toStringAsPrecision(4)} -- ${stopTime.toStringAsPrecision(4)}: ${result.text}');
}
stream.free();
vad.pop();
}
vad.free();
recognizer.free();
}
... ...
... ... @@ -10,7 +10,7 @@ environment:
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.10.11
sherpa_onnx: ^1.10.12
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -11,7 +11,7 @@ environment:
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.10.11
sherpa_onnx: ^1.10.12
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -8,7 +8,7 @@ environment:
# Add regular dependencies here.
dependencies:
sherpa_onnx: ^1.10.11
sherpa_onnx: ^1.10.12
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -65,6 +65,12 @@ void main(List<String> arguments) async {
}
}
vad.flush();
while (!vad.isEmpty()) {
allSamples.add(vad.front().samples);
vad.pop();
}
vad.free();
final s = Float32List.fromList(allSamples.expand((x) => x).toList());
... ...
... ... @@ -9,7 +9,7 @@ environment:
sdk: ^3.4.0
dependencies:
sherpa_onnx: ^1.10.11
sherpa_onnx: ^1.10.12
path: ^1.9.0
args: ^2.5.0
... ...
... ... @@ -57,6 +57,26 @@ class VadNonStreamingAsrParaformer
}
}
}
vad.Flush();
while (!vad.IsEmpty()) {
SpeechSegment segment = vad.Front();
float startTime = segment.Start / (float)sampleRate;
float duration = segment.Samples.Length / (float)sampleRate;
OfflineStream stream = recognizer.CreateStream();
stream.AcceptWaveform(sampleRate, segment.Samples);
recognizer.Decode(stream);
String text = stream.Result.Text;
if (!String.IsNullOrEmpty(text)) {
Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
String.Format("{0:0.00}", startTime+duration), text);
}
vad.Pop();
}
}
}
... ...
... ... @@ -5,7 +5,7 @@ description: >
publish_to: 'none'
version: 1.10.11
version: 1.10.12
topics:
- speech-recognition
... ... @@ -30,7 +30,7 @@ dependencies:
record: ^5.1.0
url_launcher: ^6.2.6
sherpa_onnx: ^1.10.11
sherpa_onnx: ^1.10.12
# sherpa_onnx:
# path: ../../flutter/sherpa_onnx
... ...
... ... @@ -17,7 +17,7 @@ dependencies:
cupertino_icons: ^1.0.6
path_provider: ^2.1.3
path: ^1.9.0
sherpa_onnx: ^1.10.11
sherpa_onnx: ^1.10.12
url_launcher: ^6.2.6
audioplayers: ^5.0.0
... ...
... ... @@ -491,6 +491,12 @@ typedef SherpaOnnxVoiceActivityDetectorResetNative = Void Function(
typedef SherpaOnnxVoiceActivityDetectorReset = void Function(
Pointer<SherpaOnnxVoiceActivityDetector>);
/// Native (C) signature of `SherpaOnnxVoiceActivityDetectorFlush`: takes a
/// pointer to the detector and returns nothing.
typedef SherpaOnnxVoiceActivityDetectorFlushNative = Void Function(
Pointer<SherpaOnnxVoiceActivityDetector>);
/// Dart-side signature matching [SherpaOnnxVoiceActivityDetectorFlushNative].
typedef SherpaOnnxVoiceActivityDetectorFlush = void Function(
Pointer<SherpaOnnxVoiceActivityDetector>);
typedef SherpaOnnxVoiceActivityDetectorFrontNative
= Pointer<SherpaOnnxSpeechSegment> Function(
Pointer<SherpaOnnxVoiceActivityDetector>);
... ... @@ -779,6 +785,8 @@ class SherpaOnnxBindings {
static SherpaOnnxVoiceActivityDetectorReset? voiceActivityDetectorReset;
static SherpaOnnxVoiceActivityDetectorFlush? voiceActivityDetectorFlush;
static SherpaOnnxCreateCircularBuffer? createCircularBuffer;
static SherpaOnnxDestroyCircularBuffer? destroyCircularBuffer;
... ... @@ -1036,6 +1044,11 @@ class SherpaOnnxBindings {
'SherpaOnnxVoiceActivityDetectorReset')
.asFunction();
voiceActivityDetectorFlush ??= dynamicLibrary
.lookup<NativeFunction<SherpaOnnxVoiceActivityDetectorFlushNative>>(
'SherpaOnnxVoiceActivityDetectorFlush')
.asFunction();
createCircularBuffer ??= dynamicLibrary
.lookup<NativeFunction<SherpaOnnxCreateCircularBufferNative>>(
'SherpaOnnxCreateCircularBuffer')
... ...
... ... @@ -207,6 +207,10 @@ class VoiceActivityDetector {
SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr);
}
/// Flushes the detector so that the last speech segment can be detected.
///
/// Call this once no more audio will be fed. Does nothing when the native
/// binding has not been looked up (the binding field is null).
void flush() {
  final nativeFlush = SherpaOnnxBindings.voiceActivityDetectorFlush;
  if (nativeFlush == null) {
    return;
  }

  nativeFlush(ptr);
}
Pointer<SherpaOnnxVoiceActivityDetector> ptr;
final VadModelConfig config;
}
... ...
... ... @@ -17,7 +17,7 @@ topics:
- voice-activity-detection
# remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
version: 1.10.11
version: 1.10.12
homepage: https://github.com/k2-fsa/sherpa-onnx
... ... @@ -30,19 +30,19 @@ dependencies:
flutter:
sdk: flutter
sherpa_onnx_android: ^1.10.11
sherpa_onnx_android: ^1.10.12
# path: ../sherpa_onnx_android
sherpa_onnx_macos: ^1.10.11
sherpa_onnx_macos: ^1.10.12
# path: ../sherpa_onnx_macos
sherpa_onnx_linux: ^1.10.11
sherpa_onnx_linux: ^1.10.12
# path: ../sherpa_onnx_linux
#
sherpa_onnx_windows: ^1.10.11
sherpa_onnx_windows: ^1.10.12
# path: ../sherpa_onnx_windows
sherpa_onnx_ios: ^1.10.11
sherpa_onnx_ios: ^1.10.12
# sherpa_onnx_ios:
# path: ../sherpa_onnx_ios
... ...
... ... @@ -7,7 +7,7 @@
# https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
Pod::Spec.new do |s|
s.name = 'sherpa_onnx_ios'
s.version = '1.10.11'
s.version = '1.10.12'
s.summary = 'A new Flutter FFI plugin project.'
s.description = <<-DESC
A new Flutter FFI plugin project.
... ...
... ... @@ -4,7 +4,7 @@
#
Pod::Spec.new do |s|
s.name = 'sherpa_onnx_macos'
s.version = '1.10.11'
s.version = '1.10.12'
s.summary = 'sherpa-onnx Flutter FFI plugin project.'
s.description = <<-DESC
sherpa-onnx Flutter FFI plugin project.
... ...
... ... @@ -98,6 +98,25 @@ public class VadNonStreamingParaformer {
}
}
vad.flush();
while (!vad.empty()) {
SpeechSegment segment = vad.front();
float startTime = segment.getStart() / 16000.0f;
float duration = segment.getSamples().length / 16000.0f;
OfflineStream stream = recognizer.createStream();
stream.acceptWaveform(segment.getSamples(), 16000);
recognizer.decode(stream);
String text = recognizer.getResult(stream).getText();
stream.release();
if (!text.isEmpty()) {
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
}
vad.pop();
}
vad.release();
recognizer.release();
}
... ...
... ... @@ -59,6 +59,16 @@ public class VadRemoveSilence {
}
}
vad.flush();
while (!vad.empty()) {
// if you want to get the starting time of this segment, you can use
/* float startTime = vad.front().getStart() / 16000.0f; */
segments.add(vad.front().getSamples());
vad.pop();
}
// get total number of samples
int n = 0;
for (float[] s : segments) {
... ...
{
"dependencies": {
"sherpa-onnx-node": "^1.10.6"
"sherpa-onnx-node": "^1.10.12"
}
}
... ...
... ... @@ -105,6 +105,12 @@ def main():
speech_samples.extend(vad.front.samples)
vad.pop()
vad.flush()
while not vad.empty():
speech_samples.extend(vad.front.samples)
vad.pop()
speech_samples = np.array(speech_samples, dtype=np.float32)
sf.write(args.output, speech_samples, samplerate=sample_rate)
... ...
... ... @@ -17,7 +17,7 @@ topics:
- voice-activity-detection
# remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
version: 1.10.6
version: 1.10.12
homepage: https://github.com/k2-fsa/sherpa-onnx
... ...
... ... @@ -53,6 +53,11 @@ namespace SherpaOnnx
SherpaOnnxVoiceActivityDetectorReset(_handle.Handle);
}
// Flush the detector so that the last speech segment can be detected.
// Invoke it at the end of the input, then drain the remaining segments.
// Forwards to the native SherpaOnnxVoiceActivityDetectorFlush function.
public void Flush()
{
SherpaOnnxVoiceActivityDetectorFlush(_handle.Handle);
}
public void Dispose()
{
Cleanup();
... ... @@ -106,5 +111,7 @@ namespace SherpaOnnx
[DllImport(Dll.Filename)]
private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle);
[DllImport(Dll.Filename)]
private static extern void SherpaOnnxVoiceActivityDetectorFlush(IntPtr handle);
}
}
... ...
... ... @@ -856,6 +856,10 @@ func (vad *VoiceActivityDetector) Reset() {
C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
}
// Flush forwards to the C function SherpaOnnxVoiceActivityDetectorFlush.
// Call it at the end of the input so that the last speech segment can be
// detected.
func (vad *VoiceActivityDetector) Flush() {
C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl)
}
// Spoken language identification
type SpokenLanguageIdentificationWhisperConfig struct {
... ...
... ... @@ -29,7 +29,7 @@ class CircularBuffer {
}
reset() {
return addon.circularBufferReset(this.handle);
addon.circularBufferReset(this.handle);
}
}
... ... @@ -79,7 +79,11 @@ config = {
}
reset() {
return addon.VoiceActivityDetectorResetWrapper(this.handle);
addon.VoiceActivityDetectorResetWrapper(this.handle);
}
flush() {
addon.VoiceActivityDetectorFlushWrapper(this.handle);
}
}
... ...
... ... @@ -590,6 +590,31 @@ static void VoiceActivityDetectorResetWrapper(const Napi::CallbackInfo &info) {
SherpaOnnxVoiceActivityDetectorReset(vad);
}
// Node-API wrapper around SherpaOnnxVoiceActivityDetectorFlush().
//
// Expects exactly one argument: an External holding a
// SherpaOnnxVoiceActivityDetector pointer. On any other input a JavaScript
// TypeError is raised and the detector is not touched.
static void VoiceActivityDetectorFlushWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  const bool has_single_argument = (info.Length() == 1);
  if (!has_single_argument) {
    std::ostringstream message;
    message << "Expect only 1 argument. Given: " << info.Length();

    Napi::TypeError::New(env, message.str()).ThrowAsJavaScriptException();
    return;
  }

  if (info[0].IsExternal()) {
    auto handle =
        info[0].As<Napi::External<SherpaOnnxVoiceActivityDetector>>();
    SherpaOnnxVoiceActivityDetectorFlush(handle.Data());
    return;
  }

  Napi::TypeError::New(env, "Argument 0 should be a VAD pointer.")
      .ThrowAsJavaScriptException();
}
void InitVad(Napi::Env env, Napi::Object exports) {
exports.Set(Napi::String::New(env, "createCircularBuffer"),
Napi::Function::New(env, CreateCircularBufferWrapper));
... ... @@ -636,4 +661,7 @@ void InitVad(Napi::Env env, Napi::Object exports) {
exports.Set(Napi::String::New(env, "voiceActivityDetectorReset"),
Napi::Function::New(env, VoiceActivityDetectorResetWrapper));
exports.Set(Napi::String::New(env, "voiceActivityDetectorFlush"),
Napi::Function::New(env, VoiceActivityDetectorFlushWrapper));
}
... ...
... ... @@ -876,6 +876,10 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Reset();
}
// C API entry point: forwards to Flush() on the wrapped detector so that the
// last speech segment can be detected at the end of the input.
// NOTE(review): p is dereferenced without a null check -- callers presumably
// must pass a valid detector; confirm.
void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Flush();
}
#if SHERPA_ONNX_ENABLE_TTS == 1
struct SherpaOnnxOfflineTts {
std::unique_ptr<sherpa_onnx::OfflineTts> impl;
... ...
... ... @@ -815,6 +815,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
SherpaOnnxVoiceActivityDetector *p);
// Flush the detector. Call it at the end of the input so that the last
// speech segment can be detected.
//
// @param p  Pointer to the voice activity detector to flush.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush(
SherpaOnnxVoiceActivityDetector *p);
// ============================================================
// For offline Text-to-Speech (i.e., non-streaming TTS)
// ============================================================
... ...
... ... @@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl {
start_ = -1;
}
void Flush() {
if (start_ == -1 || buffer_.Size() == 0) {
return;
}
int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
if (end <= start_) {
return;
}
std::vector<float> s = buffer_.Get(start_, end - start_);
SpeechSegment segment;
segment.start = start_;
segment.samples = std::move(s);
segments_.push(std::move(segment));
buffer_.Pop(end - buffer_.Head());
start_ = -1;
}
bool IsSpeechDetected() const { return start_ != -1; }
const VadModelConfig &GetConfig() const { return config_; }
... ... @@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const {
return impl_->Front();
}
void VoiceActivityDetector::Reset() { impl_->Reset(); }
void VoiceActivityDetector::Reset() const { impl_->Reset(); }
// Forwards to Impl::Flush() so that the last speech segment can be detected.
// NOTE(review): declared const although it goes through impl_ to a non-const
// Impl method -- confirm this is intended.
void VoiceActivityDetector::Flush() const { impl_->Flush(); }
bool VoiceActivityDetector::IsSpeechDetected() const {
return impl_->IsSpeechDetected();
... ...
... ... @@ -41,7 +41,11 @@ class VoiceActivityDetector {
bool IsSpeechDetected() const;
void Reset();
void Reset() const;
// At the end of the utterance, you can invoke this method so that
// the last speech segment can be detected.
void Flush() const;
const VadModelConfig &GetConfig() const;
... ...
... ... @@ -46,6 +46,10 @@ public class Vad {
reset(this.ptr);
}
/**
 * Flushes the detector so that the last speech segment can be detected.
 * Call it at the end of the input, then drain the remaining segments.
 * Delegates to the native {@code flush(long)} method with this object's
 * native pointer.
 */
public void flush() {
flush(this.ptr);
}
public SpeechSegment front() {
Object[] arr = front(this.ptr);
int start = (int) arr[0];
... ... @@ -75,4 +79,6 @@ public class Vad {
private native boolean isSpeechDetected(long ptr);
private native void reset(long ptr);
private native void flush(long ptr);
}
... ...
... ... @@ -173,3 +173,11 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_reset(JNIEnv * /*env*/,
auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
model->Reset();
}
// JNI entry point for com.k2fsa.sherpa.onnx.Vad.flush(long ptr).
// Treats ptr as a sherpa_onnx::VoiceActivityDetector* and calls Flush() on it.
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/,
                                                            jobject /*obj*/,
                                                            jlong ptr) {
  auto *vad = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
  vad->Flush();
}
... ...
... ... @@ -52,6 +52,8 @@ class Vad(
fun reset() = reset(ptr)
/**
 * Flushes the detector so that the last speech segment can be detected.
 * Call it at the end of the input. Delegates to the private external
 * `flush(ptr: Long)` declared in this class.
 */
fun flush() = flush(ptr)
private external fun delete(ptr: Long)
private external fun newFromAsset(
... ... @@ -70,6 +72,7 @@ class Vad(
private external fun front(ptr: Long): Array<Any>
private external fun isSpeechDetected(ptr: Long): Boolean
private external fun reset(ptr: Long)
private external fun flush(ptr: Long)
companion object {
init {
... ...
... ... @@ -38,6 +38,7 @@ void PybindVoiceActivityDetector(py::module *m) {
.def("is_speech_detected", &PyClass::IsSpeechDetected,
py::call_guard<py::gil_scoped_release>())
.def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
.def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
.def_property_readonly("front", &PyClass::Front);
}
... ...
... ... @@ -633,6 +633,10 @@ class SherpaOnnxVoiceActivityDetectorWrapper {
func reset() {
SherpaOnnxVoiceActivityDetectorReset(vad)
}
/// Flushes the detector so that the last speech segment can be detected.
/// Call it at the end of the input. Forwards to the C function
/// `SherpaOnnxVoiceActivityDetectorFlush`.
func flush() {
SherpaOnnxVoiceActivityDetectorFlush(vad)
}
}
// offline tts
... ...