Fangjun Kuang
Committed by GitHub

Add Go API for ten-vad (#2384)

... ... @@ -2,9 +2,10 @@ package main
import (
"fmt"
portaudio "github.com/csukuangfj/portaudio-go"
"github.com/gen2brain/malgo"
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
"log"
"os"
)
func main() {
... ... @@ -13,62 +14,79 @@ func main() {
config := sherpa.VadModelConfig{}
// Please download silero_vad.onnx from
// https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
// or ten-vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
if FileExists("./silero_vad.onnx") {
fmt.Println("Use silero-vad")
config.SileroVad.Model = "./silero_vad.onnx"
config.SileroVad.Threshold = 0.5
config.SileroVad.MinSilenceDuration = 0.5
config.SileroVad.MinSpeechDuration = 0.25
config.SileroVad.MaxSpeechDuration = 10
config.SileroVad.WindowSize = 512
} else if FileExists("./ten-vad.onnx") {
fmt.Println("Use ten-vad")
config.TenVad.Model = "./ten-vad.onnx"
config.TenVad.Threshold = 0.5
config.TenVad.MinSilenceDuration = 0.5
config.TenVad.MinSpeechDuration = 0.25
config.TenVad.MaxSpeechDuration = 10
config.TenVad.WindowSize = 256
} else {
fmt.Println("Please download either ./silero_vad.onnx or ./ten-vad.onnx")
return
}
config.SampleRate = 16000
config.NumThreads = 1
config.Provider = "cpu"
config.Debug = 1
windowSize := config.SileroVad.WindowSize
if config.TenVad.Model != "" {
windowSize = config.TenVad.WindowSize
}
var bufferSizeInSeconds float32 = 5
vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
defer sherpa.DeleteVoiceActivityDetector(vad)
err := portaudio.Initialize()
if err != nil {
log.Fatalf("Unable to initialize portaudio: %v\n", err)
}
defer portaudio.Terminate()
buffer := sherpa.NewCircularBuffer(10 * config.SampleRate)
defer sherpa.DeleteCircularBuffer(buffer)
default_device, err := portaudio.DefaultInputDevice()
if err != nil {
log.Fatal("Failed to get default input device: %v\n", err)
}
log.Printf("Selected default input device: %s\n", default_device.Name)
param := portaudio.StreamParameters{}
param.Input.Device = default_device
param.Input.Channels = 1
param.Input.Latency = default_device.DefaultLowInputLatency
ctx, err := malgo.InitContext(nil, malgo.ContextConfig{}, func(message string) {
fmt.Printf("LOG <%v>", message)
})
chk(err)
param.SampleRate = float64(config.SampleRate)
param.FramesPerBuffer = 0
param.Flags = portaudio.ClipOff
defer func() {
_ = ctx.Uninit()
ctx.Free()
}()
// you can choose another value for 0.1 if you want
samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
samples := make([]float32, samplesPerCall)
deviceConfig := malgo.DefaultDeviceConfig(malgo.Duplex)
deviceConfig.Capture.Format = malgo.FormatS16
deviceConfig.Capture.Channels = 1
deviceConfig.Playback.Format = malgo.FormatS16
deviceConfig.Playback.Channels = 1
deviceConfig.SampleRate = 16000
deviceConfig.Alsa.NoMMap = 1
s, err := portaudio.OpenStream(param, samples)
if err != nil {
log.Fatalf("Failed to open the stream")
}
defer s.Close()
chk(s.Start())
log.Print("Started! Please speak")
printed := false
k := 0
for {
chk(s.Read())
vad.AcceptWaveform(samples)
onRecvFrames := func(_, pSample []byte, framecount uint32) {
samples := samplesInt16ToFloat(pSample)
buffer.Push(samples)
for buffer.Size() >= windowSize {
head := buffer.Head()
s := buffer.Get(head, windowSize)
buffer.Pop(windowSize)
vad.AcceptWaveform(s)
if vad.IsSpeech() && !printed {
printed = true
... ... @@ -101,8 +119,22 @@ func main() {
log.Print("----------\n")
}
}
}
captureCallbacks := malgo.DeviceCallbacks{
Data: onRecvFrames,
}
device, err := malgo.InitDevice(ctx.Context, deviceConfig, captureCallbacks)
chk(err)
err = device.Start()
chk(err)
fmt.Println("Started. Please speak. Press ctrl + C to exit")
fmt.Scanln()
device.Uninit()
chk(s.Stop())
}
func chk(err error) {
... ... @@ -110,3 +142,25 @@ func chk(err error) {
panic(err)
}
}
// samplesInt16ToFloat converts raw little-endian signed 16-bit PCM bytes into
// float32 samples, dividing each decoded value by 32768.
//
// Every consecutive pair of bytes yields one output sample; a trailing
// unpaired byte is ignored.
func samplesInt16ToFloat(inSamples []byte) []float32 {
	converted := make([]float32, len(inSamples)/2)
	for idx := range converted {
		lo := uint16(inSamples[2*idx])
		hi := uint16(inSamples[2*idx+1])
		// Reassemble the 16-bit word first, then reinterpret it as signed so
		// that values with the top bit set come out negative.
		pcm := int16(hi<<8 | lo)
		converted[idx] = float32(pcm) / 32768
	}
	return converted
}
// FileExists reports whether os.Stat succeeds for path.
//
// Any Stat error, not only "does not exist", is reported as false.
func FileExists(path string) bool {
	_, statErr := os.Stat(path)
	return statErr == nil
}
... ...
... ... @@ -3,7 +3,11 @@
set -ex
if [ ! -f ./silero_vad.onnx ]; then
curl -SL -O https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi
if [ ! -f ./ten-vad.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
fi
go mod tidy
... ...
... ... @@ -1142,8 +1142,18 @@ type SileroVadModelConfig struct {
MaxSpeechDuration float32
}
// TenVadModelConfig holds the settings for the ten-vad model.
//
// It is embedded as the TenVad field of VadModelConfig, and
// NewVoiceActivityDetector copies every field into the C-side ten_vad config.
type TenVadModelConfig struct {
// Model is the path to the ten-vad ONNX model file, e.g. "./ten-vad.onnx".
Model string
// Threshold is passed through as a C float.
// NOTE(review): presumably the speech-probability cut-off (the example uses 0.5) — confirm.
Threshold float32
// MinSilenceDuration is passed through as a C float.
// NOTE(review): presumably in seconds (the example uses 0.5) — confirm.
MinSilenceDuration float32
// MinSpeechDuration is passed through as a C float.
// NOTE(review): presumably in seconds (the example uses 0.25) — confirm.
MinSpeechDuration float32
// WindowSize is passed through as a C int. The example sets it to 256 and
// feeds the detector that many samples per AcceptWaveform call.
WindowSize int
// MaxSpeechDuration is passed through as a C float.
// NOTE(review): presumably in seconds (the example uses 10) — confirm.
MaxSpeechDuration float32
}
type VadModelConfig struct {
SileroVad SileroVadModelConfig
TenVad TenVadModelConfig
SampleRate int
NumThreads int
Provider string
... ... @@ -1220,6 +1230,15 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)
c.ten_vad.model = C.CString(config.TenVad.Model)
defer C.free(unsafe.Pointer(c.ten_vad.model))
c.ten_vad.threshold = C.float(config.TenVad.Threshold)
c.ten_vad.min_silence_duration = C.float(config.TenVad.MinSilenceDuration)
c.ten_vad.min_speech_duration = C.float(config.TenVad.MinSpeechDuration)
c.ten_vad.window_size = C.int(config.TenVad.WindowSize)
c.ten_vad.max_speech_duration = C.float(config.TenVad.MaxSpeechDuration)
c.sample_rate = C.int(config.SampleRate)
c.num_threads = C.int(config.NumThreads)
c.provider = C.CString(config.Provider)
... ...