Fangjun Kuang
Committed by GitHub

Add Golang API for VAD (#708)

@@ -86,3 +86,4 @@ vits-piper-* @@ -86,3 +86,4 @@ vits-piper-*
86 vits-coqui-* 86 vits-coqui-*
87 vits-mms-* 87 vits-mms-*
88 *.tar.bz2 88 *.tar.bz2
  89 +sherpa-onnx-paraformer-trilingual-zh-cantonese-en
@@ -6,4 +6,21 @@ Please refer to the documentation @@ -6,4 +6,21 @@ Please refer to the documentation
6 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html 6 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html
7 for details. 7 for details.
8 8
  9 +- [./non-streaming-decode-files](./non-streaming-decode-files) It shows how to use
  10 + a non-streaming ASR model to decode files
  11 +
  12 +- [./non-streaming-tts](./non-streaming-tts) It shows how to use a non-streaming TTS
  13 + model to convert text to speech
  14 +
  15 +- [./real-time-speech-recognition-from-microphone](./real-time-speech-recognition-from-microphone)
  16 + It shows how to use a streaming ASR model to recognize speech from a microphone in real-time
  17 +
  18 +- [./vad](./vad) It shows how to use silero VAD with Golang.
  19 +
  20 +- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper
  21 + for speech recognition.
  22 +
  23 +- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
  24 + for speech recognition.
  25 +
9 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx 26 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
@@ -57,8 +57,7 @@ func main() { @@ -57,8 +57,7 @@ func main() {
57 log.Println("Done!") 57 log.Println("Done!")
58 58
59 ok := audio.Save(filename) 59 ok := audio.Save(filename)
60 - if ok != 1 { 60 + if !ok {
61 log.Fatalf("Failed to write", filename) 61 log.Fatalf("Failed to write", filename)
62 } 62 }
63 -  
64 } 63 }
  1 +module vad-asr-paraformer
  2 +
  3 +go 1.12
  1 +package main
  2 +
  3 +import (
  4 + "fmt"
  5 + "github.com/gordonklaus/portaudio"
  6 + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
  7 + "log"
  8 + "strings"
  9 +)
  10 +
  11 +func main() {
  12 + log.SetFlags(log.LstdFlags | log.Lmicroseconds)
  13 +
  14 + // 1. Create VAD
  15 + config := sherpa.VadModelConfig{}
  16 +
  17 + // Please download silero_vad.onnx from
  18 + // https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  19 +
  20 + config.SileroVad.Model = "./silero_vad.onnx"
  21 + config.SileroVad.Threshold = 0.5
  22 + config.SileroVad.MinSilenceDuration = 0.5
  23 + config.SileroVad.MinSpeechDuration = 0.25
  24 + config.SileroVad.WindowSize = 512
  25 + config.SampleRate = 16000
  26 + config.NumThreads = 1
  27 + config.Provider = "cpu"
  28 + config.Debug = 1
  29 +
  30 + var bufferSizeInSeconds float32 = 20
  31 +
  32 + vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
  33 + defer sherpa.DeleteVoiceActivityDetector(vad)
  34 +
  35 + // 2. Create ASR recognizer
  36 +
  37 + c := sherpa.OfflineRecognizerConfig{}
  38 + c.FeatConfig.SampleRate = 16000
  39 + c.FeatConfig.FeatureDim = 80
  40 +
  41 + // Please download the model from
  42 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
  43 + c.ModelConfig.Paraformer.Model = "./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx"
  44 + c.ModelConfig.Tokens = "./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt"
  45 + c.ModelConfig.NumThreads = 2
  46 + c.ModelConfig.Debug = 1
  47 + c.ModelConfig.Provider = "cpu"
  48 +
  49 + recognizer := sherpa.NewOfflineRecognizer(&c)
  50 + defer sherpa.DeleteOfflineRecognizer(recognizer)
  51 +
  52 + err := portaudio.Initialize()
  53 + if err != nil {
  54 + log.Fatalf("Unable to initialize portaudio: %v\n", err)
  55 + }
  56 + defer portaudio.Terminate()
  57 +
  58 + default_device, err := portaudio.DefaultInputDevice()
  59 + if err != nil {
  60 + log.Fatal("Failed to get default input device: %v\n", err)
  61 + }
  62 + log.Printf("Selected default input device: %s\n", default_device.Name)
  63 + param := portaudio.StreamParameters{}
  64 + param.Input.Device = default_device
  65 + param.Input.Channels = 1
  66 + param.Input.Latency = default_device.DefaultHighInputLatency
  67 +
  68 + param.SampleRate = float64(config.SampleRate)
  69 + param.FramesPerBuffer = 0
  70 + param.Flags = portaudio.ClipOff
  71 +
  72 + // you can choose another value for 0.1 if you want
  73 + samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
  74 + samples := make([]float32, samplesPerCall)
  75 +
  76 + s, err := portaudio.OpenStream(param, samples)
  77 + if err != nil {
  78 + log.Fatalf("Failed to open the stream")
  79 + }
  80 +
  81 + defer s.Close()
  82 + chk(s.Start())
  83 +
  84 + log.Print("Started! Please speak")
  85 + printed := false
  86 +
  87 + k := 0
  88 + for {
  89 + chk(s.Read())
  90 + vad.AcceptWaveform(samples)
  91 +
  92 + if vad.IsSpeech() && !printed {
  93 + printed = true
  94 + log.Print("Detected speech\n")
  95 + }
  96 +
  97 + if !vad.IsSpeech() {
  98 + printed = false
  99 + }
  100 +
  101 + for !vad.IsEmpty() {
  102 + speechSegment := vad.Front()
  103 + vad.Pop()
  104 +
  105 + duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
  106 +
  107 + audio := &sherpa.GeneratedAudio{}
  108 + audio.Samples = speechSegment.Samples
  109 + audio.SampleRate = config.SampleRate
  110 +
  111 + // Now decode it
  112 + go decode(recognizer, audio, k)
  113 +
  114 + k += 1
  115 +
  116 + log.Printf("Duration: %.2f seconds\n", duration)
  117 + }
  118 + }
  119 +
  120 + chk(s.Stop())
  121 +}
  122 +
  123 +func decode(recognizer *sherpa.OfflineRecognizer, audio *sherpa.GeneratedAudio, id int) {
  124 + stream := sherpa.NewOfflineStream(recognizer)
  125 + defer sherpa.DeleteOfflineStream(stream)
  126 + stream.AcceptWaveform(audio.SampleRate, audio.Samples)
  127 + recognizer.Decode(stream)
  128 + result := stream.GetResult()
  129 + text := strings.ToLower(result.Text)
  130 + text = strings.Trim(text, " ")
  131 + log.Println(text)
  132 +
  133 + duration := float32(len(audio.Samples)) / float32(audio.SampleRate)
  134 +
  135 + filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, text)
  136 + ok := audio.Save(filename)
  137 + if ok {
  138 + log.Printf("Saved to %s", filename)
  139 + }
  140 + log.Print("----------\n")
  141 +}
  142 +
  143 +func chk(err error) {
  144 + if err != nil {
  145 + panic(err)
  146 + }
  147 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +
  4 +if [ ! -f ./silero_vad.onnx ]; then
  5 + curl -SL -O https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  6 +fi
  7 +
  8 +if [ ! -f ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx ]; then
  9 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
  10 + tar xvf sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
  11 + rm sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
  12 +fi
  13 +
  14 +go mod tidy
  15 +go build
  16 +./vad-asr-paraformer
  1 +module vad-asr-whisper
  2 +
  3 +go 1.12
  1 +package main
  2 +
  3 +import (
  4 + "fmt"
  5 + "github.com/gordonklaus/portaudio"
  6 + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
  7 + "log"
  8 + "strings"
  9 +)
  10 +
  11 +func main() {
  12 + log.SetFlags(log.LstdFlags | log.Lmicroseconds)
  13 +
  14 + // 1. Create VAD
  15 + config := sherpa.VadModelConfig{}
  16 +
  17 + // Please download silero_vad.onnx from
  18 + // https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  19 +
  20 + config.SileroVad.Model = "./silero_vad.onnx"
  21 + config.SileroVad.Threshold = 0.5
  22 + config.SileroVad.MinSilenceDuration = 0.5
  23 + config.SileroVad.MinSpeechDuration = 0.25
  24 + config.SileroVad.WindowSize = 512
  25 + config.SampleRate = 16000
  26 + config.NumThreads = 1
  27 + config.Provider = "cpu"
  28 + config.Debug = 1
  29 +
  30 + var bufferSizeInSeconds float32 = 20
  31 +
  32 + vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
  33 + defer sherpa.DeleteVoiceActivityDetector(vad)
  34 +
  35 + // 2. Create ASR recognizer
  36 +
  37 + c := sherpa.OfflineRecognizerConfig{}
  38 + c.FeatConfig.SampleRate = 16000
  39 + c.FeatConfig.FeatureDim = 80
  40 + c.ModelConfig.Whisper.Encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
  41 + c.ModelConfig.Whisper.Decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
  42 + c.ModelConfig.Tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
  43 + c.ModelConfig.NumThreads = 2
  44 + c.ModelConfig.Debug = 1
  45 + c.ModelConfig.Provider = "cpu"
  46 +
  47 + recognizer := sherpa.NewOfflineRecognizer(&c)
  48 + defer sherpa.DeleteOfflineRecognizer(recognizer)
  49 +
  50 + err := portaudio.Initialize()
  51 + if err != nil {
  52 + log.Fatalf("Unable to initialize portaudio: %v\n", err)
  53 + }
  54 + defer portaudio.Terminate()
  55 +
  56 + default_device, err := portaudio.DefaultInputDevice()
  57 + if err != nil {
  58 + log.Fatal("Failed to get default input device: %v\n", err)
  59 + }
  60 + log.Printf("Selected default input device: %s\n", default_device.Name)
  61 + param := portaudio.StreamParameters{}
  62 + param.Input.Device = default_device
  63 + param.Input.Channels = 1
  64 + param.Input.Latency = default_device.DefaultHighInputLatency
  65 +
  66 + param.SampleRate = float64(config.SampleRate)
  67 + param.FramesPerBuffer = 0
  68 + param.Flags = portaudio.ClipOff
  69 +
  70 + // you can choose another value for 0.1 if you want
  71 + samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
  72 + samples := make([]float32, samplesPerCall)
  73 +
  74 + s, err := portaudio.OpenStream(param, samples)
  75 + if err != nil {
  76 + log.Fatalf("Failed to open the stream")
  77 + }
  78 +
  79 + defer s.Close()
  80 + chk(s.Start())
  81 +
  82 + log.Print("Started! Please speak")
  83 + printed := false
  84 +
  85 + k := 0
  86 + for {
  87 + chk(s.Read())
  88 + vad.AcceptWaveform(samples)
  89 +
  90 + if vad.IsSpeech() && !printed {
  91 + printed = true
  92 + log.Print("Detected speech\n")
  93 + }
  94 +
  95 + if !vad.IsSpeech() {
  96 + printed = false
  97 + }
  98 +
  99 + for !vad.IsEmpty() {
  100 + speechSegment := vad.Front()
  101 + vad.Pop()
  102 +
  103 + duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
  104 +
  105 + audio := &sherpa.GeneratedAudio{}
  106 + audio.Samples = speechSegment.Samples
  107 + audio.SampleRate = config.SampleRate
  108 +
  109 + // Now decode it
  110 + go decode(recognizer, audio, k)
  111 +
  112 + k += 1
  113 +
  114 + log.Printf("Duration: %.2f seconds\n", duration)
  115 + }
  116 + }
  117 +
  118 + chk(s.Stop())
  119 +}
  120 +
  121 +func decode(recognizer *sherpa.OfflineRecognizer, audio *sherpa.GeneratedAudio, id int) {
  122 + stream := sherpa.NewOfflineStream(recognizer)
  123 + defer sherpa.DeleteOfflineStream(stream)
  124 + stream.AcceptWaveform(audio.SampleRate, audio.Samples)
  125 + recognizer.Decode(stream)
  126 + result := stream.GetResult()
  127 + text := strings.ToLower(result.Text)
  128 + text = strings.Trim(text, " ")
  129 + log.Println(text)
  130 +
  131 + duration := float32(len(audio.Samples)) / float32(audio.SampleRate)
  132 +
  133 + filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, text)
  134 + ok := audio.Save(filename)
  135 + if ok {
  136 + log.Printf("Saved to %s", filename)
  137 + }
  138 + log.Print("----------\n")
  139 +}
  140 +
  141 +func chk(err error) {
  142 + if err != nil {
  143 + panic(err)
  144 + }
  145 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +
  4 +if [ ! -f ./silero_vad.onnx ]; then
  5 + curl -SL -O https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  6 +fi
  7 +
  8 +if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
  9 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
  10 + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  11 + rm sherpa-onnx-whisper-tiny.en.tar.bz2
  12 +fi
  13 +
  14 +go mod tidy
  15 +go build
  16 +./vad-asr-whisper
  1 +module vad
  2 +
  3 +go 1.12
  1 +package main
  2 +
  3 +import (
  4 + "fmt"
  5 + "github.com/gordonklaus/portaudio"
  6 + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
  7 + "log"
  8 +)
  9 +
  10 +func main() {
  11 + log.SetFlags(log.LstdFlags | log.Lmicroseconds)
  12 +
  13 + config := sherpa.VadModelConfig{}
  14 +
  15 + // Please download silero_vad.onnx from
  16 + // https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  17 +
  18 + config.SileroVad.Model = "./silero_vad.onnx"
  19 + config.SileroVad.Threshold = 0.5
  20 + config.SileroVad.MinSilenceDuration = 0.5
  21 + config.SileroVad.MinSpeechDuration = 0.25
  22 + config.SileroVad.WindowSize = 512
  23 + config.SampleRate = 16000
  24 + config.NumThreads = 1
  25 + config.Provider = "cpu"
  26 + config.Debug = 1
  27 +
  28 + var bufferSizeInSeconds float32 = 5
  29 +
  30 + vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
  31 + defer sherpa.DeleteVoiceActivityDetector(vad)
  32 +
  33 + err := portaudio.Initialize()
  34 + if err != nil {
  35 + log.Fatalf("Unable to initialize portaudio: %v\n", err)
  36 + }
  37 + defer portaudio.Terminate()
  38 +
  39 + default_device, err := portaudio.DefaultInputDevice()
  40 + if err != nil {
  41 + log.Fatal("Failed to get default input device: %v\n", err)
  42 + }
  43 + log.Printf("Selected default input device: %s\n", default_device.Name)
  44 + param := portaudio.StreamParameters{}
  45 + param.Input.Device = default_device
  46 + param.Input.Channels = 1
  47 + param.Input.Latency = default_device.DefaultLowInputLatency
  48 +
  49 + param.SampleRate = float64(config.SampleRate)
  50 + param.FramesPerBuffer = 0
  51 + param.Flags = portaudio.ClipOff
  52 +
  53 + // you can choose another value for 0.1 if you want
  54 + samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
  55 + samples := make([]float32, samplesPerCall)
  56 +
  57 + s, err := portaudio.OpenStream(param, samples)
  58 + if err != nil {
  59 + log.Fatalf("Failed to open the stream")
  60 + }
  61 +
  62 + defer s.Close()
  63 + chk(s.Start())
  64 +
  65 + log.Print("Started! Please speak")
  66 + printed := false
  67 +
  68 + k := 0
  69 + for {
  70 + chk(s.Read())
  71 + vad.AcceptWaveform(samples)
  72 +
  73 + if vad.IsSpeech() && !printed {
  74 + printed = true
  75 + log.Print("Detected speech\n")
  76 + }
  77 +
  78 + if !vad.IsSpeech() {
  79 + printed = false
  80 + }
  81 +
  82 + for !vad.IsEmpty() {
  83 + speechSegment := vad.Front()
  84 + vad.Pop()
  85 +
  86 + duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
  87 +
  88 + audio := sherpa.GeneratedAudio{}
  89 + audio.Samples = speechSegment.Samples
  90 + audio.SampleRate = config.SampleRate
  91 +
  92 + filename := fmt.Sprintf("seg-%d-%.2f-seconds.wav", k, duration)
  93 + ok := audio.Save(filename)
  94 + if ok {
  95 + log.Printf("Saved to %s", filename)
  96 + }
  97 +
  98 + k += 1
  99 +
  100 + log.Printf("Duration: %.2f seconds\n", duration)
  101 + log.Print("----------\n")
  102 + }
  103 + }
  104 +
  105 + chk(s.Stop())
  106 +}
  107 +
  108 +func chk(err error) {
  109 + if err != nil {
  110 + panic(err)
  111 + }
  112 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +
  4 +if [ ! -f ./silero_vad.onnx ]; then
  5 + curl -SL -O https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
  6 +fi
  7 +
  8 +go mod tidy
  9 +go build
  10 +./vad
@@ -235,6 +235,12 @@ def get_vits_models() -> List[TtsModel]: @@ -235,6 +235,12 @@ def get_vits_models() -> List[TtsModel]:
235 return [ 235 return [
236 # Chinese 236 # Chinese
237 TtsModel( 237 TtsModel(
  238 + model_dir="vits-icefall-zh-aishell3",
  239 + model_name="model.onnx",
  240 + lang="zh",
  241 + rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst",
  242 + ),
  243 + TtsModel(
238 model_dir="vits-zh-aishell3", 244 model_dir="vits-zh-aishell3",
239 model_name="vits-aishell3.onnx", 245 model_name="vits-aishell3.onnx",
240 lang="zh", 246 lang="zh",
  1 +module vad-asr-paraformer
  2 +
  3 +go 1.12
  4 +
  5 +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
  6 +
  7 +require (
  8 + github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5
  9 + github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx v0.0.0-00010101000000-000000000000
  10 +)
  1 +../../../../go-api-examples/vad-asr-paraformer/main.go
  1 +../../../../go-api-examples/vad-asr-paraformer/run.sh
  1 +module vad-asr-whisper
  2 +
  3 +go 1.12
  4 +
  5 +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
  1 +../../../../go-api-examples/vad-asr-whisper/main.go
  1 +../../../../go-api-examples/vad-asr-whisper/run.sh
  1 +module vad
  2 +
  3 +go 1.12
  4 +
  5 +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
  1 +../../../../go-api-examples/vad/main.go
  1 +../../../../go-api-examples/vad/run.sh
@@ -614,6 +614,9 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA @@ -614,6 +614,9 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
614 ans.SampleRate = int(audio.sample_rate) 614 ans.SampleRate = int(audio.sample_rate)
615 n := int(audio.n) 615 n := int(audio.n)
616 ans.Samples = make([]float32, n) 616 ans.Samples = make([]float32, n)
  617 +
  618 + // see https://stackoverflow.com/questions/48756732/what-does-1-30c-yourtype-do-exactly-in-cgo
  619 + // :n:n means 0:n:n, means low:high:capacity
617 samples := (*[1 << 28]C.float)(unsafe.Pointer(audio.samples))[:n:n] 620 samples := (*[1 << 28]C.float)(unsafe.Pointer(audio.samples))[:n:n]
618 // copy(ans.Samples, samples) 621 // copy(ans.Samples, samples)
619 for i := 0; i < n; i++ { 622 for i := 0; i < n; i++ {
@@ -623,11 +626,160 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA @@ -623,11 +626,160 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
623 return ans 626 return ans
624 } 627 }
625 628
626 -func (audio *GeneratedAudio) Save(filename string) int { 629 +func (audio *GeneratedAudio) Save(filename string) bool {
627 s := C.CString(filename) 630 s := C.CString(filename)
628 defer C.free(unsafe.Pointer(s)) 631 defer C.free(unsafe.Pointer(s))
629 632
630 ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s)) 633 ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s))
631 634
632 - return ok 635 + return ok == 1
  636 +}
  637 +
  638 +// ============================================================
  639 +// For VAD
  640 +// ============================================================
  // SileroVadModelConfig holds the silero VAD settings that
  // NewVoiceActivityDetector copies into the C configuration struct.
  type SileroVadModelConfig struct {
  	Model              string  // passed to C as silero_vad.model; the examples use the path of an ONNX file
  	Threshold          float32 // passed to C as silero_vad.threshold
  	MinSilenceDuration float32 // passed to C as silero_vad.min_silence_duration; presumably seconds (TODO confirm)
  	MinSpeechDuration  float32 // passed to C as silero_vad.min_speech_duration; presumably seconds (TODO confirm)
  	WindowSize         int     // passed to C as silero_vad.window_size; presumably a sample count (TODO confirm)
  }
  648 +
  // VadModelConfig is the configuration consumed by NewVoiceActivityDetector.
  type VadModelConfig struct {
  	SileroVad  SileroVadModelConfig // settings of the silero VAD model
  	SampleRate int                  // passed to C as sample_rate
  	NumThreads int                  // passed to C as num_threads
  	Provider   string               // passed to C as provider
  	Debug      int                  // passed to C as debug
  }
  656 +
  // CircularBuffer wraps a C SherpaOnnxCircularBuffer.
  // Create it with NewCircularBuffer and release it with DeleteCircularBuffer.
  type CircularBuffer struct {
  	impl *C.struct_SherpaOnnxCircularBuffer // handle to the C object; set to nil by DeleteCircularBuffer
  }
  660 +
  661 +func DeleteCircularBuffer(buffer *CircularBuffer) {
  662 + C.SherpaOnnxDestroyCircularBuffer(buffer.impl)
  663 + buffer.impl = nil
  664 +}
  665 +
  666 +func NewCircularBuffer(capacity int) *CircularBuffer {
  667 + circularBuffer := &CircularBuffer{}
  668 + circularBuffer.impl = C.SherpaOnnxCreateCircularBuffer(C.int(capacity))
  669 + return circularBuffer
  670 +}
  671 +
  672 +func (buffer *CircularBuffer) Push(samples []float32) {
  673 + C.SherpaOnnxCircularBufferPush(buffer.impl, (*C.float)(&samples[0]), C.int(len(samples)))
  674 +}
  675 +
  676 +func (buffer *CircularBuffer) Get(start int, n int) []float32 {
  677 + samples := C.SherpaOnnxCircularBufferGet(buffer.impl, C.int(start), C.int(n))
  678 + defer C.SherpaOnnxCircularBufferFree(samples)
  679 +
  680 + result := make([]float32, n)
  681 +
  682 + p := (*[1 << 28]C.float)(unsafe.Pointer(samples))[:n:n]
  683 + for i := 0; i < n; i++ {
  684 + result[i] = float32(p[i])
  685 + }
  686 +
  687 + return result
  688 +}
  689 +
  690 +func (buffer *CircularBuffer) Pop(n int) {
  691 + C.SherpaOnnxCircularBufferPop(buffer.impl, C.int(n))
  692 +}
  693 +
  694 +func (buffer *CircularBuffer) Size() int {
  695 + return int(C.SherpaOnnxCircularBufferSize(buffer.impl))
  696 +}
  697 +
  698 +func (buffer *CircularBuffer) Head() int {
  699 + return int(C.SherpaOnnxCircularBufferHead(buffer.impl))
  700 +}
  701 +
  702 +func (buffer *CircularBuffer) Reset() {
  703 + C.SherpaOnnxCircularBufferReset(buffer.impl)
  704 +}
  705 +
  // SpeechSegment is a Go copy of a speech segment returned by
  // VoiceActivityDetector.Front.
  type SpeechSegment struct {
  	Start   int       // copied from the C segment's start field; presumably a sample index (TODO confirm)
  	Samples []float32 // samples of the segment, copied out of C memory
  }
  710 +
  // VoiceActivityDetector wraps a C SherpaOnnxVoiceActivityDetector.
  // Create it with NewVoiceActivityDetector and release it with
  // DeleteVoiceActivityDetector.
  type VoiceActivityDetector struct {
  	impl *C.struct_SherpaOnnxVoiceActivityDetector // handle to the C object; set to nil by DeleteVoiceActivityDetector
  }
  714 +
  715 +func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector {
  716 + c := C.struct_SherpaOnnxVadModelConfig{}
  717 +
  718 + c.silero_vad.model = C.CString(config.SileroVad.Model)
  719 + defer C.free(unsafe.Pointer(c.silero_vad.model))
  720 +
  721 + c.silero_vad.threshold = C.float(config.SileroVad.Threshold)
  722 + c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
  723 + c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
  724 + c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
  725 +
  726 + c.sample_rate = C.int(config.SampleRate)
  727 + c.num_threads = C.int(config.NumThreads)
  728 + c.provider = C.CString(config.Provider)
  729 + defer C.free(unsafe.Pointer(c.provider))
  730 +
  731 + c.debug = C.int(config.Debug)
  732 +
  733 + vad := &VoiceActivityDetector{}
  734 + vad.impl = C.SherpaOnnxCreateVoiceActivityDetector(&c, C.float(bufferSizeInSeconds))
  735 +
  736 + return vad
  737 +}
  738 +
  739 +func DeleteVoiceActivityDetector(vad *VoiceActivityDetector) {
  740 + C.SherpaOnnxDestroyVoiceActivityDetector(vad.impl)
  741 + vad.impl = nil
  742 +}
  743 +
  744 +func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32) {
  745 + C.SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad.impl, (*C.float)(&samples[0]), C.int(len(samples)))
  746 +}
  747 +
  748 +func (vad *VoiceActivityDetector) IsEmpty() bool {
  749 + return 1 == int(C.SherpaOnnxVoiceActivityDetectorEmpty(vad.impl))
  750 +}
  751 +
  752 +func (vad *VoiceActivityDetector) IsSpeech() bool {
  753 + return 1 == int(C.SherpaOnnxVoiceActivityDetectorDetected(vad.impl))
  754 +}
  755 +
  756 +func (vad *VoiceActivityDetector) Pop() {
  757 + C.SherpaOnnxVoiceActivityDetectorPop(vad.impl)
  758 +}
  759 +
  760 +func (vad *VoiceActivityDetector) Clear() {
  761 + C.SherpaOnnxVoiceActivityDetectorClear(vad.impl)
  762 +}
  763 +
  764 +func (vad *VoiceActivityDetector) Front() *SpeechSegment {
  765 + f := C.SherpaOnnxVoiceActivityDetectorFront(vad.impl)
  766 + defer C.SherpaOnnxDestroySpeechSegment(f)
  767 +
  768 + ans := &SpeechSegment{}
  769 + ans.Start = int(f.start)
  770 +
  771 + n := int(f.n)
  772 + ans.Samples = make([]float32, n)
  773 +
  774 + samples := (*[1 << 28]C.float)(unsafe.Pointer(f.samples))[:n:n]
  775 +
  776 + for i := 0; i < n; i++ {
  777 + ans.Samples[i] = float32(samples[i])
  778 + }
  779 +
  780 + return ans
  781 +}
  782 +
  783 +func (vad *VoiceActivityDetector) Reset() {
  784 + C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
633 } 785 }
@@ -309,6 +309,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( @@ -309,6 +309,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
309 309
310 recognizer_config.model_config.whisper.task = 310 recognizer_config.model_config.whisper.task =
311 SHERPA_ONNX_OR(config->model_config.whisper.task, "transcribe"); 311 SHERPA_ONNX_OR(config->model_config.whisper.task, "transcribe");
  312 + if (recognizer_config.model_config.whisper.task.empty()) {
  313 + recognizer_config.model_config.whisper.task = "transcribe";
  314 + }
312 315
313 recognizer_config.model_config.tdnn.model = 316 recognizer_config.model_config.tdnn.model =
314 SHERPA_ONNX_OR(config->model_config.tdnn.model, ""); 317 SHERPA_ONNX_OR(config->model_config.tdnn.model, "");
@@ -331,6 +334,11 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( @@ -331,6 +334,11 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
331 334
332 recognizer_config.decoding_method = 335 recognizer_config.decoding_method =
333 SHERPA_ONNX_OR(config->decoding_method, "greedy_search"); 336 SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
  337 +
  338 + if (recognizer_config.decoding_method.empty()) {
  339 + recognizer_config.decoding_method = "greedy_search";
  340 + }
  341 +
334 recognizer_config.max_active_paths = 342 recognizer_config.max_active_paths =
335 SHERPA_ONNX_OR(config->max_active_paths, 4); 343 SHERPA_ONNX_OR(config->max_active_paths, 4);
336 344