Add Golang API for spoken language identification. (#709)

Fangjun Kuang · GitHub
Commit a042f440767ef6bd78b696db68318e13da852aad a042f440 1 parent 12efbf73
go-api-examples/README.md
go-api-examples/vad-spoken-language-identification/go.mod
go-api-examples/vad-spoken-language-identification/main.go
go-api-examples/vad-spoken-language-identification/run.sh
scripts/go/_internal/vad-spoken-language-identification/.gitignore
scripts/go/_internal/vad-spoken-language-identification/go.mod
scripts/go/_internal/vad-spoken-language-identification/main.go
scripts/go/_internal/vad-spoken-language-identification/run.sh
scripts/go/sherpa_onnx.go
sherpa-onnx/csrc/spoken-language-identification.cc
--- a/go-api-examples/README.md
查看文件 @a042f44
+++ b/go-api-examples/README.md
查看文件 @a042f44
@@ -23,4 +23,7 @@ for details.
 - [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
   for speech recognition.
 
+ - [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper
+   for spoken language identification.
+ 
 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
--- a/go-api-examples/vad-spoken-language-identification/go.mod 0 → 100644
查看文件 @a042f44
+++ b/go-api-examples/vad-spoken-language-identification/go.mod 0 → 100644
查看文件 @a042f44
+ module vad-spoken-language-identification
+ 
+ go 1.12
--- a/go-api-examples/vad-spoken-language-identification/main.go 0 → 100644
查看文件 @a042f44
+++ b/go-api-examples/vad-spoken-language-identification/main.go 0 → 100644
查看文件 @a042f44
+ package main
+ 
+ import (
+ 	"fmt"
+ 	iso639 "github.com/barbashov/iso639-3"
+ 	"github.com/gordonklaus/portaudio"
+ 	sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
+ 	"log"
+ )
+ 
+ // main captures audio from the default input device, feeds it to the silero
+ // VAD and, for every speech segment the VAD produces, starts a goroutine that
+ // identifies the spoken language with Whisper. The capture loop never returns
+ // normally; the program ends when it is interrupted or when a stream
+ // operation fails (chk panics).
+ func main() {
+ 	log.SetFlags(log.LstdFlags | log.Lmicroseconds)
+ 
+ 	// 1. Create VAD
+ 	config := sherpa.VadModelConfig{}
+ 
+ 	// Please download silero_vad.onnx from
+ 	// https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+ 
+ 	config.SileroVad.Model = "./silero_vad.onnx"
+ 	config.SileroVad.Threshold = 0.5
+ 	config.SileroVad.MinSilenceDuration = 0.5
+ 	config.SileroVad.MinSpeechDuration = 0.25
+ 	config.SileroVad.WindowSize = 512
+ 	config.SampleRate = 16000
+ 	config.NumThreads = 1
+ 	config.Provider = "cpu"
+ 	config.Debug = 1
+ 
+ 	var bufferSizeInSeconds float32 = 20
+ 
+ 	vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
+ 	defer sherpa.DeleteVoiceActivityDetector(vad)
+ 
+ 	// 2. Create spoken language identifier
+ 
+ 	c := sherpa.SpokenLanguageIdentificationConfig{}
+ 	c.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx"
+ 	c.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"
+ 	c.NumThreads = 2
+ 	c.Debug = 1
+ 	c.Provider = "cpu"
+ 
+ 	slid := sherpa.NewSpokenLanguageIdentification(&c)
+ 	defer sherpa.DeleteSpokenLanguageIdentification(slid)
+ 
+ 	// 3. Open the microphone
+ 
+ 	err := portaudio.Initialize()
+ 	if err != nil {
+ 		log.Fatalf("Unable to initialize portaudio: %v\n", err)
+ 	}
+ 	defer portaudio.Terminate()
+ 
+ 	defaultDevice, err := portaudio.DefaultInputDevice()
+ 	if err != nil {
+ 		// Fatalf, not Fatal: the message contains a formatting verb.
+ 		log.Fatalf("Failed to get default input device: %v\n", err)
+ 	}
+ 	log.Printf("Selected default input device: %s\n", defaultDevice.Name)
+ 	param := portaudio.StreamParameters{}
+ 	param.Input.Device = defaultDevice
+ 	param.Input.Channels = 1
+ 	param.Input.Latency = defaultDevice.DefaultHighInputLatency
+ 
+ 	param.SampleRate = float64(config.SampleRate)
+ 	param.FramesPerBuffer = 0
+ 	param.Flags = portaudio.ClipOff
+ 
+ 	// you can choose another value for 0.1 if you want
+ 	samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
+ 	samples := make([]float32, samplesPerCall)
+ 
+ 	s, err := portaudio.OpenStream(param, samples)
+ 	if err != nil {
+ 		log.Fatalf("Failed to open the stream: %v\n", err)
+ 	}
+ 
+ 	defer s.Close()
+ 	chk(s.Start())
+ 	// The loop below never exits normally, so a trailing s.Stop() would be
+ 	// unreachable. Stop the stream from a defer instead; it runs when chk
+ 	// panics, before the deferred s.Close().
+ 	defer func() {
+ 		if err := s.Stop(); err != nil {
+ 			log.Printf("Failed to stop the stream: %v", err)
+ 		}
+ 	}()
+ 
+ 	log.Print("Started! Please speak")
+ 	printed := false
+ 
+ 	// k numbers the speech segments; it becomes part of the saved file name.
+ 	k := 0
+ 	for {
+ 		chk(s.Read())
+ 		vad.AcceptWaveform(samples)
+ 
+ 		// Log only once per run of speech.
+ 		if vad.IsSpeech() && !printed {
+ 			printed = true
+ 			log.Print("Detected speech\n")
+ 		}
+ 
+ 		if !vad.IsSpeech() {
+ 			printed = false
+ 		}
+ 
+ 		// Drain every segment the VAD has completed so far.
+ 		for !vad.IsEmpty() {
+ 			speechSegment := vad.Front()
+ 			vad.Pop()
+ 
+ 			duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
+ 
+ 			audio := &sherpa.GeneratedAudio{}
+ 			audio.Samples = speechSegment.Samples
+ 			audio.SampleRate = config.SampleRate
+ 
+ 			// Now decode it in the background so that audio capture is not
+ 			// blocked while the model runs.
+ 			go decode(slid, audio, k)
+ 
+ 			k++
+ 
+ 			log.Printf("Duration: %.2f seconds\n", duration)
+ 		}
+ 	}
+ }
+ 
+ // decode identifies the language spoken in audio, logs it and saves the
+ // samples to a WAV file whose name contains id, the segment duration and the
+ // detected language.
+ func decode(slid *sherpa.SpokenLanguageIdentification, audio *sherpa.GeneratedAudio, id int) {
+ 	stream := slid.CreateStream()
+ 	defer sherpa.DeleteOfflineStream(stream)
+ 
+ 	stream.AcceptWaveform(audio.SampleRate, audio.Samples)
+ 	result := slid.Compute(stream)
+ 
+ 	// FromPart1Code returns nil when the code is not a known ISO 639-1 code.
+ 	// Fall back to the raw code reported by the model instead of
+ 	// dereferencing a nil pointer.
+ 	lang := result.Lang
+ 	if l := iso639.FromPart1Code(result.Lang); l != nil {
+ 		lang = l.Name
+ 	}
+ 	log.Printf("Detected language: %v", lang)
+ 
+ 	duration := float32(len(audio.Samples)) / float32(audio.SampleRate)
+ 
+ 	filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, lang)
+ 	if audio.Save(filename) {
+ 		log.Printf("Saved to %s", filename)
+ 	} else {
+ 		log.Printf("Failed to save %s", filename)
+ 	}
+ 	log.Print("----------\n")
+ }
+ 
+ // chk panics with err when err is non-nil and does nothing otherwise.
+ func chk(err error) {
+ 	if err == nil {
+ 		return
+ 	}
+ 	panic(err)
+ }
--- a/go-api-examples/vad-spoken-language-identification/run.sh 0 → 100755
查看文件 @a042f44
+++ b/go-api-examples/vad-spoken-language-identification/run.sh 0 → 100755
查看文件 @a042f44
+ #!/usr/bin/env bash
+ #
+ # Download the models needed by this example (when they are missing), then
+ # build and run it.
+ 
+ # Stop at the first failing command so that a failed download is not followed
+ # by a build/run against missing or partial model files.
+ set -e
+ 
+ if [ ! -f ./silero_vad.onnx ]; then
+   # Use the /raw/ URL: the /blob/ URL returns GitHub's HTML page for the file,
+   # not the model itself.
+   curl -SL -O https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+ fi
+ 
+ if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+   tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+   rm sherpa-onnx-whisper-tiny.tar.bz2
+ fi
+ 
+ go mod tidy
+ go build
+ ./vad-spoken-language-identification
--- a/scripts/go/_internal/vad-spoken-language-identification/.gitignore 0 → 100644
查看文件 @a042f44
+++ b/scripts/go/_internal/vad-spoken-language-identification/.gitignore 0 → 100644
查看文件 @a042f44
+ vad-spoken-language-identification
+ 
--- a/scripts/go/_internal/vad-spoken-language-identification/go.mod 0 → 100644
查看文件 @a042f44
+++ b/scripts/go/_internal/vad-spoken-language-identification/go.mod 0 → 100644
查看文件 @a042f44
+ module vad-spoken-language-identification
+ 
+ go 1.12
+ 
+ replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
--- a/scripts/go/_internal/vad-spoken-language-identification/main.go 0 → 120000
查看文件 @a042f44
+++ b/scripts/go/_internal/vad-spoken-language-identification/main.go 0 → 120000
查看文件 @a042f44
+ ../../../../go-api-examples/vad-spoken-language-identification/main.go
\ No newline at end of file
--- a/scripts/go/_internal/vad-spoken-language-identification/run.sh 0 → 120000
查看文件 @a042f44
+++ b/scripts/go/_internal/vad-spoken-language-identification/run.sh 0 → 120000
查看文件 @a042f44
+ ../../../../go-api-examples/vad-spoken-language-identification/run.sh
\ No newline at end of file
--- a/scripts/go/sherpa_onnx.go
查看文件 @a042f44
+++ b/scripts/go/sherpa_onnx.go
查看文件 @a042f44
@@ -783,3 +783,72 @@ func (vad *VoiceActivityDetector) Front() *SpeechSegment {
 // Reset calls SherpaOnnxVoiceActivityDetectorReset on the wrapped C detector.
 func (vad *VoiceActivityDetector) Reset() {
 	C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
 }
+ 
+ // Spoken language identification
+ 
+ // SpokenLanguageIdentificationWhisperConfig holds the Whisper settings that
+ // NewSpokenLanguageIdentification copies into the C configuration:
+ // Encoder and Decoder become the C strings whisper.encoder and
+ // whisper.decoder (presumably paths to the model files; confirm against the
+ // C API), and TailPaddings becomes whisper.tail_paddings.
+ type SpokenLanguageIdentificationWhisperConfig struct {
+ 	Encoder      string
+ 	Decoder      string
+ 	TailPaddings int
+ }
+ 
+ // SpokenLanguageIdentificationConfig is the configuration passed to
+ // NewSpokenLanguageIdentification. Whisper carries the model settings;
+ // NumThreads, Debug and Provider are forwarded to the C configuration as
+ // num_threads, debug and provider respectively.
+ type SpokenLanguageIdentificationConfig struct {
+ 	Whisper    SpokenLanguageIdentificationWhisperConfig
+ 	NumThreads int
+ 	Debug      int
+ 	Provider   string
+ }
+ 
+ // SpokenLanguageIdentification wraps a pointer to the C
+ // SherpaOnnxSpokenLanguageIdentification object. It is created by
+ // NewSpokenLanguageIdentification and released by
+ // DeleteSpokenLanguageIdentification.
+ type SpokenLanguageIdentification struct {
+ 	impl *C.struct_SherpaOnnxSpokenLanguageIdentification
+ }
+ 
+ // SpokenLanguageIdentificationResult is the value returned by Compute.
+ // Lang is a Go copy of the lang string of the C result (presumably a
+ // language code such as "en"; confirm against the C API).
+ type SpokenLanguageIdentificationResult struct {
+ 	Lang string
+ }
+ 
+ func NewSpokenLanguageIdentification(config *SpokenLanguageIdentificationConfig) *SpokenLanguageIdentification {
+ 	c := C.struct_SherpaOnnxSpokenLanguageIdentificationConfig{}
+ 
+ 	c.whisper.encoder = C.CString(config.Whisper.Encoder)
+ 	defer C.free(unsafe.Pointer(c.whisper.encoder))
+ 
+ 	c.whisper.decoder = C.CString(config.Whisper.Decoder)
+ 	defer C.free(unsafe.Pointer(c.whisper.decoder))
+ 
+ 	c.whisper.tail_paddings = C.int(config.Whisper.TailPaddings)
+ 
+ 	c.num_threads = C.int(config.NumThreads)
+ 	c.debug = C.int(config.Debug)
+ 
+ 	c.provider = C.CString(config.Provider)
+ 	defer C.free(unsafe.Pointer(c.provider))
+ 
+ 	slid := &SpokenLanguageIdentification{}
+ 	slid.impl = C.SherpaOnnxCreateSpokenLanguageIdentification(&c)
+ 
+ 	return slid
+ }
+ 
+ // DeleteSpokenLanguageIdentification destroys the C object wrapped by slid
+ // and clears the wrapper's pointer so that it no longer refers to the
+ // destroyed object.
+ func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification) {
+ 	C.SherpaOnnxDestroySpokenLanguageIdentification(slid.impl)
+ 	slid.impl = nil
+ }
+ 
+ // CreateStream asks the C identifier for a new offline stream and returns it
+ // wrapped in an OfflineStream.
+ //
+ // The caller must pass the returned value to DeleteOfflineStream() when done
+ // with it; otherwise the stream is leaked.
+ func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream {
+ 	return &OfflineStream{
+ 		impl: C.SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid.impl),
+ 	}
+ }
+ 
+ // Compute runs the C identifier on stream and returns a result whose Lang
+ // field is a Go copy of the C result's lang string.
+ //
+ // NOTE(review): the C result r is never destroyed because the call below is
+ // commented out, so each call appears to leak the C result; confirm why the
+ // destroy call was disabled before re-enabling it.
+ // NOTE(review): r is dereferenced without a nil check; confirm that the C
+ // API never returns NULL here.
+ func (slid *SpokenLanguageIdentification) Compute(stream *OfflineStream) *SpokenLanguageIdentificationResult {
+ 	r := C.SherpaOnnxSpokenLanguageIdentificationCompute(slid.impl, stream.impl)
+ 	// defer C.SherpaOnnxDestroySpokenLanguageIdentificationResult(r)
+ 
+ 	ans := &SpokenLanguageIdentificationResult{}
+ 	// C.GoString copies the C string into Go-managed memory.
+ 	ans.Lang = C.GoString(r.lang)
+ 
+ 	return ans
+ }
--- a/sherpa-onnx/csrc/spoken-language-identification.cc
查看文件 @a042f44
+++ b/sherpa-onnx/csrc/spoken-language-identification.cc
查看文件 @a042f44
@@ -91,7 +91,7 @@ std::string SpokenLanguageIdentificationConfig::ToString() const {
   std::ostringstream os;
 
   os << "SpokenLanguageIdentificationConfig(";
-   os << "whisper=\"" << whisper.ToString() << "\", ";
+   os << "whisper=" << whisper.ToString() << ", ";
   os << "num_threads=" << num_threads << ", ";
   os << "debug=" << (debug ? "True" : "False") << ", ";
   os << "provider=\"" << provider << "\")";