Committed by
GitHub
Add Golang API for spoken language identification. (#709)
正在显示 10 个修改的文件，包含 242 行增加和 1 行删除。
| @@ -23,4 +23,7 @@ for details. | @@ -23,4 +23,7 @@ for details. | ||
| 23 | - [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer | 23 | - [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer |
| 24 | for speech recognition. | 24 | for speech recognition. |
| 25 | 25 | ||
| 26 | +- [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper | ||
| 27 | + for spoken language identification. | ||
| 28 | + | ||
| 26 | [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx | 29 | [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx |
| 1 | +package main | ||
| 2 | + | ||
| 3 | +import ( | ||
| 4 | + "fmt" | ||
| 5 | + iso639 "github.com/barbashov/iso639-3" | ||
| 6 | + "github.com/gordonklaus/portaudio" | ||
| 7 | + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx" | ||
| 8 | + "log" | ||
| 9 | +) | ||
| 10 | + | ||
| 11 | +func main() { | ||
| 12 | + log.SetFlags(log.LstdFlags | log.Lmicroseconds) | ||
| 13 | + | ||
| 14 | + // 1. Create VAD | ||
| 15 | + config := sherpa.VadModelConfig{} | ||
| 16 | + | ||
| 17 | + // Please download silero_vad.onnx from | ||
| 18 | + // https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 19 | + | ||
| 20 | + config.SileroVad.Model = "./silero_vad.onnx" | ||
| 21 | + config.SileroVad.Threshold = 0.5 | ||
| 22 | + config.SileroVad.MinSilenceDuration = 0.5 | ||
| 23 | + config.SileroVad.MinSpeechDuration = 0.25 | ||
| 24 | + config.SileroVad.WindowSize = 512 | ||
| 25 | + config.SampleRate = 16000 | ||
| 26 | + config.NumThreads = 1 | ||
| 27 | + config.Provider = "cpu" | ||
| 28 | + config.Debug = 1 | ||
| 29 | + | ||
| 30 | + var bufferSizeInSeconds float32 = 20 | ||
| 31 | + | ||
| 32 | + vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds) | ||
| 33 | + defer sherpa.DeleteVoiceActivityDetector(vad) | ||
| 34 | + | ||
| 35 | + // 2. Create spoken language identifier | ||
| 36 | + | ||
| 37 | + c := sherpa.SpokenLanguageIdentificationConfig{} | ||
| 38 | + c.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx" | ||
| 39 | + c.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx" | ||
| 40 | + c.NumThreads = 2 | ||
| 41 | + c.Debug = 1 | ||
| 42 | + c.Provider = "cpu" | ||
| 43 | + | ||
| 44 | + slid := sherpa.NewSpokenLanguageIdentification(&c) | ||
| 45 | + defer sherpa.DeleteSpokenLanguageIdentification(slid) | ||
| 46 | + | ||
| 47 | + err := portaudio.Initialize() | ||
| 48 | + if err != nil { | ||
| 49 | + log.Fatalf("Unable to initialize portaudio: %v\n", err) | ||
| 50 | + } | ||
| 51 | + defer portaudio.Terminate() | ||
| 52 | + | ||
| 53 | + default_device, err := portaudio.DefaultInputDevice() | ||
| 54 | + if err != nil { | ||
| 55 | + log.Fatal("Failed to get default input device: %v\n", err) | ||
| 56 | + } | ||
| 57 | + log.Printf("Selected default input device: %s\n", default_device.Name) | ||
| 58 | + param := portaudio.StreamParameters{} | ||
| 59 | + param.Input.Device = default_device | ||
| 60 | + param.Input.Channels = 1 | ||
| 61 | + param.Input.Latency = default_device.DefaultHighInputLatency | ||
| 62 | + | ||
| 63 | + param.SampleRate = float64(config.SampleRate) | ||
| 64 | + param.FramesPerBuffer = 0 | ||
| 65 | + param.Flags = portaudio.ClipOff | ||
| 66 | + | ||
| 67 | + // you can choose another value for 0.1 if you want | ||
| 68 | + samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second | ||
| 69 | + samples := make([]float32, samplesPerCall) | ||
| 70 | + | ||
| 71 | + s, err := portaudio.OpenStream(param, samples) | ||
| 72 | + if err != nil { | ||
| 73 | + log.Fatalf("Failed to open the stream") | ||
| 74 | + } | ||
| 75 | + | ||
| 76 | + defer s.Close() | ||
| 77 | + chk(s.Start()) | ||
| 78 | + | ||
| 79 | + log.Print("Started! Please speak") | ||
| 80 | + printed := false | ||
| 81 | + | ||
| 82 | + k := 0 | ||
| 83 | + for { | ||
| 84 | + chk(s.Read()) | ||
| 85 | + vad.AcceptWaveform(samples) | ||
| 86 | + | ||
| 87 | + if vad.IsSpeech() && !printed { | ||
| 88 | + printed = true | ||
| 89 | + log.Print("Detected speech\n") | ||
| 90 | + } | ||
| 91 | + | ||
| 92 | + if !vad.IsSpeech() { | ||
| 93 | + printed = false | ||
| 94 | + } | ||
| 95 | + | ||
| 96 | + for !vad.IsEmpty() { | ||
| 97 | + speechSegment := vad.Front() | ||
| 98 | + vad.Pop() | ||
| 99 | + | ||
| 100 | + duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate) | ||
| 101 | + | ||
| 102 | + audio := &sherpa.GeneratedAudio{} | ||
| 103 | + audio.Samples = speechSegment.Samples | ||
| 104 | + audio.SampleRate = config.SampleRate | ||
| 105 | + | ||
| 106 | + // Now decode it | ||
| 107 | + go decode(slid, audio, k) | ||
| 108 | + | ||
| 109 | + k += 1 | ||
| 110 | + | ||
| 111 | + log.Printf("Duration: %.2f seconds\n", duration) | ||
| 112 | + } | ||
| 113 | + } | ||
| 114 | + | ||
| 115 | + chk(s.Stop()) | ||
| 116 | +} | ||
| 117 | + | ||
| 118 | +func decode(slid *sherpa.SpokenLanguageIdentification, audio *sherpa.GeneratedAudio, id int) { | ||
| 119 | + stream := slid.CreateStream() | ||
| 120 | + defer sherpa.DeleteOfflineStream(stream) | ||
| 121 | + | ||
| 122 | + stream.AcceptWaveform(audio.SampleRate, audio.Samples) | ||
| 123 | + result := slid.Compute(stream) | ||
| 124 | + lang := iso639.FromPart1Code(result.Lang).Name | ||
| 125 | + log.Printf("Detected language: %v", lang) | ||
| 126 | + | ||
| 127 | + duration := float32(len(audio.Samples)) / float32(audio.SampleRate) | ||
| 128 | + | ||
| 129 | + filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, lang) | ||
| 130 | + ok := audio.Save(filename) | ||
| 131 | + if ok { | ||
| 132 | + log.Printf("Saved to %s", filename) | ||
| 133 | + } | ||
| 134 | + log.Print("----------\n") | ||
| 135 | +} | ||
| 136 | + | ||
| 137 | +func chk(err error) { | ||
| 138 | + if err != nil { | ||
| 139 | + panic(err) | ||
| 140 | + } | ||
| 141 | +} |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | + | ||
| 4 | +if [ ! -f ./silero_vad.onnx ]; then | ||
| 5 | + curl -SL -O https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx | ||
| 6 | +fi | ||
| 7 | + | ||
| 8 | +if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then | ||
| 9 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 10 | + tar xvf sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 11 | + rm sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 12 | +fi | ||
| 13 | + | ||
| 14 | +go mod tidy | ||
| 15 | +go build | ||
| 16 | +./vad-spoken-language-identification |
| 1 | +/Users/fangjun/open-source/sherpa-onnx/go-api-examples/vad-spoken-language-identification/main.go |
| 1 | +/Users/fangjun/open-source/sherpa-onnx/go-api-examples/vad-spoken-language-identification/run.sh |
| @@ -783,3 +783,72 @@ func (vad *VoiceActivityDetector) Front() *SpeechSegment { | @@ -783,3 +783,72 @@ func (vad *VoiceActivityDetector) Front() *SpeechSegment { | ||
| 783 | func (vad *VoiceActivityDetector) Reset() { | 783 | func (vad *VoiceActivityDetector) Reset() { |
| 784 | C.SherpaOnnxVoiceActivityDetectorReset(vad.impl) | 784 | C.SherpaOnnxVoiceActivityDetectorReset(vad.impl) |
| 785 | } | 785 | } |
| 786 | + | ||
| 787 | +// Spoken language identification | ||
| 788 | + | ||
| 789 | +type SpokenLanguageIdentificationWhisperConfig struct { | ||
| 790 | + Encoder string | ||
| 791 | + Decoder string | ||
| 792 | + TailPaddings int | ||
| 793 | +} | ||
| 794 | + | ||
| 795 | +type SpokenLanguageIdentificationConfig struct { | ||
| 796 | + Whisper SpokenLanguageIdentificationWhisperConfig | ||
| 797 | + NumThreads int | ||
| 798 | + Debug int | ||
| 799 | + Provider string | ||
| 800 | +} | ||
| 801 | + | ||
| 802 | +type SpokenLanguageIdentification struct { | ||
| 803 | + impl *C.struct_SherpaOnnxSpokenLanguageIdentification | ||
| 804 | +} | ||
| 805 | + | ||
| 806 | +type SpokenLanguageIdentificationResult struct { | ||
| 807 | + Lang string | ||
| 808 | +} | ||
| 809 | + | ||
| 810 | +func NewSpokenLanguageIdentification(config *SpokenLanguageIdentificationConfig) *SpokenLanguageIdentification { | ||
| 811 | + c := C.struct_SherpaOnnxSpokenLanguageIdentificationConfig{} | ||
| 812 | + | ||
| 813 | + c.whisper.encoder = C.CString(config.Whisper.Encoder) | ||
| 814 | + defer C.free(unsafe.Pointer(c.whisper.encoder)) | ||
| 815 | + | ||
| 816 | + c.whisper.decoder = C.CString(config.Whisper.Decoder) | ||
| 817 | + defer C.free(unsafe.Pointer(c.whisper.decoder)) | ||
| 818 | + | ||
| 819 | + c.whisper.tail_paddings = C.int(config.Whisper.TailPaddings) | ||
| 820 | + | ||
| 821 | + c.num_threads = C.int(config.NumThreads) | ||
| 822 | + c.debug = C.int(config.Debug) | ||
| 823 | + | ||
| 824 | + c.provider = C.CString(config.Provider) | ||
| 825 | + defer C.free(unsafe.Pointer(c.provider)) | ||
| 826 | + | ||
| 827 | + slid := &SpokenLanguageIdentification{} | ||
| 828 | + slid.impl = C.SherpaOnnxCreateSpokenLanguageIdentification(&c) | ||
| 829 | + | ||
| 830 | + return slid | ||
| 831 | +} | ||
| 832 | + | ||
| 833 | +func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification) { | ||
| 834 | + C.SherpaOnnxDestroySpokenLanguageIdentification(slid.impl) | ||
| 835 | + slid.impl = nil | ||
| 836 | +} | ||
| 837 | + | ||
| 838 | +// The user has to invoke DeleteOfflineStream() to free the returned value | ||
| 839 | +// to avoid memory leak | ||
| 840 | +func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream { | ||
| 841 | + stream := &OfflineStream{} | ||
| 842 | + stream.impl = C.SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid.impl) | ||
| 843 | + return stream | ||
| 844 | +} | ||
| 845 | + | ||
| 846 | +func (slid *SpokenLanguageIdentification) Compute(stream *OfflineStream) *SpokenLanguageIdentificationResult { | ||
| 847 | + r := C.SherpaOnnxSpokenLanguageIdentificationCompute(slid.impl, stream.impl) | ||
| 848 | + // defer C.SherpaOnnxDestroySpokenLanguageIdentificationResult(r) | ||
| 849 | + | ||
| 850 | + ans := &SpokenLanguageIdentificationResult{} | ||
| 851 | + ans.Lang = C.GoString(r.lang) | ||
| 852 | + | ||
| 853 | + return ans | ||
| 854 | +} |
| @@ -91,7 +91,7 @@ std::string SpokenLanguageIdentificationConfig::ToString() const { | @@ -91,7 +91,7 @@ std::string SpokenLanguageIdentificationConfig::ToString() const { | ||
| 91 | std::ostringstream os; | 91 | std::ostringstream os; |
| 92 | 92 | ||
| 93 | os << "SpokenLanguageIdentificationConfig("; | 93 | os << "SpokenLanguageIdentificationConfig("; |
| 94 | - os << "whisper=\"" << whisper.ToString() << "\", "; | 94 | + os << "whisper=" << whisper.ToString() << ", "; |
| 95 | os << "num_threads=" << num_threads << ", "; | 95 | os << "num_threads=" << num_threads << ", "; |
| 96 | os << "debug=" << (debug ? "True" : "False") << ", "; | 96 | os << "debug=" << (debug ? "True" : "False") << ", "; |
| 97 | os << "provider=\"" << provider << "\")"; | 97 | os << "provider=\"" << provider << "\")"; |
-
请注册或登录后发表评论。