Showing 28 changed files with 674 additions and 4 deletions
@@ -6,4 +6,21 @@ Please refer to the documentation
 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html
 for details.
 
+- [./non-streaming-decode-files](./non-streaming-decode-files) It shows how to use
+  a non-streaming ASR model to decode files
+
+- [./non-streaming-tts](./non-streaming-tts) It shows how to use a non-streaming TTS
+  model to convert text to speech
+
+- [./real-time-speech-recognition-from-microphone](./real-time-speech-recognition-from-microphone)
+  It shows how to use a streaming ASR model to recognize speech from a microphone in real-time
+
+- [./vad](./vad) It shows how to use silero VAD with Golang.
+
+- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper
+  for speech recognition.
+
+- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
+  for speech recognition.
+
 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
@@ -57,8 +57,7 @@ func main() {
     log.Println("Done!")
 
     ok := audio.Save(filename)
-    if ok != 1 {
+    if !ok {
         log.Fatalf("Failed to write", filename)
     }
-
 }
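For reference, the change above (together with the binding change further down) turns Save's C-style int return value into a Go bool. A minimal call-site sketch, assuming audio is the *sherpa.GeneratedAudio returned by tts.Generate:

// Sketch only: Save now reports success as a bool instead of an int.
filename := "generated.wav"
if ok := audio.Save(filename); !ok {
    log.Fatalf("Failed to write %s", filename)
}
log.Printf("Saved to %s", filename)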
go-api-examples/vad-asr-paraformer/go.mod
0 → 100644
go-api-examples/vad-asr-paraformer/main.go
0 → 100644
+package main
+
+import (
+    "fmt"
+    "github.com/gordonklaus/portaudio"
+    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
+    "log"
+    "strings"
+)
+
+func main() {
+    log.SetFlags(log.LstdFlags | log.Lmicroseconds)
+
+    // 1. Create VAD
+    config := sherpa.VadModelConfig{}
+
+    // Please download silero_vad.onnx from
+    // https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+
+    config.SileroVad.Model = "./silero_vad.onnx"
+    config.SileroVad.Threshold = 0.5
+    config.SileroVad.MinSilenceDuration = 0.5
+    config.SileroVad.MinSpeechDuration = 0.25
+    config.SileroVad.WindowSize = 512
+    config.SampleRate = 16000
+    config.NumThreads = 1
+    config.Provider = "cpu"
+    config.Debug = 1
+
+    var bufferSizeInSeconds float32 = 20
+
+    vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
+    defer sherpa.DeleteVoiceActivityDetector(vad)
+
+    // 2. Create ASR recognizer
+
+    c := sherpa.OfflineRecognizerConfig{}
+    c.FeatConfig.SampleRate = 16000
+    c.FeatConfig.FeatureDim = 80
+
+    // Please download the model from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
+    c.ModelConfig.Paraformer.Model = "./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx"
+    c.ModelConfig.Tokens = "./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/tokens.txt"
+    c.ModelConfig.NumThreads = 2
+    c.ModelConfig.Debug = 1
+    c.ModelConfig.Provider = "cpu"
+
+    recognizer := sherpa.NewOfflineRecognizer(&c)
+    defer sherpa.DeleteOfflineRecognizer(recognizer)
+
+    err := portaudio.Initialize()
+    if err != nil {
+        log.Fatalf("Unable to initialize portaudio: %v\n", err)
+    }
+    defer portaudio.Terminate()
+
+    default_device, err := portaudio.DefaultInputDevice()
+    if err != nil {
+        log.Fatalf("Failed to get default input device: %v\n", err)
+    }
+    log.Printf("Selected default input device: %s\n", default_device.Name)
+    param := portaudio.StreamParameters{}
+    param.Input.Device = default_device
+    param.Input.Channels = 1
+    param.Input.Latency = default_device.DefaultHighInputLatency
+
+    param.SampleRate = float64(config.SampleRate)
+    param.FramesPerBuffer = 0
+    param.Flags = portaudio.ClipOff
+
+    // You can choose a value other than 0.1 if you want.
+    samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
+    samples := make([]float32, samplesPerCall)
+
+    s, err := portaudio.OpenStream(param, samples)
+    if err != nil {
+        log.Fatalf("Failed to open the stream: %v", err)
+    }
+
+    defer s.Close()
+    chk(s.Start())
+
+    log.Print("Started! Please speak")
+    printed := false
+
+    k := 0
+    for {
+        chk(s.Read())
+        vad.AcceptWaveform(samples)
+
+        if vad.IsSpeech() && !printed {
+            printed = true
+            log.Print("Detected speech\n")
+        }
+
+        if !vad.IsSpeech() {
+            printed = false
+        }
+
+        for !vad.IsEmpty() {
+            speechSegment := vad.Front()
+            vad.Pop()
+
+            duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
+
+            audio := &sherpa.GeneratedAudio{}
+            audio.Samples = speechSegment.Samples
+            audio.SampleRate = config.SampleRate
+
+            // Now decode it
+            go decode(recognizer, audio, k)
+
+            k += 1
+
+            log.Printf("Duration: %.2f seconds\n", duration)
+        }
+    }
+
+    chk(s.Stop())
+}
+
+func decode(recognizer *sherpa.OfflineRecognizer, audio *sherpa.GeneratedAudio, id int) {
+    stream := sherpa.NewOfflineStream(recognizer)
+    defer sherpa.DeleteOfflineStream(stream)
+    stream.AcceptWaveform(audio.SampleRate, audio.Samples)
+    recognizer.Decode(stream)
+    result := stream.GetResult()
+    text := strings.ToLower(result.Text)
+    text = strings.Trim(text, " ")
+    log.Println(text)
+
+    duration := float32(len(audio.Samples)) / float32(audio.SampleRate)
+
+    filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, text)
+    ok := audio.Save(filename)
+    if ok {
+        log.Printf("Saved to %s", filename)
+    }
+    log.Print("----------\n")
+}
+
+func chk(err error) {
+    if err != nil {
+        panic(err)
+    }
+}
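A note on the decode call above: each detected segment is decoded in its own goroutine while a single OfflineRecognizer is shared. If the underlying recognizer turns out not to be safe for concurrent use (the PR does not say either way), decoding can instead be serialized through a channel. The following is only a sketch of that alternative, reusing the decode helper and sherpa types from main.go above; it is not part of the PR:

// Sketch only: one worker goroutine owns all calls into the shared recognizer.
func startDecodeWorker(recognizer *sherpa.OfflineRecognizer) chan<- *sherpa.GeneratedAudio {
    segments := make(chan *sherpa.GeneratedAudio, 16)
    go func() {
        id := 0
        for audio := range segments {
            decode(recognizer, audio, id) // decode() as defined in main.go above
            id++
        }
    }()
    return segments
}

// In the VAD loop, "go decode(recognizer, audio, k)" would then become
// "segmentCh <- audio", with segmentCh := startDecodeWorker(recognizer)
// created once before the loop.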
go-api-examples/vad-asr-paraformer/run.sh
0 → 100755
+#!/usr/bin/env bash
+
+if [ ! -f ./silero_vad.onnx ]; then
+  # Use the raw file URL; the "blob" page URL would download an HTML page instead of the model.
+  curl -SL -O https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+fi
+
+if [ ! -f ./sherpa-onnx-paraformer-trilingual-zh-cantonese-en/model.int8.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
+  tar xvf sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
+  rm sherpa-onnx-paraformer-trilingual-zh-cantonese-en.tar.bz2
+fi
+
+go mod tidy
+go build
+./vad-asr-paraformer
go-api-examples/vad-asr-whisper/go.mod
0 → 100644
go-api-examples/vad-asr-whisper/main.go
0 → 100644
+package main
+
+import (
+    "fmt"
+    "github.com/gordonklaus/portaudio"
+    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
+    "log"
+    "strings"
+)
+
+func main() {
+    log.SetFlags(log.LstdFlags | log.Lmicroseconds)
+
+    // 1. Create VAD
+    config := sherpa.VadModelConfig{}
+
+    // Please download silero_vad.onnx from
+    // https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+
+    config.SileroVad.Model = "./silero_vad.onnx"
+    config.SileroVad.Threshold = 0.5
+    config.SileroVad.MinSilenceDuration = 0.5
+    config.SileroVad.MinSpeechDuration = 0.25
+    config.SileroVad.WindowSize = 512
+    config.SampleRate = 16000
+    config.NumThreads = 1
+    config.Provider = "cpu"
+    config.Debug = 1
+
+    var bufferSizeInSeconds float32 = 20
+
+    vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
+    defer sherpa.DeleteVoiceActivityDetector(vad)
+
+    // 2. Create ASR recognizer
+
+    c := sherpa.OfflineRecognizerConfig{}
+    c.FeatConfig.SampleRate = 16000
+    c.FeatConfig.FeatureDim = 80
+
+    // Please download the model from
+    // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+    c.ModelConfig.Whisper.Encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
+    c.ModelConfig.Whisper.Decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
+    c.ModelConfig.Tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
+    c.ModelConfig.NumThreads = 2
+    c.ModelConfig.Debug = 1
+    c.ModelConfig.Provider = "cpu"
+
+    recognizer := sherpa.NewOfflineRecognizer(&c)
+    defer sherpa.DeleteOfflineRecognizer(recognizer)
+
+    err := portaudio.Initialize()
+    if err != nil {
+        log.Fatalf("Unable to initialize portaudio: %v\n", err)
+    }
+    defer portaudio.Terminate()
+
+    default_device, err := portaudio.DefaultInputDevice()
+    if err != nil {
+        log.Fatalf("Failed to get default input device: %v\n", err)
+    }
+    log.Printf("Selected default input device: %s\n", default_device.Name)
+    param := portaudio.StreamParameters{}
+    param.Input.Device = default_device
+    param.Input.Channels = 1
+    param.Input.Latency = default_device.DefaultHighInputLatency
+
+    param.SampleRate = float64(config.SampleRate)
+    param.FramesPerBuffer = 0
+    param.Flags = portaudio.ClipOff
+
+    // You can choose a value other than 0.1 if you want.
+    samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
+    samples := make([]float32, samplesPerCall)
+
+    s, err := portaudio.OpenStream(param, samples)
+    if err != nil {
+        log.Fatalf("Failed to open the stream: %v", err)
+    }
+
+    defer s.Close()
+    chk(s.Start())
+
+    log.Print("Started! Please speak")
+    printed := false
+
+    k := 0
+    for {
+        chk(s.Read())
+        vad.AcceptWaveform(samples)
+
+        if vad.IsSpeech() && !printed {
+            printed = true
+            log.Print("Detected speech\n")
+        }
+
+        if !vad.IsSpeech() {
+            printed = false
+        }
+
+        for !vad.IsEmpty() {
+            speechSegment := vad.Front()
+            vad.Pop()
+
+            duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
+
+            audio := &sherpa.GeneratedAudio{}
+            audio.Samples = speechSegment.Samples
+            audio.SampleRate = config.SampleRate
+
+            // Now decode it
+            go decode(recognizer, audio, k)
+
+            k += 1
+
+            log.Printf("Duration: %.2f seconds\n", duration)
+        }
+    }
+
+    chk(s.Stop())
+}
+
+func decode(recognizer *sherpa.OfflineRecognizer, audio *sherpa.GeneratedAudio, id int) {
+    stream := sherpa.NewOfflineStream(recognizer)
+    defer sherpa.DeleteOfflineStream(stream)
+    stream.AcceptWaveform(audio.SampleRate, audio.Samples)
+    recognizer.Decode(stream)
+    result := stream.GetResult()
+    text := strings.ToLower(result.Text)
+    text = strings.Trim(text, " ")
+    log.Println(text)
+
+    duration := float32(len(audio.Samples)) / float32(audio.SampleRate)
+
+    filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, text)
+    ok := audio.Save(filename)
+    if ok {
+        log.Printf("Saved to %s", filename)
+    }
+    log.Print("----------\n")
+}
+
+func chk(err error) {
+    if err != nil {
+        panic(err)
+    }
+}
go-api-examples/vad-asr-whisper/run.sh
0 → 100755
+#!/usr/bin/env bash
+
+if [ ! -f ./silero_vad.onnx ]; then
+  # Use the raw file URL; the "blob" page URL would download an HTML page instead of the model.
+  curl -SL -O https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+fi
+
+if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+  rm sherpa-onnx-whisper-tiny.en.tar.bz2
+fi
+
+go mod tidy
+go build
+./vad-asr-whisper
go-api-examples/vad/go.mod
0 → 100644
go-api-examples/vad/main.go
0 → 100644
+package main
+
+import (
+    "fmt"
+    "github.com/gordonklaus/portaudio"
+    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
+    "log"
+)
+
+func main() {
+    log.SetFlags(log.LstdFlags | log.Lmicroseconds)
+
+    config := sherpa.VadModelConfig{}
+
+    // Please download silero_vad.onnx from
+    // https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+
+    config.SileroVad.Model = "./silero_vad.onnx"
+    config.SileroVad.Threshold = 0.5
+    config.SileroVad.MinSilenceDuration = 0.5
+    config.SileroVad.MinSpeechDuration = 0.25
+    config.SileroVad.WindowSize = 512
+    config.SampleRate = 16000
+    config.NumThreads = 1
+    config.Provider = "cpu"
+    config.Debug = 1
+
+    var bufferSizeInSeconds float32 = 5
+
+    vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
+    defer sherpa.DeleteVoiceActivityDetector(vad)
+
+    err := portaudio.Initialize()
+    if err != nil {
+        log.Fatalf("Unable to initialize portaudio: %v\n", err)
+    }
+    defer portaudio.Terminate()
+
+    default_device, err := portaudio.DefaultInputDevice()
+    if err != nil {
+        log.Fatalf("Failed to get default input device: %v\n", err)
+    }
+    log.Printf("Selected default input device: %s\n", default_device.Name)
+    param := portaudio.StreamParameters{}
+    param.Input.Device = default_device
+    param.Input.Channels = 1
+    param.Input.Latency = default_device.DefaultLowInputLatency
+
+    param.SampleRate = float64(config.SampleRate)
+    param.FramesPerBuffer = 0
+    param.Flags = portaudio.ClipOff
+
+    // You can choose a value other than 0.1 if you want.
+    samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
+    samples := make([]float32, samplesPerCall)
+
+    s, err := portaudio.OpenStream(param, samples)
+    if err != nil {
+        log.Fatalf("Failed to open the stream: %v", err)
+    }
+
+    defer s.Close()
+    chk(s.Start())
+
+    log.Print("Started! Please speak")
+    printed := false
+
+    k := 0
+    for {
+        chk(s.Read())
+        vad.AcceptWaveform(samples)
+
+        if vad.IsSpeech() && !printed {
+            printed = true
+            log.Print("Detected speech\n")
+        }
+
+        if !vad.IsSpeech() {
+            printed = false
+        }
+
+        for !vad.IsEmpty() {
+            speechSegment := vad.Front()
+            vad.Pop()
+
+            duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
+
+            audio := sherpa.GeneratedAudio{}
+            audio.Samples = speechSegment.Samples
+            audio.SampleRate = config.SampleRate
+
+            filename := fmt.Sprintf("seg-%d-%.2f-seconds.wav", k, duration)
+            ok := audio.Save(filename)
+            if ok {
+                log.Printf("Saved to %s", filename)
+            }
+
+            k += 1
+
+            log.Printf("Duration: %.2f seconds\n", duration)
+            log.Print("----------\n")
+        }
+    }
+
+    chk(s.Stop())
+}
+
+func chk(err error) {
+    if err != nil {
+        panic(err)
+    }
+}
go-api-examples/vad/run.sh
0 → 100755
@@ -235,6 +235,12 @@ def get_vits_models() -> List[TtsModel]:
     return [
         # Chinese
         TtsModel(
+            model_dir="vits-icefall-zh-aishell3",
+            model_name="model.onnx",
+            lang="zh",
+            rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst",
+        ),
+        TtsModel(
             model_dir="vits-zh-aishell3",
             model_name="vits-aishell3.onnx",
             lang="zh",
scripts/go/_internal/vad-asr-paraformer/.gitignore
0 → 100644
+vad-asr-paraformer
scripts/go/_internal/vad-asr-paraformer/main.go
0 → 120000
+../../../../go-api-examples/vad-asr-paraformer/main.go
scripts/go/_internal/vad-asr-paraformer/run.sh
0 → 120000
+../../../../go-api-examples/vad-asr-paraformer/run.sh
scripts/go/_internal/vad-asr-whisper/go.mod
0 → 100644
scripts/go/_internal/vad-asr-whisper/main.go
0 → 120000
+../../../../go-api-examples/vad-asr-whisper/main.go
scripts/go/_internal/vad-asr-whisper/run.sh
0 → 120000
+../../../../go-api-examples/vad-asr-whisper/run.sh
scripts/go/_internal/vad/.gitignore
0 → 100644
scripts/go/_internal/vad/go.mod
0 → 100644
scripts/go/_internal/vad/main.go
0 → 120000
+../../../../go-api-examples/vad/main.go
scripts/go/_internal/vad/run.sh
0 → 120000
+../../../../go-api-examples/vad/run.sh
@@ -614,6 +614,9 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
     ans.SampleRate = int(audio.sample_rate)
     n := int(audio.n)
     ans.Samples = make([]float32, n)
+
+    // See https://stackoverflow.com/questions/48756732/what-does-1-30c-yourtype-do-exactly-in-cgo
+    // :n:n means 0:n:n, i.e. low:high:capacity
     samples := (*[1 << 28]C.float)(unsafe.Pointer(audio.samples))[:n:n]
     // copy(ans.Samples, samples)
     for i := 0; i < n; i++ {
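The new comment above points at the usual cgo idiom for viewing a C array as a Go slice. Below is a small, self-contained illustration of that idiom; it uses a hypothetical C helper rather than anything from sherpa-onnx:

package main

/*
#include <stdlib.h>

static float *make_data(int n) {
  float *p = (float *)malloc(n * sizeof(float));
  for (int i = 0; i < n; i++) {
    p[i] = (float)i;
  }
  return p;
}
*/
import "C"

import (
    "fmt"
    "unsafe"
)

func main() {
    n := 5
    p := C.make_data(C.int(n))
    defer C.free(unsafe.Pointer(p))

    // (*[1 << 28]C.float)(...) reinterprets the C pointer as a pointer to a
    // huge Go array; slicing it with [:n:n] (that is, 0:n:n = low:high:capacity)
    // yields a length-n, capacity-n view without copying.
    view := (*[1 << 28]C.float)(unsafe.Pointer(p))[:n:n]

    // Copy into Go-managed memory before the C buffer is freed.
    out := make([]float32, n)
    for i := 0; i < n; i++ {
        out[i] = float32(view[i])
    }

    fmt.Println(out) // [0 1 2 3 4]
}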
@@ -623,11 +626,160 @@ func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedA
     return ans
 }
 
-func (audio *GeneratedAudio) Save(filename string) int {
+func (audio *GeneratedAudio) Save(filename string) bool {
     s := C.CString(filename)
     defer C.free(unsafe.Pointer(s))
 
     ok := int(C.SherpaOnnxWriteWave((*C.float)(&audio.Samples[0]), C.int(len(audio.Samples)), C.int(audio.SampleRate), s))
 
-    return ok
+    return ok == 1
+}
+
+// ============================================================
+// For VAD
+// ============================================================
+type SileroVadModelConfig struct {
+    Model              string
+    Threshold          float32
+    MinSilenceDuration float32
+    MinSpeechDuration  float32
+    WindowSize         int
+}
+
+type VadModelConfig struct {
+    SileroVad  SileroVadModelConfig
+    SampleRate int
+    NumThreads int
+    Provider   string
+    Debug      int
+}
+
+type CircularBuffer struct {
+    impl *C.struct_SherpaOnnxCircularBuffer
+}
+
+func DeleteCircularBuffer(buffer *CircularBuffer) {
+    C.SherpaOnnxDestroyCircularBuffer(buffer.impl)
+    buffer.impl = nil
+}
+
+func NewCircularBuffer(capacity int) *CircularBuffer {
+    circularBuffer := &CircularBuffer{}
+    circularBuffer.impl = C.SherpaOnnxCreateCircularBuffer(C.int(capacity))
+    return circularBuffer
+}
+
+func (buffer *CircularBuffer) Push(samples []float32) {
+    C.SherpaOnnxCircularBufferPush(buffer.impl, (*C.float)(&samples[0]), C.int(len(samples)))
+}
+
+func (buffer *CircularBuffer) Get(start int, n int) []float32 {
+    samples := C.SherpaOnnxCircularBufferGet(buffer.impl, C.int(start), C.int(n))
+    defer C.SherpaOnnxCircularBufferFree(samples)
+
+    result := make([]float32, n)
+
+    p := (*[1 << 28]C.float)(unsafe.Pointer(samples))[:n:n]
+    for i := 0; i < n; i++ {
+        result[i] = float32(p[i])
+    }
+
+    return result
+}
+
+func (buffer *CircularBuffer) Pop(n int) {
+    C.SherpaOnnxCircularBufferPop(buffer.impl, C.int(n))
+}
+
+func (buffer *CircularBuffer) Size() int {
+    return int(C.SherpaOnnxCircularBufferSize(buffer.impl))
+}
+
+func (buffer *CircularBuffer) Head() int {
+    return int(C.SherpaOnnxCircularBufferHead(buffer.impl))
+}
+
+func (buffer *CircularBuffer) Reset() {
+    C.SherpaOnnxCircularBufferReset(buffer.impl)
+}
+
+type SpeechSegment struct {
+    Start   int
+    Samples []float32
+}
+
+type VoiceActivityDetector struct {
+    impl *C.struct_SherpaOnnxVoiceActivityDetector
+}
+
+func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float32) *VoiceActivityDetector {
+    c := C.struct_SherpaOnnxVadModelConfig{}
+
+    c.silero_vad.model = C.CString(config.SileroVad.Model)
+    defer C.free(unsafe.Pointer(c.silero_vad.model))
+
+    c.silero_vad.threshold = C.float(config.SileroVad.Threshold)
+    c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
+    c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
+    c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
+
+    c.sample_rate = C.int(config.SampleRate)
+    c.num_threads = C.int(config.NumThreads)
+    c.provider = C.CString(config.Provider)
+    defer C.free(unsafe.Pointer(c.provider))
+
+    c.debug = C.int(config.Debug)
+
+    vad := &VoiceActivityDetector{}
+    vad.impl = C.SherpaOnnxCreateVoiceActivityDetector(&c, C.float(bufferSizeInSeconds))
+
+    return vad
+}
+
+func DeleteVoiceActivityDetector(vad *VoiceActivityDetector) {
+    C.SherpaOnnxDestroyVoiceActivityDetector(vad.impl)
+    vad.impl = nil
+}
+
+func (vad *VoiceActivityDetector) AcceptWaveform(samples []float32) {
+    C.SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad.impl, (*C.float)(&samples[0]), C.int(len(samples)))
+}
+
+func (vad *VoiceActivityDetector) IsEmpty() bool {
+    return 1 == int(C.SherpaOnnxVoiceActivityDetectorEmpty(vad.impl))
+}
+
+func (vad *VoiceActivityDetector) IsSpeech() bool {
+    return 1 == int(C.SherpaOnnxVoiceActivityDetectorDetected(vad.impl))
+}
+
+func (vad *VoiceActivityDetector) Pop() {
+    C.SherpaOnnxVoiceActivityDetectorPop(vad.impl)
+}
+
+func (vad *VoiceActivityDetector) Clear() {
+    C.SherpaOnnxVoiceActivityDetectorClear(vad.impl)
+}
+
+func (vad *VoiceActivityDetector) Front() *SpeechSegment {
+    f := C.SherpaOnnxVoiceActivityDetectorFront(vad.impl)
+    defer C.SherpaOnnxDestroySpeechSegment(f)
+
+    ans := &SpeechSegment{}
+    ans.Start = int(f.start)
+
+    n := int(f.n)
+    ans.Samples = make([]float32, n)
+
+    samples := (*[1 << 28]C.float)(unsafe.Pointer(f.samples))[:n:n]
+
+    for i := 0; i < n; i++ {
+        ans.Samples[i] = float32(samples[i])
+    }
+
+    return ans
+}
+
+func (vad *VoiceActivityDetector) Reset() {
+    C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
 }
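The CircularBuffer wrapper added above is not exercised by any of the new examples. A usage sketch follows; the exact meaning of Head() and of the start argument to Get() is my reading of the underlying C API, not something this diff documents:

package main

import (
    "fmt"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    buf := sherpa.NewCircularBuffer(16000) // capacity: 1 second of 16 kHz audio
    defer sherpa.DeleteCircularBuffer(buf)

    chunk := make([]float32, 512) // e.g. one VAD window of silence
    buf.Push(chunk)
    buf.Push(chunk)

    fmt.Println("buffered samples:", buf.Size()) // expected: 1024

    // Read the oldest 512 samples, then drop them from the buffer.
    oldest := buf.Get(buf.Head(), 512)
    buf.Pop(512)

    fmt.Println(len(oldest), buf.Size()) // expected: 512 512
}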
@@ -309,6 +309,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
 
   recognizer_config.model_config.whisper.task =
       SHERPA_ONNX_OR(config->model_config.whisper.task, "transcribe");
+  if (recognizer_config.model_config.whisper.task.empty()) {
+    recognizer_config.model_config.whisper.task = "transcribe";
+  }
 
   recognizer_config.model_config.tdnn.model =
       SHERPA_ONNX_OR(config->model_config.tdnn.model, "");
@@ -331,6 +334,11 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
 
   recognizer_config.decoding_method =
       SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
+
+  if (recognizer_config.decoding_method.empty()) {
+    recognizer_config.decoding_method = "greedy_search";
+  }
+
   recognizer_config.max_active_paths =
       SHERPA_ONNX_OR(config->max_active_paths, 4);
 
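The two additions above make an empty string fall back to the default as well, presumably because the Go binding always passes a non-NULL (possibly empty) C string, which the SHERPA_ONNX_OR fallback alone would not catch. A sketch of the effect on the Go side, assuming the OfflineRecognizerConfig fields used in the examples above plus a DecodingMethod string field in the binding:

// Sketch only: with the C-side fallbacks above, leaving string options empty
// in Go now yields "transcribe" (whisper task) and "greedy_search" (decoding
// method) instead of empty values being passed straight through.
c := sherpa.OfflineRecognizerConfig{}
c.FeatConfig.SampleRate = 16000
c.FeatConfig.FeatureDim = 80
c.ModelConfig.Whisper.Encoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx"
c.ModelConfig.Whisper.Decoder = "./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx"
c.ModelConfig.Tokens = "./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt"
// c.DecodingMethod is intentionally left as "" here.
recognizer := sherpa.NewOfflineRecognizer(&c)
defer sherpa.DeleteOfflineRecognizer(recognizer)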