Showing 6 changed files with 115 additions and 55 deletions.
@@ -134,6 +134,53 @@ jobs:
           name: ${{ matrix.os }}-libs
           path: to-upload/
 
+      - name: Test non-streaming decoding files
+        shell: bash
+        run: |
+          cd scripts/go/_internal/non-streaming-decode-files/
+          ls -lh
+          go mod tidy
+          cat go.mod
+          go build
+          ls -lh
+
+          echo "Test Moonshine"
+          ./run-moonshine.sh
+          rm -rf sherpa-onnx-*
+
+          echo "Test SenseVoice ctc"
+          ./run-sense-voice-small.sh
+          rm -rf sherpa-onnx-sense-*
+
+          echo "Test telespeech ctc"
+          ./run-telespeech-ctc.sh
+          rm -rf sherpa-onnx-telespeech-ctc-*
+
+          echo "Test transducer"
+          ./run-transducer.sh
+          rm -rf sherpa-onnx-zipformer-en-2023-06-26
+
+          echo "Test transducer"
+          ./run-transducer.sh
+          rm -rf sherpa-onnx-zipformer-en-2023-06-26
+
+          echo "Test paraformer"
+          ./run-paraformer.sh
+          ./run-paraformer-itn.sh
+          rm -rf sherpa-onnx-paraformer-zh-2023-09-14
+
+          echo "Test NeMo CTC"
+          ./run-nemo-ctc.sh
+          rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium
+
+          echo "Test Whisper tiny.en"
+          ./run-whisper.sh
+          rm -rf sherpa-onnx-whisper-tiny.en
+
+          echo "Test Tdnn yesno"
+          ./run-tdnn-yesno.sh
+          rm -rf sherpa-onnx-tdnn-yesno
+
       - name: Test adding punctuation
         shell: bash
         run: |
@@ -193,49 +240,6 @@ jobs:
           name: tts-waves-${{ matrix.os }}
           path: tts-waves
 
-      - name: Test non-streaming decoding files
-        shell: bash
-        run: |
-          cd scripts/go/_internal/non-streaming-decode-files/
-          ls -lh
-          go mod tidy
-          cat go.mod
-          go build
-          ls -lh
-
-          echo "Test SenseVoice ctc"
-          ./run-sense-voice-small.sh
-          rm -rf sherpa-onnx-sense-*
-
-          echo "Test telespeech ctc"
-          ./run-telespeech-ctc.sh
-          rm -rf sherpa-onnx-telespeech-ctc-*
-
-          echo "Test transducer"
-          ./run-transducer.sh
-          rm -rf sherpa-onnx-zipformer-en-2023-06-26
-
-          echo "Test transducer"
-          ./run-transducer.sh
-          rm -rf sherpa-onnx-zipformer-en-2023-06-26
-
-          echo "Test paraformer"
-          ./run-paraformer.sh
-          ./run-paraformer-itn.sh
-          rm -rf sherpa-onnx-paraformer-zh-2023-09-14
-
-          echo "Test NeMo CTC"
-          ./run-nemo-ctc.sh
-          rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium
-
-          echo "Test Whisper tiny.en"
-          ./run-whisper.sh
-          rm -rf sherpa-onnx-whisper-tiny.en
-
-          echo "Test Tdnn yesno"
-          ./run-tdnn-yesno.sh
-          rm -rf sherpa-onnx-tdnn-yesno
-
       - name: Test streaming decoding files
         shell: bash
         run: |
@@ -6,28 +6,41 @@ Please refer to the documentation
 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html
 for details.
 
+- [./add-punctuation](./add-punctuation) It shows how to use
+  a punctuation model to add punctuations to text
+
 - [./non-streaming-decode-files](./non-streaming-decode-files) It shows how to use
   a non-streaming ASR model to decode files
 
+- [./non-streaming-speaker-diarization](./non-streaming-speaker-diarization) It shows how to use
+  a speaker segmentation model and a speaker embedding model for speaker diarization.
+
 - [./non-streaming-tts](./non-streaming-tts) It shows how to use a non-streaming TTS
   model to convert text to speech
 
 - [./real-time-speech-recognition-from-microphone](./real-time-speech-recognition-from-microphone)
   It shows how to use a streaming ASR model to recognize speech from a microphone in real-time
 
+- [./speaker-identification](./speaker-identification) It shows how to use a speaker
+  embedding model for speaker identification.
+
+- [./streaming-decode-files](./streaming-decode-files) It shows how to use a streaming
+  model for streaming speech recognition
+
+- [./streaming-hlg-decoding](./streaming-hlg-decoding) It shows how to use a streaming
+  model for streaming speech recognition with HLG decoding
+
 - [./vad](./vad) It shows how to use silero VAD with Golang.
 
-- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper
+- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
   for speech recognition.
 
-- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
+- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper
+
+- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification.
   for speech recognition.
 
 - [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper
   for spoken language identification.
 
-- [./speaker-identification](./speaker-identification) It shows how to use Go API for speaker identification.
-
-- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification.
-
 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
@@ -34,6 +34,11 @@ func main() {
     flag.StringVar(&config.ModelConfig.Whisper.Task, "whisper-task", "transcribe", "transcribe or translate")
     flag.IntVar(&config.ModelConfig.Whisper.TailPaddings, "whisper-tail-paddings", -1, "tail paddings for whisper")
 
+    flag.StringVar(&config.ModelConfig.Moonshine.Preprocessor, "moonshine-preprocessor", "", "Path to the moonshine preprocessor model")
+    flag.StringVar(&config.ModelConfig.Moonshine.Encoder, "moonshine-encoder", "", "Path to the moonshine encoder model")
+    flag.StringVar(&config.ModelConfig.Moonshine.UncachedDecoder, "moonshine-uncached-decoder", "", "Path to the moonshine uncached decoder model")
+    flag.StringVar(&config.ModelConfig.Moonshine.CachedDecoder, "moonshine-cached-decoder", "", "Path to the moonshine cached decoder model")
+
     flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model")
 
     flag.StringVar(&config.ModelConfig.SenseVoice.Model, "sense-voice-model", "", "Path to the SenseVoice model")
@@ -85,12 +90,8 @@ func main() {
     log.Println("Emotion: " + result.Emotion)
     log.Println("Lang: " + result.Lang)
     log.Println("Event: " + result.Event)
-    for _, v := range result.Timestamps {
-        log.Printf("Timestamp: %+v\n", v)
-    }
-    for _, v := range result.Tokens {
-        log.Println("Token: " + v)
-    }
+    log.Printf("Timestamp: %v\n", result.Timestamps)
+    log.Printf("Tokens: %v\n", result.Tokens)
     log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate))
 }
 
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+fi
+
+go mod tidy
+go build
+
+./non-streaming-decode-files \
+  --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \
+  --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \
+  --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \
+  --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \
+  --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt \
+  ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav
+../../../../go-api-examples/non-streaming-decode-files/run-moonshine.sh
@@ -382,6 +382,13 @@ type OfflineWhisperModelConfig struct {
     TailPaddings int
 }
 
+type OfflineMoonshineModelConfig struct {
+    Preprocessor    string
+    Encoder         string
+    UncachedDecoder string
+    CachedDecoder   string
+}
+
 type OfflineTdnnModelConfig struct {
     Model string
 }
@@ -405,6 +412,7 @@ type OfflineModelConfig struct {
     Whisper    OfflineWhisperModelConfig
     Tdnn       OfflineTdnnModelConfig
     SenseVoice OfflineSenseVoiceModelConfig
+    Moonshine  OfflineMoonshineModelConfig
     Tokens     string // Path to tokens.txt
 
     // Number of threads to use for neural network computation
@@ -515,6 +523,18 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer {
 
     c.model_config.sense_voice.use_itn = C.int(config.ModelConfig.SenseVoice.UseInverseTextNormalization)
 
+    c.model_config.moonshine.preprocessor = C.CString(config.ModelConfig.Moonshine.Preprocessor)
+    defer C.free(unsafe.Pointer(c.model_config.moonshine.preprocessor))
+
+    c.model_config.moonshine.encoder = C.CString(config.ModelConfig.Moonshine.Encoder)
+    defer C.free(unsafe.Pointer(c.model_config.moonshine.encoder))
+
+    c.model_config.moonshine.uncached_decoder = C.CString(config.ModelConfig.Moonshine.UncachedDecoder)
+    defer C.free(unsafe.Pointer(c.model_config.moonshine.uncached_decoder))
+
+    c.model_config.moonshine.cached_decoder = C.CString(config.ModelConfig.Moonshine.CachedDecoder)
+    defer C.free(unsafe.Pointer(c.model_config.moonshine.cached_decoder))
+
     c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
     defer C.free(unsafe.Pointer(c.model_config.tokens))
 
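For context, here is a minimal sketch of how a caller might fill in the new Moonshine fields and build a recognizer from the Go API changed above. The names taken from this diff are OfflineMoonshineModelConfig, OfflineModelConfig.Moonshine, Tokens, and NewOfflineRecognizer; the model file paths assume the sherpa-onnx-moonshine-tiny-en-int8 bundle that run-moonshine.sh downloads; the remaining helpers (FeatConfig, NumThreads, DecodingMethod, NewOfflineStream, AcceptWaveform, Decode, GetResult and the Delete functions) are assumed from the existing sherpa-onnx Go API and are not part of this diff, so treat the snippet as an illustration rather than code from the repository.

package main

import (
    "log"

    sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
)

func main() {
    // Configure the new Moonshine offline model; paths assume the
    // sherpa-onnx-moonshine-tiny-en-int8 bundle from run-moonshine.sh.
    config := sherpa.OfflineRecognizerConfig{}
    config.FeatConfig.SampleRate = 16000
    config.FeatConfig.FeatureDim = 80
    config.DecodingMethod = "greedy_search"
    config.ModelConfig.Moonshine = sherpa.OfflineMoonshineModelConfig{
        Preprocessor:    "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx",
        Encoder:         "./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx",
        UncachedDecoder: "./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx",
        CachedDecoder:   "./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx",
    }
    config.ModelConfig.Tokens = "./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt"
    config.ModelConfig.NumThreads = 1

    recognizer := sherpa.NewOfflineRecognizer(&config)
    defer sherpa.DeleteOfflineRecognizer(recognizer)

    // samples: 16 kHz mono audio as float32 in [-1, 1], loaded with any wav reader.
    var samples []float32

    stream := sherpa.NewOfflineStream(recognizer)
    defer sherpa.DeleteOfflineStream(stream)

    stream.AcceptWaveform(16000, samples)
    recognizer.Decode(stream)
    log.Println(stream.GetResult().Text)
}

The shell script run-moonshine.sh added in this change exercises the same path end to end via the non-streaming-decode-files example and its new --moonshine-* flags.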