Fangjun Kuang
Committed by GitHub

Add Go API for Moonshine models (#1479)

@@ -134,6 +134,53 @@ jobs: @@ -134,6 +134,53 @@ jobs:
134 name: ${{ matrix.os }}-libs 134 name: ${{ matrix.os }}-libs
135 path: to-upload/ 135 path: to-upload/
136 136
  137 + - name: Test non-streaming decoding files
  138 + shell: bash
  139 + run: |
  140 + cd scripts/go/_internal/non-streaming-decode-files/
  141 + ls -lh
  142 + go mod tidy
  143 + cat go.mod
  144 + go build
  145 + ls -lh
  146 +
  147 + echo "Test Moonshine"
  148 + ./run-moonshine.sh
  149 + rm -rf sherpa-onnx-*
  150 +
  151 + echo "Test SenseVoice ctc"
  152 + ./run-sense-voice-small.sh
  153 + rm -rf sherpa-onnx-sense-*
  154 +
  155 + echo "Test telespeech ctc"
  156 + ./run-telespeech-ctc.sh
  157 + rm -rf sherpa-onnx-telespeech-ctc-*
  158 +
  159 + echo "Test transducer"
  160 + ./run-transducer.sh
  161 + rm -rf sherpa-onnx-zipformer-en-2023-06-26
  162 +
  163 + echo "Test transducer"
  164 + ./run-transducer.sh
  165 + rm -rf sherpa-onnx-zipformer-en-2023-06-26
  166 +
  167 + echo "Test paraformer"
  168 + ./run-paraformer.sh
  169 + ./run-paraformer-itn.sh
  170 + rm -rf sherpa-onnx-paraformer-zh-2023-09-14
  171 +
  172 + echo "Test NeMo CTC"
  173 + ./run-nemo-ctc.sh
  174 + rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium
  175 +
  176 + echo "Test Whisper tiny.en"
  177 + ./run-whisper.sh
  178 + rm -rf sherpa-onnx-whisper-tiny.en
  179 +
  180 + echo "Test Tdnn yesno"
  181 + ./run-tdnn-yesno.sh
  182 + rm -rf sherpa-onnx-tdnn-yesno
  183 +
137 - name: Test adding punctuation 184 - name: Test adding punctuation
138 shell: bash 185 shell: bash
139 run: | 186 run: |
@@ -193,49 +240,6 @@ jobs: @@ -193,49 +240,6 @@ jobs:
193 name: tts-waves-${{ matrix.os }} 240 name: tts-waves-${{ matrix.os }}
194 path: tts-waves 241 path: tts-waves
195 242
196 - - name: Test non-streaming decoding files  
197 - shell: bash  
198 - run: |  
199 - cd scripts/go/_internal/non-streaming-decode-files/  
200 - ls -lh  
201 - go mod tidy  
202 - cat go.mod  
203 - go build  
204 - ls -lh  
205 -  
206 - echo "Test SenseVoice ctc"  
207 - ./run-sense-voice-small.sh  
208 - rm -rf sherpa-onnx-sense-*  
209 -  
210 - echo "Test telespeech ctc"  
211 - ./run-telespeech-ctc.sh  
212 - rm -rf sherpa-onnx-telespeech-ctc-*  
213 -  
214 - echo "Test transducer"  
215 - ./run-transducer.sh  
216 - rm -rf sherpa-onnx-zipformer-en-2023-06-26  
217 -  
218 - echo "Test transducer"  
219 - ./run-transducer.sh  
220 - rm -rf sherpa-onnx-zipformer-en-2023-06-26  
221 -  
222 - echo "Test paraformer"  
223 - ./run-paraformer.sh  
224 - ./run-paraformer-itn.sh  
225 - rm -rf sherpa-onnx-paraformer-zh-2023-09-14  
226 -  
227 - echo "Test NeMo CTC"  
228 - ./run-nemo-ctc.sh  
229 - rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium  
230 -  
231 - echo "Test Whisper tiny.en"  
232 - ./run-whisper.sh  
233 - rm -rf sherpa-onnx-whisper-tiny.en  
234 -  
235 - echo "Test Tdnn yesno"  
236 - ./run-tdnn-yesno.sh  
237 - rm -rf sherpa-onnx-tdnn-yesno  
238 -  
239 - name: Test streaming decoding files 243 - name: Test streaming decoding files
240 shell: bash 244 shell: bash
241 run: | 245 run: |
@@ -6,28 +6,41 @@ Please refer to the documentation @@ -6,28 +6,41 @@ Please refer to the documentation
6 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html 6 https://k2-fsa.github.io/sherpa/onnx/go-api/index.html
7 for details. 7 for details.
8 8
  9 +- [./add-punctuation](./add-punctuation) It shows how to use
  10 + a punctuation model to add punctuation to text
  11 +
9 - [./non-streaming-decode-files](./non-streaming-decode-files) It shows how to use 12 - [./non-streaming-decode-files](./non-streaming-decode-files) It shows how to use
10 a non-streaming ASR model to decode files 13 a non-streaming ASR model to decode files
11 14
  15 +- [./non-streaming-speaker-diarization](./non-streaming-speaker-diarization) It shows how to use
  16 + a speaker segmentation model and a speaker embedding model for speaker diarization.
  17 +
12 - [./non-streaming-tts](./non-streaming-tts) It shows how to use a non-streaming TTS 18 - [./non-streaming-tts](./non-streaming-tts) It shows how to use a non-streaming TTS
13 model to convert text to speech 19 model to convert text to speech
14 20
15 - [./real-time-speech-recognition-from-microphone](./real-time-speech-recognition-from-microphone) 21 - [./real-time-speech-recognition-from-microphone](./real-time-speech-recognition-from-microphone)
16 It shows how to use a streaming ASR model to recognize speech from a microphone in real-time 22 It shows how to use a streaming ASR model to recognize speech from a microphone in real-time
17 23
  24 +- [./speaker-identification](./speaker-identification) It shows how to use a speaker
  25 + embedding model for speaker identification.
  26 +
  27 +- [./streaming-decode-files](./streaming-decode-files) It shows how to use a streaming
  28 + model for streaming speech recognition
  29 +
  30 +- [./streaming-hlg-decoding](./streaming-hlg-decoding) It shows how to use a streaming
  31 + model for streaming speech recognition with HLG decoding
  32 +
18 - [./vad](./vad) It shows how to use silero VAD with Golang. 33 - [./vad](./vad) It shows how to use silero VAD with Golang.
19 34
20 -- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper 35 +- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
21 for speech recognition. 36 for speech recognition.
22 37
23 -- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer 38 +- [./vad-asr-whisper](./vad-asr-whisper) It shows how to use silero VAD + Whisper
24 for speech recognition. 39 for speech recognition.
  40 +
  41 +- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification.
25 42
26 - [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper 43 - [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper
27 for spoken language identification. 44 for spoken language identification.
28 45
29 -- [./speaker-identification](./speaker-identification) It shows how to use Go API for speaker identification.  
30 -  
31 -- [./vad-speaker-identification](./vad-speaker-identification) It shows how to use Go API for VAD + speaker identification.  
32 -  
33 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx 46 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
@@ -34,6 +34,11 @@ func main() { @@ -34,6 +34,11 @@ func main() {
34 flag.StringVar(&config.ModelConfig.Whisper.Task, "whisper-task", "transcribe", "transcribe or translate") 34 flag.StringVar(&config.ModelConfig.Whisper.Task, "whisper-task", "transcribe", "transcribe or translate")
35 flag.IntVar(&config.ModelConfig.Whisper.TailPaddings, "whisper-tail-paddings", -1, "tail paddings for whisper") 35 flag.IntVar(&config.ModelConfig.Whisper.TailPaddings, "whisper-tail-paddings", -1, "tail paddings for whisper")
36 36
  37 + flag.StringVar(&config.ModelConfig.Moonshine.Preprocessor, "moonshine-preprocessor", "", "Path to the moonshine preprocessor model")
  38 + flag.StringVar(&config.ModelConfig.Moonshine.Encoder, "moonshine-encoder", "", "Path to the moonshine encoder model")
  39 + flag.StringVar(&config.ModelConfig.Moonshine.UncachedDecoder, "moonshine-uncached-decoder", "", "Path to the moonshine uncached decoder model")
  40 + flag.StringVar(&config.ModelConfig.Moonshine.CachedDecoder, "moonshine-cached-decoder", "", "Path to the moonshine cached decoder model")
  41 +
37 flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model") 42 flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model")
38 43
39 flag.StringVar(&config.ModelConfig.SenseVoice.Model, "sense-voice-model", "", "Path to the SenseVoice model") 44 flag.StringVar(&config.ModelConfig.SenseVoice.Model, "sense-voice-model", "", "Path to the SenseVoice model")
@@ -85,12 +90,8 @@ func main() { @@ -85,12 +90,8 @@ func main() {
85 log.Println("Emotion: " + result.Emotion) 90 log.Println("Emotion: " + result.Emotion)
86 log.Println("Lang: " + result.Lang) 91 log.Println("Lang: " + result.Lang)
87 log.Println("Event: " + result.Event) 92 log.Println("Event: " + result.Event)
88 - for _, v := range result.Timestamps {  
89 - log.Printf("Timestamp: %+v\n", v)  
90 - }  
91 - for _, v := range result.Tokens {  
92 - log.Println("Token: " + v)  
93 - } 93 + log.Printf("Timestamp: %v\n", result.Timestamps)
  94 + log.Printf("Tokens: %v\n", result.Tokens)
94 log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate)) 95 log.Printf("Wave duration: %v seconds", float32(len(samples))/float32(sampleRate))
95 } 96 }
96 97
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  7 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  8 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +
  14 +./non-streaming-decode-files \
  15 + --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \
  16 + --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \
  17 + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \
  18 + --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \
  19 + --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt \
  20 + ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav
  21 +
  1 +../../../../go-api-examples/non-streaming-decode-files/run-moonshine.sh
@@ -382,6 +382,13 @@ type OfflineWhisperModelConfig struct { @@ -382,6 +382,13 @@ type OfflineWhisperModelConfig struct {
382 TailPaddings int 382 TailPaddings int
383 } 383 }
384 384
  385 +type OfflineMoonshineModelConfig struct {
  386 + Preprocessor string
  387 + Encoder string
  388 + UncachedDecoder string
  389 + CachedDecoder string
  390 +}
  391 +
385 type OfflineTdnnModelConfig struct { 392 type OfflineTdnnModelConfig struct {
386 Model string 393 Model string
387 } 394 }
@@ -405,6 +412,7 @@ type OfflineModelConfig struct { @@ -405,6 +412,7 @@ type OfflineModelConfig struct {
405 Whisper OfflineWhisperModelConfig 412 Whisper OfflineWhisperModelConfig
406 Tdnn OfflineTdnnModelConfig 413 Tdnn OfflineTdnnModelConfig
407 SenseVoice OfflineSenseVoiceModelConfig 414 SenseVoice OfflineSenseVoiceModelConfig
  415 + Moonshine OfflineMoonshineModelConfig
408 Tokens string // Path to tokens.txt 416 Tokens string // Path to tokens.txt
409 417
410 // Number of threads to use for neural network computation 418 // Number of threads to use for neural network computation
@@ -515,6 +523,18 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { @@ -515,6 +523,18 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer {
515 523
516 c.model_config.sense_voice.use_itn = C.int(config.ModelConfig.SenseVoice.UseInverseTextNormalization) 524 c.model_config.sense_voice.use_itn = C.int(config.ModelConfig.SenseVoice.UseInverseTextNormalization)
517 525
  526 + c.model_config.moonshine.preprocessor = C.CString(config.ModelConfig.Moonshine.Preprocessor)
  527 + defer C.free(unsafe.Pointer(c.model_config.moonshine.preprocessor))
  528 +
  529 + c.model_config.moonshine.encoder = C.CString(config.ModelConfig.Moonshine.Encoder)
  530 + defer C.free(unsafe.Pointer(c.model_config.moonshine.encoder))
  531 +
  532 + c.model_config.moonshine.uncached_decoder = C.CString(config.ModelConfig.Moonshine.UncachedDecoder)
  533 + defer C.free(unsafe.Pointer(c.model_config.moonshine.uncached_decoder))
  534 +
  535 + c.model_config.moonshine.cached_decoder = C.CString(config.ModelConfig.Moonshine.CachedDecoder)
  536 + defer C.free(unsafe.Pointer(c.model_config.moonshine.cached_decoder))
  537 +
518 c.model_config.tokens = C.CString(config.ModelConfig.Tokens) 538 c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
519 defer C.free(unsafe.Pointer(c.model_config.tokens)) 539 defer C.free(unsafe.Pointer(c.model_config.tokens))
520 540