Fangjun Kuang
Committed by GitHub

Add Go API for SenseVoice (#1154)

@@ -191,6 +191,10 @@ jobs: @@ -191,6 +191,10 @@ jobs:
191 go build 191 go build
192 ls -lh 192 ls -lh
193 193
  194 + echo "Test SenseVoice ctc"
  195 + ./run-sense-voice-small.sh
  196 + rm -rf sherpa-onnx-sense-*
  197 +
194 echo "Test telespeech ctc" 198 echo "Test telespeech ctc"
195 ./run-telespeech-ctc.sh 199 ./run-telespeech-ctc.sh
196 rm -rf sherpa-onnx-telespeech-ctc-* 200 rm -rf sherpa-onnx-telespeech-ctc-*
@@ -35,6 +35,10 @@ func main() { @@ -35,6 +35,10 @@ func main() {
35 35
36 flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model") 36 flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model")
37 37
  38 + flag.StringVar(&config.ModelConfig.SenseVoice.Model, "sense-voice-model", "", "Path to the SenseVoice model")
  39 + flag.StringVar(&config.ModelConfig.SenseVoice.Language, "sense-voice-language", "", "If not empty, specify the Language for the input wave")
  40 + flag.IntVar(&config.ModelConfig.SenseVoice.UseInverseTextNormalization, "sense-voice-use-itn", 1, " 1 to use inverse text normalization")
  41 +
38 flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file") 42 flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file")
39 flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing") 43 flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing")
40 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") 44 flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message")
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex # -e: stop at the first failing command; -x: echo each command as it runs
  4 +
  5 +if [ ! -d sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 ]; then # skip the download when the model directory already exists
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  7 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  8 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 # the archive is no longer needed once extracted
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build # builds the example in the current directory; presumably yields ./non-streaming-decode-files, which is run below
  13 +
  14 +./non-streaming-decode-files \
  15 + --sense-voice-model ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx \
  16 + --tokens ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \
  17 + --debug 0 \
  18 + ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav
  1 +../../../../go-api-examples/non-streaming-decode-files/run-sense-voice-small.sh
@@ -370,6 +370,12 @@ type OfflineTdnnModelConfig struct { @@ -370,6 +370,12 @@ type OfflineTdnnModelConfig struct {
370 Model string 370 Model string
371 } 371 }
372 372
  373 +type OfflineSenseVoiceModelConfig struct {
  374 + Model string // Path to the SenseVoice model
  375 + Language string // Language of the input wave; may be left empty
  376 + UseInverseTextNormalization int // 1 to use inverse text normalization; forwarded to the C config as use_itn
  377 +}
  378 +
373 // Configuration for offline LM. 379 // Configuration for offline LM.
374 type OfflineLMConfig struct { 380 type OfflineLMConfig struct {
375 Model string // Path to the model 381 Model string // Path to the model
@@ -382,6 +388,7 @@ type OfflineModelConfig struct { @@ -382,6 +388,7 @@ type OfflineModelConfig struct {
382 NemoCTC OfflineNemoEncDecCtcModelConfig 388 NemoCTC OfflineNemoEncDecCtcModelConfig
383 Whisper OfflineWhisperModelConfig 389 Whisper OfflineWhisperModelConfig
384 Tdnn OfflineTdnnModelConfig 390 Tdnn OfflineTdnnModelConfig
  391 + SenseVoice OfflineSenseVoiceModelConfig
385 Tokens string // Path to tokens.txt 392 Tokens string // Path to tokens.txt
386 393
387 // Number of threads to use for neural network computation 394 // Number of threads to use for neural network computation
@@ -478,6 +485,14 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { @@ -478,6 +485,14 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer {
478 c.model_config.tdnn.model = C.CString(config.ModelConfig.Tdnn.Model) 485 c.model_config.tdnn.model = C.CString(config.ModelConfig.Tdnn.Model)
479 defer C.free(unsafe.Pointer(c.model_config.tdnn.model)) 486 defer C.free(unsafe.Pointer(c.model_config.tdnn.model))
480 487
  488 + c.model_config.sense_voice.model = C.CString(config.ModelConfig.SenseVoice.Model)
  489 + defer C.free(unsafe.Pointer(c.model_config.sense_voice.model))
  490 +
  491 + c.model_config.sense_voice.language = C.CString(config.ModelConfig.SenseVoice.Language)
  492 + defer C.free(unsafe.Pointer(c.model_config.sense_voice.language))
  493 +
  494 + c.model_config.sense_voice.use_itn = C.int(config.ModelConfig.SenseVoice.UseInverseTextNormalization)
  495 +
481 c.model_config.tokens = C.CString(config.ModelConfig.Tokens) 496 c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
482 defer C.free(unsafe.Pointer(c.model_config.tokens)) 497 defer C.free(unsafe.Pointer(c.model_config.tokens))
483 498