Fangjun Kuang
Committed by GitHub

Add Go API for Keyword spotting (#1662)

@@ -68,6 +68,13 @@ jobs: @@ -68,6 +68,13 @@ jobs:
68 run: | 68 run: |
69 gcc --version 69 gcc --version
70 70
  71 + - name: Test Keyword spotting
  72 + if: matrix.os != 'windows-latest'
  73 + shell: bash
  74 + run: |
  75 + cd go-api-examples/keyword-spotting-from-file/
  76 + ./run.sh
  77 +
71 - name: Test adding punctuation 78 - name: Test adding punctuation
72 if: matrix.os != 'windows-latest' 79 if: matrix.os != 'windows-latest'
73 shell: bash 80 shell: bash
@@ -134,6 +134,15 @@ jobs: @@ -134,6 +134,15 @@ jobs:
134 name: ${{ matrix.os }}-libs 134 name: ${{ matrix.os }}-libs
135 path: to-upload/ 135 path: to-upload/
136 136
  137 + - name: Test Keyword spotting
  138 + shell: bash
  139 + run: |
  140 + cd scripts/go/_internal/keyword-spotting-from-file/
  141 +
  142 + ./run.sh
  143 +
  144 + ls -lh
  145 +
137 - name: Test non-streaming decoding files 146 - name: Test non-streaming decoding files
138 shell: bash 147 shell: bash
139 run: | 148 run: |
  1 +module keyword-spotting-from-file
  2 +
  3 +go 1.12
  4 +
  1 +package main
  2 +
  3 +import (
  4 + sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
  5 + "log"
  6 +)
  7 +
  8 +func main() {
  9 + log.SetFlags(log.LstdFlags | log.Lmicroseconds)
  10 +
  11 + config := sherpa.KeywordSpotterConfig{}
  12 +
  13 + // Please download the models from
  14 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models
  15 +
  16 + config.ModelConfig.Transducer.Encoder = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx"
  17 + config.ModelConfig.Transducer.Decoder = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx"
  18 + config.ModelConfig.Transducer.Joiner = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx"
  19 + config.ModelConfig.Tokens = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt"
  20 + config.KeywordsFile = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt"
  21 + config.ModelConfig.NumThreads = 1
  22 + config.ModelConfig.Debug = 1
  23 +
  24 + spotter := sherpa.NewKeywordSpotter(&config)
  25 + defer sherpa.DeleteKeywordSpotter(spotter)
  26 +
  27 + wave_filename := "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav"
  28 +
  29 + wave := sherpa.ReadWave(wave_filename)
  30 + if wave == nil {
  31 + log.Printf("Failed to read %v\n", wave_filename)
  32 + return
  33 + }
  34 +
  35 + log.Println("----------Use pre-defined keywords----------")
  36 +
  37 + stream := sherpa.NewKeywordStream(spotter)
  38 + defer sherpa.DeleteOnlineStream(stream)
  39 +
  40 + stream.AcceptWaveform(wave.SampleRate, wave.Samples)
  41 +
  42 + for spotter.IsReady(stream) {
  43 + spotter.Decode(stream)
  44 + result := spotter.GetResult(stream)
  45 + if result.Keyword != "" {
  46 + log.Printf("Detected %v\n", result.Keyword)
  47 + }
  48 + }
  49 +
  50 + log.Println("----------Use pre-defined keywords + add a new keyword----------")
  51 +
  52 + stream2 := sherpa.NewKeywordStreamWithKeywords(spotter, "y ǎn y uán @演员")
  53 + defer sherpa.DeleteOnlineStream(stream2)
  54 +
  55 + stream2.AcceptWaveform(wave.SampleRate, wave.Samples)
  56 +
  57 + for spotter.IsReady(stream2) {
  58 + spotter.Decode(stream2)
  59 + result := spotter.GetResult(stream2)
  60 + if result.Keyword != "" {
  61 + log.Printf("Detected %v\n", result.Keyword)
  62 + }
  63 + }
  64 +
  65 + log.Println("----------Use pre-defined keywords + add 2 new keywords----------")
  66 +
  67 + stream3 := sherpa.NewKeywordStreamWithKeywords(spotter, "y ǎn y uán @演员/zh ī m íng @知名")
  68 + defer sherpa.DeleteOnlineStream(stream3)
  69 +
  70 + stream3.AcceptWaveform(wave.SampleRate, wave.Samples)
  71 +
  72 + for spotter.IsReady(stream3) {
  73 + spotter.Decode(stream3)
  74 + result := spotter.GetResult(stream3)
  75 + if result.Keyword != "" {
  76 + log.Printf("Detected %v\n", result.Keyword)
  77 + }
  78 + }
  79 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +if [ ! -f ./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt ]; then
  6 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
  7 + tar xvf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
  8 + rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2
  9 +fi
  10 +
  11 +go mod tidy
  12 +go build
  13 +./keyword-spotting-from-file
  1 +module keyword-spotting-from-file
  2 +
  3 +go 1.12
  4 +
  5 +replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
  1 +../../../../go-api-examples/keyword-spotting-from-file/main.go
  1 +../../../../go-api-examples/keyword-spotting-from-file/run.sh
@@ -1385,3 +1385,151 @@ func (punc *OfflinePunctuation) AddPunct(text string) string { @@ -1385,3 +1385,151 @@ func (punc *OfflinePunctuation) AddPunct(text string) string {
1385 1385
1386 return text_with_punct 1386 return text_with_punct
1387 } 1387 }
  1388 +
  1389 +// Configuration for the online/streaming recognizer.
  1390 +type KeywordSpotterConfig struct {
  1391 + FeatConfig FeatureConfig
  1392 + ModelConfig OnlineModelConfig
  1393 + MaxActivePaths int
  1394 + KeywordsFile string
  1395 + KeywordsScore float32
  1396 + KeywordsThreshold float32
  1397 + KeywordsBuf string
  1398 + KeywordsBufSize int
  1399 +}
  1400 +
  1401 +type KeywordSpotterResult struct {
  1402 + Keyword string
  1403 +}
  1404 +
  1405 +type KeywordSpotter struct {
  1406 + impl *C.struct_SherpaOnnxKeywordSpotter
  1407 +}
  1408 +
  1409 +// Free the internal pointer inside the recognizer to avoid memory leak.
  1410 +func DeleteKeywordSpotter(spotter *KeywordSpotter) {
  1411 + C.SherpaOnnxDestroyKeywordSpotter(spotter.impl)
  1412 + spotter.impl = nil
  1413 +}
  1414 +
  1415 +// The user is responsible to invoke [DeleteKeywordSpotter]() to free
  1416 +// the returned spotter to avoid memory leak
  1417 +func NewKeywordSpotter(config *KeywordSpotterConfig) *KeywordSpotter {
  1418 + c := C.struct_SherpaOnnxKeywordSpotterConfig{}
  1419 + c.feat_config.sample_rate = C.int(config.FeatConfig.SampleRate)
  1420 + c.feat_config.feature_dim = C.int(config.FeatConfig.FeatureDim)
  1421 +
  1422 + c.model_config.transducer.encoder = C.CString(config.ModelConfig.Transducer.Encoder)
  1423 + defer C.free(unsafe.Pointer(c.model_config.transducer.encoder))
  1424 +
  1425 + c.model_config.transducer.decoder = C.CString(config.ModelConfig.Transducer.Decoder)
  1426 + defer C.free(unsafe.Pointer(c.model_config.transducer.decoder))
  1427 +
  1428 + c.model_config.transducer.joiner = C.CString(config.ModelConfig.Transducer.Joiner)
  1429 + defer C.free(unsafe.Pointer(c.model_config.transducer.joiner))
  1430 +
  1431 + c.model_config.paraformer.encoder = C.CString(config.ModelConfig.Paraformer.Encoder)
  1432 + defer C.free(unsafe.Pointer(c.model_config.paraformer.encoder))
  1433 +
  1434 + c.model_config.paraformer.decoder = C.CString(config.ModelConfig.Paraformer.Decoder)
  1435 + defer C.free(unsafe.Pointer(c.model_config.paraformer.decoder))
  1436 +
  1437 + c.model_config.zipformer2_ctc.model = C.CString(config.ModelConfig.Zipformer2Ctc.Model)
  1438 + defer C.free(unsafe.Pointer(c.model_config.zipformer2_ctc.model))
  1439 +
  1440 + c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
  1441 + defer C.free(unsafe.Pointer(c.model_config.tokens))
  1442 +
  1443 + c.model_config.num_threads = C.int(config.ModelConfig.NumThreads)
  1444 +
  1445 + c.model_config.provider = C.CString(config.ModelConfig.Provider)
  1446 + defer C.free(unsafe.Pointer(c.model_config.provider))
  1447 +
  1448 + c.model_config.debug = C.int(config.ModelConfig.Debug)
  1449 +
  1450 + c.model_config.model_type = C.CString(config.ModelConfig.ModelType)
  1451 + defer C.free(unsafe.Pointer(c.model_config.model_type))
  1452 +
  1453 + c.model_config.modeling_unit = C.CString(config.ModelConfig.ModelingUnit)
  1454 + defer C.free(unsafe.Pointer(c.model_config.modeling_unit))
  1455 +
  1456 + c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab)
  1457 + defer C.free(unsafe.Pointer(c.model_config.bpe_vocab))
  1458 +
  1459 + c.model_config.tokens_buf = C.CString(config.ModelConfig.TokensBuf)
  1460 + defer C.free(unsafe.Pointer(c.model_config.tokens_buf))
  1461 +
  1462 + c.model_config.tokens_buf_size = C.int(config.ModelConfig.TokensBufSize)
  1463 +
  1464 + c.max_active_paths = C.int(config.MaxActivePaths)
  1465 +
  1466 + c.keywords_file = C.CString(config.KeywordsFile)
  1467 + defer C.free(unsafe.Pointer(c.keywords_file))
  1468 +
  1469 + c.keywords_score = C.float(config.KeywordsScore)
  1470 +
  1471 + c.keywords_threshold = C.float(config.KeywordsThreshold)
  1472 +
  1473 + c.keywords_buf = C.CString(config.KeywordsBuf)
  1474 + defer C.free(unsafe.Pointer(c.keywords_buf))
  1475 +
  1476 + c.keywords_buf_size = C.int(config.KeywordsBufSize)
  1477 +
  1478 + spotter := &KeywordSpotter{}
  1479 + spotter.impl = C.SherpaOnnxCreateKeywordSpotter(&c)
  1480 +
  1481 + return spotter
  1482 +}
  1483 +
  1484 +// The user is responsible to invoke [DeleteOnlineStream]() to free
  1485 +// the returned stream to avoid memory leak
  1486 +func NewKeywordStream(spotter *KeywordSpotter) *OnlineStream {
  1487 + stream := &OnlineStream{}
  1488 + stream.impl = C.SherpaOnnxCreateKeywordStream(spotter.impl)
  1489 + return stream
  1490 +}
  1491 +
  1492 +// The user is responsible to invoke [DeleteOnlineStream]() to free
  1493 +// the returned stream to avoid memory leak
  1494 +func NewKeywordStreamWithKeywords(spotter *KeywordSpotter, keywords string) *OnlineStream {
  1495 + stream := &OnlineStream{}
  1496 +
  1497 + s := C.CString(keywords)
  1498 + defer C.free(unsafe.Pointer(s))
  1499 +
  1500 + stream.impl = C.SherpaOnnxCreateKeywordStreamWithKeywords(spotter.impl, s)
  1501 + return stream
  1502 +}
  1503 +
  1504 +// Check whether the stream has enough feature frames for decoding.
  1505 +// Return true if this stream is ready for decoding. Return false otherwise.
  1506 +//
  1507 +// You will usually use it like below:
  1508 +//
  1509 +// for spotter.IsReady(s) {
  1510 +// spotter.Decode(s)
  1511 +// }
  1512 +func (spotter *KeywordSpotter) IsReady(s *OnlineStream) bool {
  1513 + return C.SherpaOnnxIsKeywordStreamReady(spotter.impl, s.impl) == 1
  1514 +}
  1515 +
  1516 +// Decode the stream. Before calling this function, you have to ensure
  1517 +// that spotter.IsReady(s) returns true. Otherwise, you will be SAD.
  1518 +//
  1519 +// You usually use it like below:
  1520 +//
  1521 +// for spotter.IsReady(s) {
  1522 +// spotter.Decode(s)
  1523 +// }
  1524 +func (spotter *KeywordSpotter) Decode(s *OnlineStream) {
  1525 + C.SherpaOnnxDecodeKeywordStream(spotter.impl, s.impl)
  1526 +}
  1527 +
  1528 +// Get the current result of stream since the last invoke of Reset()
  1529 +func (spotter *KeywordSpotter) GetResult(s *OnlineStream) *KeywordSpotterResult {
  1530 + p := C.SherpaOnnxGetKeywordResult(spotter.impl, s.impl)
  1531 + defer C.SherpaOnnxDestroyKeywordResult(p)
  1532 + result := &KeywordSpotterResult{}
  1533 + result.Keyword = C.GoString(p.keyword)
  1534 + return result
  1535 +}