keyword-spotter.h 4.3 KB
// sherpa-onnx/csrc/keyword-spotter.h
//
// Copyright (c)  2023-2024  Xiaomi Corporation

#ifndef SHERPA_ONNX_CSRC_KEYWORD_SPOTTER_H_
#define SHERPA_ONNX_CSRC_KEYWORD_SPOTTER_H_

#include <memory>
#include <string>
#include <vector>

#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif

#include "sherpa-onnx/csrc/features.h"
#include "sherpa-onnx/csrc/online-model-config.h"
#include "sherpa-onnx/csrc/online-stream.h"
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
#include "sherpa-onnx/csrc/parse-options.h"

namespace sherpa_onnx {

struct KeywordResult {
  /// The triggered keyword.
  /// For English, it consists of space separated words.
  /// For Chinese, it consists of Chinese words without spaces.
  /// Example 1: "hello world"
  /// Example 2: "你好世界"
  std::string keyword;

  /// Decoded results at the token level.
  /// For instance, for BPE-based models it consists of a list of BPE tokens.
  std::vector<std::string> tokens;

  /// timestamps.size() == tokens.size()
  /// timestamps[i] records the time in seconds when tokens[i] is decoded.
  std::vector<float> timestamps;

  /// Starting time of this segment.
  /// When an endpoint is detected, it will change
  float start_time = 0;

  /** Return a json string.
   *
   * The returned string contains:
   *   {
   *     "keyword": "The triggered keyword",
   *     "tokens": [x, x, x],
   *     "timestamps": [x, x, x],
   *     "start_time": x,
   *   }
   */
  std::string AsJsonString() const;
};

struct KeywordSpotterConfig {
  FeatureExtractorConfig feat_config;
  OnlineModelConfig model_config;

  int32_t max_active_paths = 4;

  int32_t num_trailing_blanks = 1;

  float keywords_score = 1.0;

  float keywords_threshold = 0.25;

  std::string keywords_file;

  /// if keywords_buf is non-empty,
  /// the keywords will be loaded from the buffer instead of from the
  /// "keywrods_file"
  std::string keywords_buf;

  KeywordSpotterConfig() = default;

  KeywordSpotterConfig(const FeatureExtractorConfig &feat_config,
                       const OnlineModelConfig &model_config,
                       int32_t max_active_paths, int32_t num_trailing_blanks,
                       float keywords_score, float keywords_threshold,
                       const std::string &keywords_file)
      : feat_config(feat_config),
        model_config(model_config),
        max_active_paths(max_active_paths),
        num_trailing_blanks(num_trailing_blanks),
        keywords_score(keywords_score),
        keywords_threshold(keywords_threshold),
        keywords_file(keywords_file) {}

  void Register(ParseOptions *po);
  bool Validate() const;

  std::string ToString() const;
};

class KeywordSpotterImpl;

class KeywordSpotter {
 public:
  explicit KeywordSpotter(const KeywordSpotterConfig &config);

#if __ANDROID_API__ >= 9
  KeywordSpotter(AAssetManager *mgr, const KeywordSpotterConfig &config);
#endif

  ~KeywordSpotter();

  /** Create a stream for decoding.
   *
   */
  std::unique_ptr<OnlineStream> CreateStream() const;

  /** Create a stream for decoding.
   *
   *  @param The keywords for this string, it might contain several keywords,
   *         the keywords are separated by "/". In each of the keywords, there
   *         are cjkchars or bpes, the bpe/cjkchar are separated by space (" ").
   *         For example, keywords I LOVE YOU and HELLO WORLD, looks like:
   *
   *         "▁I ▁LOVE ▁YOU/▁HE LL O ▁WORLD"
   */
  std::unique_ptr<OnlineStream> CreateStream(const std::string &keywords) const;

  /**
   * Return true if the given stream has enough frames for decoding.
   * Return false otherwise
   */
  bool IsReady(OnlineStream *s) const;

  // Remember to call it after detecting a keyword
  void Reset(OnlineStream *s) const;

  /** Decode a single stream. */
  void DecodeStream(OnlineStream *s) const {
    OnlineStream *ss[1] = {s};
    DecodeStreams(ss, 1);
  }

  /** Decode multiple streams in parallel
   *
   * @param ss Pointer array containing streams to be decoded.
   * @param n Number of streams in `ss`.
   */
  void DecodeStreams(OnlineStream **ss, int32_t n) const;

  KeywordResult GetResult(OnlineStream *s) const;

 private:
  std::unique_ptr<KeywordSpotterImpl> impl_;
};

}  // namespace sherpa_onnx

#endif  // SHERPA_ONNX_CSRC_KEYWORD_SPOTTER_H_