Fangjun Kuang
Committed by GitHub

Fix token handling for byte-level BPE. (#324)

1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.7.15") 4 +set(SHERPA_ONNX_VERSION "1.7.16")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -46,6 +46,19 @@ void SymbolTable::Init(std::istream &is) { @@ -46,6 +46,19 @@ void SymbolTable::Init(std::istream &is) {
46 } 46 }
47 } 47 }
48 48
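  49 + // For byte-level BPE: a symbol of the form <0xHH> whose hex digits match (id - 3) is replaced by that single byte.
  50 + // id 0 is blank, id 1 is sos/eos, id 2 is unk, so byte symbols are expected at ids 3..258.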
  51 + if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' &&
  52 + sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') {
  53 + std::ostringstream os;
  54 + os << std::hex << (id - 3);
  55 +
  56 + if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) {
  57 + uint8_t i = id - 3;
  58 + sym = std::string(&i, &i + 1);
  59 + }
  60 + }
  61 +
49 assert(!sym.empty()); 62 assert(!sym.empty());
50 assert(sym2id_.count(sym) == 0); 63 assert(sym2id_.count(sym) == 0);
51 assert(id2sym_.count(id) == 0); 64 assert(id2sym_.count(id) == 0);