Fangjun Kuang
Committed by GitHub

Fix tokens for byte-level BPE token. (#324)

cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)
set(SHERPA_ONNX_VERSION "1.7.15")
set(SHERPA_ONNX_VERSION "1.7.16")
# Disable warning about
#
... ...
... ... @@ -46,6 +46,19 @@ void SymbolTable::Init(std::istream &is) {
}
}
// for byte-level BPE
// id 0 is blank, id 1 is sos/eos, id 2 is unk, so ids 3..258 correspond to
// byte values 0x00..0xFF. A token spelled "<0xHH>" whose two hex digits equal
// (id - 3) is replaced by the single raw byte it denotes.
if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' &&
    sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') {
  // Decode one hex digit, accepting both cases; returns -1 for a non-digit.
  // Parsing the digits (rather than formatting id - 3 with std::hex and
  // comparing strings) also matches zero-padded values such as "0A" and
  // upper-case digits such as "AB", which an unpadded lower-case rendering
  // would never equal.
  auto hex_value = [](char c) -> int {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    return -1;
  };

  const int high = hex_value(sym[3]);
  const int low = hex_value(sym[4]);
  if (high >= 0 && low >= 0 && high * 16 + low == id - 3) {
    // id - 3 is in [0, 255] here, so the narrowing to one byte is lossless.
    const uint8_t byte = static_cast<uint8_t>(id - 3);
    sym = std::string(1, static_cast<char>(byte));
  }
}
assert(!sym.empty());
assert(sym2id_.count(sym) == 0);
assert(id2sym_.count(id) == 0);
... ...