Committed by
GitHub
Fix handling of byte-level BPE tokens. (#324)
正在显示 2 个修改的文件，包含 14 行增加和 1 行删除
| @@ -46,6 +46,19 @@ void SymbolTable::Init(std::istream &is) { | @@ -46,6 +46,19 @@ void SymbolTable::Init(std::istream &is) { | ||
| 46 | } | 46 | } |
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | + // for byte-level BPE | ||
| 50 | + // id 0 is blank, id 1 is sos/eos, id 2 is unk | ||
| 51 | + if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' && | ||
| 52 | + sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') { | ||
| 53 | + std::ostringstream os; | ||
| 54 | + os << std::hex << (id - 3); | ||
| 55 | + | ||
| 56 | + if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) { | ||
| 57 | + uint8_t i = id - 3; | ||
| 58 | + sym = std::string(&i, &i + 1); | ||
| 59 | + } | ||
| 60 | + } | ||
| 61 | + | ||
| 49 | assert(!sym.empty()); | 62 | assert(!sym.empty()); |
| 50 | assert(sym2id_.count(sym) == 0); | 63 | assert(sym2id_.count(sym) == 0); |
| 51 | assert(id2sym_.count(id) == 0); | 64 | assert(id2sym_.count(id) == 0); |
-
请 注册 或 登录 后发表评论