Fangjun Kuang
Committed by GitHub

Fix symbol table for byte bpe (#361)

@@ -60,7 +60,16 @@ void SymbolTable::Init(std::istream &is) { @@ -60,7 +60,16 @@ void SymbolTable::Init(std::istream &is) {
60 } 60 }
61 61
62 assert(!sym.empty()); 62 assert(!sym.empty());
63 - assert(sym2id_.count(sym) == 0); 63 +
  64 + // For byte-level BPE, the symbol ▁ is replaced with a space (ASCII 0x20),
  65 + // which collides with the symbol for the real byte 0x20. A duplicate " "
  66 + // entry is therefore expected, so the following check is skipped for " ".
  67 + //
  68 + // Note: Only id2sym_ matters as we use it to convert ID to symbols.
  69 + if (sym != " ") {
  70 + assert(sym2id_.count(sym) == 0);
  71 + }
  72 +
64 assert(id2sym_.count(id) == 0); 73 assert(id2sym_.count(id) == 0);
65 74
66 sym2id_.insert({sym, id}); 75 sym2id_.insert({sym, id});