Karel Vesely
Committed by GitHub

Fixes issue #535: fix hex-encoded 1-char tokens in ASR output. (#550)

- Avoid output like: `[' K', '<0x64>', '<0x79>', 'ť', ' a', '<0x75>',
  'to', 'bu', '<0x73>', '<0x75>', ... ]` with regular 500 BPE units.
- Don't rewrite 1-char tokens in the printable ASCII range [0x20 (space) .. 0x7E (tilde)].
@@ -45,8 +45,10 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, @@ -45,8 +45,10 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
45 auto sym = sym_table[src.tokens[i]]; 45 auto sym = sym_table[src.tokens[i]];
46 text.append(sym); 46 text.append(sym);
47 47
48 - if (sym.size() == 1 && sym[0] != ' ') { 48 + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
49 // for byte bpe models 49 // for byte bpe models
  50 + // (but don't rewrite printable characters 0x20..0x7e,
  51 + // which collide with standard BPE units)
50 std::ostringstream os; 52 std::ostringstream os;
51 os << "<0x" << std::hex << std::uppercase 53 os << "<0x" << std::hex << std::uppercase
52 << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; 54 << (static_cast<int32_t>(sym[0]) & 0xff) << ">";
@@ -46,8 +46,10 @@ static OfflineRecognitionResult Convert( @@ -46,8 +46,10 @@ static OfflineRecognitionResult Convert(
46 auto sym = sym_table[i]; 46 auto sym = sym_table[i];
47 text.append(sym); 47 text.append(sym);
48 48
49 - if (sym.size() == 1 && sym[0] != ' ') {  
50 - // for byte bpe models 49 + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
  50 + // for byte bpe models,
  51 + // (but don't rewrite printable characters 0x20..0x7e,
  52 + // which collide with standard BPE units)
51 std::ostringstream os; 53 std::ostringstream os;
52 os << "<0x" << std::hex << std::uppercase 54 os << "<0x" << std::hex << std::uppercase
53 << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; 55 << (static_cast<int32_t>(sym[0]) & 0xff) << ">";
@@ -38,8 +38,10 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, @@ -38,8 +38,10 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
38 38
39 r.text.append(sym); 39 r.text.append(sym);
40 40
41 - if (sym.size() == 1 && sym[0] != ' ') { 41 + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
42 // for byte bpe models 42 // for byte bpe models
  43 + // (but don't rewrite printable characters 0x20..0x7e,
  44 + // which collide with standard BPE units)
43 std::ostringstream os; 45 std::ostringstream os;
44 os << "<0x" << std::hex << std::uppercase 46 os << "<0x" << std::hex << std::uppercase
45 << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; 47 << (static_cast<int32_t>(sym[0]) & 0xff) << ">";
@@ -50,8 +50,10 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, @@ -50,8 +50,10 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
50 50
51 r.text.append(sym); 51 r.text.append(sym);
52 52
53 - if (sym.size() == 1 && sym[0] != ' ') { 53 + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
54 // for byte bpe models 54 // for byte bpe models
  55 + // (but don't rewrite printable characters 0x20..0x7e,
  56 + // which collide with standard BPE units)
55 std::ostringstream os; 57 std::ostringstream os;
56 os << "<0x" << std::hex << std::uppercase 58 os << "<0x" << std::hex << std::uppercase
57 << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; 59 << (static_cast<int32_t>(sym[0]) & 0xff) << ">";