Committed by
GitHub
Fixes issue #535: fix hex-escaped 1-char tokens in ASR output. (#550)
- Avoid output like: `[' K', '<0x64>', '<0x79>', 'ť', ' a', '<0x75>', 'to', 'bu', '<0x73>', '<0x75>', ... ]` when using a regular 500-unit BPE model. - Don't rewrite 1-char tokens in the printable ASCII range [0x20 (space) .. 0x7E (tilde)].
正在显示
4 个修改的文件
包含
13 行增加
和
5 行删除
| @@ -45,8 +45,10 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, | @@ -45,8 +45,10 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, | ||
| 45 | auto sym = sym_table[src.tokens[i]]; | 45 | auto sym = sym_table[src.tokens[i]]; |
| 46 | text.append(sym); | 46 | text.append(sym); |
| 47 | 47 | ||
| 48 | - if (sym.size() == 1 && sym[0] != ' ') { | 48 | + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { |
| 49 | // for byte bpe models | 49 | // for byte bpe models |
| 50 | + // (but don't rewrite printable characters 0x20..0x7e, | ||
| 51 | + // which collide with standard BPE units) | ||
| 50 | std::ostringstream os; | 52 | std::ostringstream os; |
| 51 | os << "<0x" << std::hex << std::uppercase | 53 | os << "<0x" << std::hex << std::uppercase |
| 52 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | 54 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; |
| @@ -46,8 +46,10 @@ static OfflineRecognitionResult Convert( | @@ -46,8 +46,10 @@ static OfflineRecognitionResult Convert( | ||
| 46 | auto sym = sym_table[i]; | 46 | auto sym = sym_table[i]; |
| 47 | text.append(sym); | 47 | text.append(sym); |
| 48 | 48 | ||
| 49 | - if (sym.size() == 1 && sym[0] != ' ') { | ||
| 50 | - // for byte bpe models | 49 | + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { |
| 50 | + // for byte bpe models, | ||
| 51 | + // (but don't rewrite printable characters 0x20..0x7e, | ||
| 52 | + // which collide with standard BPE units) | ||
| 51 | std::ostringstream os; | 53 | std::ostringstream os; |
| 52 | os << "<0x" << std::hex << std::uppercase | 54 | os << "<0x" << std::hex << std::uppercase |
| 53 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | 55 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; |
| @@ -38,8 +38,10 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, | @@ -38,8 +38,10 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, | ||
| 38 | 38 | ||
| 39 | r.text.append(sym); | 39 | r.text.append(sym); |
| 40 | 40 | ||
| 41 | - if (sym.size() == 1 && sym[0] != ' ') { | 41 | + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { |
| 42 | // for byte bpe models | 42 | // for byte bpe models |
| 43 | + // (but don't rewrite printable characters 0x20..0x7e, | ||
| 44 | + // which collide with standard BPE units) | ||
| 43 | std::ostringstream os; | 45 | std::ostringstream os; |
| 44 | os << "<0x" << std::hex << std::uppercase | 46 | os << "<0x" << std::hex << std::uppercase |
| 45 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | 47 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; |
| @@ -50,8 +50,10 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, | @@ -50,8 +50,10 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, | ||
| 50 | 50 | ||
| 51 | r.text.append(sym); | 51 | r.text.append(sym); |
| 52 | 52 | ||
| 53 | - if (sym.size() == 1 && sym[0] != ' ') { | 53 | + if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) { |
| 54 | // for byte bpe models | 54 | // for byte bpe models |
| 55 | + // (but don't rewrite printable characters 0x20..0x7e, | ||
| 56 | + // which collide with standard BPE units) | ||
| 55 | std::ostringstream os; | 57 | std::ostringstream os; |
| 56 | os << "<0x" << std::hex << std::uppercase | 58 | os << "<0x" << std::hex << std::uppercase |
| 57 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | 59 | << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; |
-
请 注册 或 登录 后发表评论