Fangjun Kuang
Committed by GitHub

Fix displaying English words for paraformer models. (#114)

@@ -28,11 +28,51 @@ static OfflineRecognitionResult Convert( @@ -28,11 +28,51 @@ static OfflineRecognitionResult Convert(
28 r.tokens.reserve(src.tokens.size()); 28 r.tokens.reserve(src.tokens.size());
29 29
30 std::string text; 30 std::string text;
31 - for (auto i : src.tokens) {  
32 - auto sym = sym_table[i];  
33 - text.append(sym);  
34 31
35 - r.tokens.push_back(std::move(sym)); 32 + // When the current token ends with "@@" we set mergeable to true
  33 + bool mergeable = false;
  34 +
  35 + for (int32_t i = 0; i != src.tokens.size(); ++i) {
  36 + auto sym = sym_table[src.tokens[i]];
  37 + r.tokens.push_back(sym);
  38 +
  39 + if ((sym.back() != '@') || (sym.size() > 2 && sym[sym.size() - 2] != '@')) {
  40 + // sym does not end with "@@"
  41 + const uint8_t *p = reinterpret_cast<const uint8_t *>(sym.c_str());
  42 + if (p[0] < 0x80) {
  43 + // an ascii
  44 + if (mergeable) {
  45 + mergeable = false;
  46 + text.append(sym);
  47 + } else {
  48 + text.append(" ");
  49 + text.append(sym);
  50 + }
  51 + } else {
  52 + // not an ascii
  53 + mergeable = false;
  54 +
  55 + if (i > 0) {
  56 + const uint8_t *p = reinterpret_cast<const uint8_t *>(
  57 + sym_table[src.tokens[i - 1]].c_str());
  58 + if (p[0] < 0x80) {
  59 + // put a space between ascii and non-ascii
  60 + text.append(" ");
  61 + }
  62 + }
  63 + text.append(sym);
  64 + }
  65 + } else {
  66 + // this sym ends with @@
  67 + sym = std::string(sym.data(), sym.size() - 2);
  68 + if (mergeable) {
  69 + text.append(sym);
  70 + } else {
  71 + text.append(" ");
  72 + text.append(sym);
  73 + mergeable = true;
  74 + }
  75 + }
36 } 76 }
37 r.text = std::move(text); 77 r.text = std::move(text);
38 78