Fangjun Kuang
Committed by GitHub

Fix tokens processing for byte-level BPE (#333)

1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.7.17") 4 +set(SHERPA_ONNX_VERSION "1.7.18")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -8,9 +8,9 @@ @@ -8,9 +8,9 @@
8 8
9 #include <algorithm> 9 #include <algorithm>
10 #include <cmath> 10 #include <cmath>
  11 +#include <iomanip>
11 12
12 #include "kaldi-native-fbank/csrc/online-feature.h" 13 #include "kaldi-native-fbank/csrc/online-feature.h"
13 -#include "nlohmann/json.hpp"  
14 #include "sherpa-onnx/csrc/macros.h" 14 #include "sherpa-onnx/csrc/macros.h"
15 #include "sherpa-onnx/csrc/offline-recognizer.h" 15 #include "sherpa-onnx/csrc/offline-recognizer.h"
16 #include "sherpa-onnx/csrc/resample.h" 16 #include "sherpa-onnx/csrc/resample.h"
@@ -256,25 +256,50 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const { @@ -256,25 +256,50 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const {
256 return impl_->GetResult(); 256 return impl_->GetResult();
257 } 257 }
258 std::string OfflineRecognitionResult::AsJsonString() const { 258 std::string OfflineRecognitionResult::AsJsonString() const {
259 - nlohmann::json j;  
260 - j["text"] = text;  
261 - j["tokens"] = tokens;  
262 -#if 1  
263 - // This branch chooses number of decimal points to keep in  
264 - // the return json string  
265 std::ostringstream os; 259 std::ostringstream os;
266 - os << "["; 260 + os << "{";
  261 + os << "\"text\""
  262 + << ": ";
  263 + os << "\"" << text << "\""
  264 + << ", ";
  265 +
  266 + os << "\""
  267 + << "timestamps"
  268 + << "\""
  269 + << ": ";
  270 + os << "\"[";
  271 +
267 std::string sep = ""; 272 std::string sep = "";
268 for (auto t : timestamps) { 273 for (auto t : timestamps) {
269 os << sep << std::fixed << std::setprecision(2) << t; 274 os << sep << std::fixed << std::setprecision(2) << t;
270 sep = ", "; 275 sep = ", ";
271 } 276 }
  277 + os << "]\", ";
  278 +
  279 + os << "\""
  280 + << "tokens"
  281 + << "\""
  282 + << ":";
  283 + os << "[";
  284 +
  285 + sep = "";
  286 + auto oldFlags = os.flags();
  287 + for (const auto &t : tokens) {
  288 + if (t.size() == 1 && static_cast<uint8_t>(t[0]) > 0x7f) {
  289 + const uint8_t *p = reinterpret_cast<const uint8_t *>(t.c_str());
  290 + os << sep << "\""
  291 + << "<0x" << std::hex << std::uppercase << static_cast<uint32_t>(p[0])
  292 + << ">"
  293 + << "\"";
  294 + os.flags(oldFlags);
  295 + } else {
  296 + os << sep << "\"" << t << "\"";
  297 + }
  298 + sep = ", ";
  299 + }
272 os << "]"; 300 os << "]";
273 - j["timestamps"] = os.str();  
274 -#else  
275 - j["timestamps"] = timestamps;  
276 -#endif 301 + os << "}";
277 302
278 - return j.dump(); 303 + return os.str();
279 } 304 }
280 } // namespace sherpa_onnx 305 } // namespace sherpa_onnx
@@ -51,7 +51,7 @@ void SymbolTable::Init(std::istream &is) { @@ -51,7 +51,7 @@ void SymbolTable::Init(std::istream &is) {
51 if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' && 51 if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' &&
52 sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') { 52 sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') {
53 std::ostringstream os; 53 std::ostringstream os;
54 - os << std::hex << (id - 3); 54 + os << std::hex << std::uppercase << (id - 3);
55 55
56 if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) { 56 if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) {
57 uint8_t i = id - 3; 57 uint8_t i = id - 3;