Committed by
GitHub
Fix tokens processing for byte-level BPE (#333)
正在显示 3 个修改的文件，包含 40 行增加和 15 行删除
| @@ -8,9 +8,9 @@ | @@ -8,9 +8,9 @@ | ||
| 8 | 8 | ||
| 9 | #include <algorithm> | 9 | #include <algorithm> |
| 10 | #include <cmath> | 10 | #include <cmath> |
| 11 | +#include <iomanip> | ||
| 11 | 12 | ||
| 12 | #include "kaldi-native-fbank/csrc/online-feature.h" | 13 | #include "kaldi-native-fbank/csrc/online-feature.h" |
| 13 | -#include "nlohmann/json.hpp" | ||
| 14 | #include "sherpa-onnx/csrc/macros.h" | 14 | #include "sherpa-onnx/csrc/macros.h" |
| 15 | #include "sherpa-onnx/csrc/offline-recognizer.h" | 15 | #include "sherpa-onnx/csrc/offline-recognizer.h" |
| 16 | #include "sherpa-onnx/csrc/resample.h" | 16 | #include "sherpa-onnx/csrc/resample.h" |
| @@ -256,25 +256,50 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const { | @@ -256,25 +256,50 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const { | ||
| 256 | return impl_->GetResult(); | 256 | return impl_->GetResult(); |
| 257 | } | 257 | } |
| 258 | std::string OfflineRecognitionResult::AsJsonString() const { | 258 | std::string OfflineRecognitionResult::AsJsonString() const { |
| 259 | - nlohmann::json j; | ||
| 260 | - j["text"] = text; | ||
| 261 | - j["tokens"] = tokens; | ||
| 262 | -#if 1 | ||
| 263 | - // This branch chooses number of decimal points to keep in | ||
| 264 | - // the return json string | ||
| 265 | std::ostringstream os; | 259 | std::ostringstream os; |
| 266 | - os << "["; | 260 | + os << "{"; |
| 261 | + os << "\"text\"" | ||
| 262 | + << ": "; | ||
| 263 | + os << "\"" << text << "\"" | ||
| 264 | + << ", "; | ||
| 265 | + | ||
| 266 | + os << "\"" | ||
| 267 | + << "timestamps" | ||
| 268 | + << "\"" | ||
| 269 | + << ": "; | ||
| 270 | + os << "\"["; | ||
| 271 | + | ||
| 267 | std::string sep = ""; | 272 | std::string sep = ""; |
| 268 | for (auto t : timestamps) { | 273 | for (auto t : timestamps) { |
| 269 | os << sep << std::fixed << std::setprecision(2) << t; | 274 | os << sep << std::fixed << std::setprecision(2) << t; |
| 270 | sep = ", "; | 275 | sep = ", "; |
| 271 | } | 276 | } |
| 277 | + os << "]\", "; | ||
| 278 | + | ||
| 279 | + os << "\"" | ||
| 280 | + << "tokens" | ||
| 281 | + << "\"" | ||
| 282 | + << ":"; | ||
| 283 | + os << "["; | ||
| 284 | + | ||
| 285 | + sep = ""; | ||
| 286 | + auto oldFlags = os.flags(); | ||
| 287 | + for (const auto &t : tokens) { | ||
| 288 | + if (t.size() == 1 && static_cast<uint8_t>(t[0]) > 0x7f) { | ||
| 289 | + const uint8_t *p = reinterpret_cast<const uint8_t *>(t.c_str()); | ||
| 290 | + os << sep << "\"" | ||
| 291 | + << "<0x" << std::hex << std::uppercase << static_cast<uint32_t>(p[0]) | ||
| 292 | + << ">" | ||
| 293 | + << "\""; | ||
| 294 | + os.flags(oldFlags); | ||
| 295 | + } else { | ||
| 296 | + os << sep << "\"" << t << "\""; | ||
| 297 | + } | ||
| 298 | + sep = ", "; | ||
| 299 | + } | ||
| 272 | os << "]"; | 300 | os << "]"; |
| 273 | - j["timestamps"] = os.str(); | ||
| 274 | -#else | ||
| 275 | - j["timestamps"] = timestamps; | ||
| 276 | -#endif | 301 | + os << "}"; |
| 277 | 302 | ||
| 278 | - return j.dump(); | 303 | + return os.str(); |
| 279 | } | 304 | } |
| 280 | } // namespace sherpa_onnx | 305 | } // namespace sherpa_onnx |
| @@ -51,7 +51,7 @@ void SymbolTable::Init(std::istream &is) { | @@ -51,7 +51,7 @@ void SymbolTable::Init(std::istream &is) { | ||
| 51 | if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' && | 51 | if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' && |
| 52 | sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') { | 52 | sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') { |
| 53 | std::ostringstream os; | 53 | std::ostringstream os; |
| 54 | - os << std::hex << (id - 3); | 54 | + os << std::hex << std::uppercase << (id - 3); |
| 55 | 55 | ||
| 56 | if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) { | 56 | if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) { |
| 57 | uint8_t i = id - 3; | 57 | uint8_t i = id - 3; |
-
请 注册 或 登录 后发表评论