Committed by
GitHub
Fix Byte BPE string results for Python. (#512)
Invalid UTF-8 byte sequences in the result text are ignored when it is converted to a Python string.
正在显示
6 个修改的文件
包含
54 行增加
和
3 行删除
| @@ -5,7 +5,9 @@ | @@ -5,7 +5,9 @@ | ||
| 5 | #ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_ | 5 | #ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_ |
| 6 | #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_ | 6 | #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_ |
| 7 | 7 | ||
| 8 | +#include <ios> | ||
| 8 | #include <memory> | 9 | #include <memory> |
| 10 | +#include <sstream> | ||
| 9 | #include <string> | 11 | #include <string> |
| 10 | #include <utility> | 12 | #include <utility> |
| 11 | #include <vector> | 13 | #include <vector> |
| @@ -42,6 +44,15 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, | @@ -42,6 +44,15 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, | ||
| 42 | } | 44 | } |
| 43 | auto sym = sym_table[src.tokens[i]]; | 45 | auto sym = sym_table[src.tokens[i]]; |
| 44 | text.append(sym); | 46 | text.append(sym); |
| 47 | + | ||
| 48 | + if (sym.size() == 1 && sym[0] != ' ') { | ||
| 49 | + // for byte bpe models | ||
| 50 | + std::ostringstream os; | ||
| 51 | + os << "<0x" << std::hex << std::uppercase | ||
| 52 | + << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | ||
| 53 | + sym = os.str(); | ||
| 54 | + } | ||
| 55 | + | ||
| 45 | r.tokens.push_back(std::move(sym)); | 56 | r.tokens.push_back(std::move(sym)); |
| 46 | } | 57 | } |
| 47 | r.text = std::move(text); | 58 | r.text = std::move(text); |
| @@ -6,8 +6,10 @@ | @@ -6,8 +6,10 @@ | ||
| 6 | #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ | 6 | #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ |
| 7 | 7 | ||
| 8 | #include <fstream> | 8 | #include <fstream> |
| 9 | +#include <ios> | ||
| 9 | #include <memory> | 10 | #include <memory> |
| 10 | #include <regex> // NOLINT | 11 | #include <regex> // NOLINT |
| 12 | +#include <sstream> | ||
| 11 | #include <string> | 13 | #include <string> |
| 12 | #include <utility> | 14 | #include <utility> |
| 13 | #include <vector> | 15 | #include <vector> |
| @@ -44,6 +46,14 @@ static OfflineRecognitionResult Convert( | @@ -44,6 +46,14 @@ static OfflineRecognitionResult Convert( | ||
| 44 | auto sym = sym_table[i]; | 46 | auto sym = sym_table[i]; |
| 45 | text.append(sym); | 47 | text.append(sym); |
| 46 | 48 | ||
| 49 | + if (sym.size() == 1 && sym[0] != ' ') { | ||
| 50 | + // for byte bpe models | ||
| 51 | + std::ostringstream os; | ||
| 52 | + os << "<0x" << std::hex << std::uppercase | ||
| 53 | + << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | ||
| 54 | + sym = os.str(); | ||
| 55 | + } | ||
| 56 | + | ||
| 47 | r.tokens.push_back(std::move(sym)); | 57 | r.tokens.push_back(std::move(sym)); |
| 48 | } | 58 | } |
| 49 | r.text = std::move(text); | 59 | r.text = std::move(text); |
| @@ -6,7 +6,9 @@ | @@ -6,7 +6,9 @@ | ||
| 6 | #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_ | 6 | #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_ |
| 7 | 7 | ||
| 8 | #include <algorithm> | 8 | #include <algorithm> |
| 9 | +#include <ios> | ||
| 9 | #include <memory> | 10 | #include <memory> |
| 11 | +#include <sstream> | ||
| 10 | #include <string> | 12 | #include <string> |
| 11 | #include <utility> | 13 | #include <utility> |
| 12 | #include <vector> | 14 | #include <vector> |
| @@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, | @@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, | ||
| 35 | auto sym = sym_table[i]; | 37 | auto sym = sym_table[i]; |
| 36 | 38 | ||
| 37 | r.text.append(sym); | 39 | r.text.append(sym); |
| 40 | + | ||
| 41 | + if (sym.size() == 1 && sym[0] != ' ') { | ||
| 42 | + // for byte bpe models | ||
| 43 | + std::ostringstream os; | ||
| 44 | + os << "<0x" << std::hex << std::uppercase | ||
| 45 | + << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | ||
| 46 | + sym = os.str(); | ||
| 47 | + } | ||
| 48 | + | ||
| 38 | r.tokens.push_back(std::move(sym)); | 49 | r.tokens.push_back(std::move(sym)); |
| 39 | } | 50 | } |
| 40 | 51 |
| @@ -6,8 +6,10 @@ | @@ -6,8 +6,10 @@ | ||
| 6 | #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ | 6 | #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ |
| 7 | 7 | ||
| 8 | #include <algorithm> | 8 | #include <algorithm> |
| 9 | +#include <ios> | ||
| 9 | #include <memory> | 10 | #include <memory> |
| 10 | #include <regex> // NOLINT | 11 | #include <regex> // NOLINT |
| 12 | +#include <sstream> | ||
| 11 | #include <string> | 13 | #include <string> |
| 12 | #include <utility> | 14 | #include <utility> |
| 13 | #include <vector> | 15 | #include <vector> |
| @@ -47,6 +49,15 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, | @@ -47,6 +49,15 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, | ||
| 47 | auto sym = sym_table[i]; | 49 | auto sym = sym_table[i]; |
| 48 | 50 | ||
| 49 | r.text.append(sym); | 51 | r.text.append(sym); |
| 52 | + | ||
| 53 | + if (sym.size() == 1 && sym[0] != ' ') { | ||
| 54 | + // for byte bpe models | ||
| 55 | + std::ostringstream os; | ||
| 56 | + os << "<0x" << std::hex << std::uppercase | ||
| 57 | + << (static_cast<int32_t>(sym[0]) & 0xff) << ">"; | ||
| 58 | + sym = os.str(); | ||
| 59 | + } | ||
| 60 | + | ||
| 50 | r.tokens.push_back(std::move(sym)); | 61 | r.tokens.push_back(std::move(sym)); |
| 51 | } | 62 | } |
| 52 | 63 |
| @@ -23,8 +23,12 @@ Args: | @@ -23,8 +23,12 @@ Args: | ||
| 23 | static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT | 23 | static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT |
| 24 | using PyClass = OfflineRecognitionResult; | 24 | using PyClass = OfflineRecognitionResult; |
| 25 | py::class_<PyClass>(*m, "OfflineRecognitionResult") | 25 | py::class_<PyClass>(*m, "OfflineRecognitionResult") |
| 26 | - .def_property_readonly("text", | ||
| 27 | - [](const PyClass &self) { return self.text; }) | 26 | + .def_property_readonly( |
| 27 | + "text", | ||
| 28 | + [](const PyClass &self) -> py::str { | ||
| 29 | + return py::str(PyUnicode_DecodeUTF8(self.text.c_str(), | ||
| 30 | + self.text.size(), "ignore")); | ||
| 31 | + }) | ||
| 28 | .def_property_readonly("tokens", | 32 | .def_property_readonly("tokens", |
| 29 | [](const PyClass &self) { return self.tokens; }) | 33 | [](const PyClass &self) { return self.tokens; }) |
| 30 | .def_property_readonly( | 34 | .def_property_readonly( |
| @@ -15,7 +15,11 @@ static void PybindOnlineRecognizerResult(py::module *m) { | @@ -15,7 +15,11 @@ static void PybindOnlineRecognizerResult(py::module *m) { | ||
| 15 | using PyClass = OnlineRecognizerResult; | 15 | using PyClass = OnlineRecognizerResult; |
| 16 | py::class_<PyClass>(*m, "OnlineRecognizerResult") | 16 | py::class_<PyClass>(*m, "OnlineRecognizerResult") |
| 17 | .def_property_readonly( | 17 | .def_property_readonly( |
| 18 | - "text", [](PyClass &self) -> std::string { return self.text; }) | 18 | + "text", |
| 19 | + [](PyClass &self) -> py::str { | ||
| 20 | + return py::str(PyUnicode_DecodeUTF8(self.text.c_str(), | ||
| 21 | + self.text.size(), "ignore")); | ||
| 22 | + }) | ||
| 19 | .def_property_readonly( | 23 | .def_property_readonly( |
| 20 | "tokens", | 24 | "tokens", |
| 21 | [](PyClass &self) -> std::vector<std::string> { return self.tokens; }) | 25 | [](PyClass &self) -> std::vector<std::string> { return self.tokens; }) |
-
请 注册 或 登录 后发表评论