Fangjun Kuang
Committed by GitHub

Fix Byte BPE string results for Python. (#512)

Invalid UTF-8 byte sequences in the result text are now ignored when it is converted to a Python string.
@@ -5,7 +5,9 @@ @@ -5,7 +5,9 @@
5 #ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_ 5 #ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
6 #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_ 6 #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
7 7
  8 +#include <ios>
8 #include <memory> 9 #include <memory>
  10 +#include <sstream>
9 #include <string> 11 #include <string>
10 #include <utility> 12 #include <utility>
11 #include <vector> 13 #include <vector>
@@ -42,6 +44,15 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, @@ -42,6 +44,15 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
42 } 44 }
43 auto sym = sym_table[src.tokens[i]]; 45 auto sym = sym_table[src.tokens[i]];
44 text.append(sym); 46 text.append(sym);
  47 +
  48 + if (sym.size() == 1 && sym[0] != ' ') {
  49 + // for byte bpe models
  50 + std::ostringstream os;
  51 + os << "<0x" << std::hex << std::uppercase
  52 + << (static_cast<int32_t>(sym[0]) & 0xff) << ">";
  53 + sym = os.str();
  54 + }
  55 +
45 r.tokens.push_back(std::move(sym)); 56 r.tokens.push_back(std::move(sym));
46 } 57 }
47 r.text = std::move(text); 58 r.text = std::move(text);
@@ -6,8 +6,10 @@ @@ -6,8 +6,10 @@
6 #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ 6 #define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
7 7
8 #include <fstream> 8 #include <fstream>
  9 +#include <ios>
9 #include <memory> 10 #include <memory>
10 #include <regex> // NOLINT 11 #include <regex> // NOLINT
  12 +#include <sstream>
11 #include <string> 13 #include <string>
12 #include <utility> 14 #include <utility>
13 #include <vector> 15 #include <vector>
@@ -44,6 +46,14 @@ static OfflineRecognitionResult Convert( @@ -44,6 +46,14 @@ static OfflineRecognitionResult Convert(
44 auto sym = sym_table[i]; 46 auto sym = sym_table[i];
45 text.append(sym); 47 text.append(sym);
46 48
  49 + if (sym.size() == 1 && sym[0] != ' ') {
  50 + // for byte bpe models
  51 + std::ostringstream os;
  52 + os << "<0x" << std::hex << std::uppercase
  53 + << (static_cast<int32_t>(sym[0]) & 0xff) << ">";
  54 + sym = os.str();
  55 + }
  56 +
47 r.tokens.push_back(std::move(sym)); 57 r.tokens.push_back(std::move(sym));
48 } 58 }
49 r.text = std::move(text); 59 r.text = std::move(text);
@@ -6,7 +6,9 @@ @@ -6,7 +6,9 @@
6 #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_ 6 #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_
7 7
8 #include <algorithm> 8 #include <algorithm>
  9 +#include <ios>
9 #include <memory> 10 #include <memory>
  11 +#include <sstream>
10 #include <string> 12 #include <string>
11 #include <utility> 13 #include <utility>
12 #include <vector> 14 #include <vector>
@@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, @@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
35 auto sym = sym_table[i]; 37 auto sym = sym_table[i];
36 38
37 r.text.append(sym); 39 r.text.append(sym);
  40 +
  41 + if (sym.size() == 1 && sym[0] != ' ') {
  42 + // for byte bpe models
  43 + std::ostringstream os;
  44 + os << "<0x" << std::hex << std::uppercase
  45 + << (static_cast<int32_t>(sym[0]) & 0xff) << ">";
  46 + sym = os.str();
  47 + }
  48 +
38 r.tokens.push_back(std::move(sym)); 49 r.tokens.push_back(std::move(sym));
39 } 50 }
40 51
@@ -6,8 +6,10 @@ @@ -6,8 +6,10 @@
6 #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_ 6 #define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
7 7
8 #include <algorithm> 8 #include <algorithm>
  9 +#include <ios>
9 #include <memory> 10 #include <memory>
10 #include <regex> // NOLINT 11 #include <regex> // NOLINT
  12 +#include <sstream>
11 #include <string> 13 #include <string>
12 #include <utility> 14 #include <utility>
13 #include <vector> 15 #include <vector>
@@ -47,6 +49,15 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, @@ -47,6 +49,15 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
47 auto sym = sym_table[i]; 49 auto sym = sym_table[i];
48 50
49 r.text.append(sym); 51 r.text.append(sym);
  52 +
  53 + if (sym.size() == 1 && sym[0] != ' ') {
  54 + // for byte bpe models
  55 + std::ostringstream os;
  56 + os << "<0x" << std::hex << std::uppercase
  57 + << (static_cast<int32_t>(sym[0]) & 0xff) << ">";
  58 + sym = os.str();
  59 + }
  60 +
50 r.tokens.push_back(std::move(sym)); 61 r.tokens.push_back(std::move(sym));
51 } 62 }
52 63
@@ -23,8 +23,12 @@ Args: @@ -23,8 +23,12 @@ Args:
23 static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT 23 static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
24 using PyClass = OfflineRecognitionResult; 24 using PyClass = OfflineRecognitionResult;
25 py::class_<PyClass>(*m, "OfflineRecognitionResult") 25 py::class_<PyClass>(*m, "OfflineRecognitionResult")
26 - .def_property_readonly("text",  
27 - [](const PyClass &self) { return self.text; }) 26 + .def_property_readonly(
  27 + "text",
  28 + [](const PyClass &self) -> py::str {
  29 + return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
  30 + self.text.size(), "ignore"));
  31 + })
28 .def_property_readonly("tokens", 32 .def_property_readonly("tokens",
29 [](const PyClass &self) { return self.tokens; }) 33 [](const PyClass &self) { return self.tokens; })
30 .def_property_readonly( 34 .def_property_readonly(
@@ -15,7 +15,11 @@ static void PybindOnlineRecognizerResult(py::module *m) { @@ -15,7 +15,11 @@ static void PybindOnlineRecognizerResult(py::module *m) {
15 using PyClass = OnlineRecognizerResult; 15 using PyClass = OnlineRecognizerResult;
16 py::class_<PyClass>(*m, "OnlineRecognizerResult") 16 py::class_<PyClass>(*m, "OnlineRecognizerResult")
17 .def_property_readonly( 17 .def_property_readonly(
18 - "text", [](PyClass &self) -> std::string { return self.text; }) 18 + "text",
  19 + [](PyClass &self) -> py::str {
  20 + return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
  21 + self.text.size(), "ignore"));
  22 + })
19 .def_property_readonly( 23 .def_property_readonly(
20 "tokens", 24 "tokens",
21 [](PyClass &self) -> std::vector<std::string> { return self.tokens; }) 25 [](PyClass &self) -> std::vector<std::string> { return self.tokens; })