Fangjun Kuang
Committed by GitHub

Fix splitting utf8 string into words (#385)

1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.8.4") 4 +set(SHERPA_ONNX_VERSION "1.8.5")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET) @@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET)
175 include(asio) 175 include(asio)
176 endif() 176 endif()
177 177
178 -include(utfcpp)  
179 -  
180 add_subdirectory(sherpa-onnx) 178 add_subdirectory(sherpa-onnx)
181 179
182 if(SHERPA_ONNX_ENABLE_C_API) 180 if(SHERPA_ONNX_ENABLE_C_API)
1 -function(download_utfcpp)  
2 - include(FetchContent)  
3 -  
4 - set(utfcpp_URL "https://github.com/nemtrif/utfcpp/archive/refs/tags/v3.2.5.tar.gz")  
5 - set(utfcpp_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/utfcpp-3.2.5.tar.gz")  
6 - set(utfcpp_HASH "SHA256=14fd1b3c466814cb4c40771b7f207b61d2c7a0aa6a5e620ca05c00df27f25afd")  
7 -  
8 - # If you don't have access to the Internet,  
9 - # please pre-download utfcpp  
10 - set(possible_file_locations  
11 - $ENV{HOME}/Downloads/utfcpp-3.2.5.tar.gz  
12 - ${PROJECT_SOURCE_DIR}/utfcpp-3.2.5.tar.gz  
13 - ${PROJECT_BINARY_DIR}/utfcpp-3.2.5.tar.gz  
14 - /tmp/utfcpp-3.2.5.tar.gz  
15 - /star-fj/fangjun/download/github/utfcpp-3.2.5.tar.gz  
16 - )  
17 -  
18 - foreach(f IN LISTS possible_file_locations)  
19 - if(EXISTS ${f})  
20 - set(utfcpp_URL "${f}")  
21 - file(TO_CMAKE_PATH "${utfcpp_URL}" utfcpp_URL)  
22 - message(STATUS "Found local downloaded utfcpp: ${utfcpp_URL}")  
23 - set(utfcpp_URL2)  
24 - break()  
25 - endif()  
26 - endforeach()  
27 -  
28 - FetchContent_Declare(utfcpp  
29 - URL  
30 - ${utfcpp_URL}  
31 - ${utfcpp_URL2}  
32 - URL_HASH ${utfcpp_HASH}  
33 - )  
34 -  
35 - FetchContent_GetProperties(utfcpp)  
36 - if(NOT utfcpp_POPULATED)  
37 - message(STATUS "Downloading utfcpp from ${utfcpp_URL}")  
38 - FetchContent_Populate(utfcpp)  
39 - endif()  
40 - message(STATUS "utfcpp is downloaded to ${utfcpp_SOURCE_DIR}")  
41 - # add_subdirectory(${utfcpp_SOURCE_DIR} ${utfcpp_BINARY_DIR} EXCLUDE_FROM_ALL)  
42 - include_directories(${utfcpp_SOURCE_DIR})  
43 -endfunction()  
44 -  
45 -download_utfcpp()  
@@ -16,7 +16,7 @@ @@ -16,7 +16,7 @@
16 #include <utility> 16 #include <utility>
17 #include <vector> 17 #include <vector>
18 18
19 -#include "source/utf8.h" 19 +#include "sherpa-onnx/csrc/macros.h"
20 20
21 // This file is copied/modified from 21 // This file is copied/modified from
22 // https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc 22 // https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc
@@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, @@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
163 std::vector<double> *out); 163 std::vector<double> *out);
164 164
165 std::vector<std::string> SplitUtf8(const std::string &text) { 165 std::vector<std::string> SplitUtf8(const std::string &text) {
166 - char *begin = const_cast<char *>(text.c_str());  
167 - char *end = begin + text.size(); 166 + const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str());
  167 + const uint8_t *end = begin + text.size();
168 168
169 std::vector<std::string> ans; 169 std::vector<std::string> ans;
170 - std::string buf;  
171 170
172 - while (begin < end) {  
173 - uint32_t code = utf8::next(begin, end); 171 + auto start = begin;
  172 + while (start < end) {
  173 + uint8_t c = *start;
  174 + uint8_t i = 0x80;
  175 + int32_t num_bytes = 0;
174 176
175 - // 1. is punctuation  
176 - if (std::ispunct(code)) {  
177 - if (!buf.empty()) {  
178 - ans.push_back(std::move(buf));  
179 - }  
180 -  
181 - char s[5] = {0};  
182 - utf8::append(code, s);  
183 - ans.push_back(s);  
184 - continue;  
185 - }  
186 -  
187 - // 2. is space  
188 - if (std::isspace(code)) {  
189 - if (!buf.empty()) {  
190 - ans.push_back(std::move(buf));  
191 - }  
192 - continue;  
193 - }  
194 -  
195 - // 3. is alpha  
196 - if (std::isalpha(code)) {  
197 - buf.push_back(code);  
198 - continue; 177 + // see
  178 + // https://en.wikipedia.org/wiki/UTF-8
  179 + for (; c & i; i >>= 1) {
  180 + ++num_bytes;
199 } 181 }
200 182
201 - if (!buf.empty()) {  
202 - ans.push_back(std::move(buf)); 183 + if (num_bytes == 0) {
  184 + // this is an ascii
  185 + ans.emplace_back(reinterpret_cast<const char *>(start), 1);
  186 + ++start;
  187 + } else if (2 <= num_bytes && num_bytes <= 4) {
  188 + ans.emplace_back(reinterpret_cast<const char *>(start), num_bytes);
  189 + start += num_bytes;
  190 + } else {
  191 + SHERPA_ONNX_LOGE("Invalid byte at position: %d",
  192 + static_cast<int32_t>(start - begin));
  193 + // skip this byte
  194 + ++start;
203 } 195 }
204 -  
205 - // for others  
206 -  
207 - char s[5] = {0};  
208 - utf8::append(code, s);  
209 - ans.push_back(s);  
210 - }  
211 -  
212 - if (!buf.empty()) {  
213 - ans.push_back(std::move(buf));  
214 } 196 }
215 197
216 return ans; 198 return ans;
217 } 199 }
  200 +
218 } // namespace sherpa_onnx 201 } // namespace sherpa_onnx