Committed by GitHub
Fix splitting utf8 string into words (#385)
正在显示 3 个修改的文件，包含 26 行增加和 90 行删除
| 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) | 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) |
| 2 | project(sherpa-onnx) | 2 | project(sherpa-onnx) |
| 3 | 3 | ||
| 4 | -set(SHERPA_ONNX_VERSION "1.8.4") | 4 | +set(SHERPA_ONNX_VERSION "1.8.5") |
| 5 | 5 | ||
| 6 | # Disable warning about | 6 | # Disable warning about |
| 7 | # | 7 | # |
| @@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET) | @@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET) | ||
| 175 | include(asio) | 175 | include(asio) |
| 176 | endif() | 176 | endif() |
| 177 | 177 | ||
| 178 | -include(utfcpp) | ||
| 179 | - | ||
| 180 | add_subdirectory(sherpa-onnx) | 178 | add_subdirectory(sherpa-onnx) |
| 181 | 179 | ||
| 182 | if(SHERPA_ONNX_ENABLE_C_API) | 180 | if(SHERPA_ONNX_ENABLE_C_API) |
cmake/utfcpp.cmake
已删除
100644 → 0
| 1 | -function(download_utfcpp) | ||
| 2 | - include(FetchContent) | ||
| 3 | - | ||
| 4 | - set(utfcpp_URL "https://github.com/nemtrif/utfcpp/archive/refs/tags/v3.2.5.tar.gz") | ||
| 5 | - set(utfcpp_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/utfcpp-3.2.5.tar.gz") | ||
| 6 | - set(utfcpp_HASH "SHA256=14fd1b3c466814cb4c40771b7f207b61d2c7a0aa6a5e620ca05c00df27f25afd") | ||
| 7 | - | ||
| 8 | - # If you don't have access to the Internet, | ||
| 9 | - # please pre-download utfcpp | ||
| 10 | - set(possible_file_locations | ||
| 11 | - $ENV{HOME}/Downloads/utfcpp-3.2.5.tar.gz | ||
| 12 | - ${PROJECT_SOURCE_DIR}/utfcpp-3.2.5.tar.gz | ||
| 13 | - ${PROJECT_BINARY_DIR}/utfcpp-3.2.5.tar.gz | ||
| 14 | - /tmp/utfcpp-3.2.5.tar.gz | ||
| 15 | - /star-fj/fangjun/download/github/utfcpp-3.2.5.tar.gz | ||
| 16 | - ) | ||
| 17 | - | ||
| 18 | - foreach(f IN LISTS possible_file_locations) | ||
| 19 | - if(EXISTS ${f}) | ||
| 20 | - set(utfcpp_URL "${f}") | ||
| 21 | - file(TO_CMAKE_PATH "${utfcpp_URL}" utfcpp_URL) | ||
| 22 | - message(STATUS "Found local downloaded utfcpp: ${utfcpp_URL}") | ||
| 23 | - set(utfcpp_URL2) | ||
| 24 | - break() | ||
| 25 | - endif() | ||
| 26 | - endforeach() | ||
| 27 | - | ||
| 28 | - FetchContent_Declare(utfcpp | ||
| 29 | - URL | ||
| 30 | - ${utfcpp_URL} | ||
| 31 | - ${utfcpp_URL2} | ||
| 32 | - URL_HASH ${utfcpp_HASH} | ||
| 33 | - ) | ||
| 34 | - | ||
| 35 | - FetchContent_GetProperties(utfcpp) | ||
| 36 | - if(NOT utfcpp_POPULATED) | ||
| 37 | - message(STATUS "Downloading utfcpp from ${utfcpp_URL}") | ||
| 38 | - FetchContent_Populate(utfcpp) | ||
| 39 | - endif() | ||
| 40 | - message(STATUS "utfcpp is downloaded to ${utfcpp_SOURCE_DIR}") | ||
| 41 | - # add_subdirectory(${utfcpp_SOURCE_DIR} ${utfcpp_BINARY_DIR} EXCLUDE_FROM_ALL) | ||
| 42 | - include_directories(${utfcpp_SOURCE_DIR}) | ||
| 43 | -endfunction() | ||
| 44 | - | ||
| 45 | -download_utfcpp() |
| @@ -16,7 +16,7 @@ | @@ -16,7 +16,7 @@ | ||
| 16 | #include <utility> | 16 | #include <utility> |
| 17 | #include <vector> | 17 | #include <vector> |
| 18 | 18 | ||
| 19 | -#include "source/utf8.h" | 19 | +#include "sherpa-onnx/csrc/macros.h" |
| 20 | 20 | ||
| 21 | // This file is copied/modified from | 21 | // This file is copied/modified from |
| 22 | // https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc | 22 | // https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc |
| @@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | @@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, | ||
| 163 | std::vector<double> *out); | 163 | std::vector<double> *out); |
| 164 | 164 | ||
| 165 | std::vector<std::string> SplitUtf8(const std::string &text) { | 165 | std::vector<std::string> SplitUtf8(const std::string &text) { |
| 166 | - char *begin = const_cast<char *>(text.c_str()); | ||
| 167 | - char *end = begin + text.size(); | 166 | + const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str()); |
| 167 | + const uint8_t *end = begin + text.size(); | ||
| 168 | 168 | ||
| 169 | std::vector<std::string> ans; | 169 | std::vector<std::string> ans; |
| 170 | - std::string buf; | ||
| 171 | 170 | ||
| 172 | - while (begin < end) { | ||
| 173 | - uint32_t code = utf8::next(begin, end); | 171 | + auto start = begin; |
| 172 | + while (start < end) { | ||
| 173 | + uint8_t c = *start; | ||
| 174 | + uint8_t i = 0x80; | ||
| 175 | + int32_t num_bytes = 0; | ||
| 174 | 176 | ||
| 175 | - // 1. is punctuation | ||
| 176 | - if (std::ispunct(code)) { | ||
| 177 | - if (!buf.empty()) { | ||
| 178 | - ans.push_back(std::move(buf)); | ||
| 179 | - } | ||
| 180 | - | ||
| 181 | - char s[5] = {0}; | ||
| 182 | - utf8::append(code, s); | ||
| 183 | - ans.push_back(s); | ||
| 184 | - continue; | ||
| 185 | - } | ||
| 186 | - | ||
| 187 | - // 2. is space | ||
| 188 | - if (std::isspace(code)) { | ||
| 189 | - if (!buf.empty()) { | ||
| 190 | - ans.push_back(std::move(buf)); | ||
| 191 | - } | ||
| 192 | - continue; | ||
| 193 | - } | ||
| 194 | - | ||
| 195 | - // 3. is alpha | ||
| 196 | - if (std::isalpha(code)) { | ||
| 197 | - buf.push_back(code); | ||
| 198 | - continue; | 177 | + // see |
| 178 | + // https://en.wikipedia.org/wiki/UTF-8 | ||
| 179 | + for (; c & i; i >>= 1) { | ||
| 180 | + ++num_bytes; | ||
| 199 | } | 181 | } |
| 200 | 182 | ||
| 201 | - if (!buf.empty()) { | ||
| 202 | - ans.push_back(std::move(buf)); | 183 | + if (num_bytes == 0) { |
| 184 | + // this is an ascii | ||
| 185 | + ans.emplace_back(reinterpret_cast<const char *>(start), 1); | ||
| 186 | + ++start; | ||
| 187 | + } else if (2 <= num_bytes && num_bytes <= 4) { | ||
| 188 | + ans.emplace_back(reinterpret_cast<const char *>(start), num_bytes); | ||
| 189 | + start += num_bytes; | ||
| 190 | + } else { | ||
| 191 | + SHERPA_ONNX_LOGE("Invalid byte at position: %d", | ||
| 192 | + static_cast<int32_t>(start - begin)); | ||
| 193 | + // skip this byte | ||
| 194 | + ++start; | ||
| 203 | } | 195 | } |
| 204 | - | ||
| 205 | - // for others | ||
| 206 | - | ||
| 207 | - char s[5] = {0}; | ||
| 208 | - utf8::append(code, s); | ||
| 209 | - ans.push_back(s); | ||
| 210 | - } | ||
| 211 | - | ||
| 212 | - if (!buf.empty()) { | ||
| 213 | - ans.push_back(std::move(buf)); | ||
| 214 | } | 196 | } |
| 215 | 197 | ||
| 216 | return ans; | 198 | return ans; |
| 217 | } | 199 | } |
| 200 | + | ||
| 218 | } // namespace sherpa_onnx | 201 | } // namespace sherpa_onnx |
-
请 注册 或 登录 后发表评论