Fix splitting utf8 string into words (#385)

Fangjun Kuang · GitHub
Commit 6e5efa48c553ab5a10a10d06faf7aba7226b6737 6e5efa48 1 parent 1249710e
CMakeLists.txt
cmake/utfcpp.cmake
sherpa-onnx/csrc/text-utils.cc
--- a/CMakeLists.txt
View file @6e5efa4
+++ b/CMakeLists.txt
View file @6e5efa4
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)
-set(SHERPA_ONNX_VERSION "1.8.4")
+set(SHERPA_ONNX_VERSION "1.8.5")
 # Disable warning about
 #
@@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET)
   include(asio)
 endif()
-include(utfcpp)
-
 add_subdirectory(sherpa-onnx)
 if(SHERPA_ONNX_ENABLE_C_API)
--- a/cmake/utfcpp.cmake deleted 100644 → 0
View file @1249710
+++ b/cmake/utfcpp.cmake deleted 100644 → 0
View file @1249710
-function(download_utfcpp)
-  include(FetchContent)
-
-  set(utfcpp_URL  "https://github.com/nemtrif/utfcpp/archive/refs/tags/v3.2.5.tar.gz")
-  set(utfcpp_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/utfcpp-3.2.5.tar.gz")
-  set(utfcpp_HASH "SHA256=14fd1b3c466814cb4c40771b7f207b61d2c7a0aa6a5e620ca05c00df27f25afd")
-
-  # If you don't have access to the Internet,
-  # please pre-download utfcpp
-  set(possible_file_locations
-    $ENV{HOME}/Downloads/utfcpp-3.2.5.tar.gz
-    ${PROJECT_SOURCE_DIR}/utfcpp-3.2.5.tar.gz
-    ${PROJECT_BINARY_DIR}/utfcpp-3.2.5.tar.gz
-    /tmp/utfcpp-3.2.5.tar.gz
-    /star-fj/fangjun/download/github/utfcpp-3.2.5.tar.gz
-  )
-
-  foreach(f IN LISTS possible_file_locations)
-    if(EXISTS ${f})
-      set(utfcpp_URL  "${f}")
-      file(TO_CMAKE_PATH "${utfcpp_URL}" utfcpp_URL)
-      message(STATUS "Found local downloaded utfcpp: ${utfcpp_URL}")
-      set(utfcpp_URL2)
-      break()
-    endif()
-  endforeach()
-
-  FetchContent_Declare(utfcpp
-    URL
-      ${utfcpp_URL}
-      ${utfcpp_URL2}
-    URL_HASH          ${utfcpp_HASH}
-  )
-
-  FetchContent_GetProperties(utfcpp)
-  if(NOT utfcpp_POPULATED)
-    message(STATUS "Downloading utfcpp from ${utfcpp_URL}")
-    FetchContent_Populate(utfcpp)
-  endif()
-  message(STATUS "utfcpp is downloaded to ${utfcpp_SOURCE_DIR}")
-  # add_subdirectory(${utfcpp_SOURCE_DIR} ${utfcpp_BINARY_DIR} EXCLUDE_FROM_ALL)
-  include_directories(${utfcpp_SOURCE_DIR})
-endfunction()
-
-download_utfcpp()
--- a/sherpa-onnx/csrc/text-utils.cc
View file @6e5efa4
+++ b/sherpa-onnx/csrc/text-utils.cc
View file @6e5efa4
@@ -16,7 +16,7 @@
 #include <utility>
 #include <vector>
-#include "source/utf8.h"
+#include "sherpa-onnx/csrc/macros.h"
 // This file is copied/modified from
 // https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc
@@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
                                   std::vector<double> *out);
 std::vector<std::string> SplitUtf8(const std::string &text) {
-  char *begin = const_cast<char *>(text.c_str());
-  char *end = begin + text.size();
+  const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str());
+  const uint8_t *end = begin + text.size();
   std::vector<std::string> ans;
-  std::string buf;
-  while (begin < end) {
-    uint32_t code = utf8::next(begin, end);
+  auto start = begin;
+  while (start < end) {
+    uint8_t c = *start;
+    uint8_t i = 0x80;
+    int32_t num_bytes = 0;
-    // 1. is punctuation
-    if (std::ispunct(code)) {
-      if (!buf.empty()) {
-        ans.push_back(std::move(buf));
-      }
-
-      char s[5] = {0};
-      utf8::append(code, s);
-      ans.push_back(s);
-      continue;
-    }
-
-    // 2. is space
-    if (std::isspace(code)) {
-      if (!buf.empty()) {
-        ans.push_back(std::move(buf));
-      }
-      continue;
-    }
-
-    // 3. is alpha
-    if (std::isalpha(code)) {
-      buf.push_back(code);
-      continue;
+    // see
+    // https://en.wikipedia.org/wiki/UTF-8
+    for (; c & i; i >>= 1) {
+      ++num_bytes;
     }
-    if (!buf.empty()) {
-      ans.push_back(std::move(buf));
+    if (num_bytes == 0) {
+      // this is an ascii
+      ans.emplace_back(reinterpret_cast<const char *>(start), 1);
+      ++start;
+    } else if (2 <= num_bytes && num_bytes <= 4) {
+      ans.emplace_back(reinterpret_cast<const char *>(start), num_bytes);
+      start += num_bytes;
+    } else {
+      SHERPA_ONNX_LOGE("Invalid byte at position: %d",
+                       static_cast<int32_t>(start - begin));
+      // skip this byte
+      ++start;
     }
-
-    // for others
-
-    char s[5] = {0};
-    utf8::append(code, s);
-    ans.push_back(s);
-  }
-
-  if (!buf.empty()) {
-    ans.push_back(std::move(buf));
   }
   return ans;
 }
+
 }  // namespace sherpa_onnx