Fangjun Kuang
Committed by GitHub

Fix passing gb2312 encoded strings to tts on Windows (#1819)

@@ -96,7 +96,27 @@ OfflineTts::~OfflineTts() = default; @@ -96,7 +96,27 @@ OfflineTts::~OfflineTts() = default;
96 GeneratedAudio OfflineTts::Generate( 96 GeneratedAudio OfflineTts::Generate(
97 const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/, 97 const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/,
98 GeneratedAudioCallback callback /*= nullptr*/) const { 98 GeneratedAudioCallback callback /*= nullptr*/) const {
  99 +#if !defined(_WIN32)
99 return impl_->Generate(text, sid, speed, std::move(callback)); 100 return impl_->Generate(text, sid, speed, std::move(callback));
  101 +#else
  102 + if (IsUtf8(text)) {
  103 + return impl_->Generate(text, sid, speed, std::move(callback));
  104 + } else if (IsGB2312(text)) {
  105 + auto utf8_text = Gb2312ToUtf8(text);
  106 + static bool printed = false;
  107 + if (!printed) {
  108 + SHERPA_ONNX_LOGE(
  109 + "Detected GB2312 encoded string! Converting it to UTF8.");
  110 + printed = true;
  111 + }
  112 + return impl_->Generate(utf8_text, sid, speed, std::move(callback));
  113 + } else {
  114 + SHERPA_ONNX_LOGE(
  115 + "Non UTF8 encoded string is received. You would not get expected "
  116 + "results!");
  117 + return impl_->Generate(text, sid, speed, std::move(callback));
  118 + }
  119 +#endif
100 } 120 }
101 121
102 int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); } 122 int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); }
@@ -16,6 +16,10 @@ @@ -16,6 +16,10 @@
16 #include <utility> 16 #include <utility>
17 #include <vector> 17 #include <vector>
18 18
  19 +#if defined(_WIN32)
  20 +#include <Windows.h>
  21 +#endif
  22 +
19 #include "sherpa-onnx/csrc/macros.h" 23 #include "sherpa-onnx/csrc/macros.h"
20 24
21 // This file is copied/modified from 25 // This file is copied/modified from
@@ -502,4 +506,123 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text, @@ -502,4 +506,123 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text,
502 return ans; 506 return ans;
503 } 507 }
504 508
  509 +bool IsUtf8(const std::string &text) {
  510 + int32_t n = static_cast<int32_t>(text.size());
  511 + int32_t i = 0;
  512 + const uint8_t *p = reinterpret_cast<const uint8_t *>(text.data());
  513 + while (i < n) {
  514 + if (p[i] <= 0x7f) {
  515 + i += 1;
  516 + continue;
  517 + }
  518 +
  519 + if (InRange(p[i], 0xc2, 0xdf) && i + 1 < n &&
  520 + InRange(p[i + 1], 0x80, 0xbf)) {
  521 + i += 2;
  522 + continue;
  523 + }
  524 +
  525 + if (p[i] == 0xe0 && i + 2 < n && InRange(p[i + 1], 0xa0, 0xbf) &&
  526 + InRange(p[i + 2], 0x80, 0xbf)) {
  527 + i += 3;
  528 + continue;
  529 + }
  530 +
  531 + if (InRange(p[i], 0xe1, 0xec) && i + 2 < n &&
  532 + InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf)) {
  533 + i += 3;
  534 + continue;
  535 + }
  536 +
  537 + if (p[i] == 0xed && i + 2 < n && InRange(p[i + 1], 0x80, 0x9f) &&
  538 + InRange(p[i + 2], 0x80, 0xbf)) {
  539 + i += 3;
  540 + continue;
  541 + }
  542 +
  543 + if (InRange(p[i], 0xee, 0xef) && i + 2 < n &&
  544 + InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf)) {
  545 + i += 3;
  546 + continue;
  547 + }
  548 +
  549 + if (p[i] == 0xf0 && i + 3 < n && InRange(p[i + 1], 0x90, 0xbf) &&
  550 + InRange(p[i + 2], 0x80, 0xbf) && InRange(p[i + 3], 0x80, 0xbf)) {
  551 + i += 4;
  552 + continue;
  553 + }
  554 +
  555 + if (InRange(p[i], 0xf1, 0xf3) && i + 3 < n &&
  556 + InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf) &&
  557 + InRange(p[i + 3], 0x80, 0xbf)) {
  558 + i += 4;
  559 + continue;
  560 + }
  561 +
  562 + if (p[i] == 0xf4 && i + 3 < n && InRange(p[i + 1], 0x80, 0x8f) &&
  563 + InRange(p[i + 2], 0x80, 0xbf) && InRange(p[i + 3], 0x80, 0xbf)) {
  564 + i += 4;
  565 + continue;
  566 + }
  567 +
  568 + return false;
  569 + }
  570 +
  571 + return true;
  572 +}
  573 +
  574 +bool IsGB2312(const std::string &text) {
  575 + int32_t n = static_cast<int32_t>(text.size());
  576 + int32_t i = 0;
  577 + const uint8_t *p = reinterpret_cast<const uint8_t *>(text.data());
  578 + while (i < n) {
  579 + if (p[i] <= 0x7f) {
  580 + i += 1;
  581 + continue;
  582 + }
  583 +
  584 + if (InRange(p[i], 0xa1, 0xf7) && i + 1 < n &&
  585 + InRange(p[i + 1], 0xa1, 0xfe)) {
  586 + i += 2;
  587 + continue;
  588 + }
  589 +
  590 + return false;
  591 + }
  592 +
  593 + return true;
  594 +}
  595 +
  596 +#if defined(_WIN32)
  597 +std::string Gb2312ToUtf8(const std::string &text) {
  598 + // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
  599 + // 936 is from
  600 + // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
  601 + // GB2312 -> 936
  602 + int32_t num_wchars =
  603 + MultiByteToWideChar(936, 0, text.c_str(), text.size(), nullptr, 0);
  604 + SHERPA_ONNX_LOGE("num of wchars: %d", num_wchars);
  605 + if (num_wchars == 0) {
  606 + return {};
  607 + }
  608 +
  609 + std::wstring wstr;
  610 + wstr.resize(num_wchars);
  611 + MultiByteToWideChar(936, 0, text.c_str(), text.size(), wstr.data(),
  612 + num_wchars);
  613 + // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
  614 + int32_t num_chars = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, nullptr,
  615 + 0, nullptr, nullptr);
  616 + if (num_chars == 0) {
  617 + return {};
  618 + }
  619 +
  620 + std::string ans(num_chars, 0);
  621 + WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, ans.data(), num_chars,
  622 + nullptr, nullptr);
  623 +
  624 + return ans;
  625 +}
  626 +#endif
  627 +
505 } // namespace sherpa_onnx 628 } // namespace sherpa_onnx
@@ -127,6 +127,18 @@ void ToLowerCase(std::string *in_out); @@ -127,6 +127,18 @@ void ToLowerCase(std::string *in_out);
127 std::string RemoveInvalidUtf8Sequences(const std::string &text, 127 std::string RemoveInvalidUtf8Sequences(const std::string &text,
128 bool show_debug_msg = false); 128 bool show_debug_msg = false);
129 129
  130 +// Return true if the whole of text is a well-formed UTF-8 byte sequence.
  131 +// Return false otherwise.
  132 +bool IsUtf8(const std::string &text);
  133 +
  134 +// Return true if every byte of text is ASCII or part of a valid two-byte GB2312 sequence.
  135 +// Return false otherwise.
  136 +bool IsGB2312(const std::string &text);
  137 +
  138 +#if defined(_WIN32)
  139 +std::string Gb2312ToUtf8(const std::string &text);
  140 +#endif
  141 +
130 } // namespace sherpa_onnx 142 } // namespace sherpa_onnx
131 143
132 #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ 144 #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_