Fangjun Kuang
Committed by GitHub

Fix passing gb2312 encoded strings to tts on Windows (#1819)

... ... @@ -96,7 +96,27 @@ OfflineTts::~OfflineTts() = default;
// Synthesize audio for `text` by delegating to the implementation object.
//
// On non-Windows builds `text` is forwarded unchanged.
//
// On Windows the encoding of `text` is inspected first:
//  - valid UTF-8: forwarded unchanged;
//  - otherwise, if it passes IsGB2312(): converted with Gb2312ToUtf8() and
//    the converted string is forwarded (a notice is logged the first time
//    this happens);
//  - otherwise: a warning is logged on every call and `text` is forwarded
//    unchanged.
//
// `sid`, `speed` and `callback` are passed through to the implementation
// as-is.
GeneratedAudio OfflineTts::Generate(
    const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/,
    GeneratedAudioCallback callback /*= nullptr*/) const {
#if defined(_WIN32)
  if (!IsUtf8(text)) {
    if (!IsGB2312(text)) {
      // Neither encoding matched; pass the bytes through and let the caller
      // know the output is unlikely to be what they want.
      SHERPA_ONNX_LOGE(
          "Non UTF8 encoded string is received. You would not get expected "
          "results!");
      return impl_->Generate(text, sid, speed, std::move(callback));
    }

    auto converted = Gb2312ToUtf8(text);

    // Only report the conversion once to avoid flooding the log.
    static bool already_reported = false;
    if (!already_reported) {
      SHERPA_ONNX_LOGE(
          "Detected GB2312 encoded string! Converting it to UTF8.");
      already_reported = true;
    }

    return impl_->Generate(converted, sid, speed, std::move(callback));
  }
#endif

  // Non-Windows builds, and UTF-8 input on Windows, end up here.
  return impl_->Generate(text, sid, speed, std::move(callback));
}
// Return the sample rate reported by the implementation object.
int32_t OfflineTts::SampleRate() const {
  return impl_->SampleRate();
}
... ...
... ... @@ -16,6 +16,10 @@
#include <utility>
#include <vector>
#if defined(_WIN32)
#include <Windows.h>
#endif
#include "sherpa-onnx/csrc/macros.h"
// This file is copied/modified from
... ... @@ -502,4 +506,123 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text,
return ans;
}
// Return true if `text` consists solely of well-formed UTF-8 sequences
// (an empty string qualifies), false otherwise.
//
// Each lead byte fixes the number of continuation bytes and the range
// allowed for the first of them; this is what rejects overlong encodings
// (lead 0xe0 / 0xf0), the surrogate range (lead 0xed) and code points above
// U+10FFFF (lead 0xf4). All remaining continuation bytes must lie in
// [0x80, 0xbf].
bool IsUtf8(const std::string &text) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(text.data());
  const int32_t len = static_cast<int32_t>(text.size());

  int32_t pos = 0;
  while (pos < len) {
    const uint8_t lead = bytes[pos];

    // Single-byte (ASCII) code point.
    if (lead <= 0x7f) {
      ++pos;
      continue;
    }

    // Number of continuation bytes that must follow `lead`, and the
    // permitted range for the first one.
    int32_t num_trailing = 0;
    uint8_t first_low = 0x80;
    uint8_t first_high = 0xbf;

    if (InRange(lead, 0xc2, 0xdf)) {
      num_trailing = 1;
    } else if (lead == 0xe0) {
      num_trailing = 2;
      first_low = 0xa0;
    } else if (InRange(lead, 0xe1, 0xec) || InRange(lead, 0xee, 0xef)) {
      num_trailing = 2;
    } else if (lead == 0xed) {
      num_trailing = 2;
      first_high = 0x9f;
    } else if (lead == 0xf0) {
      num_trailing = 3;
      first_low = 0x90;
    } else if (InRange(lead, 0xf1, 0xf3)) {
      num_trailing = 3;
    } else if (lead == 0xf4) {
      num_trailing = 3;
      first_high = 0x8f;
    } else {
      // 0x80..0xc1 and 0xf5..0xff can never start a sequence.
      return false;
    }

    // The whole sequence must fit inside the string.
    if (pos + num_trailing >= len) {
      return false;
    }

    if (!InRange(bytes[pos + 1], first_low, first_high)) {
      return false;
    }

    for (int32_t k = 2; k <= num_trailing; ++k) {
      if (!InRange(bytes[pos + k], 0x80, 0xbf)) {
        return false;
      }
    }

    pos += num_trailing + 1;
  }

  return true;
}
// Return true if every byte of `text` is either an ASCII byte (<= 0x7f) or
// part of a two-byte pair whose first byte is in [0xa1, 0xf7] and whose
// second byte is in [0xa1, 0xfe]; return false otherwise.
//
// An empty string and a pure-ASCII string both return true.
bool IsGB2312(const std::string &text) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(text.data());
  const int32_t len = static_cast<int32_t>(text.size());

  for (int32_t pos = 0; pos < len;) {
    if (bytes[pos] <= 0x7f) {
      ++pos;
      continue;
    }

    // A non-ASCII byte must start a complete, in-range two-byte pair.
    // The bounds test sits before the read of the second byte.
    if (!InRange(bytes[pos], 0xa1, 0xf7) || pos + 1 >= len ||
        !InRange(bytes[pos + 1], 0xa1, 0xfe)) {
      return false;
    }

    pos += 2;
  }

  return true;
}
#if defined(_WIN32)
// Convert `text` from GB2312 (Windows code page 936) to UTF-8.
//
// The conversion goes through UTF-16: the input is first widened with
// MultiByteToWideChar() and the result is then narrowed to UTF-8 with
// WideCharToMultiByte().
//
// Returns the converted string, or an empty string if `text` is empty or
// any of the Win32 conversion calls fails.
//
// Explicit lengths are passed to both APIs. Passing -1 to
// WideCharToMultiByte() would make it convert and count the terminating
// NUL as well, leaving a spurious trailing '\0' inside the returned
// std::string.
std::string Gb2312ToUtf8(const std::string &text) {
  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
  // 936 is from
  // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
  // GB2312 -> 936
  constexpr unsigned int kGb2312CodePage = 936;

  const int32_t num_bytes = static_cast<int32_t>(text.size());
  if (num_bytes == 0) {
    return {};
  }

  // First pass: ask how many UTF-16 code units are required.
  const int32_t num_wchars = MultiByteToWideChar(
      kGb2312CodePage, 0, text.data(), num_bytes, nullptr, 0);
  if (num_wchars <= 0) {
    return {};
  }

  std::wstring wstr(num_wchars, L'\0');
  if (MultiByteToWideChar(kGb2312CodePage, 0, text.data(), num_bytes,
                          wstr.data(), num_wchars) == 0) {
    return {};
  }

  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
  // First pass: ask how many UTF-8 bytes are required. The length is given
  // explicitly so that no terminating NUL is included in the count.
  const int32_t num_chars = WideCharToMultiByte(
      CP_UTF8, 0, wstr.c_str(), num_wchars, nullptr, 0, nullptr, nullptr);
  if (num_chars <= 0) {
    return {};
  }

  std::string ans(num_chars, '\0');
  if (WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), num_wchars, ans.data(),
                          num_chars, nullptr, nullptr) == 0) {
    return {};
  }

  return ans;
}
#endif
} // namespace sherpa_onnx
... ...
... ... @@ -127,6 +127,18 @@ void ToLowerCase(std::string *in_out);
std::string RemoveInvalidUtf8Sequences(const std::string &text,
bool show_debug_msg = false);
// Return true if the whole of text is a well-formed UTF-8 byte sequence.
// An empty string is considered valid.
// Return false as soon as an ill-formed or truncated sequence is found.
bool IsUtf8(const std::string &text);
// Return true if the whole of text is made up of ASCII bytes and/or
// two-byte GB2312 pairs (first byte in [0xa1, 0xf7], second byte in
// [0xa1, 0xfe]). An empty or pure-ASCII string returns true.
// Return false otherwise.
bool IsGB2312(const std::string &text);
#if defined(_WIN32)
// Convert text from GB2312 (Windows code page 936) to UTF-8.
// Return an empty string if the conversion fails.
// Only available on Windows.
std::string Gb2312ToUtf8(const std::string &text);
#endif
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
... ...