Committed by
GitHub
Fix passing gb2312 encoded strings to tts on Windows (#1819)
正在显示
3 个修改的文件
包含
155 行增加
和
0 行删除
| @@ -96,7 +96,27 @@ OfflineTts::~OfflineTts() = default; | @@ -96,7 +96,27 @@ OfflineTts::~OfflineTts() = default; | ||
| 96 | GeneratedAudio OfflineTts::Generate( | 96 | GeneratedAudio OfflineTts::Generate( |
| 97 | const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/, | 97 | const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/, |
| 98 | GeneratedAudioCallback callback /*= nullptr*/) const { | 98 | GeneratedAudioCallback callback /*= nullptr*/) const { |
| 99 | +#if !defined(_WIN32) | ||
| 99 | return impl_->Generate(text, sid, speed, std::move(callback)); | 100 | return impl_->Generate(text, sid, speed, std::move(callback)); |
| 101 | +#else | ||
| 102 | + if (IsUtf8(text)) { | ||
| 103 | + return impl_->Generate(text, sid, speed, std::move(callback)); | ||
| 104 | + } else if (IsGB2312(text)) { | ||
| 105 | + auto utf8_text = Gb2312ToUtf8(text); | ||
| 106 | + static bool printed = false; | ||
| 107 | + if (!printed) { | ||
| 108 | + SHERPA_ONNX_LOGE( | ||
| 109 | + "Detected GB2312 encoded string! Converting it to UTF8."); | ||
| 110 | + printed = true; | ||
| 111 | + } | ||
| 112 | + return impl_->Generate(utf8_text, sid, speed, std::move(callback)); | ||
| 113 | + } else { | ||
| 114 | + SHERPA_ONNX_LOGE( | ||
| 115 | + "Non UTF8 encoded string is received. You would not get expected " | ||
| 116 | + "results!"); | ||
| 117 | + return impl_->Generate(text, sid, speed, std::move(callback)); | ||
| 118 | + } | ||
| 119 | +#endif | ||
| 100 | } | 120 | } |
| 101 | 121 | ||
| 102 | int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); } | 122 | int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); } |
| @@ -16,6 +16,10 @@ | @@ -16,6 +16,10 @@ | ||
| 16 | #include <utility> | 16 | #include <utility> |
| 17 | #include <vector> | 17 | #include <vector> |
| 18 | 18 | ||
| 19 | +#if defined(_WIN32) | ||
| 20 | +#include <Windows.h> | ||
| 21 | +#endif | ||
| 22 | + | ||
| 19 | #include "sherpa-onnx/csrc/macros.h" | 23 | #include "sherpa-onnx/csrc/macros.h" |
| 20 | 24 | ||
| 21 | // This file is copied/modified from | 25 | // This file is copied/modified from |
| @@ -502,4 +506,123 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text, | @@ -502,4 +506,123 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text, | ||
| 502 | return ans; | 506 | return ans; |
| 503 | } | 507 | } |
| 504 | 508 | ||
| 509 | +bool IsUtf8(const std::string &text) { | ||
| 510 | + int32_t n = static_cast<int32_t>(text.size()); | ||
| 511 | + int32_t i = 0; | ||
| 512 | + const uint8_t *p = reinterpret_cast<const uint8_t *>(text.data()); | ||
| 513 | + while (i < n) { | ||
| 514 | + if (p[i] <= 0x7f) { | ||
| 515 | + i += 1; | ||
| 516 | + continue; | ||
| 517 | + } | ||
| 518 | + | ||
| 519 | + if (InRange(p[i], 0xc2, 0xdf) && i + 1 < n && | ||
| 520 | + InRange(p[i + 1], 0x80, 0xbf)) { | ||
| 521 | + i += 2; | ||
| 522 | + continue; | ||
| 523 | + } | ||
| 524 | + | ||
| 525 | + if (p[i] == 0xe0 && i + 2 < n && InRange(p[i + 1], 0xa0, 0xbf) && | ||
| 526 | + InRange(p[i + 2], 0x80, 0xbf)) { | ||
| 527 | + i += 3; | ||
| 528 | + continue; | ||
| 529 | + } | ||
| 530 | + | ||
| 531 | + if (InRange(p[i], 0xe1, 0xec) && i + 2 < n && | ||
| 532 | + InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf)) { | ||
| 533 | + i += 3; | ||
| 534 | + continue; | ||
| 535 | + } | ||
| 536 | + | ||
| 537 | + if (p[i] == 0xed && i + 2 < n && InRange(p[i + 1], 0x80, 0x9f) && | ||
| 538 | + InRange(p[i + 2], 0x80, 0xbf)) { | ||
| 539 | + i += 3; | ||
| 540 | + continue; | ||
| 541 | + } | ||
| 542 | + | ||
| 543 | + if (InRange(p[i], 0xee, 0xef) && i + 2 < n && | ||
| 544 | + InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf)) { | ||
| 545 | + i += 3; | ||
| 546 | + continue; | ||
| 547 | + } | ||
| 548 | + | ||
| 549 | + if (p[i] == 0xf0 && i + 3 < n && InRange(p[i + 1], 0x90, 0xbf) && | ||
| 550 | + InRange(p[i + 2], 0x80, 0xbf) && InRange(p[i + 3], 0x80, 0xbf)) { | ||
| 551 | + i += 4; | ||
| 552 | + continue; | ||
| 553 | + } | ||
| 554 | + | ||
| 555 | + if (InRange(p[i], 0xf1, 0xf3) && i + 3 < n && | ||
| 556 | + InRange(p[i + 1], 0x80, 0xbf) && InRange(p[i + 2], 0x80, 0xbf) && | ||
| 557 | + InRange(p[i + 3], 0x80, 0xbf)) { | ||
| 558 | + i += 4; | ||
| 559 | + continue; | ||
| 560 | + } | ||
| 561 | + | ||
| 562 | + if (p[i] == 0xf4 && i + 3 < n && InRange(p[i + 1], 0x80, 0x8f) && | ||
| 563 | + InRange(p[i + 2], 0x80, 0xbf) && InRange(p[i + 3], 0x80, 0xbf)) { | ||
| 564 | + i += 4; | ||
| 565 | + continue; | ||
| 566 | + } | ||
| 567 | + | ||
| 568 | + return false; | ||
| 569 | + } | ||
| 570 | + | ||
| 571 | + return true; | ||
| 572 | +} | ||
| 573 | + | ||
| 574 | +bool IsGB2312(const std::string &text) { | ||
| 575 | + int32_t n = static_cast<int32_t>(text.size()); | ||
| 576 | + int32_t i = 0; | ||
| 577 | + const uint8_t *p = reinterpret_cast<const uint8_t *>(text.data()); | ||
| 578 | + while (i < n) { | ||
| 579 | + if (p[i] <= 0x7f) { | ||
| 580 | + i += 1; | ||
| 581 | + continue; | ||
| 582 | + } | ||
| 583 | + | ||
| 584 | + if (InRange(p[i], 0xa1, 0xf7) && i + 1 < n && | ||
| 585 | + InRange(p[i + 1], 0xa1, 0xfe)) { | ||
| 586 | + i += 2; | ||
| 587 | + continue; | ||
| 588 | + } | ||
| 589 | + | ||
| 590 | + return false; | ||
| 591 | + } | ||
| 592 | + | ||
| 593 | + return true; | ||
| 594 | +} | ||
| 595 | + | ||
#if defined(_WIN32)
// Converts `text` from Windows code page 936 to UTF-8 (Windows only).
//
// The conversion goes through UTF-16: MultiByteToWideChar() decodes the
// input, then WideCharToMultiByte() encodes the result as UTF-8.
//
// @param text  Bytes encoded in code page 936.
// @return The UTF-8 encoded string, with no trailing NUL character. An empty
//         string is returned when `text` is empty or when either Win32
//         conversion call fails.
std::string Gb2312ToUtf8(const std::string &text) {
  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
  // 936 is from
  // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
  // GB2312 -> 936
  constexpr UINT kCodePageGb2312 = 936;

  // MultiByteToWideChar() fails for a zero-length input, so answer directly.
  if (text.empty()) {
    return {};
  }

  const int32_t num_bytes = static_cast<int32_t>(text.size());

  // First call: query how many UTF-16 code units are needed.
  const int32_t num_wchars = MultiByteToWideChar(
      kCodePageGb2312, 0, text.data(), num_bytes, nullptr, 0);
  if (num_wchars <= 0) {
    return {};
  }

  std::wstring wstr(static_cast<size_t>(num_wchars), L'\0');
  if (MultiByteToWideChar(kCodePageGb2312, 0, text.data(), num_bytes,
                          wstr.data(), num_wchars) == 0) {
    return {};
  }

  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
  // Pass the explicit length instead of -1: with -1 the reported size
  // includes the terminating NUL, which would end up as a stray '\0' at the
  // end of the returned std::string, and conversion would stop at the first
  // embedded NUL.
  const int32_t num_chars = WideCharToMultiByte(
      CP_UTF8, 0, wstr.data(), num_wchars, nullptr, 0, nullptr, nullptr);
  if (num_chars <= 0) {
    return {};
  }

  std::string ans(static_cast<size_t>(num_chars), '\0');
  if (WideCharToMultiByte(CP_UTF8, 0, wstr.data(), num_wchars, ans.data(),
                          num_chars, nullptr, nullptr) == 0) {
    return {};
  }

  return ans;
}
#endif
| 627 | + | ||
| 505 | } // namespace sherpa_onnx | 628 | } // namespace sherpa_onnx |
| @@ -127,6 +127,18 @@ void ToLowerCase(std::string *in_out); | @@ -127,6 +127,18 @@ void ToLowerCase(std::string *in_out); | ||
| 127 | std::string RemoveInvalidUtf8Sequences(const std::string &text, | 127 | std::string RemoveInvalidUtf8Sequences(const std::string &text, |
| 128 | bool show_debug_msg = false); | 128 | bool show_debug_msg = false); |
| 129 | 129 | ||
| 130 | +// Return true if the whole of text is a valid UTF-8 byte sequence | | |
| 131 | +// (empty and ASCII-only strings count as valid). Return false otherwise. | | |
| 132 | +bool IsUtf8(const std::string &text); | ||
| 133 | + | ||
| 134 | +// Return true if the whole of text is a valid GB2312 encoded sequence | | |
| 135 | +// (ASCII-only strings also pass, so check IsUtf8 first). Return false otherwise. | | |
| 136 | +bool IsGB2312(const std::string &text); | ||
| 137 | + | ||
| 138 | +#if defined(_WIN32) | ||
| 139 | +std::string Gb2312ToUtf8(const std::string &text); | ||
| 140 | +#endif | ||
| 141 | + | ||
| 130 | } // namespace sherpa_onnx | 142 | } // namespace sherpa_onnx |
| 131 | 143 | ||
| 132 | #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ | 144 | #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ |
-
请 注册 或 登录 后发表评论