Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2025-02-08 09:48:58 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2025-02-08 09:48:58 +0800
Commit
d38cb810145fcb5ebbe70b329e296f338ce73211
d38cb810
1 parent
51b42748
Fix passing gb2312 encoded strings to tts on Windows (#1819)
隐藏空白字符变更
内嵌
并排对比
正在显示
3 个修改的文件
包含
155 行增加
和
0 行删除
sherpa-onnx/csrc/offline-tts.cc
sherpa-onnx/csrc/text-utils.cc
sherpa-onnx/csrc/text-utils.h
sherpa-onnx/csrc/offline-tts.cc
查看文件 @
d38cb81
...
...
@@ -96,7 +96,27 @@ OfflineTts::~OfflineTts() = default;
// Generate audio for `text` by forwarding to the underlying implementation.
//
// On non-Windows platforms the arguments are forwarded unchanged.
//
// On Windows the encoding of `text` is inspected first:
//  - valid UTF-8 is forwarded unchanged;
//  - text that looks like GB2312 is converted to UTF-8 before forwarding
//    (a notice is logged the first time this happens);
//  - anything else is forwarded unchanged after logging a warning.
//
// @param text     Text to synthesize.
// @param sid      Passed through to impl_->Generate().
// @param speed    Passed through to impl_->Generate().
// @param callback Passed through (moved) to impl_->Generate().
// @return Whatever impl_->Generate() returns.
GeneratedAudio OfflineTts::Generate(
    const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/,
    GeneratedAudioCallback callback /*= nullptr*/) const {
#if defined(_WIN32)
  if (!IsUtf8(text)) {
    if (IsGB2312(text)) {
      auto utf8_text = Gb2312ToUtf8(text);

      // Only announce the conversion once per process to avoid log spam.
      // NOTE(review): this flag is a plain static with no synchronization;
      // confirm whether Generate() may be called from several threads.
      static bool printed = false;
      if (!printed) {
        SHERPA_ONNX_LOGE(
            "Detected GB2312 encoded string! Converting it to UTF8.");
        printed = true;
      }

      return impl_->Generate(utf8_text, sid, speed, std::move(callback));
    }

    // Neither UTF-8 nor GB2312: warn, then fall through and pass the text
    // on as-is.
    SHERPA_ONNX_LOGE(
        "Non UTF8 encoded string is received. You would not get expected "
        "results!");
  }
#endif

  return impl_->Generate(text, sid, speed, std::move(callback));
}
// Forwards to impl_->SampleRate() and returns its result.
int32_t
OfflineTts
::
SampleRate
()
const
{
return
impl_
->
SampleRate
();
}
...
...
sherpa-onnx/csrc/text-utils.cc
查看文件 @
d38cb81
...
...
@@ -16,6 +16,10 @@
#include <utility>
#include <vector>
#if defined(_WIN32)
#include <Windows.h>
#endif
#include "sherpa-onnx/csrc/macros.h"
// This file is copied/modified from
...
...
@@ -502,4 +506,123 @@ std::string RemoveInvalidUtf8Sequences(const std::string &text,
return
ans
;
}
// Return true if every byte of `text` belongs to a well-formed UTF-8
// sequence; return false as soon as an ill-formed or truncated sequence is
// found. An empty string yields true.
//
// Accepted byte patterns:
//
//   lead byte     2nd byte      remaining bytes
//   00..7f        -             -
//   c2..df        80..bf        -
//   e0            a0..bf        80..bf
//   e1..ec        80..bf        80..bf
//   ed            80..9f        80..bf
//   ee..ef        80..bf        80..bf
//   f0            90..bf        80..bf 80..bf
//   f1..f3        80..bf        80..bf 80..bf
//   f4            80..8f        80..bf 80..bf
bool IsUtf8(const std::string &text) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(text.data());
  const int32_t len = static_cast<int32_t>(text.size());

  for (int32_t pos = 0; pos < len;) {
    const uint8_t lead = bytes[pos];

    // Single-byte (ASCII) character.
    if (lead <= 0x7f) {
      pos += 1;
      continue;
    }

    // Classify the lead byte: how many bytes follow it, and which range the
    // first of those bytes is restricted to.
    int32_t num_trailing = 0;
    uint8_t second_lo = 0x80;
    uint8_t second_hi = 0xbf;

    if (InRange(lead, 0xc2, 0xdf)) {
      num_trailing = 1;
    } else if (lead == 0xe0) {
      num_trailing = 2;
      second_lo = 0xa0;
    } else if (InRange(lead, 0xe1, 0xec)) {
      num_trailing = 2;
    } else if (lead == 0xed) {
      num_trailing = 2;
      second_hi = 0x9f;
    } else if (InRange(lead, 0xee, 0xef)) {
      num_trailing = 2;
    } else if (lead == 0xf0) {
      num_trailing = 3;
      second_lo = 0x90;
    } else if (InRange(lead, 0xf1, 0xf3)) {
      num_trailing = 3;
    } else if (lead == 0xf4) {
      num_trailing = 3;
      second_hi = 0x8f;
    } else {
      // Not a valid lead byte.
      return false;
    }

    // The sequence must fit entirely inside the string.
    if (pos + num_trailing >= len) {
      return false;
    }

    if (!InRange(bytes[pos + 1], second_lo, second_hi)) {
      return false;
    }

    // Any bytes after the second one are plain continuation bytes.
    for (int32_t k = 2; k <= num_trailing; ++k) {
      if (!InRange(bytes[pos + k], 0x80, 0xbf)) {
        return false;
      }
    }

    pos += num_trailing + 1;
  }

  return true;
}
// Return true if `text` is made up solely of
//  - single bytes in 00..7f, and
//  - two-byte sequences whose first byte is in a1..f7 and whose second byte
//    is in a1..fe.
// Return false as soon as any other byte pattern (or a truncated two-byte
// sequence) is encountered. An empty string yields true.
bool IsGB2312(const std::string &text) {
  const auto *bytes = reinterpret_cast<const uint8_t *>(text.data());
  const int32_t len = static_cast<int32_t>(text.size());

  for (int32_t pos = 0; pos < len;) {
    const uint8_t lead = bytes[pos];

    // Single-byte (ASCII) character.
    if (lead <= 0x7f) {
      pos += 1;
      continue;
    }

    // Everything else must be a complete two-byte sequence. The bounds check
    // comes before the read of the second byte.
    if (!InRange(lead, 0xa1, 0xf7) || !(pos + 1 < len) ||
        !InRange(bytes[pos + 1], 0xa1, 0xfe)) {
      return false;
    }

    pos += 2;
  }

  return true;
}
#if defined(_WIN32)
// Convert `text` from GB2312 (Windows code page 936) to UTF-8.
//
// The conversion goes through UTF-16 using the Win32 APIs
// MultiByteToWideChar() and WideCharToMultiByte().
//
// @param text Bytes encoded with code page 936. Its size is assumed to fit
//             in an int, which is what the Win32 APIs accept.
// @return The UTF-8 encoded string, without any trailing NUL character.
//         An empty string is returned if `text` is empty or if any of the
//         conversion steps fails.
std::string Gb2312ToUtf8(const std::string &text) {
  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
  // 936 is from
  // https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
  // GB2312 -> 936
  constexpr UINT kCodePageGb2312 = 936;

  if (text.empty()) {
    return {};
  }

  const int num_bytes = static_cast<int>(text.size());

  // First pass: ask how many UTF-16 code units are required.
  const int num_wchars = MultiByteToWideChar(kCodePageGb2312, 0, text.data(),
                                             num_bytes, nullptr, 0);
  if (num_wchars <= 0) {
    return {};
  }

  // Second pass: do the actual GB2312 -> UTF-16 conversion.
  std::wstring wstr(static_cast<size_t>(num_wchars), L'\0');
  if (MultiByteToWideChar(kCodePageGb2312, 0, text.data(), num_bytes, &wstr[0],
                          num_wchars) != num_wchars) {
    return {};
  }

  // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
  //
  // Pass the explicit length instead of -1. With -1 the API treats the input
  // as NUL-terminated and includes the terminator in both the returned count
  // and the output, which would leave a stray '\0' at the end of the
  // std::string we return.
  const int num_chars = WideCharToMultiByte(
      CP_UTF8, 0, wstr.data(), num_wchars, nullptr, 0, nullptr, nullptr);
  if (num_chars <= 0) {
    return {};
  }

  std::string ans(static_cast<size_t>(num_chars), '\0');
  if (WideCharToMultiByte(CP_UTF8, 0, wstr.data(), num_wchars, &ans[0],
                          num_chars, nullptr, nullptr) != num_chars) {
    return {};
  }

  return ans;
}
#endif
}
// namespace sherpa_onnx
...
...
sherpa-onnx/csrc/text-utils.h
查看文件 @
d38cb81
...
...
@@ -127,6 +127,18 @@ void ToLowerCase(std::string *in_out);
std
::
string
RemoveInvalidUtf8Sequences
(
const
std
::
string
&
text
,
bool
show_debug_msg
=
false
);
// Return true if the whole of text is a well-formed UTF-8 byte sequence.
// Return false otherwise. An empty string is considered valid.
bool
IsUtf8
(
const
std
::
string
&
text
);
// Return true if text consists only of ASCII bytes and complete two-byte
// GB2312 sequences. Return false otherwise. Pure ASCII text also passes.
bool
IsGB2312
(
const
std
::
string
&
text
);
#if defined(_WIN32)
// Convert text from GB2312 (Windows code page 936) to UTF-8.
// Return an empty string if the conversion fails.
// This function is only available on Windows.
std
::
string
Gb2312ToUtf8
(
const
std
::
string
&
text
);
#endif
}
// namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
...
...
请
注册
或
登录
后发表评论