Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-10-28 11:09:34 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-10-28 11:09:34 +0800
Commit
64ab1ea9f8f8ad649dee6f1b4725e034ad08ebb2
64ab1ea9
1 parent
69e985f7
Support Spanish in TTS (#396)
隐藏空白字符变更
内嵌
并排对比
正在显示
3 个修改的文件
包含
43 行增加
和
6 行删除
sherpa-onnx/csrc/lexicon.cc
sherpa-onnx/csrc/lexicon.h
sherpa-onnx/csrc/text-utils.cc
sherpa-onnx/csrc/lexicon.cc
查看文件 @
64ab1ea
...
...
@@ -131,6 +131,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
return
ConvertTextToTokenIdsEnglish
(
text
);
case
Language
:
:
kGerman
:
return
ConvertTextToTokenIdsGerman
(
text
);
case
Language
:
:
kSpanish
:
return
ConvertTextToTokenIdsSpanish
(
text
);
case
Language
:
:
kChinese
:
return
ConvertTextToTokenIdsChinese
(
text
);
default
:
...
...
@@ -250,6 +252,8 @@ void Lexicon::InitLanguage(const std::string &_lang) {
language_
=
Language
::
kEnglish
;
}
else
if
(
lang
==
"german"
)
{
language_
=
Language
::
kGerman
;
}
else
if
(
lang
==
"spanish"
)
{
language_
=
Language
::
kSpanish
;
}
else
if
(
lang
==
"chinese"
)
{
language_
=
Language
::
kChinese
;
}
else
{
...
...
sherpa-onnx/csrc/lexicon.h
查看文件 @
64ab1ea
...
...
@@ -41,6 +41,11 @@ class Lexicon {
return
ConvertTextToTokenIdsEnglish
(
text
);
}
// Converts Spanish text to token IDs.
//
// This simply forwards `text` to ConvertTextToTokenIdsEnglish() and returns
// whatever that routine produces.
std::vector<int64_t> ConvertTextToTokenIdsSpanish(
    const std::string &text) const {
  std::vector<int64_t> token_ids = ConvertTextToTokenIdsEnglish(text);
  return token_ids;
}
std
::
vector
<
int64_t
>
ConvertTextToTokenIdsEnglish
(
const
std
::
string
&
text
)
const
;
...
...
@@ -56,6 +61,7 @@ class Lexicon {
// Language tag used to pick the matching ConvertTextToTokenIds* routine.
// The enumerator order is kept as-is so the underlying values do not change.
enum class Language {
  kEnglish,
  kGerman,
  kSpanish,
  kChinese,
  kUnknown,
};
...
...
sherpa-onnx/csrc/text-utils.cc
查看文件 @
64ab1ea
...
...
@@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
std
::
vector
<
double
>
*
out
);
// Returns true if `c` is a punctuation character other than the apostrophe.
//
// NOTE(review): the apostrophe is presumably excluded so that it stays part
// of a word (e.g. contractions) -- confirm against the callers.
static bool IsPunct(char c) {
  // std::ispunct() requires an argument representable as unsigned char (or
  // EOF). Plain char may be signed, so a UTF-8 byte >= 0x80 would be passed
  // as a negative value, which is undefined behaviour. Convert first.
  return c != '\'' && std::ispunct(static_cast<unsigned char>(c)) != 0;
}
static
bool
IsGermanUmlauts
(
const
std
::
string
&
word
s
)
{
static
bool
IsGermanUmlauts
(
const
std
::
string
&
word
)
{
// ä 0xC3 0xA4
// ö 0xC3 0xB6
// ü 0xC3 0xBC
...
...
@@ -173,12 +173,12 @@ static bool IsGermanUmlauts(const std::string &words) {
// Ü 0xC3 0x9C
// ß 0xC3 0x9F
if
(
word
s
.
size
()
!=
2
||
static_cast
<
uint8_t
>
(
words
[
0
])
!=
0xc3
)
{
if
(
word
.
size
()
!=
2
||
static_cast
<
uint8_t
>
(
word
[
0
])
!=
0xc3
)
{
return
false
;
}
auto
c
=
static_cast
<
uint8_t
>
(
words
[
1
]);
if
(
c
==
0xa4
||
c
==
0xb6
||
c
==
0xbC
||
c
==
0x84
||
c
==
0x96
||
auto
c
=
static_cast
<
uint8_t
>
(
word
[
1
]);
if
(
c
==
0xa4
||
c
==
0xb6
||
c
==
0xbc
||
c
==
0x84
||
c
==
0x96
||
c
==
0x9c
||
c
==
0x9f
)
{
return
true
;
}
...
...
@@ -186,6 +186,33 @@ static bool IsGermanUmlauts(const std::string &words) {
return
false
;
}
// see https://www.tandem.net/blog/spanish-accents
//
// Returns true if `word` is exactly one two-byte UTF-8 sequence encoding a
// Spanish letter with a diacritic, in either lowercase or uppercase.
// Any other input (different length, different lead byte, or a second byte
// not listed below) yields false.
static bool IsSpanishDiacritic(const std::string &word) {
  // lowercase
  // á 0xC3 0xA1
  // é 0xC3 0xA9
  // í 0xC3 0xAD
  // ó 0xC3 0xB3
  // ú 0xC3 0xBA
  // ü 0xC3 0xBC
  // ñ 0xC3 0xB1
  //
  // uppercase
  // Á 0xC3 0x81
  // É 0xC3 0x89
  // Í 0xC3 0x8D
  // Ó 0xC3 0x93
  // Ú 0xC3 0x9A
  // Ü 0xC3 0x9C
  // Ñ 0xC3 0x91
  if (word.size() != 2 || static_cast<uint8_t>(word[0]) != 0xc3) {
    return false;
  }

  // All supported letters share the lead byte 0xC3, so only the second byte
  // needs to be inspected.
  switch (static_cast<uint8_t>(word[1])) {
    // lowercase
    case 0xa1:
    case 0xa9:
    case 0xad:
    case 0xb3:
    case 0xba:
    case 0xbc:
    case 0xb1:
    // uppercase
    case 0x81:
    case 0x89:
    case 0x8d:
    case 0x93:
    case 0x9a:
    case 0x9c:
    case 0x91:
      return true;
    default:
      return false;
  }
}
// Returns true if `w` is accepted by either IsGermanUmlauts() or
// IsSpanishDiacritic(). The German check runs first; the Spanish check is
// only consulted when the German one rejects `w`.
static bool IsSpecial(const std::string &w) {
  if (IsGermanUmlauts(w)) {
    return true;
  }

  return IsSpanishDiacritic(w);
}
static
std
::
vector
<
std
::
string
>
MergeCharactersIntoWords
(
const
std
::
vector
<
std
::
string
>
&
words
)
{
std
::
vector
<
std
::
string
>
ans
;
...
...
@@ -196,7 +223,7 @@ static std::vector<std::string> MergeCharactersIntoWords(
while
(
i
<
n
)
{
const
auto
&
w
=
words
[
i
];
if
(
w
.
size
()
>=
3
||
(
w
.
size
()
==
2
&&
!
Is
GermanUmlauts
(
w
))
||
if
(
w
.
size
()
>=
3
||
(
w
.
size
()
==
2
&&
!
Is
Special
(
w
))
||
(
w
.
size
()
==
1
&&
(
IsPunct
(
w
[
0
])
||
std
::
isspace
(
w
[
0
]))))
{
if
(
prev
!=
-
1
)
{
std
::
string
t
;
...
...
@@ -215,7 +242,7 @@ static std::vector<std::string> MergeCharactersIntoWords(
}
// e.g., öffnen
if
(
w
.
size
()
==
1
||
(
w
.
size
()
==
2
&&
Is
GermanUmlauts
(
w
)))
{
if
(
w
.
size
()
==
1
||
(
w
.
size
()
==
2
&&
Is
Special
(
w
)))
{
if
(
prev
==
-
1
)
{
prev
=
i
;
}
...
...
请
注册
或
登录
后发表评论