Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-10-28 22:22:00 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-10-28 22:22:00 +0800
Commit
157628b2574f6cd277ed9db2c7ac95ce5be64a14
157628b2
1 parent
64ab1ea9
Support French in TTS (#397)
隐藏空白字符变更
内嵌
并排对比
正在显示
4 个修改的文件
包含
96 行增加
和
4 行删除
CMakeLists.txt
sherpa-onnx/csrc/lexicon.cc
sherpa-onnx/csrc/lexicon.h
sherpa-onnx/csrc/text-utils.cc
CMakeLists.txt
查看文件 @
157628b
cmake_minimum_required
(
VERSION 3.13 FATAL_ERROR
)
project
(
sherpa-onnx
)
set
(
SHERPA_ONNX_VERSION
"1.8.
6
"
)
set
(
SHERPA_ONNX_VERSION
"1.8.
7
"
)
# Disable warning about
#
...
...
sherpa-onnx/csrc/lexicon.cc
查看文件 @
157628b
...
...
@@ -133,6 +133,8 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
return
ConvertTextToTokenIdsGerman
(
text
);
case
Language
:
:
kSpanish
:
return
ConvertTextToTokenIdsSpanish
(
text
);
case
Language
:
:
kFrench
:
return
ConvertTextToTokenIdsFrench
(
text
);
case
Language
:
:
kChinese
:
return
ConvertTextToTokenIdsChinese
(
text
);
default
:
...
...
@@ -254,6 +256,8 @@ void Lexicon::InitLanguage(const std::string &_lang) {
language_
=
Language
::
kGerman
;
}
else
if
(
lang
==
"spanish"
)
{
language_
=
Language
::
kSpanish
;
}
else
if
(
lang
==
"french"
)
{
language_
=
Language
::
kFrench
;
}
else
if
(
lang
==
"chinese"
)
{
language_
=
Language
::
kChinese
;
}
else
{
...
...
sherpa-onnx/csrc/lexicon.h
查看文件 @
157628b
...
...
@@ -46,6 +46,11 @@ class Lexicon {
return
ConvertTextToTokenIdsEnglish
(
text
);
}
// Converts French `text` to token IDs.
//
// This is a thin forwarder: the text is handed unchanged to
// ConvertTextToTokenIdsEnglish() and its result is returned as-is.
std::vector<int64_t> ConvertTextToTokenIdsFrench(
    const std::string &text) const {
  auto token_ids = ConvertTextToTokenIdsEnglish(text);
  return token_ids;
}
std
::
vector
<
int64_t
>
ConvertTextToTokenIdsEnglish
(
const
std
::
string
&
text
)
const
;
...
...
@@ -62,6 +67,7 @@ class Lexicon {
kEnglish
,
kGerman
,
kSpanish
,
kFrench
,
kChinese
,
kUnknown
,
};
...
...
sherpa-onnx/csrc/text-utils.cc
查看文件 @
157628b
...
...
@@ -164,7 +164,7 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
std
::
vector
<
double
>
*
out
);
// Returns true if `c` is a punctuation character according to
// std::ispunct(), except for the apostrophe ('\''), which is never
// reported as punctuation.
static bool IsPunct(char c) {
  // The <cctype> classifiers have undefined behavior when given a negative
  // value other than EOF. Where `char` is signed, every byte >= 0x80 (e.g.
  // the bytes of a multi-byte UTF-8 sequence) is negative, so convert to
  // unsigned char before classifying.
  const auto uc = static_cast<unsigned char>(c);
  return c != '\'' && std::ispunct(uc) != 0;
}
static
bool
IsGermanUmlaut
s
(
const
std
::
string
&
word
)
{
static
bool
IsGermanUmlaut
(
const
std
::
string
&
word
)
{
// ä 0xC3 0xA4
// ö 0xC3 0xB6
// ü 0xC3 0xBC
...
...
@@ -187,6 +187,7 @@ static bool IsGermanUmlauts(const std::string &word) {
}
// see https://www.tandem.net/blog/spanish-accents
// https://www.compart.com/en/unicode/U+00DC
static
bool
IsSpanishDiacritic
(
const
std
::
string
&
word
)
{
// á 0xC3 0xA1
// é 0xC3 0xA9
...
...
@@ -195,6 +196,16 @@ static bool IsSpanishDiacritic(const std::string &word) {
// ú 0xC3 0xBA
// ü 0xC3 0xBC
// ñ 0xC3 0xB1
//
// uppercase
//
// Á 0xC3 0x81
// É 0xC3 0x89
// Í 0xC3 0x8D
// Ó 0xC3 0x93
// Ú 0xC3 0x9A
// Ü 0xC3 0x9C
// Ñ 0xC3 0x91
if
(
word
.
size
()
!=
2
||
static_cast
<
uint8_t
>
(
word
[
0
])
!=
0xc3
)
{
return
false
;
...
...
@@ -202,15 +213,86 @@ static bool IsSpanishDiacritic(const std::string &word) {
auto
c
=
static_cast
<
uint8_t
>
(
word
[
1
]);
if
(
c
==
0xa1
||
c
==
0xa9
||
c
==
0xad
||
c
==
0xb3
||
c
==
0xba
||
c
==
0xbc
||
c
==
0xb1
)
{
c
==
0xbc
||
c
==
0xb1
||
c
==
0x81
||
c
==
0x89
||
c
==
0x8d
||
c
==
0x93
||
c
==
0x9a
||
c
==
0x9c
||
c
==
0x91
)
{
return
true
;
}
return
false
;
}
// see https://www.busuu.com/en/french/accent-marks
//
// Returns true if `word` is exactly two bytes long, its first byte is 0xC3,
// and its second byte is one of the values listed in the switch below (the
// UTF-8 encodings of the accented French letters named in the per-case
// comments). Returns false for everything else.
static bool IsFrenchDiacritic(const std::string &word) {
  if (word.size() != 2) {
    return false;
  }

  const auto lead = static_cast<uint8_t>(word[0]);
  if (lead != 0xc3) {
    return false;
  }

  const auto trail = static_cast<uint8_t>(word[1]);
  switch (trail) {
    // acute accent
    case 0xa9:  // é
    case 0x89:  // É
    // grave accent
    case 0xa0:  // à
    case 0xa8:  // è
    case 0xb9:  // ù
    case 0x80:  // À
    case 0x88:  // È
    case 0x99:  // Ù
    // cedilla
    case 0xa7:  // ç
    case 0x87:  // Ç
    // circumflex
    case 0xa2:  // â
    case 0xaa:  // ê
    case 0xae:  // î
    case 0xb4:  // ô
    case 0xbb:  // û
    case 0x82:  // Â
    case 0x8a:  // Ê
    case 0x8e:  // Î
    case 0x94:  // Ô
    case 0x9b:  // Û
    // trema
    case 0xab:  // ë
    case 0xaf:  // ï
    case 0xbc:  // ü
    case 0x8b:  // Ë
    case 0x8f:  // Ï
    case 0x9c:  // Ü
      return true;
    default:
      return false;
  }
}
// Returns true if `w` is one of the multi-byte UTF-8 sequences singled out
// by this file:
//   - a letter accepted by IsGermanUmlaut(), IsSpanishDiacritic() or
//     IsFrenchDiacritic(), or
//   - the three-byte sequence 0xE2 0x80 0x99 (’), which appears in French
//     text such as d’impossible.
//
// The earlier `return IsGermanUmlauts(w) || IsSpanishDiacritic(w);` has been
// dropped: it returned before the French checks below could ever run.
static bool IsSpecial(const std::string &w) {
  if (IsGermanUmlaut(w) || IsSpanishDiacritic(w) || IsFrenchDiacritic(w)) {
    return true;
  }

  // for french d’impossible
  // ’ 0xE2 0x80 0x99
  return w.size() == 3 && static_cast<uint8_t>(w[0]) == 0xe2 &&
         static_cast<uint8_t>(w[1]) == 0x80 &&
         static_cast<uint8_t>(w[2]) == 0x99;
}
static
std
::
vector
<
std
::
string
>
MergeCharactersIntoWords
(
...
...
请
注册
或
登录
后发表评论