Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-10-26 14:10:24 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-10-26 14:10:24 +0800
Commit
44512858d60d1dbb38f1900edb775b7a56c974fd
44512858
1 parent
a8fed2a9
Support vits models from piper (#390)
显示空白字符变更
内嵌
并排对比
正在显示
5 个修改的文件
包含
130 行增加
和
49 行删除
sherpa-onnx/csrc/lexicon.cc
sherpa-onnx/csrc/lexicon.h
sherpa-onnx/csrc/offline-tts-vits-impl.h
sherpa-onnx/csrc/offline-tts-vits-model.cc
sherpa-onnx/csrc/offline-tts-vits-model.h
sherpa-onnx/csrc/lexicon.cc
查看文件 @
4451285
...
...
@@ -83,8 +83,8 @@ static std::vector<int32_t> ConvertTokensToIds(
Lexicon
::
Lexicon
(
const
std
::
string
&
lexicon
,
const
std
::
string
&
tokens
,
const
std
::
string
&
punctuations
,
const
std
::
string
&
language
,
bool
debug
/*= false*/
)
:
debug_
(
debug
)
{
bool
debug
/*= false*/
,
bool
is_piper
/*= false*/
)
:
debug_
(
debug
),
is_piper_
(
is_piper
)
{
InitLanguage
(
language
);
{
...
...
@@ -103,8 +103,9 @@ Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
#if __ANDROID_API__ >= 9
Lexicon
::
Lexicon
(
AAssetManager
*
mgr
,
const
std
::
string
&
lexicon
,
const
std
::
string
&
tokens
,
const
std
::
string
&
punctuations
,
const
std
::
string
&
language
,
bool
debug
/*= false*/
)
:
debug_
(
debug
)
{
const
std
::
string
&
language
,
bool
debug
/*= false*/
,
bool
is_piper
/*= false*/
)
:
debug_
(
debug
),
is_piper_
(
is_piper
)
{
InitLanguage
(
language
);
{
...
...
@@ -206,6 +207,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
int32_t
blank
=
token2id_
.
at
(
" "
);
std
::
vector
<
int64_t
>
ans
;
if
(
is_piper_
)
{
ans
.
push_back
(
token2id_
.
at
(
"^"
));
// sos
}
for
(
const
auto
&
w
:
words
)
{
if
(
punctuations_
.
count
(
w
))
{
ans
.
push_back
(
token2id_
.
at
(
w
));
...
...
@@ -227,6 +232,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
ans
.
resize
(
ans
.
size
()
-
1
);
}
if
(
is_piper_
)
{
ans
.
push_back
(
token2id_
.
at
(
"$"
));
// eos
}
return
ans
;
}
...
...
sherpa-onnx/csrc/lexicon.h
查看文件 @
4451285
...
...
@@ -24,12 +24,13 @@ class Lexicon {
public
:
Lexicon
(
const
std
::
string
&
lexicon
,
const
std
::
string
&
tokens
,
const
std
::
string
&
punctuations
,
const
std
::
string
&
language
,
bool
debug
=
false
);
bool
debug
=
false
,
bool
is_piper
=
false
);
#if __ANDROID_API__ >= 9
Lexicon
(
AAssetManager
*
mgr
,
const
std
::
string
&
lexicon
,
const
std
::
string
&
tokens
,
const
std
::
string
&
punctuations
,
const
std
::
string
&
language
,
bool
debug
=
false
);
const
std
::
string
&
language
,
bool
debug
=
false
,
bool
is_piper
=
false
);
#endif
std
::
vector
<
int64_t
>
ConvertTextToTokenIds
(
const
std
::
string
&
text
)
const
;
...
...
@@ -59,7 +60,7 @@ class Lexicon {
std
::
unordered_map
<
std
::
string
,
int32_t
>
token2id_
;
Language
language_
;
bool
debug_
;
//
bool
is_piper_
;
};
}
// namespace sherpa_onnx
...
...
sherpa-onnx/csrc/offline-tts-vits-impl.h
查看文件 @
4451285
...
...
@@ -26,15 +26,15 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
explicit
OfflineTtsVitsImpl
(
const
OfflineTtsConfig
&
config
)
:
model_
(
std
::
make_unique
<
OfflineTtsVitsModel
>
(
config
.
model
)),
lexicon_
(
config
.
model
.
vits
.
lexicon
,
config
.
model
.
vits
.
tokens
,
model_
->
Punctuations
(),
model_
->
Language
(),
config
.
model
.
debug
)
{}
model_
->
Punctuations
(),
model_
->
Language
(),
config
.
model
.
debug
,
model_
->
IsPiper
())
{}
#if __ANDROID_API__ >= 9
OfflineTtsVitsImpl
(
AAssetManager
*
mgr
,
const
OfflineTtsConfig
&
config
)
:
model_
(
std
::
make_unique
<
OfflineTtsVitsModel
>
(
mgr
,
config
.
model
)),
lexicon_
(
mgr
,
config
.
model
.
vits
.
lexicon
,
config
.
model
.
vits
.
tokens
,
model_
->
Punctuations
(),
model_
->
Language
(),
config
.
model
.
debug
)
{}
model_
->
Punctuations
(),
model_
->
Language
(),
config
.
model
.
debug
,
model_
->
IsPiper
())
{}
#endif
GeneratedAudio
Generate
(
const
std
::
string
&
text
,
int64_t
sid
=
0
,
...
...
@@ -43,17 +43,16 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
if
(
num_speakers
==
0
&&
sid
!=
0
)
{
SHERPA_ONNX_LOGE
(
"This is a single-speaker model and supports only sid 0. Given sid: "
"%d"
,
"%d
. sid is ignored
"
,
static_cast
<
int32_t
>
(
sid
));
return
{};
}
if
(
num_speakers
!=
0
&&
(
sid
>=
num_speakers
||
sid
<
0
))
{
SHERPA_ONNX_LOGE
(
"This model contains only %d speakers. sid should be in the range "
"[%d, %d]. Given: %d"
,
"[%d, %d]. Given: %d
. Use sid=0
"
,
num_speakers
,
0
,
num_speakers
-
1
,
static_cast
<
int32_t
>
(
sid
));
return
{}
;
sid
=
0
;
}
std
::
vector
<
int64_t
>
x
=
lexicon_
.
ConvertTextToTokenIds
(
text
);
...
...
sherpa-onnx/csrc/offline-tts-vits-model.cc
查看文件 @
4451285
...
...
@@ -38,6 +38,107 @@ class OfflineTtsVitsModel::Impl {
#endif
// Entry point for inference. Forwards all arguments to RunVitsPiper() when
// is_piper_ is set, and to RunVits() otherwise; `x` is moved into whichever
// routine is selected.
Ort::Value Run(Ort::Value x, int64_t sid, float speed) {
  return is_piper_ ? RunVitsPiper(std::move(x), sid, speed)
                   : RunVits(std::move(x), sid, speed);
}
// Returns the cached sample_rate_ member.
int32_t SampleRate() const { return sample_rate_; }
// Returns the cached add_blank_ member.
bool AddBlank() const { return add_blank_; }
// Returns a copy of the cached punctuations_ string.
std::string Punctuations() const { return punctuations_; }
// Returns a copy of the cached language_ string.
std::string Language() const { return language_; }
// Returns the is_piper_ flag.
bool IsPiper() const { return is_piper_; }
// Returns the cached num_speakers_ member.
int32_t NumSpeakers() const { return num_speakers_; }
private
:
// Builds the ONNX Runtime session from an in-memory model, records the
// session's input/output names, and fills the metadata-backed members
// (sample_rate_, add_blank_, num_speakers_, punctuations_, language_).
// is_piper_ is switched on when the "comment" metadata entry contains the
// substring "piper".
//
// @param model_data         Pointer to the serialized model bytes.
// @param model_data_length  Length passed along with `model_data` to the
//                           session constructor.
void Init(void *model_data, size_t model_data_length) {
  sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
                                         sess_opts_);

  GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
  GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);

  // get meta data
  Ort::ModelMetadata meta_data = sess_->GetModelMetadata();

  if (config_.debug) {
    // Dump the whole metadata table once when debugging is enabled.
    std::ostringstream os;
    os << "---vits model---\n";
    PrintModelMetadata(os, meta_data);

    const std::string dump = os.str();
    SHERPA_ONNX_LOGE("%s\n", dump.c_str());
  }

  // NOTE: keep the names `allocator` (and presumably `meta_data`) as they
  // are; the SHERPA_ONNX_READ_META_DATA* macros refer to them.
  Ort::AllocatorWithDefaultOptions allocator;  // used in the macro below

  SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate");
  SHERPA_ONNX_READ_META_DATA(add_blank_, "add_blank");
  SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers");
  SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation");
  SHERPA_ONNX_READ_META_DATA_STR(language_, "language");

  std::string comment;
  SHERPA_ONNX_READ_META_DATA_STR(comment, "comment");

  const bool mentions_piper = comment.find("piper") != std::string::npos;
  if (mentions_piper) {
    is_piper_ = true;
  }
}
// Runs inference for a model flagged as a piper export.
//
// The session receives, in order: the token ids `x`, a 1-element tensor
// holding x's second dimension, a 3-element float tensor
// {noise_scale, length_scale, noise_scale_w}, and -- only if the model has
// exactly four inputs and the last one is named "sid" -- a 1-element tensor
// holding `sid`.
//
// @param x      Token-id tensor. Its first dimension must be 1; otherwise an
//               error is logged and the process exits.
// @param sid    Speaker id; used only when the model declares a "sid" input.
// @param speed  If positive and not equal to 1, length_scale becomes
//               1 / speed; otherwise the configured length_scale is used.
// @return The first output of the session run.
Ort::Value RunVitsPiper(Ort::Value x, int64_t sid, float speed) {
  auto memory_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

  const std::vector<int64_t> dims = x.GetTensorTypeAndShapeInfo().GetShape();
  const int64_t batch_size = dims[0];
  if (batch_size != 1) {
    SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d",
                     static_cast<int32_t>(batch_size));
    exit(-1);
  }

  // The tensors created below wrap these locals, so they must stay alive
  // until the session has finished running.
  int64_t num_tokens = dims[1];
  int64_t num_tokens_shape = 1;
  Ort::Value x_length = Ort::Value::CreateTensor(memory_info, &num_tokens, 1,
                                                 &num_tokens_shape, 1);

  float noise_scale = config_.vits.noise_scale;
  float length_scale = config_.vits.length_scale;
  float noise_scale_w = config_.vits.noise_scale_w;

  const bool override_length_scale = speed != 1 && speed > 0;
  if (override_length_scale) {
    length_scale = 1. / speed;
  }

  std::array<float, 3> scales = {noise_scale, length_scale, noise_scale_w};
  int64_t scales_shape = 3;
  Ort::Value scales_tensor = Ort::Value::CreateTensor(
      memory_info, scales.data(), scales.size(), &scales_shape, 1);

  int64_t sid_shape = 1;
  Ort::Value sid_tensor =
      Ort::Value::CreateTensor(memory_info, &sid, 1, &sid_shape, 1);

  std::vector<Ort::Value> inputs;
  inputs.reserve(4);
  inputs.emplace_back(std::move(x));
  inputs.emplace_back(std::move(x_length));
  inputs.emplace_back(std::move(scales_tensor));

  // Single-speaker exports have no "sid" input, so pass it only on demand.
  const bool model_takes_sid =
      input_names_.size() == 4 && input_names_.back() == "sid";
  if (model_takes_sid) {
    inputs.emplace_back(std::move(sid_tensor));
  }

  auto out =
      sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
                 output_names_ptr_.data(), output_names_ptr_.size());

  return std::move(out[0]);
}
Ort
::
Value
RunVits
(
Ort
::
Value
x
,
int64_t
sid
,
float
speed
)
{
auto
memory_info
=
Ort
::
MemoryInfo
::
CreateCpu
(
OrtDeviceAllocator
,
OrtMemTypeDefault
);
...
...
@@ -94,40 +195,6 @@ class OfflineTtsVitsModel::Impl {
return
std
::
move
(
out
[
0
]);
}
// Returns the cached sample_rate_ member.
int32_t SampleRate() const { return sample_rate_; }
// Returns the cached add_blank_ member.
bool AddBlank() const { return add_blank_; }
// Returns a copy of the cached punctuations_ string.
std::string Punctuations() const { return punctuations_; }
// Returns a copy of the cached language_ string.
std::string Language() const { return language_; }
// Returns the cached num_speakers_ member.
int32_t NumSpeakers() const { return num_speakers_; }
private
:
// Builds the ONNX Runtime session from an in-memory model, records the
// session's input/output names, and fills the metadata-backed members
// (sample_rate_, add_blank_, num_speakers_, punctuations_, language_).
//
// @param model_data         Pointer to the serialized model bytes.
// @param model_data_length  Length passed along with `model_data` to the
//                           session constructor.
void Init(void *model_data, size_t model_data_length) {
  sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
                                         sess_opts_);

  GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
  GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);

  // get meta data
  Ort::ModelMetadata meta_data = sess_->GetModelMetadata();

  if (config_.debug) {
    // Dump the whole metadata table once when debugging is enabled.
    std::ostringstream os;
    os << "---vits model---\n";
    PrintModelMetadata(os, meta_data);

    const std::string dump = os.str();
    SHERPA_ONNX_LOGE("%s\n", dump.c_str());
  }

  // NOTE: keep the names `allocator` (and presumably `meta_data`) as they
  // are; the SHERPA_ONNX_READ_META_DATA* macros refer to them.
  Ort::AllocatorWithDefaultOptions allocator;  // used in the macro below

  SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate");
  SHERPA_ONNX_READ_META_DATA(add_blank_, "add_blank");
  SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers");
  SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation");
  SHERPA_ONNX_READ_META_DATA_STR(language_, "language");
}
private
:
OfflineTtsModelConfig
config_
;
Ort
::
Env
env_
;
...
...
@@ -147,6 +214,8 @@ class OfflineTtsVitsModel::Impl {
int32_t
num_speakers_
;
std
::
string
punctuations_
;
std
::
string
language_
;
bool
is_piper_
=
false
;
};
OfflineTtsVitsModel
::
OfflineTtsVitsModel
(
const
OfflineTtsModelConfig
&
config
)
...
...
@@ -175,6 +244,8 @@ std::string OfflineTtsVitsModel::Punctuations() const {
// Forwards to Impl::Language() through the pimpl pointer.
std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); }
// Forwards to Impl::IsPiper() through the pimpl pointer.
bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); }
// Forwards to Impl::NumSpeakers() through the pimpl pointer.
int32_t OfflineTtsVitsModel::NumSpeakers() const {
  return impl_->NumSpeakers();
}
...
...
sherpa-onnx/csrc/offline-tts-vits-model.h
查看文件 @
4451285
...
...
@@ -47,6 +47,7 @@ class OfflineTtsVitsModel {
std
::
string
Punctuations
()
const
;
std
::
string
Language
()
const
;
bool
IsPiper
()
const
;
int32_t
NumSpeakers
()
const
;
private
:
...
...
请
注册
或
登录
后发表评论