Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-12-06 11:00:38 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-12-06 11:00:38 +0800
Commit
23cf92daf71e47c0ddb1130bdfba085be7feb189
23cf92da
1 parent
3b90e85e
Use espeak-ng for coqui-ai/TTS VITS English models. (#466)
隐藏空白字符变更
内嵌
并排对比
正在显示
10 个修改的文件
包含
230 行增加
和
93 行删除
android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt
python-api-examples/offline-tts-play.py
scripts/apk/generate-tts-apk-script.py
sherpa-onnx/csrc/offline-tts-vits-impl.h
sherpa-onnx/csrc/offline-tts-vits-model-config.cc
sherpa-onnx/csrc/offline-tts-vits-model-metadata.h
sherpa-onnx/csrc/offline-tts-vits-model.cc
sherpa-onnx/csrc/offline-tts-vits-model.h
sherpa-onnx/csrc/piper-phonemize-lexicon.cc
sherpa-onnx/csrc/piper-phonemize-lexicon.h
android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt
查看文件 @
23cf92d
...
...
@@ -23,7 +23,7 @@ data class OfflineTtsModelConfig(
data class OfflineTtsConfig(
var model: OfflineTtsModelConfig,
var ruleFsts: String = "",
var maxNumSentences: Int =
2
,
var maxNumSentences: Int =
1
,
)
class GeneratedAudio(
...
...
python-api-examples/offline-tts-play.py
查看文件 @
23cf92d
...
...
@@ -311,6 +311,9 @@ def main():
if
len
(
audio
.
samples
)
==
0
:
print
(
"Error in generating audios. Please read previous error messages."
)
global
killed
killed
=
True
play_back_thread
.
join
()
return
elapsed_seconds
=
end
-
start
...
...
scripts/apk/generate-tts-apk-script.py
查看文件 @
23cf92d
...
...
@@ -33,6 +33,23 @@ class TtsModel:
data_dir
:
Optional
[
str
]
=
None
def get_coqui_models() -> List[TtsModel]:
    """Return the English VITS models exported from coqui-ai/TTS.

    Every returned entry uses ``model.onnx`` as its model file name, ``en`` as
    its language, and ``<model_dir>/espeak-ng-data`` as its data directory.
    """
    # English (coqui-ai/TTS)
    model_dirs = (
        "vits-coqui-en-ljspeech",
        "vits-coqui-en-ljspeech-neon",
        "vits-coqui-en-vctk",
        # "vits-coqui-en-jenny",
    )

    coqui_models = []
    for name in model_dirs:
        entry = TtsModel(model_dir=name)
        # The espeak-ng data directory lives inside the model directory.
        entry.data_dir = entry.model_dir + "/" + "espeak-ng-data"
        entry.model_name = "model.onnx"
        entry.lang = "en"
        coqui_models.append(entry)

    return coqui_models
def
get_piper_models
()
->
List
[
TtsModel
]:
models
=
[
TtsModel
(
model_dir
=
"vits-piper-ar_JO-kareem-low"
),
...
...
@@ -137,6 +154,7 @@ def get_piper_models() -> List[TtsModel]:
TtsModel
(
model_dir
=
"vits-piper-vi_VN-vivos-x_low"
),
TtsModel
(
model_dir
=
"vits-piper-zh_CN-huayan-medium"
),
]
for
m
in
models
:
m
.
data_dir
=
m
.
model_dir
+
"/"
+
"espeak-ng-data"
m
.
model_name
=
m
.
model_dir
[
len
(
"vits-piper-"
)
:]
+
".onnx"
...
...
@@ -145,7 +163,7 @@ def get_piper_models() -> List[TtsModel]:
return
models
def
get_
all
_models
()
->
List
[
TtsModel
]:
def
get_
vits
_models
()
->
List
[
TtsModel
]:
return
[
# Chinese
TtsModel
(
...
...
@@ -202,12 +220,6 @@ def get_all_models() -> List[TtsModel]:
lang
=
"zh"
,
rule_fsts
=
"vits-zh-hf-theresa/rule.fst"
,
),
# English (coqui-ai/TTS)
# fmt: off
TtsModel
(
model_dir
=
"vits-coqui-en-ljspeech"
,
model_name
=
"model.onnx"
,
lang
=
"en"
),
TtsModel
(
model_dir
=
"vits-coqui-en-ljspeech-neon"
,
model_name
=
"model.onnx"
,
lang
=
"en"
),
TtsModel
(
model_dir
=
"vits-coqui-en-vctk"
,
model_name
=
"model.onnx"
,
lang
=
"en"
),
# TtsModel(model_dir="vits-coqui-en-jenny", model_name="model.onnx", lang="en"),
# English (US)
TtsModel
(
model_dir
=
"vits-vctk"
,
model_name
=
"vits-vctk.onnx"
,
lang
=
"en"
),
TtsModel
(
model_dir
=
"vits-ljs"
,
model_name
=
"vits-ljs.onnx"
,
lang
=
"en"
),
...
...
@@ -225,8 +237,11 @@ def main():
s
=
f
.
read
()
template
=
environment
.
from_string
(
s
)
d
=
dict
()
# all_model_list = get_all_models()
# all_model_list = get_vits_models()
all_model_list
=
get_piper_models
()
all_model_list
+=
get_coqui_models
()
num_models
=
len
(
all_model_list
)
num_per_runner
=
num_models
//
total
...
...
sherpa-onnx/csrc/offline-tts-vits-impl.h
查看文件 @
23cf92d
...
...
@@ -69,12 +69,16 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
}
#endif
int32_t
SampleRate
()
const
override
{
return
model_
->
SampleRate
();
}
int32_t
SampleRate
()
const
override
{
return
model_
->
GetMetaData
().
sample_rate
;
}
GeneratedAudio
Generate
(
const
std
::
string
&
_text
,
int64_t
sid
=
0
,
float
speed
=
1
.
0
,
GeneratedAudioCallback
callback
=
nullptr
)
const
override
{
int32_t
num_speakers
=
model_
->
NumSpeakers
();
const
auto
&
meta_data
=
model_
->
GetMetaData
();
int32_t
num_speakers
=
meta_data
.
num_speakers
;
if
(
num_speakers
==
0
&&
sid
!=
0
)
{
SHERPA_ONNX_LOGE
(
"This is a single-speaker model and supports only sid 0. Given sid: "
...
...
@@ -105,14 +109,14 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
}
std
::
vector
<
std
::
vector
<
int64_t
>>
x
=
frontend_
->
ConvertTextToTokenIds
(
text
,
m
odel_
->
Voice
()
);
frontend_
->
ConvertTextToTokenIds
(
text
,
m
eta_data
.
voice
);
if
(
x
.
empty
()
||
(
x
.
size
()
==
1
&&
x
[
0
].
empty
()))
{
SHERPA_ONNX_LOGE
(
"Failed to convert %s to token IDs"
,
text
.
c_str
());
return
{};
}
if
(
m
odel_
->
AddBlank
()
&&
config_
.
model
.
vits
.
data_dir
.
empty
())
{
if
(
m
eta_data
.
add_blank
&&
config_
.
model
.
vits
.
data_dir
.
empty
())
{
for
(
auto
&
k
:
x
)
{
k
=
AddBlank
(
k
);
}
...
...
@@ -189,25 +193,33 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
private
:
#if __ANDROID_API__ >= 9
void
InitFrontend
(
AAssetManager
*
mgr
)
{
if
(
model_
->
IsPiper
()
&&
!
config_
.
model
.
vits
.
data_dir
.
empty
())
{
const
auto
&
meta_data
=
model_
->
GetMetaData
();
if
((
meta_data
.
is_piper
||
meta_data
.
is_coqui
)
&&
!
config_
.
model
.
vits
.
data_dir
.
empty
())
{
frontend_
=
std
::
make_unique
<
PiperPhonemizeLexicon
>
(
mgr
,
config_
.
model
.
vits
.
tokens
,
config_
.
model
.
vits
.
data_dir
);
mgr
,
config_
.
model
.
vits
.
tokens
,
config_
.
model
.
vits
.
data_dir
,
meta_data
);
}
else
{
frontend_
=
std
::
make_unique
<
Lexicon
>
(
mgr
,
config_
.
model
.
vits
.
lexicon
,
config_
.
model
.
vits
.
tokens
,
m
odel_
->
Punctuations
(),
model_
->
Language
()
,
config_
.
model
.
debug
);
m
eta_data
.
punctuations
,
meta_data
.
language
,
config_
.
model
.
debug
);
}
}
#endif
void
InitFrontend
()
{
if
(
model_
->
IsPiper
()
&&
!
config_
.
model
.
vits
.
data_dir
.
empty
())
{
const
auto
&
meta_data
=
model_
->
GetMetaData
();
if
((
meta_data
.
is_piper
||
meta_data
.
is_coqui
)
&&
!
config_
.
model
.
vits
.
data_dir
.
empty
())
{
frontend_
=
std
::
make_unique
<
PiperPhonemizeLexicon
>
(
config_
.
model
.
vits
.
tokens
,
config_
.
model
.
vits
.
data_dir
);
config_
.
model
.
vits
.
tokens
,
config_
.
model
.
vits
.
data_dir
,
model_
->
GetMetaData
());
}
else
{
frontend_
=
std
::
make_unique
<
Lexicon
>
(
config_
.
model
.
vits
.
lexicon
,
config_
.
model
.
vits
.
tokens
,
m
odel_
->
Punctuations
(),
model_
->
Language
()
,
config_
.
model
.
debug
);
m
eta_data
.
punctuations
,
meta_data
.
language
,
config_
.
model
.
debug
);
}
}
...
...
@@ -256,7 +268,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
const
float
*
p
=
audio
.
GetTensorData
<
float
>
();
GeneratedAudio
ans
;
ans
.
sample_rate
=
model_
->
SampleRate
()
;
ans
.
sample_rate
=
model_
->
GetMetaData
().
sample_rate
;
ans
.
samples
=
std
::
vector
<
float
>
(
p
,
p
+
total
);
return
ans
;
}
...
...
sherpa-onnx/csrc/offline-tts-vits-model-config.cc
查看文件 @
23cf92d
...
...
@@ -46,7 +46,8 @@ bool OfflineTtsVitsModelConfig::Validate() const {
if
(
data_dir
.
empty
())
{
if
(
lexicon
.
empty
())
{
SHERPA_ONNX_LOGE
(
"Please provide --vits-lexicon"
);
SHERPA_ONNX_LOGE
(
"Please provide --vits-lexicon if you leave --vits-data-dir empty"
);
return
false
;
}
...
...
sherpa-onnx/csrc/offline-tts-vits-model-metadata.h
0 → 100644
查看文件 @
23cf92d
// sherpa-onnx/csrc/offline-tts-vits-model-metadata.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
#include <cstdint>
#include <string>
namespace
sherpa_onnx
{
// Plain-data description of a VITS TTS model.
//
// Every field has a default so that a default-constructed instance is fully
// initialized and safe to copy or inspect.
struct OfflineTtsVitsModelMetaData {
  // Sample rate of the generated audio. 0 means "not set yet".
  int32_t sample_rate = 0;

  // Non-zero to insert a blank between tokens.
  int32_t add_blank = 0;

  // Number of speakers. 0 is treated as a single-speaker model.
  int32_t num_speakers = 0;

  std::string punctuations;
  std::string language;  // e.g., Chinese, English, German, etc.
  std::string voice;     // e.g., en-us, for espeak-ng

  // Which project the model comes from; both default to false.
  bool is_piper = false;
  bool is_coqui = false;

  // the following options are for models from coqui-ai/TTS
  int32_t blank_id = 0;
  int32_t bos_id = 0;
  int32_t eos_id = 0;
  int32_t use_eos_bos = 0;
};
}
// namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
...
...
sherpa-onnx/csrc/offline-tts-vits-model.cc
查看文件 @
23cf92d
...
...
@@ -38,22 +38,14 @@ class OfflineTtsVitsModel::Impl {
#endif
Ort
::
Value
Run
(
Ort
::
Value
x
,
int64_t
sid
,
float
speed
)
{
if
(
is_piper_
)
{
return
RunVitsPiper
(
std
::
move
(
x
),
sid
,
speed
);
if
(
meta_data_
.
is_piper
||
meta_data_
.
is_coqui
)
{
return
RunVitsPiperOrCoqui
(
std
::
move
(
x
),
sid
,
speed
);
}
return
RunVits
(
std
::
move
(
x
),
sid
,
speed
);
}
int32_t
SampleRate
()
const
{
return
sample_rate_
;
}
bool
AddBlank
()
const
{
return
add_blank_
;
}
std
::
string
Punctuations
()
const
{
return
punctuations_
;
}
std
::
string
Language
()
const
{
return
language_
;
}
std
::
string
Voice
()
const
{
return
voice_
;
}
bool
IsPiper
()
const
{
return
is_piper_
;
}
int32_t
NumSpeakers
()
const
{
return
num_speakers_
;
}
const
OfflineTtsVitsModelMetaData
&
GetMetaData
()
const
{
return
meta_data_
;
}
private
:
void
Init
(
void
*
model_data
,
size_t
model_data_length
)
{
...
...
@@ -70,27 +62,52 @@ class OfflineTtsVitsModel::Impl {
std
::
ostringstream
os
;
os
<<
"---vits model---
\n
"
;
PrintModelMetadata
(
os
,
meta_data
);
os
<<
"----------input names----------
\n
"
;
int32_t
i
=
0
;
for
(
const
auto
&
s
:
input_names_
)
{
os
<<
i
<<
" "
<<
s
<<
"
\n
"
;
++
i
;
}
os
<<
"----------output names----------
\n
"
;
i
=
0
;
for
(
const
auto
&
s
:
output_names_
)
{
os
<<
i
<<
" "
<<
s
<<
"
\n
"
;
++
i
;
}
SHERPA_ONNX_LOGE
(
"%s
\n
"
,
os
.
str
().
c_str
());
}
Ort
::
AllocatorWithDefaultOptions
allocator
;
// used in the macro below
SHERPA_ONNX_READ_META_DATA
(
sample_rate_
,
"sample_rate"
);
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT
(
add_blank_
,
"add_blank"
,
0
);
SHERPA_ONNX_READ_META_DATA
(
num_speakers_
,
"n_speakers"
);
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT
(
punctuations_
,
"punctuation"
,
""
);
SHERPA_ONNX_READ_META_DATA_STR
(
language_
,
"language"
);
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT
(
voice_
,
"voice"
,
""
);
SHERPA_ONNX_READ_META_DATA
(
meta_data_
.
sample_rate
,
"sample_rate"
);
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT
(
meta_data_
.
add_blank
,
"add_blank"
,
0
);
SHERPA_ONNX_READ_META_DATA
(
meta_data_
.
num_speakers
,
"n_speakers"
);
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT
(
meta_data_
.
punctuations
,
"punctuation"
,
""
);
SHERPA_ONNX_READ_META_DATA_STR
(
meta_data_
.
language
,
"language"
);
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT
(
meta_data_
.
voice
,
"voice"
,
""
);
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT
(
meta_data_
.
blank_id
,
"blank_id"
,
0
);
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT
(
meta_data_
.
bos_id
,
"bos_id"
,
0
);
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT
(
meta_data_
.
eos_id
,
"eos_id"
,
0
);
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT
(
meta_data_
.
use_eos_bos
,
"use_eos_bos"
,
0
);
std
::
string
comment
;
SHERPA_ONNX_READ_META_DATA_STR
(
comment
,
"comment"
);
if
(
comment
.
find
(
"piper"
)
!=
std
::
string
::
npos
||
comment
.
find
(
"coqui"
)
!=
std
::
string
::
npos
)
{
is_piper_
=
true
;
if
(
comment
.
find
(
"piper"
)
!=
std
::
string
::
npos
)
{
meta_data_
.
is_piper
=
true
;
}
if
(
comment
.
find
(
"coqui"
)
!=
std
::
string
::
npos
)
{
meta_data_
.
is_coqui
=
true
;
}
}
Ort
::
Value
RunVitsPiper
(
Ort
::
Value
x
,
int64_t
sid
,
float
speed
)
{
Ort
::
Value
RunVitsPiper
OrCoqui
(
Ort
::
Value
x
,
int64_t
sid
,
float
speed
)
{
auto
memory_info
=
Ort
::
MemoryInfo
::
CreateCpu
(
OrtDeviceAllocator
,
OrtMemTypeDefault
);
...
...
@@ -213,14 +230,7 @@ class OfflineTtsVitsModel::Impl {
std
::
vector
<
std
::
string
>
output_names_
;
std
::
vector
<
const
char
*>
output_names_ptr_
;
int32_t
sample_rate_
;
int32_t
add_blank_
;
int32_t
num_speakers_
;
std
::
string
punctuations_
;
std
::
string
language_
;
std
::
string
voice_
;
bool
is_piper_
=
false
;
OfflineTtsVitsModelMetaData
meta_data_
;
};
OfflineTtsVitsModel
::
OfflineTtsVitsModel
(
const
OfflineTtsModelConfig
&
config
)
...
...
@@ -239,21 +249,8 @@ Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/,
return
impl_
->
Run
(
std
::
move
(
x
),
sid
,
speed
);
}
int32_t
OfflineTtsVitsModel
::
SampleRate
()
const
{
return
impl_
->
SampleRate
();
}
bool
OfflineTtsVitsModel
::
AddBlank
()
const
{
return
impl_
->
AddBlank
();
}
std
::
string
OfflineTtsVitsModel
::
Punctuations
()
const
{
return
impl_
->
Punctuations
();
}
std
::
string
OfflineTtsVitsModel
::
Language
()
const
{
return
impl_
->
Language
();
}
std
::
string
OfflineTtsVitsModel
::
Voice
()
const
{
return
impl_
->
Voice
();
}
bool
OfflineTtsVitsModel
::
IsPiper
()
const
{
return
impl_
->
IsPiper
();
}
int32_t
OfflineTtsVitsModel
::
NumSpeakers
()
const
{
return
impl_
->
NumSpeakers
();
const
OfflineTtsVitsModelMetaData
&
OfflineTtsVitsModel
::
GetMetaData
()
const
{
return
impl_
->
GetMetaData
();
}
}
// namespace sherpa_onnx
...
...
sherpa-onnx/csrc/offline-tts-vits-model.h
查看文件 @
23cf92d
...
...
@@ -15,6 +15,7 @@
#include "onnxruntime_cxx_api.h" // NOLINT
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
namespace
sherpa_onnx
{
...
...
@@ -39,17 +40,7 @@ class OfflineTtsVitsModel {
*/
Ort
::
Value
Run
(
Ort
::
Value
x
,
int64_t
sid
=
0
,
float
speed
=
1
.
0
);
// Sample rate of the generated audio
int32_t
SampleRate
()
const
;
// true to insert a blank between each token
bool
AddBlank
()
const
;
std
::
string
Punctuations
()
const
;
std
::
string
Language
()
const
;
// e.g., Chinese, English, German, etc.
std
::
string
Voice
()
const
;
// e.g., en-us, for espeak-ng
bool
IsPiper
()
const
;
int32_t
NumSpeakers
()
const
;
const
OfflineTtsVitsModelMetaData
&
GetMetaData
()
const
;
private
:
class
Impl
;
...
...
sherpa-onnx/csrc/piper-phonemize-lexicon.cc
查看文件 @
23cf92d
...
...
@@ -57,10 +57,17 @@ static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {
s
=
conv
.
from_bytes
(
sym
);
if
(
s
.
size
()
!=
1
)
{
// for tokens.txt from coqui-ai/TTS, the last token is <BLNK>
if
(
s
.
size
()
==
6
&&
s
[
0
]
==
'<'
&&
s
[
1
]
==
'B'
&&
s
[
2
]
==
'L'
&&
s
[
3
]
==
'N'
&&
s
[
4
]
==
'K'
&&
s
[
5
]
==
'>'
)
{
continue
;
}
SHERPA_ONNX_LOGE
(
"Error when reading tokens at Line %s. size: %d"
,
line
.
c_str
(),
static_cast
<
int32_t
>
(
s
.
size
()));
exit
(
-
1
);
}
char32_t
c
=
s
[
0
];
if
(
token2id
.
count
(
c
))
{
...
...
@@ -77,7 +84,7 @@ static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {
// see the function "phonemes_to_ids" from
// https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb
static
std
::
vector
<
int64_t
>
PhonemesToIds
(
static
std
::
vector
<
int64_t
>
P
iperP
honemesToIds
(
const
std
::
unordered_map
<
char32_t
,
int32_t
>
&
token2id
,
const
std
::
vector
<
piper
::
Phoneme
>
&
phonemes
)
{
// see
...
...
@@ -104,6 +111,65 @@ static std::vector<int64_t> PhonemesToIds(
return
ans
;
}
// Convert the phonemes of one sentence into token IDs for a VITS model
// exported from coqui-ai/TTS.
//
// The layout follows the coqui-ai/TTS tokenizer, see
// https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py#L87
//
//   [bos] [blank] p0 [blank] p1 [blank] ... pN [blank] [comma] [eos]
//
// where bos/eos are emitted only if meta_data.use_eos_bos is non-zero and the
// blanks only if meta_data.add_blank is non-zero.
//
// @param token2id   Map from a unicode codepoint to its token ID.
// @param phonemes   Phonemes of a single sentence.
// @param meta_data  Supplies use_eos_bos, bos_id, eos_id, blank_id, add_blank.
// @return The token IDs. Phonemes missing from token2id are skipped with an
//         error log. If token2id has no entry for ',', the trailing comma is
//         omitted (and an error is logged) instead of throwing.
static std::vector<int64_t> CoquiPhonemesToIds(
    const std::unordered_map<char32_t, int32_t> &token2id,
    const std::vector<piper::Phoneme> &phonemes,
    const OfflineTtsVitsModelMetaData &meta_data) {
  const bool use_eos_bos = meta_data.use_eos_bos != 0;
  const bool add_blank = meta_data.add_blank != 0;
  const int32_t blank_id = meta_data.blank_id;

  std::vector<int64_t> ans;

  // Upper bound: bos + leading blank + (phoneme [+ blank]) per phoneme +
  // trailing comma + eos.
  ans.reserve(phonemes.size() * (add_blank ? 2 : 1) + 4);

  if (use_eos_bos) {
    ans.push_back(meta_data.bos_id);
  }

  if (add_blank) {
    ans.push_back(blank_id);
  }

  for (auto p : phonemes) {
    auto it = token2id.find(p);
    if (it == token2id.end()) {
      SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.",
                       static_cast<uint32_t>(p));
      continue;
    }

    ans.push_back(it->second);
    if (add_blank) {
      ans.push_back(blank_id);
    }
  }

  // add a comma at the end of a sentence so that we can have a longer pause.
  auto comma = token2id.find(U',');
  if (comma != token2id.end()) {
    ans.push_back(comma->second);
  } else {
    SHERPA_ONNX_LOGE(
        "No token ID for ','. Skip appending a comma at the end of the "
        "sentence.");
  }

  if (use_eos_bos) {
    ans.push_back(meta_data.eos_id);
  }

  return ans;
}
void
InitEspeak
(
const
std
::
string
&
data_dir
)
{
static
std
::
once_flag
init_flag
;
std
::
call_once
(
init_flag
,
[
data_dir
]()
{
...
...
@@ -119,21 +185,23 @@ void InitEspeak(const std::string &data_dir) {
});
}
PiperPhonemizeLexicon
::
PiperPhonemizeLexicon
(
const
std
::
string
&
tokens
,
const
std
::
string
&
data_dir
)
:
data_dir_
(
data_dir
)
{
PiperPhonemizeLexicon
::
PiperPhonemizeLexicon
(
const
std
::
string
&
tokens
,
const
std
::
string
&
data_dir
,
const
OfflineTtsVitsModelMetaData
&
meta_data
)
:
meta_data_
(
meta_data
)
{
{
std
::
ifstream
is
(
tokens
);
token2id_
=
ReadTokens
(
is
);
}
InitEspeak
(
data_dir
_
);
InitEspeak
(
data_dir
);
}
#if __ANDROID_API__ >= 9
PiperPhonemizeLexicon
::
PiperPhonemizeLexicon
(
AAssetManager
*
mgr
,
const
std
::
string
&
tokens
,
const
std
::
string
&
data_dir
)
{
PiperPhonemizeLexicon
::
PiperPhonemizeLexicon
(
AAssetManager
*
mgr
,
const
std
::
string
&
tokens
,
const
std
::
string
&
data_dir
,
const
OfflineTtsVitsModelMetaData
&
meta_data
)
:
meta_data_
(
meta_data
)
{
{
auto
buf
=
ReadFile
(
mgr
,
tokens
);
std
::
istrstream
is
(
buf
.
data
(),
buf
.
size
());
...
...
@@ -141,8 +209,9 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(AAssetManager *mgr,
}
// We should copy the directory of espeak-ng-data from the asset to
// some internal or external storage and then pass the directory to data_dir.
InitEspeak
(
data_dir_
);
// some internal or external storage and then pass the directory to
// data_dir.
InitEspeak
(
data_dir
);
}
#endif
...
...
@@ -160,9 +229,21 @@ std::vector<std::vector<int64_t>> PiperPhonemizeLexicon::ConvertTextToTokenIds(
std
::
vector
<
std
::
vector
<
int64_t
>>
ans
;
std
::
vector
<
int64_t
>
phoneme_ids
;
for
(
const
auto
&
p
:
phonemes
)
{
phoneme_ids
=
PhonemesToIds
(
token2id_
,
p
);
ans
.
push_back
(
std
::
move
(
phoneme_ids
));
if
(
meta_data_
.
is_piper
)
{
for
(
const
auto
&
p
:
phonemes
)
{
phoneme_ids
=
PiperPhonemesToIds
(
token2id_
,
p
);
ans
.
push_back
(
std
::
move
(
phoneme_ids
));
}
}
else
if
(
meta_data_
.
is_coqui
)
{
for
(
const
auto
&
p
:
phonemes
)
{
phoneme_ids
=
CoquiPhonemesToIds
(
token2id_
,
p
,
meta_data_
);
ans
.
push_back
(
std
::
move
(
phoneme_ids
));
}
}
else
{
SHERPA_ONNX_LOGE
(
"Unsupported model"
);
exit
(
-
1
);
}
return
ans
;
...
...
sherpa-onnx/csrc/piper-phonemize-lexicon.h
查看文件 @
23cf92d
...
...
@@ -15,25 +15,28 @@
#endif
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
namespace
sherpa_onnx
{
class
PiperPhonemizeLexicon
:
public
OfflineTtsFrontend
{
public
:
PiperPhonemizeLexicon
(
const
std
::
string
&
tokens
,
const
std
::
string
&
data_dir
);
PiperPhonemizeLexicon
(
const
std
::
string
&
tokens
,
const
std
::
string
&
data_dir
,
const
OfflineTtsVitsModelMetaData
&
meta_data
);
#if __ANDROID_API__ >= 9
PiperPhonemizeLexicon
(
AAssetManager
*
mgr
,
const
std
::
string
&
tokens
,
const
std
::
string
&
data_dir
);
const
std
::
string
&
data_dir
,
const
OfflineTtsVitsModelMetaData
&
meta_data
);
#endif
std
::
vector
<
std
::
vector
<
int64_t
>>
ConvertTextToTokenIds
(
const
std
::
string
&
text
,
const
std
::
string
&
voice
=
""
)
const
override
;
private
:
std
::
string
data_dir_
;
// map unicode codepoint to an integer ID
std
::
unordered_map
<
char32_t
,
int32_t
>
token2id_
;
OfflineTtsVitsModelMetaData
meta_data_
;
};
}
// namespace sherpa_onnx
...
...
请
注册
或
登录
后发表评论