Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2023-11-05 13:06:00 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2023-11-05 13:06:00 +0800
Commit
723e5265bb89c2d520f507bf842c270f1fb5a905
723e5265
1 parent
606cb26a
Support Chinese polyphones in TTS (#409)
隐藏空白字符变更
内嵌
并排对比
正在显示
2 个修改的文件
包含
49 行增加
和
1 行删除
sherpa-onnx/csrc/lexicon.cc
sherpa-onnx/csrc/lexicon.h
sherpa-onnx/csrc/lexicon.cc
查看文件 @
723e526
...
...
@@ -17,6 +17,8 @@
#include "android/asset_manager_jni.h"
#endif
#include <regex>
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/text-utils.h"
...
...
@@ -147,7 +149,36 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
std
::
vector
<
int64_t
>
Lexicon
::
ConvertTextToTokenIdsChinese
(
const
std
::
string
&
text
)
const
{
std
::
vector
<
std
::
string
>
words
=
SplitUtf8
(
text
);
std
::
vector
<
std
::
string
>
words
;
if
(
pattern_
)
{
// Handle polyphones
size_t
pos
=
0
;
auto
begin
=
std
::
sregex_iterator
(
text
.
begin
(),
text
.
end
(),
*
pattern_
);
auto
end
=
std
::
sregex_iterator
();
for
(
std
::
sregex_iterator
i
=
begin
;
i
!=
end
;
++
i
)
{
std
::
smatch
match
=
*
i
;
if
(
pos
<
match
.
position
())
{
auto
this_segment
=
text
.
substr
(
pos
,
match
.
position
()
-
pos
);
auto
this_segment_words
=
SplitUtf8
(
this_segment
);
words
.
insert
(
words
.
end
(),
this_segment_words
.
begin
(),
this_segment_words
.
end
());
pos
=
match
.
position
()
+
match
.
length
();
}
else
if
(
pos
==
match
.
position
())
{
pos
=
match
.
position
()
+
match
.
length
();
}
words
.
push_back
(
match
.
str
());
}
if
(
pos
<
text
.
size
())
{
auto
this_segment
=
text
.
substr
(
pos
,
text
.
size
()
-
pos
);
auto
this_segment_words
=
SplitUtf8
(
this_segment
);
words
.
insert
(
words
.
end
(),
this_segment_words
.
begin
(),
this_segment_words
.
end
());
}
}
else
{
words
=
SplitUtf8
(
text
);
}
if
(
debug_
)
{
fprintf
(
stderr
,
"Input text in string: %s
\n
"
,
text
.
c_str
());
...
...
@@ -272,6 +303,9 @@ void Lexicon::InitLexicon(std::istream &is) {
std
::
string
line
;
std
::
string
phone
;
std
::
ostringstream
os
;
std
::
string
sep
;
while
(
std
::
getline
(
is
,
line
))
{
std
::
istringstream
iss
(
line
);
...
...
@@ -293,8 +327,18 @@ void Lexicon::InitLexicon(std::istream &is) {
if
(
ids
.
empty
())
{
continue
;
}
if
(
language_
==
Language
::
kChinese
&&
word
.
size
()
>
3
)
{
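// The entry is longer than 3 bytes, i.e. presumably more than one UTF-8 encoded Chinese character (a multi-character word, not a single character); add it to the '|'-separated alternation used to build pattern_.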
os
<<
sep
<<
word
;
sep
=
"|"
;
}
word2ids_
.
insert
({
std
::
move
(
word
),
std
::
move
(
ids
)});
}
if
(
!
sep
.
empty
())
{
pattern_
=
std
::
make_unique
<
std
::
regex
>
(
os
.
str
());
}
}
void
Lexicon
::
InitPunctuations
(
const
std
::
string
&
punctuations
)
{
...
...
sherpa-onnx/csrc/lexicon.h
查看文件 @
723e526
...
...
@@ -7,6 +7,7 @@
#include <cstdint>
#include <iostream>
#include <regex>
#include <string>
#include <unordered_map>
#include <unordered_set>
...
...
@@ -79,6 +80,9 @@ class Lexicon {
Language
language_
;
bool
debug_
;
bool
is_piper_
;
// for Chinese polyphones
std
::
unique_ptr
<
std
::
regex
>
pattern_
;
};
}
// namespace sherpa_onnx
...
...
请
注册
或
登录
后发表评论