Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2024-12-31 16:06:27 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2024-12-31 16:06:27 +0800
Commit
ebe92e523d92319163b73655c17f80e1f6d1e813
ebe92e52
1 parent
d3538531
Remove spaces after punctuations for TTS (#1666)
隐藏空白字符变更
内嵌
并排对比
正在显示
1 个修改的文件
包含
73 行增加
和
3 行删除
sherpa-onnx/csrc/jieba-lexicon.cc
sherpa-onnx/csrc/jieba-lexicon.cc
查看文件 @
ebe92e5
...
...
@@ -6,6 +6,7 @@
#include <fstream>
#include <regex> // NOLINT
#include <unordered_set>
#include <utility>
#include "cppjieba/Jieba.hpp"
...
...
@@ -16,6 +17,14 @@
namespace
sherpa_onnx
{
// Returns true if `s` is exactly one of the punctuation tokens listed below.
//
// The lookup is a whole-string match: `s` must equal an entry exactly, so an
// empty string, a space, or a string made of several punctuation marks all
// return false.
//
// @param s  A single token (e.g. one element of a segmented word list).
// @return   true if `s` is a recognized ASCII or CJK punctuation token.
static bool IsPunct(const std::string &s) {
  static const std::unordered_set<std::string> puncts = {
      // ASCII punctuation
      ",",
      ".",
      "!",
      "?",
      ":",
      "\"",
      "'",
      // Full-width counterparts of ',', '!' and '?'. They are spelled as
      // UTF-8 byte escapes so they cannot be confused with (or silently
      // collapsed into) the ASCII entries above.
      "\xEF\xBC\x8C",  // U+FF0C FULLWIDTH COMMA
      "\xEF\xBC\x81",  // U+FF01 FULLWIDTH EXCLAMATION MARK
      "\xEF\xBC\x9F",  // U+FF1F FULLWIDTH QUESTION MARK
      // Other CJK punctuation
      "。",
      "“",
      "”",
      "‘",
      "’",
  };

  // count() yields 0 or 1 for a set; make the conversion to bool explicit.
  return puncts.count(s) != 0;
}
class
JiebaLexicon
::
Impl
{
public
:
Impl
(
const
std
::
string
&
lexicon
,
const
std
::
string
&
tokens
,
...
...
@@ -67,8 +76,13 @@ class JiebaLexicon::Impl {
jieba_
->
Cut
(
text
,
words
,
is_hmm
);
if
(
debug_
)
{
SHERPA_ONNX_LOGE
(
"input text: %s"
,
text
.
c_str
());
SHERPA_ONNX_LOGE
(
"after replacing punctuations: %s"
,
s
.
c_str
());
#if __OHOS__
SHERPA_ONNX_LOGE
(
"input text:
\n
%{public}s"
,
text
.
c_str
());
SHERPA_ONNX_LOGE
(
"after replacing punctuations:
\n
%{public}s"
,
s
.
c_str
());
#else
SHERPA_ONNX_LOGE
(
"input text:
\n
%s"
,
text
.
c_str
());
SHERPA_ONNX_LOGE
(
"after replacing punctuations:
\n
%s"
,
s
.
c_str
());
#endif
std
::
ostringstream
os
;
std
::
string
sep
=
""
;
...
...
@@ -77,7 +91,52 @@ class JiebaLexicon::Impl {
sep
=
"_"
;
}
SHERPA_ONNX_LOGE
(
"after jieba processing: %s"
,
os
.
str
().
c_str
());
#if __OHOS__
SHERPA_ONNX_LOGE
(
"after jieba processing:
\n
%{public}s"
,
os
.
str
().
c_str
());
#else
SHERPA_ONNX_LOGE
(
"after jieba processing:
\n
%s"
,
os
.
str
().
c_str
());
#endif
}
// remove spaces after punctuations
std
::
vector
<
std
::
string
>
words2
=
std
::
move
(
words
);
words
.
reserve
(
words2
.
size
());
for
(
int32_t
i
=
0
;
i
<
words2
.
size
();
++
i
)
{
if
(
i
==
0
)
{
words
.
push_back
(
std
::
move
(
words2
[
i
]));
}
else
if
(
words2
[
i
]
==
" "
)
{
if
(
words
.
back
()
==
" "
||
IsPunct
(
words
.
back
()))
{
continue
;
}
else
{
words
.
push_back
(
std
::
move
(
words2
[
i
]));
}
}
else
if
(
IsPunct
(
words2
[
i
]))
{
if
(
words
.
back
()
==
" "
||
IsPunct
(
words
.
back
()))
{
continue
;
}
else
{
words
.
push_back
(
std
::
move
(
words2
[
i
]));
}
}
else
{
words
.
push_back
(
std
::
move
(
words2
[
i
]));
}
}
if
(
debug_
)
{
std
::
ostringstream
os
;
std
::
string
sep
=
""
;
for
(
const
auto
&
w
:
words
)
{
os
<<
sep
<<
w
;
sep
=
"_"
;
}
#if __OHOS__
SHERPA_ONNX_LOGE
(
"after removing spaces after punctuations:
\n
%{public}s"
,
os
.
str
().
c_str
());
#else
SHERPA_ONNX_LOGE
(
"after removing spaces after punctuations:
\n
%s"
,
os
.
str
().
c_str
());
#endif
}
std
::
vector
<
TokenIDs
>
ans
;
...
...
@@ -86,7 +145,11 @@ class JiebaLexicon::Impl {
for
(
const
auto
&
w
:
words
)
{
auto
ids
=
ConvertWordToIds
(
w
);
if
(
ids
.
empty
())
{
#if __OHOS__
SHERPA_ONNX_LOGE
(
"Ignore OOV '%{public}s'"
,
w
.
c_str
());
#else
SHERPA_ONNX_LOGE
(
"Ignore OOV '%s'"
,
w
.
c_str
());
#endif
continue
;
}
...
...
@@ -173,8 +236,15 @@ class JiebaLexicon::Impl {
ToLowerCase
(
&
word
);
if
(
word2ids_
.
count
(
word
))
{
#if __OHOS__
SHERPA_ONNX_LOGE
(
"Duplicated word: %{public}s at line %{public}d:%{public}s. Ignore "
"it."
,
word
.
c_str
(),
line_num
,
line
.
c_str
());
#else
SHERPA_ONNX_LOGE
(
"Duplicated word: %s at line %d:%s. Ignore it."
,
word
.
c_str
(),
line_num
,
line
.
c_str
());
#endif
continue
;
}
...
...
请
注册
或
登录
后发表评论