Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2025-02-13 18:19:34 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2025-02-13 18:19:34 +0800
Commit
944400e3990f650f1be707e5af9a7f6a72b3b4ef
944400e3
1 parent
115e9c22
Fix splitting text by languages for kokoro tts. (#1849)
隐藏空白字符变更
内嵌
并排对比
正在显示
7 个修改的文件
包含
203 行增加
和
35 行删除
sherpa-onnx/c-api/cxx-api.cc
sherpa-onnx/csrc/CMakeLists.txt
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
sherpa-onnx/csrc/regex-lang-test.cc
sherpa-onnx/csrc/text-utils-test.cc
sherpa-onnx/csrc/text-utils.cc
sherpa-onnx/csrc/text-utils.h
sherpa-onnx/c-api/cxx-api.cc
查看文件 @
944400e
...
...
@@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const {
return
OfflineStream
{
s
};
}
// Creates a new offline stream that is configured with the given hotwords.
//
// @param hotwords  Hotwords string; it is handed to the C API as a
//                  NUL-terminated C string via c_str().
// @return An OfflineStream wrapping the handle returned by
//         SherpaOnnxCreateOfflineStreamWithHotwords().
//
// NOTE(review): p_ is presumably the underlying C recognizer handle held by
// this wrapper -- confirm against the class definition.
OfflineStream OfflineRecognizer::CreateStream(
    const std::string &hotwords) const {
  auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str());
  return OfflineStream{s};
}
...
...
sherpa-onnx/csrc/CMakeLists.txt
查看文件 @
944400e
...
...
@@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
context-graph-test.cc
packed-sequence-test.cc
pad-sequence-test.cc
regex-lang-test.cc
slice-test.cc
stack-test.cc
text-utils-test.cc
...
...
sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
查看文件 @
944400e
...
...
@@ -4,9 +4,7 @@
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
#include <codecvt>
#include <fstream>
#include <locale>
#include <regex> // NOLINT
#include <sstream>
#include <strstream>
...
...
@@ -22,6 +20,8 @@
#include "rawfile/raw_file_manager.h"
#endif
#include <codecvt>
#include "cppjieba/Jieba.hpp"
#include "espeak-ng/speak_lib.h"
#include "phoneme_ids.hpp"
...
...
@@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text,
piper
::
eSpeakPhonemeConfig
&
config
,
// NOLINT
std
::
vector
<
std
::
vector
<
piper
::
Phoneme
>>
*
phonemes
);
// Decodes the UTF-8 bytes in `s` into a std::wstring using the standard
// codecvt_utf8_utf16 facet.
static std::wstring ToWideString(const std::string &s) {
  // see
  // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8_decoder;
  std::wstring wide = utf8_decoder.from_bytes(s);
  return wide;
}
// Encodes the wide string `s` as UTF-8 bytes using the standard
// codecvt_utf8_utf16 facet.
static std::string ToString(const std::wstring &s) {
  // see
  // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8_encoder;
  std::string narrow = utf8_encoder.to_bytes(s);
  return narrow;
}
class
KokoroMultiLangLexicon
::
Impl
{
public
:
Impl
(
const
std
::
string
&
tokens
,
const
std
::
string
&
lexicon
,
...
...
@@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl {
// https://en.cppreference.com/w/cpp/regex
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
std
::
string
expr
=
"([;:,.?!'
\"
…
\\
(
\\
)“”])|([
\\
u4e00-
\\
u9fff]+)|([äöüßÄÖÜ
\\
u0000-
\\
u007f]+"
")"
;
std
::
string
expr_chinese
=
"([
\\
u4e00-
\\
u9fff]+)"
;
std
::
string
expr_not_chinese
=
"([^
\\
u4e00-
\\
u9fff]+)"
;
std
::
string
expr_both
=
expr_chinese
+
"|"
+
expr_not_chinese
;
auto
ws
=
ToWideString
(
text
);
std
::
wstring
wexpr
=
ToWideString
(
expr
);
std
::
wregex
we
(
wexpr
);
std
::
wstring
wexpr_both
=
ToWideString
(
expr_both
);
std
::
wregex
we_both
(
wexpr_both
);
std
::
wstring
wexpr_zh
=
ToWideString
(
expr_chinese
);
std
::
wregex
we_zh
(
wexpr_zh
);
auto
begin
=
std
::
wsregex_iterator
(
ws
.
begin
(),
ws
.
end
(),
we
);
auto
begin
=
std
::
wsregex_iterator
(
ws
.
begin
(),
ws
.
end
(),
we
_both
);
auto
end
=
std
::
wsregex_iterator
();
std
::
vector
<
TokenIDs
>
ans
;
...
...
@@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl {
for
(
std
::
wsregex_iterator
i
=
begin
;
i
!=
end
;
++
i
)
{
std
::
wsmatch
match
=
*
i
;
std
::
wstring
match_str
=
match
.
str
();
auto
ms
=
ToString
(
match_str
);
uint8_t
c
=
reinterpret_cast
<
const
uint8_t
*>
(
ms
.
data
())[
0
];
std
::
vector
<
std
::
vector
<
int32_t
>>
ids_vec
;
if
(
c
<
0x80
)
{
if
(
std
::
regex_match
(
match_str
,
we_zh
))
{
if
(
debug_
)
{
SHERPA_ONNX_LOGE
(
"
Non-
Chinese: %s"
,
ms
.
c_str
());
SHERPA_ONNX_LOGE
(
"Chinese: %s"
,
ms
.
c_str
());
}
ids_vec
=
Convert
EnglishToTokenIDs
(
ms
,
meta_data_
.
voice
);
ids_vec
=
Convert
ChineseToTokenIDs
(
ms
);
}
else
{
if
(
debug_
)
{
SHERPA_ONNX_LOGE
(
"Chinese: %s"
,
ms
.
c_str
());
SHERPA_ONNX_LOGE
(
"
Non-
Chinese: %s"
,
ms
.
c_str
());
}
ids_vec
=
ConvertChineseToTokenIDs
(
ms
);
ids_vec
=
ConvertEnglishToTokenIDs
(
ms
,
meta_data_
.
voice
);
}
for
(
const
auto
&
ids
:
ids_vec
)
{
...
...
@@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl {
this_sentence
.
push_back
(
space_id
);
}
else
{
if
(
debug_
)
{
SHERPA_ONNX_LOGE
(
"Use espeak-ng to handle the OOV: '%s'"
,
word
.
c_str
());
SHERPA_ONNX_LOGE
(
"Use espeak-ng to handle the OOV: '%s'"
,
word
.
c_str
());
}
piper
::
eSpeakPhonemeConfig
config
;
config
.
voice
=
voice
;
...
...
sherpa-onnx/csrc/regex-lang-test.cc
0 → 100644
查看文件 @
944400e
// sherpa-onnx/csrc/regex-lang-test.cc
//
// Copyright (c) 2025 Xiaomi Corporation
#include <regex> // NOLINT
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/text-utils.cc"
namespace
sherpa_onnx
{
static
void
TestLang
(
const
std
::
string
&
expr
,
const
std
::
string
&
text
,
const
std
::
vector
<
std
::
string
>
&
expected
)
{
auto
ws
=
ToWideString
(
text
);
std
::
wstring
wexpr
=
ToWideString
(
expr
);
std
::
wregex
we
(
wexpr
);
auto
begin
=
std
::
wsregex_iterator
(
ws
.
begin
(),
ws
.
end
(),
we
);
auto
end
=
std
::
wsregex_iterator
();
int32_t
k
=
0
;
for
(
std
::
wsregex_iterator
i
=
begin
;
i
!=
end
;
++
i
)
{
std
::
wsmatch
match
=
*
i
;
std
::
wstring
match_str
=
match
.
str
();
auto
ms
=
ToString
(
match_str
);
std
::
cout
<<
ms
<<
"
\n
"
;
EXPECT_EQ
(
ms
,
expected
[
k
]);
k
++
;
}
EXPECT_EQ
(
k
,
expected
.
size
());
}
// Checks that a character class listing German code points extracts the
// German runs from a text that also contains CJK characters.
TEST(German, Case1) {
  std::cout << "----------Test German----------";

  // see https://character-table.netlify.app/german/
  // The pattern is a single capture group holding one character class; the
  // class is spelled with escaped code points and code-point ranges and is
  // split over several adjacent string literals.
  std::string expr =
      "([\\u0020-\\u005f\\u0061-"
      "\\u007d\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb\\u00c4\\u00d6\\u00dc\\u00df\\"
      "u00e4\\u00f6\\u00fc\\u2010-\\u2011\\u2013-"
      "\\u2014\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)";

  // The CJK words in the text are not part of the class, so they act as
  // separators between the expected matches.
  std::string text =
      "开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€";

  // Note the trailing space in the first entry: the space before the CJK
  // separator belongs to the match.
  std::vector<std::string> expected = {"Übeltäter übergibt Ärzten ",
                                       "öfters äußerst ätzende Öle", "3€"};

  TestLang(expr, text, expected);
}
// Checks that a character class listing French code points extracts the
// French runs from a text that also contains CJK characters.
TEST(French, Case1) {
  // The pattern is a single capture group holding one character class; the
  // class is spelled with escaped code points and code-point ranges and is
  // split over several adjacent string literals.
  std::string expr =
      "([\\u0020-\\u005f\\u0061-"
      "\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-"
      "\\u00b3\\u00bb\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-"
      "\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc\\u00e0\\u00e2\\u00e6-"
      "\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff\\u0152-"
      "\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49\\u2010-\\u2011\\u2013-"
      "\\u2014\\u2019\\u201c-\\u201d\\u2020-\\u2021\\u2026\\u202f-"
      "\\u2030\\u20ac\\u2212]+)";

  // The single CJK characters in the text are not part of the class, so they
  // act as separators between the expected matches.
  std::string text =
      "L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon";

  // Trailing ", " sequences before each separator belong to the match.
  std::vector<std::string> expected = {
      "L'été, ",
      "avec son ciel bleuâtre, ",
      "est un moment où, ",
      "Noël, maçon",
  };

  TestLang(expr, text, expected);
}
// Checks that a character class listing English code points extracts the
// English runs from a text that also contains CJK characters.
TEST(English, Case1) {
  // https://character-table.netlify.app/english/
  // The pattern is a single capture group holding one character class; the
  // class is spelled with escaped code points and code-point ranges and is
  // split over several adjacent string literals.
  std::string expr =
      "([\\u0020-\\u005f\\u0061-\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u2010-"
      "\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d\\u2020-"
      "\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)";

  // The single CJK characters in the text are not part of the class, so they
  // act as separators between the expected matches.
  std::string text = "一how are you doing? 二Thank you!";

  // The trailing space of the first sentence belongs to the first match.
  std::vector<std::string> expected = {
      "how are you doing? ",
      "Thank you!",
  };

  TestLang(expr, text, expected);
}
}
// namespace sherpa_onnx
...
...
sherpa-onnx/csrc/text-utils-test.cc
查看文件 @
944400e
...
...
@@ -8,6 +8,14 @@
namespace
sherpa_onnx
{
// Smoke test: lower-cases a German sentence containing umlauts and prints
// the input followed by the converted text. No value is asserted here.
TEST(ToLowerCase, WideString) {
  const std::string text =
      "Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€";
  const std::string t = ToLowerCase(text);

  std::cout << text << "\n" << t << "\n";
}
TEST
(
RemoveInvalidUtf8Sequences
,
Case1
)
{
std
::
vector
<
uint8_t
>
v
=
{
0xe4
,
0xbb
,
0x8a
,
// 今
...
...
sherpa-onnx/csrc/text-utils.cc
查看文件 @
944400e
...
...
@@ -8,8 +8,11 @@
#include <algorithm>
#include <cassert>
#include <cctype>
#include <codecvt>
#include <cstdint>
#include <cwctype>
#include <limits>
#include <locale>
#include <sstream>
#include <string>
#include <unordered_map>
...
...
@@ -389,10 +392,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
}
// Returns a lower-cased copy of the UTF-8 string `s`.
//
// The text is converted to a wide string, lower-cased by the std::wstring
// overload of ToLowerCase, and converted back. The previous byte-wise
// std::tolower pass has been dropped: it ran first and returned, so the
// wide-string path below was never reached.
//
// @param s  Input text.
// @return   The lower-cased text.
std::string ToLowerCase(const std::string &s) {
  return ToString(ToLowerCase(ToWideString(s)));
}
void
ToLowerCase
(
std
::
string
*
in_out
)
{
...
...
@@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) {
[](
unsigned
char
c
)
{
return
std
::
tolower
(
c
);
});
}
// Returns a lower-cased copy of the wide string `s`.
//
// Accented capitals are handled explicitly rather than left to
// std::towlower:
//   - U+00C0..U+00DE except U+00D7 (the multiplication sign): the lower-case
//     form is the code point + 0x20. This covers every French/German/Spanish
//     letter the previous hand-written table listed, plus the ones it missed
//     (for example U+00CA, U+00C3, U+00C5, U+00D5, U+00D8, U+00DD).
//   - U+0152 -> U+0153 and U+0178 -> U+00FF, which lie outside that range.
// Everything else is delegated to std::towlower.
//
// Numeric code points are used instead of wide character literals so the
// result does not depend on the encoding of this source file.
//
// @param s  Input wide string.
// @return   A string of the same length with each character lower-cased.
std::wstring ToLowerCase(const std::wstring &s) {
  std::wstring ans(s.size(), 0);

  std::transform(s.begin(), s.end(), ans.begin(), [](wchar_t c) -> wchar_t {
    // Latin-1 Supplement capitals; U+00D7 has no case and is left untouched.
    if (c >= 0x00C0 && c <= 0x00DE && c != 0x00D7) {
      return static_cast<wchar_t>(c + 0x20);
    }

    switch (c) {
      case 0x0152:  // LATIN CAPITAL LIGATURE OE
        return static_cast<wchar_t>(0x0153);
      case 0x0178:  // LATIN CAPITAL LETTER Y WITH DIAERESIS
        return static_cast<wchar_t>(0x00FF);
      // TODO(fangjun): Add more
      default:
        return std::towlower(c);
    }
  });

  return ans;
}
// Returns true when `x` lies in the closed interval [low, high].
static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
  const bool below_low = x < low;
  const bool above_high = x > high;
  return !(below_low || above_high);
}
...
...
@@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) {
}
#endif
// Decodes the UTF-8 bytes in `s` into a std::wstring using the standard
// codecvt_utf8_utf16 facet.
std::wstring ToWideString(const std::string &s) {
  // see
  // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8_decoder;
  std::wstring wide = utf8_decoder.from_bytes(s);
  return wide;
}
// Encodes the wide string `s` as UTF-8 bytes using the standard
// codecvt_utf8_utf16 facet.
std::string ToString(const std::wstring &s) {
  // see
  // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
  using Utf8Facet = std::codecvt_utf8_utf16<wchar_t>;
  std::wstring_convert<Utf8Facet> utf8_encoder;
  std::string narrow = utf8_encoder.to_bytes(s);
  return narrow;
}
}
// namespace sherpa_onnx
...
...
sherpa-onnx/csrc/text-utils.h
查看文件 @
944400e
...
...
@@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text);
std
::
string
ToLowerCase
(
const
std
::
string
&
s
);
void
ToLowerCase
(
std
::
string
*
in_out
);
// Returns a lower-cased copy of the wide string s. The result has the same
// length as s; each character is converted individually.
std::wstring ToLowerCase(const std::wstring &s);
std
::
string
RemoveInvalidUtf8Sequences
(
const
std
::
string
&
text
,
bool
show_debug_msg
=
false
);
...
...
@@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text);
std
::
string
Gb2312ToUtf8
(
const
std
::
string
&
text
);
#endif
// Converts the UTF-8 encoded string s to a wide string.
// The conversion is done with std::wstring_convert and the
// std::codecvt_utf8_utf16<wchar_t> facet.
std::wstring ToWideString(const std::string &s);
// Converts the wide string s to a UTF-8 encoded string.
// The conversion is done with std::wstring_convert and the
// std::codecvt_utf8_utf16<wchar_t> facet.
std::string ToString(const std::wstring &s);
}
// namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
...
...
请
注册
或
登录
后发表评论