Fangjun Kuang
Committed by GitHub

Support extra languages in multi-lang kokoro tts (#2303)

... ... @@ -35,18 +35,18 @@ jobs:
matrix:
# See https://github.com/actions/runner-images
include:
- os: ubuntu-22.04
python-version: "3.7"
- os: ubuntu-22.04
- os: ubuntu-latest
python-version: "3.8"
- os: ubuntu-22.04
- os: ubuntu-latest
python-version: "3.9"
- os: ubuntu-22.04
- os: ubuntu-latest
python-version: "3.10"
- os: ubuntu-22.04
- os: ubuntu-latest
python-version: "3.11"
- os: ubuntu-22.04
- os: ubuntu-latest
python-version: "3.12"
- os: ubuntu-latest
python-version: "3.13"
- os: macos-13
python-version: "3.8"
... ... @@ -103,7 +103,7 @@ jobs:
export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
cmake --version
export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j"
export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j2"
python3 setup.py bdist_wheel
ls -lh dist
... ...
### Supported functions
|Speech recognition| Speech synthesis | Source separation |
|Speech recognition| [Speech synthesis][tts-url] | [Source separation][ss-url] |
|------------------|------------------|-------------------|
| ✔️ | ✔️ | ✔️ |
|Speaker identification| Speaker diarization | Speaker verification |
|Speaker identification| [Speaker diarization][sd-url] | Speaker verification |
|----------------------|-------------------- |------------------------|
| ✔️ | ✔️ | ✔️ |
| Spoken Language identification | Audio tagging | Voice activity detection |
| [Spoken Language identification][slid-url] | [Audio tagging][at-url] | [Voice activity detection][vad-url] |
|--------------------------------|---------------|--------------------------|
| ✔️ | ✔️ | ✔️ |
| Keyword spotting | Add punctuation | Speech enhancement |
| [Keyword spotting][kws-url] | [Add punctuation][punct-url] | [Speech enhancement][se-url] |
|------------------|-----------------|--------------------|
| ✔️ | ✔️ | ✔️ |
... ... @@ -501,3 +501,12 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
[spleeter]: https://github.com/deezer/spleeter
[UVR]: https://github.com/Anjok07/ultimatevocalremovergui
[gtcrn]: https://github.com/Xiaobin-Rong/gtcrn
[tts-url]: https://k2-fsa.github.io/sherpa/onnx/tts/all-in-one.html
[ss-url]: https://k2-fsa.github.io/sherpa/onnx/source-separation/index.html
[sd-url]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/index.html
[slid-url]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/index.html
[at-url]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/index.html
[vad-url]: https://k2-fsa.github.io/sherpa/onnx/vad/index.html
[kws-url]: https://k2-fsa.github.io/sherpa/onnx/kws/index.html
[punct-url]: https://k2-fsa.github.io/sherpa/onnx/punctuation/index.html
[se-url]: https://k2-fsa.github.io/sherpa/onnx/speech-enhancment/index.html
... ...
... ... @@ -201,6 +201,7 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
external double lengthScale;
external Pointer<Utf8> dictDir;
external Pointer<Utf8> lexicon;
external Pointer<Utf8> lang;
}
final class SherpaOnnxOfflineTtsModelConfig extends Struct {
... ...
... ... @@ -117,6 +117,7 @@ class OfflineTtsKokoroModelConfig {
this.lengthScale = 1.0,
this.dictDir = '',
this.lexicon = '',
this.lang = '',
});
factory OfflineTtsKokoroModelConfig.fromJson(Map<String, dynamic> json) {
... ... @@ -128,12 +129,13 @@ class OfflineTtsKokoroModelConfig {
lengthScale: (json['lengthScale'] as num?)?.toDouble() ?? 1.0,
dictDir: json['dictDir'] as String? ?? '',
lexicon: json['lexicon'] as String? ?? '',
lang: json['lang'] as String? ?? '',
);
}
@override
String toString() {
return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon)';
return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon, lang: $lang)';
}
Map<String, dynamic> toJson() => {
... ... @@ -144,6 +146,7 @@ class OfflineTtsKokoroModelConfig {
'lengthScale': lengthScale,
'dictDir': dictDir,
'lexicon': lexicon,
'lang': lang,
};
final String model;
... ... @@ -153,6 +156,7 @@ class OfflineTtsKokoroModelConfig {
final double lengthScale;
final String dictDir;
final String lexicon;
final String lang;
}
class OfflineTtsModelConfig {
... ... @@ -286,6 +290,7 @@ class OfflineTts {
c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale;
c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8();
c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8();
c.ref.model.kokoro.lang = config.model.kokoro.lang.toNativeUtf8();
c.ref.model.numThreads = config.model.numThreads;
c.ref.model.debug = config.model.debug ? 1 : 0;
... ... @@ -302,6 +307,7 @@ class OfflineTts {
calloc.free(c.ref.ruleFsts);
calloc.free(c.ref.model.provider);
calloc.free(c.ref.model.kokoro.lang);
calloc.free(c.ref.model.kokoro.lexicon);
calloc.free(c.ref.model.kokoro.dictDir);
calloc.free(c.ref.model.kokoro.dataDir);
... ...
... ... @@ -70,6 +70,7 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig(
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
SHERPA_ONNX_ASSIGN_ATTR_STR(lang, lang);
return c;
}
... ... @@ -177,6 +178,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.data_dir);
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.dict_dir);
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lexicon);
SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lang);
SHERPA_ONNX_DELETE_C_STR(c.model.provider);
... ...
... ... @@ -36,6 +36,7 @@ export class OfflineTtsKokoroModelConfig {
public lengthScale: number = 1.0;
public dictDir: string = '';
public lexicon: string = '';
public lang: string = '';
}
export class OfflineTtsModelConfig {
... ...
... ... @@ -18,6 +18,7 @@ namespace SherpaOnnx
DictDir = "";
Lexicon = "";
Lang = "";
}
[MarshalAs(UnmanagedType.LPStr)]
public string Model;
... ... @@ -38,5 +39,8 @@ namespace SherpaOnnx
[MarshalAs(UnmanagedType.LPStr)]
public string Lexicon;
[MarshalAs(UnmanagedType.LPStr)]
public string Lang;
}
}
... ...
... ... @@ -857,6 +857,7 @@ type OfflineTtsKokoroModelConfig struct {
DataDir string // Path to espeak-ng-data directory
DictDir string // Path to dict directory
Lexicon string // Path to lexicon files
Lang string // Example: es for Spanish, fr-fr for French. Can be empty
LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
}
... ... @@ -1006,6 +1007,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon)
defer C.free(unsafe.Pointer(c.model.kokoro.lexicon))
c.model.kokoro.lang = C.CString(config.Model.Kokoro.Lang)
defer C.free(unsafe.Pointer(c.model.kokoro.lang))
c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
c.model.num_threads = C.int(config.Model.NumThreads)
... ...
... ... @@ -1164,6 +1164,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
SHERPA_ONNX_OR(config->model.kokoro.dict_dir, "");
tts_config.model.kokoro.lexicon =
SHERPA_ONNX_OR(config->model.kokoro.lexicon, "");
tts_config.model.kokoro.lang = SHERPA_ONNX_OR(config->model.kokoro.lang, "");
tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
tts_config.model.debug = config->model.debug;
... ...
... ... @@ -958,6 +958,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsKokoroModelConfig {
float length_scale; // < 1, faster in speech speed; > 1, slower in speed
const char *dict_dir;
const char *lexicon;
const char *lang;
} SherpaOnnxOfflineTtsKokoroModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
... ...
... ... @@ -366,6 +366,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
c.model.kokoro.length_scale = config.model.kokoro.length_scale;
c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str();
c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str();
c.model.kokoro.lang = config.model.kokoro.lang.c_str();
c.model.num_threads = config.model.num_threads;
c.model.debug = config.model.debug;
... ...
... ... @@ -367,6 +367,7 @@ struct OfflineTtsKokoroModelConfig {
std::string data_dir;
std::string dict_dir;
std::string lexicon;
std::string lang;
float length_scale = 1.0; // < 1, faster in speed; > 1, slower in speed
};
... ...
... ... @@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl {
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
}
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
const std::string &voice) const {
std::string text = ToLowerCase(_text);
if (debug_) {
SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
... ... @@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl {
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
}
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
ids_vec = ConvertNonChineseToTokenIDs(ms, voice);
}
for (const auto &ids : ids_vec) {
... ... @@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl {
return ans;
}
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
const std::string &text, const std::string &voice) const {
auto temp = ConvertTextToTokenIdsKokoro(
phoneme2id_, meta_data_.max_token_len, text, voice);
std::vector<std::vector<int32_t>> ans;
ans.reserve(temp.size());
for (const auto &i : temp) {
ans.emplace_back(i.tokens.begin(), i.tokens.end());
}
return ans;
}
std::vector<std::vector<int32_t>> ConvertNonChineseToTokenIDs(
const std::string &text, const std::string &voice) const {
if (!voice.empty()) {
return ConvertTextToTokenIDsWithEspeak(text, voice);
}
// If voice is empty, we split the text into words and use the lexicon
// to lookup the pronunciation of each word, fallback to espeak if
// a word is not in the lexicon.
std::vector<std::string> words = SplitUtf8(text);
if (debug_) {
std::ostringstream os;
... ... @@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl {
piper::eSpeakPhonemeConfig config;
config.voice = voice;
config.voice = meta_data_.voice;
std::vector<std::vector<piper::Phoneme>> phonemes;
... ... @@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl {
void InitTokens(std::istream &is) {
token2id_ = ReadTokens(is); // defined in ./symbol-table.cc
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
std::u32string s;
for (const auto &p : token2id_) {
s = conv.from_bytes(p.first);
if (s.size() != 1) {
SHERPA_ONNX_LOGE("Error for token %s with id %d", p.first.c_str(),
p.second);
SHERPA_ONNX_EXIT(-1);
}
char32_t c = s[0];
phoneme2id_.insert({c, p.second});
}
}
void InitLexicon(const std::string &lexicon) {
if (lexicon.empty()) {
return;
}
std::vector<std::string> files;
SplitStringToVector(lexicon, ",", false, &files);
for (const auto &f : files) {
... ... @@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl {
template <typename Manager>
void InitLexicon(Manager *mgr, const std::string &lexicon) {
if (lexicon.empty()) {
return;
}
std::vector<std::string> files;
SplitStringToVector(lexicon, ",", false, &files);
for (const auto &f : files) {
... ... @@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl {
std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
if (ids.empty()) {
if (ids.empty() && word != "呣") {
SHERPA_ONNX_LOGE(
"Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
word.c_str(), line_num, line.c_str());
... ... @@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl {
// tokens.txt is saved in token2id_
std::unordered_map<std::string, int32_t> token2id_;
std::unordered_map<char32_t, int32_t> phoneme2id_;
std::unique_ptr<cppjieba::Jieba> jieba_;
bool debug_ = false;
};
... ... @@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon(
meta_data, debug)) {}
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
const std::string &text, const std::string & /*unused_voice = ""*/) const {
return impl_->ConvertTextToTokenIds(text);
const std::string &text, const std::string &voice /*= ""*/) const {
return impl_->ConvertTextToTokenIds(text, voice);
}
#if __ANDROID_API__ >= 9
... ...
... ... @@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig {
OfflineSpeechDenoiserModelConfig() = default;
OfflineSpeechDenoiserModelConfig(OfflineSpeechDenoiserGtcrnModelConfig gtcrn,
int32_t num_threads, bool debug,
const std::string &provider)
OfflineSpeechDenoiserModelConfig(
const OfflineSpeechDenoiserGtcrnModelConfig &gtcrn, int32_t num_threads,
bool debug, const std::string &provider)
: gtcrn(gtcrn),
num_threads(num_threads),
debug(debug),
... ...
... ... @@ -6,6 +6,7 @@
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
... ... @@ -57,6 +58,12 @@ class OfflineTtsFrontend {
// implementation is in ./piper-phonemize-lexicon.cc
void InitEspeak(const std::string &data_dir);
// implementation in ./piper-phonemize-lexicon.cc
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
const std::unordered_map<char32_t, int32_t> &token2id,
int32_t max_token_len, const std::string &text,
const std::string &voice = "");
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
... ...
... ... @@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
}
}
std::vector<TokenIDs> token_ids =
frontend_->ConvertTextToTokenIds(text, meta_data.voice);
std::vector<TokenIDs> token_ids = frontend_->ConvertTextToTokenIds(
text, config_.model.kokoro.lang.empty() ? meta_data.voice
: config_.model.kokoro.lang);
if (token_ids.empty() ||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
... ... @@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
if (meta_data.version >= 2) {
// this is a multi-lingual model, we require that you pass lexicon
// and dict_dir
if (config_.model.kokoro.lexicon.empty() ||
if ((config_.model.kokoro.lexicon.empty() &&
config_.model.kokoro.lang.empty()) ||
config_.model.kokoro.dict_dir.empty()) {
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
SHERPA_ONNX_LOGE(
"You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
"v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
"v1.0). Please pass --kokoro-lexicon and --kokoro-dict-dir or "
"provide --kokoro-lang and --kokoro-dict-dir");
SHERPA_ONNX_EXIT(-1);
}
... ... @@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
if (meta_data.version >= 2) {
// this is a multi-lingual model, we require that you pass lexicon
// and dict_dir
if (config_.model.kokoro.lexicon.empty() ||
if ((config_.model.kokoro.lexicon.empty() &&
config_.model.kokoro.lang.empty()) ||
config_.model.kokoro.dict_dir.empty()) {
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
SHERPA_ONNX_LOGE(
... ...
... ... @@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
"Path to voices.bin for Kokoro models");
po->Register("kokoro-tokens", &tokens,
"Path to tokens.txt for Kokoro models");
po->Register("kokoro-lang", &lang,
"Used only by kokoro >= 1.0. Example values: "
"en (English), "
"es (Spanish), fr (French), hi (hindi), it (Italian), "
"pt-br (Brazilian Portuguese)."
"You can leave it empty, in which case you need to provide "
"--kokoro-lexicon.");
po->Register(
"kokoro-lexicon", &lexicon,
"Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0"
... ... @@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
os << "lexicon=\"" << lexicon << "\", ";
os << "data_dir=\"" << data_dir << "\", ";
os << "dict_dir=\"" << dict_dir << "\", ";
os << "length_scale=" << length_scale << ")";
os << "length_scale=" << length_scale << ", ";
os << "lang=\"" << lang << "\")";
return os.str();
}
... ...
... ... @@ -27,6 +27,13 @@ struct OfflineTtsKokoroModelConfig {
// speed = 1 / length_scale
float length_scale = 1.0;
// Used only for Kokoro >= 1.0.
//
// If it is not empty, meta_data.voice is ignored.
// Example values: es (Spanish), fr (French), pt (Portuguese)
// See https://hf-mirror.com/hexgrad/Kokoro-82M/blob/main/VOICES.md
std::string lang;
OfflineTtsKokoroModelConfig() = default;
OfflineTtsKokoroModelConfig(const std::string &model,
... ... @@ -34,14 +41,16 @@ struct OfflineTtsKokoroModelConfig {
const std::string &tokens,
const std::string &lexicon,
const std::string &data_dir,
const std::string &dict_dir, float length_scale)
const std::string &dict_dir, float length_scale,
const std::string &lang)
: model(model),
voices(voices),
tokens(tokens),
lexicon(lexicon),
data_dir(data_dir),
dict_dir(dict_dir),
length_scale(length_scale) {}
length_scale(length_scale),
lang(lang) {}
void Register(ParseOptions *po);
bool Validate() const;
... ...
... ... @@ -351,7 +351,8 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
if (is_matcha_) {
return ConvertTextToTokenIdsMatcha(text, voice);
} else if (is_kokoro_) {
return ConvertTextToTokenIdsKokoro(text, voice);
return ConvertTextToTokenIdsKokoro(
token2id_, kokoro_meta_data_.max_token_len, text, voice);
} else {
return ConvertTextToTokenIdsVits(text, voice);
}
... ... @@ -382,8 +383,10 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
return ans;
}
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
const std::string &text, const std::string &voice /*= ""*/) const {
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
const std::unordered_map<char32_t, int32_t> &token2id,
int32_t max_token_len, const std::string &text,
const std::string &voice /*= ""*/) {
piper::eSpeakPhonemeConfig config;
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
... ... @@ -397,8 +400,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
std::vector<TokenIDs> ans;
for (const auto &p : phonemes) {
auto phoneme_ids =
PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len);
auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len);
for (auto &ids : phoneme_ids) {
ans.emplace_back(std::move(ids));
... ...
... ... @@ -52,9 +52,6 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
const std::string &text, const std::string &voice = "") const;
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
const std::string &text, const std::string &voice = "") const;
private:
// map unicode codepoint to an integer ID
std::unordered_map<char32_t, int32_t> token2id_;
... ...
... ... @@ -6,6 +6,7 @@ public class OfflineTtsKokoroModelConfig {
private final String voices;
private final String tokens;
private final String lexicon;
private final String lang;
private final String dataDir;
private final String dictDir;
private final float lengthScale;
... ... @@ -15,6 +16,7 @@ public class OfflineTtsKokoroModelConfig {
this.voices = builder.voices;
this.tokens = builder.tokens;
this.lexicon = builder.lexicon;
this.lang = builder.lang;
this.dataDir = builder.dataDir;
this.dictDir = builder.dictDir;
this.lengthScale = builder.lengthScale;
... ... @@ -50,6 +52,7 @@ public class OfflineTtsKokoroModelConfig {
private String voices = "";
private String tokens = "";
private String lexicon = "";
private String lang = "";
private String dataDir = "";
private String dictDir = "";
private float lengthScale = 1.0f;
... ... @@ -78,6 +81,11 @@ public class OfflineTtsKokoroModelConfig {
return this;
}
public Builder setLang(String lang) {
this.lang = lang;
return this;
}
public Builder setDataDir(String dataDir) {
this.dataDir = dataDir;
return this;
... ...
... ... @@ -145,6 +145,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
ans.model.kokoro.lexicon = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(kokoro_cls, "lang", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(kokoro, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.kokoro.lang = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(kokoro, fid);
p = env->GetStringUTFChars(s, nullptr);
... ...
... ... @@ -31,6 +31,7 @@ data class OfflineTtsKokoroModelConfig(
var tokens: String = "",
var dataDir: String = "",
var lexicon: String = "",
var lang: String = "",
var dictDir: String = "",
var lengthScale: Float = 1.0f,
)
... ...
... ... @@ -84,6 +84,7 @@ type
LengthScale: Single;
DictDir: AnsiString;
Lexicon: AnsiString;
Lang: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
... ... @@ -841,6 +842,7 @@ type
LengthScale: cfloat;
DictDir: PAnsiChar;
Lexicon: PAnsiChar;
Lang: PAnsiChar;
end;
SherpaOnnxOfflineTtsModelConfig = record
... ... @@ -2096,10 +2098,11 @@ begin
'DataDir := %s, ' +
'LengthScale := %.2f, ' +
'DictDir := %s, ' +
'Lexicon := %s' +
'Lexicon := %s, ' +
'Lang := %s' +
')',
[Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale,
Self.DictDir, Self.Lexicon]);
Self.DictDir, Self.Lexicon, Self.Lang]);
end;
class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
... ... @@ -2180,6 +2183,7 @@ begin
C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
C.Model.Kokoro.DictDir := PAnsiChar(Config.Model.Kokoro.DictDir);
C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon);
C.Model.Kokoro.Lang := PAnsiChar(Config.Model.Kokoro.Lang);
C.Model.NumThreads := Config.Model.NumThreads;
C.Model.Provider := PAnsiChar(Config.Model.Provider);
... ...
... ... @@ -17,10 +17,12 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
.def(py::init<>())
.def(py::init<const std::string &, const std::string &,
const std::string &, const std::string &,
const std::string &, const std::string &, float>(),
const std::string &, const std::string &, float,
const std::string &>(),
py::arg("model"), py::arg("voices"), py::arg("tokens"),
py::arg("lexicon") = "", py::arg("data_dir"),
py::arg("dict_dir") = "", py::arg("length_scale") = 1.0)
py::arg("dict_dir") = "", py::arg("length_scale") = 1.0,
py::arg("lang") = "")
.def_readwrite("model", &PyClass::model)
.def_readwrite("voices", &PyClass::voices)
.def_readwrite("tokens", &PyClass::tokens)
... ... @@ -28,6 +30,7 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
.def_readwrite("data_dir", &PyClass::data_dir)
.def_readwrite("dict_dir", &PyClass::dict_dir)
.def_readwrite("length_scale", &PyClass::length_scale)
.def_readwrite("lang", &PyClass::lang)
.def("__str__", &PyClass::ToString)
.def("validate", &PyClass::Validate);
}
... ...
... ... @@ -806,7 +806,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
dataDir: String = "",
lengthScale: Float = 1.0,
dictDir: String = "",
lexicon: String = ""
lexicon: String = "",
lang: String = ""
) -> SherpaOnnxOfflineTtsKokoroModelConfig {
return SherpaOnnxOfflineTtsKokoroModelConfig(
model: toCPointer(model),
... ... @@ -815,7 +816,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
data_dir: toCPointer(dataDir),
length_scale: lengthScale,
dict_dir: toCPointer(dictDir),
lexicon: toCPointer(lexicon)
lexicon: toCPointer(lexicon),
lang: toCPointer(lang)
)
}
... ...
... ... @@ -143,13 +143,14 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1;
const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
const langLen = Module.lengthBytesUTF8(config.lang || '') + 1;
const n =
modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + lexiconLen;
const n = modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen +
lexiconLen + langLen;
const buffer = Module._malloc(n);
const len = 7 * 4;
const len = 8 * 4;
const ptr = Module._malloc(len);
let offset = 0;
... ... @@ -171,6 +172,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen);
offset += lexiconLen;
Module.stringToUTF8(config.lang || '', buffer + offset, langLen);
offset += langLen;
offset = 0;
Module.setValue(ptr, buffer + offset, 'i8*');
offset += modelLen;
... ... @@ -192,6 +196,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
Module.setValue(ptr + 24, buffer + offset, 'i8*');
offset += lexiconLen;
Module.setValue(ptr + 28, buffer + offset, 'i8*');
offset += langLen;
return {
buffer: buffer, ptr: ptr, len: len,
}
... ... @@ -233,6 +240,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
dataDir: '',
dictDir: '',
lexicon: '',
lang: '',
};
}
... ...
... ... @@ -15,7 +15,7 @@ extern "C" {
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 7 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
... ...