Committed by GitHub
Support extra languages in multi-lang kokoro tts (#2303)
Showing 28 changed files with 187 additions and 49 deletions.
@@ -35,18 +35,18 @@ jobs:
       matrix:
         # See https://github.com/actions/runner-images
         include:
-          - os: ubuntu-22.04
-            python-version: "3.7"
-          - os: ubuntu-22.04
+          - os: ubuntu-latest
             python-version: "3.8"
-          - os: ubuntu-22.04
+          - os: ubuntu-latest
             python-version: "3.9"
-          - os: ubuntu-22.04
+          - os: ubuntu-latest
             python-version: "3.10"
-          - os: ubuntu-22.04
+          - os: ubuntu-latest
             python-version: "3.11"
-          - os: ubuntu-22.04
+          - os: ubuntu-latest
             python-version: "3.12"
+          - os: ubuntu-latest
+            python-version: "3.13"
 
           - os: macos-13
             python-version: "3.8"
@@ -103,7 +103,7 @@ jobs:
       export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
       cmake --version
 
-      export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j"
+      export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j2"
 
       python3 setup.py bdist_wheel
       ls -lh dist
 ### Supported functions
 
-|Speech recognition| Speech synthesis | Source separation |
+|Speech recognition| [Speech synthesis][tts-url] | [Source separation][ss-url] |
 |------------------|------------------|-------------------|
 | ✔️ | ✔️ | ✔️ |
 
-|Speaker identification| Speaker diarization | Speaker verification |
+|Speaker identification| [Speaker diarization][sd-url] | Speaker verification |
 |----------------------|-------------------- |------------------------|
 | ✔️ | ✔️ | ✔️ |
 
-| Spoken Language identification | Audio tagging | Voice activity detection |
+| [Spoken Language identification][slid-url] | [Audio tagging][at-url] | [Voice activity detection][vad-url] |
 |--------------------------------|---------------|--------------------------|
 | ✔️ | ✔️ | ✔️ |
 
-| Keyword spotting | Add punctuation | Speech enhancement |
+| [Keyword spotting][kws-url] | [Add punctuation][punct-url] | [Speech enhancement][se-url] |
 |------------------|-----------------|--------------------|
 | ✔️ | ✔️ | ✔️ |
 
@@ -501,3 +501,12 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
 [spleeter]: https://github.com/deezer/spleeter
 [UVR]: https://github.com/Anjok07/ultimatevocalremovergui
 [gtcrn]: https://github.com/Xiaobin-Rong/gtcrn
+[tts-url]: https://k2-fsa.github.io/sherpa/onnx/tts/all-in-one.html
+[ss-url]: https://k2-fsa.github.io/sherpa/onnx/source-separation/index.html
+[sd-url]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/index.html
+[slid-url]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/index.html
+[at-url]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/index.html
+[vad-url]: https://k2-fsa.github.io/sherpa/onnx/vad/index.html
+[kws-url]: https://k2-fsa.github.io/sherpa/onnx/kws/index.html
+[punct-url]: https://k2-fsa.github.io/sherpa/onnx/punctuation/index.html
+[se-url]: https://k2-fsa.github.io/sherpa/onnx/speech-enhancment/index.html
@@ -201,6 +201,7 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
   external double lengthScale;
   external Pointer<Utf8> dictDir;
   external Pointer<Utf8> lexicon;
+  external Pointer<Utf8> lang;
 }
 
 final class SherpaOnnxOfflineTtsModelConfig extends Struct {

@@ -117,6 +117,7 @@ class OfflineTtsKokoroModelConfig {
     this.lengthScale = 1.0,
     this.dictDir = '',
     this.lexicon = '',
+    this.lang = '',
   });
 
   factory OfflineTtsKokoroModelConfig.fromJson(Map<String, dynamic> json) {
@@ -128,12 +129,13 @@ class OfflineTtsKokoroModelConfig {
       lengthScale: (json['lengthScale'] as num?)?.toDouble() ?? 1.0,
       dictDir: json['dictDir'] as String? ?? '',
       lexicon: json['lexicon'] as String? ?? '',
+      lang: json['lang'] as String? ?? '',
     );
   }
 
   @override
   String toString() {
-    return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon)';
+    return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale, dictDir: $dictDir, lexicon: $lexicon, lang: $lang)';
   }
 
   Map<String, dynamic> toJson() => {
@@ -144,6 +146,7 @@ class OfflineTtsKokoroModelConfig {
         'lengthScale': lengthScale,
         'dictDir': dictDir,
         'lexicon': lexicon,
+        'lang': lang,
       };
 
   final String model;
@@ -153,6 +156,7 @@ class OfflineTtsKokoroModelConfig {
   final double lengthScale;
   final String dictDir;
   final String lexicon;
+  final String lang;
 }
 
 class OfflineTtsModelConfig {
@@ -286,6 +290,7 @@ class OfflineTts {
     c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale;
     c.ref.model.kokoro.dictDir = config.model.kokoro.dictDir.toNativeUtf8();
     c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8();
+    c.ref.model.kokoro.lang = config.model.kokoro.lang.toNativeUtf8();
 
     c.ref.model.numThreads = config.model.numThreads;
     c.ref.model.debug = config.model.debug ? 1 : 0;
@@ -302,6 +307,7 @@ class OfflineTts {
     calloc.free(c.ref.ruleFsts);
     calloc.free(c.ref.model.provider);
 
+    calloc.free(c.ref.model.kokoro.lang);
     calloc.free(c.ref.model.kokoro.lexicon);
     calloc.free(c.ref.model.kokoro.dictDir);
     calloc.free(c.ref.model.kokoro.dataDir);
@@ -70,6 +70,7 @@ static SherpaOnnxOfflineTtsKokoroModelConfig GetOfflineTtsKokoroModelConfig(
   SHERPA_ONNX_ASSIGN_ATTR_FLOAT(length_scale, lengthScale);
   SHERPA_ONNX_ASSIGN_ATTR_STR(dict_dir, dictDir);
   SHERPA_ONNX_ASSIGN_ATTR_STR(lexicon, lexicon);
+  SHERPA_ONNX_ASSIGN_ATTR_STR(lang, lang);
 
   return c;
 }
@@ -177,6 +178,7 @@ static Napi::External<SherpaOnnxOfflineTts> CreateOfflineTtsWrapper(
   SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.data_dir);
   SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.dict_dir);
   SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lexicon);
+  SHERPA_ONNX_DELETE_C_STR(c.model.kokoro.lang);
 
   SHERPA_ONNX_DELETE_C_STR(c.model.provider);
 

@@ -36,6 +36,7 @@ export class OfflineTtsKokoroModelConfig {
   public lengthScale: number = 1.0;
   public dictDir: string = '';
   public lexicon: string = '';
+  public lang: string = '';
 }
 
 export class OfflineTtsModelConfig {
@@ -18,6 +18,7 @@ namespace SherpaOnnx
 
       DictDir = "";
       Lexicon = "";
+      Lang = "";
     }
     [MarshalAs(UnmanagedType.LPStr)]
     public string Model;
@@ -38,5 +39,8 @@ namespace SherpaOnnx
 
     [MarshalAs(UnmanagedType.LPStr)]
     public string Lexicon;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Lang;
   }
 }
@@ -857,6 +857,7 @@ type OfflineTtsKokoroModelConfig struct {
   DataDir     string  // Path to espeak-ng-data directory
   DictDir     string  // Path to dict directory
   Lexicon     string  // Path to lexicon files
+  Lang        string  // Example: es for Spanish, fr-fr for French. Can be empty
   LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
 }
 
@@ -1006,6 +1007,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
   c.model.kokoro.lexicon = C.CString(config.Model.Kokoro.Lexicon)
   defer C.free(unsafe.Pointer(c.model.kokoro.lexicon))
 
+  c.model.kokoro.lang = C.CString(config.Model.Kokoro.Lang)
+  defer C.free(unsafe.Pointer(c.model.kokoro.lang))
+
   c.model.kokoro.length_scale = C.float(config.Model.Kokoro.LengthScale)
 
   c.model.num_threads = C.int(config.Model.NumThreads)

@@ -1164,6 +1164,7 @@ static sherpa_onnx::OfflineTtsConfig GetOfflineTtsConfig(
       SHERPA_ONNX_OR(config->model.kokoro.dict_dir, "");
   tts_config.model.kokoro.lexicon =
       SHERPA_ONNX_OR(config->model.kokoro.lexicon, "");
+  tts_config.model.kokoro.lang = SHERPA_ONNX_OR(config->model.kokoro.lang, "");
 
   tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
   tts_config.model.debug = config->model.debug;
@@ -958,6 +958,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsKokoroModelConfig {
   float length_scale;  // < 1, faster in speech speed; > 1, slower in speed
   const char *dict_dir;
   const char *lexicon;
+  const char *lang;
 } SherpaOnnxOfflineTtsKokoroModelConfig;
 
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
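
Editor's note: as a reading aid for the C API change above, here is a minimal ctypes sketch that mirrors the extended `SherpaOnnxOfflineTtsKokoroModelConfig`. The field order is taken from the C and Swift hunks in this diff; everything else (loading, paths) is an illustration, not an official Python binding.

```python
# Sketch only: a ctypes mirror of SherpaOnnxOfflineTtsKokoroModelConfig after
# this commit. Field order follows the C/Swift hunks in this diff.
import ctypes


class OfflineTtsKokoroModelConfig(ctypes.Structure):
    _fields_ = [
        ("model", ctypes.c_char_p),
        ("voices", ctypes.c_char_p),
        ("tokens", ctypes.c_char_p),
        ("data_dir", ctypes.c_char_p),
        ("length_scale", ctypes.c_float),
        ("dict_dir", ctypes.c_char_p),
        ("lexicon", ctypes.c_char_p),
        ("lang", ctypes.c_char_p),  # new field added by this commit
    ]


cfg = OfflineTtsKokoroModelConfig(
    model=b"model.onnx",          # placeholder paths
    voices=b"voices.bin",
    tokens=b"tokens.txt",
    data_dir=b"espeak-ng-data",
    length_scale=1.0,
    dict_dir=b"dict",
    lexicon=b"",                  # may stay empty when lang is given
    lang=b"es",                   # e.g. Spanish
)

# Size is platform dependent here; on 32-bit WASM every slot is 4 bytes,
# i.e. 8 * 4 bytes in total (see the static_assert later in this diff).
print(ctypes.sizeof(cfg))
```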
@@ -366,6 +366,7 @@ OfflineTts OfflineTts::Create(const OfflineTtsConfig &config) {
   c.model.kokoro.length_scale = config.model.kokoro.length_scale;
   c.model.kokoro.dict_dir = config.model.kokoro.dict_dir.c_str();
   c.model.kokoro.lexicon = config.model.kokoro.lexicon.c_str();
+  c.model.kokoro.lang = config.model.kokoro.lang.c_str();
 
   c.model.num_threads = config.model.num_threads;
   c.model.debug = config.model.debug;

@@ -367,6 +367,7 @@ struct OfflineTtsKokoroModelConfig {
   std::string data_dir;
   std::string dict_dir;
   std::string lexicon;
+  std::string lang;
 
   float length_scale = 1.0;  // < 1, faster in speed; > 1, slower in speed
 };
@@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl {
     InitEspeak(data_dir);  // See ./piper-phonemize-lexicon.cc
   }
 
-  std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
+  std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
+                                              const std::string &voice) const {
     std::string text = ToLowerCase(_text);
     if (debug_) {
       SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
@@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl {
         SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
       }
 
-      ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
+      ids_vec = ConvertNonChineseToTokenIDs(ms, voice);
     }
 
     for (const auto &ids : ids_vec) {
@@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl {
     return ans;
   }
 
-  std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
+  std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
       const std::string &text, const std::string &voice) const {
+    auto temp = ConvertTextToTokenIdsKokoro(
+        phoneme2id_, meta_data_.max_token_len, text, voice);
+    std::vector<std::vector<int32_t>> ans;
+    ans.reserve(temp.size());
+
+    for (const auto &i : temp) {
+      ans.emplace_back(i.tokens.begin(), i.tokens.end());
+    }
+
+    return ans;
+  }
+
+  std::vector<std::vector<int32_t>> ConvertNonChineseToTokenIDs(
+      const std::string &text, const std::string &voice) const {
+    if (!voice.empty()) {
+      return ConvertTextToTokenIDsWithEspeak(text, voice);
+    }
+
+    // If voice is empty, we split the text into words and use the lexicon
+    // to lookup the pronunciation of each word, fallback to espeak if
+    // a word is not in the lexicon.
+
     std::vector<std::string> words = SplitUtf8(text);
     if (debug_) {
       std::ostringstream os;
@@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl {
 
     piper::eSpeakPhonemeConfig config;
 
-    config.voice = voice;
+    config.voice = meta_data_.voice;
 
     std::vector<std::vector<piper::Phoneme>> phonemes;
 
@@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl {
 
   void InitTokens(std::istream &is) {
     token2id_ = ReadTokens(is);  // defined in ./symbol-table.cc
+
+    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
+    std::u32string s;
+    for (const auto &p : token2id_) {
+      s = conv.from_bytes(p.first);
+
+      if (s.size() != 1) {
+        SHERPA_ONNX_LOGE("Error for token %s with id %d", p.first.c_str(),
+                         p.second);
+        SHERPA_ONNX_EXIT(-1);
+      }
+
+      char32_t c = s[0];
+      phoneme2id_.insert({c, p.second});
+    }
   }
 
   void InitLexicon(const std::string &lexicon) {
+    if (lexicon.empty()) {
+      return;
+    }
+
     std::vector<std::string> files;
     SplitStringToVector(lexicon, ",", false, &files);
     for (const auto &f : files) {
@@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl {
 
   template <typename Manager>
   void InitLexicon(Manager *mgr, const std::string &lexicon) {
+    if (lexicon.empty()) {
+      return;
+    }
+
     std::vector<std::string> files;
     SplitStringToVector(lexicon, ",", false, &files);
     for (const auto &f : files) {
@@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl {
 
     std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
 
-    if (ids.empty()) {
+    if (ids.empty() && word != "呣") {
       SHERPA_ONNX_LOGE(
           "Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
           word.c_str(), line_num, line.c_str());
@@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl {
   // tokens.txt is saved in token2id_
   std::unordered_map<std::string, int32_t> token2id_;
 
+  std::unordered_map<char32_t, int32_t> phoneme2id_;
+
   std::unique_ptr<cppjieba::Jieba> jieba_;
   bool debug_ = false;
 };
@@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon(
                                      meta_data, debug)) {}
 
 std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
-    const std::string &text, const std::string & /*unused_voice = ""*/) const {
-  return impl_->ConvertTextToTokenIds(text);
+    const std::string &text, const std::string &voice /*= ""*/) const {
+  return impl_->ConvertTextToTokenIds(text, voice);
 }
 
 #if __ANDROID_API__ >= 9
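
Editor's note: the hunks above introduce a new dispatch for non-Chinese text. The hedged Python pseudocode below summarizes that flow; all helper names (`espeak_phonemize`, `phoneme2id`, `lexicon`, `default_voice`) are illustrative placeholders, not the C++ API. When a lang/voice is supplied the whole chunk is phonemized with espeak-ng; otherwise each word is looked up in the lexicon, falling back to espeak-ng with the model's built-in default voice.

```python
# Illustrative pseudocode of ConvertNonChineseToTokenIDs above;
# the helper names are placeholders.
def convert_non_chinese_to_token_ids(text, voice, lexicon, phoneme2id,
                                     espeak_phonemize, default_voice):
    if voice:
        # A --kokoro-lang value was given: phonemize the whole chunk with
        # espeak-ng for that language and map each phoneme to its id.
        return [[phoneme2id[p] for p in phones]
                for phones in espeak_phonemize(text, voice)]

    token_ids = []
    for word in text.split():
        if word in lexicon:
            # Word is covered by the files passed via --kokoro-lexicon.
            token_ids.append(lexicon[word])
        else:
            # Out-of-vocabulary word: fall back to espeak-ng using the
            # model's built-in default voice (meta_data_.voice in C++).
            for phones in espeak_phonemize(word, default_voice):
                token_ids.append([phoneme2id[p] for p in phones])
    return token_ids
```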
@@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig {
 
   OfflineSpeechDenoiserModelConfig() = default;
 
-  OfflineSpeechDenoiserModelConfig(OfflineSpeechDenoiserGtcrnModelConfig gtcrn,
-                                   int32_t num_threads, bool debug,
-                                   const std::string &provider)
+  OfflineSpeechDenoiserModelConfig(
+      const OfflineSpeechDenoiserGtcrnModelConfig &gtcrn, int32_t num_threads,
+      bool debug, const std::string &provider)
       : gtcrn(gtcrn),
         num_threads(num_threads),
         debug(debug),

@@ -6,6 +6,7 @@
 #define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
 #include <cstdint>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -57,6 +58,12 @@ class OfflineTtsFrontend {
 // implementation is in ./piper-phonemize-lexicon.cc
 void InitEspeak(const std::string &data_dir);
 
+// implementation in ./piper-phonemize-lexicon.cc
+std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
+    const std::unordered_map<char32_t, int32_t> &token2id,
+    int32_t max_token_len, const std::string &text,
+    const std::string &voice = "");
+
 }  // namespace sherpa_onnx
 
 #endif  // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
@@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
       }
     }
 
-    std::vector<TokenIDs> token_ids =
-        frontend_->ConvertTextToTokenIds(text, meta_data.voice);
+    std::vector<TokenIDs> token_ids = frontend_->ConvertTextToTokenIds(
+        text, config_.model.kokoro.lang.empty() ? meta_data.voice
+                                                : config_.model.kokoro.lang);
 
     if (token_ids.empty() ||
         (token_ids.size() == 1 && token_ids[0].tokens.empty())) {
@@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
     if (meta_data.version >= 2) {
       // this is a multi-lingual model, we require that you pass lexicon
       // and dict_dir
-      if (config_.model.kokoro.lexicon.empty() ||
+      if ((config_.model.kokoro.lexicon.empty() &&
+           config_.model.kokoro.lang.empty()) ||
           config_.model.kokoro.dict_dir.empty()) {
         SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
         SHERPA_ONNX_LOGE(
             "You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
-            "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
+            "v1.0). Please pass --kokoro-lexicon and --kokoro-dict-dir or "
+            "provide --kokoro-lang and --kokoro-dict-dir");
         SHERPA_ONNX_EXIT(-1);
       }
 
@@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
     if (meta_data.version >= 2) {
       // this is a multi-lingual model, we require that you pass lexicon
       // and dict_dir
-      if (config_.model.kokoro.lexicon.empty() ||
+      if ((config_.model.kokoro.lexicon.empty() &&
+           config_.model.kokoro.lang.empty()) ||
           config_.model.kokoro.dict_dir.empty()) {
         SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
         SHERPA_ONNX_LOGE(
@@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
                "Path to voices.bin for Kokoro models");
   po->Register("kokoro-tokens", &tokens,
                "Path to tokens.txt for Kokoro models");
+  po->Register("kokoro-lang", &lang,
+               "Used only by kokoro >= 1.0. Example values: "
+               "en (English), "
+               "es (Spanish), fr (French), hi (hindi), it (Italian), "
+               "pt-br (Brazilian Portuguese)."
+               "You can leave it empty, in which case you need to provide "
+               "--kokoro-lexicon.");
   po->Register(
       "kokoro-lexicon", &lexicon,
       "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0"
@@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
   os << "lexicon=\"" << lexicon << "\", ";
   os << "data_dir=\"" << data_dir << "\", ";
   os << "dict_dir=\"" << dict_dir << "\", ";
-  os << "length_scale=" << length_scale << ")";
+  os << "length_scale=" << length_scale << ", ";
+  os << "lang=\"" << lang << "\")";
 
   return os.str();
 }

@@ -27,6 +27,13 @@ struct OfflineTtsKokoroModelConfig {
   // speed = 1 / length_scale
   float length_scale = 1.0;
 
+  // Used only for Kokoro >= 1.0.
+  //
+  // If it is not empty, meta_data.voice is ignored.
+  // Example values: es (Spanish), fr (French), pt (Portuguese)
+  // See https://hf-mirror.com/hexgrad/Kokoro-82M/blob/main/VOICES.md
+  std::string lang;
+
   OfflineTtsKokoroModelConfig() = default;
 
   OfflineTtsKokoroModelConfig(const std::string &model,
@@ -34,14 +41,16 @@ struct OfflineTtsKokoroModelConfig {
                               const std::string &tokens,
                               const std::string &lexicon,
                               const std::string &data_dir,
-                              const std::string &dict_dir, float length_scale)
+                              const std::string &dict_dir, float length_scale,
+                              const std::string &lang)
       : model(model),
         voices(voices),
         tokens(tokens),
        lexicon(lexicon),
        data_dir(data_dir),
        dict_dir(dict_dir),
-        length_scale(length_scale) {}
+        length_scale(length_scale),
+        lang(lang) {}
 
   void Register(ParseOptions *po);
   bool Validate() const;
@@ -351,7 +351,8 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
   if (is_matcha_) {
     return ConvertTextToTokenIdsMatcha(text, voice);
   } else if (is_kokoro_) {
-    return ConvertTextToTokenIdsKokoro(text, voice);
+    return ConvertTextToTokenIdsKokoro(
+        token2id_, kokoro_meta_data_.max_token_len, text, voice);
   } else {
     return ConvertTextToTokenIdsVits(text, voice);
   }
@@ -382,8 +383,10 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
   return ans;
 }
 
-std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
-    const std::string &text, const std::string &voice /*= ""*/) const {
+std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
+    const std::unordered_map<char32_t, int32_t> &token2id,
+    int32_t max_token_len, const std::string &text,
+    const std::string &voice /*= ""*/) {
   piper::eSpeakPhonemeConfig config;
 
   // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
@@ -397,8 +400,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
   std::vector<TokenIDs> ans;
 
   for (const auto &p : phonemes) {
-    auto phoneme_ids =
-        PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len);
+    auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len);
 
     for (auto &ids : phoneme_ids) {
       ans.emplace_back(std::move(ids));

@@ -52,9 +52,6 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
   std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
       const std::string &text, const std::string &voice = "") const;
 
-  std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
-      const std::string &text, const std::string &voice = "") const;
-
  private:
   // map unicode codepoint to an integer ID
   std::unordered_map<char32_t, int32_t> token2id_;
@@ -6,6 +6,7 @@ public class OfflineTtsKokoroModelConfig {
   private final String voices;
   private final String tokens;
   private final String lexicon;
+  private final String lang;
   private final String dataDir;
   private final String dictDir;
   private final float lengthScale;
@@ -15,6 +16,7 @@ public class OfflineTtsKokoroModelConfig {
     this.voices = builder.voices;
     this.tokens = builder.tokens;
     this.lexicon = builder.lexicon;
+    this.lang = builder.lang;
     this.dataDir = builder.dataDir;
     this.dictDir = builder.dictDir;
     this.lengthScale = builder.lengthScale;
@@ -50,6 +52,7 @@ public class OfflineTtsKokoroModelConfig {
     private String voices = "";
     private String tokens = "";
     private String lexicon = "";
+    private String lang = "";
     private String dataDir = "";
     private String dictDir = "";
     private float lengthScale = 1.0f;
@@ -78,6 +81,11 @@ public class OfflineTtsKokoroModelConfig {
       return this;
     }
 
+    public Builder setLang(String lang) {
+      this.lang = lang;
+      return this;
+    }
+
     public Builder setDataDir(String dataDir) {
       this.dataDir = dataDir;
       return this;

@@ -145,6 +145,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
   ans.model.kokoro.lexicon = p;
   env->ReleaseStringUTFChars(s, p);
 
+  fid = env->GetFieldID(kokoro_cls, "lang", "Ljava/lang/String;");
+  s = (jstring)env->GetObjectField(kokoro, fid);
+  p = env->GetStringUTFChars(s, nullptr);
+  ans.model.kokoro.lang = p;
+  env->ReleaseStringUTFChars(s, p);
+
   fid = env->GetFieldID(kokoro_cls, "dataDir", "Ljava/lang/String;");
   s = (jstring)env->GetObjectField(kokoro, fid);
   p = env->GetStringUTFChars(s, nullptr);
@@ -31,6 +31,7 @@ data class OfflineTtsKokoroModelConfig(
     var tokens: String = "",
     var dataDir: String = "",
     var lexicon: String = "",
+    var lang: String = "",
     var dictDir: String = "",
     var lengthScale: Float = 1.0f,
 )

@@ -84,6 +84,7 @@ type
       LengthScale: Single;
       DictDir: AnsiString;
       Lexicon: AnsiString;
+      Lang: AnsiString;
 
       function ToString: AnsiString;
       class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
@@ -841,6 +842,7 @@ type
       LengthScale: cfloat;
       DictDir: PAnsiChar;
       Lexicon: PAnsiChar;
+      Lang: PAnsiChar;
     end;
 
   SherpaOnnxOfflineTtsModelConfig = record
@@ -2096,10 +2098,11 @@ begin
     'DataDir := %s, ' +
     'LengthScale := %.2f, ' +
     'DictDir := %s, ' +
-    'Lexicon := %s' +
+    'Lexicon := %s, ' +
+    'Lang := %s' +
     ')',
     [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale,
-     Self.DictDir, Self.Lexicon]);
+     Self.DictDir, Self.Lexicon, Self.Lang]);
 end;
 
 class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
@@ -2180,6 +2183,7 @@ begin
   C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
   C.Model.Kokoro.DictDir := PAnsiChar(Config.Model.Kokoro.DictDir);
   C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon);
+  C.Model.Kokoro.Lang := PAnsiChar(Config.Model.Kokoro.Lang);
 
   C.Model.NumThreads := Config.Model.NumThreads;
   C.Model.Provider := PAnsiChar(Config.Model.Provider);
@@ -17,10 +17,12 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
       .def(py::init<>())
       .def(py::init<const std::string &, const std::string &,
                     const std::string &, const std::string &,
-                    const std::string &, const std::string &, float>(),
+                    const std::string &, const std::string &, float,
+                    const std::string &>(),
           py::arg("model"), py::arg("voices"), py::arg("tokens"),
           py::arg("lexicon") = "", py::arg("data_dir"),
-           py::arg("dict_dir") = "", py::arg("length_scale") = 1.0)
+           py::arg("dict_dir") = "", py::arg("length_scale") = 1.0,
+           py::arg("lang") = "")
       .def_readwrite("model", &PyClass::model)
       .def_readwrite("voices", &PyClass::voices)
       .def_readwrite("tokens", &PyClass::tokens)
@@ -28,6 +30,7 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) {
       .def_readwrite("data_dir", &PyClass::data_dir)
       .def_readwrite("dict_dir", &PyClass::dict_dir)
       .def_readwrite("length_scale", &PyClass::length_scale)
+      .def_readwrite("lang", &PyClass::lang)
       .def("__str__", &PyClass::ToString)
       .def("validate", &PyClass::Validate);
 }
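
Editor's note: with the extra `lang` keyword exposed above, end-user Python code might look roughly like the sketch below. Only the new `lang` argument comes from this diff; the file paths are placeholders and the surrounding `OfflineTtsModelConfig` / `OfflineTtsConfig` / `OfflineTts` construction is assumed to follow the existing sherpa-onnx Python API.

```python
# A hedged usage sketch; paths are placeholders and the surrounding config
# classes are assumed to match the existing sherpa_onnx Python API.
import sherpa_onnx

kokoro = sherpa_onnx.OfflineTtsKokoroModelConfig(
    model="model.onnx",
    voices="voices.bin",
    tokens="tokens.txt",
    data_dir="espeak-ng-data",
    dict_dir="dict",
    lexicon="",  # can now stay empty for a multi-lingual model ...
    lang="es",   # ... as long as lang is given (new in this commit)
)

config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(kokoro=kokoro, num_threads=2),
)

tts = sherpa_onnx.OfflineTts(config)
audio = tts.generate("Hola, ¿qué tal?", sid=0, speed=1.0)
# audio.samples / audio.sample_rate can then be written to a wav file.
```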
@@ -806,7 +806,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
   dataDir: String = "",
   lengthScale: Float = 1.0,
   dictDir: String = "",
-  lexicon: String = ""
+  lexicon: String = "",
+  lang: String = ""
 ) -> SherpaOnnxOfflineTtsKokoroModelConfig {
   return SherpaOnnxOfflineTtsKokoroModelConfig(
     model: toCPointer(model),
@@ -815,7 +816,8 @@ func sherpaOnnxOfflineTtsKokoroModelConfig(
     data_dir: toCPointer(dataDir),
     length_scale: lengthScale,
     dict_dir: toCPointer(dictDir),
-    lexicon: toCPointer(lexicon)
+    lexicon: toCPointer(lexicon),
+    lang: toCPointer(lang)
   )
 }
 
@@ -143,13 +143,14 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
   const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
   const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1;
   const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
+  const langLen = Module.lengthBytesUTF8(config.lang || '') + 1;
 
-  const n =
-      modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + lexiconLen;
+  const n = modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen +
+      lexiconLen + langLen;
 
   const buffer = Module._malloc(n);
 
-  const len = 7 * 4;
+  const len = 8 * 4;
   const ptr = Module._malloc(len);
 
   let offset = 0;
@@ -171,6 +172,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
   Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen);
   offset += lexiconLen;
 
+  Module.stringToUTF8(config.lang || '', buffer + offset, langLen);
+  offset += langLen;
+
   offset = 0;
   Module.setValue(ptr, buffer + offset, 'i8*');
   offset += modelLen;
@@ -192,6 +196,9 @@ function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
   Module.setValue(ptr + 24, buffer + offset, 'i8*');
   offset += lexiconLen;
 
+  Module.setValue(ptr + 28, buffer + offset, 'i8*');
+  offset += langLen;
+
   return {
     buffer: buffer, ptr: ptr, len: len,
   }
@@ -233,6 +240,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
       dataDir: '',
       dictDir: '',
       lexicon: '',
+      lang: '',
     };
   }
 
@@ -15,7 +15,7 @@ extern "C" {
 
 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
-static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 7 * 4, "");
+static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 8 * 4, "");
 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
               sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
               sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
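
Editor's note: the `7 * 4` to `8 * 4` change above, and the `ptr + 28` write in the WASM glue code earlier in this diff, both follow from the wasm32 layout of the Kokoro config struct; the short sketch below just spells out that arithmetic.

```python
# wasm32 layout of SherpaOnnxOfflineTtsKokoroModelConfig: every field
# (pointer or float) occupies 4 bytes, so adding `lang` grows the struct
# from 7 * 4 to 8 * 4 bytes and its pointer slot sits at byte offset 28.
fields = ["model", "voices", "tokens", "data_dir",
          "length_scale", "dict_dir", "lexicon", "lang"]

for index, name in enumerate(fields):
    print(f"{name:12s} -> ptr + {index * 4}")

print("sizeof(config) =", len(fields) * 4)  # 32 == 8 * 4, as asserted above
```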