Committed by GitHub
Add dict_dir arg to c api to support Chinese TTS models using jieba (#809)
正在显示 12 个修改的文件，包含 48 行增加和 6 行删除
| @@ -470,6 +470,19 @@ void CNonStreamingTextToSpeechDlg::Init() { | @@ -470,6 +470,19 @@ void CNonStreamingTextToSpeechDlg::Init() { | ||
| 470 | } else if (Exists("./lexicon.txt")) { | 470 | } else if (Exists("./lexicon.txt")) { |
| 471 | config.model.vits.lexicon = "./lexicon.txt"; | 471 | config.model.vits.lexicon = "./lexicon.txt"; |
| 472 | } | 472 | } |
| 473 | + | ||
| 474 | + if (Exists("./dict/jieba.dict.utf8")) { | ||
| 475 | + config.model.vits.dict_dir = "./dict"; | ||
| 476 | + } | ||
| 477 | + | ||
| 478 | + if (Exists("./phone.fst") && Exists("./date.fst") && Exists("./number.fst")) { | ||
| 479 | + config.rule_fsts = "./phone.fst,./date.fst,number.fst"; | ||
| 480 | + } | ||
| 481 | + | ||
| 482 | + if (Exists("./rule.far")) { | ||
| 483 | + config.rule_fars = "./rule.far"; | ||
| 484 | + } | ||
| 485 | + | ||
| 473 | config.model.vits.tokens = "./tokens.txt"; | 486 | config.model.vits.tokens = "./tokens.txt"; |
| 474 | 487 | ||
| 475 | tts_ = SherpaOnnxCreateOfflineTts(&config); | 488 | tts_ = SherpaOnnxCreateOfflineTts(&config); |
| @@ -8,6 +8,7 @@ function createOfflineTts() { | @@ -8,6 +8,7 @@ function createOfflineTts() { | ||
| 8 | lexicon: '', | 8 | lexicon: '', |
| 9 | tokens: './vits-piper-en_US-amy-low/tokens.txt', | 9 | tokens: './vits-piper-en_US-amy-low/tokens.txt', |
| 10 | dataDir: './vits-piper-en_US-amy-low/espeak-ng-data', | 10 | dataDir: './vits-piper-en_US-amy-low/espeak-ng-data', |
| 11 | + dictDir: '', | ||
| 11 | noiseScale: 0.667, | 12 | noiseScale: 0.667, |
| 12 | noiseScaleW: 0.8, | 13 | noiseScaleW: 0.8, |
| 13 | lengthScale: 1.0, | 14 | lengthScale: 1.0, |
| @@ -8,6 +8,7 @@ function createOfflineTts() { | @@ -8,6 +8,7 @@ function createOfflineTts() { | ||
| 8 | lexicon: './vits-icefall-zh-aishell3/lexicon.txt', | 8 | lexicon: './vits-icefall-zh-aishell3/lexicon.txt', |
| 9 | tokens: './vits-icefall-zh-aishell3/tokens.txt', | 9 | tokens: './vits-icefall-zh-aishell3/tokens.txt', |
| 10 | dataDir: '', | 10 | dataDir: '', |
| 11 | + dictDir: '', | ||
| 11 | noiseScale: 0.667, | 12 | noiseScale: 0.667, |
| 12 | noiseScaleW: 0.8, | 13 | noiseScaleW: 0.8, |
| 13 | lengthScale: 1.0, | 14 | lengthScale: 1.0, |
| @@ -23,6 +23,8 @@ namespace SherpaOnnx | @@ -23,6 +23,8 @@ namespace SherpaOnnx | ||
| 23 | NoiseScale = 0.667F; | 23 | NoiseScale = 0.667F; |
| 24 | NoiseScaleW = 0.8F; | 24 | NoiseScaleW = 0.8F; |
| 25 | LengthScale = 1.0F; | 25 | LengthScale = 1.0F; |
| 26 | + | ||
| 27 | + DictDir = ""; | ||
| 26 | } | 28 | } |
| 27 | [MarshalAs(UnmanagedType.LPStr)] | 29 | [MarshalAs(UnmanagedType.LPStr)] |
| 28 | public string Model; | 30 | public string Model; |
| @@ -39,6 +41,9 @@ namespace SherpaOnnx | @@ -39,6 +41,9 @@ namespace SherpaOnnx | ||
| 39 | public float NoiseScale; | 41 | public float NoiseScale; |
| 40 | public float NoiseScaleW; | 42 | public float NoiseScaleW; |
| 41 | public float LengthScale; | 43 | public float LengthScale; |
| 44 | + | ||
| 45 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 46 | + public string DictDir; | ||
| 42 | } | 47 | } |
| 43 | 48 | ||
| 44 | [StructLayout(LayoutKind.Sequential)] | 49 | [StructLayout(LayoutKind.Sequential)] |
| @@ -532,10 +532,11 @@ type OfflineTtsVitsModelConfig struct { | @@ -532,10 +532,11 @@ type OfflineTtsVitsModelConfig struct { | ||
| 532 | Model string // Path to the VITS onnx model | 532 | Model string // Path to the VITS onnx model |
| 533 | Lexicon string // Path to lexicon.txt | 533 | Lexicon string // Path to lexicon.txt |
| 534 | Tokens string // Path to tokens.txt | 534 | Tokens string // Path to tokens.txt |
| 535 | - DataDir string // Path to tokens.txt | 535 | + DataDir string // Path to espeak-ng-data directory |
| 536 | NoiseScale float32 // noise scale for vits models. Please use 0.667 in general | 536 | NoiseScale float32 // noise scale for vits models. Please use 0.667 in general |
| 537 | NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general | 537 | NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general |
| 538 | LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed | 538 | LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed |
| 539 | + DictDir string // Path to dict directory for jieba (used only in Chinese tts) | ||
| 539 | } | 540 | } |
| 540 | 541 | ||
| 541 | type OfflineTtsModelConfig struct { | 542 | type OfflineTtsModelConfig struct { |
| @@ -605,6 +606,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { | @@ -605,6 +606,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { | ||
| 605 | c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW) | 606 | c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW) |
| 606 | c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale) | 607 | c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale) |
| 607 | 608 | ||
| 609 | + c.model.vits.dict_dir = C.CString(config.Model.Vits.DictDir) | ||
| 610 | + defer C.free(unsafe.Pointer(c.model.vits.dict_dir)) | ||
| 611 | + | ||
| 608 | c.model.num_threads = C.int(config.Model.NumThreads) | 612 | c.model.num_threads = C.int(config.Model.NumThreads) |
| 609 | c.model.debug = C.int(config.Model.Debug) | 613 | c.model.debug = C.int(config.Model.Debug) |
| 610 | 614 |
| @@ -818,6 +818,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | @@ -818,6 +818,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | ||
| 818 | SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8); | 818 | SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8); |
| 819 | tts_config.model.vits.length_scale = | 819 | tts_config.model.vits.length_scale = |
| 820 | SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0); | 820 | SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0); |
| 821 | + tts_config.model.vits.dict_dir = | ||
| 822 | + SHERPA_ONNX_OR(config->model.vits.dict_dir, ""); | ||
| 821 | 823 | ||
| 822 | tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); | 824 | tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); |
| 823 | tts_config.model.debug = config->model.debug; | 825 | tts_config.model.debug = config->model.debug; |
| @@ -772,6 +772,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { | @@ -772,6 +772,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { | ||
| 772 | float noise_scale; | 772 | float noise_scale; |
| 773 | float noise_scale_w; | 773 | float noise_scale_w; |
| 774 | float length_scale; // < 1, faster in speed; > 1, slower in speed | 774 | float length_scale; // < 1, faster in speed; > 1, slower in speed |
| 775 | + const char *dict_dir; | ||
| 775 | } SherpaOnnxOfflineTtsVitsModelConfig; | 776 | } SherpaOnnxOfflineTtsVitsModelConfig; |
| 776 | 777 | ||
| 777 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { | 778 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { |
| @@ -623,7 +623,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | @@ -623,7 +623,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | ||
| 623 | dataDir: String = "", | 623 | dataDir: String = "", |
| 624 | noiseScale: Float = 0.667, | 624 | noiseScale: Float = 0.667, |
| 625 | noiseScaleW: Float = 0.8, | 625 | noiseScaleW: Float = 0.8, |
| 626 | - lengthScale: Float = 1.0 | 626 | + lengthScale: Float = 1.0, |
| 627 | + dictDir: String = "" | ||
| 627 | ) -> SherpaOnnxOfflineTtsVitsModelConfig { | 628 | ) -> SherpaOnnxOfflineTtsVitsModelConfig { |
| 628 | return SherpaOnnxOfflineTtsVitsModelConfig( | 629 | return SherpaOnnxOfflineTtsVitsModelConfig( |
| 629 | model: toCPointer(model), | 630 | model: toCPointer(model), |
| @@ -632,7 +633,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | @@ -632,7 +633,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | ||
| 632 | data_dir: toCPointer(dataDir), | 633 | data_dir: toCPointer(dataDir), |
| 633 | noise_scale: noiseScale, | 634 | noise_scale: noiseScale, |
| 634 | noise_scale_w: noiseScaleW, | 635 | noise_scale_w: noiseScaleW, |
| 635 | - length_scale: lengthScale) | 636 | + length_scale: lengthScale, |
| 637 | + dict_dir: toCPointer(dictDir)) | ||
| 636 | } | 638 | } |
| 637 | 639 | ||
| 638 | func sherpaOnnxOfflineTtsModelConfig( | 640 | func sherpaOnnxOfflineTtsModelConfig( |
| @@ -43,6 +43,7 @@ void PrintOfflineTtsConfig(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -43,6 +43,7 @@ void PrintOfflineTtsConfig(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 43 | fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale); | 43 | fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale); |
| 44 | fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w); | 44 | fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w); |
| 45 | fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); | 45 | fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); |
| 46 | + fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir); | ||
| 46 | 47 | ||
| 47 | fprintf(stdout, "----------tts model config----------\n"); | 48 | fprintf(stdout, "----------tts model config----------\n"); |
| 48 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); | 49 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); |
| @@ -18,7 +18,12 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | @@ -18,7 +18,12 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | ||
| 18 | const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; | 18 | const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; |
| 19 | const dataDirLen = Module.lengthBytesUTF8(config.dataDir) + 1; | 19 | const dataDirLen = Module.lengthBytesUTF8(config.dataDir) + 1; |
| 20 | 20 | ||
| 21 | - const n = modelLen + lexiconLen + tokensLen + dataDirLen; | 21 | + if (!('dictDir' in config)) { |
| 22 | + config.dictDir = '' | ||
| 23 | + } | ||
| 24 | + const dictDirLen = Module.lengthBytesUTF8(config.dictDir) + 1; | ||
| 25 | + | ||
| 26 | + const n = modelLen + lexiconLen + tokensLen + dataDirLen + dictDirLen; | ||
| 22 | 27 | ||
| 23 | const buffer = Module._malloc(n); | 28 | const buffer = Module._malloc(n); |
| 24 | 29 | ||
| @@ -38,6 +43,9 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | @@ -38,6 +43,9 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | ||
| 38 | Module.stringToUTF8(config.dataDir, buffer + offset, dataDirLen); | 43 | Module.stringToUTF8(config.dataDir, buffer + offset, dataDirLen); |
| 39 | offset += dataDirLen; | 44 | offset += dataDirLen; |
| 40 | 45 | ||
| 46 | + Module.stringToUTF8(config.dictDir, buffer + offset, dictDirLen); | ||
| 47 | + offset += dictDirLen; | ||
| 48 | + | ||
| 41 | offset = 0; | 49 | offset = 0; |
| 42 | Module.setValue(ptr, buffer + offset, 'i8*'); | 50 | Module.setValue(ptr, buffer + offset, 'i8*'); |
| 43 | offset += modelLen; | 51 | offset += modelLen; |
| @@ -54,6 +62,8 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | @@ -54,6 +62,8 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | ||
| 54 | Module.setValue(ptr + 16, config.noiseScale, 'float'); | 62 | Module.setValue(ptr + 16, config.noiseScale, 'float'); |
| 55 | Module.setValue(ptr + 20, config.noiseScaleW, 'float'); | 63 | Module.setValue(ptr + 20, config.noiseScaleW, 'float'); |
| 56 | Module.setValue(ptr + 24, config.lengthScale, 'float'); | 64 | Module.setValue(ptr + 24, config.lengthScale, 'float'); |
| 65 | + Module.setValue(ptr + 28, buffer + offset, 'i8*'); | ||
| 66 | + offset += dictDirLen; | ||
| 57 | 67 | ||
| 58 | return { | 68 | return { |
| 59 | buffer: buffer, ptr: ptr, len: len, | 69 | buffer: buffer, ptr: ptr, len: len, |
| @@ -184,6 +194,7 @@ function createOfflineTts(Module, myConfig) { | @@ -184,6 +194,7 @@ function createOfflineTts(Module, myConfig) { | ||
| 184 | lexicon: '', | 194 | lexicon: '', |
| 185 | tokens: './tokens.txt', | 195 | tokens: './tokens.txt', |
| 186 | dataDir: './espeak-ng-data', | 196 | dataDir: './espeak-ng-data', |
| 197 | + dictDir: '', | ||
| 187 | noiseScale: 0.667, | 198 | noiseScale: 0.667, |
| 188 | noiseScaleW: 0.8, | 199 | noiseScaleW: 0.8, |
| 189 | lengthScale: 1.0, | 200 | lengthScale: 1.0, |
| @@ -13,7 +13,7 @@ | @@ -13,7 +13,7 @@ | ||
| 13 | 13 | ||
| 14 | extern "C" { | 14 | extern "C" { |
| 15 | 15 | ||
| 16 | -static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 7 * 4, ""); | 16 | +static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); |
| 17 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == | 17 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == |
| 18 | sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, | 18 | sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, |
| 19 | ""); | 19 | ""); |
| @@ -32,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -32,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 32 | fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale); | 32 | fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale); |
| 33 | fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w); | 33 | fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w); |
| 34 | fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); | 34 | fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); |
| 35 | + fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir); | ||
| 35 | 36 | ||
| 36 | fprintf(stdout, "----------tts model config----------\n"); | 37 | fprintf(stdout, "----------tts model config----------\n"); |
| 37 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); | 38 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); |
-
请注册或登录后发表评论