Fangjun Kuang
Committed by GitHub

Add dict_dir arg to the C API to support Chinese TTS models using jieba (#809)

1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.9.22") 4 +set(SHERPA_ONNX_VERSION "1.9.23")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -470,6 +470,19 @@ void CNonStreamingTextToSpeechDlg::Init() { @@ -470,6 +470,19 @@ void CNonStreamingTextToSpeechDlg::Init() {
470 } else if (Exists("./lexicon.txt")) { 470 } else if (Exists("./lexicon.txt")) {
471 config.model.vits.lexicon = "./lexicon.txt"; 471 config.model.vits.lexicon = "./lexicon.txt";
472 } 472 }
  473 +
  474 + if (Exists("./dict/jieba.dict.utf8")) {
  475 + config.model.vits.dict_dir = "./dict";
  476 + }
  477 +
  478 + if (Exists("./phone.fst") && Exists("./date.fst") && Exists("./number.fst")) {
  479 + config.rule_fsts = "./phone.fst,./date.fst,number.fst";
  480 + }
  481 +
  482 + if (Exists("./rule.far")) {
  483 + config.rule_fars = "./rule.far";
  484 + }
  485 +
473 config.model.vits.tokens = "./tokens.txt"; 486 config.model.vits.tokens = "./tokens.txt";
474 487
475 tts_ = SherpaOnnxCreateOfflineTts(&config); 488 tts_ = SherpaOnnxCreateOfflineTts(&config);
@@ -8,6 +8,7 @@ function createOfflineTts() { @@ -8,6 +8,7 @@ function createOfflineTts() {
8 lexicon: '', 8 lexicon: '',
9 tokens: './vits-piper-en_US-amy-low/tokens.txt', 9 tokens: './vits-piper-en_US-amy-low/tokens.txt',
10 dataDir: './vits-piper-en_US-amy-low/espeak-ng-data', 10 dataDir: './vits-piper-en_US-amy-low/espeak-ng-data',
  11 + dictDir: '',
11 noiseScale: 0.667, 12 noiseScale: 0.667,
12 noiseScaleW: 0.8, 13 noiseScaleW: 0.8,
13 lengthScale: 1.0, 14 lengthScale: 1.0,
@@ -8,6 +8,7 @@ function createOfflineTts() { @@ -8,6 +8,7 @@ function createOfflineTts() {
8 lexicon: './vits-icefall-zh-aishell3/lexicon.txt', 8 lexicon: './vits-icefall-zh-aishell3/lexicon.txt',
9 tokens: './vits-icefall-zh-aishell3/tokens.txt', 9 tokens: './vits-icefall-zh-aishell3/tokens.txt',
10 dataDir: '', 10 dataDir: '',
  11 + dictDir: '',
11 noiseScale: 0.667, 12 noiseScale: 0.667,
12 noiseScaleW: 0.8, 13 noiseScaleW: 0.8,
13 lengthScale: 1.0, 14 lengthScale: 1.0,
@@ -23,6 +23,8 @@ namespace SherpaOnnx @@ -23,6 +23,8 @@ namespace SherpaOnnx
23 NoiseScale = 0.667F; 23 NoiseScale = 0.667F;
24 NoiseScaleW = 0.8F; 24 NoiseScaleW = 0.8F;
25 LengthScale = 1.0F; 25 LengthScale = 1.0F;
  26 +
  27 + DictDir = "";
26 } 28 }
27 [MarshalAs(UnmanagedType.LPStr)] 29 [MarshalAs(UnmanagedType.LPStr)]
28 public string Model; 30 public string Model;
@@ -39,6 +41,9 @@ namespace SherpaOnnx @@ -39,6 +41,9 @@ namespace SherpaOnnx
39 public float NoiseScale; 41 public float NoiseScale;
40 public float NoiseScaleW; 42 public float NoiseScaleW;
41 public float LengthScale; 43 public float LengthScale;
  44 +
  45 + [MarshalAs(UnmanagedType.LPStr)]
  46 + public string DictDir;
42 } 47 }
43 48
44 [StructLayout(LayoutKind.Sequential)] 49 [StructLayout(LayoutKind.Sequential)]
@@ -532,10 +532,11 @@ type OfflineTtsVitsModelConfig struct { @@ -532,10 +532,11 @@ type OfflineTtsVitsModelConfig struct {
532 Model string // Path to the VITS onnx model 532 Model string // Path to the VITS onnx model
533 Lexicon string // Path to lexicon.txt 533 Lexicon string // Path to lexicon.txt
534 Tokens string // Path to tokens.txt 534 Tokens string // Path to tokens.txt
535 - DataDir string // Path to tokens.txt 535 + DataDir string // Path to espeak-ng-data directory
536 NoiseScale float32 // noise scale for vits models. Please use 0.667 in general 536 NoiseScale float32 // noise scale for vits models. Please use 0.667 in general
537 NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general 537 NoiseScaleW float32 // noise scale for vits models. Please use 0.8 in general
538 LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed 538 LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. Larger -> Slower speech speed
  539 + DictDir string // Path to dict directory for jieba (used only in Chinese tts)
539 } 540 }
540 541
541 type OfflineTtsModelConfig struct { 542 type OfflineTtsModelConfig struct {
@@ -605,6 +606,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { @@ -605,6 +606,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
605 c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW) 606 c.model.vits.noise_scale_w = C.float(config.Model.Vits.NoiseScaleW)
606 c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale) 607 c.model.vits.length_scale = C.float(config.Model.Vits.LengthScale)
607 608
  609 + c.model.vits.dict_dir = C.CString(config.Model.Vits.DictDir)
  610 + defer C.free(unsafe.Pointer(c.model.vits.dict_dir))
  611 +
608 c.model.num_threads = C.int(config.Model.NumThreads) 612 c.model.num_threads = C.int(config.Model.NumThreads)
609 c.model.debug = C.int(config.Model.Debug) 613 c.model.debug = C.int(config.Model.Debug)
610 614
@@ -818,6 +818,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( @@ -818,6 +818,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
818 SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8); 818 SHERPA_ONNX_OR(config->model.vits.noise_scale_w, 0.8);
819 tts_config.model.vits.length_scale = 819 tts_config.model.vits.length_scale =
820 SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0); 820 SHERPA_ONNX_OR(config->model.vits.length_scale, 1.0);
  821 + tts_config.model.vits.dict_dir =
  822 + SHERPA_ONNX_OR(config->model.vits.dict_dir, "");
821 823
822 tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); 824 tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
823 tts_config.model.debug = config->model.debug; 825 tts_config.model.debug = config->model.debug;
@@ -772,6 +772,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { @@ -772,6 +772,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig {
772 float noise_scale; 772 float noise_scale;
773 float noise_scale_w; 773 float noise_scale_w;
774 float length_scale; // < 1, faster in speed; > 1, slower in speed 774 float length_scale; // < 1, faster in speed; > 1, slower in speed
  775 + const char *dict_dir;
775 } SherpaOnnxOfflineTtsVitsModelConfig; 776 } SherpaOnnxOfflineTtsVitsModelConfig;
776 777
777 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { 778 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
@@ -623,7 +623,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig( @@ -623,7 +623,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
623 dataDir: String = "", 623 dataDir: String = "",
624 noiseScale: Float = 0.667, 624 noiseScale: Float = 0.667,
625 noiseScaleW: Float = 0.8, 625 noiseScaleW: Float = 0.8,
626 - lengthScale: Float = 1.0 626 + lengthScale: Float = 1.0,
  627 + dictDir: String = ""
627 ) -> SherpaOnnxOfflineTtsVitsModelConfig { 628 ) -> SherpaOnnxOfflineTtsVitsModelConfig {
628 return SherpaOnnxOfflineTtsVitsModelConfig( 629 return SherpaOnnxOfflineTtsVitsModelConfig(
629 model: toCPointer(model), 630 model: toCPointer(model),
@@ -632,7 +633,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig( @@ -632,7 +633,8 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
632 data_dir: toCPointer(dataDir), 633 data_dir: toCPointer(dataDir),
633 noise_scale: noiseScale, 634 noise_scale: noiseScale,
634 noise_scale_w: noiseScaleW, 635 noise_scale_w: noiseScaleW,
635 - length_scale: lengthScale) 636 + length_scale: lengthScale,
  637 + dict_dir: toCPointer(dictDir))
636 } 638 }
637 639
638 func sherpaOnnxOfflineTtsModelConfig( 640 func sherpaOnnxOfflineTtsModelConfig(
@@ -43,6 +43,7 @@ void PrintOfflineTtsConfig(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -43,6 +43,7 @@ void PrintOfflineTtsConfig(SherpaOnnxOfflineTtsConfig *tts_config) {
43 fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale); 43 fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale);
44 fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w); 44 fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w);
45 fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); 45 fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
  46 + fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
46 47
47 fprintf(stdout, "----------tts model config----------\n"); 48 fprintf(stdout, "----------tts model config----------\n");
48 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); 49 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
@@ -18,7 +18,12 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { @@ -18,7 +18,12 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
18 const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; 18 const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1;
19 const dataDirLen = Module.lengthBytesUTF8(config.dataDir) + 1; 19 const dataDirLen = Module.lengthBytesUTF8(config.dataDir) + 1;
20 20
21 - const n = modelLen + lexiconLen + tokensLen + dataDirLen; 21 + if (!('dictDir' in config)) {
  22 + config.dictDir = ''
  23 + }
  24 + const dictDirLen = Module.lengthBytesUTF8(config.dictDir) + 1;
  25 +
  26 + const n = modelLen + lexiconLen + tokensLen + dataDirLen + dictDirLen;
22 27
23 const buffer = Module._malloc(n); 28 const buffer = Module._malloc(n);
24 29
@@ -38,6 +43,9 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { @@ -38,6 +43,9 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
38 Module.stringToUTF8(config.dataDir, buffer + offset, dataDirLen); 43 Module.stringToUTF8(config.dataDir, buffer + offset, dataDirLen);
39 offset += dataDirLen; 44 offset += dataDirLen;
40 45
  46 + Module.stringToUTF8(config.dictDir, buffer + offset, dictDirLen);
  47 + offset += dictDirLen;
  48 +
41 offset = 0; 49 offset = 0;
42 Module.setValue(ptr, buffer + offset, 'i8*'); 50 Module.setValue(ptr, buffer + offset, 'i8*');
43 offset += modelLen; 51 offset += modelLen;
@@ -54,6 +62,8 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { @@ -54,6 +62,8 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
54 Module.setValue(ptr + 16, config.noiseScale, 'float'); 62 Module.setValue(ptr + 16, config.noiseScale, 'float');
55 Module.setValue(ptr + 20, config.noiseScaleW, 'float'); 63 Module.setValue(ptr + 20, config.noiseScaleW, 'float');
56 Module.setValue(ptr + 24, config.lengthScale, 'float'); 64 Module.setValue(ptr + 24, config.lengthScale, 'float');
  65 + Module.setValue(ptr + 28, buffer + offset, 'i8*');
  66 + offset += dictDirLen;
57 67
58 return { 68 return {
59 buffer: buffer, ptr: ptr, len: len, 69 buffer: buffer, ptr: ptr, len: len,
@@ -184,6 +194,7 @@ function createOfflineTts(Module, myConfig) { @@ -184,6 +194,7 @@ function createOfflineTts(Module, myConfig) {
184 lexicon: '', 194 lexicon: '',
185 tokens: './tokens.txt', 195 tokens: './tokens.txt',
186 dataDir: './espeak-ng-data', 196 dataDir: './espeak-ng-data',
  197 + dictDir: '',
187 noiseScale: 0.667, 198 noiseScale: 0.667,
188 noiseScaleW: 0.8, 199 noiseScaleW: 0.8,
189 lengthScale: 1.0, 200 lengthScale: 1.0,
@@ -13,7 +13,7 @@ @@ -13,7 +13,7 @@
13 13
14 extern "C" { 14 extern "C" {
15 15
16 -static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 7 * 4, ""); 16 +static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
17 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == 17 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
18 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, 18 sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4,
19 ""); 19 "");
@@ -32,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -32,6 +32,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
32 fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale); 32 fprintf(stdout, "noise scale: %.3f\n", vits_model_config->noise_scale);
33 fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w); 33 fprintf(stdout, "noise scale w: %.3f\n", vits_model_config->noise_scale_w);
34 fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); 34 fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
  35 + fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
35 36
36 fprintf(stdout, "----------tts model config----------\n"); 37 fprintf(stdout, "----------tts model config----------\n");
37 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); 38 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);