Fangjun Kuang
Committed by GitHub

Add Android demo for MatchaTTS models. (#1683)

@@ -26,6 +26,7 @@ jobs: @@ -26,6 +26,7 @@ jobs:
26 total: ["40"] 26 total: ["40"]
27 index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39"] 27 index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39"]
28 28
  29 +
29 steps: 30 steps:
30 - uses: actions/checkout@v4 31 - uses: actions/checkout@v4
31 with: 32 with:
@@ -183,6 +183,8 @@ class MainActivity : AppCompatActivity() { @@ -183,6 +183,8 @@ class MainActivity : AppCompatActivity() {
183 private fun initTts() { 183 private fun initTts() {
184 var modelDir: String? 184 var modelDir: String?
185 var modelName: String? 185 var modelName: String?
  186 + var acousticModelName: String?
  187 + var vocoder: String?
186 var ruleFsts: String? 188 var ruleFsts: String?
187 var ruleFars: String? 189 var ruleFars: String?
188 var lexicon: String? 190 var lexicon: String?
@@ -193,8 +195,18 @@ class MainActivity : AppCompatActivity() { @@ -193,8 +195,18 @@ class MainActivity : AppCompatActivity() {
193 // The purpose of such a design is to make the CI test easier 195 // The purpose of such a design is to make the CI test easier
194 // Please see 196 // Please see
195 // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py 197 // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py
196 - modelDir = null 198 +
  199 + // VITS -- begin
197 modelName = null 200 modelName = null
  201 + // VITS -- end
  202 +
  203 + // Matcha -- begin
  204 + acousticModelName = null
  205 + vocoder = null
  206 + // Matcha -- end
  207 +
  208 +
  209 + modelDir = null
198 ruleFsts = null 210 ruleFsts = null
199 ruleFars = null 211 ruleFars = null
200 lexicon = null 212 lexicon = null
@@ -217,7 +229,6 @@ class MainActivity : AppCompatActivity() { @@ -217,7 +229,6 @@ class MainActivity : AppCompatActivity() {
217 // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 229 // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
218 // modelDir = "vits-icefall-zh-aishell3" 230 // modelDir = "vits-icefall-zh-aishell3"
219 // modelName = "model.onnx" 231 // modelName = "model.onnx"
220 - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst"  
221 // ruleFars = "vits-icefall-zh-aishell3/rule.far" 232 // ruleFars = "vits-icefall-zh-aishell3/rule.far"
222 // lexicon = "lexicon.txt" 233 // lexicon = "lexicon.txt"
223 234
@@ -233,24 +244,47 @@ class MainActivity : AppCompatActivity() { @@ -233,24 +244,47 @@ class MainActivity : AppCompatActivity() {
233 // modelDir = "vits-coqui-de-css10" 244 // modelDir = "vits-coqui-de-css10"
234 // modelName = "model.onnx" 245 // modelName = "model.onnx"
235 246
  247 + // Example 6
  248 + // vits-melo-tts-zh_en
  249 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-melo-tts-zh-en-chinese-english-1-speaker
  250 + // modelDir = "vits-melo-tts-zh_en"
  251 + // modelName = "model.onnx"
  252 + // lexicon = "lexicon.txt"
  253 + // dictDir = "vits-melo-tts-zh_en/dict"
  254 +
  255 + // Example 7
  256 + // matcha-icefall-zh-baker
  257 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  258 + // modelDir = "matcha-icefall-zh-baker"
  259 + // acousticModelName = "model-steps-3.onnx"
  260 + // vocoder = "hifigan_v2.onnx"
  261 + // lexicon = "lexicon.txt"
  262 + // dictDir = "matcha-icefall-zh-baker/dict"
  263 +
  264 + // Example 8
  265 + // matcha-icefall-en_US-ljspeech
  266 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  267 + // modelDir = "matcha-icefall-en_US-ljspeech"
  268 + // acousticModelName = "model-steps-3.onnx"
  269 + // vocoder = "hifigan_v2.onnx"
  270 + // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data"
  271 +
236 if (dataDir != null) { 272 if (dataDir != null) {
237 - val newDir = copyDataDir(modelDir!!)  
238 - modelDir = newDir + "/" + modelDir  
239 - dataDir = newDir + "/" + dataDir  
240 - assets = null 273 + val newDir = copyDataDir(dataDir!!)
  274 + dataDir = "$newDir/$dataDir"
241 } 275 }
242 276
243 if (dictDir != null) { 277 if (dictDir != null) {
244 - val newDir = copyDataDir(modelDir!!)  
245 - modelDir = newDir + "/" + modelDir  
246 - dictDir = modelDir + "/" + "dict" 278 + val newDir = copyDataDir(dictDir!!)
  279 + dictDir = "$newDir/$dictDir"
247 ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" 280 ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst"
248 - assets = null  
249 } 281 }
250 282
251 val config = getOfflineTtsConfig( 283 val config = getOfflineTtsConfig(
252 modelDir = modelDir!!, 284 modelDir = modelDir!!,
253 - modelName = modelName!!, 285 + modelName = modelName ?: "",
  286 + acousticModelName = acousticModelName ?: "",
  287 + vocoder = vocoder ?: "",
254 lexicon = lexicon ?: "", 288 lexicon = lexicon ?: "",
255 dataDir = dataDir ?: "", 289 dataDir = dataDir ?: "",
256 dictDir = dictDir ?: "", 290 dictDir = dictDir ?: "",
@@ -57,7 +57,7 @@ class MainActivity : ComponentActivity() { @@ -57,7 +57,7 @@ class MainActivity : ComponentActivity() {
57 color = MaterialTheme.colorScheme.background 57 color = MaterialTheme.colorScheme.background
58 ) { 58 ) {
59 Scaffold(topBar = { 59 Scaffold(topBar = {
60 - TopAppBar(title = { Text("Next-gen Kaldi: TTS") }) 60 + TopAppBar(title = { Text("Next-gen Kaldi: TTS Engine") })
61 }) { 61 }) {
62 Box(modifier = Modifier.padding(it)) { 62 Box(modifier = Modifier.padding(it)) {
63 Column(modifier = Modifier.padding(16.dp)) { 63 Column(modifier = Modifier.padding(16.dp)) {
@@ -65,8 +65,8 @@ class MainActivity : ComponentActivity() { @@ -65,8 +65,8 @@ class MainActivity : ComponentActivity() {
65 Text("Speed " + String.format("%.1f", TtsEngine.speed)) 65 Text("Speed " + String.format("%.1f", TtsEngine.speed))
66 Slider( 66 Slider(
67 value = TtsEngine.speedState.value, 67 value = TtsEngine.speedState.value,
68 - onValueChange = {  
69 - TtsEngine.speed = it 68 + onValueChange = {
  69 + TtsEngine.speed = it
70 preferenceHelper.setSpeed(it) 70 preferenceHelper.setSpeed(it)
71 }, 71 },
72 valueRange = 0.2F..3.0F, 72 valueRange = 0.2F..3.0F,
@@ -138,7 +138,9 @@ class MainActivity : ComponentActivity() { @@ -138,7 +138,9 @@ class MainActivity : ComponentActivity() {
138 val filename = 138 val filename =
139 application.filesDir.absolutePath + "/generated.wav" 139 application.filesDir.absolutePath + "/generated.wav"
140 val ok = 140 val ok =
141 - audio.samples.isNotEmpty() && audio.save(filename) 141 + audio.samples.isNotEmpty() && audio.save(
  142 + filename
  143 + )
142 144
143 if (ok) { 145 if (ok) {
144 stopMediaPlayer() 146 stopMediaPlayer()
1 package com.k2fsa.sherpa.onnx.tts.engine 1 package com.k2fsa.sherpa.onnx.tts.engine
2 2
  3 +import PreferenceHelper
3 import android.content.Context 4 import android.content.Context
4 import android.content.res.AssetManager 5 import android.content.res.AssetManager
5 import android.util.Log 6 import android.util.Log
@@ -11,7 +12,6 @@ import com.k2fsa.sherpa.onnx.getOfflineTtsConfig @@ -11,7 +12,6 @@ import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
11 import java.io.File 12 import java.io.File
12 import java.io.FileOutputStream 13 import java.io.FileOutputStream
13 import java.io.IOException 14 import java.io.IOException
14 -import PreferenceHelper  
15 15
16 object TtsEngine { 16 object TtsEngine {
17 var tts: OfflineTts? = null 17 var tts: OfflineTts? = null
@@ -41,6 +41,8 @@ object TtsEngine { @@ -41,6 +41,8 @@ object TtsEngine {
41 41
42 private var modelDir: String? = null 42 private var modelDir: String? = null
43 private var modelName: String? = null 43 private var modelName: String? = null
  44 + private var acousticModelName: String? = null
  45 + private var vocoder: String? = null
44 private var ruleFsts: String? = null 46 private var ruleFsts: String? = null
45 private var ruleFars: String? = null 47 private var ruleFars: String? = null
46 private var lexicon: String? = null 48 private var lexicon: String? = null
@@ -52,8 +54,17 @@ object TtsEngine { @@ -52,8 +54,17 @@ object TtsEngine {
52 // The purpose of such a design is to make the CI test easier 54 // The purpose of such a design is to make the CI test easier
53 // Please see 55 // Please see
54 // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py 56 // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py
55 - modelDir = null 57 + //
  58 + // For VITS -- begin
56 modelName = null 59 modelName = null
  60 + // For VITS -- end
  61 +
  62 + // For Matcha -- begin
  63 + acousticModelName = null
  64 + vocoder = null
  65 + // For Matcha -- end
  66 +
  67 + modelDir = null
57 ruleFsts = null 68 ruleFsts = null
58 ruleFars = null 69 ruleFars = null
59 lexicon = null 70 lexicon = null
@@ -82,7 +93,6 @@ object TtsEngine { @@ -82,7 +93,6 @@ object TtsEngine {
82 // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 93 // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
83 // modelDir = "vits-icefall-zh-aishell3" 94 // modelDir = "vits-icefall-zh-aishell3"
84 // modelName = "model.onnx" 95 // modelName = "model.onnx"
85 - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst"  
86 // ruleFars = "vits-icefall-zh-aishell3/rule.far" 96 // ruleFars = "vits-icefall-zh-aishell3/rule.far"
87 // lexicon = "lexicon.txt" 97 // lexicon = "lexicon.txt"
88 // lang = "zho" 98 // lang = "zho"
@@ -101,8 +111,35 @@ object TtsEngine { @@ -101,8 +111,35 @@ object TtsEngine {
101 // modelDir = "vits-coqui-de-css10" 111 // modelDir = "vits-coqui-de-css10"
102 // modelName = "model.onnx" 112 // modelName = "model.onnx"
103 // lang = "deu" 113 // lang = "deu"
104 - }  
105 114
  115 + // Example 6
  116 + // vits-melo-tts-zh_en
  117 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-melo-tts-zh-en-chinese-english-1-speaker
  118 + // modelDir = "vits-melo-tts-zh_en"
  119 + // modelName = "model.onnx"
  120 + // lexicon = "lexicon.txt"
  121 + // dictDir = "vits-melo-tts-zh_en/dict"
  122 + // lang = "zho"
  123 +
  124 + // Example 7
  125 + // matcha-icefall-zh-baker
  126 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  127 + // modelDir = "matcha-icefall-zh-baker"
  128 + // acousticModelName = "model-steps-3.onnx"
  129 + // vocoder = "hifigan_v2.onnx"
  130 + // lexicon = "lexicon.txt"
  131 + // dictDir = "matcha-icefall-zh-baker/dict"
  132 + // lang = "zho"
  133 +
  134 + // Example 8
  135 + // matcha-icefall-en_US-ljspeech
  136 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  137 + // modelDir = "matcha-icefall-en_US-ljspeech"
  138 + // acousticModelName = "model-steps-3.onnx"
  139 + // vocoder = "hifigan_v2.onnx"
  140 + // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data"
  141 + // lang = "eng"
  142 + }
106 143
107 fun createTts(context: Context) { 144 fun createTts(context: Context) {
108 Log.i(TAG, "Init Next-gen Kaldi TTS") 145 Log.i(TAG, "Init Next-gen Kaldi TTS")
@@ -115,22 +152,22 @@ object TtsEngine { @@ -115,22 +152,22 @@ object TtsEngine {
115 assets = context.assets 152 assets = context.assets
116 153
117 if (dataDir != null) { 154 if (dataDir != null) {
118 - val newDir = copyDataDir(context, modelDir!!)  
119 - modelDir = "$newDir/$modelDir" 155 + val newDir = copyDataDir(context, dataDir!!)
120 dataDir = "$newDir/$dataDir" 156 dataDir = "$newDir/$dataDir"
121 - assets = null  
122 } 157 }
123 158
124 if (dictDir != null) { 159 if (dictDir != null) {
125 - val newDir = copyDataDir(context, modelDir!!)  
126 - modelDir = "$newDir/$modelDir"  
127 - dictDir = "$modelDir/dict" 160 + val newDir = copyDataDir(context, dictDir!!)
  161 + dictDir = "$newDir/$dictDir"
128 ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" 162 ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst"
129 - assets = null  
130 } 163 }
131 164
132 val config = getOfflineTtsConfig( 165 val config = getOfflineTtsConfig(
133 - modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", 166 + modelDir = modelDir!!,
  167 + modelName = modelName ?: "",
  168 + acousticModelName = acousticModelName ?: "",
  169 + vocoder = vocoder ?: "",
  170 + lexicon = lexicon ?: "",
134 dataDir = dataDir ?: "", 171 dataDir = dataDir ?: "",
135 dictDir = dictDir ?: "", 172 dictDir = dictDir ?: "",
136 ruleFsts = ruleFsts ?: "", 173 ruleFsts = ruleFsts ?: "",
@@ -37,6 +37,8 @@ mkdir -p apks @@ -37,6 +37,8 @@ mkdir -p apks
37 pushd ./android/SherpaOnnxTtsEngine/app/src/main/assets/ 37 pushd ./android/SherpaOnnxTtsEngine/app/src/main/assets/
38 model_dir={{ tts_model.model_dir }} 38 model_dir={{ tts_model.model_dir }}
39 model_name={{ tts_model.model_name }} 39 model_name={{ tts_model.model_name }}
  40 +acoustic_model_name={{ tts_model.acoustic_model_name }}
  41 +vocoder={{ tts_model.vocoder }}
40 lang={{ tts_model.lang }} 42 lang={{ tts_model.lang }}
41 lang_iso_639_3={{ tts_model.lang_iso_639_3 }} 43 lang_iso_639_3={{ tts_model.lang_iso_639_3 }}
42 44
@@ -44,15 +46,30 @@ wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$mod @@ -44,15 +46,30 @@ wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$mod
44 tar xf $model_dir.tar.bz2 46 tar xf $model_dir.tar.bz2
45 rm $model_dir.tar.bz2 47 rm $model_dir.tar.bz2
46 48
  49 +{% if tts_model.vocoder %}
  50 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/$vocoder
  51 +{% endif %}
  52 +
47 popd 53 popd
48 # Now we are at the project root directory 54 # Now we are at the project root directory
49 55
50 git checkout . 56 git checkout .
51 pushd android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine 57 pushd android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine
52 sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./TtsEngine.kt 58 sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./TtsEngine.kt
53 -sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./TtsEngine.kt  
54 sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt 59 sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt
55 60
  61 +{% if tts_model.model_name %}
  62 + sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./TtsEngine.kt
  63 +{% endif %}
  64 +
  65 +{% if tts_model.acoustic_model_name %}
  66 + sed -i.bak s/"acousticModelName = null"/"acousticModelName = \"$acoustic_model_name\""/ ./TtsEngine.kt
  67 +{% endif %}
  68 +
  69 +{% if tts_model.vocoder %}
  70 + sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ ./TtsEngine.kt
  71 +{% endif %}
  72 +
56 {% if tts_model.rule_fsts %} 73 {% if tts_model.rule_fsts %}
57 rule_fsts={{ tts_model.rule_fsts }} 74 rule_fsts={{ tts_model.rule_fsts }}
58 sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt 75 sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt
@@ -109,6 +126,7 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do @@ -109,6 +126,7 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do
109 done 126 done
110 127
111 rm -rf ./android/SherpaOnnxTtsEngine/app/src/main/assets/$model_dir 128 rm -rf ./android/SherpaOnnxTtsEngine/app/src/main/assets/$model_dir
  129 +rm -fv ./android/SherpaOnnxTtsEngine/app/src/main/assets/*.onnx
112 {% endfor %} 130 {% endfor %}
113 131
114 git checkout . 132 git checkout .
@@ -37,19 +37,38 @@ mkdir -p apks @@ -37,19 +37,38 @@ mkdir -p apks
37 pushd ./android/SherpaOnnxTts/app/src/main/assets/ 37 pushd ./android/SherpaOnnxTts/app/src/main/assets/
38 model_dir={{ tts_model.model_dir }} 38 model_dir={{ tts_model.model_dir }}
39 model_name={{ tts_model.model_name }} 39 model_name={{ tts_model.model_name }}
  40 +acoustic_model_name={{ tts_model.acoustic_model_name }}
  41 +vocoder={{ tts_model.vocoder }}
40 lang={{ tts_model.lang }} 42 lang={{ tts_model.lang }}
41 43
42 wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2 44 wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2
43 tar xf $model_dir.tar.bz2 45 tar xf $model_dir.tar.bz2
44 rm $model_dir.tar.bz2 46 rm $model_dir.tar.bz2
45 47
  48 +{% if tts_model.vocoder %}
  49 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/$vocoder
  50 +{% endif %}
  51 +
46 popd 52 popd
47 # Now we are at the project root directory 53 # Now we are at the project root directory
48 54
49 git checkout . 55 git checkout .
50 pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx 56 pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx
51 sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt 57 sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt
52 -sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt 58 +
  59 +
  60 +{% if tts_model.model_name %}
  61 + sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt
  62 +{% endif %}
  63 +
  64 +{% if tts_model.acoustic_model_name %}
  65 + sed -i.bak s/"acousticModelName = null"/"acousticModelName = \"$acoustic_model_name\""/ ./MainActivity.kt
  66 +{% endif %}
  67 +
  68 +{% if tts_model.vocoder %}
  69 + sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ ./MainActivity.kt
  70 +{% endif %}
  71 +
53 72
54 {% if tts_model.rule_fsts %} 73 {% if tts_model.rule_fsts %}
55 rule_fsts={{ tts_model.rule_fsts }} 74 rule_fsts={{ tts_model.rule_fsts }}
@@ -107,6 +126,8 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do @@ -107,6 +126,8 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do
107 done 126 done
108 127
109 rm -rf ./android/SherpaOnnxTts/app/src/main/assets/$model_dir 128 rm -rf ./android/SherpaOnnxTts/app/src/main/assets/$model_dir
  129 +rm -fv ./android/SherpaOnnxTts/app/src/main/assets/*.onnx
  130 +
110 {% endfor %} 131 {% endfor %}
111 132
112 git checkout . 133 git checkout .
@@ -30,7 +30,9 @@ def get_args(): @@ -30,7 +30,9 @@ def get_args():
30 @dataclass 30 @dataclass
31 class TtsModel: 31 class TtsModel:
32 model_dir: str 32 model_dir: str
33 - model_name: str = "" 33 + model_name: str = "" # for vits
  34 + acoustic_model_name: str = "" # for matcha
  35 + vocoder: str = "" # for matcha
34 lang: str = "" # en, zh, fr, de, etc. 36 lang: str = "" # en, zh, fr, de, etc.
35 rule_fsts: Optional[List[str]] = None 37 rule_fsts: Optional[List[str]] = None
36 rule_fars: Optional[List[str]] = None 38 rule_fars: Optional[List[str]] = None
@@ -378,6 +380,35 @@ def get_vits_models() -> List[TtsModel]: @@ -378,6 +380,35 @@ def get_vits_models() -> List[TtsModel]:
378 return all_models 380 return all_models
379 381
380 382
  383 +def get_matcha_models() -> List[TtsModel]:
  384 + chinese_models = [
  385 + TtsModel(
  386 + model_dir="matcha-icefall-zh-baker",
  387 + acoustic_model_name="model-steps-3.onnx",
  388 + lang="zh",
  389 + )
  390 + ]
  391 + rule_fsts = ["phone.fst", "date.fst", "number.fst"]
  392 + for m in chinese_models:
  393 + s = [f"{m.model_dir}/{r}" for r in rule_fsts]
  394 + m.rule_fsts = ",".join(s)
  395 + m.dict_dir = m.model_dir + "/dict"
  396 + m.vocoder = "hifigan_v2.onnx"
  397 +
  398 + english_models = [
  399 + TtsModel(
  400 + model_dir="matcha-icefall-en_US-ljspeech",
  401 + acoustic_model_name="model-steps-3.onnx",
  402 + lang="en",
  403 + )
  404 + ]
  405 + for m in english_models:
  406 + m.data_dir = f"{m.model_dir}/espeak-ng-data"
  407 + m.vocoder = "hifigan_v2.onnx"
  408 +
  409 + return chinese_models + english_models
  410 +
  411 +
381 def main(): 412 def main():
382 args = get_args() 413 args = get_args()
383 index = args.index 414 index = args.index
@@ -389,7 +420,10 @@ def main(): @@ -389,7 +420,10 @@ def main():
389 all_model_list += get_piper_models() 420 all_model_list += get_piper_models()
390 all_model_list += get_mimic3_models() 421 all_model_list += get_mimic3_models()
391 all_model_list += get_coqui_models() 422 all_model_list += get_coqui_models()
  423 + all_model_list += get_matcha_models()
  424 +
392 convert_lang_to_iso_639_3(all_model_list) 425 convert_lang_to_iso_639_3(all_model_list)
  426 + print(all_model_list)
393 427
394 num_models = len(all_model_list) 428 num_models = len(all_model_list)
395 429
@@ -348,6 +348,10 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -348,6 +348,10 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
348 mgr, config_.model.vits.lexicon, config_.model.vits.tokens, 348 mgr, config_.model.vits.lexicon, config_.model.vits.tokens,
349 config_.model.vits.dict_dir, model_->GetMetaData(), 349 config_.model.vits.dict_dir, model_->GetMetaData(),
350 config_.model.debug); 350 config_.model.debug);
  351 + } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) {
  352 + frontend_ = std::make_unique<JiebaLexicon>(
  353 + mgr, config_.model.vits.lexicon, config_.model.vits.tokens,
  354 + config_.model.vits.dict_dir, config_.model.debug);
351 } else if (meta_data.is_melo_tts && meta_data.language == "English") { 355 } else if (meta_data.is_melo_tts && meta_data.language == "English") {
352 frontend_ = std::make_unique<MeloTtsLexicon>( 356 frontend_ = std::make_unique<MeloTtsLexicon>(
353 mgr, config_.model.vits.lexicon, config_.model.vits.tokens, 357 mgr, config_.model.vits.lexicon, config_.model.vits.tokens,
@@ -173,22 +173,55 @@ class OfflineTts( @@ -173,22 +173,55 @@ class OfflineTts(
173 // to download models 173 // to download models
174 fun getOfflineTtsConfig( 174 fun getOfflineTtsConfig(
175 modelDir: String, 175 modelDir: String,
176 - modelName: String, 176 + modelName: String, // for VITS
  177 + acousticModelName: String, // for Matcha
  178 + vocoder: String, // for Matcha
177 lexicon: String, 179 lexicon: String,
178 dataDir: String, 180 dataDir: String,
179 dictDir: String, 181 dictDir: String,
180 ruleFsts: String, 182 ruleFsts: String,
181 ruleFars: String 183 ruleFars: String
182 ): OfflineTtsConfig { 184 ): OfflineTtsConfig {
  185 + if (modelName.isEmpty() && acousticModelName.isEmpty()) {
  186 + throw IllegalArgumentException("Please specify a TTS model")
  187 + }
  188 +
  189 + if (modelName.isNotEmpty() && acousticModelName.isNotEmpty()) {
  190 + throw IllegalArgumentException("Please specify either a VITS or a Matcha model, but not both")
  191 + }
  192 +
  193 + if (acousticModelName.isNotEmpty() && vocoder.isEmpty()) {
  194 + throw IllegalArgumentException("Please provide vocoder for Matcha TTS")
  195 + }
  196 + val vits = if (modelName.isNotEmpty()) {
  197 + OfflineTtsVitsModelConfig(
  198 + model = "$modelDir/$modelName",
  199 + lexicon = "$modelDir/$lexicon",
  200 + tokens = "$modelDir/tokens.txt",
  201 + dataDir = dataDir,
  202 + dictDir = dictDir,
  203 + )
  204 + } else {
  205 + OfflineTtsVitsModelConfig()
  206 + }
  207 +
  208 + val matcha = if (acousticModelName.isNotEmpty()) {
  209 + OfflineTtsMatchaModelConfig(
  210 + acousticModel = "$modelDir/$acousticModelName",
  211 + vocoder = vocoder,
  212 + lexicon = "$modelDir/$lexicon",
  213 + tokens = "$modelDir/tokens.txt",
  214 + dictDir = dictDir,
  215 + dataDir = dataDir,
  216 + )
  217 + } else {
  218 + OfflineTtsMatchaModelConfig()
  219 + }
  220 +
183 return OfflineTtsConfig( 221 return OfflineTtsConfig(
184 model = OfflineTtsModelConfig( 222 model = OfflineTtsModelConfig(
185 - vits = OfflineTtsVitsModelConfig(  
186 - model = "$modelDir/$modelName",  
187 - lexicon = "$modelDir/$lexicon",  
188 - tokens = "$modelDir/tokens.txt",  
189 - dataDir = dataDir,  
190 - dictDir = dictDir,  
191 - ), 223 + vits = vits,
  224 + matcha = matcha,
192 numThreads = 2, 225 numThreads = 2,
193 debug = true, 226 debug = true,
194 provider = "cpu", 227 provider = "cpu",