Committed by
GitHub
Add Android demo for MatchaTTS models. (#1683)
正在显示
9 个修改的文件
包含
222 行增加
和
38 行删除
| @@ -26,6 +26,7 @@ jobs: | @@ -26,6 +26,7 @@ jobs: | ||
| 26 | total: ["40"] | 26 | total: ["40"] |
| 27 | index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39"] | 27 | index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39"] |
| 28 | 28 | ||
| 29 | + | ||
| 29 | steps: | 30 | steps: |
| 30 | - uses: actions/checkout@v4 | 31 | - uses: actions/checkout@v4 |
| 31 | with: | 32 | with: |
| @@ -183,6 +183,8 @@ class MainActivity : AppCompatActivity() { | @@ -183,6 +183,8 @@ class MainActivity : AppCompatActivity() { | ||
| 183 | private fun initTts() { | 183 | private fun initTts() { |
| 184 | var modelDir: String? | 184 | var modelDir: String? |
| 185 | var modelName: String? | 185 | var modelName: String? |
| 186 | + var acousticModelName: String? | ||
| 187 | + var vocoder: String? | ||
| 186 | var ruleFsts: String? | 188 | var ruleFsts: String? |
| 187 | var ruleFars: String? | 189 | var ruleFars: String? |
| 188 | var lexicon: String? | 190 | var lexicon: String? |
| @@ -193,8 +195,18 @@ class MainActivity : AppCompatActivity() { | @@ -193,8 +195,18 @@ class MainActivity : AppCompatActivity() { | ||
| 193 | // The purpose of such a design is to make the CI test easier | 195 | // The purpose of such a design is to make the CI test easier |
| 194 | // Please see | 196 | // Please see |
| 195 | // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py | 197 | // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py |
| 196 | - modelDir = null | 198 | + |
| 199 | + // VITS -- begin | ||
| 197 | modelName = null | 200 | modelName = null |
| 201 | + // VITS -- end | ||
| 202 | + | ||
| 203 | + // Matcha -- begin | ||
| 204 | + acousticModelName = null | ||
| 205 | + vocoder = null | ||
| 206 | + // Matcha -- end | ||
| 207 | + | ||
| 208 | + | ||
| 209 | + modelDir = null | ||
| 198 | ruleFsts = null | 210 | ruleFsts = null |
| 199 | ruleFars = null | 211 | ruleFars = null |
| 200 | lexicon = null | 212 | lexicon = null |
| @@ -217,7 +229,6 @@ class MainActivity : AppCompatActivity() { | @@ -217,7 +229,6 @@ class MainActivity : AppCompatActivity() { | ||
| 217 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 229 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 218 | // modelDir = "vits-icefall-zh-aishell3" | 230 | // modelDir = "vits-icefall-zh-aishell3" |
| 219 | // modelName = "model.onnx" | 231 | // modelName = "model.onnx" |
| 220 | - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst" | ||
| 221 | // ruleFars = "vits-icefall-zh-aishell3/rule.far" | 232 | // ruleFars = "vits-icefall-zh-aishell3/rule.far" |
| 222 | // lexicon = "lexicon.txt" | 233 | // lexicon = "lexicon.txt" |
| 223 | 234 | ||
| @@ -233,24 +244,47 @@ class MainActivity : AppCompatActivity() { | @@ -233,24 +244,47 @@ class MainActivity : AppCompatActivity() { | ||
| 233 | // modelDir = "vits-coqui-de-css10" | 244 | // modelDir = "vits-coqui-de-css10" |
| 234 | // modelName = "model.onnx" | 245 | // modelName = "model.onnx" |
| 235 | 246 | ||
| 247 | + // Example 6 | ||
| 248 | + // vits-melo-tts-zh_en | ||
| 249 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-melo-tts-zh-en-chinese-english-1-speaker | ||
| 250 | + // modelDir = "vits-melo-tts-zh_en" | ||
| 251 | + // modelName = "model.onnx" | ||
| 252 | + // lexicon = "lexicon.txt" | ||
| 253 | + // dictDir = "vits-melo-tts-zh_en/dict" | ||
| 254 | + | ||
| 255 | + // Example 7 | ||
| 256 | + // matcha-icefall-zh-baker | ||
| 257 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker | ||
| 258 | + // modelDir = "matcha-icefall-zh-baker" | ||
| 259 | + // acousticModelName = "model-steps-3.onnx" | ||
| 260 | + // vocoder = "hifigan_v2.onnx" | ||
| 261 | + // lexicon = "lexicon.txt" | ||
| 262 | + // dictDir = "matcha-icefall-zh-baker/dict" | ||
| 263 | + | ||
| 264 | + // Example 8 | ||
| 265 | + // matcha-icefall-en_US-ljspeech | ||
| 266 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker | ||
| 267 | + // modelDir = "matcha-icefall-en_US-ljspeech" | ||
| 268 | + // acousticModelName = "model-steps-3.onnx" | ||
| 269 | + // vocoder = "hifigan_v2.onnx" | ||
| 270 | + // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data" | ||
| 271 | + | ||
| 236 | if (dataDir != null) { | 272 | if (dataDir != null) { |
| 237 | - val newDir = copyDataDir(modelDir!!) | ||
| 238 | - modelDir = newDir + "/" + modelDir | ||
| 239 | - dataDir = newDir + "/" + dataDir | ||
| 240 | - assets = null | 273 | + val newDir = copyDataDir(dataDir!!) |
| 274 | + dataDir = "$newDir/$dataDir" | ||
| 241 | } | 275 | } |
| 242 | 276 | ||
| 243 | if (dictDir != null) { | 277 | if (dictDir != null) { |
| 244 | - val newDir = copyDataDir(modelDir!!) | ||
| 245 | - modelDir = newDir + "/" + modelDir | ||
| 246 | - dictDir = modelDir + "/" + "dict" | 278 | + val newDir = copyDataDir(dictDir!!) |
| 279 | + dictDir = "$newDir/$dictDir" | ||
| 247 | ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" | 280 | ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" |
| 248 | - assets = null | ||
| 249 | } | 281 | } |
| 250 | 282 | ||
| 251 | val config = getOfflineTtsConfig( | 283 | val config = getOfflineTtsConfig( |
| 252 | modelDir = modelDir!!, | 284 | modelDir = modelDir!!, |
| 253 | - modelName = modelName!!, | 285 | + modelName = modelName ?: "", |
| 286 | + acousticModelName = acousticModelName ?: "", | ||
| 287 | + vocoder = vocoder ?: "", | ||
| 254 | lexicon = lexicon ?: "", | 288 | lexicon = lexicon ?: "", |
| 255 | dataDir = dataDir ?: "", | 289 | dataDir = dataDir ?: "", |
| 256 | dictDir = dictDir ?: "", | 290 | dictDir = dictDir ?: "", |
| @@ -57,7 +57,7 @@ class MainActivity : ComponentActivity() { | @@ -57,7 +57,7 @@ class MainActivity : ComponentActivity() { | ||
| 57 | color = MaterialTheme.colorScheme.background | 57 | color = MaterialTheme.colorScheme.background |
| 58 | ) { | 58 | ) { |
| 59 | Scaffold(topBar = { | 59 | Scaffold(topBar = { |
| 60 | - TopAppBar(title = { Text("Next-gen Kaldi: TTS") }) | 60 | + TopAppBar(title = { Text("Next-gen Kaldi: TTS Engine") }) |
| 61 | }) { | 61 | }) { |
| 62 | Box(modifier = Modifier.padding(it)) { | 62 | Box(modifier = Modifier.padding(it)) { |
| 63 | Column(modifier = Modifier.padding(16.dp)) { | 63 | Column(modifier = Modifier.padding(16.dp)) { |
| @@ -65,8 +65,8 @@ class MainActivity : ComponentActivity() { | @@ -65,8 +65,8 @@ class MainActivity : ComponentActivity() { | ||
| 65 | Text("Speed " + String.format("%.1f", TtsEngine.speed)) | 65 | Text("Speed " + String.format("%.1f", TtsEngine.speed)) |
| 66 | Slider( | 66 | Slider( |
| 67 | value = TtsEngine.speedState.value, | 67 | value = TtsEngine.speedState.value, |
| 68 | - onValueChange = { | ||
| 69 | - TtsEngine.speed = it | 68 | + onValueChange = { |
| 69 | + TtsEngine.speed = it | ||
| 70 | preferenceHelper.setSpeed(it) | 70 | preferenceHelper.setSpeed(it) |
| 71 | }, | 71 | }, |
| 72 | valueRange = 0.2F..3.0F, | 72 | valueRange = 0.2F..3.0F, |
| @@ -138,7 +138,9 @@ class MainActivity : ComponentActivity() { | @@ -138,7 +138,9 @@ class MainActivity : ComponentActivity() { | ||
| 138 | val filename = | 138 | val filename = |
| 139 | application.filesDir.absolutePath + "/generated.wav" | 139 | application.filesDir.absolutePath + "/generated.wav" |
| 140 | val ok = | 140 | val ok = |
| 141 | - audio.samples.isNotEmpty() && audio.save(filename) | 141 | + audio.samples.isNotEmpty() && audio.save( |
| 142 | + filename | ||
| 143 | + ) | ||
| 142 | 144 | ||
| 143 | if (ok) { | 145 | if (ok) { |
| 144 | stopMediaPlayer() | 146 | stopMediaPlayer() |
| 1 | package com.k2fsa.sherpa.onnx.tts.engine | 1 | package com.k2fsa.sherpa.onnx.tts.engine |
| 2 | 2 | ||
| 3 | +import PreferenceHelper | ||
| 3 | import android.content.Context | 4 | import android.content.Context |
| 4 | import android.content.res.AssetManager | 5 | import android.content.res.AssetManager |
| 5 | import android.util.Log | 6 | import android.util.Log |
| @@ -11,7 +12,6 @@ import com.k2fsa.sherpa.onnx.getOfflineTtsConfig | @@ -11,7 +12,6 @@ import com.k2fsa.sherpa.onnx.getOfflineTtsConfig | ||
| 11 | import java.io.File | 12 | import java.io.File |
| 12 | import java.io.FileOutputStream | 13 | import java.io.FileOutputStream |
| 13 | import java.io.IOException | 14 | import java.io.IOException |
| 14 | -import PreferenceHelper | ||
| 15 | 15 | ||
| 16 | object TtsEngine { | 16 | object TtsEngine { |
| 17 | var tts: OfflineTts? = null | 17 | var tts: OfflineTts? = null |
| @@ -41,6 +41,8 @@ object TtsEngine { | @@ -41,6 +41,8 @@ object TtsEngine { | ||
| 41 | 41 | ||
| 42 | private var modelDir: String? = null | 42 | private var modelDir: String? = null |
| 43 | private var modelName: String? = null | 43 | private var modelName: String? = null |
| 44 | + private var acousticModelName: String? = null | ||
| 45 | + private var vocoder: String? = null | ||
| 44 | private var ruleFsts: String? = null | 46 | private var ruleFsts: String? = null |
| 45 | private var ruleFars: String? = null | 47 | private var ruleFars: String? = null |
| 46 | private var lexicon: String? = null | 48 | private var lexicon: String? = null |
| @@ -52,8 +54,17 @@ object TtsEngine { | @@ -52,8 +54,17 @@ object TtsEngine { | ||
| 52 | // The purpose of such a design is to make the CI test easier | 54 | // The purpose of such a design is to make the CI test easier |
| 53 | // Please see | 55 | // Please see |
| 54 | // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py | 56 | // https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/apk/generate-tts-apk-script.py |
| 55 | - modelDir = null | 57 | + // |
| 58 | + // For VITS -- begin | ||
| 56 | modelName = null | 59 | modelName = null |
| 60 | + // For VITS -- end | ||
| 61 | + | ||
| 62 | + // For Matcha -- begin | ||
| 63 | + acousticModelName = null | ||
| 64 | + vocoder = null | ||
| 65 | + // For Matcha -- end | ||
| 66 | + | ||
| 67 | + modelDir = null | ||
| 57 | ruleFsts = null | 68 | ruleFsts = null |
| 58 | ruleFars = null | 69 | ruleFars = null |
| 59 | lexicon = null | 70 | lexicon = null |
| @@ -82,7 +93,6 @@ object TtsEngine { | @@ -82,7 +93,6 @@ object TtsEngine { | ||
| 82 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 | 93 | // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 |
| 83 | // modelDir = "vits-icefall-zh-aishell3" | 94 | // modelDir = "vits-icefall-zh-aishell3" |
| 84 | // modelName = "model.onnx" | 95 | // modelName = "model.onnx" |
| 85 | - // ruleFsts = "vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst" | ||
| 86 | // ruleFars = "vits-icefall-zh-aishell3/rule.far" | 96 | // ruleFars = "vits-icefall-zh-aishell3/rule.far" |
| 87 | // lexicon = "lexicon.txt" | 97 | // lexicon = "lexicon.txt" |
| 88 | // lang = "zho" | 98 | // lang = "zho" |
| @@ -101,8 +111,35 @@ object TtsEngine { | @@ -101,8 +111,35 @@ object TtsEngine { | ||
| 101 | // modelDir = "vits-coqui-de-css10" | 111 | // modelDir = "vits-coqui-de-css10" |
| 102 | // modelName = "model.onnx" | 112 | // modelName = "model.onnx" |
| 103 | // lang = "deu" | 113 | // lang = "deu" |
| 104 | - } | ||
| 105 | 114 | ||
| 115 | + // Example 6 | ||
| 116 | + // vits-melo-tts-zh_en | ||
| 117 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-melo-tts-zh-en-chinese-english-1-speaker | ||
| 118 | + // modelDir = "vits-melo-tts-zh_en" | ||
| 119 | + // modelName = "model.onnx" | ||
| 120 | + // lexicon = "lexicon.txt" | ||
| 121 | + // dictDir = "vits-melo-tts-zh_en/dict" | ||
| 122 | + // lang = "zho" | ||
| 123 | + | ||
| 124 | + // Example 7 | ||
| 125 | + // matcha-icefall-zh-baker | ||
| 126 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker | ||
| 127 | + // modelDir = "matcha-icefall-zh-baker" | ||
| 128 | + // acousticModelName = "model-steps-3.onnx" | ||
| 129 | + // vocoder = "hifigan_v2.onnx" | ||
| 130 | + // lexicon = "lexicon.txt" | ||
| 131 | + // dictDir = "matcha-icefall-zh-baker/dict" | ||
| 132 | + // lang = "zho" | ||
| 133 | + | ||
| 134 | + // Example 8 | ||
| 135 | + // matcha-icefall-en_US-ljspeech | ||
| 136 | + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker | ||
| 137 | + // modelDir = "matcha-icefall-en_US-ljspeech" | ||
| 138 | + // acousticModelName = "model-steps-3.onnx" | ||
| 139 | + // vocoder = "hifigan_v2.onnx" | ||
| 140 | + // dataDir = "matcha-icefall-en_US-ljspeech/espeak-ng-data" | ||
| 141 | + // lang = "eng" | ||
| 142 | + } | ||
| 106 | 143 | ||
| 107 | fun createTts(context: Context) { | 144 | fun createTts(context: Context) { |
| 108 | Log.i(TAG, "Init Next-gen Kaldi TTS") | 145 | Log.i(TAG, "Init Next-gen Kaldi TTS") |
| @@ -115,22 +152,22 @@ object TtsEngine { | @@ -115,22 +152,22 @@ object TtsEngine { | ||
| 115 | assets = context.assets | 152 | assets = context.assets |
| 116 | 153 | ||
| 117 | if (dataDir != null) { | 154 | if (dataDir != null) { |
| 118 | - val newDir = copyDataDir(context, modelDir!!) | ||
| 119 | - modelDir = "$newDir/$modelDir" | 155 | + val newDir = copyDataDir(context, dataDir!!) |
| 120 | dataDir = "$newDir/$dataDir" | 156 | dataDir = "$newDir/$dataDir" |
| 121 | - assets = null | ||
| 122 | } | 157 | } |
| 123 | 158 | ||
| 124 | if (dictDir != null) { | 159 | if (dictDir != null) { |
| 125 | - val newDir = copyDataDir(context, modelDir!!) | ||
| 126 | - modelDir = "$newDir/$modelDir" | ||
| 127 | - dictDir = "$modelDir/dict" | 160 | + val newDir = copyDataDir(context, dictDir!!) |
| 161 | + dictDir = "$newDir/$dictDir" | ||
| 128 | ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" | 162 | ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst" |
| 129 | - assets = null | ||
| 130 | } | 163 | } |
| 131 | 164 | ||
| 132 | val config = getOfflineTtsConfig( | 165 | val config = getOfflineTtsConfig( |
| 133 | - modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", | 166 | + modelDir = modelDir!!, |
| 167 | + modelName = modelName ?: "", | ||
| 168 | + acousticModelName = acousticModelName ?: "", | ||
| 169 | + vocoder = vocoder ?: "", | ||
| 170 | + lexicon = lexicon ?: "", | ||
| 134 | dataDir = dataDir ?: "", | 171 | dataDir = dataDir ?: "", |
| 135 | dictDir = dictDir ?: "", | 172 | dictDir = dictDir ?: "", |
| 136 | ruleFsts = ruleFsts ?: "", | 173 | ruleFsts = ruleFsts ?: "", |
| @@ -37,6 +37,8 @@ mkdir -p apks | @@ -37,6 +37,8 @@ mkdir -p apks | ||
| 37 | pushd ./android/SherpaOnnxTtsEngine/app/src/main/assets/ | 37 | pushd ./android/SherpaOnnxTtsEngine/app/src/main/assets/ |
| 38 | model_dir={{ tts_model.model_dir }} | 38 | model_dir={{ tts_model.model_dir }} |
| 39 | model_name={{ tts_model.model_name }} | 39 | model_name={{ tts_model.model_name }} |
| 40 | +acoustic_model_name={{ tts_model.acoustic_model_name }} | ||
| 41 | +vocoder={{ tts_model.vocoder }} | ||
| 40 | lang={{ tts_model.lang }} | 42 | lang={{ tts_model.lang }} |
| 41 | lang_iso_639_3={{ tts_model.lang_iso_639_3 }} | 43 | lang_iso_639_3={{ tts_model.lang_iso_639_3 }} |
| 42 | 44 | ||
| @@ -44,15 +46,30 @@ wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$mod | @@ -44,15 +46,30 @@ wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$mod | ||
| 44 | tar xf $model_dir.tar.bz2 | 46 | tar xf $model_dir.tar.bz2 |
| 45 | rm $model_dir.tar.bz2 | 47 | rm $model_dir.tar.bz2 |
| 46 | 48 | ||
| 49 | +{% if tts_model.vocoder %} | ||
| 50 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/$vocoder | ||
| 51 | +{% endif %} | ||
| 52 | + | ||
| 47 | popd | 53 | popd |
| 48 | # Now we are at the project root directory | 54 | # Now we are at the project root directory |
| 49 | 55 | ||
| 50 | git checkout . | 56 | git checkout . |
| 51 | pushd android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine | 57 | pushd android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine |
| 52 | sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./TtsEngine.kt | 58 | sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./TtsEngine.kt |
| 53 | -sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./TtsEngine.kt | ||
| 54 | sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt | 59 | sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt |
| 55 | 60 | ||
| 61 | +{% if tts_model.model_name %} | ||
| 62 | + sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./TtsEngine.kt | ||
| 63 | +{% endif %} | ||
| 64 | + | ||
| 65 | +{% if tts_model.acoustic_model_name %} | ||
| 66 | + sed -i.bak s/"acousticModelName = null"/"acousticModelName = \"$acoustic_model_name\""/ ./TtsEngine.kt | ||
| 67 | +{% endif %} | ||
| 68 | + | ||
| 69 | +{% if tts_model.vocoder %} | ||
| 70 | + sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ ./TtsEngine.kt | ||
| 71 | +{% endif %} | ||
| 72 | + | ||
| 56 | {% if tts_model.rule_fsts %} | 73 | {% if tts_model.rule_fsts %} |
| 57 | rule_fsts={{ tts_model.rule_fsts }} | 74 | rule_fsts={{ tts_model.rule_fsts }} |
| 58 | sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt | 75 | sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt |
| @@ -109,6 +126,7 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do | @@ -109,6 +126,7 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do | ||
| 109 | done | 126 | done |
| 110 | 127 | ||
| 111 | rm -rf ./android/SherpaOnnxTtsEngine/app/src/main/assets/$model_dir | 128 | rm -rf ./android/SherpaOnnxTtsEngine/app/src/main/assets/$model_dir |
| 129 | +rm -fv ./android/SherpaOnnxTtsEngine/app/src/main/assets/*.onnx | ||
| 112 | {% endfor %} | 130 | {% endfor %} |
| 113 | 131 | ||
| 114 | git checkout . | 132 | git checkout . |
| @@ -37,19 +37,38 @@ mkdir -p apks | @@ -37,19 +37,38 @@ mkdir -p apks | ||
| 37 | pushd ./android/SherpaOnnxTts/app/src/main/assets/ | 37 | pushd ./android/SherpaOnnxTts/app/src/main/assets/ |
| 38 | model_dir={{ tts_model.model_dir }} | 38 | model_dir={{ tts_model.model_dir }} |
| 39 | model_name={{ tts_model.model_name }} | 39 | model_name={{ tts_model.model_name }} |
| 40 | +acoustic_model_name={{ tts_model.acoustic_model_name }} | ||
| 41 | +vocoder={{ tts_model.vocoder }} | ||
| 40 | lang={{ tts_model.lang }} | 42 | lang={{ tts_model.lang }} |
| 41 | 43 | ||
| 42 | wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2 | 44 | wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2 |
| 43 | tar xf $model_dir.tar.bz2 | 45 | tar xf $model_dir.tar.bz2 |
| 44 | rm $model_dir.tar.bz2 | 46 | rm $model_dir.tar.bz2 |
| 45 | 47 | ||
| 48 | +{% if tts_model.vocoder %} | ||
| 49 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/$vocoder | ||
| 50 | +{% endif %} | ||
| 51 | + | ||
| 46 | popd | 52 | popd |
| 47 | # Now we are at the project root directory | 53 | # Now we are at the project root directory |
| 48 | 54 | ||
| 49 | git checkout . | 55 | git checkout . |
| 50 | pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx | 56 | pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx |
| 51 | sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt | 57 | sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt |
| 52 | -sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt | 58 | + |
| 59 | + | ||
| 60 | +{% if tts_model.model_name %} | ||
| 61 | + sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt | ||
| 62 | +{% endif %} | ||
| 63 | + | ||
| 64 | +{% if tts_model.acoustic_model_name %} | ||
| 65 | + sed -i.bak s/"acousticModelName = null"/"acousticModelName = \"$acoustic_model_name\""/ ./MainActivity.kt | ||
| 66 | +{% endif %} | ||
| 67 | + | ||
| 68 | +{% if tts_model.vocoder %} | ||
| 69 | + sed -i.bak s/"vocoder = null"/"vocoder = \"$vocoder\""/ ./MainActivity.kt | ||
| 70 | +{% endif %} | ||
| 71 | + | ||
| 53 | 72 | ||
| 54 | {% if tts_model.rule_fsts %} | 73 | {% if tts_model.rule_fsts %} |
| 55 | rule_fsts={{ tts_model.rule_fsts }} | 74 | rule_fsts={{ tts_model.rule_fsts }} |
| @@ -107,6 +126,8 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do | @@ -107,6 +126,8 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do | ||
| 107 | done | 126 | done |
| 108 | 127 | ||
| 109 | rm -rf ./android/SherpaOnnxTts/app/src/main/assets/$model_dir | 128 | rm -rf ./android/SherpaOnnxTts/app/src/main/assets/$model_dir |
| 129 | +rm -fv ./android/SherpaOnnxTts/app/src/main/assets/*.onnx | ||
| 130 | + | ||
| 110 | {% endfor %} | 131 | {% endfor %} |
| 111 | 132 | ||
| 112 | git checkout . | 133 | git checkout . |
| @@ -30,7 +30,9 @@ def get_args(): | @@ -30,7 +30,9 @@ def get_args(): | ||
| 30 | @dataclass | 30 | @dataclass |
| 31 | class TtsModel: | 31 | class TtsModel: |
| 32 | model_dir: str | 32 | model_dir: str |
| 33 | - model_name: str = "" | 33 | + model_name: str = "" # for vits |
| 34 | + acoustic_model_name: str = "" # for matcha | ||
| 35 | + vocoder: str = "" # for matcha | ||
| 34 | lang: str = "" # en, zh, fr, de, etc. | 36 | lang: str = "" # en, zh, fr, de, etc. |
| 35 | rule_fsts: Optional[List[str]] = None | 37 | rule_fsts: Optional[List[str]] = None |
| 36 | rule_fars: Optional[List[str]] = None | 38 | rule_fars: Optional[List[str]] = None |
| @@ -378,6 +380,35 @@ def get_vits_models() -> List[TtsModel]: | @@ -378,6 +380,35 @@ def get_vits_models() -> List[TtsModel]: | ||
| 378 | return all_models | 380 | return all_models |
| 379 | 381 | ||
| 380 | 382 | ||
| 383 | +def get_matcha_models() -> List[TtsModel]: | ||
| 384 | + chinese_models = [ | ||
| 385 | + TtsModel( | ||
| 386 | + model_dir="matcha-icefall-zh-baker", | ||
| 387 | + acoustic_model_name="model-steps-3.onnx", | ||
| 388 | + lang="zh", | ||
| 389 | + ) | ||
| 390 | + ] | ||
| 391 | + rule_fsts = ["phone.fst", "date.fst", "number.fst"] | ||
| 392 | + for m in chinese_models: | ||
| 393 | + s = [f"{m.model_dir}/{r}" for r in rule_fsts] | ||
| 394 | + m.rule_fsts = ",".join(s) | ||
| 395 | + m.dict_dir = m.model_dir + "/dict" | ||
| 396 | + m.vocoder = "hifigan_v2.onnx" | ||
| 397 | + | ||
| 398 | + english_models = [ | ||
| 399 | + TtsModel( | ||
| 400 | + model_dir="matcha-icefall-en_US-ljspeech", | ||
| 401 | + acoustic_model_name="model-steps-3.onnx", | ||
| 402 | + lang="en", | ||
| 403 | + ) | ||
| 404 | + ] | ||
| 405 | + for m in english_models: | ||
| 406 | + m.data_dir = f"{m.model_dir}/espeak-ng-data" | ||
| 407 | + m.vocoder = "hifigan_v2.onnx" | ||
| 408 | + | ||
| 409 | + return chinese_models + english_models | ||
| 410 | + | ||
| 411 | + | ||
| 381 | def main(): | 412 | def main(): |
| 382 | args = get_args() | 413 | args = get_args() |
| 383 | index = args.index | 414 | index = args.index |
| @@ -389,7 +420,10 @@ def main(): | @@ -389,7 +420,10 @@ def main(): | ||
| 389 | all_model_list += get_piper_models() | 420 | all_model_list += get_piper_models() |
| 390 | all_model_list += get_mimic3_models() | 421 | all_model_list += get_mimic3_models() |
| 391 | all_model_list += get_coqui_models() | 422 | all_model_list += get_coqui_models() |
| 423 | + all_model_list += get_matcha_models() | ||
| 424 | + | ||
| 392 | convert_lang_to_iso_639_3(all_model_list) | 425 | convert_lang_to_iso_639_3(all_model_list) |
| 426 | + print(all_model_list) | ||
| 393 | 427 | ||
| 394 | num_models = len(all_model_list) | 428 | num_models = len(all_model_list) |
| 395 | 429 |
| @@ -348,6 +348,10 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -348,6 +348,10 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 348 | mgr, config_.model.vits.lexicon, config_.model.vits.tokens, | 348 | mgr, config_.model.vits.lexicon, config_.model.vits.tokens, |
| 349 | config_.model.vits.dict_dir, model_->GetMetaData(), | 349 | config_.model.vits.dict_dir, model_->GetMetaData(), |
| 350 | config_.model.debug); | 350 | config_.model.debug); |
| 351 | + } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) { | ||
| 352 | + frontend_ = std::make_unique<JiebaLexicon>( | ||
| 353 | + mgr, config_.model.vits.lexicon, config_.model.vits.tokens, | ||
| 354 | + config_.model.vits.dict_dir, config_.model.debug); | ||
| 351 | } else if (meta_data.is_melo_tts && meta_data.language == "English") { | 355 | } else if (meta_data.is_melo_tts && meta_data.language == "English") { |
| 352 | frontend_ = std::make_unique<MeloTtsLexicon>( | 356 | frontend_ = std::make_unique<MeloTtsLexicon>( |
| 353 | mgr, config_.model.vits.lexicon, config_.model.vits.tokens, | 357 | mgr, config_.model.vits.lexicon, config_.model.vits.tokens, |
| @@ -173,22 +173,55 @@ class OfflineTts( | @@ -173,22 +173,55 @@ class OfflineTts( | ||
| 173 | // to download models | 173 | // to download models |
| 174 | fun getOfflineTtsConfig( | 174 | fun getOfflineTtsConfig( |
| 175 | modelDir: String, | 175 | modelDir: String, |
| 176 | - modelName: String, | 176 | + modelName: String, // for VITS |
| 177 | + acousticModelName: String, // for Matcha | ||
| 178 | + vocoder: String, // for Matcha | ||
| 177 | lexicon: String, | 179 | lexicon: String, |
| 178 | dataDir: String, | 180 | dataDir: String, |
| 179 | dictDir: String, | 181 | dictDir: String, |
| 180 | ruleFsts: String, | 182 | ruleFsts: String, |
| 181 | ruleFars: String | 183 | ruleFars: String |
| 182 | ): OfflineTtsConfig { | 184 | ): OfflineTtsConfig { |
| 185 | + if (modelName.isEmpty() && acousticModelName.isEmpty()) { | ||
| 186 | + throw IllegalArgumentException("Please specify a TTS model") | ||
| 187 | + } | ||
| 188 | + | ||
| 189 | + if (modelName.isNotEmpty() && acousticModelName.isNotEmpty()) { | ||
| 190 | + throw IllegalArgumentException("Please specify either a VITS or a Matcha model, but not both") | ||
| 191 | + } | ||
| 192 | + | ||
| 193 | + if (acousticModelName.isNotEmpty() && vocoder.isEmpty()) { | ||
| 194 | + throw IllegalArgumentException("Please provide vocoder for Matcha TTS") | ||
| 195 | + } | ||
| 196 | + val vits = if (modelName.isNotEmpty()) { | ||
| 197 | + OfflineTtsVitsModelConfig( | ||
| 198 | + model = "$modelDir/$modelName", | ||
| 199 | + lexicon = "$modelDir/$lexicon", | ||
| 200 | + tokens = "$modelDir/tokens.txt", | ||
| 201 | + dataDir = dataDir, | ||
| 202 | + dictDir = dictDir, | ||
| 203 | + ) | ||
| 204 | + } else { | ||
| 205 | + OfflineTtsVitsModelConfig() | ||
| 206 | + } | ||
| 207 | + | ||
| 208 | + val matcha = if (acousticModelName.isNotEmpty()) { | ||
| 209 | + OfflineTtsMatchaModelConfig( | ||
| 210 | + acousticModel = "$modelDir/$acousticModelName", | ||
| 211 | + vocoder = vocoder, | ||
| 212 | + lexicon = "$modelDir/$lexicon", | ||
| 213 | + tokens = "$modelDir/tokens.txt", | ||
| 214 | + dictDir = dictDir, | ||
| 215 | + dataDir = dataDir, | ||
| 216 | + ) | ||
| 217 | + } else { | ||
| 218 | + OfflineTtsMatchaModelConfig() | ||
| 219 | + } | ||
| 220 | + | ||
| 183 | return OfflineTtsConfig( | 221 | return OfflineTtsConfig( |
| 184 | model = OfflineTtsModelConfig( | 222 | model = OfflineTtsModelConfig( |
| 185 | - vits = OfflineTtsVitsModelConfig( | ||
| 186 | - model = "$modelDir/$modelName", | ||
| 187 | - lexicon = "$modelDir/$lexicon", | ||
| 188 | - tokens = "$modelDir/tokens.txt", | ||
| 189 | - dataDir = dataDir, | ||
| 190 | - dictDir = dictDir, | ||
| 191 | - ), | 223 | + vits = vits, |
| 224 | + matcha = matcha, | ||
| 192 | numThreads = 2, | 225 | numThreads = 2, |
| 193 | debug = true, | 226 | debug = true, |
| 194 | provider = "cpu", | 227 | provider = "cpu", |
-
请 注册 或 登录 后发表评论