Committed by
GitHub
Use piper-phonemize to convert text to token IDs (#453)
正在显示
55 个修改的文件
包含
1048 行增加
和
192 行删除
| @@ -52,14 +52,13 @@ node ./test-online-transducer.js | @@ -52,14 +52,13 @@ node ./test-online-transducer.js | ||
| 52 | rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 | 52 | rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 |
| 53 | 53 | ||
| 54 | # offline tts | 54 | # offline tts |
| 55 | -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 | ||
| 56 | -tar xvf vits-vctk.tar.bz2 | ||
| 57 | -rm vits-vctk.tar.bz2 | 55 | + |
| 56 | +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | ||
| 57 | +tar xf vits-piper-en_US-amy-low.tar.bz2 | ||
| 58 | node ./test-offline-tts-en.js | 58 | node ./test-offline-tts-en.js |
| 59 | -rm -rf vits-vctk | 59 | +rm vits-piper-en_US-amy-low.tar.bz2 |
| 60 | 60 | ||
| 61 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 | 61 | curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 |
| 62 | tar xvf vits-zh-aishell3.tar.bz2 | 62 | tar xvf vits-zh-aishell3.tar.bz2 |
| 63 | -rm vits-zh-aishell3.tar.bz2 | ||
| 64 | node ./test-offline-tts-zh.js | 63 | node ./test-offline-tts-zh.js |
| 65 | -rm -rf vits-zh-aishell3 | 64 | +rm vits-zh-aishell3.tar.bz2 |
| @@ -17,6 +17,24 @@ which $EXE | @@ -17,6 +17,24 @@ which $EXE | ||
| 17 | mkdir ./tts | 17 | mkdir ./tts |
| 18 | 18 | ||
| 19 | log "------------------------------------------------------------" | 19 | log "------------------------------------------------------------" |
| 20 | +log "vits-piper-en_US-amy-low" | ||
| 21 | +log "------------------------------------------------------------" | ||
| 22 | +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | ||
| 23 | +tar xf vits-piper-en_US-amy-low.tar.bz2 | ||
| 24 | +rm vits-piper-en_US-amy-low.tar.bz2 | ||
| 25 | + | ||
| 26 | +$EXE \ | ||
| 27 | + --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ | ||
| 28 | + --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ | ||
| 29 | + --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ | ||
| 30 | + --debug=1 \ | ||
| 31 | + --output-filename=./tts/amy.wav \ | ||
| 32 | + "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.” The sun shone bleakly in the sky, its meager light struggling to penetrate the thick foliage of the forest. Birds sang their songs up in the crowns of the trees, fluttering from one branch to the other. A blanket of total tranquility lied over the forest. The peace was only broken by the steady gallop of the horses of the soldiers who were traveling to their upcoming knighting the morrow at Camelot, and rowdy conversation. “Finally we will get what we deserve,” “It’s been about time,” Perceval agreed. “We’ve been risking our arses for the past two years. It’s the least they could give us.” Merlin remained ostensibly silent, refusing to join the verbal parade of self-aggrandizing his fellow soldiers have engaged in. He found it difficult to happy about anything, when even if they had won the war, he had lost everything else in the process." | ||
| 33 | + | ||
| 34 | +file ./tts/amy.wav | ||
| 35 | +rm -rf vits-piper-en_US-amy-low | ||
| 36 | + | ||
| 37 | +log "------------------------------------------------------------" | ||
| 20 | log "vits-ljs test" | 38 | log "vits-ljs test" |
| 21 | log "------------------------------------------------------------" | 39 | log "------------------------------------------------------------" |
| 22 | 40 |
| @@ -26,8 +26,8 @@ jobs: | @@ -26,8 +26,8 @@ jobs: | ||
| 26 | fail-fast: false | 26 | fail-fast: false |
| 27 | matrix: | 27 | matrix: |
| 28 | os: [ubuntu-latest] | 28 | os: [ubuntu-latest] |
| 29 | - total: ["12"] | ||
| 30 | - index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"] | 29 | + total: ["30"] |
| 30 | + index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29"] | ||
| 31 | 31 | ||
| 32 | steps: | 32 | steps: |
| 33 | - uses: actions/checkout@v4 | 33 | - uses: actions/checkout@v4 |
.github/workflows/test-build-wheel.yaml
0 → 100644
| 1 | +name: test-build-wheel | ||
| 2 | + | ||
| 3 | +on: | ||
| 4 | + push: | ||
| 5 | + branches: | ||
| 6 | + - master | ||
| 7 | + | ||
| 8 | + pull_request: | ||
| 9 | + | ||
| 10 | + workflow_dispatch: | ||
| 11 | + | ||
| 12 | +concurrency: | ||
| 13 | + group: test-build-wheel-${{ github.ref }} | ||
| 14 | + cancel-in-progress: true | ||
| 15 | + | ||
| 16 | +jobs: | ||
| 17 | + test-build-wheel: | ||
| 18 | + name: ${{ matrix.os }} ${{ matrix.python_version }} | ||
| 19 | + runs-on: ${{ matrix.os }} | ||
| 20 | + strategy: | ||
| 21 | + fail-fast: false | ||
| 22 | + matrix: | ||
| 23 | + os: [ubuntu-latest, macos-latest, windows-latest] | ||
| 24 | + python-version: ["3.8", "3.9", "3.10", "3.11"] | ||
| 25 | + | ||
| 26 | + steps: | ||
| 27 | + - uses: actions/checkout@v4 | ||
| 28 | + with: | ||
| 29 | + fetch-depth: 0 | ||
| 30 | + | ||
| 31 | + - name: Setup Python ${{ matrix.python-version }} | ||
| 32 | + uses: actions/setup-python@v2 | ||
| 33 | + with: | ||
| 34 | + python-version: ${{ matrix.python-version }} | ||
| 35 | + | ||
| 36 | + - name: ccache | ||
| 37 | + uses: hendrikmuhs/ccache-action@v1.2 | ||
| 38 | + with: | ||
| 39 | + key: ${{ matrix.os }}-${{ matrix.python_version }} | ||
| 40 | + | ||
| 41 | + - name: Install python dependencies | ||
| 42 | + shell: bash | ||
| 43 | + run: | | ||
| 44 | + python3 -m pip install --upgrade pip | ||
| 45 | + python3 -m pip install wheel twine setuptools | ||
| 46 | + | ||
| 47 | + - name: Build | ||
| 48 | + shell: bash | ||
| 49 | + run: | | ||
| 50 | + export CMAKE_CXX_COMPILER_LAUNCHER=ccache | ||
| 51 | + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" | ||
| 52 | + cmake --version | ||
| 53 | + | ||
| 54 | + export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j" | ||
| 55 | + | ||
| 56 | + python3 setup.py bdist_wheel | ||
| 57 | + ls -lh dist | ||
| 58 | + | ||
| 59 | + - name: Display wheel | ||
| 60 | + shell: bash | ||
| 61 | + run: | | ||
| 62 | + ls -lh dist | ||
| 63 | + | ||
| 64 | + - name: Install wheel | ||
| 65 | + shell: bash | ||
| 66 | + run: | | ||
| 67 | + pip install --verbose ./dist/*.whl | ||
| 68 | + | ||
| 69 | + - name: Test | ||
| 70 | + shell: bash | ||
| 71 | + run: | | ||
| 72 | + # For windows | ||
| 73 | + export PATH=/c/hostedtoolcache/windows/Python/3.7.9/x64/bin:$PATH | ||
| 74 | + export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH | ||
| 75 | + export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH | ||
| 76 | + export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH | ||
| 77 | + export PATH=/c/hostedtoolcache/windows/Python/3.11.6/x64/bin:$PATH | ||
| 78 | + | ||
| 79 | + which sherpa-onnx | ||
| 80 | + sherpa-onnx --help |
| @@ -70,6 +70,10 @@ jobs: | @@ -70,6 +70,10 @@ jobs: | ||
| 70 | mkdir -p scripts/nodejs/lib/win-x64 | 70 | mkdir -p scripts/nodejs/lib/win-x64 |
| 71 | dst=scripts/nodejs/lib/win-x64 | 71 | dst=scripts/nodejs/lib/win-x64 |
| 72 | fi | 72 | fi |
| 73 | + ls -lh build/install/lib/ | ||
| 74 | + | ||
| 75 | + rm -rf build/install/lib/pkgconfig | ||
| 76 | + | ||
| 73 | cp -v build/install/lib/* $dst/ | 77 | cp -v build/install/lib/* $dst/ |
| 74 | 78 | ||
| 75 | - name: replace files | 79 | - name: replace files |
| @@ -2,6 +2,8 @@ | @@ -2,6 +2,8 @@ | ||
| 2 | <manifest xmlns:android="http://schemas.android.com/apk/res/android" | 2 | <manifest xmlns:android="http://schemas.android.com/apk/res/android" |
| 3 | xmlns:tools="http://schemas.android.com/tools"> | 3 | xmlns:tools="http://schemas.android.com/tools"> |
| 4 | 4 | ||
| 5 | + <uses-permission android:name="android.permission.WRITE_INTERNAL_STORAGE" /> | ||
| 6 | + | ||
| 5 | <application | 7 | <application |
| 6 | android:allowBackup="true" | 8 | android:allowBackup="true" |
| 7 | android:dataExtractionRules="@xml/data_extraction_rules" | 9 | android:dataExtractionRules="@xml/data_extraction_rules" |
| 1 | package com.k2fsa.sherpa.onnx | 1 | package com.k2fsa.sherpa.onnx |
| 2 | 2 | ||
| 3 | +import android.content.res.AssetManager | ||
| 3 | import android.media.MediaPlayer | 4 | import android.media.MediaPlayer |
| 4 | import android.net.Uri | 5 | import android.net.Uri |
| 5 | import android.os.Bundle | 6 | import android.os.Bundle |
| @@ -9,6 +10,8 @@ import android.widget.EditText | @@ -9,6 +10,8 @@ import android.widget.EditText | ||
| 9 | import android.widget.Toast | 10 | import android.widget.Toast |
| 10 | import androidx.appcompat.app.AppCompatActivity | 11 | import androidx.appcompat.app.AppCompatActivity |
| 11 | import java.io.File | 12 | import java.io.File |
| 13 | +import java.io.FileOutputStream | ||
| 14 | +import java.io.IOException | ||
| 12 | 15 | ||
| 13 | const val TAG = "sherpa-onnx" | 16 | const val TAG = "sherpa-onnx" |
| 14 | 17 | ||
| @@ -19,7 +22,6 @@ class MainActivity : AppCompatActivity() { | @@ -19,7 +22,6 @@ class MainActivity : AppCompatActivity() { | ||
| 19 | private lateinit var speed: EditText | 22 | private lateinit var speed: EditText |
| 20 | private lateinit var generate: Button | 23 | private lateinit var generate: Button |
| 21 | private lateinit var play: Button | 24 | private lateinit var play: Button |
| 22 | - private var hasFile: Boolean = false | ||
| 23 | 25 | ||
| 24 | override fun onCreate(savedInstanceState: Bundle?) { | 26 | override fun onCreate(savedInstanceState: Bundle?) { |
| 25 | super.onCreate(savedInstanceState) | 27 | super.onCreate(savedInstanceState) |
| @@ -46,10 +48,10 @@ class MainActivity : AppCompatActivity() { | @@ -46,10 +48,10 @@ class MainActivity : AppCompatActivity() { | ||
| 46 | val sampleText = "" | 48 | val sampleText = "" |
| 47 | text.setText(sampleText) | 49 | text.setText(sampleText) |
| 48 | 50 | ||
| 49 | - play.isEnabled = false; | 51 | + play.isEnabled = false |
| 50 | } | 52 | } |
| 51 | 53 | ||
| 52 | - fun onClickGenerate() { | 54 | + private fun onClickGenerate() { |
| 53 | val sidInt = sid.text.toString().toIntOrNull() | 55 | val sidInt = sid.text.toString().toIntOrNull() |
| 54 | if (sidInt == null || sidInt < 0) { | 56 | if (sidInt == null || sidInt < 0) { |
| 55 | Toast.makeText( | 57 | Toast.makeText( |
| @@ -77,7 +79,7 @@ class MainActivity : AppCompatActivity() { | @@ -77,7 +79,7 @@ class MainActivity : AppCompatActivity() { | ||
| 77 | return | 79 | return |
| 78 | } | 80 | } |
| 79 | 81 | ||
| 80 | - play.isEnabled = false; | 82 | + play.isEnabled = false |
| 81 | val audio = tts.generate(text = textStr, sid = sidInt, speed = speedFloat) | 83 | val audio = tts.generate(text = textStr, sid = sidInt, speed = speedFloat) |
| 82 | 84 | ||
| 83 | val filename = application.filesDir.absolutePath + "/generated.wav" | 85 | val filename = application.filesDir.absolutePath + "/generated.wav" |
| @@ -89,7 +91,7 @@ class MainActivity : AppCompatActivity() { | @@ -89,7 +91,7 @@ class MainActivity : AppCompatActivity() { | ||
| 89 | } | 91 | } |
| 90 | } | 92 | } |
| 91 | 93 | ||
| 92 | - fun onClickPlay() { | 94 | + private fun onClickPlay() { |
| 93 | val filename = application.filesDir.absolutePath + "/generated.wav" | 95 | val filename = application.filesDir.absolutePath + "/generated.wav" |
| 94 | val mediaPlayer = MediaPlayer.create( | 96 | val mediaPlayer = MediaPlayer.create( |
| 95 | applicationContext, | 97 | applicationContext, |
| @@ -98,10 +100,13 @@ class MainActivity : AppCompatActivity() { | @@ -98,10 +100,13 @@ class MainActivity : AppCompatActivity() { | ||
| 98 | mediaPlayer.start() | 100 | mediaPlayer.start() |
| 99 | } | 101 | } |
| 100 | 102 | ||
| 101 | - fun initTts() { | ||
| 102 | - var modelDir :String? | ||
| 103 | - var modelName :String? | 103 | + private fun initTts() { |
| 104 | + var modelDir: String? | ||
| 105 | + var modelName: String? | ||
| 104 | var ruleFsts: String? | 106 | var ruleFsts: String? |
| 107 | + var lexicon: String? | ||
| 108 | + var dataDir: String? | ||
| 109 | + var assets: AssetManager? = application.assets | ||
| 105 | 110 | ||
| 106 | // The purpose of such a design is to make the CI test easier | 111 | // The purpose of such a design is to make the CI test easier |
| 107 | // Please see | 112 | // Please see |
| @@ -109,21 +114,90 @@ class MainActivity : AppCompatActivity() { | @@ -109,21 +114,90 @@ class MainActivity : AppCompatActivity() { | ||
| 109 | modelDir = null | 114 | modelDir = null |
| 110 | modelName = null | 115 | modelName = null |
| 111 | ruleFsts = null | 116 | ruleFsts = null |
| 117 | + lexicon = null | ||
| 118 | + dataDir = null | ||
| 112 | 119 | ||
| 113 | // Example 1: | 120 | // Example 1: |
| 114 | // modelDir = "vits-vctk" | 121 | // modelDir = "vits-vctk" |
| 115 | // modelName = "vits-vctk.onnx" | 122 | // modelName = "vits-vctk.onnx" |
| 123 | + // lexicon = "lexicon.txt" | ||
| 116 | 124 | ||
| 117 | // Example 2: | 125 | // Example 2: |
| 118 | - // modelDir = "vits-piper-en_US-lessac-medium" | ||
| 119 | - // modelName = "en_US-lessac-medium.onnx" | 126 | + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models |
| 127 | + // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | ||
| 128 | + // modelDir = "vits-piper-en_US-amy-low" | ||
| 129 | + // modelName = "en_US-amy-low.onnx" | ||
| 130 | + // dataDir = "vits-piper-en_US-amy-low/espeak-ng-data" | ||
| 120 | 131 | ||
| 121 | // Example 3: | 132 | // Example 3: |
| 122 | // modelDir = "vits-zh-aishell3" | 133 | // modelDir = "vits-zh-aishell3" |
| 123 | // modelName = "vits-aishell3.onnx" | 134 | // modelName = "vits-aishell3.onnx" |
| 124 | // ruleFsts = "vits-zh-aishell3/rule.fst" | 135 | // ruleFsts = "vits-zh-aishell3/rule.fst" |
| 136 | + // lexcion = "lexicon.txt" | ||
| 125 | 137 | ||
| 126 | - val config = getOfflineTtsConfig(modelDir = modelDir!!, modelName = modelName!!, ruleFsts = ruleFsts ?: "")!! | ||
| 127 | - tts = OfflineTts(assetManager = application.assets, config = config) | 138 | + if (dataDir != null) { |
| 139 | + val newDir = copyDataDir(modelDir) | ||
| 140 | + modelDir = newDir + "/" + modelDir | ||
| 141 | + dataDir = newDir + "/" + dataDir | ||
| 142 | + assets = null | ||
| 143 | + } | ||
| 144 | + | ||
| 145 | + val config = getOfflineTtsConfig( | ||
| 146 | + modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "", | ||
| 147 | + dataDir = dataDir ?: "", | ||
| 148 | + ruleFsts = ruleFsts ?: "" | ||
| 149 | + )!! | ||
| 150 | + | ||
| 151 | + tts = OfflineTts(assetManager = assets, config = config) | ||
| 152 | + } | ||
| 153 | + | ||
| 154 | + | ||
| 155 | + private fun copyDataDir(dataDir: String): String { | ||
| 156 | + println("data dir is $dataDir") | ||
| 157 | + copyAssets(dataDir) | ||
| 158 | + | ||
| 159 | + val newDataDir = application.getExternalFilesDir(null)!!.absolutePath | ||
| 160 | + println("newDataDir: $newDataDir") | ||
| 161 | + return newDataDir | ||
| 162 | + } | ||
| 163 | + | ||
| 164 | + private fun copyAssets(path: String) { | ||
| 165 | + val assets: Array<String>? | ||
| 166 | + try { | ||
| 167 | + assets = application.assets.list(path) | ||
| 168 | + if (assets!!.isEmpty()) { | ||
| 169 | + copyFile(path) | ||
| 170 | + } else { | ||
| 171 | + val fullPath = "${application.getExternalFilesDir(null)}/$path" | ||
| 172 | + val dir = File(fullPath) | ||
| 173 | + dir.mkdirs() | ||
| 174 | + for (asset in assets.iterator()) { | ||
| 175 | + val p: String = if (path == "") "" else path + "/" | ||
| 176 | + copyAssets(p + asset) | ||
| 177 | + } | ||
| 178 | + } | ||
| 179 | + } catch (ex: IOException) { | ||
| 180 | + Log.e(TAG, "Failed to copy $path. ${ex.toString()}") | ||
| 181 | + } | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + private fun copyFile(filename: String) { | ||
| 185 | + try { | ||
| 186 | + val istream = application.assets.open(filename) | ||
| 187 | + val newFilename = application.getExternalFilesDir(null).toString() + "/" + filename | ||
| 188 | + val ostream = FileOutputStream(newFilename) | ||
| 189 | + // Log.i(TAG, "Copying $filename to $newFilename") | ||
| 190 | + val buffer = ByteArray(1024) | ||
| 191 | + var read = 0 | ||
| 192 | + while (read != -1) { | ||
| 193 | + ostream.write(buffer, 0, read) | ||
| 194 | + read = istream.read(buffer) | ||
| 195 | + } | ||
| 196 | + istream.close() | ||
| 197 | + ostream.flush() | ||
| 198 | + ostream.close() | ||
| 199 | + } catch (ex: Exception) { | ||
| 200 | + Log.e(TAG, "Failed to copy $filename, ${ex.toString()}") | ||
| 201 | + } | ||
| 128 | } | 202 | } |
| 129 | } | 203 | } |
| @@ -5,8 +5,9 @@ import android.content.res.AssetManager | @@ -5,8 +5,9 @@ import android.content.res.AssetManager | ||
| 5 | 5 | ||
| 6 | data class OfflineTtsVitsModelConfig( | 6 | data class OfflineTtsVitsModelConfig( |
| 7 | var model: String, | 7 | var model: String, |
| 8 | - var lexicon: String, | 8 | + var lexicon: String = "", |
| 9 | var tokens: String, | 9 | var tokens: String, |
| 10 | + var dataDir: String = "", | ||
| 10 | var noiseScale: Float = 0.667f, | 11 | var noiseScale: Float = 0.667f, |
| 11 | var noiseScaleW: Float = 0.8f, | 12 | var noiseScaleW: Float = 0.8f, |
| 12 | var lengthScale: Float = 1.0f, | 13 | var lengthScale: Float = 1.0f, |
| @@ -22,6 +23,7 @@ data class OfflineTtsModelConfig( | @@ -22,6 +23,7 @@ data class OfflineTtsModelConfig( | ||
| 22 | data class OfflineTtsConfig( | 23 | data class OfflineTtsConfig( |
| 23 | var model: OfflineTtsModelConfig, | 24 | var model: OfflineTtsModelConfig, |
| 24 | var ruleFsts: String = "", | 25 | var ruleFsts: String = "", |
| 26 | + var maxNumSentences: Int = 2, | ||
| 25 | ) | 27 | ) |
| 26 | 28 | ||
| 27 | class GeneratedAudio( | 29 | class GeneratedAudio( |
| @@ -117,18 +119,25 @@ class OfflineTts( | @@ -117,18 +119,25 @@ class OfflineTts( | ||
| 117 | // please refer to | 119 | // please refer to |
| 118 | // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html | 120 | // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html |
| 119 | // to download models | 121 | // to download models |
| 120 | -fun getOfflineTtsConfig(modelDir: String, modelName: String, ruleFsts: String): OfflineTtsConfig? { | 122 | +fun getOfflineTtsConfig( |
| 123 | + modelDir: String, | ||
| 124 | + modelName: String, | ||
| 125 | + lexicon: String, | ||
| 126 | + dataDir: String, | ||
| 127 | + ruleFsts: String | ||
| 128 | +): OfflineTtsConfig? { | ||
| 121 | return OfflineTtsConfig( | 129 | return OfflineTtsConfig( |
| 122 | model = OfflineTtsModelConfig( | 130 | model = OfflineTtsModelConfig( |
| 123 | vits = OfflineTtsVitsModelConfig( | 131 | vits = OfflineTtsVitsModelConfig( |
| 124 | model = "$modelDir/$modelName", | 132 | model = "$modelDir/$modelName", |
| 125 | - lexicon = "$modelDir/lexicon.txt", | ||
| 126 | - tokens = "$modelDir/tokens.txt" | 133 | + lexicon = "$modelDir/$lexicon", |
| 134 | + tokens = "$modelDir/tokens.txt", | ||
| 135 | + dataDir = "$dataDir" | ||
| 127 | ), | 136 | ), |
| 128 | numThreads = 2, | 137 | numThreads = 2, |
| 129 | debug = true, | 138 | debug = true, |
| 130 | provider = "cpu", | 139 | provider = "cpu", |
| 131 | ), | 140 | ), |
| 132 | - ruleFsts=ruleFsts, | 141 | + ruleFsts = ruleFsts, |
| 133 | ) | 142 | ) |
| 134 | } | 143 | } |
| @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | ||
| 92 | make -j4 | 92 | make -j4 |
| 93 | make install/strip | 93 | make install/strip |
| 94 | cp -fv android-onnxruntime-libs/jni/arm64-v8a/libonnxruntime.so install/lib | 94 | cp -fv android-onnxruntime-libs/jni/arm64-v8a/libonnxruntime.so install/lib |
| 95 | +rm -rf install/lib/pkgconfig |
| @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | ||
| 92 | make -j4 | 92 | make -j4 |
| 93 | make install/strip | 93 | make install/strip |
| 94 | cp -fv android-onnxruntime-libs/jni/armeabi-v7a/libonnxruntime.so install/lib | 94 | cp -fv android-onnxruntime-libs/jni/armeabi-v7a/libonnxruntime.so install/lib |
| 95 | +rm -rf install/lib/pkgconfig |
| @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | ||
| 94 | make -j4 | 94 | make -j4 |
| 95 | make install/strip | 95 | make install/strip |
| 96 | cp -fv android-onnxruntime-libs/jni/x86_64/libonnxruntime.so install/lib | 96 | cp -fv android-onnxruntime-libs/jni/x86_64/libonnxruntime.so install/lib |
| 97 | +rm -rf install/lib/pkgconfig |
| @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" | ||
| 94 | make -j4 | 94 | make -j4 |
| 95 | make install/strip | 95 | make install/strip |
| 96 | cp -fv android-onnxruntime-libs/jni/x86/libonnxruntime.so install/lib | 96 | cp -fv android-onnxruntime-libs/jni/x86/libonnxruntime.so install/lib |
| 97 | +rm -rf install/lib/pkgconfig |
| @@ -140,7 +140,8 @@ echo "Generate xcframework" | @@ -140,7 +140,8 @@ echo "Generate xcframework" | ||
| 140 | 140 | ||
| 141 | mkdir -p "build/simulator/lib" | 141 | mkdir -p "build/simulator/lib" |
| 142 | for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \ | 142 | for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \ |
| 143 | - libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a; do | 143 | + libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \ |
| 144 | + libucd.a libpiper_phonemize.a libespeak-ng.a; do | ||
| 144 | lipo -create build/simulator_arm64/lib/${f} \ | 145 | lipo -create build/simulator_arm64/lib/${f} \ |
| 145 | build/simulator_x86_64/lib/${f} \ | 146 | build/simulator_x86_64/lib/${f} \ |
| 146 | -output build/simulator/lib/${f} | 147 | -output build/simulator/lib/${f} |
| @@ -154,7 +155,10 @@ libtool -static -o build/simulator/sherpa-onnx.a \ | @@ -154,7 +155,10 @@ libtool -static -o build/simulator/sherpa-onnx.a \ | ||
| 154 | build/simulator/lib/libsherpa-onnx-core.a \ | 155 | build/simulator/lib/libsherpa-onnx-core.a \ |
| 155 | build/simulator/lib/libsherpa-onnx-fst.a \ | 156 | build/simulator/lib/libsherpa-onnx-fst.a \ |
| 156 | build/simulator/lib/libsherpa-onnx-kaldifst-core.a \ | 157 | build/simulator/lib/libsherpa-onnx-kaldifst-core.a \ |
| 157 | - build/simulator/lib/libkaldi-decoder-core.a | 158 | + build/simulator/lib/libkaldi-decoder-core.a \ |
| 159 | + build/simulator/lib/libucd.a \ | ||
| 160 | + build/simulator/lib/libpiper_phonemize.a \ | ||
| 161 | + build/simulator/lib/libespeak-ng.a \ | ||
| 158 | 162 | ||
| 159 | libtool -static -o build/os64/sherpa-onnx.a \ | 163 | libtool -static -o build/os64/sherpa-onnx.a \ |
| 160 | build/os64/lib/libkaldi-native-fbank-core.a \ | 164 | build/os64/lib/libkaldi-native-fbank-core.a \ |
| @@ -162,7 +166,10 @@ libtool -static -o build/os64/sherpa-onnx.a \ | @@ -162,7 +166,10 @@ libtool -static -o build/os64/sherpa-onnx.a \ | ||
| 162 | build/os64/lib/libsherpa-onnx-core.a \ | 166 | build/os64/lib/libsherpa-onnx-core.a \ |
| 163 | build/os64/lib/libsherpa-onnx-fst.a \ | 167 | build/os64/lib/libsherpa-onnx-fst.a \ |
| 164 | build/os64/lib/libsherpa-onnx-kaldifst-core.a \ | 168 | build/os64/lib/libsherpa-onnx-kaldifst-core.a \ |
| 165 | - build/os64/lib/libkaldi-decoder-core.a | 169 | + build/os64/lib/libkaldi-decoder-core.a \ |
| 170 | + build/os64/lib/libucd.a \ | ||
| 171 | + build/os64/lib/libpiper_phonemize.a \ | ||
| 172 | + build/os64/lib/libespeak-ng.a \ | ||
| 166 | 173 | ||
| 167 | 174 | ||
| 168 | rm -rf sherpa-onnx.xcframework | 175 | rm -rf sherpa-onnx.xcframework |
| @@ -29,4 +29,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \ | @@ -29,4 +29,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \ | ||
| 29 | ./install/lib/libkaldi-native-fbank-core.a \ | 29 | ./install/lib/libkaldi-native-fbank-core.a \ |
| 30 | ./install/lib/libsherpa-onnx-fst.a \ | 30 | ./install/lib/libsherpa-onnx-fst.a \ |
| 31 | ./install/lib/libsherpa-onnx-kaldifst-core.a \ | 31 | ./install/lib/libsherpa-onnx-kaldifst-core.a \ |
| 32 | - ./install/lib/libkaldi-decoder-core.a | 32 | + ./install/lib/libkaldi-decoder-core.a \ |
| 33 | + ./install/lib/libucd.a \ | ||
| 34 | + ./install/lib/libpiper_phonemize.a \ | ||
| 35 | + ./install/lib/libespeak-ng.a |
| @@ -65,6 +65,29 @@ static struct cag_option options[] = { | @@ -65,6 +65,29 @@ static struct cag_option options[] = { | ||
| 65 | .identifier = 'a', | 65 | .identifier = 'a', |
| 66 | .description = | 66 | .description = |
| 67 | "Filename to save the generated audio. Default to ./generated.wav"}, | 67 | "Filename to save the generated audio. Default to ./generated.wav"}, |
| 68 | + | ||
| 69 | + {.access_name = "tts-rule-fsts", | ||
| 70 | + .value_name = "/path/to/rule.fst", | ||
| 71 | + .identifier = 'b', | ||
| 72 | + .description = "It not empty, it contains a list of rule FST filenames." | ||
| 73 | + "Multiple filenames are separated by a comma and they are " | ||
| 74 | + "applied from left to right. An example value: " | ||
| 75 | + "rule1.fst,rule2,fst,rule3.fst"}, | ||
| 76 | + | ||
| 77 | + {.access_name = "max-num-sentences", | ||
| 78 | + .value_name = "2", | ||
| 79 | + .identifier = 'c', | ||
| 80 | + .description = "Maximum number of sentences that we process at a time. " | ||
| 81 | + "This is to avoid OOM for very long input text. " | ||
| 82 | + "If you set it to -1, then we process all sentences in a " | ||
| 83 | + "single batch."}, | ||
| 84 | + | ||
| 85 | + {.access_name = "vits-data-dir", | ||
| 86 | + .value_name = "/path/to/espeak-ng-data", | ||
| 87 | + .identifier = 'd', | ||
| 88 | + .description = | ||
| 89 | + "Path to espeak-ng-data. If it is given, --vits-lexicon is ignored"}, | ||
| 90 | + | ||
| 68 | }; | 91 | }; |
| 69 | 92 | ||
| 70 | static void ShowUsage() { | 93 | static void ShowUsage() { |
| @@ -163,15 +186,38 @@ int32_t main(int32_t argc, char *argv[]) { | @@ -163,15 +186,38 @@ int32_t main(int32_t argc, char *argv[]) { | ||
| 163 | free((void *)filename); | 186 | free((void *)filename); |
| 164 | filename = strdup(value); | 187 | filename = strdup(value); |
| 165 | break; | 188 | break; |
| 189 | + case 'b': | ||
| 190 | + config.rule_fsts = value; | ||
| 191 | + break; | ||
| 192 | + case 'c': | ||
| 193 | + config.max_num_sentences = atoi(value); | ||
| 194 | + break; | ||
| 195 | + case 'd': | ||
| 196 | + config.model.vits.data_dir = value; | ||
| 197 | + break; | ||
| 198 | + case '?': | ||
| 199 | + fprintf(stderr, "Unknown option\n"); | ||
| 200 | + // fall through | ||
| 166 | case 'h': | 201 | case 'h': |
| 167 | // fall through | 202 | // fall through |
| 168 | default: | 203 | default: |
| 169 | ShowUsage(); | 204 | ShowUsage(); |
| 170 | } | 205 | } |
| 171 | } | 206 | } |
| 207 | + fprintf(stderr, "here\n"); | ||
| 208 | + | ||
| 209 | + if (!config.model.vits.model) { | ||
| 210 | + fprintf(stderr, "Please provide --vits-model\n"); | ||
| 211 | + ShowUsage(); | ||
| 212 | + } | ||
| 213 | + | ||
| 214 | + if (!config.model.vits.tokens) { | ||
| 215 | + fprintf(stderr, "Please provide --vits-tokens\n"); | ||
| 216 | + ShowUsage(); | ||
| 217 | + } | ||
| 172 | 218 | ||
| 173 | - if (!config.model.vits.model || !config.model.vits.lexicon || | ||
| 174 | - !config.model.vits.tokens) { | 219 | + if (!config.model.vits.data_dir && !config.model.vits.lexicon) { |
| 220 | + fprintf(stderr, "Please provide --vits-data-dir or --vits-lexicon\n"); | ||
| 175 | ShowUsage(); | 221 | ShowUsage(); |
| 176 | } | 222 | } |
| 177 | 223 |
| @@ -73,6 +73,10 @@ class BuildExtension(build_ext): | @@ -73,6 +73,10 @@ class BuildExtension(build_ext): | ||
| 73 | 73 | ||
| 74 | extra_cmake_args = f" -DCMAKE_INSTALL_PREFIX={install_dir} " | 74 | extra_cmake_args = f" -DCMAKE_INSTALL_PREFIX={install_dir} " |
| 75 | extra_cmake_args += " -DBUILD_SHARED_LIBS=ON " | 75 | extra_cmake_args += " -DBUILD_SHARED_LIBS=ON " |
| 76 | + extra_cmake_args += " -DBUILD_PIPER_PHONMIZE_EXE=OFF " | ||
| 77 | + extra_cmake_args += " -DBUILD_PIPER_PHONMIZE_TESTS=OFF " | ||
| 78 | + extra_cmake_args += " -DBUILD_ESPEAK_NG_EXE=OFF " | ||
| 79 | + extra_cmake_args += " -DBUILD_ESPEAK_NG_TESTS=OFF " | ||
| 76 | 80 | ||
| 77 | extra_cmake_args += " -DSHERPA_ONNX_ENABLE_CHECK=OFF " | 81 | extra_cmake_args += " -DSHERPA_ONNX_ENABLE_CHECK=OFF " |
| 78 | extra_cmake_args += " -DSHERPA_ONNX_ENABLE_PYTHON=ON " | 82 | extra_cmake_args += " -DSHERPA_ONNX_ENABLE_PYTHON=ON " |
| @@ -146,6 +150,9 @@ class BuildExtension(build_ext): | @@ -146,6 +150,9 @@ class BuildExtension(build_ext): | ||
| 146 | binaries += ["sherpa-onnx-core.dll"] | 150 | binaries += ["sherpa-onnx-core.dll"] |
| 147 | binaries += ["sherpa-onnx-portaudio.dll"] | 151 | binaries += ["sherpa-onnx-portaudio.dll"] |
| 148 | binaries += ["onnxruntime.dll"] | 152 | binaries += ["onnxruntime.dll"] |
| 153 | + binaries += ["piper_phonemize.dll"] | ||
| 154 | + binaries += ["espeak-ng.dll"] | ||
| 155 | + binaries += ["ucd.dll"] | ||
| 149 | binaries += ["kaldi-decoder-core.dll"] | 156 | binaries += ["kaldi-decoder-core.dll"] |
| 150 | binaries += ["sherpa-onnx-fst.lib"] | 157 | binaries += ["sherpa-onnx-fst.lib"] |
| 151 | binaries += ["sherpa-onnx-kaldifst-core.lib"] | 158 | binaries += ["sherpa-onnx-kaldifst-core.lib"] |
| @@ -161,5 +168,8 @@ class BuildExtension(build_ext): | @@ -161,5 +168,8 @@ class BuildExtension(build_ext): | ||
| 161 | shutil.copy(f"{src_file}", f"{out_bin_dir}/") | 168 | shutil.copy(f"{src_file}", f"{out_bin_dir}/") |
| 162 | 169 | ||
| 163 | shutil.rmtree(f"{install_dir}/bin") | 170 | shutil.rmtree(f"{install_dir}/bin") |
| 171 | + shutil.rmtree(f"{install_dir}/share") | ||
| 172 | + shutil.rmtree(f"{install_dir}/lib/pkgconfig") | ||
| 173 | + | ||
| 164 | if is_windows(): | 174 | if is_windows(): |
| 165 | shutil.rmtree(f"{install_dir}/lib") | 175 | shutil.rmtree(f"{install_dir}/lib") |
| @@ -86,7 +86,7 @@ function(download_espeak_ng_for_piper) | @@ -86,7 +86,7 @@ function(download_espeak_ng_for_piper) | ||
| 86 | -Wno-unused-result | 86 | -Wno-unused-result |
| 87 | -Wno-format-overflow | 87 | -Wno-format-overflow |
| 88 | -Wno-format-truncation | 88 | -Wno-format-truncation |
| 89 | - -Wno-maybe-uninitialized | 89 | + -Wno-uninitialized |
| 90 | -Wno-format | 90 | -Wno-format |
| 91 | ) | 91 | ) |
| 92 | 92 |
| @@ -13,4 +13,4 @@ Cflags: -I"${includedir}" | @@ -13,4 +13,4 @@ Cflags: -I"${includedir}" | ||
| 13 | # Note: -lcargs is required only for the following file | 13 | # Note: -lcargs is required only for the following file |
| 14 | # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c | 14 | # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c |
| 15 | # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c | 15 | # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c |
| 16 | -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lcargs -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ | 16 | +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ |
| @@ -40,7 +40,7 @@ | @@ -40,7 +40,7 @@ | ||
| 40 | /* End PBXContainerItemProxy section */ | 40 | /* End PBXContainerItemProxy section */ |
| 41 | 41 | ||
| 42 | /* Begin PBXFileReference section */ | 42 | /* Begin PBXFileReference section */ |
| 43 | - C93989AF2A89FE33009AB859 /* onnxruntime.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = onnxruntime.xcframework; path = "../../build-ios/ios-onnxruntime/1.16.0/onnxruntime.xcframework"; sourceTree = "<group>"; }; | 43 | + C93989AF2A89FE33009AB859 /* onnxruntime.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = onnxruntime.xcframework; path = "../../build-ios/ios-onnxruntime/1.16.3/onnxruntime.xcframework"; sourceTree = "<group>"; }; |
| 44 | C93989B12A89FF78009AB859 /* decoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = decoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"; sourceTree = "<group>"; }; | 44 | C93989B12A89FF78009AB859 /* decoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = decoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"; sourceTree = "<group>"; }; |
| 45 | C93989B22A89FF78009AB859 /* encoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = encoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx"; sourceTree = "<group>"; }; | 45 | C93989B22A89FF78009AB859 /* encoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = encoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx"; sourceTree = "<group>"; }; |
| 46 | C93989B32A89FF78009AB859 /* tokens.txt */ = {isa = PBXFileReference; lastKnownFileType = text; name = tokens.txt; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt"; sourceTree = "<group>"; }; | 46 | C93989B32A89FF78009AB859 /* tokens.txt */ = {isa = PBXFileReference; lastKnownFileType = text; name = tokens.txt; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt"; sourceTree = "<group>"; }; |
| @@ -65,7 +65,7 @@ struct ContentView: View { | @@ -65,7 +65,7 @@ struct ContentView: View { | ||
| 65 | self.filename = tempDirectoryURL.appendingPathComponent("test.wav") | 65 | self.filename = tempDirectoryURL.appendingPathComponent("test.wav") |
| 66 | } | 66 | } |
| 67 | 67 | ||
| 68 | - let ret = audio.save(filename: filename.path) | 68 | + let _ = audio.save(filename: filename.path) |
| 69 | 69 | ||
| 70 | self.audioPlayer = try! AVAudioPlayer(contentsOf: filename) | 70 | self.audioPlayer = try! AVAudioPlayer(contentsOf: filename) |
| 71 | self.audioPlayer.play() | 71 | self.audioPlayer.play() |
| @@ -7,6 +7,12 @@ | @@ -7,6 +7,12 @@ | ||
| 7 | 7 | ||
| 8 | import Foundation | 8 | import Foundation |
| 9 | 9 | ||
| 10 | + | ||
| 11 | +// used to get the path to espeak-ng-data | ||
| 12 | +func resourceURL(to path: String) -> String { | ||
| 13 | + return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path | ||
| 14 | +} | ||
| 15 | + | ||
| 10 | func getResource(_ forResource: String, _ ofType: String) -> String { | 16 | func getResource(_ forResource: String, _ ofType: String) -> String { |
| 11 | let path = Bundle.main.path(forResource: forResource, ofType: ofType) | 17 | let path = Bundle.main.path(forResource: forResource, ofType: ofType) |
| 12 | precondition( | 18 | precondition( |
| @@ -59,8 +65,30 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper { | @@ -59,8 +65,30 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper { | ||
| 59 | return SherpaOnnxOfflineTtsWrapper(config: &config) | 65 | return SherpaOnnxOfflineTtsWrapper(config: &config) |
| 60 | } | 66 | } |
| 61 | 67 | ||
| 68 | +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | ||
| 69 | +func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper { | ||
| 70 | + // please see https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | ||
| 71 | + | ||
| 72 | + // vits-vctk.onnx | ||
| 73 | + let model = getResource("en_US-amy-low", "onnx") | ||
| 74 | + | ||
| 75 | + // tokens.txt | ||
| 76 | + let tokens = getResource("tokens", "txt") | ||
| 77 | + | ||
| 78 | + // in this case, we don't need lexicon.txt | ||
| 79 | + let dataDir = resourceURL(to: "espeak-ng-data") | ||
| 80 | + | ||
| 81 | + let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: "", tokens: tokens, dataDir: dataDir) | ||
| 82 | + let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) | ||
| 83 | + var config = sherpaOnnxOfflineTtsConfig(model: modelConfig) | ||
| 84 | + | ||
| 85 | + return SherpaOnnxOfflineTtsWrapper(config: &config) | ||
| 86 | +} | ||
| 87 | + | ||
| 62 | func createOfflineTts() -> SherpaOnnxOfflineTtsWrapper { | 88 | func createOfflineTts() -> SherpaOnnxOfflineTtsWrapper { |
| 63 | - return getTtsForVCTK() | 89 | + return getTtsFor_en_US_amy_low() |
| 90 | + | ||
| 91 | + // return getTtsForVCTK() | ||
| 64 | 92 | ||
| 65 | // return getTtsForAishell3() | 93 | // return getTtsForAishell3() |
| 66 | 94 |
| @@ -8,20 +8,22 @@ fun main() { | @@ -8,20 +8,22 @@ fun main() { | ||
| 8 | } | 8 | } |
| 9 | 9 | ||
| 10 | fun testTts() { | 10 | fun testTts() { |
| 11 | + // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | ||
| 12 | + // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | ||
| 11 | var config = OfflineTtsConfig( | 13 | var config = OfflineTtsConfig( |
| 12 | model=OfflineTtsModelConfig( | 14 | model=OfflineTtsModelConfig( |
| 13 | vits=OfflineTtsVitsModelConfig( | 15 | vits=OfflineTtsVitsModelConfig( |
| 14 | - model="./vits-zh-aishell3/vits-aishell3.onnx", | ||
| 15 | - lexicon="./vits-zh-aishell3/lexicon.txt", | ||
| 16 | - tokens="./vits-zh-aishell3/tokens.txt", | 16 | + model="./vits-piper-en_US-amy-low/en_US-amy-low.onnx", |
| 17 | + tokens="./vits-piper-en_US-amy-low/tokens.txt", | ||
| 18 | + dataDir="./vits-piper-en_US-amy-low/espeak-ng-data", | ||
| 17 | ), | 19 | ), |
| 18 | numThreads=1, | 20 | numThreads=1, |
| 19 | debug=true, | 21 | debug=true, |
| 20 | ) | 22 | ) |
| 21 | ) | 23 | ) |
| 22 | val tts = OfflineTts(config=config) | 24 | val tts = OfflineTts(config=config) |
| 23 | - val audio = tts.generate(text="林美丽最美丽!", sid=99, speed=1.2f) | ||
| 24 | - audio.save(filename="99.wav") | 25 | + val audio = tts.generate(text="“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”") |
| 26 | + audio.save(filename="test-en.wav") | ||
| 25 | } | 27 | } |
| 26 | 28 | ||
| 27 | fun testAsr() { | 29 | fun testAsr() { |
| @@ -34,9 +34,10 @@ if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then | @@ -34,9 +34,10 @@ if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then | ||
| 34 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21 | 34 | git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21 |
| 35 | fi | 35 | fi |
| 36 | 36 | ||
| 37 | -if [ ! -f ./vits-zh-aishell3/tokens.txt ]; then | ||
| 38 | - git lfs install | ||
| 39 | - git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 | 37 | +if [ ! -f ./vits-piper-en_US-amy-low/en_US-amy-low.onnx ]; then |
| 38 | + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | ||
| 39 | + tar xf vits-piper-en_US-amy-low.tar.bz2 | ||
| 40 | + rm vits-piper-en_US-amy-low.tar.bz2 | ||
| 40 | fi | 41 | fi |
| 41 | 42 | ||
| 42 | kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt Tts.kt | 43 | kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt Tts.kt |
| @@ -42,15 +42,14 @@ In the following, we demonstrate how to run text-to-speech. | @@ -42,15 +42,14 @@ In the following, we demonstrate how to run text-to-speech. | ||
| 42 | ## ./test-offline-tts-en.js | 42 | ## ./test-offline-tts-en.js |
| 43 | 43 | ||
| 44 | [./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use | 44 | [./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use |
| 45 | -a VITS pretrained model | ||
| 46 | -[VCTK](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers) | 45 | +[vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2) |
| 47 | for text-to-speech. | 46 | for text-to-speech. |
| 48 | 47 | ||
| 49 | You can use the following command to run it: | 48 | You can use the following command to run it: |
| 50 | 49 | ||
| 51 | ```bash | 50 | ```bash |
| 52 | -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 | ||
| 53 | -tar xvf vits-vctk.tar.bz2 | 51 | +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 52 | +tar xvf vits-piper-en_US-amy-low.tar.bz2 | ||
| 54 | node ./test-offline-tts-en.js | 53 | node ./test-offline-tts-en.js |
| 55 | ``` | 54 | ``` |
| 56 | 55 |
| @@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx'); | @@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx'); | ||
| 4 | 4 | ||
| 5 | function createOfflineTts() { | 5 | function createOfflineTts() { |
| 6 | const vits = new sherpa_onnx.OfflineTtsVitsModelConfig(); | 6 | const vits = new sherpa_onnx.OfflineTtsVitsModelConfig(); |
| 7 | - vits.model = './vits-vctk/vits-vctk.onnx'; | ||
| 8 | - vits.lexicon = './vits-vctk/lexicon.txt'; | ||
| 9 | - vits.tokens = './vits-vctk/tokens.txt'; | 7 | + vits.model = 'vits-piper-en_US-amy-low/en_US-amy-low.onnx' |
| 8 | + vits.tokens = './vits-piper-en_US-amy-low/tokens.txt'; | ||
| 9 | + vits.dataDir = './vits-piper-en_US-amy-low/espeak-ng-data' | ||
| 10 | 10 | ||
| 11 | const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); | 11 | const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); |
| 12 | modelConfig.vits = vits; | 12 | modelConfig.vits = vits; |
| @@ -18,10 +18,11 @@ function createOfflineTts() { | @@ -18,10 +18,11 @@ function createOfflineTts() { | ||
| 18 | } | 18 | } |
| 19 | 19 | ||
| 20 | const tts = createOfflineTts(); | 20 | const tts = createOfflineTts(); |
| 21 | -const speakerId = 99; | 21 | +const speakerId = 0; |
| 22 | const speed = 1.0; | 22 | const speed = 1.0; |
| 23 | -const audio = | ||
| 24 | - tts.generate('Good morning. How are you doing?', speakerId, speed); | 23 | +const audio = tts.generate( |
| 24 | + '“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”', | ||
| 25 | + speakerId, speed); | ||
| 25 | audio.save('./test-en.wav'); | 26 | audio.save('./test-en.wav'); |
| 26 | console.log('Saved to test-en.wav successfully.'); | 27 | console.log('Saved to test-en.wav successfully.'); |
| 27 | tts.free(); | 28 | tts.free(); |
| @@ -63,16 +63,26 @@ def get_args(): | @@ -63,16 +63,26 @@ def get_args(): | ||
| 63 | parser.add_argument( | 63 | parser.add_argument( |
| 64 | "--vits-lexicon", | 64 | "--vits-lexicon", |
| 65 | type=str, | 65 | type=str, |
| 66 | + default="", | ||
| 66 | help="Path to lexicon.txt", | 67 | help="Path to lexicon.txt", |
| 67 | ) | 68 | ) |
| 68 | 69 | ||
| 69 | parser.add_argument( | 70 | parser.add_argument( |
| 70 | "--vits-tokens", | 71 | "--vits-tokens", |
| 71 | type=str, | 72 | type=str, |
| 73 | + default="", | ||
| 72 | help="Path to tokens.txt", | 74 | help="Path to tokens.txt", |
| 73 | ) | 75 | ) |
| 74 | 76 | ||
| 75 | parser.add_argument( | 77 | parser.add_argument( |
| 78 | + "--vits-data-dir", | ||
| 79 | + type=str, | ||
| 80 | + default="", | ||
| 81 | + help="""Path to the dict director of espeak-ng. If it is specified, | ||
| 82 | + --vits-lexicon and --vits-tokens are ignored""", | ||
| 83 | + ) | ||
| 84 | + | ||
| 85 | + parser.add_argument( | ||
| 76 | "--tts-rule-fsts", | 86 | "--tts-rule-fsts", |
| 77 | type=str, | 87 | type=str, |
| 78 | default="", | 88 | default="", |
| @@ -80,6 +90,17 @@ def get_args(): | @@ -80,6 +90,17 @@ def get_args(): | ||
| 80 | ) | 90 | ) |
| 81 | 91 | ||
| 82 | parser.add_argument( | 92 | parser.add_argument( |
| 93 | + "--max-num-sentences", | ||
| 94 | + type=int, | ||
| 95 | + default=2, | ||
| 96 | + help="""Max number of sentences in a batch to avoid OOM if the input | ||
| 97 | + text is very long. Set it to -1 to process all the sentences in a | ||
| 98 | + single batch. A smaller value does not mean it is slower compared | ||
| 99 | + to a larger one on CPU. | ||
| 100 | + """, | ||
| 101 | + ) | ||
| 102 | + | ||
| 103 | + parser.add_argument( | ||
| 83 | "--output-filename", | 104 | "--output-filename", |
| 84 | type=str, | 105 | type=str, |
| 85 | default="./generated.wav", | 106 | default="./generated.wav", |
| @@ -142,14 +163,19 @@ def main(): | @@ -142,14 +163,19 @@ def main(): | ||
| 142 | vits=sherpa_onnx.OfflineTtsVitsModelConfig( | 163 | vits=sherpa_onnx.OfflineTtsVitsModelConfig( |
| 143 | model=args.vits_model, | 164 | model=args.vits_model, |
| 144 | lexicon=args.vits_lexicon, | 165 | lexicon=args.vits_lexicon, |
| 166 | + data_dir=args.vits_data_dir, | ||
| 145 | tokens=args.vits_tokens, | 167 | tokens=args.vits_tokens, |
| 146 | ), | 168 | ), |
| 147 | provider=args.provider, | 169 | provider=args.provider, |
| 148 | debug=args.debug, | 170 | debug=args.debug, |
| 149 | num_threads=args.num_threads, | 171 | num_threads=args.num_threads, |
| 150 | ), | 172 | ), |
| 151 | - rule_fsts=args.tts_rule_fsts | 173 | + rule_fsts=args.tts_rule_fsts, |
| 174 | + max_num_sentences=args.max_num_sentences, | ||
| 152 | ) | 175 | ) |
| 176 | + if not tts_config.validate(): | ||
| 177 | + raise ValueError("Please check your config") | ||
| 178 | + | ||
| 153 | tts = sherpa_onnx.OfflineTts(tts_config) | 179 | tts = sherpa_onnx.OfflineTts(tts_config) |
| 154 | 180 | ||
| 155 | start = time.time() | 181 | start = time.time() |
| @@ -37,13 +37,9 @@ model_dir={{ tts_model.model_dir }} | @@ -37,13 +37,9 @@ model_dir={{ tts_model.model_dir }} | ||
| 37 | model_name={{ tts_model.model_name }} | 37 | model_name={{ tts_model.model_name }} |
| 38 | lang={{ tts_model.lang }} | 38 | lang={{ tts_model.lang }} |
| 39 | 39 | ||
| 40 | -mkdir $model_dir | ||
| 41 | -cd $model_dir | ||
| 42 | -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/$model_name | ||
| 43 | -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/lexicon.txt | ||
| 44 | -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/tokens.txt | ||
| 45 | -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/MODEL_CARD 2>/dev/null || true | ||
| 46 | -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/rule.fst 2>/dev/null || true | 40 | +wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2 |
| 41 | +tar xf $model_dir.tar.bz2 | ||
| 42 | +rm $model_dir.tar.bz2 | ||
| 47 | 43 | ||
| 48 | popd | 44 | popd |
| 49 | # Now we are at the project root directory | 45 | # Now we are at the project root directory |
| @@ -52,11 +48,19 @@ git checkout . | @@ -52,11 +48,19 @@ git checkout . | ||
| 52 | pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx | 48 | pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx |
| 53 | sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt | 49 | sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt |
| 54 | sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt | 50 | sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt |
| 51 | + | ||
| 55 | {% if tts_model.rule_fsts %} | 52 | {% if tts_model.rule_fsts %} |
| 56 | rule_fsts={{ tts_model.rule_fsts }} | 53 | rule_fsts={{ tts_model.rule_fsts }} |
| 57 | sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt | 54 | sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt |
| 58 | {% endif %} | 55 | {% endif %} |
| 59 | 56 | ||
| 57 | +{% if tts_model.data_dir %} | ||
| 58 | + data_dir={{ tts_model.data_dir }} | ||
| 59 | + sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt | ||
| 60 | +{% else %} | ||
| 61 | + sed -i.bak s/"lexicon = null"/"lexicon = \"lexicon.txt\""/ ./MainActivity.kt | ||
| 62 | +{% endif %} | ||
| 63 | + | ||
| 60 | git diff | 64 | git diff |
| 61 | popd | 65 | popd |
| 62 | 66 |
| @@ -27,9 +27,122 @@ def get_args(): | @@ -27,9 +27,122 @@ def get_args(): | ||
| 27 | @dataclass | 27 | @dataclass |
| 28 | class TtsModel: | 28 | class TtsModel: |
| 29 | model_dir: str | 29 | model_dir: str |
| 30 | - model_name: str | ||
| 31 | - lang: str # en, zh, fr, de, etc. | 30 | + model_name: str = "" |
| 31 | + lang: str = "" # en, zh, fr, de, etc. | ||
| 32 | rule_fsts: Optional[List[str]] = None | 32 | rule_fsts: Optional[List[str]] = None |
| 33 | + data_dir: Optional[str] = None | ||
| 34 | + | ||
| 35 | + | ||
| 36 | +def get_piper_models() -> List[TtsModel]: | ||
| 37 | + models = [ | ||
| 38 | + TtsModel(model_dir="vits-piper-ar_JO-kareem-low"), | ||
| 39 | + TtsModel(model_dir="vits-piper-ar_JO-kareem-medium"), | ||
| 40 | + TtsModel(model_dir="vits-piper-ca_ES-upc_ona-medium"), | ||
| 41 | + TtsModel(model_dir="vits-piper-ca_ES-upc_ona-x_low"), | ||
| 42 | + TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"), | ||
| 43 | + TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"), | ||
| 44 | + TtsModel(model_dir="vits-piper-cs_CZ-jirka-medium"), | ||
| 45 | + TtsModel(model_dir="vits-piper-da_DK-talesyntese-medium"), | ||
| 46 | + TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low"), | ||
| 47 | + TtsModel(model_dir="vits-piper-de_DE-karlsson-low"), | ||
| 48 | + TtsModel(model_dir="vits-piper-de_DE-kerstin-low"), | ||
| 49 | + TtsModel(model_dir="vits-piper-de_DE-pavoque-low"), | ||
| 50 | + TtsModel(model_dir="vits-piper-de_DE-ramona-low"), | ||
| 51 | + TtsModel(model_dir="vits-piper-de_DE-thorsten-high"), | ||
| 52 | + TtsModel(model_dir="vits-piper-de_DE-thorsten-low"), | ||
| 53 | + TtsModel(model_dir="vits-piper-de_DE-thorsten-medium"), | ||
| 54 | + TtsModel(model_dir="vits-piper-de_DE-thorsten_emotional-medium"), | ||
| 55 | + TtsModel(model_dir="vits-piper-el_GR-rapunzelina-low"), | ||
| 56 | + TtsModel(model_dir="vits-piper-en_GB-alan-low"), | ||
| 57 | + TtsModel(model_dir="vits-piper-en_GB-alan-medium"), | ||
| 58 | + TtsModel(model_dir="vits-piper-en_GB-alba-medium"), | ||
| 59 | + TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium"), | ||
| 60 | + TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium"), | ||
| 61 | + TtsModel(model_dir="vits-piper-en_GB-semaine-medium"), | ||
| 62 | + TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low"), | ||
| 63 | + TtsModel(model_dir="vits-piper-en_GB-sweetbbak-amy"), | ||
| 64 | + TtsModel(model_dir="vits-piper-en_GB-vctk-medium"), | ||
| 65 | + TtsModel(model_dir="vits-piper-en_US-amy-low"), | ||
| 66 | + TtsModel(model_dir="vits-piper-en_US-amy-medium"), | ||
| 67 | + TtsModel(model_dir="vits-piper-en_US-arctic-medium"), | ||
| 68 | + TtsModel(model_dir="vits-piper-en_US-danny-low"), | ||
| 69 | + TtsModel(model_dir="vits-piper-en_US-hfc_male-medium"), | ||
| 70 | + TtsModel(model_dir="vits-piper-en_US-joe-medium"), | ||
| 71 | + TtsModel(model_dir="vits-piper-en_US-kathleen-low"), | ||
| 72 | + TtsModel(model_dir="vits-piper-en_US-kusal-medium"), | ||
| 73 | + TtsModel(model_dir="vits-piper-en_US-l2arctic-medium"), | ||
| 74 | + TtsModel(model_dir="vits-piper-en_US-lessac-high"), | ||
| 75 | + TtsModel(model_dir="vits-piper-en_US-lessac-low"), | ||
| 76 | + TtsModel(model_dir="vits-piper-en_US-lessac-medium"), | ||
| 77 | + TtsModel(model_dir="vits-piper-en_US-libritts-high"), | ||
| 78 | + TtsModel(model_dir="vits-piper-en_US-libritts_r-medium"), | ||
| 79 | + TtsModel(model_dir="vits-piper-en_US-ryan-high"), | ||
| 80 | + TtsModel(model_dir="vits-piper-en_US-ryan-low"), | ||
| 81 | + TtsModel(model_dir="vits-piper-en_US-ryan-medium"), | ||
| 82 | + TtsModel(model_dir="vits-piper-es_ES-carlfm-x_low"), | ||
| 83 | + TtsModel(model_dir="vits-piper-es_ES-davefx-medium"), | ||
| 84 | + TtsModel(model_dir="vits-piper-es_ES-mls_10246-low"), | ||
| 85 | + TtsModel(model_dir="vits-piper-es_ES-mls_9972-low"), | ||
| 86 | + TtsModel(model_dir="vits-piper-es_ES-sharvard-medium"), | ||
| 87 | + TtsModel(model_dir="vits-piper-es_MX-ald-medium"), | ||
| 88 | + TtsModel(model_dir="vits-piper-fi_FI-harri-low"), | ||
| 89 | + TtsModel(model_dir="vits-piper-fi_FI-harri-medium"), | ||
| 90 | + TtsModel(model_dir="vits-piper-fr_FR-siwis-low"), | ||
| 91 | + TtsModel(model_dir="vits-piper-fr_FR-siwis-medium"), | ||
| 92 | + TtsModel(model_dir="vits-piper-fr_FR-upmc-medium"), | ||
| 93 | + TtsModel(model_dir="vits-piper-hu_HU-anna-medium"), | ||
| 94 | + TtsModel(model_dir="vits-piper-hu_HU-berta-medium"), | ||
| 95 | + TtsModel(model_dir="vits-piper-hu_HU-imre-medium"), | ||
| 96 | + TtsModel(model_dir="vits-piper-is_IS-bui-medium"), | ||
| 97 | + TtsModel(model_dir="vits-piper-is_IS-salka-medium"), | ||
| 98 | + TtsModel(model_dir="vits-piper-is_IS-steinn-medium"), | ||
| 99 | + TtsModel(model_dir="vits-piper-is_IS-ugla-medium"), | ||
| 100 | + TtsModel(model_dir="vits-piper-it_IT-riccardo-x_low"), | ||
| 101 | + TtsModel(model_dir="vits-piper-ka_GE-natia-medium"), | ||
| 102 | + TtsModel(model_dir="vits-piper-kk_KZ-iseke-x_low"), | ||
| 103 | + TtsModel(model_dir="vits-piper-kk_KZ-issai-high"), | ||
| 104 | + TtsModel(model_dir="vits-piper-kk_KZ-raya-x_low"), | ||
| 105 | + TtsModel(model_dir="vits-piper-lb_LU-marylux-medium"), | ||
| 106 | + TtsModel(model_dir="vits-piper-ne_NP-google-medium"), | ||
| 107 | + TtsModel(model_dir="vits-piper-ne_NP-google-x_low"), | ||
| 108 | + TtsModel(model_dir="vits-piper-nl_BE-nathalie-medium"), | ||
| 109 | + TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"), | ||
| 110 | + TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"), | ||
| 111 | + TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"), | ||
| 112 | + TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"), | ||
| 113 | + TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), | ||
| 114 | + TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"), | ||
| 115 | + TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"), | ||
| 116 | + TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"), | ||
| 117 | + TtsModel(model_dir="vits-piper-pl_PL-mc_speech-medium"), | ||
| 118 | + TtsModel(model_dir="vits-piper-pl_PL-mls_6892-low"), | ||
| 119 | + TtsModel(model_dir="vits-piper-pt_BR-edresson-low"), | ||
| 120 | + TtsModel(model_dir="vits-piper-pt_BR-faber-medium"), | ||
| 121 | + TtsModel(model_dir="vits-piper-pt_PT-tugao-medium"), | ||
| 122 | + TtsModel(model_dir="vits-piper-ro_RO-mihai-medium"), | ||
| 123 | + TtsModel(model_dir="vits-piper-ru_RU-denis-medium"), | ||
| 124 | + TtsModel(model_dir="vits-piper-ru_RU-dmitri-medium"), | ||
| 125 | + TtsModel(model_dir="vits-piper-ru_RU-irina-medium"), | ||
| 126 | + TtsModel(model_dir="vits-piper-ru_RU-ruslan-medium"), | ||
| 127 | + TtsModel(model_dir="vits-piper-sk_SK-lili-medium"), | ||
| 128 | + TtsModel(model_dir="vits-piper-sr_RS-serbski_institut-medium"), | ||
| 129 | + TtsModel(model_dir="vits-piper-sv_SE-nst-medium"), | ||
| 130 | + TtsModel(model_dir="vits-piper-sw_CD-lanfrica-medium"), | ||
| 131 | + TtsModel(model_dir="vits-piper-tr_TR-dfki-medium"), | ||
| 132 | + TtsModel(model_dir="vits-piper-tr_TR-fahrettin-medium"), | ||
| 133 | + TtsModel(model_dir="vits-piper-uk_UA-lada-x_low"), | ||
| 134 | + TtsModel(model_dir="vits-piper-uk_UA-ukrainian_tts-medium"), | ||
| 135 | + TtsModel(model_dir="vits-piper-vi_VN-25hours_single-low"), | ||
| 136 | + TtsModel(model_dir="vits-piper-vi_VN-vais1000-medium"), | ||
| 137 | + TtsModel(model_dir="vits-piper-vi_VN-vivos-x_low"), | ||
| 138 | + TtsModel(model_dir="vits-piper-zh_CN-huayan-medium"), | ||
| 139 | + ] | ||
| 140 | + for m in models: | ||
| 141 | + m.data_dir = m.model_dir + "/" + "espeak-ng-data" | ||
| 142 | + m.model_name = m.model_dir[len("vits-piper-") :] + ".onnx" | ||
| 143 | + m.lang = "en" | ||
| 144 | + | ||
| 145 | + return models | ||
| 33 | 146 | ||
| 34 | 147 | ||
| 35 | def get_all_models() -> List[TtsModel]: | 148 | def get_all_models() -> List[TtsModel]: |
| @@ -98,56 +211,6 @@ def get_all_models() -> List[TtsModel]: | @@ -98,56 +211,6 @@ def get_all_models() -> List[TtsModel]: | ||
| 98 | # English (US) | 211 | # English (US) |
| 99 | TtsModel(model_dir="vits-vctk", model_name="vits-vctk.onnx", lang="en"), | 212 | TtsModel(model_dir="vits-vctk", model_name="vits-vctk.onnx", lang="en"), |
| 100 | TtsModel(model_dir="vits-ljs", model_name="vits-ljs.onnx", lang="en"), | 213 | TtsModel(model_dir="vits-ljs", model_name="vits-ljs.onnx", lang="en"), |
| 101 | - TtsModel(model_dir="vits-piper-en_US-amy-low", model_name="en_US-amy-low.onnx", lang="en",), | ||
| 102 | - TtsModel(model_dir="vits-piper-en_US-amy-medium", model_name="en_US-amy-medium.onnx", lang="en",), | ||
| 103 | - TtsModel(model_dir="vits-piper-en_US-arctic-medium", model_name="en_US-arctic-medium.onnx", lang="en",), | ||
| 104 | - TtsModel(model_dir="vits-piper-en_US-danny-low", model_name="en_US-danny-low.onnx", lang="en",), | ||
| 105 | - TtsModel(model_dir="vits-piper-en_US-hfc_male-medium", model_name="en_US-hfc_male-medium.onnx", lang="en",), | ||
| 106 | - TtsModel(model_dir="vits-piper-en_US-joe-medium", model_name="en_US-joe-medium.onnx", lang="en",), | ||
| 107 | - TtsModel(model_dir="vits-piper-en_US-kathleen-low", model_name="en_US-kathleen-low.onnx", lang="en",), | ||
| 108 | - TtsModel(model_dir="vits-piper-en_US-kusal-medium", model_name="en_US-kusal-medium.onnx", lang="en",), | ||
| 109 | - TtsModel(model_dir="vits-piper-en_US-l2arctic-medium", model_name="en_US-l2arctic-medium.onnx", lang="en",), | ||
| 110 | - TtsModel(model_dir="vits-piper-en_US-lessac-low", model_name="en_US-lessac-low.onnx", lang="en",), | ||
| 111 | - TtsModel(model_dir="vits-piper-en_US-lessac-medium", model_name="en_US-lessac-medium.onnx", lang="en",), | ||
| 112 | - TtsModel(model_dir="vits-piper-en_US-lessac-high", model_name="en_US-lessac-high.onnx", lang="en",), | ||
| 113 | - TtsModel(model_dir="vits-piper-en_US-libritts-high", model_name="en_US-libritts-high.onnx", lang="en",), | ||
| 114 | - TtsModel(model_dir="vits-piper-en_US-libritts_r-medium", model_name="en_US-libritts_r-medium.onnx", lang="en",), | ||
| 115 | - TtsModel(model_dir="vits-piper-en_US-ryan-low", model_name="en_US-ryan-low.onnx", lang="en",), | ||
| 116 | - TtsModel(model_dir="vits-piper-en_US-ryan-medium", model_name="en_US-ryan-medium.onnx", lang="en",), | ||
| 117 | - TtsModel(model_dir="vits-piper-en_US-ryan-high", model_name="en_US-ryan-high.onnx", lang="en",), | ||
| 118 | - # English (GB) | ||
| 119 | - TtsModel(model_dir="vits-piper-en_GB-alan-low", model_name="en_GB-alan-low.onnx",lang="en",), | ||
| 120 | - TtsModel(model_dir="vits-piper-en_GB-alan-medium", model_name="en_GB-alan-medium.onnx",lang="en",), | ||
| 121 | - TtsModel(model_dir="vits-piper-en_GB-alba-medium", model_name="en_GB-alba-medium.onnx",lang="en",), | ||
| 122 | - TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium", model_name="en_GB-jenny_dioco-medium.onnx",lang="en",), | ||
| 123 | - TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium", model_name="en_GB-northern_english_male-medium.onnx",lang="en",), | ||
| 124 | - TtsModel(model_dir="vits-piper-en_GB-semaine-medium", model_name="en_GB-semaine-medium.onnx",lang="en",), | ||
| 125 | - TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low", model_name="en_GB-southern_english_female-low.onnx",lang="en",), | ||
| 126 | - TtsModel(model_dir="vits-piper-en_GB-vctk-medium", model_name="en_GB-vctk-medium.onnx",lang="en",), | ||
| 127 | - # German (DE) | ||
| 128 | - TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low", model_name="de_DE-eva_k-x_low.onnx",lang="de",), | ||
| 129 | - TtsModel(model_dir="vits-piper-de_DE-karlsson-low", model_name="de_DE-karlsson-low.onnx",lang="de",), | ||
| 130 | - TtsModel(model_dir="vits-piper-de_DE-kerstin-low", model_name="de_DE-kerstin-low.onnx",lang="de",), | ||
| 131 | - TtsModel(model_dir="vits-piper-de_DE-pavoque-low", model_name="de_DE-pavoque-low.onnx",lang="de",), | ||
| 132 | - TtsModel(model_dir="vits-piper-de_DE-ramona-low", model_name="de_DE-ramona-low.onnx",lang="de",), | ||
| 133 | - TtsModel(model_dir="vits-piper-de_DE-thorsten-low", model_name="de_DE-thorsten-low.onnx",lang="de",), | ||
| 134 | - TtsModel(model_dir="vits-piper-de_DE-thorsten-medium", model_name="de_DE-thorsten-medium.onnx",lang="de",), | ||
| 135 | - TtsModel(model_dir="vits-piper-de_DE-thorsten-high", model_name="de_DE-thorsten-high.onnx",lang="de",), | ||
| 136 | - TtsModel(model_dir="vits-piper-de_DE-thorsten_emotional-medium", model_name="de_DE-thorsten_emotional-medium.onnx",lang="de",), | ||
| 137 | - # French (FR) | ||
| 138 | - TtsModel(model_dir="vits-piper-fr_FR-upmc-medium", model_name="fr_FR-upmc-medium.onnx",lang="fr",), | ||
| 139 | - TtsModel(model_dir="vits-piper-fr_FR-siwis-low", model_name="fr_FR-siwis-low.onnx",lang="fr",), | ||
| 140 | - TtsModel(model_dir="vits-piper-fr_FR-siwis-medium", model_name="fr_FR-siwis-medium.onnx",lang="fr",), | ||
| 141 | - | ||
| 142 | - # Spanish (ES) | ||
| 143 | - TtsModel(model_dir="vits-piper-es_ES-carlfm-x_low", model_name="es_ES-carlfm-x_low.onnx",lang="es",), | ||
| 144 | - TtsModel(model_dir="vits-piper-es_ES-davefx-medium", model_name="es_ES-davefx-medium.onnx",lang="es",), | ||
| 145 | - TtsModel(model_dir="vits-piper-es_ES-mls_10246-low", model_name="es_ES-mls_10246-low.onnx",lang="es",), | ||
| 146 | - TtsModel(model_dir="vits-piper-es_ES-mls_9972-low", model_name="es_ES-mls_9972-low.onnx",lang="es",), | ||
| 147 | - TtsModel(model_dir="vits-piper-es_ES-sharvard-medium", model_name="es_ES-sharvard-medium.onnx",lang="es",), | ||
| 148 | - | ||
| 149 | - # Spanish (MX) | ||
| 150 | - TtsModel(model_dir="vits-piper-es_MX-ald-medium", model_name="es_MX-ald-medium.onnx",lang="es",), | ||
| 151 | # fmt: on | 214 | # fmt: on |
| 152 | ] | 215 | ] |
| 153 | 216 | ||
| @@ -162,7 +225,8 @@ def main(): | @@ -162,7 +225,8 @@ def main(): | ||
| 162 | s = f.read() | 225 | s = f.read() |
| 163 | template = environment.from_string(s) | 226 | template = environment.from_string(s) |
| 164 | d = dict() | 227 | d = dict() |
| 165 | - all_model_list = get_all_models() | 228 | + # all_model_list = get_all_models() |
| 229 | + all_model_list = get_piper_models() | ||
| 166 | num_models = len(all_model_list) | 230 | num_models = len(all_model_list) |
| 167 | 231 | ||
| 168 | num_per_runner = num_models // total | 232 | num_per_runner = num_models // total |
| @@ -186,6 +186,7 @@ const SherpaOnnxOfflineTtsVitsModelConfig = StructType({ | @@ -186,6 +186,7 @@ const SherpaOnnxOfflineTtsVitsModelConfig = StructType({ | ||
| 186 | "model" : cstring, | 186 | "model" : cstring, |
| 187 | "lexicon" : cstring, | 187 | "lexicon" : cstring, |
| 188 | "tokens" : cstring, | 188 | "tokens" : cstring, |
| 189 | + "dataDir" : cstring, | ||
| 189 | "noiseScale" : float, | 190 | "noiseScale" : float, |
| 190 | "noiseScaleW" : float, | 191 | "noiseScaleW" : float, |
| 191 | "lengthScale" : float, | 192 | "lengthScale" : float, |
| @@ -201,6 +202,7 @@ const SherpaOnnxOfflineTtsModelConfig = StructType({ | @@ -201,6 +202,7 @@ const SherpaOnnxOfflineTtsModelConfig = StructType({ | ||
| 201 | const SherpaOnnxOfflineTtsConfig = StructType({ | 202 | const SherpaOnnxOfflineTtsConfig = StructType({ |
| 202 | "model" : SherpaOnnxOfflineTtsModelConfig, | 203 | "model" : SherpaOnnxOfflineTtsModelConfig, |
| 203 | "ruleFsts" : cstring, | 204 | "ruleFsts" : cstring, |
| 205 | + "maxNumSentences" : int32_t, | ||
| 204 | }); | 206 | }); |
| 205 | 207 | ||
| 206 | const SherpaOnnxGeneratedAudio = StructType({ | 208 | const SherpaOnnxGeneratedAudio = StructType({ |
| @@ -65,6 +65,9 @@ def get_binaries_to_install(): | @@ -65,6 +65,9 @@ def get_binaries_to_install(): | ||
| 65 | binaries += ["sherpa-onnx-core.dll"] | 65 | binaries += ["sherpa-onnx-core.dll"] |
| 66 | binaries += ["sherpa-onnx-portaudio.dll"] | 66 | binaries += ["sherpa-onnx-portaudio.dll"] |
| 67 | binaries += ["onnxruntime.dll"] | 67 | binaries += ["onnxruntime.dll"] |
| 68 | + binaries += ["piper_phonemize.dll"] | ||
| 69 | + binaries += ["espeak-ng.dll"] | ||
| 70 | + binaries += ["ucd.dll"] | ||
| 68 | binaries += ["kaldi-decoder-core.dll"] | 71 | binaries += ["kaldi-decoder-core.dll"] |
| 69 | binaries += ["sherpa-onnx-fst.lib"] | 72 | binaries += ["sherpa-onnx-fst.lib"] |
| 70 | binaries += ["sherpa-onnx-kaldifst-core.lib"] | 73 | binaries += ["sherpa-onnx-kaldifst-core.lib"] |
| @@ -547,6 +547,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | @@ -547,6 +547,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | ||
| 547 | tts_config.model.vits.lexicon = | 547 | tts_config.model.vits.lexicon = |
| 548 | SHERPA_ONNX_OR(config->model.vits.lexicon, ""); | 548 | SHERPA_ONNX_OR(config->model.vits.lexicon, ""); |
| 549 | tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, ""); | 549 | tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, ""); |
| 550 | + tts_config.model.vits.data_dir = | ||
| 551 | + SHERPA_ONNX_OR(config->model.vits.data_dir, ""); | ||
| 550 | tts_config.model.vits.noise_scale = | 552 | tts_config.model.vits.noise_scale = |
| 551 | SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667); | 553 | SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667); |
| 552 | tts_config.model.vits.noise_scale_w = | 554 | tts_config.model.vits.noise_scale_w = |
| @@ -558,6 +560,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | @@ -558,6 +560,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | ||
| 558 | tts_config.model.debug = config->model.debug; | 560 | tts_config.model.debug = config->model.debug; |
| 559 | tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); | 561 | tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); |
| 560 | tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); | 562 | tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); |
| 563 | + tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2); | ||
| 561 | 564 | ||
| 562 | if (tts_config.model.debug) { | 565 | if (tts_config.model.debug) { |
| 563 | fprintf(stderr, "%s\n", tts_config.ToString().c_str()); | 566 | fprintf(stderr, "%s\n", tts_config.ToString().c_str()); |
| @@ -607,6 +607,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { | @@ -607,6 +607,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { | ||
| 607 | const char *model; | 607 | const char *model; |
| 608 | const char *lexicon; | 608 | const char *lexicon; |
| 609 | const char *tokens; | 609 | const char *tokens; |
| 610 | + const char *data_dir; | ||
| 610 | 611 | ||
| 611 | float noise_scale; | 612 | float noise_scale; |
| 612 | float noise_scale_w; | 613 | float noise_scale_w; |
| @@ -623,6 +624,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { | @@ -623,6 +624,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { | ||
| 623 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { | 624 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { |
| 624 | SherpaOnnxOfflineTtsModelConfig model; | 625 | SherpaOnnxOfflineTtsModelConfig model; |
| 625 | const char *rule_fsts; | 626 | const char *rule_fsts; |
| 627 | + int32_t max_num_sentences; | ||
| 626 | } SherpaOnnxOfflineTtsConfig; | 628 | } SherpaOnnxOfflineTtsConfig; |
| 627 | 629 | ||
| 628 | SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { | 630 | SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { |
| @@ -74,6 +74,7 @@ set(sources | @@ -74,6 +74,7 @@ set(sources | ||
| 74 | packed-sequence.cc | 74 | packed-sequence.cc |
| 75 | pad-sequence.cc | 75 | pad-sequence.cc |
| 76 | parse-options.cc | 76 | parse-options.cc |
| 77 | + piper-phonemize-lexicon.cc | ||
| 77 | provider.cc | 78 | provider.cc |
| 78 | resample.cc | 79 | resample.cc |
| 79 | session.cc | 80 | session.cc |
| @@ -129,8 +129,8 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon, | @@ -129,8 +129,8 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon, | ||
| 129 | } | 129 | } |
| 130 | #endif | 130 | #endif |
| 131 | 131 | ||
| 132 | -std::vector<int64_t> Lexicon::ConvertTextToTokenIds( | ||
| 133 | - const std::string &text) const { | 132 | +std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIds( |
| 133 | + const std::string &text, const std::string & /*voice*/ /*= ""*/) const { | ||
| 134 | switch (language_) { | 134 | switch (language_) { |
| 135 | case Language::kEnglish: | 135 | case Language::kEnglish: |
| 136 | return ConvertTextToTokenIdsEnglish(text); | 136 | return ConvertTextToTokenIdsEnglish(text); |
| @@ -150,7 +150,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( | @@ -150,7 +150,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( | ||
| 150 | return {}; | 150 | return {}; |
| 151 | } | 151 | } |
| 152 | 152 | ||
| 153 | -std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese( | 153 | +std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsChinese( |
| 154 | const std::string &text) const { | 154 | const std::string &text) const { |
| 155 | std::vector<std::string> words; | 155 | std::vector<std::string> words; |
| 156 | if (pattern_) { | 156 | if (pattern_) { |
| @@ -245,10 +245,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese( | @@ -245,10 +245,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese( | ||
| 245 | ans.push_back(eos); | 245 | ans.push_back(eos); |
| 246 | } | 246 | } |
| 247 | 247 | ||
| 248 | - return ans; | 248 | + return {ans}; |
| 249 | } | 249 | } |
| 250 | 250 | ||
| 251 | -std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish( | 251 | +std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsEnglish( |
| 252 | const std::string &_text) const { | 252 | const std::string &_text) const { |
| 253 | std::string text(_text); | 253 | std::string text(_text); |
| 254 | ToLowerCase(&text); | 254 | ToLowerCase(&text); |
| @@ -301,7 +301,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish( | @@ -301,7 +301,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish( | ||
| 301 | ans.push_back(token2id_.at("$")); // eos | 301 | ans.push_back(token2id_.at("$")); // eos |
| 302 | } | 302 | } |
| 303 | 303 | ||
| 304 | - return ans; | 304 | + return {ans}; |
| 305 | } | 305 | } |
| 306 | 306 | ||
| 307 | void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); } | 307 | void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); } |
| @@ -18,11 +18,15 @@ | @@ -18,11 +18,15 @@ | ||
| 18 | #include "android/asset_manager_jni.h" | 18 | #include "android/asset_manager_jni.h" |
| 19 | #endif | 19 | #endif |
| 20 | 20 | ||
| 21 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 22 | + | ||
| 21 | namespace sherpa_onnx { | 23 | namespace sherpa_onnx { |
| 22 | 24 | ||
| 23 | -// TODO(fangjun): Refactor it to an abstract class | ||
| 24 | -class Lexicon { | 25 | +class Lexicon : public OfflineTtsFrontend { |
| 25 | public: | 26 | public: |
| 27 | + Lexicon() = default; // for subclasses | ||
| 28 | + // | ||
| 29 | + // Note: for models from piper, we won't use this class. | ||
| 26 | Lexicon(const std::string &lexicon, const std::string &tokens, | 30 | Lexicon(const std::string &lexicon, const std::string &tokens, |
| 27 | const std::string &punctuations, const std::string &language, | 31 | const std::string &punctuations, const std::string &language, |
| 28 | bool debug = false, bool is_piper = false); | 32 | bool debug = false, bool is_piper = false); |
| @@ -34,28 +38,29 @@ class Lexicon { | @@ -34,28 +38,29 @@ class Lexicon { | ||
| 34 | bool is_piper = false); | 38 | bool is_piper = false); |
| 35 | #endif | 39 | #endif |
| 36 | 40 | ||
| 37 | - std::vector<int64_t> ConvertTextToTokenIds(const std::string &text) const; | 41 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIds( |
| 42 | + const std::string &text, const std::string &voice = "") const override; | ||
| 38 | 43 | ||
| 39 | private: | 44 | private: |
| 40 | - std::vector<int64_t> ConvertTextToTokenIdsGerman( | 45 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsGerman( |
| 41 | const std::string &text) const { | 46 | const std::string &text) const { |
| 42 | return ConvertTextToTokenIdsEnglish(text); | 47 | return ConvertTextToTokenIdsEnglish(text); |
| 43 | } | 48 | } |
| 44 | 49 | ||
| 45 | - std::vector<int64_t> ConvertTextToTokenIdsSpanish( | 50 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsSpanish( |
| 46 | const std::string &text) const { | 51 | const std::string &text) const { |
| 47 | return ConvertTextToTokenIdsEnglish(text); | 52 | return ConvertTextToTokenIdsEnglish(text); |
| 48 | } | 53 | } |
| 49 | 54 | ||
| 50 | - std::vector<int64_t> ConvertTextToTokenIdsFrench( | 55 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsFrench( |
| 51 | const std::string &text) const { | 56 | const std::string &text) const { |
| 52 | return ConvertTextToTokenIdsEnglish(text); | 57 | return ConvertTextToTokenIdsEnglish(text); |
| 53 | } | 58 | } |
| 54 | 59 | ||
| 55 | - std::vector<int64_t> ConvertTextToTokenIdsEnglish( | 60 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsEnglish( |
| 56 | const std::string &text) const; | 61 | const std::string &text) const; |
| 57 | 62 | ||
| 58 | - std::vector<int64_t> ConvertTextToTokenIdsChinese( | 63 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsChinese( |
| 59 | const std::string &text) const; | 64 | const std::string &text) const; |
| 60 | 65 | ||
| 61 | void InitLanguage(const std::string &lang); | 66 | void InitLanguage(const std::string &lang); |
| @@ -43,6 +43,21 @@ | @@ -43,6 +43,21 @@ | ||
| 43 | } \ | 43 | } \ |
| 44 | } while (0) | 44 | } while (0) |
| 45 | 45 | ||
| 46 | +#define SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(dst, src_key, default_value) \ | ||
| 47 | + do { \ | ||
| 48 | + auto value = \ | ||
| 49 | + meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \ | ||
| 50 | + if (!value) { \ | ||
| 51 | + dst = default_value; \ | ||
| 52 | + } else { \ | ||
| 53 | + dst = atoi(value.get()); \ | ||
| 54 | + if (dst < 0) { \ | ||
| 55 | + SHERPA_ONNX_LOGE("Invalid value %d for %s", dst, src_key); \ | ||
| 56 | + exit(-1); \ | ||
| 57 | + } \ | ||
| 58 | + } \ | ||
| 59 | + } while (0) | ||
| 60 | + | ||
| 46 | // read a vector of integers | 61 | // read a vector of integers |
| 47 | #define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key) \ | 62 | #define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key) \ |
| 48 | do { \ | 63 | do { \ |
| @@ -112,4 +127,20 @@ | @@ -112,4 +127,20 @@ | ||
| 112 | } \ | 127 | } \ |
| 113 | } while (0) | 128 | } while (0) |
| 114 | 129 | ||
| 130 | +#define SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(dst, src_key, \ | ||
| 131 | + default_value) \ | ||
| 132 | + do { \ | ||
| 133 | + auto value = \ | ||
| 134 | + meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \ | ||
| 135 | + if (!value) { \ | ||
| 136 | + dst = default_value; \ | ||
| 137 | + } else { \ | ||
| 138 | + dst = value.get(); \ | ||
| 139 | + if (dst.empty()) { \ | ||
| 140 | + SHERPA_ONNX_LOGE("Invalid value for %s\n", src_key); \ | ||
| 141 | + exit(-1); \ | ||
| 142 | + } \ | ||
| 143 | + } \ | ||
| 144 | + } while (0) | ||
| 145 | + | ||
| 115 | #endif // SHERPA_ONNX_CSRC_MACROS_H_ | 146 | #endif // SHERPA_ONNX_CSRC_MACROS_H_ |
sherpa-onnx/csrc/offline-tts-frontend.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-frontend.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ | ||
| 7 | +#include <cstdint> | ||
| 8 | +#include <string> | ||
| 9 | +#include <vector> | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +class OfflineTtsFrontend { | ||
| 14 | + public: | ||
| 15 | + virtual ~OfflineTtsFrontend() = default; | ||
| 16 | + | ||
| 17 | + /** Convert a string to token IDs. | ||
| 18 | + * | ||
| 19 | + * @param text The input text. | ||
| 20 | + * Example 1: "This is the first sample sentence; this is the | ||
| 21 | + * second one." Example 2: "这是第一句。这是第二句。" | ||
| 22 | + * @param voice Optional. It is for espeak-ng. | ||
| 23 | + * | ||
| 24 | + * @return Return a vector-of-vector of token IDs. Each subvector contains | ||
| 25 | + * a sentence that can be processed independently. | ||
| 26 | + * If a frontend does not support splitting the text into sentences, | ||
| 27 | + * the resulting vector contains only one subvector. | ||
| 28 | + */ | ||
| 29 | + virtual std::vector<std::vector<int64_t>> ConvertTextToTokenIds( | ||
| 30 | + const std::string &text, const std::string &voice = "") const = 0; | ||
| 31 | +}; | ||
| 32 | + | ||
| 33 | +} // namespace sherpa_onnx | ||
| 34 | + | ||
| 35 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ |
| @@ -18,9 +18,11 @@ | @@ -18,9 +18,11 @@ | ||
| 18 | #include "kaldifst/csrc/text-normalizer.h" | 18 | #include "kaldifst/csrc/text-normalizer.h" |
| 19 | #include "sherpa-onnx/csrc/lexicon.h" | 19 | #include "sherpa-onnx/csrc/lexicon.h" |
| 20 | #include "sherpa-onnx/csrc/macros.h" | 20 | #include "sherpa-onnx/csrc/macros.h" |
| 21 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 21 | #include "sherpa-onnx/csrc/offline-tts-impl.h" | 22 | #include "sherpa-onnx/csrc/offline-tts-impl.h" |
| 22 | #include "sherpa-onnx/csrc/offline-tts-vits-model.h" | 23 | #include "sherpa-onnx/csrc/offline-tts-vits-model.h" |
| 23 | #include "sherpa-onnx/csrc/onnx-utils.h" | 24 | #include "sherpa-onnx/csrc/onnx-utils.h" |
| 25 | +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" | ||
| 24 | #include "sherpa-onnx/csrc/text-utils.h" | 26 | #include "sherpa-onnx/csrc/text-utils.h" |
| 25 | 27 | ||
| 26 | namespace sherpa_onnx { | 28 | namespace sherpa_onnx { |
| @@ -29,10 +31,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -29,10 +31,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 29 | public: | 31 | public: |
| 30 | explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config) | 32 | explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config) |
| 31 | : config_(config), | 33 | : config_(config), |
| 32 | - model_(std::make_unique<OfflineTtsVitsModel>(config.model)), | ||
| 33 | - lexicon_(config.model.vits.lexicon, config.model.vits.tokens, | ||
| 34 | - model_->Punctuations(), model_->Language(), config.model.debug, | ||
| 35 | - model_->IsPiper()) { | 34 | + model_(std::make_unique<OfflineTtsVitsModel>(config.model)) { |
| 35 | + InitFrontend(); | ||
| 36 | + | ||
| 36 | if (!config.rule_fsts.empty()) { | 37 | if (!config.rule_fsts.empty()) { |
| 37 | std::vector<std::string> files; | 38 | std::vector<std::string> files; |
| 38 | SplitStringToVector(config.rule_fsts, ",", false, &files); | 39 | SplitStringToVector(config.rule_fsts, ",", false, &files); |
| @@ -49,10 +50,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -49,10 +50,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 49 | #if __ANDROID_API__ >= 9 | 50 | #if __ANDROID_API__ >= 9 |
| 50 | OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config) | 51 | OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config) |
| 51 | : config_(config), | 52 | : config_(config), |
| 52 | - model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)), | ||
| 53 | - lexicon_(mgr, config.model.vits.lexicon, config.model.vits.tokens, | ||
| 54 | - model_->Punctuations(), model_->Language(), config.model.debug, | ||
| 55 | - model_->IsPiper()) { | 53 | + model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)) { |
| 54 | + InitFrontend(mgr); | ||
| 55 | + | ||
| 56 | if (!config.rule_fsts.empty()) { | 56 | if (!config.rule_fsts.empty()) { |
| 57 | std::vector<std::string> files; | 57 | std::vector<std::string> files; |
| 58 | SplitStringToVector(config.rule_fsts, ",", false, &files); | 58 | SplitStringToVector(config.rule_fsts, ",", false, &files); |
| @@ -101,20 +101,119 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -101,20 +101,119 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 101 | } | 101 | } |
| 102 | } | 102 | } |
| 103 | 103 | ||
| 104 | - std::vector<int64_t> x = lexicon_.ConvertTextToTokenIds(text); | ||
| 105 | - if (x.empty()) { | 104 | + std::vector<std::vector<int64_t>> x = |
| 105 | + frontend_->ConvertTextToTokenIds(text, model_->Voice()); | ||
| 106 | + | ||
| 107 | + if (x.empty() || (x.size() == 1 && x[0].empty())) { | ||
| 106 | SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str()); | 108 | SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str()); |
| 107 | return {}; | 109 | return {}; |
| 108 | } | 110 | } |
| 109 | 111 | ||
| 110 | - if (model_->AddBlank()) { | ||
| 111 | - std::vector<int64_t> buffer(x.size() * 2 + 1); | ||
| 112 | - int32_t i = 1; | ||
| 113 | - for (auto k : x) { | ||
| 114 | - buffer[i] = k; | ||
| 115 | - i += 2; | 112 | + if (model_->AddBlank() && config_.model.vits.data_dir.empty()) { |
| 113 | + for (auto &k : x) { | ||
| 114 | + k = AddBlank(k); | ||
| 115 | + } | ||
| 116 | + } | ||
| 117 | + | ||
| 118 | + int32_t x_size = static_cast<int32_t>(x.size()); | ||
| 119 | + | ||
| 120 | + if (config_.max_num_sentences <= 0 || x_size <= config_.max_num_sentences) { | ||
| 121 | + return Process(x, sid, speed); | ||
| 122 | + } | ||
| 123 | + | ||
| 124 | + // the input text is too long, we process sentences within it in batches | ||
| 125 | + // to avoid OOM. Batch size is config_.max_num_sentences | ||
| 126 | + std::vector<std::vector<int64_t>> batch; | ||
| 127 | + int32_t batch_size = config_.max_num_sentences; | ||
| 128 | + batch.reserve(config_.max_num_sentences); | ||
| 129 | + int32_t num_batches = x_size / batch_size; | ||
| 130 | + | ||
| 131 | + if (config_.model.debug) { | ||
| 132 | + SHERPA_ONNX_LOGE( | ||
| 133 | + "Text is too long. Split it into %d batches. batch size: %d. Number " | ||
| 134 | + "of sentences: %d", | ||
| 135 | + num_batches, batch_size, x_size); | ||
| 136 | + } | ||
| 137 | + | ||
| 138 | + GeneratedAudio ans; | ||
| 139 | + | ||
| 140 | + int32_t k = 0; | ||
| 141 | + | ||
| 142 | + for (int32_t b = 0; b != num_batches; ++b) { | ||
| 143 | + batch.clear(); | ||
| 144 | + for (int32_t i = 0; i != batch_size; ++i, ++k) { | ||
| 145 | + batch.push_back(std::move(x[k])); | ||
| 116 | } | 146 | } |
| 117 | - x = std::move(buffer); | 147 | + |
| 148 | + auto audio = Process(batch, sid, speed); | ||
| 149 | + ans.sample_rate = audio.sample_rate; | ||
| 150 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 151 | + audio.samples.end()); | ||
| 152 | + } | ||
| 153 | + | ||
| 154 | + batch.clear(); | ||
| 155 | + while (k < x.size()) { | ||
| 156 | + batch.push_back(std::move(x[k])); | ||
| 157 | + ++k; | ||
| 158 | + } | ||
| 159 | + | ||
| 160 | + if (!batch.empty()) { | ||
| 161 | + auto audio = Process(batch, sid, speed); | ||
| 162 | + ans.sample_rate = audio.sample_rate; | ||
| 163 | + ans.samples.insert(ans.samples.end(), audio.samples.begin(), | ||
| 164 | + audio.samples.end()); | ||
| 165 | + } | ||
| 166 | + | ||
| 167 | + return ans; | ||
| 168 | + } | ||
| 169 | + | ||
| 170 | + private: | ||
| 171 | + void InitFrontend(AAssetManager *mgr) { | ||
| 172 | + if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) { | ||
| 173 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 174 | + mgr, config_.model.vits.tokens, config_.model.vits.data_dir); | ||
| 175 | + } else { | ||
| 176 | + frontend_ = std::make_unique<Lexicon>( | ||
| 177 | + mgr, config_.model.vits.lexicon, config_.model.vits.tokens, | ||
| 178 | + model_->Punctuations(), model_->Language(), config_.model.debug, | ||
| 179 | + model_->IsPiper()); | ||
| 180 | + } | ||
| 181 | + } | ||
| 182 | + | ||
| 183 | + void InitFrontend() { | ||
| 184 | + if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) { | ||
| 185 | + frontend_ = std::make_unique<PiperPhonemizeLexicon>( | ||
| 186 | + config_.model.vits.tokens, config_.model.vits.data_dir); | ||
| 187 | + } else { | ||
| 188 | + frontend_ = std::make_unique<Lexicon>( | ||
| 189 | + config_.model.vits.lexicon, config_.model.vits.tokens, | ||
| 190 | + model_->Punctuations(), model_->Language(), config_.model.debug, | ||
| 191 | + model_->IsPiper()); | ||
| 192 | + } | ||
| 193 | + } | ||
| 194 | + | ||
| 195 | + std::vector<int64_t> AddBlank(const std::vector<int64_t> &x) const { | ||
| 196 | + // we assume the blank ID is 0 | ||
| 197 | + std::vector<int64_t> buffer(x.size() * 2 + 1); | ||
| 198 | + int32_t i = 1; | ||
| 199 | + for (auto k : x) { | ||
| 200 | + buffer[i] = k; | ||
| 201 | + i += 2; | ||
| 202 | + } | ||
| 203 | + return buffer; | ||
| 204 | + } | ||
| 205 | + | ||
| 206 | + GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens, | ||
| 207 | + int32_t sid, float speed) const { | ||
| 208 | + int32_t num_tokens = 0; | ||
| 209 | + for (const auto &k : tokens) { | ||
| 210 | + num_tokens += k.size(); | ||
| 211 | + } | ||
| 212 | + | ||
| 213 | + std::vector<int64_t> x; | ||
| 214 | + x.reserve(num_tokens); | ||
| 215 | + for (const auto &k : tokens) { | ||
| 216 | + x.insert(x.end(), k.begin(), k.end()); | ||
| 118 | } | 217 | } |
| 119 | 218 | ||
| 120 | auto memory_info = | 219 | auto memory_info = |
| @@ -147,7 +246,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -147,7 +246,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 147 | OfflineTtsConfig config_; | 246 | OfflineTtsConfig config_; |
| 148 | std::unique_ptr<OfflineTtsVitsModel> model_; | 247 | std::unique_ptr<OfflineTtsVitsModel> model_; |
| 149 | std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_; | 248 | std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_; |
| 150 | - Lexicon lexicon_; | 249 | + std::unique_ptr<OfflineTtsFrontend> frontend_; |
| 151 | }; | 250 | }; |
| 152 | 251 | ||
| 153 | } // namespace sherpa_onnx | 252 | } // namespace sherpa_onnx |
| @@ -13,6 +13,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { | @@ -13,6 +13,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { | ||
| 13 | po->Register("vits-model", &model, "Path to VITS model"); | 13 | po->Register("vits-model", &model, "Path to VITS model"); |
| 14 | po->Register("vits-lexicon", &lexicon, "Path to lexicon.txt for VITS models"); | 14 | po->Register("vits-lexicon", &lexicon, "Path to lexicon.txt for VITS models"); |
| 15 | po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models"); | 15 | po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models"); |
| 16 | + po->Register("vits-data-dir", &data_dir, | ||
| 17 | + "Path to the directory containing dict for espeak-ng. If it is " | ||
| 18 | + "given, --vits-lexicon is ignored."); | ||
| 16 | po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); | 19 | po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); |
| 17 | po->Register("vits-noise-scale-w", &noise_scale_w, | 20 | po->Register("vits-noise-scale-w", &noise_scale_w, |
| 18 | "noise_scale_w for VITS models"); | 21 | "noise_scale_w for VITS models"); |
| @@ -31,16 +34,6 @@ bool OfflineTtsVitsModelConfig::Validate() const { | @@ -31,16 +34,6 @@ bool OfflineTtsVitsModelConfig::Validate() const { | ||
| 31 | return false; | 34 | return false; |
| 32 | } | 35 | } |
| 33 | 36 | ||
| 34 | - if (lexicon.empty()) { | ||
| 35 | - SHERPA_ONNX_LOGE("Please provide --vits-lexicon"); | ||
| 36 | - return false; | ||
| 37 | - } | ||
| 38 | - | ||
| 39 | - if (!FileExists(lexicon)) { | ||
| 40 | - SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str()); | ||
| 41 | - return false; | ||
| 42 | - } | ||
| 43 | - | ||
| 44 | if (tokens.empty()) { | 37 | if (tokens.empty()) { |
| 45 | SHERPA_ONNX_LOGE("Please provide --vits-tokens"); | 38 | SHERPA_ONNX_LOGE("Please provide --vits-tokens"); |
| 46 | return false; | 39 | return false; |
| @@ -51,6 +44,43 @@ bool OfflineTtsVitsModelConfig::Validate() const { | @@ -51,6 +44,43 @@ bool OfflineTtsVitsModelConfig::Validate() const { | ||
| 51 | return false; | 44 | return false; |
| 52 | } | 45 | } |
| 53 | 46 | ||
| 47 | + if (data_dir.empty()) { | ||
| 48 | + if (lexicon.empty()) { | ||
| 49 | + SHERPA_ONNX_LOGE("Please provide --vits-lexicon"); | ||
| 50 | + return false; | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | + if (!FileExists(lexicon)) { | ||
| 54 | + SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str()); | ||
| 55 | + return false; | ||
| 56 | + } | ||
| 57 | + | ||
| 58 | + } else { | ||
| 59 | + if (!FileExists(data_dir + "/phontab")) { | ||
| 60 | + SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test", | ||
| 61 | + data_dir.c_str()); | ||
| 62 | + return false; | ||
| 63 | + } | ||
| 64 | + | ||
| 65 | + if (!FileExists(data_dir + "/phonindex")) { | ||
| 66 | + SHERPA_ONNX_LOGE("%s/phonindex does not exist. Skipping test", | ||
| 67 | + data_dir.c_str()); | ||
| 68 | + return false; | ||
| 69 | + } | ||
| 70 | + | ||
| 71 | + if (!FileExists(data_dir + "/phondata")) { | ||
| 72 | + SHERPA_ONNX_LOGE("%s/phondata does not exist. Skipping test", | ||
| 73 | + data_dir.c_str()); | ||
| 74 | + return false; | ||
| 75 | + } | ||
| 76 | + | ||
| 77 | + if (!FileExists(data_dir + "/intonations")) { | ||
| 78 | + SHERPA_ONNX_LOGE("%s/intonations does not exist. Skipping test", | ||
| 79 | + data_dir.c_str()); | ||
| 80 | + return false; | ||
| 81 | + } | ||
| 82 | + } | ||
| 83 | + | ||
| 54 | return true; | 84 | return true; |
| 55 | } | 85 | } |
| 56 | 86 | ||
| @@ -61,6 +91,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const { | @@ -61,6 +91,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const { | ||
| 61 | os << "model=\"" << model << "\", "; | 91 | os << "model=\"" << model << "\", "; |
| 62 | os << "lexicon=\"" << lexicon << "\", "; | 92 | os << "lexicon=\"" << lexicon << "\", "; |
| 63 | os << "tokens=\"" << tokens << "\", "; | 93 | os << "tokens=\"" << tokens << "\", "; |
| 94 | + os << "data_dir=\"" << data_dir << "\", "; | ||
| 64 | os << "noise_scale=" << noise_scale << ", "; | 95 | os << "noise_scale=" << noise_scale << ", "; |
| 65 | os << "noise_scale_w=" << noise_scale_w << ", "; | 96 | os << "noise_scale_w=" << noise_scale_w << ", "; |
| 66 | os << "length_scale=" << length_scale << ")"; | 97 | os << "length_scale=" << length_scale << ")"; |
| @@ -16,6 +16,10 @@ struct OfflineTtsVitsModelConfig { | @@ -16,6 +16,10 @@ struct OfflineTtsVitsModelConfig { | ||
| 16 | std::string lexicon; | 16 | std::string lexicon; |
| 17 | std::string tokens; | 17 | std::string tokens; |
| 18 | 18 | ||
| 19 | + // If data_dir is given, lexicon is ignored | ||
| 20 | + // data_dir is for piper-phonemize, which uses espeak-ng | ||
| 21 | + std::string data_dir; | ||
| 22 | + | ||
| 19 | float noise_scale = 0.667; | 23 | float noise_scale = 0.667; |
| 20 | float noise_scale_w = 0.8; | 24 | float noise_scale_w = 0.8; |
| 21 | float length_scale = 1; | 25 | float length_scale = 1; |
| @@ -28,11 +32,13 @@ struct OfflineTtsVitsModelConfig { | @@ -28,11 +32,13 @@ struct OfflineTtsVitsModelConfig { | ||
| 28 | OfflineTtsVitsModelConfig(const std::string &model, | 32 | OfflineTtsVitsModelConfig(const std::string &model, |
| 29 | const std::string &lexicon, | 33 | const std::string &lexicon, |
| 30 | const std::string &tokens, | 34 | const std::string &tokens, |
| 35 | + const std::string &data_dir, | ||
| 31 | float noise_scale = 0.667, | 36 | float noise_scale = 0.667, |
| 32 | float noise_scale_w = 0.8, float length_scale = 1) | 37 | float noise_scale_w = 0.8, float length_scale = 1) |
| 33 | : model(model), | 38 | : model(model), |
| 34 | lexicon(lexicon), | 39 | lexicon(lexicon), |
| 35 | tokens(tokens), | 40 | tokens(tokens), |
| 41 | + data_dir(data_dir), | ||
| 36 | noise_scale(noise_scale), | 42 | noise_scale(noise_scale), |
| 37 | noise_scale_w(noise_scale_w), | 43 | noise_scale_w(noise_scale_w), |
| 38 | length_scale(length_scale) {} | 44 | length_scale(length_scale) {} |
| @@ -51,6 +51,7 @@ class OfflineTtsVitsModel::Impl { | @@ -51,6 +51,7 @@ class OfflineTtsVitsModel::Impl { | ||
| 51 | 51 | ||
| 52 | std::string Punctuations() const { return punctuations_; } | 52 | std::string Punctuations() const { return punctuations_; } |
| 53 | std::string Language() const { return language_; } | 53 | std::string Language() const { return language_; } |
| 54 | + std::string Voice() const { return voice_; } | ||
| 54 | bool IsPiper() const { return is_piper_; } | 55 | bool IsPiper() const { return is_piper_; } |
| 55 | int32_t NumSpeakers() const { return num_speakers_; } | 56 | int32_t NumSpeakers() const { return num_speakers_; } |
| 56 | 57 | ||
| @@ -74,10 +75,12 @@ class OfflineTtsVitsModel::Impl { | @@ -74,10 +75,12 @@ class OfflineTtsVitsModel::Impl { | ||
| 74 | 75 | ||
| 75 | Ort::AllocatorWithDefaultOptions allocator; // used in the macro below | 76 | Ort::AllocatorWithDefaultOptions allocator; // used in the macro below |
| 76 | SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate"); | 77 | SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate"); |
| 77 | - SHERPA_ONNX_READ_META_DATA(add_blank_, "add_blank"); | 78 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(add_blank_, "add_blank", 0); |
| 78 | SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers"); | 79 | SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers"); |
| 79 | - SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation"); | 80 | + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(punctuations_, "punctuation", |
| 81 | + ""); | ||
| 80 | SHERPA_ONNX_READ_META_DATA_STR(language_, "language"); | 82 | SHERPA_ONNX_READ_META_DATA_STR(language_, "language"); |
| 83 | + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(voice_, "voice", ""); | ||
| 81 | 84 | ||
| 82 | std::string comment; | 85 | std::string comment; |
| 83 | SHERPA_ONNX_READ_META_DATA_STR(comment, "comment"); | 86 | SHERPA_ONNX_READ_META_DATA_STR(comment, "comment"); |
| @@ -215,6 +218,7 @@ class OfflineTtsVitsModel::Impl { | @@ -215,6 +218,7 @@ class OfflineTtsVitsModel::Impl { | ||
| 215 | int32_t num_speakers_; | 218 | int32_t num_speakers_; |
| 216 | std::string punctuations_; | 219 | std::string punctuations_; |
| 217 | std::string language_; | 220 | std::string language_; |
| 221 | + std::string voice_; | ||
| 218 | 222 | ||
| 219 | bool is_piper_ = false; | 223 | bool is_piper_ = false; |
| 220 | }; | 224 | }; |
| @@ -244,6 +248,7 @@ std::string OfflineTtsVitsModel::Punctuations() const { | @@ -244,6 +248,7 @@ std::string OfflineTtsVitsModel::Punctuations() const { | ||
| 244 | } | 248 | } |
| 245 | 249 | ||
| 246 | std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); } | 250 | std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); } |
| 251 | +std::string OfflineTtsVitsModel::Voice() const { return impl_->Voice(); } | ||
| 247 | 252 | ||
| 248 | bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); } | 253 | bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); } |
| 249 | 254 |
| @@ -46,7 +46,8 @@ class OfflineTtsVitsModel { | @@ -46,7 +46,8 @@ class OfflineTtsVitsModel { | ||
| 46 | bool AddBlank() const; | 46 | bool AddBlank() const; |
| 47 | 47 | ||
| 48 | std::string Punctuations() const; | 48 | std::string Punctuations() const; |
| 49 | - std::string Language() const; | 49 | + std::string Language() const; // e.g., Chinese, English, German, etc. |
| 50 | + std::string Voice() const; // e.g., en-us, for espeak-ng | ||
| 50 | bool IsPiper() const; | 51 | bool IsPiper() const; |
| 51 | int32_t NumSpeakers() const; | 52 | int32_t NumSpeakers() const; |
| 52 | 53 |
| @@ -21,6 +21,12 @@ void OfflineTtsConfig::Register(ParseOptions *po) { | @@ -21,6 +21,12 @@ void OfflineTtsConfig::Register(ParseOptions *po) { | ||
| 21 | "Multiple filenames are separated by a comma and they are " | 21 | "Multiple filenames are separated by a comma and they are " |
| 22 | "applied from left to right. An example value: " | 22 | "applied from left to right. An example value: " |
| 23 | "rule1.fst,rule2,fst,rule3.fst"); | 23 | "rule1.fst,rule2,fst,rule3.fst"); |
| 24 | + | ||
| 25 | + po->Register( | ||
| 26 | + "tts-max-num-sentences", &max_num_sentences, | ||
| 27 | + "Maximum number of sentences that we process at a time. " | ||
| 28 | + "This is to avoid OOM for very long input text. " | ||
| 29 | + "If you set it to -1, then we process all sentences in a single batch."); | ||
| 24 | } | 30 | } |
| 25 | 31 | ||
| 26 | bool OfflineTtsConfig::Validate() const { | 32 | bool OfflineTtsConfig::Validate() const { |
| @@ -43,7 +49,8 @@ std::string OfflineTtsConfig::ToString() const { | @@ -43,7 +49,8 @@ std::string OfflineTtsConfig::ToString() const { | ||
| 43 | 49 | ||
| 44 | os << "OfflineTtsConfig("; | 50 | os << "OfflineTtsConfig("; |
| 45 | os << "model=" << model.ToString() << ", "; | 51 | os << "model=" << model.ToString() << ", "; |
| 46 | - os << "rule_fsts=\"" << rule_fsts << "\")"; | 52 | + os << "rule_fsts=\"" << rule_fsts << "\", "; |
| 53 | + os << "max_num_sentences=" << max_num_sentences << ")"; | ||
| 47 | 54 | ||
| 48 | return os.str(); | 55 | return os.str(); |
| 49 | } | 56 | } |
| @@ -28,10 +28,17 @@ struct OfflineTtsConfig { | @@ -28,10 +28,17 @@ struct OfflineTtsConfig { | ||
| 28 | // If there are multiple rules, they are applied from left to right. | 28 | // If there are multiple rules, they are applied from left to right. |
| 29 | std::string rule_fsts; | 29 | std::string rule_fsts; |
| 30 | 30 | ||
| 31 | + // Maximum number of sentences that we process at a time. | ||
| 32 | + // This is to avoid OOM for very long input text. | ||
| 33 | + // If you set it to -1, then we process all sentences in a single batch. | ||
| 34 | + int32_t max_num_sentences = 2; | ||
| 35 | + | ||
| 31 | OfflineTtsConfig() = default; | 36 | OfflineTtsConfig() = default; |
| 32 | OfflineTtsConfig(const OfflineTtsModelConfig &model, | 37 | OfflineTtsConfig(const OfflineTtsModelConfig &model, |
| 33 | - const std::string &rule_fsts) | ||
| 34 | - : model(model), rule_fsts(rule_fsts) {} | 38 | + const std::string &rule_fsts, int32_t max_num_sentences) |
| 39 | + : model(model), | ||
| 40 | + rule_fsts(rule_fsts), | ||
| 41 | + max_num_sentences(max_num_sentences) {} | ||
| 35 | 42 | ||
| 36 | void Register(ParseOptions *po); | 43 | void Register(ParseOptions *po); |
| 37 | bool Validate() const; | 44 | bool Validate() const; |
sherpa-onnx/csrc/piper-phonemize-lexicon.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/piper-phonemize-lexicon.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" | ||
| 6 | + | ||
| 7 | +#include <codecvt> | ||
| 8 | +#include <fstream> | ||
| 9 | +#include <locale> | ||
| 10 | +#include <map> | ||
| 11 | +#include <mutex> // NOLINT | ||
| 12 | +#include <sstream> | ||
| 13 | +#include <string> | ||
| 14 | +#include <utility> | ||
| 15 | +#include <vector> | ||
| 16 | + | ||
| 17 | +#if __ANDROID_API__ >= 9 | ||
| 18 | +#include <strstream> | ||
| 19 | + | ||
| 20 | +#include "android/asset_manager.h" | ||
| 21 | +#include "android/asset_manager_jni.h" | ||
| 22 | +#endif | ||
| 23 | + | ||
| 24 | +#include "espeak-ng/speak_lib.h" | ||
| 25 | +#include "phoneme_ids.hpp" | ||
| 26 | +#include "phonemize.hpp" | ||
| 27 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 28 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 29 | + | ||
| 30 | +namespace sherpa_onnx { | ||
| 31 | + | ||
| 32 | +static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) { | ||
| 33 | + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; | ||
| 34 | + std::unordered_map<char32_t, int32_t> token2id; | ||
| 35 | + | ||
| 36 | + std::string line; | ||
| 37 | + | ||
| 38 | + std::string sym; | ||
| 39 | + std::u32string s; | ||
| 40 | + int32_t id; | ||
| 41 | + while (std::getline(is, line)) { | ||
| 42 | + std::istringstream iss(line); | ||
| 43 | + iss >> sym; | ||
| 44 | + if (iss.eof()) { | ||
| 45 | + id = atoi(sym.c_str()); | ||
| 46 | + sym = " "; | ||
| 47 | + } else { | ||
| 48 | + iss >> id; | ||
| 49 | + } | ||
| 50 | + | ||
| 51 | + // eat the trailing \r\n on windows | ||
| 52 | + iss >> std::ws; | ||
| 53 | + if (!iss.eof()) { | ||
| 54 | + SHERPA_ONNX_LOGE("Error when reading tokens: %s", line.c_str()); | ||
| 55 | + exit(-1); | ||
| 56 | + } | ||
| 57 | + | ||
| 58 | + s = conv.from_bytes(sym); | ||
| 59 | + if (s.size() != 1) { | ||
| 60 | + SHERPA_ONNX_LOGE("Error when reading tokens at Line %s. size: %d", | ||
| 61 | + line.c_str(), static_cast<int32_t>(s.size())); | ||
| 62 | + exit(-1); | ||
| 63 | + } | ||
| 64 | + char32_t c = s[0]; | ||
| 65 | + | ||
| 66 | + if (token2id.count(c)) { | ||
| 67 | + SHERPA_ONNX_LOGE("Duplicated token %s. Line %s. Existing ID: %d", | ||
| 68 | + sym.c_str(), line.c_str(), token2id.at(c)); | ||
| 69 | + exit(-1); | ||
| 70 | + } | ||
| 71 | + | ||
| 72 | + token2id.insert({c, id}); | ||
| 73 | + } | ||
| 74 | + | ||
| 75 | + return token2id; | ||
| 76 | +} | ||
| 77 | + | ||
| 78 | +// see the function "phonemes_to_ids" from | ||
| 79 | +// https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb | ||
| 80 | +static std::vector<int64_t> PhonemesToIds( | ||
| 81 | + const std::unordered_map<char32_t, int32_t> &token2id, | ||
| 82 | + const std::vector<piper::Phoneme> &phonemes) { | ||
| 83 | + // see | ||
| 84 | + // https://github.com/rhasspy/piper-phonemize/blob/master/src/phoneme_ids.hpp#L17 | ||
| 85 | + int32_t pad = token2id.at(U'_'); | ||
| 86 | + int32_t bos = token2id.at(U'^'); | ||
| 87 | + int32_t eos = token2id.at(U'$'); | ||
| 88 | + | ||
| 89 | + std::vector<int64_t> ans; | ||
| 90 | + ans.reserve(phonemes.size()); | ||
| 91 | + | ||
| 92 | + ans.push_back(bos); | ||
| 93 | + for (auto p : phonemes) { | ||
| 94 | + if (token2id.count(p)) { | ||
| 95 | + ans.push_back(token2id.at(p)); | ||
| 96 | + ans.push_back(pad); | ||
| 97 | + } else { | ||
| 98 | +      SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.", p); | ||
| 99 | + } | ||
| 100 | + } | ||
| 101 | + ans.push_back(eos); | ||
| 102 | + | ||
| 103 | + return ans; | ||
| 104 | +} | ||
| 105 | + | ||
| 106 | +void InitEspeak(const std::string &data_dir) { | ||
| 107 | + static std::once_flag init_flag; | ||
| 108 | + std::call_once(init_flag, [data_dir]() { | ||
| 109 | + int32_t result = | ||
| 110 | + espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, data_dir.c_str(), 0); | ||
| 111 | + if (result != 22050) { | ||
| 112 | + SHERPA_ONNX_LOGE( | ||
| 113 | + "Failed to initialize espeak-ng with data dir: %s. Return code is: " | ||
| 114 | + "%d", | ||
| 115 | + data_dir.c_str(), result); | ||
| 116 | + exit(-1); | ||
| 117 | + } | ||
| 118 | + }); | ||
| 119 | +} | ||
| 120 | + | ||
| 121 | +PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &tokens, | ||
| 122 | + const std::string &data_dir) | ||
| 123 | + : data_dir_(data_dir) { | ||
| 124 | + { | ||
| 125 | + std::ifstream is(tokens); | ||
| 126 | + token2id_ = ReadTokens(is); | ||
| 127 | + } | ||
| 128 | + | ||
| 129 | + InitEspeak(data_dir_); | ||
| 130 | +} | ||
| 131 | + | ||
| 132 | +#if __ANDROID_API__ >= 9 | ||
| 133 | +PiperPhonemizeLexicon::PiperPhonemizeLexicon(AAssetManager *mgr, | ||
| 134 | + const std::string &tokens, | ||
| 135 | + const std::string &data_dir) { | ||
| 136 | + { | ||
| 137 | + auto buf = ReadFile(mgr, tokens); | ||
| 138 | + std::istrstream is(buf.data(), buf.size()); | ||
| 139 | + token2id_ = ReadTokens(is); | ||
| 140 | + } | ||
| 141 | + | ||
| 142 | + // We should copy the directory of espeak-ng-data from the asset to | ||
| 143 | + // some internal or external storage and then pass the directory to data_dir. | ||
| 144 | + InitEspeak(data_dir_); | ||
| 145 | +} | ||
| 146 | +#endif | ||
| 147 | + | ||
| 148 | +std::vector<std::vector<int64_t>> PiperPhonemizeLexicon::ConvertTextToTokenIds( | ||
| 149 | + const std::string &text, const std::string &voice /*= ""*/) const { | ||
| 150 | + piper::eSpeakPhonemeConfig config; | ||
| 151 | + | ||
| 152 | + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices | ||
| 153 | + // to list available voices | ||
| 154 | + config.voice = voice; // e.g., voice is en-us | ||
| 155 | + | ||
| 156 | + std::vector<std::vector<piper::Phoneme>> phonemes; | ||
| 157 | + piper::phonemize_eSpeak(text, config, phonemes); | ||
| 158 | + | ||
| 159 | + std::vector<std::vector<int64_t>> ans; | ||
| 160 | + | ||
| 161 | + std::vector<int64_t> phoneme_ids; | ||
| 162 | + for (const auto &p : phonemes) { | ||
| 163 | + phoneme_ids = PhonemesToIds(token2id_, p); | ||
| 164 | + ans.push_back(std::move(phoneme_ids)); | ||
| 165 | + } | ||
| 166 | + | ||
| 167 | + return ans; | ||
| 168 | +} | ||
| 169 | + | ||
| 170 | +} // namespace sherpa_onnx |
sherpa-onnx/csrc/piper-phonemize-lexicon.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/piper-phonemize-lexicon.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ | ||
| 7 | + | ||
| 8 | +#include <string> | ||
| 9 | +#include <unordered_map> | ||
| 10 | +#include <vector> | ||
| 11 | + | ||
| 12 | +#if __ANDROID_API__ >= 9 | ||
| 13 | +#include "android/asset_manager.h" | ||
| 14 | +#include "android/asset_manager_jni.h" | ||
| 15 | +#endif | ||
| 16 | + | ||
| 17 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 18 | + | ||
| 19 | +namespace sherpa_onnx { | ||
| 20 | + | ||
| 21 | +class PiperPhonemizeLexicon : public OfflineTtsFrontend { | ||
| 22 | + public: | ||
| 23 | + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir); | ||
| 24 | + | ||
| 25 | +#if __ANDROID_API__ >= 9 | ||
| 26 | + PiperPhonemizeLexicon(AAssetManager *mgr, const std::string &tokens, | ||
| 27 | + const std::string &data_dir); | ||
| 28 | +#endif | ||
| 29 | + | ||
| 30 | + std::vector<std::vector<int64_t>> ConvertTextToTokenIds( | ||
| 31 | + const std::string &text, const std::string &voice = "") const override; | ||
| 32 | + | ||
| 33 | + private: | ||
| 34 | + std::string data_dir_; | ||
| 35 | + // map unicode codepoint to an integer ID | ||
| 36 | + std::unordered_map<char32_t, int32_t> token2id_; | ||
| 37 | +}; | ||
| 38 | + | ||
| 39 | +} // namespace sherpa_onnx | ||
| 40 | + | ||
| 41 | +#endif // SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ |
| @@ -48,7 +48,7 @@ TEST(PiperPhonemize, Case1) { | @@ -48,7 +48,7 @@ TEST(PiperPhonemize, Case1) { | ||
| 48 | 48 | ||
| 49 | piper::eSpeakPhonemeConfig config; | 49 | piper::eSpeakPhonemeConfig config; |
| 50 | 50 | ||
| 51 | - // ./bin/espeak-ng --path ./install/share/espeak-ng-data/ --voices | 51 | + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices |
| 52 | // to list available voices | 52 | // to list available voices |
| 53 | config.voice = "en-us"; | 53 | config.voice = "en-us"; |
| 54 | 54 | ||
| @@ -61,15 +61,15 @@ TEST(PiperPhonemize, Case1) { | @@ -61,15 +61,15 @@ TEST(PiperPhonemize, Case1) { | ||
| 61 | } | 61 | } |
| 62 | std::cout << "\n"; | 62 | std::cout << "\n"; |
| 63 | 63 | ||
| 64 | - std::vector<piper::PhonemeId> phonemeIds; | ||
| 65 | - std::map<piper::Phoneme, std::size_t> missingPhonemes; | 64 | + std::vector<piper::PhonemeId> phoneme_ids; |
| 65 | + std::map<piper::Phoneme, std::size_t> missing_phonemes; | ||
| 66 | 66 | ||
| 67 | { | 67 | { |
| 68 | piper::PhonemeIdConfig config; | 68 | piper::PhonemeIdConfig config; |
| 69 | - phonemes_to_ids(phonemes[0], config, phonemeIds, missingPhonemes); | 69 | + phonemes_to_ids(phonemes[0], config, phoneme_ids, missing_phonemes); |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | - for (int32_t p : phonemeIds) { | 72 | + for (int32_t p : phoneme_ids) { |
| 73 | std::cout << p << " "; | 73 | std::cout << p << " "; |
| 74 | } | 74 | } |
| 75 | std::cout << "\n"; | 75 | std::cout << "\n"; |
| @@ -545,6 +545,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | @@ -545,6 +545,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | ||
| 545 | ans.model.vits.tokens = p; | 545 | ans.model.vits.tokens = p; |
| 546 | env->ReleaseStringUTFChars(s, p); | 546 | env->ReleaseStringUTFChars(s, p); |
| 547 | 547 | ||
| 548 | + fid = env->GetFieldID(vits_cls, "dataDir", "Ljava/lang/String;"); | ||
| 549 | + s = (jstring)env->GetObjectField(vits, fid); | ||
| 550 | + p = env->GetStringUTFChars(s, nullptr); | ||
| 551 | + ans.model.vits.data_dir = p; | ||
| 552 | + env->ReleaseStringUTFChars(s, p); | ||
| 553 | + | ||
| 548 | fid = env->GetFieldID(vits_cls, "noiseScale", "F"); | 554 | fid = env->GetFieldID(vits_cls, "noiseScale", "F"); |
| 549 | ans.model.vits.noise_scale = env->GetFloatField(vits, fid); | 555 | ans.model.vits.noise_scale = env->GetFloatField(vits, fid); |
| 550 | 556 | ||
| @@ -573,6 +579,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | @@ -573,6 +579,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { | ||
| 573 | ans.rule_fsts = p; | 579 | ans.rule_fsts = p; |
| 574 | env->ReleaseStringUTFChars(s, p); | 580 | env->ReleaseStringUTFChars(s, p); |
| 575 | 581 | ||
| 582 | + fid = env->GetFieldID(cls, "maxNumSentences", "I"); | ||
| 583 | + ans.max_num_sentences = env->GetIntField(config, fid); | ||
| 584 | + | ||
| 576 | return ans; | 585 | return ans; |
| 577 | } | 586 | } |
| 578 | 587 | ||
| @@ -589,6 +598,11 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new( | @@ -589,6 +598,11 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new( | ||
| 589 | #endif | 598 | #endif |
| 590 | auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config); | 599 | auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config); |
| 591 | SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); | 600 | SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); |
| 601 | + | ||
| 602 | + if (!config.Validate()) { | ||
| 603 | +  if (!config.Validate()) { | ||
| 604 | +    SHERPA_ONNX_LOGE("Errors found in config!"); | ||
| 605 | + | ||
| 592 | auto tts = new sherpa_onnx::SherpaOnnxOfflineTts( | 606 | auto tts = new sherpa_onnx::SherpaOnnxOfflineTts( |
| 593 | #if __ANDROID_API__ >= 9 | 607 | #if __ANDROID_API__ >= 9 |
| 594 | mgr, | 608 | mgr, |
| @@ -16,17 +16,20 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { | @@ -16,17 +16,20 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { | ||
| 16 | py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig") | 16 | py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig") |
| 17 | .def(py::init<>()) | 17 | .def(py::init<>()) |
| 18 | .def(py::init<const std::string &, const std::string &, | 18 | .def(py::init<const std::string &, const std::string &, |
| 19 | - const std::string &, float, float, float>(), | 19 | + const std::string &, const std::string, float, float, |
| 20 | + float>(), | ||
| 20 | py::arg("model"), py::arg("lexicon"), py::arg("tokens"), | 21 | py::arg("model"), py::arg("lexicon"), py::arg("tokens"), |
| 21 | - py::arg("noise_scale") = 0.667, py::arg("noise_scale_w") = 0.8, | ||
| 22 | - py::arg("length_scale") = 1.0) | 22 | + py::arg("data_dir") = "", py::arg("noise_scale") = 0.667, |
| 23 | + py::arg("noise_scale_w") = 0.8, py::arg("length_scale") = 1.0) | ||
| 23 | .def_readwrite("model", &PyClass::model) | 24 | .def_readwrite("model", &PyClass::model) |
| 24 | .def_readwrite("lexicon", &PyClass::lexicon) | 25 | .def_readwrite("lexicon", &PyClass::lexicon) |
| 25 | .def_readwrite("tokens", &PyClass::tokens) | 26 | .def_readwrite("tokens", &PyClass::tokens) |
| 27 | + .def_readwrite("data_dir", &PyClass::data_dir) | ||
| 26 | .def_readwrite("noise_scale", &PyClass::noise_scale) | 28 | .def_readwrite("noise_scale", &PyClass::noise_scale) |
| 27 | .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) | 29 | .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) |
| 28 | .def_readwrite("length_scale", &PyClass::length_scale) | 30 | .def_readwrite("length_scale", &PyClass::length_scale) |
| 29 | - .def("__str__", &PyClass::ToString); | 31 | + .def("__str__", &PyClass::ToString) |
| 32 | + .def("validate", &PyClass::Validate); | ||
| 30 | } | 33 | } |
| 31 | 34 | ||
| 32 | } // namespace sherpa_onnx | 35 | } // namespace sherpa_onnx |
| @@ -30,10 +30,14 @@ static void PybindOfflineTtsConfig(py::module *m) { | @@ -30,10 +30,14 @@ static void PybindOfflineTtsConfig(py::module *m) { | ||
| 30 | using PyClass = OfflineTtsConfig; | 30 | using PyClass = OfflineTtsConfig; |
| 31 | py::class_<PyClass>(*m, "OfflineTtsConfig") | 31 | py::class_<PyClass>(*m, "OfflineTtsConfig") |
| 32 | .def(py::init<>()) | 32 | .def(py::init<>()) |
| 33 | - .def(py::init<const OfflineTtsModelConfig &, const std::string &>(), | ||
| 34 | - py::arg("model"), py::arg("rule_fsts") = "") | 33 | + .def(py::init<const OfflineTtsModelConfig &, const std::string &, |
| 34 | + int32_t>(), | ||
| 35 | + py::arg("model"), py::arg("rule_fsts") = "", | ||
| 36 | + py::arg("max_num_sentences") = 2) | ||
| 35 | .def_readwrite("model", &PyClass::model) | 37 | .def_readwrite("model", &PyClass::model) |
| 36 | .def_readwrite("rule_fsts", &PyClass::rule_fsts) | 38 | .def_readwrite("rule_fsts", &PyClass::rule_fsts) |
| 39 | + .def_readwrite("max_num_sentences", &PyClass::max_num_sentences) | ||
| 40 | + .def("validate", &PyClass::Validate) | ||
| 37 | .def("__str__", &PyClass::ToString); | 41 | .def("__str__", &PyClass::ToString); |
| 38 | } | 42 | } |
| 39 | 43 |
| @@ -578,6 +578,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | @@ -578,6 +578,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | ||
| 578 | model: String, | 578 | model: String, |
| 579 | lexicon: String, | 579 | lexicon: String, |
| 580 | tokens: String, | 580 | tokens: String, |
| 581 | + dataDir: String = "", | ||
| 581 | noiseScale: Float = 0.667, | 582 | noiseScale: Float = 0.667, |
| 582 | noiseScaleW: Float = 0.8, | 583 | noiseScaleW: Float = 0.8, |
| 583 | lengthScale: Float = 1.0 | 584 | lengthScale: Float = 1.0 |
| @@ -586,6 +587,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | @@ -586,6 +587,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( | ||
| 586 | model: toCPointer(model), | 587 | model: toCPointer(model), |
| 587 | lexicon: toCPointer(lexicon), | 588 | lexicon: toCPointer(lexicon), |
| 588 | tokens: toCPointer(tokens), | 589 | tokens: toCPointer(tokens), |
| 590 | + data_dir: toCPointer(dataDir), | ||
| 589 | noise_scale: noiseScale, | 591 | noise_scale: noiseScale, |
| 590 | noise_scale_w: noiseScaleW, | 592 | noise_scale_w: noiseScaleW, |
| 591 | length_scale: lengthScale) | 593 | length_scale: lengthScale) |
| @@ -607,11 +609,13 @@ func sherpaOnnxOfflineTtsModelConfig( | @@ -607,11 +609,13 @@ func sherpaOnnxOfflineTtsModelConfig( | ||
| 607 | 609 | ||
| 608 | func sherpaOnnxOfflineTtsConfig( | 610 | func sherpaOnnxOfflineTtsConfig( |
| 609 | model: SherpaOnnxOfflineTtsModelConfig, | 611 | model: SherpaOnnxOfflineTtsModelConfig, |
| 610 | - ruleFsts: String = "" | 612 | + ruleFsts: String = "", |
| 613 | +  maxNumSentences: Int = 2 | ||
| 611 | ) -> SherpaOnnxOfflineTtsConfig { | 614 | ) -> SherpaOnnxOfflineTtsConfig { |
| 612 | return SherpaOnnxOfflineTtsConfig( | 615 | return SherpaOnnxOfflineTtsConfig( |
| 613 | model: model, | 616 | model: model, |
| 614 | - rule_fsts: toCPointer(ruleFsts) | 617 | + rule_fsts: toCPointer(ruleFsts), |
| 618 | +    max_num_sentences: Int32(maxNumSentences) | ||
| 615 | ) | 619 | ) |
| 616 | } | 620 | } |
| 617 | 621 |
| @@ -7,17 +7,12 @@ if [ ! -d ../build-swift-macos ]; then | @@ -7,17 +7,12 @@ if [ ! -d ../build-swift-macos ]; then | ||
| 7 | exit 1 | 7 | exit 1 |
| 8 | fi | 8 | fi |
| 9 | 9 | ||
| 10 | -if [ ! -d ./vits-vctk ]; then | ||
| 11 | - echo "Please download the pre-trained model for testing." | ||
| 12 | - echo "You can refer to" | ||
| 13 | - echo "" | ||
| 14 | - echo "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers" | ||
| 15 | - echo "" | ||
| 16 | - echo "for help" | 10 | +if [ ! -d ./vits-piper-en_US-amy-low ]; then |
| 11 | + echo "Download a pre-trained model for testing." | ||
| 17 | 12 | ||
| 18 | - wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 | ||
| 19 | - tar xvf vits-vctk.tar.bz2 | ||
| 20 | - rm vits-vctk.tar.bz2 | 13 | + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 |
| 14 | + tar xf vits-piper-en_US-amy-low.tar.bz2 | ||
| 15 | + rm vits-piper-en_US-amy-low.tar.bz2 | ||
| 21 | fi | 16 | fi |
| 22 | 17 | ||
| 23 | if [ ! -e ./tts ]; then | 18 | if [ ! -e ./tts ]; then |
| 1 | func run() { | 1 | func run() { |
| 2 | - let model = "./vits-vctk/vits-vctk.onnx" | ||
| 3 | - let lexicon = "./vits-vctk/lexicon.txt" | ||
| 4 | - let tokens = "./vits-vctk/tokens.txt" | 2 | + let model = "./vits-piper-en_US-amy-low/en_US-amy-low.onnx" |
| 3 | + let tokens = "./vits-piper-en_US-amy-low/tokens.txt" | ||
| 4 | + let dataDir = "./vits-piper-en_US-amy-low/espeak-ng-data" | ||
| 5 | let vits = sherpaOnnxOfflineTtsVitsModelConfig( | 5 | let vits = sherpaOnnxOfflineTtsVitsModelConfig( |
| 6 | model: model, | 6 | model: model, |
| 7 | - lexicon: lexicon, | ||
| 8 | - tokens: tokens | 7 | + lexicon: "", |
| 8 | + tokens: tokens, | ||
| 9 | + dataDir: dataDir | ||
| 9 | ) | 10 | ) |
| 10 | let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) | 11 | let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) |
| 11 | var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig) | 12 | var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig) |
| 12 | 13 | ||
| 13 | let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig) | 14 | let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig) |
| 14 | 15 | ||
| 15 | - let text = "How are you doing? Fantastic!" | 16 | + let text = |
| 17 | + "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”" | ||
| 16 | let sid = 99 | 18 | let sid = 99 |
| 17 | let speed: Float = 1.0 | 19 | let speed: Float = 1.0 |
| 18 | 20 |
-
请注册或登录后发表评论