Fangjun Kuang
Committed by GitHub

Use piper-phonemize to convert text to token IDs (#453)

正在显示 55 个修改的文件 包含 1048 行增加192 行删除
@@ -52,14 +52,13 @@ node ./test-online-transducer.js @@ -52,14 +52,13 @@ node ./test-online-transducer.js
52 rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 52 rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
53 53
54 # offline tts 54 # offline tts
55 -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2  
56 -tar xvf vits-vctk.tar.bz2  
57 -rm vits-vctk.tar.bz2 55 +
  56 +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  57 +tar xf vits-piper-en_US-amy-low.tar.bz2
58 node ./test-offline-tts-en.js 58 node ./test-offline-tts-en.js
59 -rm -rf vits-vctk 59 +rm vits-piper-en_US-amy-low.tar.bz2
60 60
61 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 61 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
62 tar xvf vits-zh-aishell3.tar.bz2 62 tar xvf vits-zh-aishell3.tar.bz2
63 -rm vits-zh-aishell3.tar.bz2  
64 node ./test-offline-tts-zh.js 63 node ./test-offline-tts-zh.js
65 -rm -rf vits-zh-aishell3 64 +rm vits-zh-aishell3.tar.bz2
@@ -17,6 +17,24 @@ which $EXE @@ -17,6 +17,24 @@ which $EXE
17 mkdir ./tts 17 mkdir ./tts
18 18
19 log "------------------------------------------------------------" 19 log "------------------------------------------------------------"
  20 +log "vits-piper-en_US-amy-low"
  21 +log "------------------------------------------------------------"
  22 +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  23 +tar xf vits-piper-en_US-amy-low.tar.bz2
  24 +rm vits-piper-en_US-amy-low.tar.bz2
  25 +
  26 +$EXE \
  27 + --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
  28 + --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
  29 + --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
  30 + --debug=1 \
  31 + --output-filename=./tts/amy.wav \
  32 + "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.” The sun shone bleakly in the sky, its meager light struggling to penetrate the thick foliage of the forest. Birds sang their songs up in the crowns of the trees, fluttering from one branch to the other. A blanket of total tranquility lied over the forest. The peace was only broken by the steady gallop of the horses of the soldiers who were traveling to their upcoming knighting the morrow at Camelot, and rowdy conversation. “Finally we will get what we deserve,” “It’s been about time,” Perceval agreed. “We’ve been risking our arses for the past two years. It’s the least they could give us.” Merlin remained ostensibly silent, refusing to join the verbal parade of self-aggrandizing his fellow soldiers have engaged in. He found it difficult to be happy about anything, when even if they had won the war, he had lost everything else in the process."
  33 +
  34 +file ./tts/amy.wav
  35 +rm -rf vits-piper-en_US-amy-low
  36 +
  37 +log "------------------------------------------------------------"
20 log "vits-ljs test" 38 log "vits-ljs test"
21 log "------------------------------------------------------------" 39 log "------------------------------------------------------------"
22 40
@@ -26,8 +26,8 @@ jobs: @@ -26,8 +26,8 @@ jobs:
26 fail-fast: false 26 fail-fast: false
27 matrix: 27 matrix:
28 os: [ubuntu-latest] 28 os: [ubuntu-latest]
29 - total: ["12"]  
30 - index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"] 29 + total: ["30"]
  30 + index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29"]
31 31
32 steps: 32 steps:
33 - uses: actions/checkout@v4 33 - uses: actions/checkout@v4
  1 +name: test-build-wheel
  2 +
  3 +on:
  4 + push:
  5 + branches:
  6 + - master
  7 +
  8 + pull_request:
  9 +
  10 + workflow_dispatch:
  11 +
  12 +concurrency:
  13 + group: test-build-wheel-${{ github.ref }}
  14 + cancel-in-progress: true
  15 +
  16 +jobs:
  17 + test-build-wheel:
  18 + name: ${{ matrix.os }} ${{ matrix.python_version }}
  19 + runs-on: ${{ matrix.os }}
  20 + strategy:
  21 + fail-fast: false
  22 + matrix:
  23 + os: [ubuntu-latest, macos-latest, windows-latest]
  24 + python-version: ["3.8", "3.9", "3.10", "3.11"]
  25 +
  26 + steps:
  27 + - uses: actions/checkout@v4
  28 + with:
  29 + fetch-depth: 0
  30 +
  31 + - name: Setup Python ${{ matrix.python-version }}
  32 + uses: actions/setup-python@v2
  33 + with:
  34 + python-version: ${{ matrix.python-version }}
  35 +
  36 + - name: ccache
  37 + uses: hendrikmuhs/ccache-action@v1.2
  38 + with:
  39 + key: ${{ matrix.os }}-${{ matrix.python_version }}
  40 +
  41 + - name: Install python dependencies
  42 + shell: bash
  43 + run: |
  44 + python3 -m pip install --upgrade pip
  45 + python3 -m pip install wheel twine setuptools
  46 +
  47 + - name: Build
  48 + shell: bash
  49 + run: |
  50 + export CMAKE_CXX_COMPILER_LAUNCHER=ccache
  51 + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
  52 + cmake --version
  53 +
  54 + export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j"
  55 +
  56 + python3 setup.py bdist_wheel
  57 + ls -lh dist
  58 +
  59 + - name: Display wheel
  60 + shell: bash
  61 + run: |
  62 + ls -lh dist
  63 +
  64 + - name: Install wheel
  65 + shell: bash
  66 + run: |
  67 + pip install --verbose ./dist/*.whl
  68 +
  69 + - name: Test
  70 + shell: bash
  71 + run: |
  72 + # For windows
  73 + export PATH=/c/hostedtoolcache/windows/Python/3.7.9/x64/bin:$PATH
  74 + export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH
  75 + export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
  76 + export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
  77 + export PATH=/c/hostedtoolcache/windows/Python/3.11.6/x64/bin:$PATH
  78 +
  79 + which sherpa-onnx
  80 + sherpa-onnx --help
@@ -70,6 +70,10 @@ jobs: @@ -70,6 +70,10 @@ jobs:
70 mkdir -p scripts/nodejs/lib/win-x64 70 mkdir -p scripts/nodejs/lib/win-x64
71 dst=scripts/nodejs/lib/win-x64 71 dst=scripts/nodejs/lib/win-x64
72 fi 72 fi
  73 + ls -lh build/install/lib/
  74 +
  75 + rm -rf build/install/lib/pkgconfig
  76 +
73 cp -v build/install/lib/* $dst/ 77 cp -v build/install/lib/* $dst/
74 78
75 - name: replace files 79 - name: replace files
@@ -77,3 +77,6 @@ xcuserdata/ @@ -77,3 +77,6 @@ xcuserdata/
77 vits-vctk 77 vits-vctk
78 vits-zh-aishell3 78 vits-zh-aishell3
79 jslint.mjs 79 jslint.mjs
  80 +vits-piper-en_US-amy-low
  81 +vits-piper-*-*-*
  82 +log
@@ -2,6 +2,8 @@ @@ -2,6 +2,8 @@
2 <manifest xmlns:android="http://schemas.android.com/apk/res/android" 2 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
3 xmlns:tools="http://schemas.android.com/tools"> 3 xmlns:tools="http://schemas.android.com/tools">
4 4
  5 + <uses-permission android:name="android.permission.WRITE_INTERNAL_STORAGE" />
  6 +
5 <application 7 <application
6 android:allowBackup="true" 8 android:allowBackup="true"
7 android:dataExtractionRules="@xml/data_extraction_rules" 9 android:dataExtractionRules="@xml/data_extraction_rules"
1 package com.k2fsa.sherpa.onnx 1 package com.k2fsa.sherpa.onnx
2 2
  3 +import android.content.res.AssetManager
3 import android.media.MediaPlayer 4 import android.media.MediaPlayer
4 import android.net.Uri 5 import android.net.Uri
5 import android.os.Bundle 6 import android.os.Bundle
@@ -9,6 +10,8 @@ import android.widget.EditText @@ -9,6 +10,8 @@ import android.widget.EditText
9 import android.widget.Toast 10 import android.widget.Toast
10 import androidx.appcompat.app.AppCompatActivity 11 import androidx.appcompat.app.AppCompatActivity
11 import java.io.File 12 import java.io.File
  13 +import java.io.FileOutputStream
  14 +import java.io.IOException
12 15
13 const val TAG = "sherpa-onnx" 16 const val TAG = "sherpa-onnx"
14 17
@@ -19,7 +22,6 @@ class MainActivity : AppCompatActivity() { @@ -19,7 +22,6 @@ class MainActivity : AppCompatActivity() {
19 private lateinit var speed: EditText 22 private lateinit var speed: EditText
20 private lateinit var generate: Button 23 private lateinit var generate: Button
21 private lateinit var play: Button 24 private lateinit var play: Button
22 - private var hasFile: Boolean = false  
23 25
24 override fun onCreate(savedInstanceState: Bundle?) { 26 override fun onCreate(savedInstanceState: Bundle?) {
25 super.onCreate(savedInstanceState) 27 super.onCreate(savedInstanceState)
@@ -46,10 +48,10 @@ class MainActivity : AppCompatActivity() { @@ -46,10 +48,10 @@ class MainActivity : AppCompatActivity() {
46 val sampleText = "" 48 val sampleText = ""
47 text.setText(sampleText) 49 text.setText(sampleText)
48 50
49 - play.isEnabled = false; 51 + play.isEnabled = false
50 } 52 }
51 53
52 - fun onClickGenerate() { 54 + private fun onClickGenerate() {
53 val sidInt = sid.text.toString().toIntOrNull() 55 val sidInt = sid.text.toString().toIntOrNull()
54 if (sidInt == null || sidInt < 0) { 56 if (sidInt == null || sidInt < 0) {
55 Toast.makeText( 57 Toast.makeText(
@@ -77,7 +79,7 @@ class MainActivity : AppCompatActivity() { @@ -77,7 +79,7 @@ class MainActivity : AppCompatActivity() {
77 return 79 return
78 } 80 }
79 81
80 - play.isEnabled = false; 82 + play.isEnabled = false
81 val audio = tts.generate(text = textStr, sid = sidInt, speed = speedFloat) 83 val audio = tts.generate(text = textStr, sid = sidInt, speed = speedFloat)
82 84
83 val filename = application.filesDir.absolutePath + "/generated.wav" 85 val filename = application.filesDir.absolutePath + "/generated.wav"
@@ -89,7 +91,7 @@ class MainActivity : AppCompatActivity() { @@ -89,7 +91,7 @@ class MainActivity : AppCompatActivity() {
89 } 91 }
90 } 92 }
91 93
92 - fun onClickPlay() { 94 + private fun onClickPlay() {
93 val filename = application.filesDir.absolutePath + "/generated.wav" 95 val filename = application.filesDir.absolutePath + "/generated.wav"
94 val mediaPlayer = MediaPlayer.create( 96 val mediaPlayer = MediaPlayer.create(
95 applicationContext, 97 applicationContext,
@@ -98,10 +100,13 @@ class MainActivity : AppCompatActivity() { @@ -98,10 +100,13 @@ class MainActivity : AppCompatActivity() {
98 mediaPlayer.start() 100 mediaPlayer.start()
99 } 101 }
100 102
101 - fun initTts() {  
102 - var modelDir :String?  
103 - var modelName :String? 103 + private fun initTts() {
  104 + var modelDir: String?
  105 + var modelName: String?
104 var ruleFsts: String? 106 var ruleFsts: String?
  107 + var lexicon: String?
  108 + var dataDir: String?
  109 + var assets: AssetManager? = application.assets
105 110
106 // The purpose of such a design is to make the CI test easier 111 // The purpose of such a design is to make the CI test easier
107 // Please see 112 // Please see
@@ -109,21 +114,90 @@ class MainActivity : AppCompatActivity() { @@ -109,21 +114,90 @@ class MainActivity : AppCompatActivity() {
109 modelDir = null 114 modelDir = null
110 modelName = null 115 modelName = null
111 ruleFsts = null 116 ruleFsts = null
  117 + lexicon = null
  118 + dataDir = null
112 119
113 // Example 1: 120 // Example 1:
114 // modelDir = "vits-vctk" 121 // modelDir = "vits-vctk"
115 // modelName = "vits-vctk.onnx" 122 // modelName = "vits-vctk.onnx"
  123 + // lexicon = "lexicon.txt"
116 124
117 // Example 2: 125 // Example 2:
118 - // modelDir = "vits-piper-en_US-lessac-medium"  
119 - // modelName = "en_US-lessac-medium.onnx" 126 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  127 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  128 + // modelDir = "vits-piper-en_US-amy-low"
  129 + // modelName = "en_US-amy-low.onnx"
  130 + // dataDir = "vits-piper-en_US-amy-low/espeak-ng-data"
120 131
121 // Example 3: 132 // Example 3:
122 // modelDir = "vits-zh-aishell3" 133 // modelDir = "vits-zh-aishell3"
123 // modelName = "vits-aishell3.onnx" 134 // modelName = "vits-aishell3.onnx"
124 // ruleFsts = "vits-zh-aishell3/rule.fst" 135 // ruleFsts = "vits-zh-aishell3/rule.fst"
  136 + // lexicon = "lexicon.txt"
125 137
126 - val config = getOfflineTtsConfig(modelDir = modelDir!!, modelName = modelName!!, ruleFsts = ruleFsts ?: "")!!  
127 - tts = OfflineTts(assetManager = application.assets, config = config) 138 + if (dataDir != null) {
  139 + val newDir = copyDataDir(modelDir)
  140 + modelDir = newDir + "/" + modelDir
  141 + dataDir = newDir + "/" + dataDir
  142 + assets = null
  143 + }
  144 +
  145 + val config = getOfflineTtsConfig(
  146 + modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "",
  147 + dataDir = dataDir ?: "",
  148 + ruleFsts = ruleFsts ?: ""
  149 + )!!
  150 +
  151 + tts = OfflineTts(assetManager = assets, config = config)
  152 + }
  153 +
  154 +
  155 + private fun copyDataDir(dataDir: String): String {
  156 + println("data dir is $dataDir")
  157 + copyAssets(dataDir)
  158 +
  159 + val newDataDir = application.getExternalFilesDir(null)!!.absolutePath
  160 + println("newDataDir: $newDataDir")
  161 + return newDataDir
  162 + }
  163 +
  164 + private fun copyAssets(path: String) {
  165 + val assets: Array<String>?
  166 + try {
  167 + assets = application.assets.list(path)
  168 + if (assets!!.isEmpty()) {
  169 + copyFile(path)
  170 + } else {
  171 + val fullPath = "${application.getExternalFilesDir(null)}/$path"
  172 + val dir = File(fullPath)
  173 + dir.mkdirs()
  174 + for (asset in assets.iterator()) {
  175 + val p: String = if (path == "") "" else path + "/"
  176 + copyAssets(p + asset)
  177 + }
  178 + }
  179 + } catch (ex: IOException) {
  180 + Log.e(TAG, "Failed to copy $path. ${ex.toString()}")
  181 + }
  182 + }
  183 +
  184 + private fun copyFile(filename: String) {
  185 + try {
  186 + val istream = application.assets.open(filename)
  187 + val newFilename = application.getExternalFilesDir(null).toString() + "/" + filename
  188 + val ostream = FileOutputStream(newFilename)
  189 + // Log.i(TAG, "Copying $filename to $newFilename")
  190 + val buffer = ByteArray(1024)
  191 + var read = 0
  192 + while (read != -1) {
  193 + ostream.write(buffer, 0, read)
  194 + read = istream.read(buffer)
  195 + }
  196 + istream.close()
  197 + ostream.flush()
  198 + ostream.close()
  199 + } catch (ex: Exception) {
  200 + Log.e(TAG, "Failed to copy $filename, ${ex.toString()}")
  201 + }
128 } 202 }
129 } 203 }
@@ -5,8 +5,9 @@ import android.content.res.AssetManager @@ -5,8 +5,9 @@ import android.content.res.AssetManager
5 5
6 data class OfflineTtsVitsModelConfig( 6 data class OfflineTtsVitsModelConfig(
7 var model: String, 7 var model: String,
8 - var lexicon: String, 8 + var lexicon: String = "",
9 var tokens: String, 9 var tokens: String,
  10 + var dataDir: String = "",
10 var noiseScale: Float = 0.667f, 11 var noiseScale: Float = 0.667f,
11 var noiseScaleW: Float = 0.8f, 12 var noiseScaleW: Float = 0.8f,
12 var lengthScale: Float = 1.0f, 13 var lengthScale: Float = 1.0f,
@@ -22,6 +23,7 @@ data class OfflineTtsModelConfig( @@ -22,6 +23,7 @@ data class OfflineTtsModelConfig(
22 data class OfflineTtsConfig( 23 data class OfflineTtsConfig(
23 var model: OfflineTtsModelConfig, 24 var model: OfflineTtsModelConfig,
24 var ruleFsts: String = "", 25 var ruleFsts: String = "",
  26 + var maxNumSentences: Int = 2,
25 ) 27 )
26 28
27 class GeneratedAudio( 29 class GeneratedAudio(
@@ -117,18 +119,25 @@ class OfflineTts( @@ -117,18 +119,25 @@ class OfflineTts(
117 // please refer to 119 // please refer to
118 // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html 120 // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
119 // to download models 121 // to download models
120 -fun getOfflineTtsConfig(modelDir: String, modelName: String, ruleFsts: String): OfflineTtsConfig? { 122 +fun getOfflineTtsConfig(
  123 + modelDir: String,
  124 + modelName: String,
  125 + lexicon: String,
  126 + dataDir: String,
  127 + ruleFsts: String
  128 +): OfflineTtsConfig? {
121 return OfflineTtsConfig( 129 return OfflineTtsConfig(
122 model = OfflineTtsModelConfig( 130 model = OfflineTtsModelConfig(
123 vits = OfflineTtsVitsModelConfig( 131 vits = OfflineTtsVitsModelConfig(
124 model = "$modelDir/$modelName", 132 model = "$modelDir/$modelName",
125 - lexicon = "$modelDir/lexicon.txt",  
126 - tokens = "$modelDir/tokens.txt" 133 + lexicon = "$modelDir/$lexicon",
  134 + tokens = "$modelDir/tokens.txt",
  135 + dataDir = "$dataDir"
127 ), 136 ),
128 numThreads = 2, 137 numThreads = 2,
129 debug = true, 138 debug = true,
130 provider = "cpu", 139 provider = "cpu",
131 ), 140 ),
132 - ruleFsts=ruleFsts, 141 + ruleFsts = ruleFsts,
133 ) 142 )
134 } 143 }
@@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"
92 make -j4 92 make -j4
93 make install/strip 93 make install/strip
94 cp -fv android-onnxruntime-libs/jni/arm64-v8a/libonnxruntime.so install/lib 94 cp -fv android-onnxruntime-libs/jni/arm64-v8a/libonnxruntime.so install/lib
  95 +rm -rf install/lib/pkgconfig
@@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"
92 make -j4 92 make -j4
93 make install/strip 93 make install/strip
94 cp -fv android-onnxruntime-libs/jni/armeabi-v7a/libonnxruntime.so install/lib 94 cp -fv android-onnxruntime-libs/jni/armeabi-v7a/libonnxruntime.so install/lib
  95 +rm -rf install/lib/pkgconfig
@@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"
94 make -j4 94 make -j4
95 make install/strip 95 make install/strip
96 cp -fv android-onnxruntime-libs/jni/x86_64/libonnxruntime.so install/lib 96 cp -fv android-onnxruntime-libs/jni/x86_64/libonnxruntime.so install/lib
  97 +rm -rf install/lib/pkgconfig
@@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"
94 make -j4 94 make -j4
95 make install/strip 95 make install/strip
96 cp -fv android-onnxruntime-libs/jni/x86/libonnxruntime.so install/lib 96 cp -fv android-onnxruntime-libs/jni/x86/libonnxruntime.so install/lib
  97 +rm -rf install/lib/pkgconfig
@@ -140,7 +140,8 @@ echo "Generate xcframework" @@ -140,7 +140,8 @@ echo "Generate xcframework"
140 140
141 mkdir -p "build/simulator/lib" 141 mkdir -p "build/simulator/lib"
142 for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \ 142 for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \
143 - libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a; do 143 + libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \
  144 + libucd.a libpiper_phonemize.a libespeak-ng.a; do
144 lipo -create build/simulator_arm64/lib/${f} \ 145 lipo -create build/simulator_arm64/lib/${f} \
145 build/simulator_x86_64/lib/${f} \ 146 build/simulator_x86_64/lib/${f} \
146 -output build/simulator/lib/${f} 147 -output build/simulator/lib/${f}
@@ -154,7 +155,10 @@ libtool -static -o build/simulator/sherpa-onnx.a \ @@ -154,7 +155,10 @@ libtool -static -o build/simulator/sherpa-onnx.a \
154 build/simulator/lib/libsherpa-onnx-core.a \ 155 build/simulator/lib/libsherpa-onnx-core.a \
155 build/simulator/lib/libsherpa-onnx-fst.a \ 156 build/simulator/lib/libsherpa-onnx-fst.a \
156 build/simulator/lib/libsherpa-onnx-kaldifst-core.a \ 157 build/simulator/lib/libsherpa-onnx-kaldifst-core.a \
157 - build/simulator/lib/libkaldi-decoder-core.a 158 + build/simulator/lib/libkaldi-decoder-core.a \
  159 + build/simulator/lib/libucd.a \
  160 + build/simulator/lib/libpiper_phonemize.a \
  161 + build/simulator/lib/libespeak-ng.a \
158 162
159 libtool -static -o build/os64/sherpa-onnx.a \ 163 libtool -static -o build/os64/sherpa-onnx.a \
160 build/os64/lib/libkaldi-native-fbank-core.a \ 164 build/os64/lib/libkaldi-native-fbank-core.a \
@@ -162,7 +166,10 @@ libtool -static -o build/os64/sherpa-onnx.a \ @@ -162,7 +166,10 @@ libtool -static -o build/os64/sherpa-onnx.a \
162 build/os64/lib/libsherpa-onnx-core.a \ 166 build/os64/lib/libsherpa-onnx-core.a \
163 build/os64/lib/libsherpa-onnx-fst.a \ 167 build/os64/lib/libsherpa-onnx-fst.a \
164 build/os64/lib/libsherpa-onnx-kaldifst-core.a \ 168 build/os64/lib/libsherpa-onnx-kaldifst-core.a \
165 - build/os64/lib/libkaldi-decoder-core.a 169 + build/os64/lib/libkaldi-decoder-core.a \
  170 + build/os64/lib/libucd.a \
  171 + build/os64/lib/libpiper_phonemize.a \
  172 + build/os64/lib/libespeak-ng.a \
166 173
167 174
168 rm -rf sherpa-onnx.xcframework 175 rm -rf sherpa-onnx.xcframework
@@ -29,4 +29,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \ @@ -29,4 +29,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \
29 ./install/lib/libkaldi-native-fbank-core.a \ 29 ./install/lib/libkaldi-native-fbank-core.a \
30 ./install/lib/libsherpa-onnx-fst.a \ 30 ./install/lib/libsherpa-onnx-fst.a \
31 ./install/lib/libsherpa-onnx-kaldifst-core.a \ 31 ./install/lib/libsherpa-onnx-kaldifst-core.a \
32 - ./install/lib/libkaldi-decoder-core.a 32 + ./install/lib/libkaldi-decoder-core.a \
  33 + ./install/lib/libucd.a \
  34 + ./install/lib/libpiper_phonemize.a \
  35 + ./install/lib/libespeak-ng.a
@@ -65,6 +65,29 @@ static struct cag_option options[] = { @@ -65,6 +65,29 @@ static struct cag_option options[] = {
65 .identifier = 'a', 65 .identifier = 'a',
66 .description = 66 .description =
67 "Filename to save the generated audio. Default to ./generated.wav"}, 67 "Filename to save the generated audio. Default to ./generated.wav"},
  68 +
  69 + {.access_name = "tts-rule-fsts",
  70 + .value_name = "/path/to/rule.fst",
  71 + .identifier = 'b',
  72 + .description = "If not empty, it contains a list of rule FST filenames."
  73 + "Multiple filenames are separated by a comma and they are "
  74 + "applied from left to right. An example value: "
  75 + "rule1.fst,rule2.fst,rule3.fst"},
  76 +
  77 + {.access_name = "max-num-sentences",
  78 + .value_name = "2",
  79 + .identifier = 'c',
  80 + .description = "Maximum number of sentences that we process at a time. "
  81 + "This is to avoid OOM for very long input text. "
  82 + "If you set it to -1, then we process all sentences in a "
  83 + "single batch."},
  84 +
  85 + {.access_name = "vits-data-dir",
  86 + .value_name = "/path/to/espeak-ng-data",
  87 + .identifier = 'd',
  88 + .description =
  89 + "Path to espeak-ng-data. If it is given, --vits-lexicon is ignored"},
  90 +
68 }; 91 };
69 92
70 static void ShowUsage() { 93 static void ShowUsage() {
@@ -163,15 +186,38 @@ int32_t main(int32_t argc, char *argv[]) { @@ -163,15 +186,38 @@ int32_t main(int32_t argc, char *argv[]) {
163 free((void *)filename); 186 free((void *)filename);
164 filename = strdup(value); 187 filename = strdup(value);
165 break; 188 break;
  189 + case 'b':
  190 + config.rule_fsts = value;
  191 + break;
  192 + case 'c':
  193 + config.max_num_sentences = atoi(value);
  194 + break;
  195 + case 'd':
  196 + config.model.vits.data_dir = value;
  197 + break;
  198 + case '?':
  199 + fprintf(stderr, "Unknown option\n");
  200 + // fall through
166 case 'h': 201 case 'h':
167 // fall through 202 // fall through
168 default: 203 default:
169 ShowUsage(); 204 ShowUsage();
170 } 205 }
171 } 206 }
  207 + fprintf(stderr, "here\n");
  208 +
  209 + if (!config.model.vits.model) {
  210 + fprintf(stderr, "Please provide --vits-model\n");
  211 + ShowUsage();
  212 + }
  213 +
  214 + if (!config.model.vits.tokens) {
  215 + fprintf(stderr, "Please provide --vits-tokens\n");
  216 + ShowUsage();
  217 + }
172 218
173 - if (!config.model.vits.model || !config.model.vits.lexicon ||  
174 - !config.model.vits.tokens) { 219 + if (!config.model.vits.data_dir && !config.model.vits.lexicon) {
  220 + fprintf(stderr, "Please provide --vits-data-dir or --vits-lexicon\n");
175 ShowUsage(); 221 ShowUsage();
176 } 222 }
177 223
@@ -73,6 +73,10 @@ class BuildExtension(build_ext): @@ -73,6 +73,10 @@ class BuildExtension(build_ext):
73 73
74 extra_cmake_args = f" -DCMAKE_INSTALL_PREFIX={install_dir} " 74 extra_cmake_args = f" -DCMAKE_INSTALL_PREFIX={install_dir} "
75 extra_cmake_args += " -DBUILD_SHARED_LIBS=ON " 75 extra_cmake_args += " -DBUILD_SHARED_LIBS=ON "
  76 + extra_cmake_args += " -DBUILD_PIPER_PHONMIZE_EXE=OFF "
  77 + extra_cmake_args += " -DBUILD_PIPER_PHONMIZE_TESTS=OFF "
  78 + extra_cmake_args += " -DBUILD_ESPEAK_NG_EXE=OFF "
  79 + extra_cmake_args += " -DBUILD_ESPEAK_NG_TESTS=OFF "
76 80
77 extra_cmake_args += " -DSHERPA_ONNX_ENABLE_CHECK=OFF " 81 extra_cmake_args += " -DSHERPA_ONNX_ENABLE_CHECK=OFF "
78 extra_cmake_args += " -DSHERPA_ONNX_ENABLE_PYTHON=ON " 82 extra_cmake_args += " -DSHERPA_ONNX_ENABLE_PYTHON=ON "
@@ -146,6 +150,9 @@ class BuildExtension(build_ext): @@ -146,6 +150,9 @@ class BuildExtension(build_ext):
146 binaries += ["sherpa-onnx-core.dll"] 150 binaries += ["sherpa-onnx-core.dll"]
147 binaries += ["sherpa-onnx-portaudio.dll"] 151 binaries += ["sherpa-onnx-portaudio.dll"]
148 binaries += ["onnxruntime.dll"] 152 binaries += ["onnxruntime.dll"]
  153 + binaries += ["piper_phonemize.dll"]
  154 + binaries += ["espeak-ng.dll"]
  155 + binaries += ["ucd.dll"]
149 binaries += ["kaldi-decoder-core.dll"] 156 binaries += ["kaldi-decoder-core.dll"]
150 binaries += ["sherpa-onnx-fst.lib"] 157 binaries += ["sherpa-onnx-fst.lib"]
151 binaries += ["sherpa-onnx-kaldifst-core.lib"] 158 binaries += ["sherpa-onnx-kaldifst-core.lib"]
@@ -161,5 +168,8 @@ class BuildExtension(build_ext): @@ -161,5 +168,8 @@ class BuildExtension(build_ext):
161 shutil.copy(f"{src_file}", f"{out_bin_dir}/") 168 shutil.copy(f"{src_file}", f"{out_bin_dir}/")
162 169
163 shutil.rmtree(f"{install_dir}/bin") 170 shutil.rmtree(f"{install_dir}/bin")
  171 + shutil.rmtree(f"{install_dir}/share")
  172 + shutil.rmtree(f"{install_dir}/lib/pkgconfig")
  173 +
164 if is_windows(): 174 if is_windows():
165 shutil.rmtree(f"{install_dir}/lib") 175 shutil.rmtree(f"{install_dir}/lib")
@@ -86,7 +86,7 @@ function(download_espeak_ng_for_piper) @@ -86,7 +86,7 @@ function(download_espeak_ng_for_piper)
86 -Wno-unused-result 86 -Wno-unused-result
87 -Wno-format-overflow 87 -Wno-format-overflow
88 -Wno-format-truncation 88 -Wno-format-truncation
89 - -Wno-maybe-uninitialized 89 + -Wno-uninitialized
90 -Wno-format 90 -Wno-format
91 ) 91 )
92 92
@@ -13,4 +13,4 @@ Cflags: -I"${includedir}" @@ -13,4 +13,4 @@ Cflags: -I"${includedir}"
13 # Note: -lcargs is required only for the following file 13 # Note: -lcargs is required only for the following file
14 # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c 14 # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c
15 # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c 15 # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c
16 -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lcargs -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ 16 +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
@@ -40,7 +40,7 @@ @@ -40,7 +40,7 @@
40 /* End PBXContainerItemProxy section */ 40 /* End PBXContainerItemProxy section */
41 41
42 /* Begin PBXFileReference section */ 42 /* Begin PBXFileReference section */
43 - C93989AF2A89FE33009AB859 /* onnxruntime.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = onnxruntime.xcframework; path = "../../build-ios/ios-onnxruntime/1.16.0/onnxruntime.xcframework"; sourceTree = "<group>"; }; 43 + C93989AF2A89FE33009AB859 /* onnxruntime.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = onnxruntime.xcframework; path = "../../build-ios/ios-onnxruntime/1.16.3/onnxruntime.xcframework"; sourceTree = "<group>"; };
44 C93989B12A89FF78009AB859 /* decoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = decoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"; sourceTree = "<group>"; }; 44 C93989B12A89FF78009AB859 /* decoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = decoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"; sourceTree = "<group>"; };
45 C93989B22A89FF78009AB859 /* encoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = encoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx"; sourceTree = "<group>"; }; 45 C93989B22A89FF78009AB859 /* encoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = encoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx"; sourceTree = "<group>"; };
46 C93989B32A89FF78009AB859 /* tokens.txt */ = {isa = PBXFileReference; lastKnownFileType = text; name = tokens.txt; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt"; sourceTree = "<group>"; }; 46 C93989B32A89FF78009AB859 /* tokens.txt */ = {isa = PBXFileReference; lastKnownFileType = text; name = tokens.txt; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt"; sourceTree = "<group>"; };
@@ -65,7 +65,7 @@ struct ContentView: View { @@ -65,7 +65,7 @@ struct ContentView: View {
65 self.filename = tempDirectoryURL.appendingPathComponent("test.wav") 65 self.filename = tempDirectoryURL.appendingPathComponent("test.wav")
66 } 66 }
67 67
68 - let ret = audio.save(filename: filename.path) 68 + let _ = audio.save(filename: filename.path)
69 69
70 self.audioPlayer = try! AVAudioPlayer(contentsOf: filename) 70 self.audioPlayer = try! AVAudioPlayer(contentsOf: filename)
71 self.audioPlayer.play() 71 self.audioPlayer.play()
@@ -7,6 +7,12 @@ @@ -7,6 +7,12 @@
7 7
8 import Foundation 8 import Foundation
9 9
  10 +
  11 +// used to get the path to espeak-ng-data
  12 +func resourceURL(to path: String) -> String {
  13 + return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path
  14 +}
  15 +
10 func getResource(_ forResource: String, _ ofType: String) -> String { 16 func getResource(_ forResource: String, _ ofType: String) -> String {
11 let path = Bundle.main.path(forResource: forResource, ofType: ofType) 17 let path = Bundle.main.path(forResource: forResource, ofType: ofType)
12 precondition( 18 precondition(
@@ -59,8 +65,30 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper { @@ -59,8 +65,30 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper {
59 return SherpaOnnxOfflineTtsWrapper(config: &config) 65 return SherpaOnnxOfflineTtsWrapper(config: &config)
60 } 66 }
61 67
  68 +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  69 +func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper {
  70 + // please see https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  71 +
  72 + // vits-vctk.onnx
  73 + let model = getResource("en_US-amy-low", "onnx")
  74 +
  75 + // tokens.txt
  76 + let tokens = getResource("tokens", "txt")
  77 +
  78 + // in this case, we don't need lexicon.txt
  79 + let dataDir = resourceURL(to: "espeak-ng-data")
  80 +
  81 + let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: "", tokens: tokens, dataDir: dataDir)
  82 + let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits)
  83 + var config = sherpaOnnxOfflineTtsConfig(model: modelConfig)
  84 +
  85 + return SherpaOnnxOfflineTtsWrapper(config: &config)
  86 +}
  87 +
62 func createOfflineTts() -> SherpaOnnxOfflineTtsWrapper { 88 func createOfflineTts() -> SherpaOnnxOfflineTtsWrapper {
63 - return getTtsForVCTK() 89 + return getTtsFor_en_US_amy_low()
  90 +
  91 + // return getTtsForVCTK()
64 92
65 // return getTtsForAishell3() 93 // return getTtsForAishell3()
66 94
@@ -8,20 +8,22 @@ fun main() { @@ -8,20 +8,22 @@ fun main() {
8 } 8 }
9 9
10 fun testTts() { 10 fun testTts() {
  11 + // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  12 + // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
11 var config = OfflineTtsConfig( 13 var config = OfflineTtsConfig(
12 model=OfflineTtsModelConfig( 14 model=OfflineTtsModelConfig(
13 vits=OfflineTtsVitsModelConfig( 15 vits=OfflineTtsVitsModelConfig(
14 - model="./vits-zh-aishell3/vits-aishell3.onnx",  
15 - lexicon="./vits-zh-aishell3/lexicon.txt",  
16 - tokens="./vits-zh-aishell3/tokens.txt", 16 + model="./vits-piper-en_US-amy-low/en_US-amy-low.onnx",
  17 + tokens="./vits-piper-en_US-amy-low/tokens.txt",
  18 + dataDir="./vits-piper-en_US-amy-low/espeak-ng-data",
17 ), 19 ),
18 numThreads=1, 20 numThreads=1,
19 debug=true, 21 debug=true,
20 ) 22 )
21 ) 23 )
22 val tts = OfflineTts(config=config) 24 val tts = OfflineTts(config=config)
23 - val audio = tts.generate(text="林美丽最美丽!", sid=99, speed=1.2f)  
24 - audio.save(filename="99.wav") 25 + val audio = tts.generate(text="“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”")
  26 + audio.save(filename="test-en.wav")
25 } 27 }
26 28
27 fun testAsr() { 29 fun testAsr() {
@@ -34,9 +34,10 @@ if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then @@ -34,9 +34,10 @@ if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
34 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21 34 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
35 fi 35 fi
36 36
37 -if [ ! -f ./vits-zh-aishell3/tokens.txt ]; then  
38 - git lfs install  
39 - git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 37 +if [ ! -f ./vits-piper-en_US-amy-low/en_US-amy-low.onnx ]; then
  38 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  39 + tar xf vits-piper-en_US-amy-low.tar.bz2
  40 + rm vits-piper-en_US-amy-low.tar.bz2
40 fi 41 fi
41 42
42 kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt Tts.kt 43 kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt Tts.kt
1 node_modules 1 node_modules
  2 +lib
2 package-lock.json 3 package-lock.json
@@ -42,15 +42,14 @@ In the following, we demonstrate how to run text-to-speech. @@ -42,15 +42,14 @@ In the following, we demonstrate how to run text-to-speech.
42 ## ./test-offline-tts-en.js 42 ## ./test-offline-tts-en.js
43 43
44 [./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use 44 [./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use
45 -a VITS pretrained model  
46 -[VCTK](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers) 45 +[vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2)
47 for text-to-speech. 46 for text-to-speech.
48 47
49 You can use the following command to run it: 48 You can use the following command to run it:
50 49
51 ```bash 50 ```bash
52 -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2  
53 -tar xvf vits-vctk.tar.bz2 51 +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  52 +tar xvf vits-piper-en_US-amy-low.tar.bz2
54 node ./test-offline-tts-en.js 53 node ./test-offline-tts-en.js
55 ``` 54 ```
56 55
@@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx'); @@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx');
4 4
5 function createOfflineTts() { 5 function createOfflineTts() {
6 const vits = new sherpa_onnx.OfflineTtsVitsModelConfig(); 6 const vits = new sherpa_onnx.OfflineTtsVitsModelConfig();
7 - vits.model = './vits-vctk/vits-vctk.onnx';  
8 - vits.lexicon = './vits-vctk/lexicon.txt';  
9 - vits.tokens = './vits-vctk/tokens.txt'; 7 + vits.model = 'vits-piper-en_US-amy-low/en_US-amy-low.onnx'
  8 + vits.tokens = './vits-piper-en_US-amy-low/tokens.txt';
  9 + vits.dataDir = './vits-piper-en_US-amy-low/espeak-ng-data'
10 10
11 const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); 11 const modelConfig = new sherpa_onnx.OfflineTtsModelConfig();
12 modelConfig.vits = vits; 12 modelConfig.vits = vits;
@@ -18,10 +18,11 @@ function createOfflineTts() { @@ -18,10 +18,11 @@ function createOfflineTts() {
18 } 18 }
19 19
20 const tts = createOfflineTts(); 20 const tts = createOfflineTts();
21 -const speakerId = 99; 21 +const speakerId = 0;
22 const speed = 1.0; 22 const speed = 1.0;
23 -const audio =  
24 - tts.generate('Good morning. How are you doing?', speakerId, speed); 23 +const audio = tts.generate(
  24 + '“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”',
  25 + speakerId, speed);
25 audio.save('./test-en.wav'); 26 audio.save('./test-en.wav');
26 console.log('Saved to test-en.wav successfully.'); 27 console.log('Saved to test-en.wav successfully.');
27 tts.free(); 28 tts.free();
@@ -63,16 +63,26 @@ def get_args(): @@ -63,16 +63,26 @@ def get_args():
63 parser.add_argument( 63 parser.add_argument(
64 "--vits-lexicon", 64 "--vits-lexicon",
65 type=str, 65 type=str,
  66 + default="",
66 help="Path to lexicon.txt", 67 help="Path to lexicon.txt",
67 ) 68 )
68 69
69 parser.add_argument( 70 parser.add_argument(
70 "--vits-tokens", 71 "--vits-tokens",
71 type=str, 72 type=str,
  73 + default="",
72 help="Path to tokens.txt", 74 help="Path to tokens.txt",
73 ) 75 )
74 76
75 parser.add_argument( 77 parser.add_argument(
  78 + "--vits-data-dir",
  79 + type=str,
  80 + default="",
  81 + help="""Path to the dict director of espeak-ng. If it is specified,
  82 + --vits-lexicon and --vits-tokens are ignored""",
  83 + )
  84 +
  85 + parser.add_argument(
76 "--tts-rule-fsts", 86 "--tts-rule-fsts",
77 type=str, 87 type=str,
78 default="", 88 default="",
@@ -80,6 +90,17 @@ def get_args(): @@ -80,6 +90,17 @@ def get_args():
80 ) 90 )
81 91
82 parser.add_argument( 92 parser.add_argument(
  93 + "--max-num-sentences",
  94 + type=int,
  95 + default=2,
  96 + help="""Max number of sentences in a batch to avoid OOM if the input
  97 + text is very long. Set it to -1 to process all the sentences in a
  98 + single batch. A smaller value does not mean it is slower compared
  99 + to a larger one on CPU.
  100 + """,
  101 + )
  102 +
  103 + parser.add_argument(
83 "--output-filename", 104 "--output-filename",
84 type=str, 105 type=str,
85 default="./generated.wav", 106 default="./generated.wav",
@@ -142,14 +163,19 @@ def main(): @@ -142,14 +163,19 @@ def main():
142 vits=sherpa_onnx.OfflineTtsVitsModelConfig( 163 vits=sherpa_onnx.OfflineTtsVitsModelConfig(
143 model=args.vits_model, 164 model=args.vits_model,
144 lexicon=args.vits_lexicon, 165 lexicon=args.vits_lexicon,
  166 + data_dir=args.vits_data_dir,
145 tokens=args.vits_tokens, 167 tokens=args.vits_tokens,
146 ), 168 ),
147 provider=args.provider, 169 provider=args.provider,
148 debug=args.debug, 170 debug=args.debug,
149 num_threads=args.num_threads, 171 num_threads=args.num_threads,
150 ), 172 ),
151 - rule_fsts=args.tts_rule_fsts 173 + rule_fsts=args.tts_rule_fsts,
  174 + max_num_sentences=args.max_num_sentences,
152 ) 175 )
  176 + if not tts_config.validate():
  177 + raise ValueError("Please check your config")
  178 +
153 tts = sherpa_onnx.OfflineTts(tts_config) 179 tts = sherpa_onnx.OfflineTts(tts_config)
154 180
155 start = time.time() 181 start = time.time()
@@ -37,13 +37,9 @@ model_dir={{ tts_model.model_dir }} @@ -37,13 +37,9 @@ model_dir={{ tts_model.model_dir }}
37 model_name={{ tts_model.model_name }} 37 model_name={{ tts_model.model_name }}
38 lang={{ tts_model.lang }} 38 lang={{ tts_model.lang }}
39 39
40 -mkdir $model_dir  
41 -cd $model_dir  
42 -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/$model_name  
43 -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/lexicon.txt  
44 -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/tokens.txt  
45 -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/MODEL_CARD 2>/dev/null || true  
46 -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/rule.fst 2>/dev/null || true 40 +wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2
  41 +tar xf $model_dir.tar.bz2
  42 +rm $model_dir.tar.bz2
47 43
48 popd 44 popd
49 # Now we are at the project root directory 45 # Now we are at the project root directory
@@ -52,11 +48,19 @@ git checkout . @@ -52,11 +48,19 @@ git checkout .
52 pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx 48 pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx
53 sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt 49 sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt
54 sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt 50 sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt
  51 +
55 {% if tts_model.rule_fsts %} 52 {% if tts_model.rule_fsts %}
56 rule_fsts={{ tts_model.rule_fsts }} 53 rule_fsts={{ tts_model.rule_fsts }}
57 sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt 54 sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt
58 {% endif %} 55 {% endif %}
59 56
  57 +{% if tts_model.data_dir %}
  58 + data_dir={{ tts_model.data_dir }}
  59 + sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt
  60 +{% else %}
  61 + sed -i.bak s/"lexicon = null"/"lexicon = \"lexicon.txt\""/ ./MainActivity.kt
  62 +{% endif %}
  63 +
60 git diff 64 git diff
61 popd 65 popd
62 66
@@ -27,9 +27,122 @@ def get_args(): @@ -27,9 +27,122 @@ def get_args():
27 @dataclass 27 @dataclass
28 class TtsModel: 28 class TtsModel:
29 model_dir: str 29 model_dir: str
30 - model_name: str  
31 - lang: str # en, zh, fr, de, etc. 30 + model_name: str = ""
  31 + lang: str = "" # en, zh, fr, de, etc.
32 rule_fsts: Optional[List[str]] = None 32 rule_fsts: Optional[List[str]] = None
  33 + data_dir: Optional[str] = None
  34 +
  35 +
  36 +def get_piper_models() -> List[TtsModel]:
  37 + models = [
  38 + TtsModel(model_dir="vits-piper-ar_JO-kareem-low"),
  39 + TtsModel(model_dir="vits-piper-ar_JO-kareem-medium"),
  40 + TtsModel(model_dir="vits-piper-ca_ES-upc_ona-medium"),
  41 + TtsModel(model_dir="vits-piper-ca_ES-upc_ona-x_low"),
  42 + TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"),
  43 + TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"),
  44 + TtsModel(model_dir="vits-piper-cs_CZ-jirka-medium"),
  45 + TtsModel(model_dir="vits-piper-da_DK-talesyntese-medium"),
  46 + TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low"),
  47 + TtsModel(model_dir="vits-piper-de_DE-karlsson-low"),
  48 + TtsModel(model_dir="vits-piper-de_DE-kerstin-low"),
  49 + TtsModel(model_dir="vits-piper-de_DE-pavoque-low"),
  50 + TtsModel(model_dir="vits-piper-de_DE-ramona-low"),
  51 + TtsModel(model_dir="vits-piper-de_DE-thorsten-high"),
  52 + TtsModel(model_dir="vits-piper-de_DE-thorsten-low"),
  53 + TtsModel(model_dir="vits-piper-de_DE-thorsten-medium"),
  54 + TtsModel(model_dir="vits-piper-de_DE-thorsten_emotional-medium"),
  55 + TtsModel(model_dir="vits-piper-el_GR-rapunzelina-low"),
  56 + TtsModel(model_dir="vits-piper-en_GB-alan-low"),
  57 + TtsModel(model_dir="vits-piper-en_GB-alan-medium"),
  58 + TtsModel(model_dir="vits-piper-en_GB-alba-medium"),
  59 + TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium"),
  60 + TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium"),
  61 + TtsModel(model_dir="vits-piper-en_GB-semaine-medium"),
  62 + TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low"),
  63 + TtsModel(model_dir="vits-piper-en_GB-sweetbbak-amy"),
  64 + TtsModel(model_dir="vits-piper-en_GB-vctk-medium"),
  65 + TtsModel(model_dir="vits-piper-en_US-amy-low"),
  66 + TtsModel(model_dir="vits-piper-en_US-amy-medium"),
  67 + TtsModel(model_dir="vits-piper-en_US-arctic-medium"),
  68 + TtsModel(model_dir="vits-piper-en_US-danny-low"),
  69 + TtsModel(model_dir="vits-piper-en_US-hfc_male-medium"),
  70 + TtsModel(model_dir="vits-piper-en_US-joe-medium"),
  71 + TtsModel(model_dir="vits-piper-en_US-kathleen-low"),
  72 + TtsModel(model_dir="vits-piper-en_US-kusal-medium"),
  73 + TtsModel(model_dir="vits-piper-en_US-l2arctic-medium"),
  74 + TtsModel(model_dir="vits-piper-en_US-lessac-high"),
  75 + TtsModel(model_dir="vits-piper-en_US-lessac-low"),
  76 + TtsModel(model_dir="vits-piper-en_US-lessac-medium"),
  77 + TtsModel(model_dir="vits-piper-en_US-libritts-high"),
  78 + TtsModel(model_dir="vits-piper-en_US-libritts_r-medium"),
  79 + TtsModel(model_dir="vits-piper-en_US-ryan-high"),
  80 + TtsModel(model_dir="vits-piper-en_US-ryan-low"),
  81 + TtsModel(model_dir="vits-piper-en_US-ryan-medium"),
  82 + TtsModel(model_dir="vits-piper-es_ES-carlfm-x_low"),
  83 + TtsModel(model_dir="vits-piper-es_ES-davefx-medium"),
  84 + TtsModel(model_dir="vits-piper-es_ES-mls_10246-low"),
  85 + TtsModel(model_dir="vits-piper-es_ES-mls_9972-low"),
  86 + TtsModel(model_dir="vits-piper-es_ES-sharvard-medium"),
  87 + TtsModel(model_dir="vits-piper-es_MX-ald-medium"),
  88 + TtsModel(model_dir="vits-piper-fi_FI-harri-low"),
  89 + TtsModel(model_dir="vits-piper-fi_FI-harri-medium"),
  90 + TtsModel(model_dir="vits-piper-fr_FR-siwis-low"),
  91 + TtsModel(model_dir="vits-piper-fr_FR-siwis-medium"),
  92 + TtsModel(model_dir="vits-piper-fr_FR-upmc-medium"),
  93 + TtsModel(model_dir="vits-piper-hu_HU-anna-medium"),
  94 + TtsModel(model_dir="vits-piper-hu_HU-berta-medium"),
  95 + TtsModel(model_dir="vits-piper-hu_HU-imre-medium"),
  96 + TtsModel(model_dir="vits-piper-is_IS-bui-medium"),
  97 + TtsModel(model_dir="vits-piper-is_IS-salka-medium"),
  98 + TtsModel(model_dir="vits-piper-is_IS-steinn-medium"),
  99 + TtsModel(model_dir="vits-piper-is_IS-ugla-medium"),
  100 + TtsModel(model_dir="vits-piper-it_IT-riccardo-x_low"),
  101 + TtsModel(model_dir="vits-piper-ka_GE-natia-medium"),
  102 + TtsModel(model_dir="vits-piper-kk_KZ-iseke-x_low"),
  103 + TtsModel(model_dir="vits-piper-kk_KZ-issai-high"),
  104 + TtsModel(model_dir="vits-piper-kk_KZ-raya-x_low"),
  105 + TtsModel(model_dir="vits-piper-lb_LU-marylux-medium"),
  106 + TtsModel(model_dir="vits-piper-ne_NP-google-medium"),
  107 + TtsModel(model_dir="vits-piper-ne_NP-google-x_low"),
  108 + TtsModel(model_dir="vits-piper-nl_BE-nathalie-medium"),
  109 + TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
  110 + TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
  111 + TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
  112 + TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
  113 + TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
  114 + TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
  115 + TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
  116 + TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
  117 + TtsModel(model_dir="vits-piper-pl_PL-mc_speech-medium"),
  118 + TtsModel(model_dir="vits-piper-pl_PL-mls_6892-low"),
  119 + TtsModel(model_dir="vits-piper-pt_BR-edresson-low"),
  120 + TtsModel(model_dir="vits-piper-pt_BR-faber-medium"),
  121 + TtsModel(model_dir="vits-piper-pt_PT-tugao-medium"),
  122 + TtsModel(model_dir="vits-piper-ro_RO-mihai-medium"),
  123 + TtsModel(model_dir="vits-piper-ru_RU-denis-medium"),
  124 + TtsModel(model_dir="vits-piper-ru_RU-dmitri-medium"),
  125 + TtsModel(model_dir="vits-piper-ru_RU-irina-medium"),
  126 + TtsModel(model_dir="vits-piper-ru_RU-ruslan-medium"),
  127 + TtsModel(model_dir="vits-piper-sk_SK-lili-medium"),
  128 + TtsModel(model_dir="vits-piper-sr_RS-serbski_institut-medium"),
  129 + TtsModel(model_dir="vits-piper-sv_SE-nst-medium"),
  130 + TtsModel(model_dir="vits-piper-sw_CD-lanfrica-medium"),
  131 + TtsModel(model_dir="vits-piper-tr_TR-dfki-medium"),
  132 + TtsModel(model_dir="vits-piper-tr_TR-fahrettin-medium"),
  133 + TtsModel(model_dir="vits-piper-uk_UA-lada-x_low"),
  134 + TtsModel(model_dir="vits-piper-uk_UA-ukrainian_tts-medium"),
  135 + TtsModel(model_dir="vits-piper-vi_VN-25hours_single-low"),
  136 + TtsModel(model_dir="vits-piper-vi_VN-vais1000-medium"),
  137 + TtsModel(model_dir="vits-piper-vi_VN-vivos-x_low"),
  138 + TtsModel(model_dir="vits-piper-zh_CN-huayan-medium"),
  139 + ]
  140 + for m in models:
  141 + m.data_dir = m.model_dir + "/" + "espeak-ng-data"
  142 + m.model_name = m.model_dir[len("vits-piper-") :] + ".onnx"
  143 + m.lang = "en"
  144 +
  145 + return models
33 146
34 147
35 def get_all_models() -> List[TtsModel]: 148 def get_all_models() -> List[TtsModel]:
@@ -98,56 +211,6 @@ def get_all_models() -> List[TtsModel]: @@ -98,56 +211,6 @@ def get_all_models() -> List[TtsModel]:
98 # English (US) 211 # English (US)
99 TtsModel(model_dir="vits-vctk", model_name="vits-vctk.onnx", lang="en"), 212 TtsModel(model_dir="vits-vctk", model_name="vits-vctk.onnx", lang="en"),
100 TtsModel(model_dir="vits-ljs", model_name="vits-ljs.onnx", lang="en"), 213 TtsModel(model_dir="vits-ljs", model_name="vits-ljs.onnx", lang="en"),
101 - TtsModel(model_dir="vits-piper-en_US-amy-low", model_name="en_US-amy-low.onnx", lang="en",),  
102 - TtsModel(model_dir="vits-piper-en_US-amy-medium", model_name="en_US-amy-medium.onnx", lang="en",),  
103 - TtsModel(model_dir="vits-piper-en_US-arctic-medium", model_name="en_US-arctic-medium.onnx", lang="en",),  
104 - TtsModel(model_dir="vits-piper-en_US-danny-low", model_name="en_US-danny-low.onnx", lang="en",),  
105 - TtsModel(model_dir="vits-piper-en_US-hfc_male-medium", model_name="en_US-hfc_male-medium.onnx", lang="en",),  
106 - TtsModel(model_dir="vits-piper-en_US-joe-medium", model_name="en_US-joe-medium.onnx", lang="en",),  
107 - TtsModel(model_dir="vits-piper-en_US-kathleen-low", model_name="en_US-kathleen-low.onnx", lang="en",),  
108 - TtsModel(model_dir="vits-piper-en_US-kusal-medium", model_name="en_US-kusal-medium.onnx", lang="en",),  
109 - TtsModel(model_dir="vits-piper-en_US-l2arctic-medium", model_name="en_US-l2arctic-medium.onnx", lang="en",),  
110 - TtsModel(model_dir="vits-piper-en_US-lessac-low", model_name="en_US-lessac-low.onnx", lang="en",),  
111 - TtsModel(model_dir="vits-piper-en_US-lessac-medium", model_name="en_US-lessac-medium.onnx", lang="en",),  
112 - TtsModel(model_dir="vits-piper-en_US-lessac-high", model_name="en_US-lessac-high.onnx", lang="en",),  
113 - TtsModel(model_dir="vits-piper-en_US-libritts-high", model_name="en_US-libritts-high.onnx", lang="en",),  
114 - TtsModel(model_dir="vits-piper-en_US-libritts_r-medium", model_name="en_US-libritts_r-medium.onnx", lang="en",),  
115 - TtsModel(model_dir="vits-piper-en_US-ryan-low", model_name="en_US-ryan-low.onnx", lang="en",),  
116 - TtsModel(model_dir="vits-piper-en_US-ryan-medium", model_name="en_US-ryan-medium.onnx", lang="en",),  
117 - TtsModel(model_dir="vits-piper-en_US-ryan-high", model_name="en_US-ryan-high.onnx", lang="en",),  
118 - # English (GB)  
119 - TtsModel(model_dir="vits-piper-en_GB-alan-low", model_name="en_GB-alan-low.onnx",lang="en",),  
120 - TtsModel(model_dir="vits-piper-en_GB-alan-medium", model_name="en_GB-alan-medium.onnx",lang="en",),  
121 - TtsModel(model_dir="vits-piper-en_GB-alba-medium", model_name="en_GB-alba-medium.onnx",lang="en",),  
122 - TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium", model_name="en_GB-jenny_dioco-medium.onnx",lang="en",),  
123 - TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium", model_name="en_GB-northern_english_male-medium.onnx",lang="en",),  
124 - TtsModel(model_dir="vits-piper-en_GB-semaine-medium", model_name="en_GB-semaine-medium.onnx",lang="en",),  
125 - TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low", model_name="en_GB-southern_english_female-low.onnx",lang="en",),  
126 - TtsModel(model_dir="vits-piper-en_GB-vctk-medium", model_name="en_GB-vctk-medium.onnx",lang="en",),  
127 - # German (DE)  
128 - TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low", model_name="de_DE-eva_k-x_low.onnx",lang="de",),  
129 - TtsModel(model_dir="vits-piper-de_DE-karlsson-low", model_name="de_DE-karlsson-low.onnx",lang="de",),  
130 - TtsModel(model_dir="vits-piper-de_DE-kerstin-low", model_name="de_DE-kerstin-low.onnx",lang="de",),  
131 - TtsModel(model_dir="vits-piper-de_DE-pavoque-low", model_name="de_DE-pavoque-low.onnx",lang="de",),  
132 - TtsModel(model_dir="vits-piper-de_DE-ramona-low", model_name="de_DE-ramona-low.onnx",lang="de",),  
133 - TtsModel(model_dir="vits-piper-de_DE-thorsten-low", model_name="de_DE-thorsten-low.onnx",lang="de",),  
134 - TtsModel(model_dir="vits-piper-de_DE-thorsten-medium", model_name="de_DE-thorsten-medium.onnx",lang="de",),  
135 - TtsModel(model_dir="vits-piper-de_DE-thorsten-high", model_name="de_DE-thorsten-high.onnx",lang="de",),  
136 - TtsModel(model_dir="vits-piper-de_DE-thorsten_emotional-medium", model_name="de_DE-thorsten_emotional-medium.onnx",lang="de",),  
137 - # French (FR)  
138 - TtsModel(model_dir="vits-piper-fr_FR-upmc-medium", model_name="fr_FR-upmc-medium.onnx",lang="fr",),  
139 - TtsModel(model_dir="vits-piper-fr_FR-siwis-low", model_name="fr_FR-siwis-low.onnx",lang="fr",),  
140 - TtsModel(model_dir="vits-piper-fr_FR-siwis-medium", model_name="fr_FR-siwis-medium.onnx",lang="fr",),  
141 -  
142 - # Spanish (ES)  
143 - TtsModel(model_dir="vits-piper-es_ES-carlfm-x_low", model_name="es_ES-carlfm-x_low.onnx",lang="es",),  
144 - TtsModel(model_dir="vits-piper-es_ES-davefx-medium", model_name="es_ES-davefx-medium.onnx",lang="es",),  
145 - TtsModel(model_dir="vits-piper-es_ES-mls_10246-low", model_name="es_ES-mls_10246-low.onnx",lang="es",),  
146 - TtsModel(model_dir="vits-piper-es_ES-mls_9972-low", model_name="es_ES-mls_9972-low.onnx",lang="es",),  
147 - TtsModel(model_dir="vits-piper-es_ES-sharvard-medium", model_name="es_ES-sharvard-medium.onnx",lang="es",),  
148 -  
149 - # Spanish (MX)  
150 - TtsModel(model_dir="vits-piper-es_MX-ald-medium", model_name="es_MX-ald-medium.onnx",lang="es",),  
151 # fmt: on 214 # fmt: on
152 ] 215 ]
153 216
@@ -162,7 +225,8 @@ def main(): @@ -162,7 +225,8 @@ def main():
162 s = f.read() 225 s = f.read()
163 template = environment.from_string(s) 226 template = environment.from_string(s)
164 d = dict() 227 d = dict()
165 - all_model_list = get_all_models() 228 + # all_model_list = get_all_models()
  229 + all_model_list = get_piper_models()
166 num_models = len(all_model_list) 230 num_models = len(all_model_list)
167 231
168 num_per_runner = num_models // total 232 num_per_runner = num_models // total
@@ -186,6 +186,7 @@ const SherpaOnnxOfflineTtsVitsModelConfig = StructType({ @@ -186,6 +186,7 @@ const SherpaOnnxOfflineTtsVitsModelConfig = StructType({
186 "model" : cstring, 186 "model" : cstring,
187 "lexicon" : cstring, 187 "lexicon" : cstring,
188 "tokens" : cstring, 188 "tokens" : cstring,
  189 + "dataDir" : cstring,
189 "noiseScale" : float, 190 "noiseScale" : float,
190 "noiseScaleW" : float, 191 "noiseScaleW" : float,
191 "lengthScale" : float, 192 "lengthScale" : float,
@@ -201,6 +202,7 @@ const SherpaOnnxOfflineTtsModelConfig = StructType({ @@ -201,6 +202,7 @@ const SherpaOnnxOfflineTtsModelConfig = StructType({
201 const SherpaOnnxOfflineTtsConfig = StructType({ 202 const SherpaOnnxOfflineTtsConfig = StructType({
202 "model" : SherpaOnnxOfflineTtsModelConfig, 203 "model" : SherpaOnnxOfflineTtsModelConfig,
203 "ruleFsts" : cstring, 204 "ruleFsts" : cstring,
  205 + "maxNumSentences" : int32_t,
204 }); 206 });
205 207
206 const SherpaOnnxGeneratedAudio = StructType({ 208 const SherpaOnnxGeneratedAudio = StructType({
@@ -65,6 +65,9 @@ def get_binaries_to_install(): @@ -65,6 +65,9 @@ def get_binaries_to_install():
65 binaries += ["sherpa-onnx-core.dll"] 65 binaries += ["sherpa-onnx-core.dll"]
66 binaries += ["sherpa-onnx-portaudio.dll"] 66 binaries += ["sherpa-onnx-portaudio.dll"]
67 binaries += ["onnxruntime.dll"] 67 binaries += ["onnxruntime.dll"]
  68 + binaries += ["piper_phonemize.dll"]
  69 + binaries += ["espeak-ng.dll"]
  70 + binaries += ["ucd.dll"]
68 binaries += ["kaldi-decoder-core.dll"] 71 binaries += ["kaldi-decoder-core.dll"]
69 binaries += ["sherpa-onnx-fst.lib"] 72 binaries += ["sherpa-onnx-fst.lib"]
70 binaries += ["sherpa-onnx-kaldifst-core.lib"] 73 binaries += ["sherpa-onnx-kaldifst-core.lib"]
@@ -547,6 +547,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( @@ -547,6 +547,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
547 tts_config.model.vits.lexicon = 547 tts_config.model.vits.lexicon =
548 SHERPA_ONNX_OR(config->model.vits.lexicon, ""); 548 SHERPA_ONNX_OR(config->model.vits.lexicon, "");
549 tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, ""); 549 tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, "");
  550 + tts_config.model.vits.data_dir =
  551 + SHERPA_ONNX_OR(config->model.vits.data_dir, "");
550 tts_config.model.vits.noise_scale = 552 tts_config.model.vits.noise_scale =
551 SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667); 553 SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667);
552 tts_config.model.vits.noise_scale_w = 554 tts_config.model.vits.noise_scale_w =
@@ -558,6 +560,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( @@ -558,6 +560,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
558 tts_config.model.debug = config->model.debug; 560 tts_config.model.debug = config->model.debug;
559 tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); 561 tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
560 tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); 562 tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, "");
  563 + tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2);
561 564
562 if (tts_config.model.debug) { 565 if (tts_config.model.debug) {
563 fprintf(stderr, "%s\n", tts_config.ToString().c_str()); 566 fprintf(stderr, "%s\n", tts_config.ToString().c_str());
@@ -607,6 +607,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { @@ -607,6 +607,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig {
607 const char *model; 607 const char *model;
608 const char *lexicon; 608 const char *lexicon;
609 const char *tokens; 609 const char *tokens;
  610 + const char *data_dir;
610 611
611 float noise_scale; 612 float noise_scale;
612 float noise_scale_w; 613 float noise_scale_w;
@@ -623,6 +624,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { @@ -623,6 +624,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig {
623 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { 624 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig {
624 SherpaOnnxOfflineTtsModelConfig model; 625 SherpaOnnxOfflineTtsModelConfig model;
625 const char *rule_fsts; 626 const char *rule_fsts;
  627 + int32_t max_num_sentences;
626 } SherpaOnnxOfflineTtsConfig; 628 } SherpaOnnxOfflineTtsConfig;
627 629
628 SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { 630 SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio {
@@ -74,6 +74,7 @@ set(sources @@ -74,6 +74,7 @@ set(sources
74 packed-sequence.cc 74 packed-sequence.cc
75 pad-sequence.cc 75 pad-sequence.cc
76 parse-options.cc 76 parse-options.cc
  77 + piper-phonemize-lexicon.cc
77 provider.cc 78 provider.cc
78 resample.cc 79 resample.cc
79 session.cc 80 session.cc
@@ -129,8 +129,8 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon, @@ -129,8 +129,8 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon,
129 } 129 }
130 #endif 130 #endif
131 131
132 -std::vector<int64_t> Lexicon::ConvertTextToTokenIds(  
133 - const std::string &text) const { 132 +std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIds(
  133 + const std::string &text, const std::string & /*voice*/ /*= ""*/) const {
134 switch (language_) { 134 switch (language_) {
135 case Language::kEnglish: 135 case Language::kEnglish:
136 return ConvertTextToTokenIdsEnglish(text); 136 return ConvertTextToTokenIdsEnglish(text);
@@ -150,7 +150,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds( @@ -150,7 +150,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
150 return {}; 150 return {};
151 } 151 }
152 152
153 -std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese( 153 +std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsChinese(
154 const std::string &text) const { 154 const std::string &text) const {
155 std::vector<std::string> words; 155 std::vector<std::string> words;
156 if (pattern_) { 156 if (pattern_) {
@@ -245,10 +245,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese( @@ -245,10 +245,10 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
245 ans.push_back(eos); 245 ans.push_back(eos);
246 } 246 }
247 247
248 - return ans; 248 + return {ans};
249 } 249 }
250 250
251 -std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish( 251 +std::vector<std::vector<int64_t>> Lexicon::ConvertTextToTokenIdsEnglish(
252 const std::string &_text) const { 252 const std::string &_text) const {
253 std::string text(_text); 253 std::string text(_text);
254 ToLowerCase(&text); 254 ToLowerCase(&text);
@@ -301,7 +301,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish( @@ -301,7 +301,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
301 ans.push_back(token2id_.at("$")); // eos 301 ans.push_back(token2id_.at("$")); // eos
302 } 302 }
303 303
304 - return ans; 304 + return {ans};
305 } 305 }
306 306
307 void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); } 307 void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); }
@@ -18,11 +18,15 @@ @@ -18,11 +18,15 @@
18 #include "android/asset_manager_jni.h" 18 #include "android/asset_manager_jni.h"
19 #endif 19 #endif
20 20
  21 +#include "sherpa-onnx/csrc/offline-tts-frontend.h"
  22 +
21 namespace sherpa_onnx { 23 namespace sherpa_onnx {
22 24
23 -// TODO(fangjun): Refactor it to an abstract class  
24 -class Lexicon { 25 +class Lexicon : public OfflineTtsFrontend {
25 public: 26 public:
  27 + Lexicon() = default; // for subclasses
  28 + //
  29 + // Note: for models from piper, we won't use this class.
26 Lexicon(const std::string &lexicon, const std::string &tokens, 30 Lexicon(const std::string &lexicon, const std::string &tokens,
27 const std::string &punctuations, const std::string &language, 31 const std::string &punctuations, const std::string &language,
28 bool debug = false, bool is_piper = false); 32 bool debug = false, bool is_piper = false);
@@ -34,28 +38,29 @@ class Lexicon { @@ -34,28 +38,29 @@ class Lexicon {
34 bool is_piper = false); 38 bool is_piper = false);
35 #endif 39 #endif
36 40
37 - std::vector<int64_t> ConvertTextToTokenIds(const std::string &text) const; 41 + std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
  42 + const std::string &text, const std::string &voice = "") const override;
38 43
39 private: 44 private:
40 - std::vector<int64_t> ConvertTextToTokenIdsGerman( 45 + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsGerman(
41 const std::string &text) const { 46 const std::string &text) const {
42 return ConvertTextToTokenIdsEnglish(text); 47 return ConvertTextToTokenIdsEnglish(text);
43 } 48 }
44 49
45 - std::vector<int64_t> ConvertTextToTokenIdsSpanish( 50 + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsSpanish(
46 const std::string &text) const { 51 const std::string &text) const {
47 return ConvertTextToTokenIdsEnglish(text); 52 return ConvertTextToTokenIdsEnglish(text);
48 } 53 }
49 54
50 - std::vector<int64_t> ConvertTextToTokenIdsFrench( 55 + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsFrench(
51 const std::string &text) const { 56 const std::string &text) const {
52 return ConvertTextToTokenIdsEnglish(text); 57 return ConvertTextToTokenIdsEnglish(text);
53 } 58 }
54 59
55 - std::vector<int64_t> ConvertTextToTokenIdsEnglish( 60 + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsEnglish(
56 const std::string &text) const; 61 const std::string &text) const;
57 62
58 - std::vector<int64_t> ConvertTextToTokenIdsChinese( 63 + std::vector<std::vector<int64_t>> ConvertTextToTokenIdsChinese(
59 const std::string &text) const; 64 const std::string &text) const;
60 65
61 void InitLanguage(const std::string &lang); 66 void InitLanguage(const std::string &lang);
@@ -43,6 +43,21 @@ @@ -43,6 +43,21 @@
43 } \ 43 } \
44 } while (0) 44 } while (0)
45 45
  46 +#define SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(dst, src_key, default_value) \
  47 + do { \
  48 + auto value = \
  49 + meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \
  50 + if (!value) { \
  51 + dst = default_value; \
  52 + } else { \
  53 + dst = atoi(value.get()); \
  54 + if (dst < 0) { \
  55 + SHERPA_ONNX_LOGE("Invalid value %d for %s", dst, src_key); \
  56 + exit(-1); \
  57 + } \
  58 + } \
  59 + } while (0)
  60 +
46 // read a vector of integers 61 // read a vector of integers
47 #define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key) \ 62 #define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key) \
48 do { \ 63 do { \
@@ -112,4 +127,20 @@ @@ -112,4 +127,20 @@
112 } \ 127 } \
113 } while (0) 128 } while (0)
114 129
  130 +#define SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(dst, src_key, \
  131 + default_value) \
  132 + do { \
  133 + auto value = \
  134 + meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \
  135 + if (!value) { \
  136 + dst = default_value; \
  137 + } else { \
  138 + dst = value.get(); \
  139 + if (dst.empty()) { \
  140 + SHERPA_ONNX_LOGE("Invalid value for %s\n", src_key); \
  141 + exit(-1); \
  142 + } \
  143 + } \
  144 + } while (0)
  145 +
115 #endif // SHERPA_ONNX_CSRC_MACROS_H_ 146 #endif // SHERPA_ONNX_CSRC_MACROS_H_
  1 +// sherpa-onnx/csrc/offline-tts-frontend.h
  2 +//
  3 +// Copyright (c) 2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
  6 +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
  7 +#include <cstdint>
  8 +#include <string>
  9 +#include <vector>
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +class OfflineTtsFrontend {
  14 + public:
  15 + virtual ~OfflineTtsFrontend() = default;
  16 +
  17 + /** Convert a string to token IDs.
  18 + *
  19 + * @param text The input text.
  20 + * Example 1: "This is the first sample sentence; this is the
  21 + * second one." Example 2: "这是第一句。这是第二句。"
  22 + * @param voice Optional. It is for espeak-ng.
  23 + *
  24 + * @return Return a vector-of-vector of token IDs. Each subvector contains
  25 + * a sentence that can be processed independently.
  26 + * If a frontend does not support splitting the text into sentences,
  27 + * the resulting vector contains only one subvector.
  28 + */
  29 + virtual std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
  30 + const std::string &text, const std::string &voice = "") const = 0;
  31 +};
  32 +
  33 +} // namespace sherpa_onnx
  34 +
  35 +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
@@ -18,9 +18,11 @@ @@ -18,9 +18,11 @@
18 #include "kaldifst/csrc/text-normalizer.h" 18 #include "kaldifst/csrc/text-normalizer.h"
19 #include "sherpa-onnx/csrc/lexicon.h" 19 #include "sherpa-onnx/csrc/lexicon.h"
20 #include "sherpa-onnx/csrc/macros.h" 20 #include "sherpa-onnx/csrc/macros.h"
  21 +#include "sherpa-onnx/csrc/offline-tts-frontend.h"
21 #include "sherpa-onnx/csrc/offline-tts-impl.h" 22 #include "sherpa-onnx/csrc/offline-tts-impl.h"
22 #include "sherpa-onnx/csrc/offline-tts-vits-model.h" 23 #include "sherpa-onnx/csrc/offline-tts-vits-model.h"
23 #include "sherpa-onnx/csrc/onnx-utils.h" 24 #include "sherpa-onnx/csrc/onnx-utils.h"
  25 +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
24 #include "sherpa-onnx/csrc/text-utils.h" 26 #include "sherpa-onnx/csrc/text-utils.h"
25 27
26 namespace sherpa_onnx { 28 namespace sherpa_onnx {
@@ -29,10 +31,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -29,10 +31,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
29 public: 31 public:
30 explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config) 32 explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config)
31 : config_(config), 33 : config_(config),
32 - model_(std::make_unique<OfflineTtsVitsModel>(config.model)),  
33 - lexicon_(config.model.vits.lexicon, config.model.vits.tokens,  
34 - model_->Punctuations(), model_->Language(), config.model.debug,  
35 - model_->IsPiper()) { 34 + model_(std::make_unique<OfflineTtsVitsModel>(config.model)) {
  35 + InitFrontend();
  36 +
36 if (!config.rule_fsts.empty()) { 37 if (!config.rule_fsts.empty()) {
37 std::vector<std::string> files; 38 std::vector<std::string> files;
38 SplitStringToVector(config.rule_fsts, ",", false, &files); 39 SplitStringToVector(config.rule_fsts, ",", false, &files);
@@ -49,10 +50,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -49,10 +50,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
49 #if __ANDROID_API__ >= 9 50 #if __ANDROID_API__ >= 9
50 OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config) 51 OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config)
51 : config_(config), 52 : config_(config),
52 - model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)),  
53 - lexicon_(mgr, config.model.vits.lexicon, config.model.vits.tokens,  
54 - model_->Punctuations(), model_->Language(), config.model.debug,  
55 - model_->IsPiper()) { 53 + model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)) {
  54 + InitFrontend(mgr);
  55 +
56 if (!config.rule_fsts.empty()) { 56 if (!config.rule_fsts.empty()) {
57 std::vector<std::string> files; 57 std::vector<std::string> files;
58 SplitStringToVector(config.rule_fsts, ",", false, &files); 58 SplitStringToVector(config.rule_fsts, ",", false, &files);
@@ -101,20 +101,119 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -101,20 +101,119 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
101 } 101 }
102 } 102 }
103 103
104 - std::vector<int64_t> x = lexicon_.ConvertTextToTokenIds(text);  
105 - if (x.empty()) { 104 + std::vector<std::vector<int64_t>> x =
  105 + frontend_->ConvertTextToTokenIds(text, model_->Voice());
  106 +
  107 + if (x.empty() || (x.size() == 1 && x[0].empty())) {
106 SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str()); 108 SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str());
107 return {}; 109 return {};
108 } 110 }
109 111
110 - if (model_->AddBlank()) {  
111 - std::vector<int64_t> buffer(x.size() * 2 + 1);  
112 - int32_t i = 1;  
113 - for (auto k : x) {  
114 - buffer[i] = k;  
115 - i += 2; 112 + if (model_->AddBlank() && config_.model.vits.data_dir.empty()) {
  113 + for (auto &k : x) {
  114 + k = AddBlank(k);
  115 + }
  116 + }
  117 +
  118 + int32_t x_size = static_cast<int32_t>(x.size());
  119 +
  120 + if (config_.max_num_sentences <= 0 || x_size <= config_.max_num_sentences) {
  121 + return Process(x, sid, speed);
  122 + }
  123 +
  124 + // the input text is too long, we process sentences within it in batches
  125 + // to avoid OOM. Batch size is config_.max_num_sentences
  126 + std::vector<std::vector<int64_t>> batch;
  127 + int32_t batch_size = config_.max_num_sentences;
  128 + batch.reserve(config_.max_num_sentences);
  129 + int32_t num_batches = x_size / batch_size;
  130 +
  131 + if (config_.model.debug) {
  132 + SHERPA_ONNX_LOGE(
  133 + "Text is too long. Split it into %d batches. batch size: %d. Number "
  134 + "of sentences: %d",
  135 + num_batches, batch_size, x_size);
  136 + }
  137 +
  138 + GeneratedAudio ans;
  139 +
  140 + int32_t k = 0;
  141 +
  142 + for (int32_t b = 0; b != num_batches; ++b) {
  143 + batch.clear();
  144 + for (int32_t i = 0; i != batch_size; ++i, ++k) {
  145 + batch.push_back(std::move(x[k]));
116 } 146 }
117 - x = std::move(buffer); 147 +
  148 + auto audio = Process(batch, sid, speed);
  149 + ans.sample_rate = audio.sample_rate;
  150 + ans.samples.insert(ans.samples.end(), audio.samples.begin(),
  151 + audio.samples.end());
  152 + }
  153 +
  154 + batch.clear();
  155 + while (k < x_size) {
  156 + batch.push_back(std::move(x[k]));
  157 + ++k;
  158 + }
  159 +
  160 + if (!batch.empty()) {
  161 + auto audio = Process(batch, sid, speed);
  162 + ans.sample_rate = audio.sample_rate;
  163 + ans.samples.insert(ans.samples.end(), audio.samples.begin(),
  164 + audio.samples.end());
  165 + }
  166 +
  167 + return ans;
  168 + }
  169 +
  170 + private:
  171 + void InitFrontend(AAssetManager *mgr) {
  172 + if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) {
  173 + frontend_ = std::make_unique<PiperPhonemizeLexicon>(
  174 + mgr, config_.model.vits.tokens, config_.model.vits.data_dir);
  175 + } else {
  176 + frontend_ = std::make_unique<Lexicon>(
  177 + mgr, config_.model.vits.lexicon, config_.model.vits.tokens,
  178 + model_->Punctuations(), model_->Language(), config_.model.debug,
  179 + model_->IsPiper());
  180 + }
  181 + }
  182 +
  183 + void InitFrontend() {
  184 + if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) {
  185 + frontend_ = std::make_unique<PiperPhonemizeLexicon>(
  186 + config_.model.vits.tokens, config_.model.vits.data_dir);
  187 + } else {
  188 + frontend_ = std::make_unique<Lexicon>(
  189 + config_.model.vits.lexicon, config_.model.vits.tokens,
  190 + model_->Punctuations(), model_->Language(), config_.model.debug,
  191 + model_->IsPiper());
  192 + }
  193 + }
  194 +
  195 + std::vector<int64_t> AddBlank(const std::vector<int64_t> &x) const {
  196 + // we assume the blank ID is 0
  197 + std::vector<int64_t> buffer(x.size() * 2 + 1);
  198 + int32_t i = 1;
  199 + for (auto k : x) {
  200 + buffer[i] = k;
  201 + i += 2;
  202 + }
  203 + return buffer;
  204 + }
  205 +
  206 + GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
  207 + int32_t sid, float speed) const {
  208 + int32_t num_tokens = 0;
  209 + for (const auto &k : tokens) {
  210 + num_tokens += k.size();
  211 + }
  212 +
  213 + std::vector<int64_t> x;
  214 + x.reserve(num_tokens);
  215 + for (const auto &k : tokens) {
  216 + x.insert(x.end(), k.begin(), k.end());
118 } 217 }
119 218
120 auto memory_info = 219 auto memory_info =
@@ -147,7 +246,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { @@ -147,7 +246,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
147 OfflineTtsConfig config_; 246 OfflineTtsConfig config_;
148 std::unique_ptr<OfflineTtsVitsModel> model_; 247 std::unique_ptr<OfflineTtsVitsModel> model_;
149 std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_; 248 std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_;
150 - Lexicon lexicon_; 249 + std::unique_ptr<OfflineTtsFrontend> frontend_;
151 }; 250 };
152 251
153 } // namespace sherpa_onnx 252 } // namespace sherpa_onnx
@@ -13,6 +13,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { @@ -13,6 +13,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
13 po->Register("vits-model", &model, "Path to VITS model"); 13 po->Register("vits-model", &model, "Path to VITS model");
14 po->Register("vits-lexicon", &lexicon, "Path to lexicon.txt for VITS models"); 14 po->Register("vits-lexicon", &lexicon, "Path to lexicon.txt for VITS models");
15 po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models"); 15 po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models");
  16 + po->Register("vits-data-dir", &data_dir,
  17 + "Path to the directory containing dict for espeak-ng. If it is "
  18 + "given, --vits-lexicon is ignored.");
16 po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); 19 po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models");
17 po->Register("vits-noise-scale-w", &noise_scale_w, 20 po->Register("vits-noise-scale-w", &noise_scale_w,
18 "noise_scale_w for VITS models"); 21 "noise_scale_w for VITS models");
@@ -31,16 +34,6 @@ bool OfflineTtsVitsModelConfig::Validate() const { @@ -31,16 +34,6 @@ bool OfflineTtsVitsModelConfig::Validate() const {
31 return false; 34 return false;
32 } 35 }
33 36
34 - if (lexicon.empty()) {  
35 - SHERPA_ONNX_LOGE("Please provide --vits-lexicon");  
36 - return false;  
37 - }  
38 -  
39 - if (!FileExists(lexicon)) {  
40 - SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str());  
41 - return false;  
42 - }  
43 -  
44 if (tokens.empty()) { 37 if (tokens.empty()) {
45 SHERPA_ONNX_LOGE("Please provide --vits-tokens"); 38 SHERPA_ONNX_LOGE("Please provide --vits-tokens");
46 return false; 39 return false;
@@ -51,6 +44,43 @@ bool OfflineTtsVitsModelConfig::Validate() const { @@ -51,6 +44,43 @@ bool OfflineTtsVitsModelConfig::Validate() const {
51 return false; 44 return false;
52 } 45 }
53 46
  47 + if (data_dir.empty()) {
  48 + if (lexicon.empty()) {
  49 + SHERPA_ONNX_LOGE("Please provide --vits-lexicon");
  50 + return false;
  51 + }
  52 +
  53 + if (!FileExists(lexicon)) {
  54 + SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str());
  55 + return false;
  56 + }
  57 +
  58 + } else {
  59 + if (!FileExists(data_dir + "/phontab")) {
  60 + SHERPA_ONNX_LOGE("%s/phontab does not exist",
  61 + data_dir.c_str());
  62 + return false;
  63 + }
  64 +
  65 + if (!FileExists(data_dir + "/phonindex")) {
  66 + SHERPA_ONNX_LOGE("%s/phonindex does not exist",
  67 + data_dir.c_str());
  68 + return false;
  69 + }
  70 +
  71 + if (!FileExists(data_dir + "/phondata")) {
  72 + SHERPA_ONNX_LOGE("%s/phondata does not exist",
  73 + data_dir.c_str());
  74 + return false;
  75 + }
  76 +
  77 + if (!FileExists(data_dir + "/intonations")) {
  78 + SHERPA_ONNX_LOGE("%s/intonations does not exist",
  79 + data_dir.c_str());
  80 + return false;
  81 + }
  82 + }
  83 +
54 return true; 84 return true;
55 } 85 }
56 86
@@ -61,6 +91,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const { @@ -61,6 +91,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const {
61 os << "model=\"" << model << "\", "; 91 os << "model=\"" << model << "\", ";
62 os << "lexicon=\"" << lexicon << "\", "; 92 os << "lexicon=\"" << lexicon << "\", ";
63 os << "tokens=\"" << tokens << "\", "; 93 os << "tokens=\"" << tokens << "\", ";
  94 + os << "data_dir=\"" << data_dir << "\", ";
64 os << "noise_scale=" << noise_scale << ", "; 95 os << "noise_scale=" << noise_scale << ", ";
65 os << "noise_scale_w=" << noise_scale_w << ", "; 96 os << "noise_scale_w=" << noise_scale_w << ", ";
66 os << "length_scale=" << length_scale << ")"; 97 os << "length_scale=" << length_scale << ")";
@@ -16,6 +16,10 @@ struct OfflineTtsVitsModelConfig { @@ -16,6 +16,10 @@ struct OfflineTtsVitsModelConfig {
16 std::string lexicon; 16 std::string lexicon;
17 std::string tokens; 17 std::string tokens;
18 18
  19 + // If data_dir is given, lexicon is ignored
  20 + // data_dir is for piper-phonemize, which uses espeak-ng
  21 + std::string data_dir;
  22 +
19 float noise_scale = 0.667; 23 float noise_scale = 0.667;
20 float noise_scale_w = 0.8; 24 float noise_scale_w = 0.8;
21 float length_scale = 1; 25 float length_scale = 1;
@@ -28,11 +32,13 @@ struct OfflineTtsVitsModelConfig { @@ -28,11 +32,13 @@ struct OfflineTtsVitsModelConfig {
28 OfflineTtsVitsModelConfig(const std::string &model, 32 OfflineTtsVitsModelConfig(const std::string &model,
29 const std::string &lexicon, 33 const std::string &lexicon,
30 const std::string &tokens, 34 const std::string &tokens,
  35 + const std::string &data_dir,
31 float noise_scale = 0.667, 36 float noise_scale = 0.667,
32 float noise_scale_w = 0.8, float length_scale = 1) 37 float noise_scale_w = 0.8, float length_scale = 1)
33 : model(model), 38 : model(model),
34 lexicon(lexicon), 39 lexicon(lexicon),
35 tokens(tokens), 40 tokens(tokens),
  41 + data_dir(data_dir),
36 noise_scale(noise_scale), 42 noise_scale(noise_scale),
37 noise_scale_w(noise_scale_w), 43 noise_scale_w(noise_scale_w),
38 length_scale(length_scale) {} 44 length_scale(length_scale) {}
@@ -51,6 +51,7 @@ class OfflineTtsVitsModel::Impl { @@ -51,6 +51,7 @@ class OfflineTtsVitsModel::Impl {
51 51
52 std::string Punctuations() const { return punctuations_; } 52 std::string Punctuations() const { return punctuations_; }
53 std::string Language() const { return language_; } 53 std::string Language() const { return language_; }
  54 + std::string Voice() const { return voice_; }
54 bool IsPiper() const { return is_piper_; } 55 bool IsPiper() const { return is_piper_; }
55 int32_t NumSpeakers() const { return num_speakers_; } 56 int32_t NumSpeakers() const { return num_speakers_; }
56 57
@@ -74,10 +75,12 @@ class OfflineTtsVitsModel::Impl { @@ -74,10 +75,12 @@ class OfflineTtsVitsModel::Impl {
74 75
75 Ort::AllocatorWithDefaultOptions allocator; // used in the macro below 76 Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
76 SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate"); 77 SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate");
77 - SHERPA_ONNX_READ_META_DATA(add_blank_, "add_blank"); 78 + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(add_blank_, "add_blank", 0);
78 SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers"); 79 SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers");
79 - SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation"); 80 + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(punctuations_, "punctuation",
  81 + "");
80 SHERPA_ONNX_READ_META_DATA_STR(language_, "language"); 82 SHERPA_ONNX_READ_META_DATA_STR(language_, "language");
  83 + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(voice_, "voice", "");
81 84
82 std::string comment; 85 std::string comment;
83 SHERPA_ONNX_READ_META_DATA_STR(comment, "comment"); 86 SHERPA_ONNX_READ_META_DATA_STR(comment, "comment");
@@ -215,6 +218,7 @@ class OfflineTtsVitsModel::Impl { @@ -215,6 +218,7 @@ class OfflineTtsVitsModel::Impl {
215 int32_t num_speakers_; 218 int32_t num_speakers_;
216 std::string punctuations_; 219 std::string punctuations_;
217 std::string language_; 220 std::string language_;
  221 + std::string voice_;
218 222
219 bool is_piper_ = false; 223 bool is_piper_ = false;
220 }; 224 };
@@ -244,6 +248,7 @@ std::string OfflineTtsVitsModel::Punctuations() const { @@ -244,6 +248,7 @@ std::string OfflineTtsVitsModel::Punctuations() const {
244 } 248 }
245 249
246 std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); } 250 std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); }
  251 +std::string OfflineTtsVitsModel::Voice() const { return impl_->Voice(); }
247 252
248 bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); } 253 bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); }
249 254
@@ -46,7 +46,8 @@ class OfflineTtsVitsModel { @@ -46,7 +46,8 @@ class OfflineTtsVitsModel {
46 bool AddBlank() const; 46 bool AddBlank() const;
47 47
48 std::string Punctuations() const; 48 std::string Punctuations() const;
49 - std::string Language() const; 49 + std::string Language() const; // e.g., Chinese, English, German, etc.
  50 + std::string Voice() const; // e.g., en-us, for espeak-ng
50 bool IsPiper() const; 51 bool IsPiper() const;
51 int32_t NumSpeakers() const; 52 int32_t NumSpeakers() const;
52 53
@@ -21,6 +21,12 @@ void OfflineTtsConfig::Register(ParseOptions *po) { @@ -21,6 +21,12 @@ void OfflineTtsConfig::Register(ParseOptions *po) {
21 "Multiple filenames are separated by a comma and they are " 21 "Multiple filenames are separated by a comma and they are "
22 "applied from left to right. An example value: " 22 "applied from left to right. An example value: "
23 "rule1.fst,rule2,fst,rule3.fst"); 23 "rule1.fst,rule2,fst,rule3.fst");
  24 +
  25 + po->Register(
  26 + "tts-max-num-sentences", &max_num_sentences,
  27 + "Maximum number of sentences that we process at a time. "
  28 + "This is to avoid OOM for very long input text. "
  29 + "If you set it to -1, then we process all sentences in a single batch.");
24 } 30 }
25 31
26 bool OfflineTtsConfig::Validate() const { 32 bool OfflineTtsConfig::Validate() const {
@@ -43,7 +49,8 @@ std::string OfflineTtsConfig::ToString() const { @@ -43,7 +49,8 @@ std::string OfflineTtsConfig::ToString() const {
43 49
44 os << "OfflineTtsConfig("; 50 os << "OfflineTtsConfig(";
45 os << "model=" << model.ToString() << ", "; 51 os << "model=" << model.ToString() << ", ";
46 - os << "rule_fsts=\"" << rule_fsts << "\")"; 52 + os << "rule_fsts=\"" << rule_fsts << "\", ";
  53 + os << "max_num_sentences=" << max_num_sentences << ")";
47 54
48 return os.str(); 55 return os.str();
49 } 56 }
@@ -28,10 +28,17 @@ struct OfflineTtsConfig { @@ -28,10 +28,17 @@ struct OfflineTtsConfig {
28 // If there are multiple rules, they are applied from left to right. 28 // If there are multiple rules, they are applied from left to right.
29 std::string rule_fsts; 29 std::string rule_fsts;
30 30
  31 + // Maximum number of sentences that we process at a time.
  32 + // This is to avoid OOM for very long input text.
  33 + // If you set it to -1, then we process all sentences in a single batch.
  34 + int32_t max_num_sentences = 2;
  35 +
31 OfflineTtsConfig() = default; 36 OfflineTtsConfig() = default;
32 OfflineTtsConfig(const OfflineTtsModelConfig &model, 37 OfflineTtsConfig(const OfflineTtsModelConfig &model,
33 - const std::string &rule_fsts)  
34 - : model(model), rule_fsts(rule_fsts) {} 38 + const std::string &rule_fsts, int32_t max_num_sentences)
  39 + : model(model),
  40 + rule_fsts(rule_fsts),
  41 + max_num_sentences(max_num_sentences) {}
35 42
36 void Register(ParseOptions *po); 43 void Register(ParseOptions *po);
37 bool Validate() const; 44 bool Validate() const;
  1 +// sherpa-onnx/csrc/piper-phonemize-lexicon.cc
  2 +//
  3 +// Copyright (c) 2022-2023 Xiaomi Corporation
  4 +
  5 +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
  6 +
  7 +#include <codecvt>
  8 +#include <fstream>
  9 +#include <locale>
  10 +#include <map>
  11 +#include <mutex> // NOLINT
  12 +#include <sstream>
  13 +#include <string>
  14 +#include <utility>
  15 +#include <vector>
  16 +
  17 +#if __ANDROID_API__ >= 9
  18 +#include <strstream>
  19 +
  20 +#include "android/asset_manager.h"
  21 +#include "android/asset_manager_jni.h"
  22 +#endif
  23 +
  24 +#include "espeak-ng/speak_lib.h"
  25 +#include "phoneme_ids.hpp"
  26 +#include "phonemize.hpp"
  27 +#include "sherpa-onnx/csrc/macros.h"
  28 +#include "sherpa-onnx/csrc/onnx-utils.h"
  29 +
  30 +namespace sherpa_onnx {
  31 +
  32 +static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {
  33 + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
  34 + std::unordered_map<char32_t, int32_t> token2id;
  35 +
  36 + std::string line;
  37 +
  38 + std::string sym;
  39 + std::u32string s;
  40 + int32_t id;
  41 + while (std::getline(is, line)) {
  42 + std::istringstream iss(line);
  43 + iss >> sym;
  44 + if (iss.eof()) {
  45 + id = atoi(sym.c_str());
  46 + sym = " ";
  47 + } else {
  48 + iss >> id;
  49 + }
  50 +
  51 + // eat the trailing \r\n on windows
  52 + iss >> std::ws;
  53 + if (!iss.eof()) {
  54 + SHERPA_ONNX_LOGE("Error when reading tokens: %s", line.c_str());
  55 + exit(-1);
  56 + }
  57 +
  58 + s = conv.from_bytes(sym);
  59 + if (s.size() != 1) {
  60 + SHERPA_ONNX_LOGE("Error when reading tokens at Line %s. size: %d",
  61 + line.c_str(), static_cast<int32_t>(s.size()));
  62 + exit(-1);
  63 + }
  64 + char32_t c = s[0];
  65 +
  66 + if (token2id.count(c)) {
  67 + SHERPA_ONNX_LOGE("Duplicated token %s. Line %s. Existing ID: %d",
  68 + sym.c_str(), line.c_str(), token2id.at(c));
  69 + exit(-1);
  70 + }
  71 +
  72 + token2id.insert({c, id});
  73 + }
  74 +
  75 + return token2id;
  76 +}
  77 +
  78 +// see the function "phonemes_to_ids" from
  79 +// https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb
  80 +static std::vector<int64_t> PhonemesToIds(
  81 + const std::unordered_map<char32_t, int32_t> &token2id,
  82 + const std::vector<piper::Phoneme> &phonemes) {
  83 + // see
  84 + // https://github.com/rhasspy/piper-phonemize/blob/master/src/phoneme_ids.hpp#L17
  85 + int32_t pad = token2id.at(U'_');
  86 + int32_t bos = token2id.at(U'^');
  87 + int32_t eos = token2id.at(U'$');
  88 +
  89 + std::vector<int64_t> ans;
  90 + ans.reserve(2 * phonemes.size() + 2);
  91 +
  92 + ans.push_back(bos);
  93 + for (auto p : phonemes) {
  94 + if (token2id.count(p)) {
  95 + ans.push_back(token2id.at(p));
  96 + ans.push_back(pad);
  97 + } else {
  98 + SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.", p);
  99 + }
  100 + }
  101 + ans.push_back(eos);
  102 +
  103 + return ans;
  104 +}
  105 +
  106 +void InitEspeak(const std::string &data_dir) {
  107 + static std::once_flag init_flag;
  108 + std::call_once(init_flag, [data_dir]() {
  109 + int32_t result =
  110 + espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, data_dir.c_str(), 0);
  111 + if (result != 22050) {
  112 + SHERPA_ONNX_LOGE(
  113 + "Failed to initialize espeak-ng with data dir: %s. Return code is: "
  114 + "%d",
  115 + data_dir.c_str(), result);
  116 + exit(-1);
  117 + }
  118 + });
  119 +}
  120 +
  121 +PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &tokens,
  122 + const std::string &data_dir)
  123 + : data_dir_(data_dir) {
  124 + {
  125 + std::ifstream is(tokens);
  126 + token2id_ = ReadTokens(is);
  127 + }
  128 +
  129 + InitEspeak(data_dir_);
  130 +}
  131 +
  132 +#if __ANDROID_API__ >= 9
  133 +PiperPhonemizeLexicon::PiperPhonemizeLexicon(AAssetManager *mgr,
  134 + const std::string &tokens,
  135 + const std::string &data_dir) {
  136 + {
  137 + auto buf = ReadFile(mgr, tokens);
  138 + std::istrstream is(buf.data(), buf.size());
  139 + token2id_ = ReadTokens(is);
  140 + }
  141 +
  142 + // We should copy the directory of espeak-ng-data from the asset to
  143 + // some internal or external storage and then pass the directory to data_dir.
  144 + InitEspeak(data_dir);
  145 +}
  146 +#endif
  147 +
  148 +std::vector<std::vector<int64_t>> PiperPhonemizeLexicon::ConvertTextToTokenIds(
  149 + const std::string &text, const std::string &voice /*= ""*/) const {
  150 + piper::eSpeakPhonemeConfig config;
  151 +
  152 + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
  153 + // to list available voices
  154 + config.voice = voice; // e.g., voice is en-us
  155 +
  156 + std::vector<std::vector<piper::Phoneme>> phonemes;
  157 + piper::phonemize_eSpeak(text, config, phonemes);
  158 +
  159 + std::vector<std::vector<int64_t>> ans;
  160 +
  161 + std::vector<int64_t> phoneme_ids;
  162 + for (const auto &p : phonemes) {
  163 + phoneme_ids = PhonemesToIds(token2id_, p);
  164 + ans.push_back(std::move(phoneme_ids));
  165 + }
  166 +
  167 + return ans;
  168 +}
  169 +
  170 +} // namespace sherpa_onnx
  1 +// sherpa-onnx/csrc/piper-phonemize-lexicon.h
  2 +//
  3 +// Copyright (c) 2022-2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_
  6 +#define SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_
  7 +
  8 +#include <string>
  9 +#include <unordered_map>
  10 +#include <vector>
  11 +
  12 +#if __ANDROID_API__ >= 9
  13 +#include "android/asset_manager.h"
  14 +#include "android/asset_manager_jni.h"
  15 +#endif
  16 +
  17 +#include "sherpa-onnx/csrc/offline-tts-frontend.h"
  18 +
  19 +namespace sherpa_onnx {
  20 +
  21 +class PiperPhonemizeLexicon : public OfflineTtsFrontend {
  22 + public:
  23 + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir);
  24 +
  25 +#if __ANDROID_API__ >= 9
  26 + PiperPhonemizeLexicon(AAssetManager *mgr, const std::string &tokens,
  27 + const std::string &data_dir);
  28 +#endif
  29 +
  30 + std::vector<std::vector<int64_t>> ConvertTextToTokenIds(
  31 + const std::string &text, const std::string &voice = "") const override;
  32 +
  33 + private:
  34 + std::string data_dir_;
  35 + // map unicode codepoint to an integer ID
  36 + std::unordered_map<char32_t, int32_t> token2id_;
  37 +};
  38 +
  39 +} // namespace sherpa_onnx
  40 +
  41 +#endif // SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_
@@ -48,7 +48,7 @@ TEST(PiperPhonemize, Case1) { @@ -48,7 +48,7 @@ TEST(PiperPhonemize, Case1) {
48 48
49 piper::eSpeakPhonemeConfig config; 49 piper::eSpeakPhonemeConfig config;
50 50
51 - // ./bin/espeak-ng --path ./install/share/espeak-ng-data/ --voices 51 + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
52 // to list available voices 52 // to list available voices
53 config.voice = "en-us"; 53 config.voice = "en-us";
54 54
@@ -61,15 +61,15 @@ TEST(PiperPhonemize, Case1) { @@ -61,15 +61,15 @@ TEST(PiperPhonemize, Case1) {
61 } 61 }
62 std::cout << "\n"; 62 std::cout << "\n";
63 63
64 - std::vector<piper::PhonemeId> phonemeIds;  
65 - std::map<piper::Phoneme, std::size_t> missingPhonemes; 64 + std::vector<piper::PhonemeId> phoneme_ids;
  65 + std::map<piper::Phoneme, std::size_t> missing_phonemes;
66 66
67 { 67 {
68 piper::PhonemeIdConfig config; 68 piper::PhonemeIdConfig config;
69 - phonemes_to_ids(phonemes[0], config, phonemeIds, missingPhonemes); 69 + phonemes_to_ids(phonemes[0], config, phoneme_ids, missing_phonemes);
70 } 70 }
71 71
72 - for (int32_t p : phonemeIds) { 72 + for (int32_t p : phoneme_ids) {
73 std::cout << p << " "; 73 std::cout << p << " ";
74 } 74 }
75 std::cout << "\n"; 75 std::cout << "\n";
@@ -545,6 +545,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { @@ -545,6 +545,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
545 ans.model.vits.tokens = p; 545 ans.model.vits.tokens = p;
546 env->ReleaseStringUTFChars(s, p); 546 env->ReleaseStringUTFChars(s, p);
547 547
  548 + fid = env->GetFieldID(vits_cls, "dataDir", "Ljava/lang/String;");
  549 + s = (jstring)env->GetObjectField(vits, fid);
  550 + p = env->GetStringUTFChars(s, nullptr);
  551 + ans.model.vits.data_dir = p;
  552 + env->ReleaseStringUTFChars(s, p);
  553 +
548 fid = env->GetFieldID(vits_cls, "noiseScale", "F"); 554 fid = env->GetFieldID(vits_cls, "noiseScale", "F");
549 ans.model.vits.noise_scale = env->GetFloatField(vits, fid); 555 ans.model.vits.noise_scale = env->GetFloatField(vits, fid);
550 556
@@ -573,6 +579,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { @@ -573,6 +579,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
573 ans.rule_fsts = p; 579 ans.rule_fsts = p;
574 env->ReleaseStringUTFChars(s, p); 580 env->ReleaseStringUTFChars(s, p);
575 581
  582 + fid = env->GetFieldID(cls, "maxNumSentences", "I");
  583 + ans.max_num_sentences = env->GetIntField(config, fid);
  584 +
576 return ans; 585 return ans;
577 } 586 }
578 587
@@ -589,6 +598,11 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new( @@ -589,6 +598,11 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new(
589 #endif 598 #endif
590 auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config); 599 auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config);
591 SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); 600 SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str());
  601 +
  602 + if (!config.Validate()) {
  603 + SHERPA_ONNX_LOGE("Erros found in config!");
  604 + }
  605 +
592 auto tts = new sherpa_onnx::SherpaOnnxOfflineTts( 606 auto tts = new sherpa_onnx::SherpaOnnxOfflineTts(
593 #if __ANDROID_API__ >= 9 607 #if __ANDROID_API__ >= 9
594 mgr, 608 mgr,
@@ -16,17 +16,20 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { @@ -16,17 +16,20 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) {
16 py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig") 16 py::class_<PyClass>(*m, "OfflineTtsVitsModelConfig")
17 .def(py::init<>()) 17 .def(py::init<>())
18 .def(py::init<const std::string &, const std::string &, 18 .def(py::init<const std::string &, const std::string &,
19 - const std::string &, float, float, float>(), 19 + const std::string &, const std::string, float, float,
  20 + float>(),
20 py::arg("model"), py::arg("lexicon"), py::arg("tokens"), 21 py::arg("model"), py::arg("lexicon"), py::arg("tokens"),
21 - py::arg("noise_scale") = 0.667, py::arg("noise_scale_w") = 0.8,  
22 - py::arg("length_scale") = 1.0) 22 + py::arg("data_dir") = "", py::arg("noise_scale") = 0.667,
  23 + py::arg("noise_scale_w") = 0.8, py::arg("length_scale") = 1.0)
23 .def_readwrite("model", &PyClass::model) 24 .def_readwrite("model", &PyClass::model)
24 .def_readwrite("lexicon", &PyClass::lexicon) 25 .def_readwrite("lexicon", &PyClass::lexicon)
25 .def_readwrite("tokens", &PyClass::tokens) 26 .def_readwrite("tokens", &PyClass::tokens)
  27 + .def_readwrite("data_dir", &PyClass::data_dir)
26 .def_readwrite("noise_scale", &PyClass::noise_scale) 28 .def_readwrite("noise_scale", &PyClass::noise_scale)
27 .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) 29 .def_readwrite("noise_scale_w", &PyClass::noise_scale_w)
28 .def_readwrite("length_scale", &PyClass::length_scale) 30 .def_readwrite("length_scale", &PyClass::length_scale)
29 - .def("__str__", &PyClass::ToString); 31 + .def("__str__", &PyClass::ToString)
  32 + .def("validate", &PyClass::Validate);
30 } 33 }
31 34
32 } // namespace sherpa_onnx 35 } // namespace sherpa_onnx
@@ -30,10 +30,14 @@ static void PybindOfflineTtsConfig(py::module *m) { @@ -30,10 +30,14 @@ static void PybindOfflineTtsConfig(py::module *m) {
30 using PyClass = OfflineTtsConfig; 30 using PyClass = OfflineTtsConfig;
31 py::class_<PyClass>(*m, "OfflineTtsConfig") 31 py::class_<PyClass>(*m, "OfflineTtsConfig")
32 .def(py::init<>()) 32 .def(py::init<>())
33 - .def(py::init<const OfflineTtsModelConfig &, const std::string &>(),  
34 - py::arg("model"), py::arg("rule_fsts") = "") 33 + .def(py::init<const OfflineTtsModelConfig &, const std::string &,
  34 + int32_t>(),
  35 + py::arg("model"), py::arg("rule_fsts") = "",
  36 + py::arg("max_num_sentences") = 2)
35 .def_readwrite("model", &PyClass::model) 37 .def_readwrite("model", &PyClass::model)
36 .def_readwrite("rule_fsts", &PyClass::rule_fsts) 38 .def_readwrite("rule_fsts", &PyClass::rule_fsts)
  39 + .def_readwrite("max_num_sentences", &PyClass::max_num_sentences)
  40 + .def("validate", &PyClass::Validate)
37 .def("__str__", &PyClass::ToString); 41 .def("__str__", &PyClass::ToString);
38 } 42 }
39 43
@@ -578,6 +578,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( @@ -578,6 +578,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
578 model: String, 578 model: String,
579 lexicon: String, 579 lexicon: String,
580 tokens: String, 580 tokens: String,
  581 + dataDir: String = "",
581 noiseScale: Float = 0.667, 582 noiseScale: Float = 0.667,
582 noiseScaleW: Float = 0.8, 583 noiseScaleW: Float = 0.8,
583 lengthScale: Float = 1.0 584 lengthScale: Float = 1.0
@@ -586,6 +587,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( @@ -586,6 +587,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig(
586 model: toCPointer(model), 587 model: toCPointer(model),
587 lexicon: toCPointer(lexicon), 588 lexicon: toCPointer(lexicon),
588 tokens: toCPointer(tokens), 589 tokens: toCPointer(tokens),
  590 + data_dir: toCPointer(dataDir),
589 noise_scale: noiseScale, 591 noise_scale: noiseScale,
590 noise_scale_w: noiseScaleW, 592 noise_scale_w: noiseScaleW,
591 length_scale: lengthScale) 593 length_scale: lengthScale)
@@ -607,11 +609,13 @@ func sherpaOnnxOfflineTtsModelConfig( @@ -607,11 +609,13 @@ func sherpaOnnxOfflineTtsModelConfig(
607 609
608 func sherpaOnnxOfflineTtsConfig( 610 func sherpaOnnxOfflineTtsConfig(
609 model: SherpaOnnxOfflineTtsModelConfig, 611 model: SherpaOnnxOfflineTtsModelConfig,
610 - ruleFsts: String = "" 612 + ruleFsts: String = "",
  613 + maxNumSenetences: Int = 2
611 ) -> SherpaOnnxOfflineTtsConfig { 614 ) -> SherpaOnnxOfflineTtsConfig {
612 return SherpaOnnxOfflineTtsConfig( 615 return SherpaOnnxOfflineTtsConfig(
613 model: model, 616 model: model,
614 - rule_fsts: toCPointer(ruleFsts) 617 + rule_fsts: toCPointer(ruleFsts),
  618 + max_num_sentences: Int32(maxNumSenetences)
615 ) 619 )
616 } 620 }
617 621
@@ -7,17 +7,12 @@ if [ ! -d ../build-swift-macos ]; then @@ -7,17 +7,12 @@ if [ ! -d ../build-swift-macos ]; then
7 exit 1 7 exit 1
8 fi 8 fi
9 9
10 -if [ ! -d ./vits-vctk ]; then  
11 - echo "Please download the pre-trained model for testing."  
12 - echo "You can refer to"  
13 - echo ""  
14 - echo "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers"  
15 - echo ""  
16 - echo "for help" 10 +if [ ! -d ./vits-piper-en_US-amy-low ]; then
  11 + echo "Download a pre-trained model for testing."
17 12
18 - wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2  
19 - tar xvf vits-vctk.tar.bz2  
20 - rm vits-vctk.tar.bz2 13 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  14 + tar xf vits-piper-en_US-amy-low.tar.bz2
  15 + rm vits-piper-en_US-amy-low.tar.bz2
21 fi 16 fi
22 17
23 if [ ! -e ./tts ]; then 18 if [ ! -e ./tts ]; then
1 func run() { 1 func run() {
2 - let model = "./vits-vctk/vits-vctk.onnx"  
3 - let lexicon = "./vits-vctk/lexicon.txt"  
4 - let tokens = "./vits-vctk/tokens.txt" 2 + let model = "./vits-piper-en_US-amy-low/en_US-amy-low.onnx"
  3 + let tokens = "./vits-piper-en_US-amy-low/tokens.txt"
  4 + let dataDir = "./vits-piper-en_US-amy-low/espeak-ng-data"
5 let vits = sherpaOnnxOfflineTtsVitsModelConfig( 5 let vits = sherpaOnnxOfflineTtsVitsModelConfig(
6 model: model, 6 model: model,
7 - lexicon: lexicon,  
8 - tokens: tokens 7 + lexicon: "",
  8 + tokens: tokens,
  9 + dataDir: dataDir
9 ) 10 )
10 let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) 11 let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits)
11 var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig) 12 var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig)
12 13
13 let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig) 14 let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig)
14 15
15 - let text = "How are you doing? Fantastic!" 16 + let text =
  17 + "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”"
16 let sid = 99 18 let sid = 99
17 let speed: Float = 1.0 19 let speed: Float = 1.0
18 20