Committed by GitHub
Add C# and JavaScript (wasm) API for MatchaTTS models (#1682)
Showing 26 changed files with 677 additions and 88 deletions.
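Before the per-file diffs, here is a minimal sketch of how the new MatchaTTS options are meant to be used from C#. It only restates what the diff below configures (`config.Model.Matcha.*`); the model paths are the ones used in the examples, and the `OfflineTts`, `Generate`, and `SaveToWaveFile` calls are assumed to come from the existing sherpa-onnx C# bindings rather than being introduced in this PR.

```csharp
// Sketch only: mirrors the Matcha fields added to OfflineTtsConfig in this PR.
// OfflineTts / Generate / SaveToWaveFile are assumed from the existing C# bindings.
using SherpaOnnx;

class MatchaTtsSketch
{
  static void Main()
  {
    var config = new OfflineTtsConfig();

    // New in this PR: a Matcha acoustic model plus a HiFi-GAN vocoder,
    // configured alongside (instead of) the existing VITS fields.
    config.Model.Matcha.AcousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx";
    config.Model.Matcha.Vocoder = "./hifigan_v2.onnx";
    config.Model.Matcha.Lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
    config.Model.Matcha.Tokens = "./matcha-icefall-zh-baker/tokens.txt";
    config.Model.Matcha.DictDir = "./matcha-icefall-zh-baker/dict";
    config.Model.NumThreads = 1;
    config.Model.Provider = "cpu";

    var tts = new OfflineTts(config);
    var audio = tts.Generate("语音合成测试。", 1.0f, 0); // text, speed, speaker id (assumed signature)
    audio.SaveToWaveFile("./matcha-zh.wav");
  }
}
```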
@@ -2,7 +2,27 @@
 
 cd dotnet-examples/
 
-cd ./offline-speaker-diarization
+cd ./offline-tts
+./run-matcha-zh.sh
+ls -lh *.wav
+./run-matcha-en.sh
+ls -lh *.wav
+./run-aishell3.sh
+ls -lh *.wav
+./run-piper.sh
+ls -lh *.wav
+./run-hf-fanchen.sh
+ls -lh *.wav
+ls -lh
+
+pushd ../..
+
+mkdir tts
+
+cp dotnet-examples/offline-tts/*.wav ./tts
+popd
+
+cd ../offline-speaker-diarization
 ./run.sh
 rm -rfv *.onnx
 rm -fv *.wav
@@ -76,14 +96,4 @@ cd ../spoken-language-identification
 ./run.sh
 rm -rf sherpa-onnx-*
 
-cd ../offline-tts
-./run-aishell3.sh
-./run-piper.sh
-./run-hf-fanchen.sh
-ls -lh
 
-cd ../..
-
-mkdir tts
-
-cp dotnet-examples/offline-tts/*.wav ./tts
@@ -9,6 +9,48 @@ git status
 ls -lh
 ls -lh node_modules
 
+# offline tts
+#
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test-offline-tts-matcha-zh.js
+
+rm -rf matcha-icefall-zh-baker
+rm hifigan_v2.onnx
+
+echo "---"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test-offline-tts-matcha-en.js
+
+rm -rf matcha-icefall-en_US-ljspeech
+rm hifigan_v2.onnx
+
+echo "---"
+
+curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
+tar xf vits-piper-en_US-amy-low.tar.bz2
+node ./test-offline-tts-vits-en.js
+rm -rf vits-piper-en_US-amy-low*
+
+echo "---"
+
+curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
+tar xvf vits-icefall-zh-aishell3.tar.bz2
+node ./test-offline-tts-vits-zh.js
+rm -rf vits-icefall-zh-aishell3*
+
+ls -lh *.wav
+
 echo '-----speaker diarization----------'
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
 tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
@@ -147,15 +189,3 @@ tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
 rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
 node ./test-online-zipformer2-ctc-hlg.js
 rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
-
-# offline tts
-
-curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
-tar xf vits-piper-en_US-amy-low.tar.bz2
-node ./test-offline-tts-en.js
-rm -rf vits-piper-en_US-amy-low*
-
-curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
-tar xvf vits-icefall-zh-aishell3.tar.bz2
-node ./test-offline-tts-zh.js
-rm -rf vits-icefall-zh-aishell3*
@@ -92,6 +92,50 @@ jobs:
         python-version: ["3.8"]
 
     steps:
+      - name: Check space
+        shell: bash
+        run: |
+          df -h
+
+      - name: Free space
+        shell: bash
+        run: |
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+
+      - name: Free more space
+        shell: bash
+        run: |
+          # https://github.com/orgs/community/discussions/25678
+          cd /opt
+          find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'
+
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: false
+          haskell: true
+          large-packages: true
+          docker-images: false
+          swap-storage: true
+
+      - name: Check space
+        shell: bash
+        run: |
+          df -h
+
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
@@ -21,48 +21,56 @@ class OfflineTtsPlayDemo
 {
   class Options
   {
-
     [Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
-    public string? RuleFsts { get; set; }
+    public string RuleFsts { get; set; } = string.Empty;
+
+    [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
+    public string RuleFars { get; set; } = string.Empty;
 
-    [Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
-    public string? DictDir { get; set; }
+    [Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
+    public string DictDir { get; set; } = string.Empty;
 
-    [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
-    public string? DataDir { get; set; }
+    [Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
+    public string DataDir { get; set; } = string.Empty;
 
-    [Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
-    public float LengthScale { get; set; }
+    [Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
+    public float LengthScale { get; set; } = 1;
 
-    [Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
-    public float NoiseScale { get; set; }
+    [Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
+    public float NoiseScale { get; set; } = 0.667F;
 
-    [Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")]
-    public float NoiseScaleW { get; set; }
+    [Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
+    public float NoiseScaleW { get; set; } = 0.8F;
 
-    [Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
-    public string? Lexicon { get; set; }
+    [Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
+    public string Lexicon { get; set; } = string.Empty;
 
-    [Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
-    public string? Tokens { get; set; }
+    [Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
+    public string Tokens { get; set; } = string.Empty;
 
     [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
-    public int MaxNumSentences { get; set; }
+    public int MaxNumSentences { get; set; } = 1;
 
     [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
-    public int Debug { get; set; }
+    public int Debug { get; set; } = 0;
+
+    [Option("vits-model", Required = false, HelpText = "Path to VITS model")]
+    public string Model { get; set; } = string.Empty;
 
-    [Option("vits-model", Required = true, HelpText = "Path to VITS model")]
-    public string? Model { get; set; }
+    [Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
+    public string AcousticModel { get; set; } = "";
+
+    [Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
+    public string Vocoder { get; set; } = "";
 
     [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
-    public int SpeakerId { get; set; }
+    public int SpeakerId { get; set; } = 0;
 
     [Option("text", Required = true, HelpText = "Text to synthesize")]
-    public string? Text { get; set; }
+    public string Text { get; set; } = string.Empty;
 
     [Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
-    public string? OutputFilename { get; set; }
+    public string OutputFilename { get; set; } = "./generated.wav";
   }
 
   static void Main(string[] args)
@@ -78,6 +86,42 @@ class OfflineTtsPlayDemo
   private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
   {
     string usage = @"
+# matcha-icefall-zh-baker
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
+  --tokens=./matcha-icefall-zh-baker/tokens.txt \
+  --dict-dir=./matcha-icefall-zh-baker/dict \
+  --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
+  --debug=1 \
+  --output-filename=./matcha-zh.wav \
+  --text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
+
+# matcha-icefall-en_US-ljspeech
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --debug=1 \
+  --output-filename=./matcha-en.wav \
+  --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
+
 # vits-aishell3
 
 wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
@@ -85,8 +129,8 @@ tar xf vits-zh-aishell3.tar.bz2
 
 dotnet run \
   --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
-  --vits-tokens=./vits-zh-aishell3/tokens.txt \
-  --vits-lexicon=./vits-zh-aishell3/lexicon.txt \
+  --tokens=./vits-zh-aishell3/tokens.txt \
+  --lexicon=./vits-zh-aishell3/lexicon.txt \
   --tts-rule-fsts=./vits-zh-aishell3/rule.fst \
   --sid=66 \
   --debug=1 \
@@ -100,8 +144,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2
 
 dotnet run \
   --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
-  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
-  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
+  --tokens=./vits-piper-en_US-amy-low/tokens.txt \
+  --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
   --debug=1 \
   --output-filename=./amy.wav \
   --text='This is a text to speech application in dotnet with Next Generation Kaldi'
@@ -124,6 +168,7 @@ to download more models.
   private static void Run(Options options)
   {
     var config = new OfflineTtsConfig();
+
     config.Model.Vits.Model = options.Model;
     config.Model.Vits.Lexicon = options.Lexicon;
     config.Model.Vits.Tokens = options.Tokens;
@@ -132,6 +177,16 @@ to download more models.
     config.Model.Vits.NoiseScale = options.NoiseScale;
     config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
     config.Model.Vits.LengthScale = options.LengthScale;
+
+    config.Model.Matcha.AcousticModel = options.AcousticModel;
+    config.Model.Matcha.Vocoder = options.Vocoder;
+    config.Model.Matcha.Lexicon = options.Lexicon;
+    config.Model.Matcha.Tokens = options.Tokens;
+    config.Model.Matcha.DataDir = options.DataDir;
+    config.Model.Matcha.DictDir = options.DictDir;
+    config.Model.Matcha.NoiseScale = options.NoiseScale;
+    config.Model.Matcha.LengthScale = options.LengthScale;
+
     config.Model.NumThreads = 1;
     config.Model.Debug = options.Debug;
     config.Model.Provider = "cpu";
@@ -8,8 +8,8 @@ fi
 
 dotnet run \
   --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
-  --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \
-  --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
+  --tokens=./vits-zh-hf-fanchen-C/tokens.txt \
+  --lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
   --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
   --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \
   --sid=100 \
+#!/usr/bin/env bash
+set -ex
+
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+  rm matcha-icefall-en_US-ljspeech.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --debug=1 \
+  --output-filename=./matcha-en.wav \
+  --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
+#!/usr/bin/env bash
+set -ex
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+  tar xvf matcha-icefall-zh-baker.tar.bz2
+  rm matcha-icefall-zh-baker.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
+  --tokens=./matcha-icefall-zh-baker/tokens.txt \
+  --dict-dir=./matcha-icefall-zh-baker/dict \
+  --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
+  --debug=1 \
+  --output-filename=./matcha-zh.wav \
+  --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
@@ -9,8 +9,8 @@ fi
 
 dotnet run \
   --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
-  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
-  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
+  --tokens=./vits-piper-en_US-amy-low/tokens.txt \
+  --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
   --debug=1 \
   --output-filename=./amy.wav \
   --text="This is a text to speech application in dotnet with Next Generation Kaldi"
@@ -20,25 +20,25 @@ class OfflineTtsDemo
     [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
     public string RuleFars { get; set; } = string.Empty;
 
-    [Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
+    [Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
     public string DictDir { get; set; } = string.Empty;
 
-    [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
+    [Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
     public string DataDir { get; set; } = string.Empty;
 
-    [Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
+    [Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
     public float LengthScale { get; set; } = 1;
 
-    [Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
+    [Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
     public float NoiseScale { get; set; } = 0.667F;
 
     [Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
     public float NoiseScaleW { get; set; } = 0.8F;
 
-    [Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
+    [Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
     public string Lexicon { get; set; } = string.Empty;
 
-    [Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
+    [Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
     public string Tokens { get; set; } = string.Empty;
 
     [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
@@ -47,9 +47,15 @@ class OfflineTtsDemo
     [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
     public int Debug { get; set; } = 0;
 
-    [Option("vits-model", Required = true, HelpText = "Path to VITS model")]
+    [Option("vits-model", Required = false, HelpText = "Path to VITS model")]
     public string Model { get; set; } = string.Empty;
 
+    [Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
+    public string AcousticModel { get; set; } = "";
+
+    [Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
+    public string Vocoder { get; set; } = "";
+
     [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
     public int SpeakerId { get; set; } = 0;
 
@@ -73,6 +79,42 @@ class OfflineTtsDemo
   private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
   {
     var usage = @"
+# matcha-icefall-zh-baker
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
+  --tokens=./matcha-icefall-zh-baker/tokens.txt \
+  --dict-dir=./matcha-icefall-zh-baker/dict \
+  --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
+  --debug=1 \
+  --output-filename=./matcha-zh.wav \
+  --text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
+
+# matcha-icefall-en_US-ljspeech
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --debug=1 \
+  --output-filename=./matcha-en.wav \
+  --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
+
 # vits-aishell3
 
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
@@ -80,8 +122,8 @@ tar xvf vits-icefall-zh-aishell3.tar.bz2
 
 dotnet run \
   --vits-model=./vits-icefall-zh-aishell3/model.onnx \
-  --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
-  --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
+  --tokens=./vits-icefall-zh-aishell3/tokens.txt \
+  --lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
   --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
   --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
   --sid=66 \
@@ -96,8 +138,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2
 
 dotnet run \
   --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
-  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
-  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
+  --tokens=./vits-piper-en_US-amy-low/tokens.txt \
+  --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
   --debug=1 \
   --output-filename=./amy.wav \
   --text='This is a text to speech application in dotnet with Next Generation Kaldi'
@@ -128,6 +170,16 @@ to download more models.
     config.Model.Vits.NoiseScale = options.NoiseScale;
     config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
     config.Model.Vits.LengthScale = options.LengthScale;
+
+    config.Model.Matcha.AcousticModel = options.AcousticModel;
+    config.Model.Matcha.Vocoder = options.Vocoder;
+    config.Model.Matcha.Lexicon = options.Lexicon;
+    config.Model.Matcha.Tokens = options.Tokens;
+    config.Model.Matcha.DataDir = options.DataDir;
+    config.Model.Matcha.DictDir = options.DictDir;
+    config.Model.Matcha.NoiseScale = options.NoiseScale;
+    config.Model.Matcha.LengthScale = options.LengthScale;
+
     config.Model.NumThreads = 1;
     config.Model.Debug = options.Debug;
     config.Model.Provider = "cpu";
@@ -8,8 +8,8 @@ fi
 
 dotnet run \
   --vits-model=./vits-icefall-zh-aishell3/model.onnx \
-  --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \
-  --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
+  --tokens=./vits-icefall-zh-aishell3/tokens.txt \
+  --lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
   --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
   --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
   --sid=66 \
@@ -8,10 +8,10 @@ fi
 
 dotnet run \
   --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
-  --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \
-  --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
+  --tokens=./vits-zh-hf-fanchen-C/tokens.txt \
+  --lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
   --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
-  --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \
+  --dict-dir=./vits-zh-hf-fanchen-C/dict \
   --sid=100 \
   --debug=1 \
   --output-filename=./fanchen-100.wav \
dotnet-examples/offline-tts/run-matcha-en.sh (new file, mode 100755)
+#!/usr/bin/env bash
+set -ex
+
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+  rm matcha-icefall-en_US-ljspeech.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --debug=1 \
+  --output-filename=./matcha-en.wav \
+  --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
dotnet-examples/offline-tts/run-matcha-zh.sh (new file, mode 100755)
+#!/usr/bin/env bash
+set -ex
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+# to download more models
+if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+  tar xvf matcha-icefall-zh-baker.tar.bz2
+  rm matcha-icefall-zh-baker.tar.bz2
+fi
+
+if [ ! -f ./hifigan_v2.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+fi
+
+
+dotnet run \
+  --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
+  --tokens=./matcha-icefall-zh-baker/tokens.txt \
+  --dict-dir=./matcha-icefall-zh-baker/dict \
+  --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
+  --debug=1 \
+  --output-filename=./matcha-zh.wav \
+  --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
@@ -10,8 +10,8 @@ fi
 
 dotnet run \
   --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
-  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
-  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
+  --tokens=./vits-piper-en_US-amy-low/tokens.txt \
+  --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
   --debug=1 \
   --output-filename=./amy.wav \
   --text="This is a text to speech application in dotnet with Next Generation Kaldi"
@@ -42,9 +42,45 @@ node ./test-offline-speaker-diarization.js
 
 In the following, we demonstrate how to run text-to-speech.
 
-## ./test-offline-tts-en.js
+## ./test-offline-tts-matcha-zh.js
 
-[./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use
+[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
+[matcha-icefall-zh-baker](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker)
+for text-to-speech.
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+tar xvf matcha-icefall-zh-baker.tar.bz2
+rm matcha-icefall-zh-baker.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test-offline-tts-matcha-zh.js
+```
+
+## ./test-offline-tts-matcha-en.js
+
+[./test-offline-tts-matcha-en.js](./test-offline-tts-matcha-en.js) shows how to use
+[matcha-icefall-en_US-ljspeech](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)
+for text-to-speech.
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+node ./test-offline-tts-matcha-en.js
+```
+
+## ./test-offline-tts-vits-en.js
+
+[./test-offline-tts-vits-en.js](./test-offline-tts-vits-en.js) shows how to use
 [vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2)
 for text-to-speech.
 
@@ -53,12 +89,12 @@ You can use the following command to run it:
 ```bash
 wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
 tar xvf vits-piper-en_US-amy-low.tar.bz2
-node ./test-offline-tts-en.js
+node ./test-offline-tts-vits-en.js
 ```
 
-## ./test-offline-tts-zh.js
+## ./test-offline-tts-vits-zh.js
 
-[./test-offline-tts-zh.js](./test-offline-tts-zh.js) shows how to use
+[./test-offline-tts-vits-zh.js](./test-offline-tts-vits-zh.js) shows how to use
 a VITS pretrained model
 [aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3)
 for text-to-speech.
@@ -68,7 +104,7 @@ You can use the following command to run it:
 ```bash
 wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
 tar xvf vits-icefall-zh-aishell3.tar.bz2
-node ./test-offline-tts-zh.js
+node ./test-offline-tts-vits-zh.js
 ```
 
 # Speech-to-text
+// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
+
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOfflineTts() {
+  let offlineTtsMatchaModelConfig = {
+    acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx',
+    vocoder: './hifigan_v2.onnx',
+    lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt',
+    tokens: './matcha-icefall-en_US-ljspeech/tokens.txt',
+    dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data',
+
+    noiseScale: 0.667,
+    lengthScale: 1.0,
+  };
+  let offlineTtsModelConfig = {
+    offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
+    numThreads: 1,
+    debug: 1,
+    provider: 'cpu',
+  };
+
+  let offlineTtsConfig = {
+    offlineTtsModelConfig: offlineTtsModelConfig,
+    maxNumSentences: 1,
+  };
+
+  return sherpa_onnx.createOfflineTts(offlineTtsConfig);
+}
+
+const tts = createOfflineTts();
+const speakerId = 0;
+const speed = 1.0;
+const text =
+    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
+
+const audio = tts.generate({text: text, sid: speakerId, speed: speed});
+tts.save('./test-matcha-en.wav', audio);
+console.log('Saved to test-matcha-en.wav successfully.');
+tts.free();
+// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
+
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOfflineTts() {
+  let offlineTtsMatchaModelConfig = {
+    acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',
+    vocoder: './hifigan_v2.onnx',
+    lexicon: './matcha-icefall-zh-baker/lexicon.txt',
+    tokens: './matcha-icefall-zh-baker/tokens.txt',
+    dictDir: './matcha-icefall-zh-baker/dict',
+    noiseScale: 0.667,
+    lengthScale: 1.0,
+  };
+  let offlineTtsModelConfig = {
+    offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
+    numThreads: 1,
+    debug: 1,
+    provider: 'cpu',
+  };
+
+  let offlineTtsConfig = {
+    offlineTtsModelConfig: offlineTtsModelConfig,
+    maxNumSentences: 1,
+    ruleFsts:
+        './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst',
+  };
+
+  return sherpa_onnx.createOfflineTts(offlineTtsConfig);
+}
+
+const tts = createOfflineTts();
+const speakerId = 0;
+const speed = 1.0;
+const text =
+    '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
+
+const audio = tts.generate({text: text, sid: speakerId, speed: speed});
+tts.save('./test-matcha-zh.wav', audio);
+console.log('Saved to test-matcha-zh.wav successfully.');
+tts.free();
@@ -37,7 +37,7 @@ const audio = tts.generate({
   speed: speed
 });
 
-tts.save('./test-en.wav', audio);
-console.log('Saved to test-en.wav successfully.');
+tts.save('./test-vits-en.wav', audio);
+console.log('Saved to test-vits-en.wav successfully.');
 
 tts.free();
@@ -34,6 +34,6 @@ const speakerId = 66;
 const speed = 1.0;
 const audio = tts.generate(
     {text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed});
-tts.save('./test-zh.wav', audio);
-console.log('Saved to test-zh.wav successfully.');
+tts.save('./test-vits-zh.wav', audio);
+console.log('Saved to test-vits-zh.wav successfully.');
 tts.free();
+/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
+
+using System.Runtime.InteropServices;
+
+namespace SherpaOnnx
+{
+  [StructLayout(LayoutKind.Sequential)]
+  public struct OfflineTtsMatchaModelConfig
+  {
+    public OfflineTtsMatchaModelConfig()
+    {
+      AcousticModel = "";
+      Vocoder = "";
+      Lexicon = "";
+      Tokens = "";
+      DataDir = "";
+
+      NoiseScale = 0.667F;
+      LengthScale = 1.0F;
+
+      DictDir = "";
+    }
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string AcousticModel;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Vocoder;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Lexicon;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Tokens;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string DataDir;
+
+    public float NoiseScale;
+    public float LengthScale;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string DictDir;
+  }
+}
@@ -11,6 +11,7 @@ namespace SherpaOnnx
     public OfflineTtsModelConfig()
     {
       Vits = new OfflineTtsVitsModelConfig();
+      Matcha = new OfflineTtsMatchaModelConfig();
       NumThreads = 1;
       Debug = 0;
       Provider = "cpu";
@@ -21,5 +22,7 @@ namespace SherpaOnnx
     public int Debug;
     [MarshalAs(UnmanagedType.LPStr)]
     public string Provider;
+
+    public OfflineTtsMatchaModelConfig Matcha;
   }
-}
+}
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFramework>.net6</TargetFramework>
+    <TargetFramework>net8.0</TargetFramework>
     <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
   </PropertyGroup>
 
@@ -4,7 +4,7 @@
     <PackageReadmeFile>README.md</PackageReadmeFile>
     <OutputType>Library</OutputType>
     <LangVersion>10.0</LangVersion>
-    <TargetFrameworks>net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
+    <TargetFrameworks>net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
     <RuntimeIdentifiers>linux-x64;linux-arm64;osx-x64;osx-arm64;win-x64;win-x86;win-arm64</RuntimeIdentifiers>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <AssemblyName>sherpa-onnx</AssemblyName>
@@ -3,7 +3,7 @@
     <PackageLicenseExpression>Apache-2.0</PackageLicenseExpression>
     <PackageReadmeFile>README.md</PackageReadmeFile>
     <OutputType>Library</OutputType>
-    <TargetFrameworks>net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
+    <TargetFrameworks>net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
     <RuntimeIdentifier>{{ dotnet_rid }}</RuntimeIdentifier>
     <AssemblyName>sherpa-onnx</AssemblyName>
     <Version>{{ version }}</Version>
@@ -8,6 +8,10 @@ function freeConfig(config, Module) {
     freeConfig(config.config, Module)
   }
 
+  if ('config2' in config) {
+    freeConfig(config.config2, Module)
+  }
+
   Module._free(config.ptr);
 }
 
| @@ -66,11 +70,103 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | @@ -66,11 +70,103 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { | ||
| 66 | } | 70 | } |
| 67 | } | 71 | } |
| 68 | 72 | ||
| 73 | +function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) { | ||
| 74 | + const acousticModelLen = Module.lengthBytesUTF8(config.acousticModel) + 1; | ||
| 75 | + const vocoderLen = Module.lengthBytesUTF8(config.vocoder) + 1; | ||
| 76 | + const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; | ||
| 77 | + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; | ||
| 78 | + const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; | ||
| 79 | + const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1; | ||
| 80 | + | ||
| 81 | + const n = acousticModelLen + vocoderLen + lexiconLen + tokensLen + | ||
| 82 | + dataDirLen + dictDirLen; | ||
| 83 | + | ||
| 84 | + const buffer = Module._malloc(n); | ||
| 85 | + | ||
| 86 | + const len = 8 * 4; | ||
| 87 | + const ptr = Module._malloc(len); | ||
| 88 | + | ||
| 89 | + let offset = 0; | ||
| 90 | + Module.stringToUTF8( | ||
| 91 | + config.acousticModel || '', buffer + offset, acousticModelLen); | ||
| 92 | + offset += acousticModelLen; | ||
| 93 | + | ||
| 94 | + Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen); | ||
| 95 | + offset += vocoderLen; | ||
| 96 | + | ||
| 97 | + Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); | ||
| 98 | + offset += lexiconLen; | ||
| 99 | + | ||
| 100 | + Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); | ||
| 101 | + offset += tokensLen; | ||
| 102 | + | ||
| 103 | + Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); | ||
| 104 | + offset += dataDirLen; | ||
| 105 | + | ||
| 106 | + Module.stringToUTF8(config.dictDir || '', buffer + offset, dictDirLen); | ||
| 107 | + offset += dictDirLen; | ||
| 108 | + | ||
| 109 | + offset = 0; | ||
| 110 | + Module.setValue(ptr, buffer + offset, 'i8*'); | ||
| 111 | + offset += acousticModelLen; | ||
| 112 | + | ||
| 113 | + Module.setValue(ptr + 4, buffer + offset, 'i8*'); | ||
| 114 | + offset += vocoderLen; | ||
| 115 | + | ||
| 116 | + Module.setValue(ptr + 8, buffer + offset, 'i8*'); | ||
| 117 | + offset += lexiconLen; | ||
| 118 | + | ||
| 119 | + Module.setValue(ptr + 12, buffer + offset, 'i8*'); | ||
| 120 | + offset += tokensLen; | ||
| 121 | + | ||
| 122 | + Module.setValue(ptr + 16, buffer + offset, 'i8*'); | ||
| 123 | + offset += dataDirLen; | ||
| 124 | + | ||
| 125 | + Module.setValue(ptr + 20, config.noiseScale || 0.667, 'float'); | ||
| 126 | + Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float'); | ||
| 127 | + Module.setValue(ptr + 28, buffer + offset, 'i8*'); | ||
| 128 | + offset += dictDirLen; | ||
| 129 | + | ||
| 130 | + return { | ||
| 131 | + buffer: buffer, ptr: ptr, len: len, | ||
| 132 | + } | ||
| 133 | +} | ||
| 134 | + | ||
| 69 | function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | 135 | function initSherpaOnnxOfflineTtsModelConfig(config, Module) { |
| 136 | + if (!('offlineTtsVitsModelConfig' in config)) { | ||
| 137 | + config.offlineTtsVitsModelConfig = { | ||
| 138 | + model: '', | ||
| 139 | + lexicon: '', | ||
| 140 | + tokens: '', | ||
| 141 | + noiseScale: 0.667, | ||
| 142 | + noiseScaleW: 0.8, | ||
| 143 | + lengthScale: 1.0, | ||
| 144 | + dataDir: '', | ||
| 145 | + dictDir: '', | ||
| 146 | + }; | ||
| 147 | + } | ||
| 148 | + | ||
| 149 | + if (!('offlineTtsMatchaModelConfig' in config)) { | ||
| 150 | + config.offlineTtsMatchaModelConfig = { | ||
| 151 | + acousticModel: '', | ||
| 152 | + vocoder: '', | ||
| 153 | + lexicon: '', | ||
| 154 | + tokens: '', | ||
| 155 | + noiseScale: 0.667, | ||
| 156 | + lengthScale: 1.0, | ||
| 157 | + dataDir: '', | ||
| 158 | + dictDir: '', | ||
| 159 | + }; | ||
| 160 | + } | ||
| 161 | + | ||
| 162 | + | ||
| 70 | const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( | 163 | const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( |
| 71 | config.offlineTtsVitsModelConfig, Module); | 164 | config.offlineTtsVitsModelConfig, Module); |
| 72 | 165 | ||
| 73 | - const len = vitsModelConfig.len + 3 * 4; | 166 | + const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig( |
| 167 | + config.offlineTtsMatchaModelConfig, Module); | ||
| 168 | + | ||
| 169 | + const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4; | ||
| 74 | const ptr = Module._malloc(len); | 170 | const ptr = Module._malloc(len); |
| 75 | 171 | ||
| 76 | let offset = 0; | 172 | let offset = 0; |
| @@ -87,9 +183,14 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | @@ -87,9 +183,14 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { | ||
| 87 | const buffer = Module._malloc(providerLen); | 183 | const buffer = Module._malloc(providerLen); |
| 88 | Module.stringToUTF8(config.provider, buffer, providerLen); | 184 | Module.stringToUTF8(config.provider, buffer, providerLen); |
| 89 | Module.setValue(ptr + offset, buffer, 'i8*'); | 185 | Module.setValue(ptr + offset, buffer, 'i8*'); |
| 186 | + offset += 4; | ||
| 187 | + | ||
| 188 | + Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset); | ||
| 189 | + offset += matchaModelConfig.len; | ||
| 90 | 190 | ||
| 91 | return { | 191 | return { |
| 92 | buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, | 192 | buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, |
| 193 | + config2: matchaModelConfig | ||
| 93 | } | 194 | } |
| 94 | } | 195 | } |
| 95 | 196 | ||
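The 8 * 4 bytes written by initSherpaOnnxOfflineTtsMatchaModelConfig follow a fixed order: five string pointers (acoustic model, vocoder, lexicon, tokens, data dir), two floats (noise scale, length scale), and a final dict-dir pointer, matching the SherpaOnnxOfflineTtsMatchaModelConfig size asserted in the C wrapper below. A debugging sketch that reads the struct back with the same assumed offsets; it relies on getValue and UTF8ToString being exported on Module (standard Emscripten runtime helpers) and is not part of the patch:

// Decode the 32-byte Matcha struct for inspection.
function dumpMatchaModelConfig(ptr, Module) {
  const str = (off) => Module.UTF8ToString(Module.getValue(ptr + off, 'i8*'));
  return {
    acousticModel: str(0),
    vocoder: str(4),
    lexicon: str(8),
    tokens: str(12),
    dataDir: str(16),
    noiseScale: Module.getValue(ptr + 20, 'float'),
    lengthScale: Module.getValue(ptr + 24, 'float'),
    dictDir: str(28),
  };
}

// Example:
//   const c = initSherpaOnnxOfflineTtsMatchaModelConfig(cfg, Module);
//   console.log(dumpMatchaModelConfig(c.ptr, Module));
//   freeConfig(c, Module);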
| @@ -195,12 +296,26 @@ function createOfflineTts(Module, myConfig) { | @@ -195,12 +296,26 @@ function createOfflineTts(Module, myConfig) { | ||
| 195 | noiseScaleW: 0.8, | 296 | noiseScaleW: 0.8, |
| 196 | lengthScale: 1.0, | 297 | lengthScale: 1.0, |
| 197 | }; | 298 | }; |
| 299 | + | ||
| 300 | + const offlineTtsMatchaModelConfig = { | ||
| 301 | + acousticModel: '', | ||
| 302 | + vocoder: '', | ||
| 303 | + lexicon: '', | ||
| 304 | + tokens: '', | ||
| 305 | + dataDir: '', | ||
| 306 | + dictDir: '', | ||
| 307 | + noiseScale: 0.667, | ||
| 308 | + lengthScale: 1.0, | ||
| 309 | + }; | ||
| 310 | + | ||
| 198 | const offlineTtsModelConfig = { | 311 | const offlineTtsModelConfig = { |
| 199 | offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, | 312 | offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, |
| 313 | + offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, | ||
| 200 | numThreads: 1, | 314 | numThreads: 1, |
| 201 | debug: 1, | 315 | debug: 1, |
| 202 | provider: 'cpu', | 316 | provider: 'cpu', |
| 203 | }; | 317 | }; |
| 318 | + | ||
| 204 | let offlineTtsConfig = { | 319 | let offlineTtsConfig = { |
| 205 | offlineTtsModelConfig: offlineTtsModelConfig, | 320 | offlineTtsModelConfig: offlineTtsModelConfig, |
| 206 | ruleFsts: '', | 321 | ruleFsts: '', |
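With the Matcha defaults in place, a caller selects a MatchaTTS voice by filling in the matcha fields and leaving the VITS block empty. A usage sketch, assuming createOfflineTts substitutes myConfig for the defaults shown above when one is passed; the file paths are illustrative (point them at the downloaded archive and vocoder), and the top-level fields besides offlineTtsModelConfig are assumptions based on the C API field names:

// Usage sketch, not part of the patch.
const myConfig = {
  offlineTtsModelConfig: {
    offlineTtsVitsModelConfig: {
      model: '', lexicon: '', tokens: '', dataDir: '', dictDir: '',
      noiseScale: 0.667, noiseScaleW: 0.8, lengthScale: 1.0,
    },
    offlineTtsMatchaModelConfig: {
      acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',  // example path
      vocoder: './hifigan_v2.onnx',                                   // example path
      lexicon: './matcha-icefall-zh-baker/lexicon.txt',
      tokens: './matcha-icefall-zh-baker/tokens.txt',
      dataDir: '',
      dictDir: './matcha-icefall-zh-baker/dict',
      noiseScale: 0.667,
      lengthScale: 1.0,
    },
    numThreads: 1,
    debug: 1,
    provider: 'cpu',
  },
  // The remaining top-level fields mirror the default offlineTtsConfig; the
  // names below are assumed from the C API (rule_fsts, rule_fars,
  // max_num_sentences).
  ruleFsts: '',
  ruleFars: '',
  maxNumSentences: 1,
};

const tts = createOfflineTts(Module, myConfig);
// generate({text, sid, speed}) follows the existing OfflineTts helper in this file.
const audio = tts.generate({text: '你好，这是一个测试。', sid: 0, speed: 1.0});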
| @@ -14,8 +14,10 @@ | @@ -14,8 +14,10 @@ | ||
| 14 | extern "C" { | 14 | extern "C" { |
| 15 | 15 | ||
| 16 | static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); | 16 | static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); |
| 17 | +static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); | ||
| 17 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == | 18 | static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == |
| 18 | - sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, | 19 | + sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + |
| 20 | + sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4, | ||
| 19 | ""); | 21 | ""); |
| 20 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == | 22 | static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == |
| 21 | sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, | 23 | sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, |
| @@ -24,6 +26,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == | @@ -24,6 +26,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == | ||
| 24 | void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | 26 | void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { |
| 25 | auto tts_model_config = &tts_config->model; | 27 | auto tts_model_config = &tts_config->model; |
| 26 | auto vits_model_config = &tts_model_config->vits; | 28 | auto vits_model_config = &tts_model_config->vits; |
| 29 | + auto matcha_model_config = &tts_model_config->matcha; | ||
| 27 | fprintf(stdout, "----------vits model config----------\n"); | 30 | fprintf(stdout, "----------vits model config----------\n"); |
| 28 | fprintf(stdout, "model: %s\n", vits_model_config->model); | 31 | fprintf(stdout, "model: %s\n", vits_model_config->model); |
| 29 | fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); | 32 | fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); |
| @@ -34,6 +37,16 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | @@ -34,6 +37,16 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { | ||
| 34 | fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); | 37 | fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); |
| 35 | fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir); | 38 | fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir); |
| 36 | 39 | ||
| 40 | + fprintf(stdout, "----------matcha model config----------\n"); | ||
| 41 | + fprintf(stdout, "acoustic_model: %s\n", matcha_model_config->acoustic_model); | ||
| 42 | + fprintf(stdout, "vocoder: %s\n", matcha_model_config->vocoder); | ||
| 43 | + fprintf(stdout, "lexicon: %s\n", matcha_model_config->lexicon); | ||
| 44 | + fprintf(stdout, "tokens: %s\n", matcha_model_config->tokens); | ||
| 45 | + fprintf(stdout, "data_dir: %s\n", matcha_model_config->data_dir); | ||
| 46 | + fprintf(stdout, "noise scale: %.3f\n", matcha_model_config->noise_scale); | ||
| 47 | + fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale); | ||
| 48 | + fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir); | ||
| 49 | + | ||
| 37 | fprintf(stdout, "----------tts model config----------\n"); | 50 | fprintf(stdout, "----------tts model config----------\n"); |
| 38 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); | 51 | fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); |
| 39 | fprintf(stdout, "debug: %d\n", tts_model_config->debug); | 52 | fprintf(stdout, "debug: %d\n", tts_model_config->debug); |