Fangjun Kuang
Committed by GitHub

Add C# and JavaScript (wasm) API for MatchaTTS models (#1682)
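For orientation, here is a minimal C# usage sketch of the new MatchaTTS support (assumptions noted): the fields under `config.Model.Matcha` are the ones introduced by this change (see the `OfflineTtsMatchaModelConfig` struct and the demo updates below), while the `OfflineTts`, `Generate`, and `SaveToWaveFile` calls and the `RuleFsts`/`MaxNumSentences` fields follow the existing dotnet examples and are assumed here rather than shown in this diff.

```csharp
// Minimal sketch (not part of the patch): drive the new MatchaTTS options from C#.
// Model paths match the downloads performed by run-matcha-zh.sh below.
using SherpaOnnx;

class MatchaTtsSketch
{
    static void Main()
    {
        var config = new OfflineTtsConfig();

        // Fields added by this change: acoustic model + vocoder pair for Matcha.
        config.Model.Matcha.AcousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx";
        config.Model.Matcha.Vocoder = "./hifigan_v2.onnx";
        config.Model.Matcha.Lexicon = "./matcha-icefall-zh-baker/lexicon.txt";
        config.Model.Matcha.Tokens = "./matcha-icefall-zh-baker/tokens.txt";
        config.Model.Matcha.DictDir = "./matcha-icefall-zh-baker/dict";

        config.Model.NumThreads = 1;
        config.Model.Provider = "cpu";

        // RuleFsts/MaxNumSentences and the calls below mirror the existing demos
        // (assumed API, not introduced by this PR).
        config.RuleFsts = "./matcha-icefall-zh-baker/phone.fst," +
                          "./matcha-icefall-zh-baker/date.fst," +
                          "./matcha-icefall-zh-baker/number.fst";
        config.MaxNumSentences = 1;

        var tts = new OfflineTts(config);
        var audio = tts.Generate("2024年12月31号,拨打110或者18920240511。", 1.0f, 0);
        audio.SaveToWaveFile("./matcha-zh.wav");
    }
}
```

The JavaScript (wasm) side exposes the same fields through `offlineTtsMatchaModelConfig`, as shown in the new node tests further down in this diff.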

@@ -2,7 +2,27 @@ @@ -2,7 +2,27 @@
2 2
3 cd dotnet-examples/ 3 cd dotnet-examples/
4 4
5 -cd ./offline-speaker-diarization 5 +cd ./offline-tts
  6 +./run-matcha-zh.sh
  7 +ls -lh *.wav
  8 +./run-matcha-en.sh
  9 +ls -lh *.wav
  10 +./run-aishell3.sh
  11 +ls -lh *.wav
  12 +./run-piper.sh
  13 +ls -lh *.wav
  14 +./run-hf-fanchen.sh
  15 +ls -lh *.wav
  16 +ls -lh
  17 +
  18 +pushd ../..
  19 +
  20 +mkdir tts
  21 +
  22 +cp dotnet-examples/offline-tts/*.wav ./tts
  23 +popd
  24 +
  25 +cd ../offline-speaker-diarization
6 ./run.sh 26 ./run.sh
7 rm -rfv *.onnx 27 rm -rfv *.onnx
8 rm -fv *.wav 28 rm -fv *.wav
@@ -76,14 +96,4 @@ cd ../spoken-language-identification @@ -76,14 +96,4 @@ cd ../spoken-language-identification
76 ./run.sh 96 ./run.sh
77 rm -rf sherpa-onnx-* 97 rm -rf sherpa-onnx-*
78 98
79 -cd ../offline-tts  
80 -./run-aishell3.sh  
81 -./run-piper.sh  
82 -./run-hf-fanchen.sh  
83 -ls -lh  
84 99
85 -cd ../..  
86 -  
87 -mkdir tts  
88 -  
89 -cp dotnet-examples/offline-tts/*.wav ./tts  
@@ -9,6 +9,48 @@ git status @@ -9,6 +9,48 @@ git status
9 ls -lh 9 ls -lh
10 ls -lh node_modules 10 ls -lh node_modules
11 11
  12 +# offline tts
  13 +#
  14 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  15 +tar xvf matcha-icefall-zh-baker.tar.bz2
  16 +rm matcha-icefall-zh-baker.tar.bz2
  17 +
  18 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  19 +
  20 +node ./test-offline-tts-matcha-zh.js
  21 +
  22 +rm -rf matcha-icefall-zh-baker
  23 +rm hifigan_v2.onnx
  24 +
  25 +echo "---"
  26 +
  27 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  28 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  29 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  30 +
  31 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  32 +
  33 +node ./test-offline-tts-matcha-en.js
  34 +
  35 +rm -rf matcha-icefall-en_US-ljspeech
  36 +rm hifigan_v2.onnx
  37 +
  38 +echo "---"
  39 +
  40 +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  41 +tar xf vits-piper-en_US-amy-low.tar.bz2
  42 +node ./test-offline-tts-vits-en.js
  43 +rm -rf vits-piper-en_US-amy-low*
  44 +
  45 +echo "---"
  46 +
  47 +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
  48 +tar xvf vits-icefall-zh-aishell3.tar.bz2
  49 +node ./test-offline-tts-vits-zh.js
  50 +rm -rf vits-icefall-zh-aishell3*
  51 +
  52 +ls -lh *.wav
  53 +
12 echo '-----speaker diarization----------' 54 echo '-----speaker diarization----------'
13 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 55 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
14 tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 56 tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
@@ -147,15 +189,3 @@ tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 @@ -147,15 +189,3 @@ tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
147 rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 189 rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
148 node ./test-online-zipformer2-ctc-hlg.js 190 node ./test-online-zipformer2-ctc-hlg.js
149 rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 191 rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18
150 -  
151 -# offline tts  
152 -  
153 -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2  
154 -tar xf vits-piper-en_US-amy-low.tar.bz2  
155 -node ./test-offline-tts-en.js  
156 -rm -rf vits-piper-en_US-amy-low*  
157 -  
158 -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2  
159 -tar xvf vits-icefall-zh-aishell3.tar.bz2  
160 -node ./test-offline-tts-zh.js  
161 -rm -rf vits-icefall-zh-aishell3*  
@@ -92,6 +92,50 @@ jobs: @@ -92,6 +92,50 @@ jobs:
92 python-version: ["3.8"] 92 python-version: ["3.8"]
93 93
94 steps: 94 steps:
  95 + - name: Check space
  96 + shell: bash
  97 + run: |
  98 + df -h
  99 +
  100 + - name: Free space
  101 + shell: bash
  102 + run: |
  103 + df -h
  104 + rm -rf /opt/hostedtoolcache
  105 + df -h
  106 +
  107 + - name: Free more space
  108 + shell: bash
  109 + run: |
  110 + # https://github.com/orgs/community/discussions/25678
  111 + cd /opt
  112 + find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'
  113 +
  114 + sudo rm -rf /usr/share/dotnet
  115 + sudo rm -rf "/usr/local/share/boost"
  116 + sudo rm -rf "$AGENT_TOOLSDIRECTORY"
  117 +
  118 + - name: Free Disk Space (Ubuntu)
  119 + uses: jlumbroso/free-disk-space@main
  120 + with:
  121 + # this might remove tools that are actually needed,
  122 + # if set to "true" but frees about 6 GB
  123 + tool-cache: false
  124 +
  125 + # all of these default to true, but feel free to set to
  126 + # "false" if necessary for your workflow
  127 + android: true
  128 + dotnet: false
  129 + haskell: true
  130 + large-packages: true
  131 + docker-images: false
  132 + swap-storage: true
  133 +
  134 + - name: Check space
  135 + shell: bash
  136 + run: |
  137 + df -h
  138 +
95 - uses: actions/checkout@v4 139 - uses: actions/checkout@v4
96 with: 140 with:
97 fetch-depth: 0 141 fetch-depth: 0
@@ -21,48 +21,56 @@ class OfflineTtsPlayDemo @@ -21,48 +21,56 @@ class OfflineTtsPlayDemo
21 { 21 {
22 class Options 22 class Options
23 { 23 {
24 -  
25 [Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")] 24 [Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
26 - public string? RuleFsts { get; set; } 25 + public string RuleFsts { get; set; } = string.Empty;
  26 +
  27 + [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
  28 + public string RuleFars { get; set; } = string.Empty;
27 29
28 - [Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]  
29 - public string? DictDir { get; set; } 30 + [Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
  31 + public string DictDir { get; set; } = string.Empty;
30 32
31 - [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]  
32 - public string? DataDir { get; set; } 33 + [Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
  34 + public string DataDir { get; set; } = string.Empty;
33 35
34 - [Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]  
35 - public float LengthScale { get; set; } 36 + [Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
  37 + public float LengthScale { get; set; } = 1;
36 38
37 - [Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]  
38 - public float NoiseScale { get; set; } 39 + [Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
  40 + public float NoiseScale { get; set; } = 0.667F;
39 41
40 - [Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")]  
41 - public float NoiseScaleW { get; set; } 42 + [Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
  43 + public float NoiseScaleW { get; set; } = 0.8F;
42 44
43 - [Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]  
44 - public string? Lexicon { get; set; } 45 + [Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
  46 + public string Lexicon { get; set; } = string.Empty;
45 47
46 - [Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]  
47 - public string? Tokens { get; set; } 48 + [Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
  49 + public string Tokens { get; set; } = string.Empty;
48 50
49 [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")] 51 [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
50 - public int MaxNumSentences { get; set; } 52 + public int MaxNumSentences { get; set; } = 1;
51 53
52 [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")] 54 [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
53 - public int Debug { get; set; } 55 + public int Debug { get; set; } = 0;
  56 +
  57 + [Option("vits-model", Required = false, HelpText = "Path to VITS model")]
  58 + public string Model { get; set; } = string.Empty;
54 59
55 - [Option("vits-model", Required = true, HelpText = "Path to VITS model")]  
56 - public string? Model { get; set; } 60 + [Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
  61 + public string AcousticModel { get; set; } = "";
  62 +
  63 + [Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
  64 + public string Vocoder { get; set; } = "";
57 65
58 [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")] 66 [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
59 - public int SpeakerId { get; set; } 67 + public int SpeakerId { get; set; } = 0;
60 68
61 [Option("text", Required = true, HelpText = "Text to synthesize")] 69 [Option("text", Required = true, HelpText = "Text to synthesize")]
62 - public string? Text { get; set; } 70 + public string Text { get; set; } = string.Empty;
63 71
64 [Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")] 72 [Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
65 - public string? OutputFilename { get; set; } 73 + public string OutputFilename { get; set; } = "./generated.wav";
66 } 74 }
67 75
68 static void Main(string[] args) 76 static void Main(string[] args)
@@ -78,6 +86,42 @@ class OfflineTtsPlayDemo @@ -78,6 +86,42 @@ class OfflineTtsPlayDemo
78 private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs) 86 private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
79 { 87 {
80 string usage = @" 88 string usage = @"
  89 +# matcha-icefall-zh-baker
  90 +
  91 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  92 +tar xvf matcha-icefall-zh-baker.tar.bz2
  93 +rm matcha-icefall-zh-baker.tar.bz2
  94 +
  95 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  96 +
  97 +dotnet run \
  98 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  99 + --matcha-vocoder=./hifigan_v2.onnx \
  100 + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  101 + --tokens=./matcha-icefall-zh-baker/tokens.txt \
  102 + --dict-dir=./matcha-icefall-zh-baker/dict \
  103 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  104 + --debug=1 \
  105 + --output-filename=./matcha-zh.wav \
  106 +  --text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
  107 +
  108 +# matcha-icefall-en_US-ljspeech
  109 +
  110 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  111 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  112 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  113 +
  114 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  115 +
  116 +dotnet run \
  117 + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  118 + --matcha-vocoder=./hifigan_v2.onnx \
  119 +  --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  120 + --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  121 + --debug=1 \
  122 +  --output-filename=./matcha-en.wav \
  123 + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  124 +
81 # vits-aishell3 125 # vits-aishell3
82 126
83 wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 127 wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
@@ -85,8 +129,8 @@ tar xf vits-zh-aishell3.tar.bz2 @@ -85,8 +129,8 @@ tar xf vits-zh-aishell3.tar.bz2
85 129
86 dotnet run \ 130 dotnet run \
87 --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ 131 --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
88 - --vits-tokens=./vits-zh-aishell3/tokens.txt \  
89 - --vits-lexicon=./vits-zh-aishell3/lexicon.txt \ 132 + --tokens=./vits-zh-aishell3/tokens.txt \
  133 + --lexicon=./vits-zh-aishell3/lexicon.txt \
90 --tts-rule-fsts=./vits-zh-aishell3/rule.fst \ 134 --tts-rule-fsts=./vits-zh-aishell3/rule.fst \
91 --sid=66 \ 135 --sid=66 \
92 --debug=1 \ 136 --debug=1 \
@@ -100,8 +144,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2 @@ -100,8 +144,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2
100 144
101 dotnet run \ 145 dotnet run \
102 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ 146 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
103 - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \  
104 - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ 147 +  --tokens=./vits-piper-en_US-amy-low/tokens.txt \
  148 + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
105 --debug=1 \ 149 --debug=1 \
106 --output-filename=./amy.wav \ 150 --output-filename=./amy.wav \
107 --text='This is a text to speech application in dotnet with Next Generation Kaldi' 151 --text='This is a text to speech application in dotnet with Next Generation Kaldi'
@@ -124,6 +168,7 @@ to download more models. @@ -124,6 +168,7 @@ to download more models.
124 private static void Run(Options options) 168 private static void Run(Options options)
125 { 169 {
126 var config = new OfflineTtsConfig(); 170 var config = new OfflineTtsConfig();
  171 +
127 config.Model.Vits.Model = options.Model; 172 config.Model.Vits.Model = options.Model;
128 config.Model.Vits.Lexicon = options.Lexicon; 173 config.Model.Vits.Lexicon = options.Lexicon;
129 config.Model.Vits.Tokens = options.Tokens; 174 config.Model.Vits.Tokens = options.Tokens;
@@ -132,6 +177,16 @@ to download more models. @@ -132,6 +177,16 @@ to download more models.
132 config.Model.Vits.NoiseScale = options.NoiseScale; 177 config.Model.Vits.NoiseScale = options.NoiseScale;
133 config.Model.Vits.NoiseScaleW = options.NoiseScaleW; 178 config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
134 config.Model.Vits.LengthScale = options.LengthScale; 179 config.Model.Vits.LengthScale = options.LengthScale;
  180 +
  181 + config.Model.Matcha.AcousticModel = options.AcousticModel;
  182 + config.Model.Matcha.Vocoder = options.Vocoder;
  183 + config.Model.Matcha.Lexicon = options.Lexicon;
  184 + config.Model.Matcha.Tokens = options.Tokens;
  185 + config.Model.Matcha.DataDir = options.DataDir;
  186 + config.Model.Matcha.DictDir = options.DictDir;
  187 + config.Model.Matcha.NoiseScale = options.NoiseScale;
  188 + config.Model.Matcha.LengthScale = options.LengthScale;
  189 +
135 config.Model.NumThreads = 1; 190 config.Model.NumThreads = 1;
136 config.Model.Debug = options.Debug; 191 config.Model.Debug = options.Debug;
137 config.Model.Provider = "cpu"; 192 config.Model.Provider = "cpu";
@@ -8,8 +8,8 @@ fi @@ -8,8 +8,8 @@ fi
8 8
9 dotnet run \ 9 dotnet run \
10 --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ 10 --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
11 - --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \  
12 - --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ 11 + --tokens=./vits-zh-hf-fanchen-C/tokens.txt \
  12 + --lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
13 --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \ 13 --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
14 --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ 14 --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \
15 --sid=100 \ 15 --sid=100 \
  1 +#!/usr/bin/env bash
  2 +set -ex
  3 +
  4 +
  5 +# please visit
  6 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  7 +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  8 +# to download more models
  9 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  10 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  11 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  12 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  13 +fi
  14 +
  15 +if [ ! -f ./hifigan_v2.onnx ]; then
  16 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  17 +fi
  18 +
  19 +dotnet run \
  20 + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  21 + --matcha-vocoder=./hifigan_v2.onnx \
  22 + --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  23 + --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  24 + --debug=1 \
  25 + --output-filename=./matcha-en.wav \
  26 + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  1 +#!/usr/bin/env bash
  2 +set -ex
  3 +
  4 +# please visit
  5 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  6 +# to download more models
  7 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  8 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  9 + tar xvf matcha-icefall-zh-baker.tar.bz2
  10 + rm matcha-icefall-zh-baker.tar.bz2
  11 +fi
  12 +
  13 +if [ ! -f ./hifigan_v2.onnx ]; then
  14 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  15 +fi
  16 +
  17 +
  18 +dotnet run \
  19 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  20 + --matcha-vocoder=./hifigan_v2.onnx \
  21 + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  22 + --tokens=./matcha-icefall-zh-baker/tokens.txt \
  23 + --dict-dir=./matcha-icefall-zh-baker/dict \
  24 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  25 + --debug=1 \
  26 + --output-filename=./matcha-zh.wav \
  27 + --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
@@ -9,8 +9,8 @@ fi @@ -9,8 +9,8 @@ fi
9 9
10 dotnet run \ 10 dotnet run \
11 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ 11 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
12 - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \  
13 - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ 12 + --tokens=./vits-piper-en_US-amy-low/tokens.txt \
  13 + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
14 --debug=1 \ 14 --debug=1 \
15 --output-filename=./amy.wav \ 15 --output-filename=./amy.wav \
16 --text="This is a text to speech application in dotnet with Next Generation Kaldi" 16 --text="This is a text to speech application in dotnet with Next Generation Kaldi"
@@ -20,25 +20,25 @@ class OfflineTtsDemo @@ -20,25 +20,25 @@ class OfflineTtsDemo
20 [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")] 20 [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
21 public string RuleFars { get; set; } = string.Empty; 21 public string RuleFars { get; set; } = string.Empty;
22 22
23 - [Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")] 23 + [Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
24 public string DictDir { get; set; } = string.Empty; 24 public string DictDir { get; set; } = string.Empty;
25 25
26 - [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")] 26 + [Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
27 public string DataDir { get; set; } = string.Empty; 27 public string DataDir { get; set; } = string.Empty;
28 28
29 - [Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")] 29 + [Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
30 public float LengthScale { get; set; } = 1; 30 public float LengthScale { get; set; } = 1;
31 31
32 - [Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")] 32 + [Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
33 public float NoiseScale { get; set; } = 0.667F; 33 public float NoiseScale { get; set; } = 0.667F;
34 34
35 [Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")] 35 [Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
36 public float NoiseScaleW { get; set; } = 0.8F; 36 public float NoiseScaleW { get; set; } = 0.8F;
37 37
38 - [Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")] 38 + [Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
39 public string Lexicon { get; set; } = string.Empty; 39 public string Lexicon { get; set; } = string.Empty;
40 40
41 - [Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")] 41 + [Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
42 public string Tokens { get; set; } = string.Empty; 42 public string Tokens { get; set; } = string.Empty;
43 43
44 [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")] 44 [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
@@ -47,9 +47,15 @@ class OfflineTtsDemo @@ -47,9 +47,15 @@ class OfflineTtsDemo
47 [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")] 47 [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
48 public int Debug { get; set; } = 0; 48 public int Debug { get; set; } = 0;
49 49
50 - [Option("vits-model", Required = true, HelpText = "Path to VITS model")] 50 + [Option("vits-model", Required = false, HelpText = "Path to VITS model")]
51 public string Model { get; set; } = string.Empty; 51 public string Model { get; set; } = string.Empty;
52 52
  53 + [Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
  54 + public string AcousticModel { get; set; } = "";
  55 +
  56 + [Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
  57 + public string Vocoder { get; set; } = "";
  58 +
53 [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")] 59 [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
54 public int SpeakerId { get; set; } = 0; 60 public int SpeakerId { get; set; } = 0;
55 61
@@ -73,6 +79,42 @@ class OfflineTtsDemo @@ -73,6 +79,42 @@ class OfflineTtsDemo
73 private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs) 79 private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
74 { 80 {
75 var usage = @" 81 var usage = @"
  82 +# matcha-icefall-zh-baker
  83 +
  84 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  85 +tar xvf matcha-icefall-zh-baker.tar.bz2
  86 +rm matcha-icefall-zh-baker.tar.bz2
  87 +
  88 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  89 +
  90 +dotnet run \
  91 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  92 + --matcha-vocoder=./hifigan_v2.onnx \
  93 + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  94 + --tokens=./matcha-icefall-zh-baker/tokens.txt \
  95 + --dict-dir=./matcha-icefall-zh-baker/dict \
  96 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  97 + --debug=1 \
  98 + --output-filename=./matcha-zh.wav \
  99 +  --text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
  100 +
  101 +# matcha-icefall-en_US-ljspeech
  102 +
  103 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  104 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  105 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  106 +
  107 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  108 +
  109 +dotnet run \
  110 + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  111 + --matcha-vocoder=./hifigan_v2.onnx \
  112 +  --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  113 + --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  114 + --debug=1 \
  115 +  --output-filename=./matcha-en.wav \
  116 + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  117 +
76 # vits-aishell3 118 # vits-aishell3
77 119
78 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 120 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
@@ -80,8 +122,8 @@ tar xvf vits-icefall-zh-aishell3.tar.bz2 @@ -80,8 +122,8 @@ tar xvf vits-icefall-zh-aishell3.tar.bz2
80 122
81 dotnet run \ 123 dotnet run \
82 --vits-model=./vits-icefall-zh-aishell3/model.onnx \ 124 --vits-model=./vits-icefall-zh-aishell3/model.onnx \
83 - --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \  
84 - --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ 125 + --tokens=./vits-icefall-zh-aishell3/tokens.txt \
  126 + --lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
85 --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ 127 --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
86 --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \ 128 --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
87 --sid=66 \ 129 --sid=66 \
@@ -96,8 +138,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2 @@ -96,8 +138,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2
96 138
97 dotnet run \ 139 dotnet run \
98 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ 140 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
99 - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \  
100 - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ 141 + --tokens=./vits-piper-en_US-amy-low/tokens.txt \
  142 + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
101 --debug=1 \ 143 --debug=1 \
102 --output-filename=./amy.wav \ 144 --output-filename=./amy.wav \
103 --text='This is a text to speech application in dotnet with Next Generation Kaldi' 145 --text='This is a text to speech application in dotnet with Next Generation Kaldi'
@@ -128,6 +170,16 @@ to download more models. @@ -128,6 +170,16 @@ to download more models.
128 config.Model.Vits.NoiseScale = options.NoiseScale; 170 config.Model.Vits.NoiseScale = options.NoiseScale;
129 config.Model.Vits.NoiseScaleW = options.NoiseScaleW; 171 config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
130 config.Model.Vits.LengthScale = options.LengthScale; 172 config.Model.Vits.LengthScale = options.LengthScale;
  173 +
  174 + config.Model.Matcha.AcousticModel = options.AcousticModel;
  175 + config.Model.Matcha.Vocoder = options.Vocoder;
  176 + config.Model.Matcha.Lexicon = options.Lexicon;
  177 + config.Model.Matcha.Tokens = options.Tokens;
  178 + config.Model.Matcha.DataDir = options.DataDir;
  179 + config.Model.Matcha.DictDir = options.DictDir;
  180 + config.Model.Matcha.NoiseScale = options.NoiseScale;
  181 + config.Model.Matcha.LengthScale = options.LengthScale;
  182 +
131 config.Model.NumThreads = 1; 183 config.Model.NumThreads = 1;
132 config.Model.Debug = options.Debug; 184 config.Model.Debug = options.Debug;
133 config.Model.Provider = "cpu"; 185 config.Model.Provider = "cpu";
@@ -8,8 +8,8 @@ fi @@ -8,8 +8,8 @@ fi
8 8
9 dotnet run \ 9 dotnet run \
10 --vits-model=./vits-icefall-zh-aishell3/model.onnx \ 10 --vits-model=./vits-icefall-zh-aishell3/model.onnx \
11 - --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \  
12 - --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ 11 + --tokens=./vits-icefall-zh-aishell3/tokens.txt \
  12 + --lexicon=./vits-icefall-zh-aishell3/lexicon.txt \
13 --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ 13 --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \
14 --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \ 14 --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \
15 --sid=66 \ 15 --sid=66 \
@@ -8,10 +8,10 @@ fi @@ -8,10 +8,10 @@ fi
8 8
9 dotnet run \ 9 dotnet run \
10 --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ 10 --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
11 - --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \  
12 - --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ 11 + --tokens=./vits-zh-hf-fanchen-C/tokens.txt \
  12 + --lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
13 --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \ 13 --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
14 - --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ 14 + --dict-dir=./vits-zh-hf-fanchen-C/dict \
15 --sid=100 \ 15 --sid=100 \
16 --debug=1 \ 16 --debug=1 \
17 --output-filename=./fanchen-100.wav \ 17 --output-filename=./fanchen-100.wav \
  1 +#!/usr/bin/env bash
  2 +set -ex
  3 +
  4 +
  5 +# please visit
  6 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  7 +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  8 +# to download more models
  9 +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  10 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  11 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  12 + rm matcha-icefall-en_US-ljspeech.tar.bz2
  13 +fi
  14 +
  15 +if [ ! -f ./hifigan_v2.onnx ]; then
  16 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  17 +fi
  18 +
  19 +dotnet run \
  20 + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  21 + --matcha-vocoder=./hifigan_v2.onnx \
  22 + --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
  23 + --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  24 + --debug=1 \
  25 + --output-filename=./matcha-en.wav \
  26 + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  1 +#!/usr/bin/env bash
  2 +set -ex
  3 +
  4 +# please visit
  5 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  6 +# to download more models
  7 +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  8 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  9 + tar xvf matcha-icefall-zh-baker.tar.bz2
  10 + rm matcha-icefall-zh-baker.tar.bz2
  11 +fi
  12 +
  13 +if [ ! -f ./hifigan_v2.onnx ]; then
  14 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  15 +fi
  16 +
  17 +
  18 +dotnet run \
  19 + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
  20 + --matcha-vocoder=./hifigan_v2.onnx \
  21 + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \
  22 + --tokens=./matcha-icefall-zh-baker/tokens.txt \
  23 + --dict-dir=./matcha-icefall-zh-baker/dict \
  24 + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  25 + --debug=1 \
  26 + --output-filename=./matcha-zh.wav \
  27 + --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
@@ -10,8 +10,8 @@ fi @@ -10,8 +10,8 @@ fi
10 10
11 dotnet run \ 11 dotnet run \
12 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ 12 --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
13 - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \  
14 - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ 13 + --tokens=./vits-piper-en_US-amy-low/tokens.txt \
  14 + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
15 --debug=1 \ 15 --debug=1 \
16 --output-filename=./amy.wav \ 16 --output-filename=./amy.wav \
17 --text="This is a text to speech application in dotnet with Next Generation Kaldi" 17 --text="This is a text to speech application in dotnet with Next Generation Kaldi"
@@ -42,9 +42,45 @@ node ./test-offline-speaker-diarization.js @@ -42,9 +42,45 @@ node ./test-offline-speaker-diarization.js
42 42
43 In the following, we demonstrate how to run text-to-speech. 43 In the following, we demonstrate how to run text-to-speech.
44 44
45 -## ./test-offline-tts-en.js 45 +## ./test-offline-tts-matcha-zh.js
46 46
47 -[./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use 47 +[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
  48 +[matcha-icefall-zh-baker](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker)
  49 +for text-to-speech.
  50 +
  51 +You can use the following command to run it:
  52 +
  53 +```bash
  54 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  55 +tar xvf matcha-icefall-zh-baker.tar.bz2
  56 +rm matcha-icefall-zh-baker.tar.bz2
  57 +
  58 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  59 +
  60 +node ./test-offline-tts-matcha-zh.js
  61 +```
  62 +
  63 +## ./test-offline-tts-matcha-en.js
  64 +
  65 +[./test-offline-tts-matcha-en.js](./test-offline-tts-matcha-en.js) shows how to use
  66 +[matcha-icefall-en_US-ljspeech](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker)
  67 +for text-to-speech.
  68 +
  69 +You can use the following command to run it:
  70 +
  71 +```bash
  72 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  73 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  74 +rm matcha-icefall-en_US-ljspeech.tar.bz2
  75 +
  76 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
  77 +
  78 +node ./test-offline-tts-matcha-en.js
  79 +```
  80 +
  81 +## ./test-offline-tts-vits-en.js
  82 +
  83 +[./test-offline-tts-vits-en.js](./test-offline-tts-vits-en.js) shows how to use
48 [vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2) 84 [vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2)
49 for text-to-speech. 85 for text-to-speech.
50 86
@@ -53,12 +89,12 @@ You can use the following command to run it: @@ -53,12 +89,12 @@ You can use the following command to run it:
53 ```bash 89 ```bash
54 wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 90 wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
55 tar xvf vits-piper-en_US-amy-low.tar.bz2 91 tar xvf vits-piper-en_US-amy-low.tar.bz2
56 -node ./test-offline-tts-en.js 92 +node ./test-offline-tts-vits-en.js
57 ``` 93 ```
58 94
59 -## ./test-offline-tts-zh.js 95 +## ./test-offline-tts-vits-zh.js
60 96
61 -[./test-offline-tts-zh.js](./test-offline-tts-zh.js) shows how to use 97 +[./test-offline-tts-vits-zh.js](./test-offline-tts-vits-zh.js) shows how to use
62 a VITS pretrained model 98 a VITS pretrained model
63 [aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3) 99 [aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3)
64 for text-to-speech. 100 for text-to-speech.
@@ -68,7 +104,7 @@ You can use the following command to run it: @@ -68,7 +104,7 @@ You can use the following command to run it:
68 ```bash 104 ```bash
69 wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 105 wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
70 tar xvf vits-icefall-zh-aishell3.tar.bz2 106 tar xvf vits-icefall-zh-aishell3.tar.bz2
71 -node ./test-offline-tts-zh.js 107 +node ./test-offline-tts-vits-zh.js
72 ``` 108 ```
73 109
74 # Speech-to-text 110 # Speech-to-text
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +const sherpa_onnx = require('sherpa-onnx');
  4 +
  5 +function createOfflineTts() {
  6 + let offlineTtsMatchaModelConfig = {
  7 + acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx',
  8 + vocoder: './hifigan_v2.onnx',
  9 + lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt',
  10 + tokens: './matcha-icefall-en_US-ljspeech/tokens.txt',
  11 + dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data',
  12 +
  13 + noiseScale: 0.667,
  14 + lengthScale: 1.0,
  15 + };
  16 + let offlineTtsModelConfig = {
  17 + offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
  18 + numThreads: 1,
  19 + debug: 1,
  20 + provider: 'cpu',
  21 + };
  22 +
  23 + let offlineTtsConfig = {
  24 + offlineTtsModelConfig: offlineTtsModelConfig,
  25 + maxNumSentences: 1,
  26 + };
  27 +
  28 + return sherpa_onnx.createOfflineTts(offlineTtsConfig);
  29 +}
  30 +
  31 +const tts = createOfflineTts();
  32 +const speakerId = 0;
  33 +const speed = 1.0;
  34 +const text =
  35 + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
  36 +
  37 +const audio = tts.generate({text: text, sid: speakerId, speed: speed});
  38 +tts.save('./test-matcha-en.wav', audio);
  39 +console.log('Saved to test-matcha-en.wav successfully.');
  40 +tts.free();
  1 +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +const sherpa_onnx = require('sherpa-onnx');
  4 +
  5 +function createOfflineTts() {
  6 + let offlineTtsMatchaModelConfig = {
  7 + acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx',
  8 + vocoder: './hifigan_v2.onnx',
  9 + lexicon: './matcha-icefall-zh-baker/lexicon.txt',
  10 + tokens: './matcha-icefall-zh-baker/tokens.txt',
  11 + dictDir: './matcha-icefall-zh-baker/dict',
  12 + noiseScale: 0.667,
  13 + lengthScale: 1.0,
  14 + };
  15 + let offlineTtsModelConfig = {
  16 + offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
  17 + numThreads: 1,
  18 + debug: 1,
  19 + provider: 'cpu',
  20 + };
  21 +
  22 + let offlineTtsConfig = {
  23 + offlineTtsModelConfig: offlineTtsModelConfig,
  24 + maxNumSentences: 1,
  25 + ruleFsts:
  26 + './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst',
  27 + };
  28 +
  29 + return sherpa_onnx.createOfflineTts(offlineTtsConfig);
  30 +}
  31 +
  32 +const tts = createOfflineTts();
  33 +const speakerId = 0;
  34 +const speed = 1.0;
  35 +const text =
  36 + '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'
  37 +
  38 +const audio = tts.generate({text: text, sid: speakerId, speed: speed});
  39 +tts.save('./test-matcha-zh.wav', audio);
  40 +console.log('Saved to test-matcha-zh.wav successfully.');
  41 +tts.free();
@@ -37,7 +37,7 @@ const audio = tts.generate({ @@ -37,7 +37,7 @@ const audio = tts.generate({
37 speed: speed 37 speed: speed
38 }); 38 });
39 39
40 -tts.save('./test-en.wav', audio);  
41 -console.log('Saved to test-en.wav successfully.'); 40 +tts.save('./test-vits-en.wav', audio);
  41 +console.log('Saved to test-vits-en.wav successfully.');
42 42
43 tts.free(); 43 tts.free();
@@ -34,6 +34,6 @@ const speakerId = 66; @@ -34,6 +34,6 @@ const speakerId = 66;
34 const speed = 1.0; 34 const speed = 1.0;
35 const audio = tts.generate( 35 const audio = tts.generate(
36 {text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed}); 36 {text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed});
37 -tts.save('./test-zh.wav', audio);  
38 -console.log('Saved to test-zh.wav successfully.'); 37 +tts.save('./test-vits-zh.wav', audio);
  38 +console.log('Saved to test-vits-zh.wav successfully.');
39 tts.free(); 39 tts.free();
  1 +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
  2 +
  3 +using System.Runtime.InteropServices;
  4 +
  5 +namespace SherpaOnnx
  6 +{
  7 + [StructLayout(LayoutKind.Sequential)]
  8 + public struct OfflineTtsMatchaModelConfig
  9 + {
  10 + public OfflineTtsMatchaModelConfig()
  11 + {
  12 + AcousticModel = "";
  13 + Vocoder = "";
  14 + Lexicon = "";
  15 + Tokens = "";
  16 + DataDir = "";
  17 +
  18 + NoiseScale = 0.667F;
  19 + LengthScale = 1.0F;
  20 +
  21 + DictDir = "";
  22 + }
  23 + [MarshalAs(UnmanagedType.LPStr)]
  24 + public string AcousticModel;
  25 +
  26 + [MarshalAs(UnmanagedType.LPStr)]
  27 + public string Vocoder;
  28 +
  29 + [MarshalAs(UnmanagedType.LPStr)]
  30 + public string Lexicon;
  31 +
  32 + [MarshalAs(UnmanagedType.LPStr)]
  33 + public string Tokens;
  34 +
  35 + [MarshalAs(UnmanagedType.LPStr)]
  36 + public string DataDir;
  37 +
  38 + public float NoiseScale;
  39 + public float LengthScale;
  40 +
  41 + [MarshalAs(UnmanagedType.LPStr)]
  42 + public string DictDir;
  43 + }
  44 +}
@@ -11,6 +11,7 @@ namespace SherpaOnnx @@ -11,6 +11,7 @@ namespace SherpaOnnx
11 public OfflineTtsModelConfig() 11 public OfflineTtsModelConfig()
12 { 12 {
13 Vits = new OfflineTtsVitsModelConfig(); 13 Vits = new OfflineTtsVitsModelConfig();
  14 + Matcha = new OfflineTtsMatchaModelConfig();
14 NumThreads = 1; 15 NumThreads = 1;
15 Debug = 0; 16 Debug = 0;
16 Provider = "cpu"; 17 Provider = "cpu";
@@ -21,5 +22,7 @@ namespace SherpaOnnx @@ -21,5 +22,7 @@ namespace SherpaOnnx
21 public int Debug; 22 public int Debug;
22 [MarshalAs(UnmanagedType.LPStr)] 23 [MarshalAs(UnmanagedType.LPStr)]
23 public string Provider; 24 public string Provider;
  25 +
  26 + public OfflineTtsMatchaModelConfig Matcha;
24 } 27 }
25 } 28 }
1 <Project Sdk="Microsoft.NET.Sdk"> 1 <Project Sdk="Microsoft.NET.Sdk">
2 2
3 <PropertyGroup> 3 <PropertyGroup>
4 - <TargetFramework>.net6</TargetFramework> 4 + <TargetFramework>net8.0</TargetFramework>
5 <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources> 5 <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
6 </PropertyGroup> 6 </PropertyGroup>
7 7
@@ -4,7 +4,7 @@ @@ -4,7 +4,7 @@
4 <PackageReadmeFile>README.md</PackageReadmeFile> 4 <PackageReadmeFile>README.md</PackageReadmeFile>
5 <OutputType>Library</OutputType> 5 <OutputType>Library</OutputType>
6 <LangVersion>10.0</LangVersion> 6 <LangVersion>10.0</LangVersion>
7 - <TargetFrameworks>net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks> 7 + <TargetFrameworks>net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
8 <RuntimeIdentifiers>linux-x64;linux-arm64;osx-x64;osx-arm64;win-x64;win-x86;win-arm64</RuntimeIdentifiers> 8 <RuntimeIdentifiers>linux-x64;linux-arm64;osx-x64;osx-arm64;win-x64;win-x86;win-arm64</RuntimeIdentifiers>
9 <AllowUnsafeBlocks>true</AllowUnsafeBlocks> 9 <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
10 <AssemblyName>sherpa-onnx</AssemblyName> 10 <AssemblyName>sherpa-onnx</AssemblyName>
@@ -3,7 +3,7 @@ @@ -3,7 +3,7 @@
3 <PackageLicenseExpression>Apache-2.0</PackageLicenseExpression> 3 <PackageLicenseExpression>Apache-2.0</PackageLicenseExpression>
4 <PackageReadmeFile>README.md</PackageReadmeFile> 4 <PackageReadmeFile>README.md</PackageReadmeFile>
5 <OutputType>Library</OutputType> 5 <OutputType>Library</OutputType>
6 - <TargetFrameworks>net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks> 6 + <TargetFrameworks>net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0</TargetFrameworks>
7 <RuntimeIdentifier>{{ dotnet_rid }}</RuntimeIdentifier> 7 <RuntimeIdentifier>{{ dotnet_rid }}</RuntimeIdentifier>
8 <AssemblyName>sherpa-onnx</AssemblyName> 8 <AssemblyName>sherpa-onnx</AssemblyName>
9 <Version>{{ version }}</Version> 9 <Version>{{ version }}</Version>
@@ -8,6 +8,10 @@ function freeConfig(config, Module) { @@ -8,6 +8,10 @@ function freeConfig(config, Module) {
8 freeConfig(config.config, Module) 8 freeConfig(config.config, Module)
9 } 9 }
10 10
  11 + if ('config2' in config) {
  12 + freeConfig(config.config2, Module)
  13 + }
  14 +
11 Module._free(config.ptr); 15 Module._free(config.ptr);
12 } 16 }
13 17
@@ -66,11 +70,103 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { @@ -66,11 +70,103 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
66 } 70 }
67 } 71 }
68 72
  73 +function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
  74 + const acousticModelLen = Module.lengthBytesUTF8(config.acousticModel) + 1;
  75 + const vocoderLen = Module.lengthBytesUTF8(config.vocoder) + 1;
  76 + const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1;
  77 + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1;
  78 + const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1;
  79 + const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1;
  80 +
  81 + const n = acousticModelLen + vocoderLen + lexiconLen + tokensLen +
  82 + dataDirLen + dictDirLen;
  83 +
  84 + const buffer = Module._malloc(n);
  85 +
  86 + const len = 8 * 4;
  87 + const ptr = Module._malloc(len);
  88 +
  89 + let offset = 0;
  90 + Module.stringToUTF8(
  91 + config.acousticModel || '', buffer + offset, acousticModelLen);
  92 + offset += acousticModelLen;
  93 +
  94 + Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen);
  95 + offset += vocoderLen;
  96 +
  97 + Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen);
  98 + offset += lexiconLen;
  99 +
  100 + Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen);
  101 + offset += tokensLen;
  102 +
  103 + Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen);
  104 + offset += dataDirLen;
  105 +
  106 + Module.stringToUTF8(config.dictDir || '', buffer + offset, dictDirLen);
  107 + offset += dictDirLen;
  108 +
  109 + offset = 0;
  110 + Module.setValue(ptr, buffer + offset, 'i8*');
  111 + offset += acousticModelLen;
  112 +
  113 + Module.setValue(ptr + 4, buffer + offset, 'i8*');
  114 + offset += vocoderLen;
  115 +
  116 + Module.setValue(ptr + 8, buffer + offset, 'i8*');
  117 + offset += lexiconLen;
  118 +
  119 + Module.setValue(ptr + 12, buffer + offset, 'i8*');
  120 + offset += tokensLen;
  121 +
  122 + Module.setValue(ptr + 16, buffer + offset, 'i8*');
  123 + offset += dataDirLen;
  124 +
  125 + Module.setValue(ptr + 20, config.noiseScale || 0.667, 'float');
  126 + Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float');
  127 + Module.setValue(ptr + 28, buffer + offset, 'i8*');
  128 + offset += dictDirLen;
  129 +
  130 + return {
  131 + buffer: buffer, ptr: ptr, len: len,
  132 + }
  133 +}
  134 +
69 function initSherpaOnnxOfflineTtsModelConfig(config, Module) { 135 function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
  136 + if (!('offlineTtsVitsModelConfig' in config)) {
  137 + config.offlineTtsVitsModelConfig = {
  138 + model: '',
  139 + lexicon: '',
  140 + tokens: '',
  141 + noiseScale: 0.667,
  142 + noiseScaleW: 0.8,
  143 + lengthScale: 1.0,
  144 + dataDir: '',
  145 + dictDir: '',
  146 + };
  147 + }
  148 +
  149 + if (!('offlineTtsMatchaModelConfig' in config)) {
  150 + config.offlineTtsMatchaModelConfig = {
  151 + acousticModel: '',
  152 + vocoder: '',
  153 + lexicon: '',
  154 + tokens: '',
  155 + noiseScale: 0.667,
  156 + lengthScale: 1.0,
  157 + dataDir: '',
  158 + dictDir: '',
  159 + };
  160 + }
  161 +
  162 +
70 const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( 163 const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
71 config.offlineTtsVitsModelConfig, Module); 164 config.offlineTtsVitsModelConfig, Module);
72 165
73 - const len = vitsModelConfig.len + 3 * 4; 166 + const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
  167 + config.offlineTtsMatchaModelConfig, Module);
  168 +
  169 + const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4;
74 const ptr = Module._malloc(len); 170 const ptr = Module._malloc(len);
75 171
76 let offset = 0; 172 let offset = 0;
@@ -87,9 +183,14 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { @@ -87,9 +183,14 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
87 const buffer = Module._malloc(providerLen); 183 const buffer = Module._malloc(providerLen);
88 Module.stringToUTF8(config.provider, buffer, providerLen); 184 Module.stringToUTF8(config.provider, buffer, providerLen);
89 Module.setValue(ptr + offset, buffer, 'i8*'); 185 Module.setValue(ptr + offset, buffer, 'i8*');
  186 + offset += 4;
  187 +
  188 + Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
  189 + offset += matchaModelConfig.len;
90 190
91 return { 191 return {
92 buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, 192 buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
  193 + config2: matchaModelConfig
93 } 194 }
94 } 195 }
95 196
@@ -195,12 +296,26 @@ function createOfflineTts(Module, myConfig) { @@ -195,12 +296,26 @@ function createOfflineTts(Module, myConfig) {
195 noiseScaleW: 0.8, 296 noiseScaleW: 0.8,
196 lengthScale: 1.0, 297 lengthScale: 1.0,
197 }; 298 };
  299 +
  300 + const offlineTtsMatchaModelConfig = {
  301 + acousticModel: '',
  302 + vocoder: '',
  303 + lexicon: '',
  304 + tokens: '',
  305 + dataDir: '',
  306 + dictDir: '',
  307 + noiseScale: 0.667,
  308 + lengthScale: 1.0,
  309 + };
  310 +
198 const offlineTtsModelConfig = { 311 const offlineTtsModelConfig = {
199 offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, 312 offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
  313 + offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
200 numThreads: 1, 314 numThreads: 1,
201 debug: 1, 315 debug: 1,
202 provider: 'cpu', 316 provider: 'cpu',
203 }; 317 };
  318 +
204 let offlineTtsConfig = { 319 let offlineTtsConfig = {
205 offlineTtsModelConfig: offlineTtsModelConfig, 320 offlineTtsModelConfig: offlineTtsModelConfig,
206 ruleFsts: '', 321 ruleFsts: '',
@@ -14,8 +14,10 @@ @@ -14,8 +14,10 @@
14 extern "C" { 14 extern "C" {
15 15
16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); 16 static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
  17 +static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
17 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == 18 static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
18 - sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, 19 + sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
  20 + sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4,
19 ""); 21 "");
20 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == 22 static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
21 sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, 23 sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
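The wasm glue above serializes this struct by hand: five string pointers at byte offsets 0 through 16, the two floats at 20 and 24, and the trailing dict-dir pointer at 28, which is the 8 * 4 bytes checked by the static_assert here. The following is a hypothetical C# helper (not part of this patch) that prints the field order of the managed `OfflineTtsMatchaModelConfig` mirror added earlier in the diff; note that `Marshal.OffsetOf` reports host-ABI offsets, not the wasm32 ones, so only the ordering is meaningful.

```csharp
// Hypothetical sanity check: the managed OfflineTtsMatchaModelConfig must keep
// the field order the wasm/C side assumes:
// AcousticModel, Vocoder, Lexicon, Tokens, DataDir, NoiseScale, LengthScale, DictDir.
using System;
using System.Runtime.InteropServices;
using SherpaOnnx;

class MatchaLayoutCheck
{
    static void Main()
    {
        var t = typeof(OfflineTtsMatchaModelConfig);
        string[] expectedOrder =
        {
            "AcousticModel", "Vocoder", "Lexicon", "Tokens",
            "DataDir", "NoiseScale", "LengthScale", "DictDir",
        };
        foreach (var name in expectedOrder)
        {
            // On wasm32 the corresponding offsets are 0, 4, 8, 12, 16, 20, 24, 28;
            // here we only verify that the declaration order matches.
            Console.WriteLine($"{name,-14} offset = {Marshal.OffsetOf(t, name)}");
        }
    }
}
```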
@@ -24,6 +26,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == @@ -24,6 +26,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
24 void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { 26 void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
25 auto tts_model_config = &tts_config->model; 27 auto tts_model_config = &tts_config->model;
26 auto vits_model_config = &tts_model_config->vits; 28 auto vits_model_config = &tts_model_config->vits;
  29 + auto matcha_model_config = &tts_model_config->matcha;
27 fprintf(stdout, "----------vits model config----------\n"); 30 fprintf(stdout, "----------vits model config----------\n");
28 fprintf(stdout, "model: %s\n", vits_model_config->model); 31 fprintf(stdout, "model: %s\n", vits_model_config->model);
29 fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); 32 fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
@@ -34,6 +37,16 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -34,6 +37,16 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
34 fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); 37 fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
35 fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir); 38 fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
36 39
  40 + fprintf(stdout, "----------matcha model config----------\n");
  41 + fprintf(stdout, "acoustic_model: %s\n", matcha_model_config->acoustic_model);
  42 + fprintf(stdout, "vocoder: %s\n", matcha_model_config->vocoder);
  43 + fprintf(stdout, "lexicon: %s\n", matcha_model_config->lexicon);
  44 + fprintf(stdout, "tokens: %s\n", matcha_model_config->tokens);
  45 + fprintf(stdout, "data_dir: %s\n", matcha_model_config->data_dir);
  46 + fprintf(stdout, "noise scale: %.3f\n", matcha_model_config->noise_scale);
  47 + fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
  48 + fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);
  49 +
37 fprintf(stdout, "----------tts model config----------\n"); 50 fprintf(stdout, "----------tts model config----------\n");
38 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); 51 fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
39 fprintf(stdout, "debug: %d\n", tts_model_config->debug); 52 fprintf(stdout, "debug: %d\n", tts_model_config->debug);