Add Dart API for MatchaTTS models (#1687)

Fangjun Kuang · GitHub
Commit d7c95d33a351d8505335ff7735be68b37cb54f9f d7c95d33 1 parent c6fcd325
.github/scripts/test-dart.sh
.github/workflows/checksum.yaml
dart-api-examples/tts/bin/matcha-en.dart
dart-api-examples/tts/bin/matcha-zh.dart
dart-api-examples/tts/bin/zh.dart → dart-api-examples/tts/bin/vits-zh.dart
dart-api-examples/tts/run-matcha-en.sh
dart-api-examples/tts/run-matcha-zh.sh
dart-api-examples/tts/run-zh.sh → dart-api-examples/tts/run-vits-zh.sh
flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
flutter/sherpa_onnx/lib/src/tts.dart
--- a/.github/scripts/test-dart.sh
查看文件 @d7c95d3
+++ b/.github/scripts/test-dart.sh
查看文件 @d7c95d3
@@ -4,6 +4,31 @@ set -ex
 
 cd dart-api-examples
 
+ pushd tts
+ 
+ echo '----------matcha tts----------'
+ ./run-matcha-zh.sh
+ ./run-matcha-en.sh
+ ls -lh *.wav
+ rm -rf matcha-icefall-*
+ rm *.onnx
+ 
+ echo '----------piper tts----------'
+ ./run-piper.sh
+ rm -rf vits-piper-*
+ 
+ echo '----------coqui tts----------'
+ ./run-coqui.sh
+ rm -rf vits-coqui-*
+ 
+ echo '----------zh tts----------'
+ ./run-vits-zh.sh
+ rm -rf sherpa-onnx-*
+ 
+ ls -lh *.wav
+ 
+ popd # tts
+ 
 pushd speaker-diarization
 echo '----------speaker diarization----------'
 ./run.sh
@@ -106,22 +131,6 @@ rm -rf sherpa-onnx-*
 
 popd # non-streaming-asr
 
- pushd tts
- 
- echo '----------piper tts----------'
- ./run-piper.sh
- rm -rf vits-piper-*
- 
- echo '----------coqui tts----------'
- ./run-coqui.sh
- rm -rf vits-coqui-*
- 
- echo '----------zh tts----------'
- ./run-zh.sh
- rm -rf sherpa-onnx-*
- 
- popd # tts
- 
 pushd streaming-asr
 
 echo '----------streaming zipformer ctc HLG----------'
--- a/.github/workflows/checksum.yaml
查看文件 @d7c95d3
+++ b/.github/workflows/checksum.yaml
查看文件 @d7c95d3
@@ -7,6 +7,7 @@ on:
 
 jobs:
   checksum:
+     if: github.repository_owner == 'k2-fsa'
     runs-on: macos-latest
     strategy:
       matrix:
--- a/dart-api-examples/tts/bin/matcha-en.dart 0 → 100644
查看文件 @d7c95d3
+++ b/dart-api-examples/tts/bin/matcha-en.dart 0 → 100644
查看文件 @d7c95d3
+ // Copyright (c)  2025  Xiaomi Corporation
+ import 'dart:io';
+ 
+ import 'package:args/args.dart';
+ import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+ 
+ import './init.dart';
+ 
+ /// Synthesizes English speech with a MatchaTTS acoustic model plus a vocoder
+ /// and saves the generated audio to a wave file.
+ ///
+ /// Required options: `--acoustic-model`, `--vocoder`, `--tokens`,
+ /// `--output-wav` and `--text`. The usage text is printed and the process
+ /// exits with status 1 when the command line cannot be parsed or a required
+ /// option is missing.
+ void main(List<String> arguments) async {
+   await initSherpaOnnx();
+ 
+   final parser = ArgParser()
+     ..addOption('acoustic-model', help: 'Path to the acoustic model')
+     ..addOption('vocoder', help: 'Path to the vocoder model')
+     ..addOption('tokens', help: 'Path to tokens.txt')
+     ..addOption(
+       'data-dir',
+       help: 'Path to espeak-ng-data directory',
+       defaultsTo: '',
+     )
+     ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
+     ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
+     ..addOption('text', help: 'Text to generate TTS for')
+     ..addOption('output-wav', help: 'Filename to save the generated audio')
+     ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
+     ..addOption(
+       'sid',
+       help: 'Speaker ID to select. Used only for multi-speaker TTS',
+       defaultsTo: '0',
+     );
+ 
+   // parse() throws a FormatException for unknown or malformed options;
+   // report it together with the usage text instead of crashing.
+   final ArgResults res;
+   try {
+     res = parser.parse(arguments);
+   } on FormatException catch (e) {
+     print(e.message);
+     print(parser.usage);
+     exit(1);
+   }
+ 
+   // Only options without a default can be null. 'data-dir' defaults to ''
+   // and therefore never needs a null check.
+   if (res['acoustic-model'] == null ||
+       res['vocoder'] == null ||
+       res['tokens'] == null ||
+       res['output-wav'] == null ||
+       res['text'] == null) {
+     print(parser.usage);
+     exit(1);
+   }
+   final acousticModel = res['acoustic-model'] as String;
+   final vocoder = res['vocoder'] as String;
+   final tokens = res['tokens'] as String;
+   final dataDir = res['data-dir'] as String;
+   final ruleFsts = res['rule-fsts'] as String;
+   final ruleFars = res['rule-fars'] as String;
+   final text = res['text'] as String;
+   final outputWav = res['output-wav'] as String;
+   var speed = double.tryParse(res['speed'] as String) ?? 1.0;
+   final sid = int.tryParse(res['sid'] as String) ?? 0;
+ 
+   // The length scale below is 1 / speed, so a zero, negative or non-finite
+   // speed cannot be used; fall back to the default speed instead.
+   if (!speed.isFinite || speed <= 0) {
+     speed = 1.0;
+   }
+ 
+   final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
+     acousticModel: acousticModel,
+     vocoder: vocoder,
+     tokens: tokens,
+     dataDir: dataDir,
+     lengthScale: 1 / speed,
+   );
+ 
+   final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
+     matcha: matcha,
+     numThreads: 1,
+     debug: true,
+   );
+   // NOTE(review): the spelling `maxNumSenetences` is kept as-is; presumably
+   // it matches the parameter name declared by the package - confirm.
+   final config = sherpa_onnx.OfflineTtsConfig(
+     model: modelConfig,
+     maxNumSenetences: 1,
+     ruleFsts: ruleFsts,
+     ruleFars: ruleFars,
+   );
+ 
+   final tts = sherpa_onnx.OfflineTts(config);
+   final audio = tts.generate(text: text, sid: sid, speed: speed);
+   tts.free();
+ 
+   sherpa_onnx.writeWave(
+     filename: outputWav,
+     samples: audio.samples,
+     sampleRate: audio.sampleRate,
+   );
+   print('Saved to $outputWav');
+ }
--- a/dart-api-examples/tts/bin/matcha-zh.dart 0 → 100644
查看文件 @d7c95d3
+++ b/dart-api-examples/tts/bin/matcha-zh.dart 0 → 100644
查看文件 @d7c95d3
+ // Copyright (c)  2025  Xiaomi Corporation
+ import 'dart:io';
+ 
+ import 'package:args/args.dart';
+ import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
+ 
+ import './init.dart';
+ 
+ /// Synthesizes Chinese speech with a MatchaTTS acoustic model plus a vocoder
+ /// and saves the generated audio to a wave file.
+ ///
+ /// Required options: `--acoustic-model`, `--vocoder`, `--lexicon`,
+ /// `--tokens`, `--output-wav` and `--text`. The usage text is printed and the
+ /// process exits with status 1 when the command line cannot be parsed or a
+ /// required option is missing.
+ void main(List<String> arguments) async {
+   await initSherpaOnnx();
+ 
+   final parser = ArgParser()
+     ..addOption('acoustic-model', help: 'Path to the acoustic model')
+     ..addOption('vocoder', help: 'Path to the vocoder model')
+     ..addOption('tokens', help: 'Path to tokens.txt')
+     ..addOption('lexicon', help: 'Path to lexicon.txt')
+     ..addOption(
+       'dict-dir',
+       help: 'Path to jieba dict directory',
+       defaultsTo: '',
+     )
+     ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
+     ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
+     ..addOption('text', help: 'Text to generate TTS for')
+     ..addOption('output-wav', help: 'Filename to save the generated audio')
+     ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
+     ..addOption(
+       'sid',
+       help: 'Speaker ID to select. Used only for multi-speaker TTS',
+       defaultsTo: '0',
+     );
+ 
+   // parse() throws a FormatException for unknown or malformed options;
+   // report it together with the usage text instead of crashing.
+   final ArgResults res;
+   try {
+     res = parser.parse(arguments);
+   } on FormatException catch (e) {
+     print(e.message);
+     print(parser.usage);
+     exit(1);
+   }
+ 
+   // Only options without a default can be null. 'dict-dir' defaults to ''
+   // and therefore never needs a null check.
+   if (res['acoustic-model'] == null ||
+       res['vocoder'] == null ||
+       res['lexicon'] == null ||
+       res['tokens'] == null ||
+       res['output-wav'] == null ||
+       res['text'] == null) {
+     print(parser.usage);
+     exit(1);
+   }
+   final acousticModel = res['acoustic-model'] as String;
+   final vocoder = res['vocoder'] as String;
+   final lexicon = res['lexicon'] as String;
+   final tokens = res['tokens'] as String;
+   final dictDir = res['dict-dir'] as String;
+   final ruleFsts = res['rule-fsts'] as String;
+   final ruleFars = res['rule-fars'] as String;
+   final text = res['text'] as String;
+   final outputWav = res['output-wav'] as String;
+   var speed = double.tryParse(res['speed'] as String) ?? 1.0;
+   final sid = int.tryParse(res['sid'] as String) ?? 0;
+ 
+   // The length scale below is 1 / speed, so a zero, negative or non-finite
+   // speed cannot be used; fall back to the default speed instead.
+   if (!speed.isFinite || speed <= 0) {
+     speed = 1.0;
+   }
+ 
+   final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
+     acousticModel: acousticModel,
+     vocoder: vocoder,
+     lexicon: lexicon,
+     tokens: tokens,
+     dictDir: dictDir,
+     lengthScale: 1 / speed,
+   );
+ 
+   final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
+     matcha: matcha,
+     numThreads: 1,
+     debug: true,
+   );
+   // NOTE(review): the spelling `maxNumSenetences` is kept as-is; presumably
+   // it matches the parameter name declared by the package - confirm.
+   final config = sherpa_onnx.OfflineTtsConfig(
+     model: modelConfig,
+     maxNumSenetences: 1,
+     ruleFsts: ruleFsts,
+     ruleFars: ruleFars,
+   );
+ 
+   final tts = sherpa_onnx.OfflineTts(config);
+   final audio = tts.generate(text: text, sid: sid, speed: speed);
+   tts.free();
+ 
+   sherpa_onnx.writeWave(
+     filename: outputWav,
+     samples: audio.samples,
+     sampleRate: audio.sampleRate,
+   );
+   print('Saved to $outputWav');
+ }
--- a/dart-api-examples/tts/bin/zh.dart → dart-api-examples/tts/bin/vits-zh.dart
查看文件 @d7c95d3
+++ b/dart-api-examples/tts/bin/zh.dart → dart-api-examples/tts/bin/vits-zh.dart
查看文件 @d7c95d3
--- a/dart-api-examples/tts/run-matcha-en.sh 0 → 100755
查看文件 @d7c95d3
+++ b/dart-api-examples/tts/run-matcha-en.sh 0 → 100755
查看文件 @d7c95d3
+ #!/usr/bin/env bash
+ #
+ # Downloads the English MatchaTTS acoustic model and the HiFi-GAN vocoder
+ # (only if they are not already present) and runs bin/matcha-en.dart to
+ # generate matcha-en-1.wav.
+ 
+ set -ex
+ 
+ dart pub get
+ 
+ # please visit
+ # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+ # to download more models
+ if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+   tar xf matcha-icefall-en_US-ljspeech.tar.bz2
+   rm matcha-icefall-en_US-ljspeech.tar.bz2
+ fi
+ 
+ # The vocoder is distributed separately from the acoustic model.
+ if [ ! -f ./hifigan_v2.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+ fi
+ 
+ # The last argument must not end with a backslash; otherwise the command
+ # would silently absorb whatever line follows it.
+ dart run \
+   ./bin/matcha-en.dart \
+   --acoustic-model ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+   --vocoder ./hifigan_v2.onnx \
+   --tokens ./matcha-icefall-en_US-ljspeech/tokens.txt \
+   --data-dir ./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+   --sid 0 \
+   --speed 1.0 \
+   --output-wav matcha-en-1.wav \
+   --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
+ 
+ ls -lh *.wav
--- a/dart-api-examples/tts/run-matcha-zh.sh 0 → 100755
查看文件 @d7c95d3
+++ b/dart-api-examples/tts/run-matcha-zh.sh 0 → 100755
查看文件 @d7c95d3
+ #!/usr/bin/env bash
+ #
+ # Downloads the Chinese MatchaTTS acoustic model and the HiFi-GAN vocoder
+ # (only if they are not already present) and runs bin/matcha-zh.dart twice:
+ # once with text-normalization rule FSTs (matcha-zh-1.wav) and once without
+ # them (matcha-zh-2.wav).
+ 
+ set -ex
+ 
+ dart pub get
+ 
+ # please visit
+ # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+ # to download more models
+ if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
+   tar xvf matcha-icefall-zh-baker.tar.bz2
+   rm matcha-icefall-zh-baker.tar.bz2
+ fi
+ 
+ # The vocoder is distributed separately from the acoustic model.
+ if [ ! -f ./hifigan_v2.onnx ]; then
+   curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+ fi
+ 
+ # The last argument of each command must not end with a backslash; otherwise
+ # the command would silently absorb whatever line follows it.
+ dart run \
+   ./bin/matcha-zh.dart \
+   --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
+   --vocoder ./hifigan_v2.onnx \
+   --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
+   --tokens ./matcha-icefall-zh-baker/tokens.txt \
+   --dict-dir ./matcha-icefall-zh-baker/dict \
+   --rule-fsts ./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
+   --sid 0 \
+   --speed 1.0 \
+   --output-wav matcha-zh-1.wav \
+   --text "某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。"
+ 
+ dart run \
+   ./bin/matcha-zh.dart \
+   --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
+   --vocoder ./hifigan_v2.onnx \
+   --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
+   --tokens ./matcha-icefall-zh-baker/tokens.txt \
+   --dict-dir ./matcha-icefall-zh-baker/dict \
+   --sid 0 \
+   --speed 1.0 \
+   --output-wav matcha-zh-2.wav \
+   --text "当夜幕降临，星光点点，伴随着微风拂面，我在静谧中感受着时光的流转，思念如涟漪荡漾，梦境如画卷展开，我与自然融为一体，沉静在这片宁静的美丽之中，感受着生命的奇迹与温柔."
+ 
+ ls -lh *.wav
--- a/dart-api-examples/tts/run-zh.sh → dart-api-examples/tts/run-vits-zh.sh
查看文件 @d7c95d3
+++ b/dart-api-examples/tts/run-zh.sh → dart-api-examples/tts/run-vits-zh.sh
查看文件 @d7c95d3
@@ -16,7 +16,7 @@ if [[ ! -f ./sherpa-onnx-vits-zh-ll/tokens.txt ]]; then
 fi
 
 dart run \
-   ./bin/zh.dart \
+   ./bin/vits-zh.dart \
   --model ./sherpa-onnx-vits-zh-ll/model.onnx \
   --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
   --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
@@ -24,10 +24,10 @@ dart run \
   --sid 2 \
   --speed 1.0 \
   --text '当夜幕降临，星光点点，伴随着微风拂面，我在静谧中感受着时光的流转，思念如涟漪荡漾，梦境如画卷展开，我与自然融为一体，沉静在这片宁静的美丽之中，感受着生命的奇迹与温柔。' \
-   --output-wav zh-jieba-2.wav
+   --output-wav vits-zh-jieba-2.wav
 
 dart run \
-   ./bin/zh.dart \
+   ./bin/vits-zh.dart \
   --model ./sherpa-onnx-vits-zh-ll/model.onnx \
   --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
   --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
@@ -36,6 +36,6 @@ dart run \
   --sid 3 \
   --speed 1.0 \
   --text '今天是2024年6月15号，13点23分。如果有困难，请拨打110或者18920240511。123456块钱。' \
-   --output-wav zh-jieba-3.wav
+   --output-wav vits-zh-jieba-3.wav
 
 ls -lh *.wav
--- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
查看文件 @d7c95d3
+++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
查看文件 @d7c95d3
@@ -131,6 +131,22 @@ final class SherpaOnnxOfflineTtsVitsModelConfig extends Struct {
   external Pointer<Utf8> dictDir;
 }
 
+ /// FFI struct holding the MatchaTTS model settings passed to native code.
+ ///
+ /// String members are native UTF-8 pointers; the two scale factors are
+ /// 32-bit floats.
+ ///
+ /// NOTE(review): presumably mirrors a C struct of the same name in the
+ /// sherpa-onnx C API; if so, the order and types of the fields must stay in
+ /// sync with that header - confirm before reordering or adding members.
+ final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct {
+   external Pointer<Utf8> acousticModel;
+   external Pointer<Utf8> vocoder;
+   external Pointer<Utf8> lexicon;
+   external Pointer<Utf8> tokens;
+   external Pointer<Utf8> dataDir;
+ 
+   @Float()
+   external double noiseScale;
+ 
+   @Float()
+   external double lengthScale;
+ 
+   external Pointer<Utf8> dictDir;
+ }
+ 
 final class SherpaOnnxOfflineTtsModelConfig extends Struct {
   external SherpaOnnxOfflineTtsVitsModelConfig vits;
   @Int32()
@@ -140,6 +156,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct {
   external int debug;
 
   external Pointer<Utf8> provider;
+   external SherpaOnnxOfflineTtsMatchaModelConfig matcha;
 }
 
 final class SherpaOnnxOfflineTtsConfig extends Struct {
--- a/flutter/sherpa_onnx/lib/src/tts.dart
查看文件 @d7c95d3
+++ b/flutter/sherpa_onnx/lib/src/tts.dart
查看文件 @d7c95d3
@@ -8,9 +8,9 @@ import './sherpa_onnx_bindings.dart';
 
 class OfflineTtsVitsModelConfig {
   const OfflineTtsVitsModelConfig({
-     required this.model,
+     this.model = '',
     this.lexicon = '',
-     required this.tokens,
+     this.tokens = '',
     this.dataDir = '',
     this.noiseScale = 0.667,
     this.noiseScaleW = 0.8,
@@ -33,9 +33,37 @@ class OfflineTtsVitsModelConfig {
   final String dictDir;
 }
 
+ /// Immutable settings for a MatchaTTS model.
+ ///
+ /// Every argument is optional: string fields default to the empty string,
+ /// [noiseScale] to 0.667 and [lengthScale] to 1.0.
+ class OfflineTtsMatchaModelConfig {
+   /// Path to the acoustic model.
+   final String acousticModel;
+ 
+   /// Path to the vocoder model.
+   final String vocoder;
+ 
+   /// Path to lexicon.txt.
+   final String lexicon;
+ 
+   /// Path to tokens.txt.
+   final String tokens;
+ 
+   /// Path to the espeak-ng-data directory.
+   final String dataDir;
+ 
+   /// Noise scale forwarded to the native model configuration.
+   final double noiseScale;
+ 
+   /// Length scale forwarded to the native model configuration.
+   final double lengthScale;
+ 
+   /// Path to the jieba dict directory.
+   final String dictDir;
+ 
+   const OfflineTtsMatchaModelConfig({
+     this.acousticModel = '',
+     this.vocoder = '',
+     this.lexicon = '',
+     this.tokens = '',
+     this.dataDir = '',
+     this.noiseScale = 0.667,
+     this.lengthScale = 1.0,
+     this.dictDir = '',
+   });
+ 
+   /// Returns a single-line, human-readable listing of all fields.
+   @override
+   String toString() => 'OfflineTtsMatchaModelConfig('
+       'acousticModel: $acousticModel, '
+       'vocoder: $vocoder, '
+       'lexicon: $lexicon, '
+       'tokens: $tokens, '
+       'dataDir: $dataDir, '
+       'noiseScale: $noiseScale, '
+       'lengthScale: $lengthScale, '
+       'dictDir: $dictDir)';
+ }
+ 
 class OfflineTtsModelConfig {
   const OfflineTtsModelConfig({
-     required this.vits,
+     this.vits = const OfflineTtsVitsModelConfig(),
+     this.matcha = const OfflineTtsMatchaModelConfig(),
     this.numThreads = 1,
     this.debug = true,
     this.provider = 'cpu',
@@ -43,10 +71,11 @@ class OfflineTtsModelConfig {
 
   @override
   String toString() {
-     return 'OfflineTtsModelConfig(vits: $vits, numThreads: $numThreads, debug: $debug, provider: $provider)';
+     return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, numThreads: $numThreads, debug: $debug, provider: $provider)';
   }
 
   final OfflineTtsVitsModelConfig vits;
+   final OfflineTtsMatchaModelConfig matcha;
   final int numThreads;
   final bool debug;
   final String provider;
@@ -99,6 +128,16 @@ class OfflineTts {
     c.ref.model.vits.lengthScale = config.model.vits.lengthScale;
     c.ref.model.vits.dictDir = config.model.vits.dictDir.toNativeUtf8();
 
+     c.ref.model.matcha.acousticModel =
+         config.model.matcha.acousticModel.toNativeUtf8();
+     c.ref.model.matcha.vocoder = config.model.matcha.vocoder.toNativeUtf8();
+     c.ref.model.matcha.lexicon = config.model.matcha.lexicon.toNativeUtf8();
+     c.ref.model.matcha.tokens = config.model.matcha.tokens.toNativeUtf8();
+     c.ref.model.matcha.dataDir = config.model.matcha.dataDir.toNativeUtf8();
+     c.ref.model.matcha.noiseScale = config.model.matcha.noiseScale;
+     c.ref.model.matcha.lengthScale = config.model.matcha.lengthScale;
+     c.ref.model.matcha.dictDir = config.model.matcha.dictDir.toNativeUtf8();
+ 
     c.ref.model.numThreads = config.model.numThreads;
     c.ref.model.debug = config.model.debug ? 1 : 0;
     c.ref.model.provider = config.model.provider.toNativeUtf8();
@@ -112,6 +151,12 @@ class OfflineTts {
     calloc.free(c.ref.ruleFars);
     calloc.free(c.ref.ruleFsts);
     calloc.free(c.ref.model.provider);
+     calloc.free(c.ref.model.matcha.dictDir);
+     calloc.free(c.ref.model.matcha.dataDir);
+     calloc.free(c.ref.model.matcha.tokens);
+     calloc.free(c.ref.model.matcha.lexicon);
+     calloc.free(c.ref.model.matcha.vocoder);
+     calloc.free(c.ref.model.matcha.acousticModel);
     calloc.free(c.ref.model.vits.dictDir);
     calloc.free(c.ref.model.vits.dataDir);
     calloc.free(c.ref.model.vits.tokens);