Committed by
GitHub
Add Dart API for Kokoro TTS models (#1723)
正在显示
5 个修改的文件
包含
162 行增加
和
1 行删除
| @@ -7,6 +7,7 @@ cd dart-api-examples | @@ -7,6 +7,7 @@ cd dart-api-examples | ||
| 7 | pushd tts | 7 | pushd tts |
| 8 | 8 | ||
| 9 | echo '----------matcha tts----------' | 9 | echo '----------matcha tts----------' |
| 10 | +./run-kokoro-en.sh | ||
| 10 | ./run-matcha-zh.sh | 11 | ./run-matcha-zh.sh |
| 11 | ./run-matcha-en.sh | 12 | ./run-matcha-en.sh |
| 12 | ls -lh *.wav | 13 | ls -lh *.wav |
dart-api-examples/tts/bin/kokoro-en.dart
0 → 100644
| 1 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 2 | +import 'dart:io'; | ||
| 3 | + | ||
| 4 | +import 'package:args/args.dart'; | ||
| 5 | +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; | ||
| 6 | + | ||
| 7 | +import './init.dart'; | ||
| 8 | + | ||
/// Synthesizes speech from `--text` with a Kokoro TTS model and writes the
/// result to the file given by `--output-wav`.
///
/// Prints the usage and exits with status 1 when the command line cannot be
/// parsed or when any of `--model`, `--voices`, `--tokens`, `--output-wav` or
/// `--text` is missing.
Future<void> main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('model', help: 'Path to the onnx model')
    ..addOption('voices', help: 'Path to the voices.bin')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption(
      'data-dir',
      help: 'Path to espeak-ng-data directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    // An unknown option or a missing option value would otherwise end the
    // program with an uncaught exception instead of the usage text.
    print(e.message);
    print(parser.usage);
    exit(1);
  }

  // Only options without a default can be null after parsing. 'data-dir',
  // 'rule-fsts', 'rule-fars', 'speed' and 'sid' always have a value.
  const requiredOptions = ['model', 'voices', 'tokens', 'output-wav', 'text'];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final model = res['model'] as String;
  final voices = res['voices'] as String;
  final tokens = res['tokens'] as String;
  final dataDir = res['data-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // Fall back to normal speed for anything that cannot be inverted into a
  // positive, finite length scale: unparsable text, zero, negatives, NaN and
  // infinities.
  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
  if (!speed.isFinite || speed <= 0) {
    speed = 1.0;
  }

  final kokoro = sherpa_onnx.OfflineTtsKokoroModelConfig(
    model: model,
    voices: voices,
    tokens: tokens,
    dataDir: dataDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    kokoro: kokoro,
    numThreads: 1,
    debug: true,
  );
  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);

    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
  } finally {
    // Release the TTS object even when generation or writing throws.
    tts.free();
  }
  print('Saved to $outputWav');
}
dart-api-examples/tts/run-kokoro-en.sh
0 → 100755
#!/usr/bin/env bash
#
# Runs the Kokoro English TTS Dart example. Downloads and unpacks the
# kokoro-en-v0_19 model first if ./kokoro-en-v0_19/model.onnx is not present.

set -ex

dart pub get

# Please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
# to download more models.
if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  tar xf kokoro-en-v0_19.tar.bz2
  rm kokoro-en-v0_19.tar.bz2
fi

# No trailing backslash after the last argument: a dangling continuation would
# pull the next non-blank line into this command if the blank line went away.
dart run \
  ./bin/kokoro-en.dart \
  --model ./kokoro-en-v0_19/model.onnx \
  --voices ./kokoro-en-v0_19/voices.bin \
  --tokens ./kokoro-en-v0_19/tokens.txt \
  --data-dir ./kokoro-en-v0_19/espeak-ng-data \
  --sid 9 \
  --speed 1.0 \
  --output-wav kokoro-en-9.wav \
  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."

# The ./ prefix keeps a filename starting with '-' from being read as an option.
ls -lh ./*.wav
| @@ -147,6 +147,16 @@ final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct { | @@ -147,6 +147,16 @@ final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct { | ||
| 147 | external Pointer<Utf8> dictDir; | 147 | external Pointer<Utf8> dictDir; |
| 148 | } | 148 | } |
| 149 | 149 | ||
/// FFI struct holding the Kokoro TTS model settings: four native UTF-8
/// string pointers followed by one 32-bit float.
///
/// NOTE(review): the field order presumably has to match the corresponding
/// native struct declaration; confirm against the C API header before
/// reordering or inserting fields.
final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
  /// Pointer to the native UTF-8 string for the model setting.
  external Pointer<Utf8> model;

  /// Pointer to the native UTF-8 string for the voices setting.
  external Pointer<Utf8> voices;

  /// Pointer to the native UTF-8 string for the tokens setting.
  external Pointer<Utf8> tokens;

  /// Pointer to the native UTF-8 string for the data directory setting.
  external Pointer<Utf8> dataDir;

  /// Length scale, stored as a native `float`.
  @Float()
  external double lengthScale;
}
| 159 | + | ||
| 150 | final class SherpaOnnxOfflineTtsModelConfig extends Struct { | 160 | final class SherpaOnnxOfflineTtsModelConfig extends Struct { |
| 151 | external SherpaOnnxOfflineTtsVitsModelConfig vits; | 161 | external SherpaOnnxOfflineTtsVitsModelConfig vits; |
| 152 | @Int32() | 162 | @Int32() |
| @@ -157,6 +167,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct { | @@ -157,6 +167,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct { | ||
| 157 | 167 | ||
| 158 | external Pointer<Utf8> provider; | 168 | external Pointer<Utf8> provider; |
| 159 | external SherpaOnnxOfflineTtsMatchaModelConfig matcha; | 169 | external SherpaOnnxOfflineTtsMatchaModelConfig matcha; |
| 170 | + external SherpaOnnxOfflineTtsKokoroModelConfig kokoro; | ||
| 160 | } | 171 | } |
| 161 | 172 | ||
| 162 | final class SherpaOnnxOfflineTtsConfig extends Struct { | 173 | final class SherpaOnnxOfflineTtsConfig extends Struct { |
| @@ -60,10 +60,32 @@ class OfflineTtsMatchaModelConfig { | @@ -60,10 +60,32 @@ class OfflineTtsMatchaModelConfig { | ||
| 60 | final String dictDir; | 60 | final String dictDir; |
| 61 | } | 61 | } |
| 62 | 62 | ||
/// Immutable settings for a Kokoro TTS model.
///
/// Every field has a default, so `const OfflineTtsKokoroModelConfig()` is a
/// valid (empty) configuration.
class OfflineTtsKokoroModelConfig {
  /// Model setting; defaults to the empty string.
  final String model;

  /// Voices setting; defaults to the empty string.
  final String voices;

  /// Tokens setting; defaults to the empty string.
  final String tokens;

  /// Data directory setting; defaults to the empty string.
  final String dataDir;

  /// Length scale; defaults to 1.0.
  final double lengthScale;

  /// Creates a configuration; omitted arguments keep their defaults.
  const OfflineTtsKokoroModelConfig({
    this.model = '',
    this.voices = '',
    this.tokens = '',
    this.dataDir = '',
    this.lengthScale = 1.0,
  });

  /// Returns a single-line description listing every field and its value.
  @override
  String toString() => 'OfflineTtsKokoroModelConfig('
      'model: $model, '
      'voices: $voices, '
      'tokens: $tokens, '
      'dataDir: $dataDir, '
      'lengthScale: $lengthScale)';
}
| 83 | + | ||
| 63 | class OfflineTtsModelConfig { | 84 | class OfflineTtsModelConfig { |
| 64 | const OfflineTtsModelConfig({ | 85 | const OfflineTtsModelConfig({ |
| 65 | this.vits = const OfflineTtsVitsModelConfig(), | 86 | this.vits = const OfflineTtsVitsModelConfig(), |
| 66 | this.matcha = const OfflineTtsMatchaModelConfig(), | 87 | this.matcha = const OfflineTtsMatchaModelConfig(), |
| 88 | + this.kokoro = const OfflineTtsKokoroModelConfig(), | ||
| 67 | this.numThreads = 1, | 89 | this.numThreads = 1, |
| 68 | this.debug = true, | 90 | this.debug = true, |
| 69 | this.provider = 'cpu', | 91 | this.provider = 'cpu', |
| @@ -71,11 +93,12 @@ class OfflineTtsModelConfig { | @@ -71,11 +93,12 @@ class OfflineTtsModelConfig { | ||
| 71 | 93 | ||
| 72 | @override | 94 | @override |
| 73 | String toString() { | 95 | String toString() { |
| 74 | - return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, numThreads: $numThreads, debug: $debug, provider: $provider)'; | 96 | + return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, kokoro: $kokoro, numThreads: $numThreads, debug: $debug, provider: $provider)'; |
| 75 | } | 97 | } |
| 76 | 98 | ||
| 77 | final OfflineTtsVitsModelConfig vits; | 99 | final OfflineTtsVitsModelConfig vits; |
| 78 | final OfflineTtsMatchaModelConfig matcha; | 100 | final OfflineTtsMatchaModelConfig matcha; |
| 101 | + final OfflineTtsKokoroModelConfig kokoro; | ||
| 79 | final int numThreads; | 102 | final int numThreads; |
| 80 | final bool debug; | 103 | final bool debug; |
| 81 | final String provider; | 104 | final String provider; |
| @@ -138,6 +161,12 @@ class OfflineTts { | @@ -138,6 +161,12 @@ class OfflineTts { | ||
| 138 | c.ref.model.matcha.lengthScale = config.model.matcha.lengthScale; | 161 | c.ref.model.matcha.lengthScale = config.model.matcha.lengthScale; |
| 139 | c.ref.model.matcha.dictDir = config.model.matcha.dictDir.toNativeUtf8(); | 162 | c.ref.model.matcha.dictDir = config.model.matcha.dictDir.toNativeUtf8(); |
| 140 | 163 | ||
| 164 | + c.ref.model.kokoro.model = config.model.kokoro.model.toNativeUtf8(); | ||
| 165 | + c.ref.model.kokoro.voices = config.model.kokoro.voices.toNativeUtf8(); | ||
| 166 | + c.ref.model.kokoro.tokens = config.model.kokoro.tokens.toNativeUtf8(); | ||
| 167 | + c.ref.model.kokoro.dataDir = config.model.kokoro.dataDir.toNativeUtf8(); | ||
| 168 | + c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale; | ||
| 169 | + | ||
| 141 | c.ref.model.numThreads = config.model.numThreads; | 170 | c.ref.model.numThreads = config.model.numThreads; |
| 142 | c.ref.model.debug = config.model.debug ? 1 : 0; | 171 | c.ref.model.debug = config.model.debug ? 1 : 0; |
| 143 | c.ref.model.provider = config.model.provider.toNativeUtf8(); | 172 | c.ref.model.provider = config.model.provider.toNativeUtf8(); |
| @@ -151,12 +180,19 @@ class OfflineTts { | @@ -151,12 +180,19 @@ class OfflineTts { | ||
| 151 | calloc.free(c.ref.ruleFars); | 180 | calloc.free(c.ref.ruleFars); |
| 152 | calloc.free(c.ref.ruleFsts); | 181 | calloc.free(c.ref.ruleFsts); |
| 153 | calloc.free(c.ref.model.provider); | 182 | calloc.free(c.ref.model.provider); |
| 183 | + | ||
| 184 | + calloc.free(c.ref.model.kokoro.dataDir); | ||
| 185 | + calloc.free(c.ref.model.kokoro.tokens); | ||
| 186 | + calloc.free(c.ref.model.kokoro.voices); | ||
| 187 | + calloc.free(c.ref.model.kokoro.model); | ||
| 188 | + | ||
| 154 | calloc.free(c.ref.model.matcha.dictDir); | 189 | calloc.free(c.ref.model.matcha.dictDir); |
| 155 | calloc.free(c.ref.model.matcha.dataDir); | 190 | calloc.free(c.ref.model.matcha.dataDir); |
| 156 | calloc.free(c.ref.model.matcha.tokens); | 191 | calloc.free(c.ref.model.matcha.tokens); |
| 157 | calloc.free(c.ref.model.matcha.lexicon); | 192 | calloc.free(c.ref.model.matcha.lexicon); |
| 158 | calloc.free(c.ref.model.matcha.vocoder); | 193 | calloc.free(c.ref.model.matcha.vocoder); |
| 159 | calloc.free(c.ref.model.matcha.acousticModel); | 194 | calloc.free(c.ref.model.matcha.acousticModel); |
| 195 | + | ||
| 160 | calloc.free(c.ref.model.vits.dictDir); | 196 | calloc.free(c.ref.model.vits.dictDir); |
| 161 | calloc.free(c.ref.model.vits.dataDir); | 197 | calloc.free(c.ref.model.vits.dataDir); |
| 162 | calloc.free(c.ref.model.vits.tokens); | 198 | calloc.free(c.ref.model.vits.tokens); |
-
请 注册 或 登录 后发表评论