Fangjun Kuang
Committed by GitHub

Add Dart API for Kokoro TTS models (#1723)

@@ -7,6 +7,7 @@ cd dart-api-examples @@ -7,6 +7,7 @@ cd dart-api-examples
7 pushd tts 7 pushd tts
8 8
9 echo '----------matcha tts----------' 9 echo '----------matcha tts----------'
  10 +./run-kokoro-en.sh
10 ./run-matcha-zh.sh 11 ./run-matcha-zh.sh
11 ./run-matcha-en.sh 12 ./run-matcha-en.sh
12 ls -lh *.wav 13 ls -lh *.wav
  1 +// Copyright (c) 2025 Xiaomi Corporation
  2 +import 'dart:io';
  3 +
  4 +import 'package:args/args.dart';
  5 +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
  6 +
  7 +import './init.dart';
  8 +
  9 +void main(List<String> arguments) async {
  10 + await initSherpaOnnx();
  11 +
  12 + final parser = ArgParser()
  13 + ..addOption('model', help: 'Path to the onnx model')
  14 + ..addOption('voices', help: 'Path to the voices.bin')
  15 + ..addOption('tokens', help: 'Path to tokens.txt')
  16 + ..addOption(
  17 + 'data-dir',
  18 + help: 'Path to espeak-ng-data directory',
  19 + defaultsTo: '',
  20 + )
  21 + ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
  22 + ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
  23 + ..addOption('text', help: 'Text to generate TTS for')
  24 + ..addOption('output-wav', help: 'Filename to save the generated audio')
  25 + ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
  26 + ..addOption(
  27 + 'sid',
  28 + help: 'Speaker ID to select. Used only for multi-speaker TTS',
  29 + defaultsTo: '0',
  30 + );
  31 + final res = parser.parse(arguments);
  32 + if (res['model'] == null ||
  33 + res['voices'] == null ||
  34 + res['tokens'] == null ||
  35 + res['data-dir'] == null ||
  36 + res['output-wav'] == null ||
  37 + res['text'] == null) {
  38 + print(parser.usage);
  39 + exit(1);
  40 + }
  41 + final model = res['model'] as String;
  42 + final voices = res['voices'] as String;
  43 + final tokens = res['tokens'] as String;
  44 + final dataDir = res['data-dir'] as String;
  45 + final ruleFsts = res['rule-fsts'] as String;
  46 + final ruleFars = res['rule-fars'] as String;
  47 + final text = res['text'] as String;
  48 + final outputWav = res['output-wav'] as String;
  49 + var speed = double.tryParse(res['speed'] as String) ?? 1.0;
  50 + final sid = int.tryParse(res['sid'] as String) ?? 0;
  51 +
  52 + if (speed == 0) {
  53 + speed = 1.0;
  54 + }
  55 +
  56 + final kokoro = sherpa_onnx.OfflineTtsKokoroModelConfig(
  57 + model: model,
  58 + voices: voices,
  59 + tokens: tokens,
  60 + dataDir: dataDir,
  61 + lengthScale: 1 / speed,
  62 + );
  63 +
  64 + final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
  65 + kokoro: kokoro,
  66 + numThreads: 1,
  67 + debug: true,
  68 + );
  69 + final config = sherpa_onnx.OfflineTtsConfig(
  70 + model: modelConfig,
  71 + maxNumSenetences: 1,
  72 + ruleFsts: ruleFsts,
  73 + ruleFars: ruleFars,
  74 + );
  75 +
  76 + final tts = sherpa_onnx.OfflineTts(config);
  77 + final audio = tts.generate(text: text, sid: sid, speed: speed);
  78 + tts.free();
  79 +
  80 + sherpa_onnx.writeWave(
  81 + filename: outputWav,
  82 + samples: audio.samples,
  83 + sampleRate: audio.sampleRate,
  84 + );
  85 + print('Saved to $outputWav');
  86 +}
  1 +#!/usr/bin/env bash
  2 +
  3 +set -ex
  4 +
  5 +dart pub get
  6 +
  7 +# please visit
  8 +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
  9 +# to download more models
  10 +if [ ! -f ./kokoro-en-v0_19/model.onnx ]; then
  11 + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
  12 + tar xf kokoro-en-v0_19.tar.bz2
  13 + rm kokoro-en-v0_19.tar.bz2
  14 +fi
  15 +
  16 +dart run \
  17 + ./bin/kokoro-en.dart \
  18 + --model ./kokoro-en-v0_19/model.onnx \
  19 + --voices ./kokoro-en-v0_19/voices.bin \
  20 + --tokens ./kokoro-en-v0_19/tokens.txt \
  21 + --data-dir ./kokoro-en-v0_19/espeak-ng-data \
  22 + --sid 9 \
  23 + --speed 1.0 \
  24 + --output-wav kokoro-en-9.wav \
  25 + --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." \
  26 +
  27 +ls -lh *.wav
@@ -147,6 +147,16 @@ final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct { @@ -147,6 +147,16 @@ final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct {
147 external Pointer<Utf8> dictDir; 147 external Pointer<Utf8> dictDir;
148 } 148 }
149 149
  150 +final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct { // dart:ffi struct holding the Kokoro TTS model settings; presumably mirrors the C struct of the same name, so field order must not change (TODO confirm against the C header).
  151 + external Pointer<Utf8> model; // Native UTF-8 string; OfflineTts fills it from config.model.kokoro.model.
  152 + external Pointer<Utf8> voices; // Native UTF-8 string; filled from config.model.kokoro.voices.
  153 + external Pointer<Utf8> tokens; // Native UTF-8 string; filled from config.model.kokoro.tokens.
  154 + external Pointer<Utf8> dataDir; // Native UTF-8 string; filled from config.model.kokoro.dataDir.
  155 +
  156 + @Float() // Stored as a native float, although exposed to Dart as a double.
  157 + external double lengthScale; // Filled from config.model.kokoro.lengthScale.
  158 +}
  159 +
150 final class SherpaOnnxOfflineTtsModelConfig extends Struct { 160 final class SherpaOnnxOfflineTtsModelConfig extends Struct {
151 external SherpaOnnxOfflineTtsVitsModelConfig vits; 161 external SherpaOnnxOfflineTtsVitsModelConfig vits;
152 @Int32() 162 @Int32()
@@ -157,6 +167,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct { @@ -157,6 +167,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct {
157 167
158 external Pointer<Utf8> provider; 168 external Pointer<Utf8> provider;
159 external SherpaOnnxOfflineTtsMatchaModelConfig matcha; 169 external SherpaOnnxOfflineTtsMatchaModelConfig matcha;
  170 + external SherpaOnnxOfflineTtsKokoroModelConfig kokoro;
160 } 171 }
161 172
162 final class SherpaOnnxOfflineTtsConfig extends Struct { 173 final class SherpaOnnxOfflineTtsConfig extends Struct {
@@ -60,10 +60,32 @@ class OfflineTtsMatchaModelConfig { @@ -60,10 +60,32 @@ class OfflineTtsMatchaModelConfig {
60 final String dictDir; 60 final String dictDir;
61 } 61 }
62 62
  63 +class OfflineTtsKokoroModelConfig {
  64 + const OfflineTtsKokoroModelConfig({
  65 + this.model = '',
  66 + this.voices = '',
  67 + this.tokens = '',
  68 + this.dataDir = '',
  69 + this.lengthScale = 1.0,
  70 + });
  71 +
  72 + @override
  73 + String toString() {
  74 + return 'OfflineTtsKokoroModelConfig(model: $model, voices: $voices, tokens: $tokens, dataDir: $dataDir, lengthScale: $lengthScale)';
  75 + }
  76 +
  77 + final String model;
  78 + final String voices;
  79 + final String tokens;
  80 + final String dataDir;
  81 + final double lengthScale;
  82 +}
  83 +
63 class OfflineTtsModelConfig { 84 class OfflineTtsModelConfig {
64 const OfflineTtsModelConfig({ 85 const OfflineTtsModelConfig({
65 this.vits = const OfflineTtsVitsModelConfig(), 86 this.vits = const OfflineTtsVitsModelConfig(),
66 this.matcha = const OfflineTtsMatchaModelConfig(), 87 this.matcha = const OfflineTtsMatchaModelConfig(),
  88 + this.kokoro = const OfflineTtsKokoroModelConfig(),
67 this.numThreads = 1, 89 this.numThreads = 1,
68 this.debug = true, 90 this.debug = true,
69 this.provider = 'cpu', 91 this.provider = 'cpu',
@@ -71,11 +93,12 @@ class OfflineTtsModelConfig { @@ -71,11 +93,12 @@ class OfflineTtsModelConfig {
71 93
72 @override 94 @override
73 String toString() { 95 String toString() {
74 - return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, numThreads: $numThreads, debug: $debug, provider: $provider)'; 96 + return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, kokoro: $kokoro, numThreads: $numThreads, debug: $debug, provider: $provider)';
75 } 97 }
76 98
77 final OfflineTtsVitsModelConfig vits; 99 final OfflineTtsVitsModelConfig vits;
78 final OfflineTtsMatchaModelConfig matcha; 100 final OfflineTtsMatchaModelConfig matcha;
  101 + final OfflineTtsKokoroModelConfig kokoro;
79 final int numThreads; 102 final int numThreads;
80 final bool debug; 103 final bool debug;
81 final String provider; 104 final String provider;
@@ -138,6 +161,12 @@ class OfflineTts { @@ -138,6 +161,12 @@ class OfflineTts {
138 c.ref.model.matcha.lengthScale = config.model.matcha.lengthScale; 161 c.ref.model.matcha.lengthScale = config.model.matcha.lengthScale;
139 c.ref.model.matcha.dictDir = config.model.matcha.dictDir.toNativeUtf8(); 162 c.ref.model.matcha.dictDir = config.model.matcha.dictDir.toNativeUtf8();
140 163
  164 + c.ref.model.kokoro.model = config.model.kokoro.model.toNativeUtf8();
  165 + c.ref.model.kokoro.voices = config.model.kokoro.voices.toNativeUtf8();
  166 + c.ref.model.kokoro.tokens = config.model.kokoro.tokens.toNativeUtf8();
  167 + c.ref.model.kokoro.dataDir = config.model.kokoro.dataDir.toNativeUtf8();
  168 + c.ref.model.kokoro.lengthScale = config.model.kokoro.lengthScale;
  169 +
141 c.ref.model.numThreads = config.model.numThreads; 170 c.ref.model.numThreads = config.model.numThreads;
142 c.ref.model.debug = config.model.debug ? 1 : 0; 171 c.ref.model.debug = config.model.debug ? 1 : 0;
143 c.ref.model.provider = config.model.provider.toNativeUtf8(); 172 c.ref.model.provider = config.model.provider.toNativeUtf8();
@@ -151,12 +180,19 @@ class OfflineTts { @@ -151,12 +180,19 @@ class OfflineTts {
151 calloc.free(c.ref.ruleFars); 180 calloc.free(c.ref.ruleFars);
152 calloc.free(c.ref.ruleFsts); 181 calloc.free(c.ref.ruleFsts);
153 calloc.free(c.ref.model.provider); 182 calloc.free(c.ref.model.provider);
  183 +
  184 + calloc.free(c.ref.model.kokoro.dataDir);
  185 + calloc.free(c.ref.model.kokoro.tokens);
  186 + calloc.free(c.ref.model.kokoro.voices);
  187 + calloc.free(c.ref.model.kokoro.model);
  188 +
154 calloc.free(c.ref.model.matcha.dictDir); 189 calloc.free(c.ref.model.matcha.dictDir);
155 calloc.free(c.ref.model.matcha.dataDir); 190 calloc.free(c.ref.model.matcha.dataDir);
156 calloc.free(c.ref.model.matcha.tokens); 191 calloc.free(c.ref.model.matcha.tokens);
157 calloc.free(c.ref.model.matcha.lexicon); 192 calloc.free(c.ref.model.matcha.lexicon);
158 calloc.free(c.ref.model.matcha.vocoder); 193 calloc.free(c.ref.model.matcha.vocoder);
159 calloc.free(c.ref.model.matcha.acousticModel); 194 calloc.free(c.ref.model.matcha.acousticModel);
  195 +
160 calloc.free(c.ref.model.vits.dictDir); 196 calloc.free(c.ref.model.vits.dictDir);
161 calloc.free(c.ref.model.vits.dataDir); 197 calloc.free(c.ref.model.vits.dataDir);
162 calloc.free(c.ref.model.vits.tokens); 198 calloc.free(c.ref.model.vits.tokens);