Fangjun Kuang
Committed by GitHub

Add Dart API for KittenTTS (#2475)

... ... @@ -4,6 +4,34 @@ set -ex
cd dart-api-examples
# Run the text-to-speech examples from inside the tts directory.
pushd tts
# NOTE(review): the label below says "matcha", but this group also runs the
# kitten and kokoro examples before the matcha ones.
echo '----------matcha tts----------'
./run-kitten-en.sh
./run-kokoro-zh-en.sh
./run-kokoro-en.sh
./run-matcha-zh.sh
./run-matcha-en.sh
# List the generated wave files, then remove matcha-icefall-* and *.onnx
# before the next group runs.
ls -lh *.wav
rm -rf matcha-icefall-*
rm *.onnx
echo '----------piper tts----------'
./run-piper.sh
rm -rf vits-piper-*
echo '----------coqui tts----------'
./run-coqui.sh
rm -rf vits-coqui-*
echo '----------zh tts----------'
./run-vits-zh.sh
rm -rf sherpa-onnx-*
# Final listing of every wave file left in the directory.
ls -lh *.wav
popd # tts
pushd vad
./run-ten-vad.sh
./run.sh
... ... @@ -72,33 +100,6 @@ echo "speech enhancement with gtcrn models"
ls -lh
popd
pushd tts
echo '----------matcha tts----------'
./run-kokoro-zh-en.sh
./run-kokoro-en.sh
./run-matcha-zh.sh
./run-matcha-en.sh
ls -lh *.wav
rm -rf matcha-icefall-*
rm *.onnx
echo '----------piper tts----------'
./run-piper.sh
rm -rf vits-piper-*
echo '----------coqui tts----------'
./run-coqui.sh
rm -rf vits-coqui-*
echo '----------zh tts----------'
./run-vits-zh.sh
rm -rf sherpa-onnx-*
ls -lh *.wav
popd # tts
pushd speaker-diarization
echo '----------speaker diarization----------'
./run.sh
... ...
// Copyright (c) 2025 Xiaomi Corporation
import 'dart:io';
import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';
/// Generates speech with a KittenTTS model and saves it as a wave file.
///
/// Required options: `--model`, `--voices`, `--tokens`, `--text` and
/// `--output-wav`. When one of them is missing the usage text is printed and
/// the process exits with status 1. The process also exits with status 1 when
/// the wave file cannot be written.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('model', help: 'Path to the onnx model')
    ..addOption('voices', help: 'Path to the voices.bin')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption(
      'data-dir',
      help: 'Path to espeak-ng-data directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  final res = parser.parse(arguments);

  // Only options without a default can be null here; `data-dir` defaults to
  // an empty string, so it needs no check.
  if (res['model'] == null ||
      res['voices'] == null ||
      res['tokens'] == null ||
      res['output-wav'] == null ||
      res['text'] == null) {
    print(parser.usage);
    exit(1);
  }

  final model = res['model'] as String;
  final voices = res['voices'] as String;
  final tokens = res['tokens'] as String;
  final dataDir = res['data-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // `lengthScale` is computed as 1 / speed, so zero, negative, NaN and
  // infinite speeds are all replaced with the default.
  if (!speed.isFinite || speed <= 0) {
    speed = 1.0;
  }

  final kitten = sherpa_onnx.OfflineTtsKittenModelConfig(
    model: model,
    voices: voices,
    tokens: tokens,
    dataDir: dataDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    kitten: kitten,
    numThreads: 1,
    debug: true,
  );

  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  var saved = false;
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);
    saved = sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
  } finally {
    // Release the native TTS object even when generation or writing throws.
    tts.free();
  }

  // `writeWave` reports failure through its return value; do not claim
  // success when nothing was written.
  if (!saved) {
    stderr.writeln('Failed to save $outputWav');
    exit(1);
  }

  print('Saved to $outputWav');
}
... ...
#!/usr/bin/env bash
#
# Downloads the kitten-nano-en-v0_1-fp16 model when it is not present yet and
# runs ./bin/kitten-en.dart with it to produce kitten-en-0.wav.

set -ex

dart pub get

# More models are listed at
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kitten.html
model_name=kitten-nano-en-v0_1-fp16
model_dir=./$model_name
archive=$model_name.tar.bz2

# Fetch and unpack the model only when the model file is missing.
if [ ! -f "$model_dir/model.fp16.onnx" ]; then
  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$archive"
  tar xf "$archive"
  rm "$archive"
fi

tts_args=(
  --model "$model_dir/model.fp16.onnx"
  --voices "$model_dir/voices.bin"
  --tokens "$model_dir/tokens.txt"
  --data-dir "$model_dir/espeak-ng-data"
  --sid 0
  --speed 1.0
  --output-wav kitten-en-0.wav
  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
)

dart run ./bin/kitten-en.dart "${tts_args[@]}"

ls -lh *.wav
... ...
... ... @@ -150,9 +150,18 @@ class AudioTagging {
final labelsPtr = config.labels.toNativeUtf8();
c.ref.labels = labelsPtr;
if (SherpaOnnxBindings.sherpaOnnxCreateAudioTagging == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr =
SherpaOnnxBindings.sherpaOnnxCreateAudioTagging?.call(c) ?? nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create audio tagging. Please check your config");
}
calloc.free(labelsPtr);
calloc.free(providerPtr);
calloc.free(cedPtr);
... ...
... ... @@ -140,8 +140,16 @@ class KeywordSpotter {
c.ref.keywordsBuf = config.keywordsBuf.toNativeUtf8();
c.ref.keywordsBufSize = config.keywordsBufSize;
if (SherpaOnnxBindings.createKeywordSpotter == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr = SherpaOnnxBindings.createKeywordSpotter?.call(c) ?? nullptr;
if (ptr == nullptr) {
throw Exception("Failed to create kws. Please check your config");
}
calloc.free(c.ref.keywordsBuf);
calloc.free(c.ref.keywordsFile);
calloc.free(c.ref.model.bpeVocab);
... ...
... ... @@ -79,10 +79,19 @@ class OfflinePunctuation {
final providerPtr = config.model.provider.toNativeUtf8();
c.ref.model.provider = providerPtr;
if (SherpaOnnxBindings.sherpaOnnxCreateOfflinePunctuation == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr =
SherpaOnnxBindings.sherpaOnnxCreateOfflinePunctuation?.call(c) ??
nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create offline punctuation. Please check your config");
}
calloc.free(providerPtr);
calloc.free(ctTransformerPtr);
calloc.free(c);
... ...
... ... @@ -598,8 +598,17 @@ class OfflineRecognizer {
factory OfflineRecognizer(OfflineRecognizerConfig config) {
final c = convertConfig(config);
if (SherpaOnnxBindings.createOfflineRecognizer == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr = SherpaOnnxBindings.createOfflineRecognizer?.call(c) ?? nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create offline recognizer. Please check your config");
}
freeConfig(c);
return OfflineRecognizer._(ptr: ptr, config: config);
... ...
... ... @@ -211,10 +211,19 @@ class OfflineSpeakerDiarization {
c.ref.minDurationOn = config.minDurationOn;
c.ref.minDurationOff = config.minDurationOff;
if (SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeakerDiarization == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr =
SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeakerDiarization?.call(c) ??
nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create offline speaker diarization. Please check your config");
}
calloc.free(c.ref.embedding.provider);
calloc.free(c.ref.embedding.model);
calloc.free(c.ref.segmentation.provider);
... ...
... ... @@ -118,10 +118,19 @@ class OfflineSpeechDenoiser {
c.ref.model.debug = config.model.debug ? 1 : 0;
c.ref.model.provider = config.model.provider.toNativeUtf8();
if (SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeechDenoiser == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr =
SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeechDenoiser?.call(c) ??
nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create offline speech denoiser. Please check your config");
}
calloc.free(c.ref.model.provider);
calloc.free(c.ref.model.gtcrn.model);
... ...
... ... @@ -89,9 +89,18 @@ class OnlinePunctuation {
final providerPtr = config.model.provider.toNativeUtf8();
c.ref.model.provider = providerPtr;
if (SherpaOnnxBindings.sherpaOnnxCreateOnlinePunctuation == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr = SherpaOnnxBindings.sherpaOnnxCreateOnlinePunctuation?.call(c) ??
nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create online punctuation. Please check your config");
}
// Free the allocated strings and struct memory
calloc.free(providerPtr);
calloc.free(cnnBiLstmPtr);
... ...
... ... @@ -391,8 +391,17 @@ class OnlineRecognizer {
c.ref.hr.lexicon = config.hr.lexicon.toNativeUtf8();
c.ref.hr.ruleFsts = config.hr.ruleFsts.toNativeUtf8();
if (SherpaOnnxBindings.createOnlineRecognizer == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr = SherpaOnnxBindings.createOnlineRecognizer?.call(c) ?? nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create online recognizer. Please check your config");
}
calloc.free(c.ref.hr.dictDir);
calloc.free(c.ref.hr.lexicon);
calloc.free(c.ref.hr.ruleFsts);
... ...
... ... @@ -204,6 +204,16 @@ final class SherpaOnnxOfflineTtsKokoroModelConfig extends Struct {
external Pointer<Utf8> lang;
}
/// FFI struct carrying the KittenTTS model configuration to native code.
///
/// dart:ffi lays out struct members in declaration order, so the order and
/// the native types of the fields below must not be changed on their own.
// NOTE(review): presumably mirrors the C struct of the same name in the
// sherpa-onnx C API; confirm against the C header before editing.
final class SherpaOnnxOfflineTtsKittenModelConfig extends Struct {
/// Native UTF-8 string; presumably the model path (confirm with caller).
external Pointer<Utf8> model;
/// Native UTF-8 string; presumably the voices file path (confirm with caller).
external Pointer<Utf8> voices;
/// Native UTF-8 string; presumably the tokens file path (confirm with caller).
external Pointer<Utf8> tokens;
/// Native UTF-8 string; presumably the data directory path (confirm with caller).
external Pointer<Utf8> dataDir;
/// Length scale, stored as a native `float`.
@Float()
external double lengthScale;
}
final class SherpaOnnxOfflineTtsModelConfig extends Struct {
external SherpaOnnxOfflineTtsVitsModelConfig vits;
@Int32()
... ... @@ -215,6 +225,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct {
external Pointer<Utf8> provider;
external SherpaOnnxOfflineTtsMatchaModelConfig matcha;
external SherpaOnnxOfflineTtsKokoroModelConfig kokoro;
external SherpaOnnxOfflineTtsKittenModelConfig kitten;
}
final class SherpaOnnxOfflineTtsConfig extends Struct {
... ...
... ... @@ -60,9 +60,18 @@ class SpeakerEmbeddingExtractor {
final providerPtr = config.provider.toNativeUtf8();
c.ref.provider = providerPtr;
if (SherpaOnnxBindings.createSpeakerEmbeddingExtractor == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr =
SherpaOnnxBindings.createSpeakerEmbeddingExtractor?.call(c) ?? nullptr;
if (ptr == nullptr) {
throw Exception(
"Failed to create speaker embedding extractor. Please check your config");
}
calloc.free(providerPtr);
calloc.free(modelPtr);
calloc.free(c);
... ...
... ... @@ -159,11 +159,51 @@ class OfflineTtsKokoroModelConfig {
final String lang;
}
/// Configuration of a KittenTTS model for offline text-to-speech.
///
/// Every field has a default, so a partially filled JSON map can be passed to
/// [OfflineTtsKittenModelConfig.fromJson].
class OfflineTtsKittenModelConfig {
  const OfflineTtsKittenModelConfig({
    this.model = '',
    this.voices = '',
    this.tokens = '',
    this.dataDir = '',
    this.lengthScale = 1.0,
  });

  /// Builds a config from [json], using the constructor defaults for keys
  /// that are absent or `null`.
  factory OfflineTtsKittenModelConfig.fromJson(Map<String, dynamic> json) {
    final model = json['model'] as String?;
    final voices = json['voices'] as String?;
    final tokens = json['tokens'] as String?;
    final dataDir = json['dataDir'] as String?;
    final lengthScale = json['lengthScale'] as num?;

    return OfflineTtsKittenModelConfig(
      model: model ?? '',
      voices: voices ?? '',
      tokens: tokens ?? '',
      dataDir: dataDir ?? '',
      lengthScale: lengthScale?.toDouble() ?? 1.0,
    );
  }

  @override
  String toString() => 'OfflineTtsKittenModelConfig('
      'model: $model, '
      'voices: $voices, '
      'tokens: $tokens, '
      'dataDir: $dataDir, '
      'lengthScale: $lengthScale)';

  /// Returns the fields as a map that [OfflineTtsKittenModelConfig.fromJson]
  /// accepts.
  Map<String, dynamic> toJson() {
    return <String, dynamic>{
      'model': model,
      'voices': voices,
      'tokens': tokens,
      'dataDir': dataDir,
      'lengthScale': lengthScale,
    };
  }

  /// Path to the ONNX model file.
  final String model;

  /// Path to the voices file.
  final String voices;

  /// Path to the tokens file.
  final String tokens;

  /// Path to the data directory; empty by default.
  final String dataDir;

  /// Length scale of the model; defaults to 1.0.
  final double lengthScale;
}
class OfflineTtsModelConfig {
const OfflineTtsModelConfig({
this.vits = const OfflineTtsVitsModelConfig(),
this.matcha = const OfflineTtsMatchaModelConfig(),
this.kokoro = const OfflineTtsKokoroModelConfig(),
this.kitten = const OfflineTtsKittenModelConfig(),
this.numThreads = 1,
this.debug = true,
this.provider = 'cpu',
... ... @@ -177,6 +217,8 @@ class OfflineTtsModelConfig {
json['matcha'] as Map<String, dynamic>? ?? const {}),
kokoro: OfflineTtsKokoroModelConfig.fromJson(
json['kokoro'] as Map<String, dynamic>? ?? const {}),
kitten: OfflineTtsKittenModelConfig.fromJson(
json['kitten'] as Map<String, dynamic>? ?? const {}),
numThreads: json['numThreads'] as int? ?? 1,
debug: json['debug'] as bool? ?? true,
provider: json['provider'] as String? ?? 'cpu',
... ... @@ -185,13 +227,14 @@ class OfflineTtsModelConfig {
@override
String toString() {
return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, kokoro: $kokoro, numThreads: $numThreads, debug: $debug, provider: $provider)';
return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, kokoro: $kokoro, kitten: $kitten, numThreads: $numThreads, debug: $debug, provider: $provider)';
}
Map<String, dynamic> toJson() => {
'vits': vits.toJson(),
'matcha': matcha.toJson(),
'kokoro': kokoro.toJson(),
'kitten': kitten.toJson(),
'numThreads': numThreads,
'debug': debug,
'provider': provider,
... ... @@ -200,6 +243,7 @@ class OfflineTtsModelConfig {
final OfflineTtsVitsModelConfig vits;
final OfflineTtsMatchaModelConfig matcha;
final OfflineTtsKokoroModelConfig kokoro;
final OfflineTtsKittenModelConfig kitten;
final int numThreads;
final bool debug;
final String provider;
... ... @@ -292,6 +336,12 @@ class OfflineTts {
c.ref.model.kokoro.lexicon = config.model.kokoro.lexicon.toNativeUtf8();
c.ref.model.kokoro.lang = config.model.kokoro.lang.toNativeUtf8();
c.ref.model.kitten.model = config.model.kitten.model.toNativeUtf8();
c.ref.model.kitten.voices = config.model.kitten.voices.toNativeUtf8();
c.ref.model.kitten.tokens = config.model.kitten.tokens.toNativeUtf8();
c.ref.model.kitten.dataDir = config.model.kitten.dataDir.toNativeUtf8();
c.ref.model.kitten.lengthScale = config.model.kitten.lengthScale;
c.ref.model.numThreads = config.model.numThreads;
c.ref.model.debug = config.model.debug ? 1 : 0;
c.ref.model.provider = config.model.provider.toNativeUtf8();
... ... @@ -301,12 +351,25 @@ class OfflineTts {
c.ref.ruleFars = config.ruleFars.toNativeUtf8();
c.ref.silenceScale = config.silenceScale;
if (SherpaOnnxBindings.createOfflineTts == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr;
if (ptr == nullptr) {
throw Exception("Failed to create offline tts. Please check your config");
}
calloc.free(c.ref.ruleFars);
calloc.free(c.ref.ruleFsts);
calloc.free(c.ref.model.provider);
calloc.free(c.ref.model.kitten.dataDir);
calloc.free(c.ref.model.kitten.tokens);
calloc.free(c.ref.model.kitten.voices);
calloc.free(c.ref.model.kitten.model);
calloc.free(c.ref.model.kokoro.lang);
calloc.free(c.ref.model.kokoro.lexicon);
calloc.free(c.ref.model.kokoro.dictDir);
... ...
... ... @@ -153,9 +153,19 @@ class CircularBuffer {
/// to avoid memory leak.
factory CircularBuffer({required int capacity}) {
  assert(capacity > 0, 'capacity is $capacity');

  // Copy the static binding into a local so the null check below promotes it
  // to a non-nullable function.
  final create = SherpaOnnxBindings.createCircularBuffer;
  if (create == null) {
    throw Exception("Please initialize sherpa-onnx first");
  }

  // A null pointer from the native side means the buffer was not created.
  final buffer = create(capacity);
  if (buffer == nullptr) {
    throw Exception(
        "Failed to create circular buffer. Please check your config");
  }

  return CircularBuffer._(ptr: buffer);
}
... ... @@ -243,10 +253,18 @@ class VoiceActivityDetector {
c.ref.debug = config.debug ? 1 : 0;
if (SherpaOnnxBindings.createVoiceActivityDetector == null) {
throw Exception("Please initialize sherpa-onnx first");
}
final ptr = SherpaOnnxBindings.createVoiceActivityDetector
?.call(c, bufferSizeInSeconds) ??
nullptr;
if (ptr == nullptr) {
throw Exception("Failed to create vad. Please check your config");
}
calloc.free(providerPtr);
calloc.free(tenVadModelPtr);
calloc.free(sileroVadModelPtr);
... ...
... ... @@ -15,6 +15,11 @@ class WaveData {
WaveData readWave(String filename) {
final Pointer<Utf8> str = filename.toNativeUtf8();
if (SherpaOnnxBindings.readWave == null) {
throw Exception("Please initialize sherpa-onnx first");
}
Pointer<SherpaOnnxWave> wave =
SherpaOnnxBindings.readWave?.call(str) ?? nullptr;
calloc.free(str);
... ...
... ... @@ -17,6 +17,10 @@ bool writeWave(
final pList = p.asTypedList(n);
pList.setAll(0, samples);
if (SherpaOnnxBindings.writeWave == null) {
throw Exception("Please initialize sherpa-onnx first");
}
int ok =
SherpaOnnxBindings.writeWave?.call(p, n, sampleRate, filenamePtr) ?? 0;
... ...