Fangjun Kuang
Committed by GitHub

Add jni interface and kotlin API examples for TTS. (#381)

hs_err*
main.jar
vits-zh-aishell3
... ...
... ... @@ -3,6 +3,28 @@ package com.k2fsa.sherpa.onnx
import android.content.res.AssetManager
fun main() {
testTts()
testAsr()
}
fun testTts() {
var config = OfflineTtsConfig(
model=OfflineTtsModelConfig(
vits=OfflineTtsVitsModelConfig(
model="./vits-zh-aishell3/vits-aishell3.onnx",
lexicon="./vits-zh-aishell3/lexicon.txt",
tokens="./vits-zh-aishell3/tokens.txt",
),
numThreads=1,
debug=true,
)
)
val tts = OfflineTts(config=config)
val audio = tts.generate(text="林美丽最美丽!", sid=99, speed=1.2f)
audio.save(filename="99.wav")
}
fun testAsr() {
var featConfig = FeatureConfig(
sampleRate = 16000,
featureDim = 80,
... ...
// Copyright (c) 2023 Xiaomi Corporation
package com.k2fsa.sherpa.onnx
import android.content.res.AssetManager
data class OfflineTtsVitsModelConfig(
var model: String,
var lexicon: String,
var tokens: String,
var noiseScale: Float = 0.667f,
var noiseScaleW: Float = 0.8f,
var lengthScale: Float = 1.0f,
)
data class OfflineTtsModelConfig(
var vits: OfflineTtsVitsModelConfig,
var numThreads: Int = 1,
var debug: Boolean = false,
var provider: String = "cpu",
)
data class OfflineTtsConfig(
var model: OfflineTtsModelConfig,
)
class GeneratedAudio(
val samples : FloatArray,
val sampleRate: Int,
) {
fun save(filename: String) = saveImpl(filename=filename, samples=samples, sampleRate=sampleRate)
private external fun saveImpl(
filename: String,
samples: FloatArray,
sampleRate: Int
): Boolean
}
class OfflineTts(
assetManager: AssetManager? = null,
var config: OfflineTtsConfig,
) {
private var ptr: Long
init {
if (assetManager != null) {
ptr = new(assetManager, config)
} else {
ptr = newFromFile(config)
}
}
fun generate(
text: String,
sid: Int = 0,
speed: Float = 1.0f
): GeneratedAudio {
var objArray = generateImpl(ptr, text=text, sid=sid, speed=speed)
return GeneratedAudio(samples=objArray[0] as FloatArray,
sampleRate=objArray[1] as Int)
}
fun allocate(assetManager: AssetManager? = null) {
if (ptr == 0L) {
if (assetManager != null) {
ptr = new(assetManager, config)
} else {
ptr = newFromFile(config)
}
}
}
fun free() {
if (ptr != 0L) {
delete(ptr)
ptr = 0
}
}
protected fun finalize() {
delete(ptr)
}
private external fun new(
assetManager: AssetManager,
config: OfflineTtsConfig,
): Long
private external fun newFromFile(
config: OfflineTtsConfig,
): Long
private external fun delete(ptr: Long)
// The returned array has two entries:
// - the first entry is an 1-D float array containing audio samples.
// Each sample is normalized to the range [-1, 1]
// - the second entry is the sample rate
external fun generateImpl(
ptr: Long,
text: String,
sid: Int = 0,
speed: Float = 1.0f
): Array<Any>
companion object {
init {
System.loadLibrary("sherpa-onnx-jni")
}
}
}
... ...
... ... @@ -6,11 +6,13 @@
set -e
cd ..
mkdir -p build
cd build
cmake \
if [ ! -f ../build/lib/libsherpa-onnx-jni.dylib ]; then
cmake \
-DSHERPA_ONNX_ENABLE_PYTHON=OFF \
-DSHERPA_ONNX_ENABLE_TESTS=OFF \
-DSHERPA_ONNX_ENABLE_CHECK=OFF \
... ... @@ -19,8 +21,9 @@ cmake \
-DSHERPA_ONNX_ENABLE_JNI=ON \
..
make -j4
ls -lh lib
make -j4
ls -lh lib
fi
export LD_LIBRARY_PATH=$PWD/build/lib:$LD_LIBRARY_PATH
... ... @@ -31,7 +34,7 @@ if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
fi
kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt
kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt Tts.kt
ls -lh main.jar
... ...
... ... @@ -10,7 +10,15 @@
#include <sstream>
#include <utility>
#if __ANDROID_API__ >= 9
#include <strstream>
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/text-utils.h"
namespace sherpa_onnx {
... ... @@ -22,11 +30,9 @@ static void ToLowerCase(std::string *in_out) {
// Note: We don't use SymbolTable here since tokens may contain a blank
// in the first column
static std::unordered_map<std::string, int32_t> ReadTokens(
const std::string &tokens) {
static std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) {
std::unordered_map<std::string, int32_t> token2id;
std::ifstream is(tokens);
std::string line;
std::string sym;
... ... @@ -80,11 +86,43 @@ Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
bool debug /*= false*/)
: debug_(debug) {
InitLanguage(language);
InitTokens(tokens);
InitLexicon(lexicon);
{
std::ifstream is(tokens);
InitTokens(is);
}
{
std::ifstream is(lexicon);
InitLexicon(is);
}
InitPunctuations(punctuations);
}
#if __ANDROID_API__ >= 9
Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon,
const std::string &tokens, const std::string &punctuations,
const std::string &language, bool debug /*= false*/)
: debug_(debug) {
InitLanguage(language);
{
auto buf = ReadFile(mgr, tokens);
std::istrstream is(buf.data(), buf.size());
InitTokens(is);
}
{
auto buf = ReadFile(mgr, lexicon);
std::istrstream is(buf.data(), buf.size());
InitLexicon(is);
}
InitPunctuations(punctuations);
}
#endif
std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
const std::string &text) const {
switch (language_) {
... ... @@ -192,9 +230,7 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
return ans;
}
void Lexicon::InitTokens(const std::string &tokens) {
token2id_ = ReadTokens(tokens);
}
void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); }
void Lexicon::InitLanguage(const std::string &_lang) {
std::string lang(_lang);
... ... @@ -209,9 +245,7 @@ void Lexicon::InitLanguage(const std::string &_lang) {
}
}
void Lexicon::InitLexicon(const std::string &lexicon) {
std::ifstream is(lexicon);
void Lexicon::InitLexicon(std::istream &is) {
std::string word;
std::vector<std::string> token_list;
std::string line;
... ...
... ... @@ -6,11 +6,17 @@
#define SHERPA_ONNX_CSRC_LEXICON_H_
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
namespace sherpa_onnx {
// TODO(fangjun): Refactor it to an abstract class
... ... @@ -20,6 +26,12 @@ class Lexicon {
const std::string &punctuations, const std::string &language,
bool debug = false);
#if __ANDROID_API__ >= 9
Lexicon(AAssetManager *mgr, const std::string &lexicon,
const std::string &tokens, const std::string &punctuations,
const std::string &language, bool debug = false);
#endif
std::vector<int64_t> ConvertTextToTokenIds(const std::string &text) const;
private:
... ... @@ -30,8 +42,8 @@ class Lexicon {
const std::string &text) const;
void InitLanguage(const std::string &lang);
void InitTokens(const std::string &tokens);
void InitLexicon(const std::string &lexicon);
void InitTokens(std::istream &is);
void InitLexicon(std::istream &is);
void InitPunctuations(const std::string &punctuations);
private:
... ...
... ... @@ -16,4 +16,12 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
return std::make_unique<OfflineTtsVitsImpl>(config);
}
#if __ANDROID_API__ >= 9
std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
AAssetManager *mgr, const OfflineTtsConfig &config) {
// TODO(fangjun): Support other types
return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
}
#endif
} // namespace sherpa_onnx
... ...
... ... @@ -8,6 +8,11 @@
#include <memory>
#include <string>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/offline-tts.h"
namespace sherpa_onnx {
... ... @@ -18,6 +23,11 @@ class OfflineTtsImpl {
static std::unique_ptr<OfflineTtsImpl> Create(const OfflineTtsConfig &config);
#if __ANDROID_API__ >= 9
static std::unique_ptr<OfflineTtsImpl> Create(AAssetManager *mgr,
const OfflineTtsConfig &config);
#endif
virtual GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const = 0;
};
... ...
... ... @@ -9,6 +9,11 @@
#include <utility>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/lexicon.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-tts-impl.h"
... ... @@ -24,6 +29,14 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
model_->Punctuations(), model_->Language(),
config.model.debug) {}
#if __ANDROID_API__ >= 9
OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config)
: model_(std::make_unique<OfflineTtsVitsModel>(mgr, config.model)),
lexicon_(mgr, config.model.vits.lexicon, config.model.vits.tokens,
model_->Punctuations(), model_->Language(),
config.model.debug) {}
#endif
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const override {
int32_t num_speakers = model_->NumSpeakers();
... ...
... ... @@ -26,6 +26,17 @@ class OfflineTtsVitsModel::Impl {
Init(buf.data(), buf.size());
}
#if __ANDROID_API__ >= 9
Impl(AAssetManager *mgr, const OfflineTtsModelConfig &config)
: config_(config),
env_(ORT_LOGGING_LEVEL_WARNING),
sess_opts_(GetSessionOptions(config)),
allocator_{} {
auto buf = ReadFile(mgr, config.vits.model);
Init(buf.data(), buf.size());
}
#endif
Ort::Value Run(Ort::Value x, int64_t sid, float speed) {
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
... ... @@ -141,6 +152,12 @@ class OfflineTtsVitsModel::Impl {
OfflineTtsVitsModel::OfflineTtsVitsModel(const OfflineTtsModelConfig &config)
: impl_(std::make_unique<Impl>(config)) {}
#if __ANDROID_API__ >= 9
OfflineTtsVitsModel::OfflineTtsVitsModel(AAssetManager *mgr,
const OfflineTtsModelConfig &config)
: impl_(std::make_unique<Impl>(mgr, config)) {}
#endif
OfflineTtsVitsModel::~OfflineTtsVitsModel() = default;
Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/,
... ...
... ... @@ -8,6 +8,11 @@
#include <memory>
#include <string>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "onnxruntime_cxx_api.h" // NOLINT
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
... ... @@ -18,6 +23,9 @@ class OfflineTtsVitsModel {
~OfflineTtsVitsModel();
explicit OfflineTtsVitsModel(const OfflineTtsModelConfig &config);
#if __ANDROID_API__ >= 9
OfflineTtsVitsModel(AAssetManager *mgr, const OfflineTtsModelConfig &config);
#endif
/** Run the model.
*
... ...
... ... @@ -26,6 +26,11 @@ std::string OfflineTtsConfig::ToString() const {
OfflineTts::OfflineTts(const OfflineTtsConfig &config)
: impl_(OfflineTtsImpl::Create(config)) {}
#if __ANDROID_API__ >= 9
OfflineTts::OfflineTts(AAssetManager *mgr, const OfflineTtsConfig &config)
: impl_(OfflineTtsImpl::Create(mgr, config)) {}
#endif
OfflineTts::~OfflineTts() = default;
GeneratedAudio OfflineTts::Generate(const std::string &text, int64_t sid /*=0*/,
... ...
... ... @@ -9,6 +9,11 @@
#include <string>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
#include "sherpa-onnx/csrc/parse-options.h"
... ... @@ -38,6 +43,11 @@ class OfflineTts {
public:
~OfflineTts();
explicit OfflineTts(const OfflineTtsConfig &config);
#if __ANDROID_API__ >= 9
OfflineTts(AAssetManager *mgr, const OfflineTtsConfig &config);
#endif
// @param text A string containing words separated by spaces
// @param sid Speaker ID. Used only for multi-speaker models, e.g., models
// trained using the VCTK dataset. It is not used for
... ...
... ... @@ -7,12 +7,13 @@
#include <cassert>
#include <fstream>
#include <sstream>
#include <strstream>
#include "sherpa-onnx/csrc/base64-decode.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#if __ANDROID_API__ >= 9
#include <strstream>
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
... ...
... ... @@ -21,10 +21,12 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/voice-activity-detector.h"
#include "sherpa-onnx/csrc/wave-reader.h"
#include "sherpa-onnx/csrc/wave-writer.h"
#define SHERPA_ONNX_EXTERN_C extern "C"
... ... @@ -124,7 +126,7 @@ class SherpaOnnxVad {
void Pop() { vad_.Pop(); }
void Clear() { vad_.Clear();}
void Clear() { vad_.Clear(); }
const SpeechSegment &Front() const { return vad_.Front(); }
... ... @@ -491,9 +493,173 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
return ans;
}
class SherpaOnnxOfflineTts {
public:
#if __ANDROID_API__ >= 9
SherpaOnnxOfflineTts(AAssetManager *mgr, const OfflineTtsConfig &config)
: tts_(mgr, config) {}
#endif
explicit SherpaOnnxOfflineTts(const OfflineTtsConfig &config)
: tts_(config) {}
GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
float speed = 1.0) const {
return tts_.Generate(text, sid, speed);
}
private:
OfflineTts tts_;
};
static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) {
OfflineTtsConfig ans;
jclass cls = env->GetObjectClass(config);
jfieldID fid;
fid = env->GetFieldID(cls, "model",
"Lcom/k2fsa/sherpa/onnx/OfflineTtsModelConfig;");
jobject model = env->GetObjectField(config, fid);
jclass model_config_cls = env->GetObjectClass(model);
fid = env->GetFieldID(model_config_cls, "vits",
"Lcom/k2fsa/sherpa/onnx/OfflineTtsVitsModelConfig;");
jobject vits = env->GetObjectField(model, fid);
jclass vits_cls = env->GetObjectClass(vits);
fid = env->GetFieldID(vits_cls, "model", "Ljava/lang/String;");
jstring s = (jstring)env->GetObjectField(vits, fid);
const char *p = env->GetStringUTFChars(s, nullptr);
ans.model.vits.model = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(vits_cls, "lexicon", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(vits, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.vits.lexicon = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(vits_cls, "tokens", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(vits, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.vits.tokens = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(vits_cls, "noiseScale", "F");
ans.model.vits.noise_scale = env->GetFloatField(vits, fid);
fid = env->GetFieldID(vits_cls, "noiseScaleW", "F");
ans.model.vits.noise_scale_w = env->GetFloatField(vits, fid);
fid = env->GetFieldID(vits_cls, "lengthScale", "F");
ans.model.vits.length_scale = env->GetFloatField(vits, fid);
fid = env->GetFieldID(model_config_cls, "numThreads", "I");
ans.model.num_threads = env->GetIntField(model, fid);
fid = env->GetFieldID(model_config_cls, "debug", "Z");
ans.model.debug = env->GetBooleanField(model, fid);
fid = env->GetFieldID(model_config_cls, "provider", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(model, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model.provider = p;
env->ReleaseStringUTFChars(s, p);
return ans;
}
} // namespace sherpa_onnx
SHERPA_ONNX_EXTERN_C
JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new(
JNIEnv *env, jobject /*obj*/, jobject asset_manager, jobject _config) {
#if __ANDROID_API__ >= 9
AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager);
if (!mgr) {
SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr);
}
#endif
auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config);
SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str());
auto tts = new sherpa_onnx::SherpaOnnxOfflineTts(
#if __ANDROID_API__ >= 9
mgr,
#endif
config);
return (jlong)tts;
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_newFromFile(
JNIEnv *env, jobject /*obj*/, jobject _config) {
auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config);
SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str());
auto tts = new sherpa_onnx::SherpaOnnxOfflineTts(config);
return (jlong)tts;
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_delete(
JNIEnv *env, jobject /*obj*/, jlong ptr) {
delete reinterpret_cast<sherpa_onnx::SherpaOnnxOfflineTts *>(ptr);
}
// see
// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
static jobject NewInteger(JNIEnv *env, int32_t value) {
jclass cls = env->FindClass("java/lang/Integer");
jmethodID constructor = env->GetMethodID(cls, "<init>", "(I)V");
return env->NewObject(cls, constructor, value);
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT jobjectArray JNICALL
Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/,
jlong ptr, jstring text,
jint sid, jfloat speed) {
const char *p_text = env->GetStringUTFChars(text, nullptr);
SHERPA_ONNX_LOGE("string is: %s", p_text);
auto audio =
reinterpret_cast<sherpa_onnx::SherpaOnnxOfflineTts *>(ptr)->Generate(
p_text, sid, speed);
jfloatArray samples_arr = env->NewFloatArray(audio.samples.size());
env->SetFloatArrayRegion(samples_arr, 0, audio.samples.size(),
audio.samples.data());
jobjectArray obj_arr = (jobjectArray)env->NewObjectArray(
2, env->FindClass("java/lang/Object"), nullptr);
env->SetObjectArrayElement(obj_arr, 0, samples_arr);
env->SetObjectArrayElement(obj_arr, 1, NewInteger(env, audio.sample_rate));
env->ReleaseStringUTFChars(text, p_text);
return obj_arr;
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT jboolean JNICALL Java_com_k2fsa_sherpa_onnx_GeneratedAudio_saveImpl(
JNIEnv *env, jobject /*obj*/, jstring filename, jfloatArray samples,
jint sample_rate) {
const char *p_filename = env->GetStringUTFChars(filename, nullptr);
jfloat *p = env->GetFloatArrayElements(samples, nullptr);
jsize n = env->GetArrayLength(samples);
bool ok = sherpa_onnx::WriteWave(p_filename, sample_rate, p, n);
env->ReleaseStringUTFChars(filename, p_filename);
env->ReleaseFloatArrayElements(samples, p, JNI_ABORT);
return ok;
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_Vad_new(
JNIEnv *env, jobject /*obj*/, jobject asset_manager, jobject _config) {
#if __ANDROID_API__ >= 9
... ... @@ -513,6 +679,7 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_Vad_new(
return (jlong)model;
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_Vad_newFromFile(
JNIEnv *env, jobject /*obj*/, jobject _config) {
auto config = sherpa_onnx::GetVadModelConfig(env, _config);
... ... @@ -566,14 +733,6 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_clear(JNIEnv *env,
model->Clear();
}
// see
// https://stackoverflow.com/questions/29043872/android-jni-return-multiple-variables
static jobject NewInteger(JNIEnv *env, int32_t value) {
jclass cls = env->FindClass("java/lang/Integer");
jmethodID constructor = env->GetMethodID(cls, "<init>", "(I)V");
return env->NewObject(cls, constructor, value);
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT jobjectArray JNICALL
Java_com_k2fsa_sherpa_onnx_Vad_front(JNIEnv *env, jobject /*obj*/, jlong ptr) {
... ...