Robin Zhong
Committed by GitHub

Add emotion, event of SenseVoice. (#1257)

* Add emotion, event of SenseVoice.

* Fix tokens size check and update java api.

https://github.com/k2-fsa/sherpa-onnx/pull/1257
... ... @@ -531,6 +531,20 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult(
c_lang[lang.size()] = '\0';
r->lang = c_lang;
// emotion
const auto &emotion = result.emotion;
char *c_emotion = new char[emotion.size() + 1];
std::copy(emotion.begin(), emotion.end(), c_emotion);
c_emotion[emotion.size()] = '\0';
r->emotion = c_emotion;
// event
const auto &event = result.event;
char *c_event = new char[event.size() + 1];
std::copy(event.begin(), event.end(), c_event);
c_event[event.size()] = '\0';
r->event = c_event;
// copy json
std::string json = result.AsJsonString();
char *pJson = new char[json.size() + 1];
... ... @@ -588,6 +602,8 @@ void SherpaOnnxDestroyOfflineRecognizerResult(
delete[] r->tokens_arr;
delete[] r->json;
delete[] r->lang;
delete[] r->emotion;
delete[] r->event;
delete r;
}
}
... ...
... ... @@ -544,6 +544,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
// return recognized language
const char *lang;
// Emotion label as a NUL-terminated string; freed by SherpaOnnxDestroyOfflineRecognizerResult().
const char *emotion;
// Audio event label as a NUL-terminated string; freed by SherpaOnnxDestroyOfflineRecognizerResult().
const char *event;
} SherpaOnnxOfflineRecognizerResult;
/// Get the result of the offline stream.
... ...
... ... @@ -52,6 +52,13 @@ static OfflineRecognitionResult ConvertSenseVoiceResult(
r.words = std::move(src.words);
// parse lang, emotion and event from tokens.
if (src.tokens.size() >= 3) {
r.lang = sym_table[src.tokens[0]];
r.emotion = sym_table[src.tokens[1]];
r.event = sym_table[src.tokens[2]];
}
return r;
}
... ...
... ... @@ -304,6 +304,19 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const {
std::string OfflineRecognitionResult::AsJsonString() const {
std::ostringstream os;
os << "{";
os << "\"lang\""
<< ": ";
os << std::quoted(lang) << ", ";
os << "\"emotion\""
<< ": ";
os << std::quoted(emotion) << ", ";
os << "\"event\""
<< ": ";
os << std::quoted(event) << ", ";
os << "\"text\""
<< ": ";
os << std::quoted(text) << ", ";
... ...
... ... @@ -28,6 +28,12 @@ struct OfflineRecognitionResult {
std::string lang;
// Emotion label of the audio (for SenseVoice, looked up from the second output token).
std::string emotion;
// Audio event label of the audio (for SenseVoice, looked up from the third output token).
std::string event;
/// timestamps.size() == tokens.size()
/// timestamps[i] records the time in seconds when tokens[i] is decoded.
std::vector<float> timestamps;
... ...
... ... @@ -41,7 +41,10 @@ public class OfflineRecognizer {
String text = (String) arr[0];
String[] tokens = (String[]) arr[1];
float[] timestamps = (float[]) arr[2];
return new OfflineRecognizerResult(text, tokens, timestamps);
String lang = (String) arr[3];
String emotion = (String) arr[4];
String event = (String) arr[5];
return new OfflineRecognizerResult(text, tokens, timestamps, lang, emotion, event);
}
// Native (JNI) method that receives a raw pointer value as a long.
// NOTE(review): presumably releases the native recognizer behind ptr; confirm against the JNI implementation.
private native void delete(long ptr);
... ...
... ... @@ -6,11 +6,17 @@ public class OfflineRecognizerResult {
private final String text;
private final String[] tokens;
private final float[] timestamps;
private final String lang;
private final String emotion;
private final String event;
public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps) {
/**
 * Builds an immutable result holder from the individual recognition outputs.
 *
 * <p>The {@code tokens} and {@code timestamps} arrays are stored by reference;
 * no defensive copy is made.
 *
 * @param text       recognized text
 * @param tokens     recognized tokens
 * @param timestamps timestamps of the result
 * @param lang       language string of the result
 * @param emotion    emotion string of the result
 * @param event      event string of the result
 */
public OfflineRecognizerResult(
        String text,
        String[] tokens,
        float[] timestamps,
        String lang,
        String emotion,
        String event) {
    // Metadata strings first, then the transcript payload; the assignments are independent.
    this.lang = lang;
    this.emotion = emotion;
    this.event = event;

    this.text = text;
    this.tokens = tokens;
    this.timestamps = timestamps;
}
public String getText() {
... ... @@ -24,4 +30,16 @@ public class OfflineRecognizerResult {
/**
 * Returns the timestamps array held by this result.
 *
 * <p>The stored array itself is returned, not a copy.
 */
public float[] getTimestamps() {
    return this.timestamps;
}
/** Returns the language string that was passed to the constructor. */
public String getLang() {
    return this.lang;
}
/** Returns the emotion string that was passed to the constructor. */
public String getEmotion() {
    return this.emotion;
}
/** Returns the event string that was passed to the constructor. */
public String getEvent() {
    return this.event;
}
}
... ...
... ... @@ -320,8 +320,11 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
// [0]: text, jstring
// [1]: tokens, array of jstring
// [2]: timestamps, array of float
// [3]: lang, jstring
// [4]: emotion, jstring
// [5]: event, jstring
jobjectArray obj_arr = (jobjectArray)env->NewObjectArray(
3, env->FindClass("java/lang/Object"), nullptr);
6, env->FindClass("java/lang/Object"), nullptr);
jstring text = env->NewStringUTF(result.text.c_str());
env->SetObjectArrayElement(obj_arr, 0, text);
... ... @@ -344,5 +347,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
env->SetObjectArrayElement(obj_arr, 2, timestamps_arr);
// [3]: lang, jstring
// [4]: emotion, jstring
// [5]: event, jstring
env->SetObjectArrayElement(obj_arr, 3, env->NewStringUTF(result.lang.c_str()));
env->SetObjectArrayElement(obj_arr, 4, env->NewStringUTF(result.emotion.c_str()));
env->SetObjectArrayElement(obj_arr, 5, env->NewStringUTF(result.event.c_str()));
return obj_arr;
}
... ...
... ... @@ -6,6 +6,9 @@ data class OfflineRecognizerResult(
val text: String,
val tokens: Array<String>,
val timestamps: FloatArray,
val lang: String,
val emotion: String,
val event: String,
)
data class OfflineTransducerModelConfig(
... ... @@ -96,7 +99,10 @@ class OfflineRecognizer(
val text = objArray[0] as String
val tokens = objArray[1] as Array<String>
val timestamps = objArray[2] as FloatArray
return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps)
val lang = objArray[3] as String
val emotion = objArray[4] as String
val event = objArray[5] as String
return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps, lang = lang, emotion = emotion, event = event)
}
/**
 * Runs `decode` for [stream] by forwarding this recognizer's `ptr` and the
 * stream's `ptr` to the two-argument `decode` overload; the return type is
 * whatever that overload returns.
 *
 * NOTE(review): both `ptr` values are presumably native handles; confirm.
 */
fun decode(stream: OfflineStream) = decode(ptr, stream.ptr)
... ...
... ... @@ -32,6 +32,12 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
self.text.size(), "ignore"));
})
.def_property_readonly("lang",
[](const PyClass &self) { return self.lang; })
.def_property_readonly("emotion",
[](const PyClass &self) { return self.emotion; })
.def_property_readonly("event",
[](const PyClass &self) { return self.event; })
.def_property_readonly("tokens",
[](const PyClass &self) { return self.tokens; })
.def_property_readonly("words",
... ...