Robin Zhong
Committed by GitHub

Add emotion and event results of SenseVoice. (#1257)

* Add emotion and event results of SenseVoice.

* Fix the token-size check and update the Java API.

https://github.com/k2-fsa/sherpa-onnx/pull/1257
@@ -531,6 +531,20 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult( @@ -531,6 +531,20 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult(
531 c_lang[lang.size()] = '\0'; 531 c_lang[lang.size()] = '\0';
532 r->lang = c_lang; 532 r->lang = c_lang;
533 533
  534 + // emotion
  535 + const auto &emotion = result.emotion;
  536 + char *c_emotion = new char[emotion.size() + 1];
  537 + std::copy(emotion.begin(), emotion.end(), c_emotion);
  538 + c_emotion[emotion.size()] = '\0';
  539 + r->emotion = c_emotion;
  540 +
  541 + // event
  542 + const auto &event = result.event;
  543 + char *c_event = new char[event.size() + 1];
  544 + std::copy(event.begin(), event.end(), c_event);
  545 + c_event[event.size()] = '\0';
  546 + r->event = c_event;
  547 +
534 // copy json 548 // copy json
535 std::string json = result.AsJsonString(); 549 std::string json = result.AsJsonString();
536 char *pJson = new char[json.size() + 1]; 550 char *pJson = new char[json.size() + 1];
@@ -588,6 +602,8 @@ void SherpaOnnxDestroyOfflineRecognizerResult( @@ -588,6 +602,8 @@ void SherpaOnnxDestroyOfflineRecognizerResult(
588 delete[] r->tokens_arr; 602 delete[] r->tokens_arr;
589 delete[] r->json; 603 delete[] r->json;
590 delete[] r->lang; 604 delete[] r->lang;
  605 + delete[] r->emotion;
  606 + delete[] r->event;
591 delete r; 607 delete r;
592 } 608 }
593 } 609 }
@@ -544,6 +544,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult { @@ -544,6 +544,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
544 544
545 // return recognized language 545 // return recognized language
546 const char *lang; 546 const char *lang;
  547 +
  548 + // return emotion.
  549 + const char *emotion;
  550 +
  551 + // return event.
  552 + const char *event;
547 } SherpaOnnxOfflineRecognizerResult; 553 } SherpaOnnxOfflineRecognizerResult;
548 554
549 /// Get the result of the offline stream. 555 /// Get the result of the offline stream.
@@ -52,6 +52,13 @@ static OfflineRecognitionResult ConvertSenseVoiceResult( @@ -52,6 +52,13 @@ static OfflineRecognitionResult ConvertSenseVoiceResult(
52 52
53 r.words = std::move(src.words); 53 r.words = std::move(src.words);
54 54
  55 + // parse lang, emotion and event from tokens.
  56 + if (src.tokens.size() >= 3) {
  57 + r.lang = sym_table[src.tokens[0]];
  58 + r.emotion = sym_table[src.tokens[1]];
  59 + r.event = sym_table[src.tokens[2]];
  60 + }
  61 +
55 return r; 62 return r;
56 } 63 }
57 64
@@ -304,6 +304,19 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const { @@ -304,6 +304,19 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const {
304 std::string OfflineRecognitionResult::AsJsonString() const { 304 std::string OfflineRecognitionResult::AsJsonString() const {
305 std::ostringstream os; 305 std::ostringstream os;
306 os << "{"; 306 os << "{";
  307 +
  308 + os << "\"lang\""
  309 + << ": ";
  310 + os << std::quoted(lang) << ", ";
  311 +
  312 + os << "\"emotion\""
  313 + << ": ";
  314 + os << std::quoted(emotion) << ", ";
  315 +
  316 + os << "\"event\""
  317 + << ": ";
  318 + os << std::quoted(event) << ", ";
  319 +
307 os << "\"text\"" 320 os << "\"text\""
308 << ": "; 321 << ": ";
309 os << std::quoted(text) << ", "; 322 os << std::quoted(text) << ", ";
@@ -28,6 +28,12 @@ struct OfflineRecognitionResult { @@ -28,6 +28,12 @@ struct OfflineRecognitionResult {
28 28
29 std::string lang; 29 std::string lang;
30 30
  31 + // emotion target of the audio.
  32 + std::string emotion;
  33 +
  34 + // event target of the audio.
  35 + std::string event;
  36 +
31 /// timestamps.size() == tokens.size() 37 /// timestamps.size() == tokens.size()
32 /// timestamps[i] records the time in seconds when tokens[i] is decoded. 38 /// timestamps[i] records the time in seconds when tokens[i] is decoded.
33 std::vector<float> timestamps; 39 std::vector<float> timestamps;
@@ -41,7 +41,10 @@ public class OfflineRecognizer { @@ -41,7 +41,10 @@ public class OfflineRecognizer {
41 String text = (String) arr[0]; 41 String text = (String) arr[0];
42 String[] tokens = (String[]) arr[1]; 42 String[] tokens = (String[]) arr[1];
43 float[] timestamps = (float[]) arr[2]; 43 float[] timestamps = (float[]) arr[2];
44 - return new OfflineRecognizerResult(text, tokens, timestamps); 44 + String lang = (String) arr[3];
  45 + String emotion = (String) arr[4];
  46 + String event = (String) arr[5];
  47 + return new OfflineRecognizerResult(text, tokens, timestamps, lang, emotion, event);
45 } 48 }
46 49
47 private native void delete(long ptr); 50 private native void delete(long ptr);
@@ -6,11 +6,17 @@ public class OfflineRecognizerResult { @@ -6,11 +6,17 @@ public class OfflineRecognizerResult {
6 private final String text; 6 private final String text;
7 private final String[] tokens; 7 private final String[] tokens;
8 private final float[] timestamps; 8 private final float[] timestamps;
  9 + private final String lang;
  10 + private final String emotion;
  11 + private final String event;
9 12
10 - public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps) { 13 + public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps, String lang, String emotion, String event) {
11 this.text = text; 14 this.text = text;
12 this.tokens = tokens; 15 this.tokens = tokens;
13 this.timestamps = timestamps; 16 this.timestamps = timestamps;
  17 + this.lang = lang;
  18 + this.emotion = emotion;
  19 + this.event = event;
14 } 20 }
15 21
16 public String getText() { 22 public String getText() {
@@ -24,4 +30,16 @@ public class OfflineRecognizerResult { @@ -24,4 +30,16 @@ public class OfflineRecognizerResult {
24 public float[] getTimestamps() { 30 public float[] getTimestamps() {
25 return timestamps; 31 return timestamps;
26 } 32 }
  33 +
  34 + public String getLang() {
  35 + return lang;
  36 + }
  37 +
  38 + public String getEmotion() {
  39 + return emotion;
  40 + }
  41 +
  42 + public String getEvent() {
  43 + return event;
  44 + }
27 } 45 }
@@ -320,8 +320,11 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env, @@ -320,8 +320,11 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
320 // [0]: text, jstring 320 // [0]: text, jstring
321 // [1]: tokens, array of jstring 321 // [1]: tokens, array of jstring
322 // [2]: timestamps, array of float 322 // [2]: timestamps, array of float
  323 + // [3]: lang, jstring
  324 + // [4]: emotion, jstring
  325 + // [5]: event, jstring
323 jobjectArray obj_arr = (jobjectArray)env->NewObjectArray( 326 jobjectArray obj_arr = (jobjectArray)env->NewObjectArray(
324 - 3, env->FindClass("java/lang/Object"), nullptr); 327 + 6, env->FindClass("java/lang/Object"), nullptr);
325 328
326 jstring text = env->NewStringUTF(result.text.c_str()); 329 jstring text = env->NewStringUTF(result.text.c_str());
327 env->SetObjectArrayElement(obj_arr, 0, text); 330 env->SetObjectArrayElement(obj_arr, 0, text);
@@ -344,5 +347,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env, @@ -344,5 +347,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
344 347
345 env->SetObjectArrayElement(obj_arr, 2, timestamps_arr); 348 env->SetObjectArrayElement(obj_arr, 2, timestamps_arr);
346 349
  350 + // [3]: lang, jstring
  351 + // [4]: emotion, jstring
  352 + // [5]: event, jstring
  353 + env->SetObjectArrayElement(obj_arr, 3, env->NewStringUTF(result.lang.c_str()));
  354 + env->SetObjectArrayElement(obj_arr, 4, env->NewStringUTF(result.emotion.c_str()));
  355 + env->SetObjectArrayElement(obj_arr, 5, env->NewStringUTF(result.event.c_str()));
  356 +
347 return obj_arr; 357 return obj_arr;
348 } 358 }
@@ -6,6 +6,9 @@ data class OfflineRecognizerResult( @@ -6,6 +6,9 @@ data class OfflineRecognizerResult(
6 val text: String, 6 val text: String,
7 val tokens: Array<String>, 7 val tokens: Array<String>,
8 val timestamps: FloatArray, 8 val timestamps: FloatArray,
  9 + val lang: String,
  10 + val emotion: String,
  11 + val event: String,
9 ) 12 )
10 13
11 data class OfflineTransducerModelConfig( 14 data class OfflineTransducerModelConfig(
@@ -96,7 +99,10 @@ class OfflineRecognizer( @@ -96,7 +99,10 @@ class OfflineRecognizer(
96 val text = objArray[0] as String 99 val text = objArray[0] as String
97 val tokens = objArray[1] as Array<String> 100 val tokens = objArray[1] as Array<String>
98 val timestamps = objArray[2] as FloatArray 101 val timestamps = objArray[2] as FloatArray
99 - return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps) 102 + val lang = objArray[3] as String
  103 + val emotion = objArray[4] as String
  104 + val event = objArray[5] as String
  105 + return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps, lang = lang, emotion = emotion, event = event)
100 } 106 }
101 107
102 fun decode(stream: OfflineStream) = decode(ptr, stream.ptr) 108 fun decode(stream: OfflineStream) = decode(ptr, stream.ptr)
@@ -32,6 +32,12 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT @@ -32,6 +32,12 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
32 return py::str(PyUnicode_DecodeUTF8(self.text.c_str(), 32 return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
33 self.text.size(), "ignore")); 33 self.text.size(), "ignore"));
34 }) 34 })
  35 + .def_property_readonly("lang",
  36 + [](const PyClass &self) { return self.lang; })
  37 + .def_property_readonly("emotion",
  38 + [](const PyClass &self) { return self.emotion; })
  39 + .def_property_readonly("event",
  40 + [](const PyClass &self) { return self.event; })
35 .def_property_readonly("tokens", 41 .def_property_readonly("tokens",
36 [](const PyClass &self) { return self.tokens; }) 42 [](const PyClass &self) { return self.tokens; })
37 .def_property_readonly("words", 43 .def_property_readonly("words",