Committed by
GitHub
Add emotion, event of SenseVoice. (#1257)
* Add emotion, event of SenseVoice.
* Fix tokens size check and update Java API.

https://github.com/k2-fsa/sherpa-onnx/pull/1257
正在显示 10 个修改的文件，包含 95 行增加和 4 行删除
| @@ -531,6 +531,20 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult( | @@ -531,6 +531,20 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult( | ||
| 531 | c_lang[lang.size()] = '\0'; | 531 | c_lang[lang.size()] = '\0'; |
| 532 | r->lang = c_lang; | 532 | r->lang = c_lang; |
| 533 | 533 | ||
| 534 | + // emotion | ||
| 535 | + const auto &emotion = result.emotion; | ||
| 536 | + char *c_emotion = new char[emotion.size() + 1]; | ||
| 537 | + std::copy(emotion.begin(), emotion.end(), c_emotion); | ||
| 538 | + c_emotion[emotion.size()] = '\0'; | ||
| 539 | + r->emotion = c_emotion; | ||
| 540 | + | ||
| 541 | + // event | ||
| 542 | + const auto &event = result.event; | ||
| 543 | + char *c_event = new char[event.size() + 1]; | ||
| 544 | + std::copy(event.begin(), event.end(), c_event); | ||
| 545 | + c_event[event.size()] = '\0'; | ||
| 546 | + r->event = c_event; | ||
| 547 | + | ||
| 534 | // copy json | 548 | // copy json |
| 535 | std::string json = result.AsJsonString(); | 549 | std::string json = result.AsJsonString(); |
| 536 | char *pJson = new char[json.size() + 1]; | 550 | char *pJson = new char[json.size() + 1]; |
| @@ -588,6 +602,8 @@ void SherpaOnnxDestroyOfflineRecognizerResult( | @@ -588,6 +602,8 @@ void SherpaOnnxDestroyOfflineRecognizerResult( | ||
| 588 | delete[] r->tokens_arr; | 602 | delete[] r->tokens_arr; |
| 589 | delete[] r->json; | 603 | delete[] r->json; |
| 590 | delete[] r->lang; | 604 | delete[] r->lang; |
| 605 | + delete[] r->emotion; | ||
| 606 | + delete[] r->event; | ||
| 591 | delete r; | 607 | delete r; |
| 592 | } | 608 | } |
| 593 | } | 609 | } |
| @@ -544,6 +544,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult { | @@ -544,6 +544,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult { | ||
| 544 | 544 | ||
| 545 | // return recognized language | 545 | // return recognized language |
| 546 | const char *lang; | 546 | const char *lang; |
| 547 | + | ||
| 548 | + // return emotion. | ||
| 549 | + const char *emotion; | ||
| 550 | + | ||
| 551 | + // return event. | ||
| 552 | + const char *event; | ||
| 547 | } SherpaOnnxOfflineRecognizerResult; | 553 | } SherpaOnnxOfflineRecognizerResult; |
| 548 | 554 | ||
| 549 | /// Get the result of the offline stream. | 555 | /// Get the result of the offline stream. |
| @@ -52,6 +52,13 @@ static OfflineRecognitionResult ConvertSenseVoiceResult( | @@ -52,6 +52,13 @@ static OfflineRecognitionResult ConvertSenseVoiceResult( | ||
| 52 | 52 | ||
| 53 | r.words = std::move(src.words); | 53 | r.words = std::move(src.words); |
| 54 | 54 | ||
| 55 | + // parse lang, emotion and event from tokens. | ||
| 56 | + if (src.tokens.size() >= 3) { | ||
| 57 | + r.lang = sym_table[src.tokens[0]]; | ||
| 58 | + r.emotion = sym_table[src.tokens[1]]; | ||
| 59 | + r.event = sym_table[src.tokens[2]]; | ||
| 60 | + } | ||
| 61 | + | ||
| 55 | return r; | 62 | return r; |
| 56 | } | 63 | } |
| 57 | 64 |
| @@ -304,6 +304,19 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const { | @@ -304,6 +304,19 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const { | ||
| 304 | std::string OfflineRecognitionResult::AsJsonString() const { | 304 | std::string OfflineRecognitionResult::AsJsonString() const { |
| 305 | std::ostringstream os; | 305 | std::ostringstream os; |
| 306 | os << "{"; | 306 | os << "{"; |
| 307 | + | ||
| 308 | + os << "\"lang\"" | ||
| 309 | + << ": "; | ||
| 310 | + os << std::quoted(lang) << ", "; | ||
| 311 | + | ||
| 312 | + os << "\"emotion\"" | ||
| 313 | + << ": "; | ||
| 314 | + os << std::quoted(emotion) << ", "; | ||
| 315 | + | ||
| 316 | + os << "\"event\"" | ||
| 317 | + << ": "; | ||
| 318 | + os << std::quoted(event) << ", "; | ||
| 319 | + | ||
| 307 | os << "\"text\"" | 320 | os << "\"text\"" |
| 308 | << ": "; | 321 | << ": "; |
| 309 | os << std::quoted(text) << ", "; | 322 | os << std::quoted(text) << ", "; |
| @@ -28,6 +28,12 @@ struct OfflineRecognitionResult { | @@ -28,6 +28,12 @@ struct OfflineRecognitionResult { | ||
| 28 | 28 | ||
| 29 | std::string lang; | 29 | std::string lang; |
| 30 | 30 | ||
| 31 | + // emotion target of the audio. | ||
| 32 | + std::string emotion; | ||
| 33 | + | ||
| 34 | + // event target of the audio. | ||
| 35 | + std::string event; | ||
| 36 | + | ||
| 31 | /// timestamps.size() == tokens.size() | 37 | /// timestamps.size() == tokens.size() |
| 32 | /// timestamps[i] records the time in seconds when tokens[i] is decoded. | 38 | /// timestamps[i] records the time in seconds when tokens[i] is decoded. |
| 33 | std::vector<float> timestamps; | 39 | std::vector<float> timestamps; |
| @@ -41,7 +41,10 @@ public class OfflineRecognizer { | @@ -41,7 +41,10 @@ public class OfflineRecognizer { | ||
| 41 | String text = (String) arr[0]; | 41 | String text = (String) arr[0]; |
| 42 | String[] tokens = (String[]) arr[1]; | 42 | String[] tokens = (String[]) arr[1]; |
| 43 | float[] timestamps = (float[]) arr[2]; | 43 | float[] timestamps = (float[]) arr[2]; |
| 44 | - return new OfflineRecognizerResult(text, tokens, timestamps); | 44 | + String lang = (String) arr[3]; |
| 45 | + String emotion = (String) arr[4]; | ||
| 46 | + String event = (String) arr[5]; | ||
| 47 | + return new OfflineRecognizerResult(text, tokens, timestamps, lang, emotion, event); | ||
| 45 | } | 48 | } |
| 46 | 49 | ||
| 47 | private native void delete(long ptr); | 50 | private native void delete(long ptr); |
| @@ -6,11 +6,17 @@ public class OfflineRecognizerResult { | @@ -6,11 +6,17 @@ public class OfflineRecognizerResult { | ||
| 6 | private final String text; | 6 | private final String text; |
| 7 | private final String[] tokens; | 7 | private final String[] tokens; |
| 8 | private final float[] timestamps; | 8 | private final float[] timestamps; |
| 9 | + private final String lang; | ||
| 10 | + private final String emotion; | ||
| 11 | + private final String event; | ||
| 9 | 12 | ||
| 10 | - public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps) { | 13 | + public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps, String lang, String emotion, String event) { |
| 11 | this.text = text; | 14 | this.text = text; |
| 12 | this.tokens = tokens; | 15 | this.tokens = tokens; |
| 13 | this.timestamps = timestamps; | 16 | this.timestamps = timestamps; |
| 17 | + this.lang = lang; | ||
| 18 | + this.emotion = emotion; | ||
| 19 | + this.event = event; | ||
| 14 | } | 20 | } |
| 15 | 21 | ||
| 16 | public String getText() { | 22 | public String getText() { |
| @@ -24,4 +30,16 @@ public class OfflineRecognizerResult { | @@ -24,4 +30,16 @@ public class OfflineRecognizerResult { | ||
| 24 | public float[] getTimestamps() { | 30 | public float[] getTimestamps() { |
| 25 | return timestamps; | 31 | return timestamps; |
| 26 | } | 32 | } |
| 33 | + | ||
| 34 | + public String getLang() { | ||
| 35 | + return lang; | ||
| 36 | + } | ||
| 37 | + | ||
| 38 | + public String getEmotion() { | ||
| 39 | + return emotion; | ||
| 40 | + } | ||
| 41 | + | ||
| 42 | + public String getEvent() { | ||
| 43 | + return event; | ||
| 44 | + } | ||
| 27 | } | 45 | } |
| @@ -320,8 +320,11 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env, | @@ -320,8 +320,11 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env, | ||
| 320 | // [0]: text, jstring | 320 | // [0]: text, jstring |
| 321 | // [1]: tokens, array of jstring | 321 | // [1]: tokens, array of jstring |
| 322 | // [2]: timestamps, array of float | 322 | // [2]: timestamps, array of float |
| 323 | + // [3]: lang, jstring | ||
| 324 | + // [4]: emotion, jstring | ||
| 325 | + // [5]: event, jstring | ||
| 323 | jobjectArray obj_arr = (jobjectArray)env->NewObjectArray( | 326 | jobjectArray obj_arr = (jobjectArray)env->NewObjectArray( |
| 324 | - 3, env->FindClass("java/lang/Object"), nullptr); | 327 | + 6, env->FindClass("java/lang/Object"), nullptr); |
| 325 | 328 | ||
| 326 | jstring text = env->NewStringUTF(result.text.c_str()); | 329 | jstring text = env->NewStringUTF(result.text.c_str()); |
| 327 | env->SetObjectArrayElement(obj_arr, 0, text); | 330 | env->SetObjectArrayElement(obj_arr, 0, text); |
| @@ -344,5 +347,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env, | @@ -344,5 +347,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env, | ||
| 344 | 347 | ||
| 345 | env->SetObjectArrayElement(obj_arr, 2, timestamps_arr); | 348 | env->SetObjectArrayElement(obj_arr, 2, timestamps_arr); |
| 346 | 349 | ||
| 350 | + // [3]: lang, jstring | ||
| 351 | + // [4]: emotion, jstring | ||
| 352 | + // [5]: event, jstring | ||
| 353 | + env->SetObjectArrayElement(obj_arr, 3, env->NewStringUTF(result.lang.c_str())); | ||
| 354 | + env->SetObjectArrayElement(obj_arr, 4, env->NewStringUTF(result.emotion.c_str())); | ||
| 355 | + env->SetObjectArrayElement(obj_arr, 5, env->NewStringUTF(result.event.c_str())); | ||
| 356 | + | ||
| 347 | return obj_arr; | 357 | return obj_arr; |
| 348 | } | 358 | } |
| @@ -6,6 +6,9 @@ data class OfflineRecognizerResult( | @@ -6,6 +6,9 @@ data class OfflineRecognizerResult( | ||
| 6 | val text: String, | 6 | val text: String, |
| 7 | val tokens: Array<String>, | 7 | val tokens: Array<String>, |
| 8 | val timestamps: FloatArray, | 8 | val timestamps: FloatArray, |
| 9 | + val lang: String, | ||
| 10 | + val emotion: String, | ||
| 11 | + val event: String, | ||
| 9 | ) | 12 | ) |
| 10 | 13 | ||
| 11 | data class OfflineTransducerModelConfig( | 14 | data class OfflineTransducerModelConfig( |
| @@ -96,7 +99,10 @@ class OfflineRecognizer( | @@ -96,7 +99,10 @@ class OfflineRecognizer( | ||
| 96 | val text = objArray[0] as String | 99 | val text = objArray[0] as String |
| 97 | val tokens = objArray[1] as Array<String> | 100 | val tokens = objArray[1] as Array<String> |
| 98 | val timestamps = objArray[2] as FloatArray | 101 | val timestamps = objArray[2] as FloatArray |
| 99 | - return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps) | 102 | + val lang = objArray[3] as String |
| 103 | + val emotion = objArray[4] as String | ||
| 104 | + val event = objArray[5] as String | ||
| 105 | + return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps, lang = lang, emotion = emotion, event = event) | ||
| 100 | } | 106 | } |
| 101 | 107 | ||
| 102 | fun decode(stream: OfflineStream) = decode(ptr, stream.ptr) | 108 | fun decode(stream: OfflineStream) = decode(ptr, stream.ptr) |
| @@ -32,6 +32,12 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT | @@ -32,6 +32,12 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT | ||
| 32 | return py::str(PyUnicode_DecodeUTF8(self.text.c_str(), | 32 | return py::str(PyUnicode_DecodeUTF8(self.text.c_str(), |
| 33 | self.text.size(), "ignore")); | 33 | self.text.size(), "ignore")); |
| 34 | }) | 34 | }) |
| 35 | + .def_property_readonly("lang", | ||
| 36 | + [](const PyClass &self) { return self.lang; }) | ||
| 37 | + .def_property_readonly("emotion", | ||
| 38 | + [](const PyClass &self) { return self.emotion; }) | ||
| 39 | + .def_property_readonly("event", | ||
| 40 | + [](const PyClass &self) { return self.event; }) | ||
| 35 | .def_property_readonly("tokens", | 41 | .def_property_readonly("tokens", |
| 36 | [](const PyClass &self) { return self.tokens; }) | 42 | [](const PyClass &self) { return self.tokens; }) |
| 37 | .def_property_readonly("words", | 43 | .def_property_readonly("words", |
-
请注册或登录后发表评论