Leo Huang
Committed by GitHub

Added tokens, tokens_arr and json for offline recognizer result (#936)

Co-authored-by: leo <webmaster@360converter.com>
@@ -444,14 +444,49 @@ const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult( @@ -444,14 +444,49 @@ const SherpaOnnxOfflineRecognizerResult *GetOfflineStreamResult(
444 pText[text.size()] = 0; 444 pText[text.size()] = 0;
445 r->text = pText; 445 r->text = pText;
446 446
447 - if (!result.timestamps.empty()) {  
448 - r->timestamps = new float[result.timestamps.size()];  
449 - std::copy(result.timestamps.begin(), result.timestamps.end(),  
450 - r->timestamps);  
451 - r->count = result.timestamps.size(); 447 + // copy json
  448 + const auto &json = result.AsJsonString();
  449 + char *pJson = new char[json.size() + 1];
  450 + std::copy(json.begin(), json.end(), pJson);
  451 + pJson[json.size()] = 0;
  452 + r->json = pJson;
  453 +
  454 + // copy tokens
  455 + auto count = result.tokens.size();
  456 + if (count > 0) {
  457 + size_t total_length = 0;
  458 + for (const auto &token : result.tokens) {
  459 + // +1 for the null character at the end of each token
  460 + total_length += token.size() + 1;
  461 + }
  462 +
  463 + r->count = count;
  464 + // Each token ends with a null character ('\0')
  465 + char *tokens = new char[total_length]{};
  466 + char **tokens_temp = new char *[r->count];
  467 + int32_t pos = 0;
  468 + for (int32_t i = 0; i < r->count; ++i) {
  469 + tokens_temp[i] = tokens + pos;
  470 + memcpy(tokens + pos, result.tokens[i].c_str(), result.tokens[i].size());
  471 + // +1 to move past the null character
  472 + pos += result.tokens[i].size() + 1;
  473 + }
  474 + r->tokens_arr = tokens_temp;
  475 +
  476 + if (!result.timestamps.empty()) {
  477 + r->timestamps = new float[r->count];
  478 + std::copy(result.timestamps.begin(), result.timestamps.end(),
  479 + r->timestamps);
  480 + } else {
  481 + r->timestamps = nullptr;
  482 + }
  483 +
  484 + r->tokens = tokens;
452 } else { 485 } else {
453 - r->timestamps = nullptr;  
454 r->count = 0; 486 r->count = 0;
  487 + r->timestamps = nullptr;
  488 + r->tokens = nullptr;
  489 + r->tokens_arr = nullptr;
455 } 490 }
456 491
457 return r; 492 return r;
@@ -462,6 +497,9 @@ void DestroyOfflineRecognizerResult( @@ -462,6 +497,9 @@ void DestroyOfflineRecognizerResult(
462 if (r) { 497 if (r) {
463 delete[] r->text; 498 delete[] r->text;
464 delete[] r->timestamps; 499 delete[] r->timestamps;
  500 + delete[] r->tokens;
  501 + delete[] r->tokens_arr;
  502 + delete[] r->json;
465 delete r; 503 delete r;
466 } 504 }
467 } 505 }
@@ -481,7 +481,27 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult { @@ -481,7 +481,27 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
481 481
482 // number of entries in timestamps 482 // number of entries in timestamps
483 int32_t count; 483 int32_t count;
484 - // TODO(fangjun): Add more fields 484 +
  485 + // Pointer to continuous memory which holds string based tokens
  486 + // which are separated by \0
  487 + const char *tokens;
  488 +
  489 + // an array of `count` pointers; tokens_arr[i] is the address of the i-th token inside tokens
  490 + const char *const *tokens_arr;
  491 +
  492 + /** Return a json string.
  493 + *
  494 + * The returned string contains:
  495 + * {
  496 + * "text": "The recognition result",
  497 + * "tokens": [x, x, x],
  498 + * "timestamps": [x, x, x],
  499 + * "segment": x,
  500 + * "start_time": x,
  501 + * "is_final": true|false
  502 + * }
  503 + */
  504 + const char *json;
485 } SherpaOnnxOfflineRecognizerResult; 505 } SherpaOnnxOfflineRecognizerResult;
486 506
487 /// Get the result of the offline stream. 507 /// Get the result of the offline stream.