Fangjun Kuang
Committed by GitHub

C API for speaker diarization (#1402)

@@ -120,3 +120,4 @@ vits-melo-tts-zh_en @@ -120,3 +120,4 @@ vits-melo-tts-zh_en
120 sherpa-onnx-online-punct-en-2024-08-06 120 sherpa-onnx-online-punct-en-2024-08-06
121 *.mp4 121 *.mp4
122 *.mp3 122 *.mp3
  123 +sherpa-onnx-pyannote-segmentation-3-0
1 ### Supported functions 1 ### Supported functions
2 2
3 -|Speech recognition| Speech synthesis | Speaker verification | Speaker identification |  
4 -|------------------|------------------|----------------------|------------------------|  
5 -| ✔️ | ✔️ | ✔️ | ✔️ | 3 +|Speech recognition| Speech synthesis |
  4 +|------------------|------------------|
  5 +| ✔️ | ✔️ |
  6 +
  7 +|Speaker identification| Speaker diarization | Speaker verification |
  8 +|----------------------|-------------------- |------------------------|
  9 +| ✔️ | ✔️ | ✔️ |
6 10
7 | Spoken Language identification | Audio tagging | Voice activity detection | 11 | Spoken Language identification | Audio tagging | Voice activity detection |
8 |--------------------------------|---------------|--------------------------| 12 |--------------------------------|---------------|--------------------------|
@@ -47,6 +51,7 @@ This repository supports running the following functions **locally** @@ -47,6 +51,7 @@ This repository supports running the following functions **locally**
47 51
48 - Speech-to-text (i.e., ASR); both streaming and non-streaming are supported 52 - Speech-to-text (i.e., ASR); both streaming and non-streaming are supported
49 - Text-to-speech (i.e., TTS) 53 - Text-to-speech (i.e., TTS)
  54 + - Speaker diarization
50 - Speaker identification 55 - Speaker identification
51 - Speaker verification 56 - Speaker verification
52 - Spoken language identification 57 - Spoken language identification
@@ -9,6 +9,11 @@ if(SHERPA_ONNX_ENABLE_TTS) @@ -9,6 +9,11 @@ if(SHERPA_ONNX_ENABLE_TTS)
9 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) 9 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)
10 endif() 10 endif()
11 11
# Build the speaker diarization C API example only when
# SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION is turned on. The example links
# against the sherpa-onnx C API library.
if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION)
  add_executable(offline-speaker-diarization-c-api offline-speaker-diarization-c-api.c)
  target_link_libraries(offline-speaker-diarization-c-api sherpa-onnx-c-api)
endif()
  16 +
12 add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c) 17 add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c)
13 target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) 18 target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
14 19
  1 +// c-api-examples/offline-speaker-diarization-c-api.c
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
  5 +//
  6 +// This file demonstrates how to implement speaker diarization with
  7 +// sherpa-onnx's C API.
  8 +
  9 +// clang-format off
  10 +/*
  11 +Usage:
  12 +
  13 +Step 1: Download a speaker segmentation model
  14 +
  15 +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  16 +for a list of available models. The following is an example
  17 +
  18 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  19 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  20 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  21 +
  22 +Step 2: Download a speaker embedding extractor model
  23 +
  24 +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  25 +for a list of available models. The following is an example
  26 +
  27 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
  28 +
  29 +Step 3. Download test wave files
  30 +
  31 +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
  32 +for a list of available test wave files. The following is an example
  33 +
  34 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
  35 +
  36 +Step 4. Run it
  37 +
  38 + */
  39 +// clang-format on
  40 +
  41 +#include <stdio.h>
  42 +#include <string.h>
  43 +
  44 +#include "sherpa-onnx/c-api/c-api.h"
  45 +
  46 +static int32_t ProgressCallback(int32_t num_processed_chunks,
  47 + int32_t num_total_chunks, void *arg) {
  48 + float progress = 100.0 * num_processed_chunks / num_total_chunks;
  49 + fprintf(stderr, "progress %.2f%%\n", progress);
  50 +
  51 + // the return value is currently ignored
  52 + return 0;
  53 +}
  54 +
  55 +int main() {
  56 + // Please see the comments at the start of this file for how to download
  57 + // the .onnx file and .wav files below
  58 + const char *segmentation_model =
  59 + "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx";
  60 +
  61 + const char *embedding_extractor_model =
  62 + "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx";
  63 +
  64 + const char *wav_filename = "./0-four-speakers-zh.wav";
  65 +
  66 + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  67 + if (wave == NULL) {
  68 + fprintf(stderr, "Failed to read %s\n", wav_filename);
  69 + return -1;
  70 + }
  71 +
  72 + SherpaOnnxOfflineSpeakerDiarizationConfig config;
  73 + memset(&config, 0, sizeof(config));
  74 +
  75 + config.segmentation.pyannote.model = segmentation_model;
  76 + config.embedding.model = embedding_extractor_model;
  77 +
  78 + // the test wave ./0-four-speakers-zh.wav has 4 speakers, so
  79 + // we set num_clusters to 4
  80 + //
  81 + config.clustering.num_clusters = 4;
  82 + // If you don't know the number of speakers in the test wave file, please
  83 + // use
  84 + // config.clustering.threshold = 0.5; // You need to tune this threshold
  85 +
  86 + const SherpaOnnxOfflineSpeakerDiarization *sd =
  87 + SherpaOnnxCreateOfflineSpeakerDiarization(&config);
  88 +
  89 + if (!sd) {
  90 + fprintf(stderr, "Failed to initialize offline speaker diarization\n");
  91 + return -1;
  92 + }
  93 +
  94 + if (SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd) !=
  95 + wave->sample_rate) {
  96 + fprintf(
  97 + stderr,
  98 + "Expected sample rate: %d. Actual sample rate from the wave file: %d\n",
  99 + SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd),
  100 + wave->sample_rate);
  101 + goto failed;
  102 + }
  103 +
  104 + const SherpaOnnxOfflineSpeakerDiarizationResult *result =
  105 + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(
  106 + sd, wave->samples, wave->num_samples, ProgressCallback, NULL);
  107 + if (!result) {
  108 + fprintf(stderr, "Failed to do speaker diarization");
  109 + goto failed;
  110 + }
  111 +
  112 + int32_t num_segments =
  113 + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result);
  114 +
  115 + const SherpaOnnxOfflineSpeakerDiarizationSegment *segments =
  116 + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result);
  117 +
  118 + for (int32_t i = 0; i != num_segments; ++i) {
  119 + fprintf(stderr, "%.3f -- %.3f speaker_%02d\n", segments[i].start,
  120 + segments[i].end, segments[i].speaker);
  121 + }
  122 +
  123 +failed:
  124 +
  125 + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments);
  126 + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result);
  127 + SherpaOnnxDestroyOfflineSpeakerDiarization(sd);
  128 + SherpaOnnxFreeWave(wave);
  129 +
  130 + return 0;
  131 +}
@@ -31,6 +31,10 @@ @@ -31,6 +31,10 @@
31 #include "sherpa-onnx/csrc/offline-tts.h" 31 #include "sherpa-onnx/csrc/offline-tts.h"
32 #endif 32 #endif
33 33
  34 +#if SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION == 1
  35 +#include "sherpa-onnx/csrc/offline-speaker-diarization.h"
  36 +#endif
  37 +
34 struct SherpaOnnxOnlineRecognizer { 38 struct SherpaOnnxOnlineRecognizer {
35 std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl; 39 std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl;
36 }; 40 };
@@ -1670,3 +1674,144 @@ void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { @@ -1670,3 +1674,144 @@ void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
1670 int32_t SherpaOnnxFileExists(const char *filename) { 1674 int32_t SherpaOnnxFileExists(const char *filename) {
1671 return sherpa_onnx::FileExists(filename); 1675 return sherpa_onnx::FileExists(filename);
1672 } 1676 }
  1677 +
  1678 +#if SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION == 1
  1679 +
// Definition of the handle type that c-api.h only forward-declares.
// It owns the wrapped C++ diarization object through a unique_ptr.
struct SherpaOnnxOfflineSpeakerDiarization {
  std::unique_ptr<sherpa_onnx::OfflineSpeakerDiarization> impl;
};
  1683 +
// Definition of the result handle type that c-api.h only forward-declares.
// It stores the C++ diarization result by value.
struct SherpaOnnxOfflineSpeakerDiarizationResult {
  sherpa_onnx::OfflineSpeakerDiarizationResult impl;
};
  1687 +
  1688 +const SherpaOnnxOfflineSpeakerDiarization *
  1689 +SherpaOnnxCreateOfflineSpeakerDiarization(
  1690 + const SherpaOnnxOfflineSpeakerDiarizationConfig *config) {
  1691 + sherpa_onnx::OfflineSpeakerDiarizationConfig sd_config;
  1692 +
  1693 + sd_config.segmentation.pyannote.model =
  1694 + SHERPA_ONNX_OR(config->segmentation.pyannote.model, "");
  1695 + sd_config.segmentation.num_threads =
  1696 + SHERPA_ONNX_OR(config->segmentation.num_threads, 1);
  1697 + sd_config.segmentation.debug = config->segmentation.debug;
  1698 + sd_config.segmentation.provider =
  1699 + SHERPA_ONNX_OR(config->segmentation.provider, "cpu");
  1700 + if (sd_config.segmentation.provider.empty()) {
  1701 + sd_config.segmentation.provider = "cpu";
  1702 + }
  1703 +
  1704 + sd_config.embedding.model = SHERPA_ONNX_OR(config->embedding.model, "");
  1705 + sd_config.embedding.num_threads =
  1706 + SHERPA_ONNX_OR(config->embedding.num_threads, 1);
  1707 + sd_config.embedding.debug = config->embedding.debug;
  1708 + sd_config.embedding.provider =
  1709 + SHERPA_ONNX_OR(config->embedding.provider, "cpu");
  1710 + if (sd_config.embedding.provider.empty()) {
  1711 + sd_config.embedding.provider = "cpu";
  1712 + }
  1713 +
  1714 + sd_config.clustering.num_clusters =
  1715 + SHERPA_ONNX_OR(config->clustering.num_clusters, -1);
  1716 +
  1717 + sd_config.clustering.threshold =
  1718 + SHERPA_ONNX_OR(config->clustering.threshold, 0.5);
  1719 +
  1720 + sd_config.min_duration_on = SHERPA_ONNX_OR(config->min_duration_on, 0.3);
  1721 +
  1722 + sd_config.min_duration_off = SHERPA_ONNX_OR(config->min_duration_off, 0.5);
  1723 +
  1724 + if (!sd_config.Validate()) {
  1725 + SHERPA_ONNX_LOGE("Errors in config");
  1726 + return nullptr;
  1727 + }
  1728 +
  1729 + SherpaOnnxOfflineSpeakerDiarization *sd =
  1730 + new SherpaOnnxOfflineSpeakerDiarization;
  1731 +
  1732 + sd->impl =
  1733 + std::make_unique<sherpa_onnx::OfflineSpeakerDiarization>(sd_config);
  1734 +
  1735 + if (sd_config.segmentation.debug || sd_config.embedding.debug) {
  1736 + SHERPA_ONNX_LOGE("%s\n", sd_config.ToString().c_str());
  1737 + }
  1738 +
  1739 + return sd;
  1740 +}
  1741 +
// Frees an object returned by SherpaOnnxCreateOfflineSpeakerDiarization().
// Passing a null pointer is harmless since `delete` ignores it.
void SherpaOnnxDestroyOfflineSpeakerDiarization(
    const SherpaOnnxOfflineSpeakerDiarization *sd) {
  delete sd;
}
  1746 +
  1747 +int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(
  1748 + const SherpaOnnxOfflineSpeakerDiarization *sd) {
  1749 + return sd->impl->SampleRate();
  1750 +}
  1751 +
  1752 +int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers(
  1753 + const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
  1754 + return r->impl.NumSpeakers();
  1755 +}
  1756 +
  1757 +int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(
  1758 + const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
  1759 + return r->impl.NumSegments();
  1760 +}
  1761 +
  1762 +const SherpaOnnxOfflineSpeakerDiarizationSegment *
  1763 +SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(
  1764 + const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
  1765 + if (r->impl.NumSegments() == 0) {
  1766 + return nullptr;
  1767 + }
  1768 +
  1769 + auto segments = r->impl.SortByStartTime();
  1770 +
  1771 + int32_t n = segments.size();
  1772 + SherpaOnnxOfflineSpeakerDiarizationSegment *ans =
  1773 + new SherpaOnnxOfflineSpeakerDiarizationSegment[n];
  1774 +
  1775 + for (int32_t i = 0; i != n; ++i) {
  1776 + const auto &s = segments[i];
  1777 +
  1778 + ans[i].start = s.Start();
  1779 + ans[i].end = s.End();
  1780 + ans[i].speaker = s.Speaker();
  1781 + }
  1782 +
  1783 + return ans;
  1784 +}
  1785 +
// Frees the array returned by
// SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime().
// That function returns nullptr for an empty result; `delete[]` ignores it.
void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(
    const SherpaOnnxOfflineSpeakerDiarizationSegment *s) {
  delete[] s;
}
  1790 +
  1791 +const SherpaOnnxOfflineSpeakerDiarizationResult *
  1792 +SherpaOnnxOfflineSpeakerDiarizationProcess(
  1793 + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
  1794 + int32_t n) {
  1795 + auto ans = new SherpaOnnxOfflineSpeakerDiarizationResult;
  1796 + ans->impl = sd->impl->Process(samples, n);
  1797 +
  1798 + return ans;
  1799 +}
  1800 +
// Frees a result returned by SherpaOnnxOfflineSpeakerDiarizationProcess() or
// SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback().
// Passing a null pointer is harmless since `delete` ignores it.
void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(
    const SherpaOnnxOfflineSpeakerDiarizationResult *r) {
  delete r;
}
  1805 +
  1806 +const SherpaOnnxOfflineSpeakerDiarizationResult *
  1807 +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(
  1808 + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
  1809 + int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback,
  1810 + void *arg) {
  1811 + auto ans = new SherpaOnnxOfflineSpeakerDiarizationResult;
  1812 + ans->impl = sd->impl->Process(samples, n, callback, arg);
  1813 +
  1814 + return ans;
  1815 +}
  1816 +
  1817 +#endif
@@ -927,7 +927,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; @@ -927,7 +927,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts;
927 SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( 927 SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
928 const SherpaOnnxOfflineTtsConfig *config); 928 const SherpaOnnxOfflineTtsConfig *config);
929 929
930 -// Free the pointer returned by CreateOfflineTts() 930 +// Free the pointer returned by SherpaOnnxCreateOfflineTts()
931 SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); 931 SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts);
932 932
933 // Return the sample rate of the current TTS object 933 // Return the sample rate of the current TTS object
@@ -954,6 +954,11 @@ SherpaOnnxOfflineTtsGenerateWithCallback( @@ -954,6 +954,11 @@ SherpaOnnxOfflineTtsGenerateWithCallback(
954 const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, 954 const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
955 SherpaOnnxGeneratedAudioCallback callback); 955 SherpaOnnxGeneratedAudioCallback callback);
956 956
  957 +const SherpaOnnxGeneratedAudio *
  958 +SherpaOnnxOfflineTtsGenerateWithProgressCallback(
  959 + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed,
  960 + SherpaOnnxGeneratedAudioProgressCallback callback);
  961 +
957 // Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional 962 // Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional
958 // `void* arg` to the callback. 963 // `void* arg` to the callback.
959 SHERPA_ONNX_API const SherpaOnnxGeneratedAudio * 964 SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *
@@ -1384,6 +1389,115 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( @@ -1384,6 +1389,115 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
1384 // Return 1 if the file exists; return 0 if the file does not exist. 1389 // Return 1 if the file exists; return 0 if the file does not exist.
1385 SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); 1390 SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename);
1386 1391
  1392 +// =========================================================================
  1393 +// For offline speaker diarization (i.e., non-streaming speaker diarization)
  1394 +// =========================================================================
// Config for a pyannote speaker segmentation model.
SHERPA_ONNX_API typedef struct
    SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig {
  // Path to the segmentation model file,
  // e.g., ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx
  const char *model;
} SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
  1399 +
// Config for the speaker segmentation model.
//
// The trailing comment of each field is the fallback value that
// SherpaOnnxCreateOfflineSpeakerDiarization() uses when the field is left
// unset (presumably 0/NULL, e.g., after a memset to 0).
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerSegmentationModelConfig {
  SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig pyannote;
  int32_t num_threads;   // 1
  int32_t debug;         // false
  const char *provider;  // "cpu"; an empty string is also replaced by "cpu"
} SherpaOnnxOfflineSpeakerSegmentationModelConfig;
  1406 +
// Config for the clustering used by speaker diarization.
// Set either num_clusters or threshold.
SHERPA_ONNX_API typedef struct SherpaOnnxFastClusteringConfig {
  // If greater than 0, then threshold is ignored.
  //
  // We strongly recommend that you set it if you know the number of clusters
  // in advance
  int32_t num_clusters;

  // distance threshold.
  //
  // The smaller, the more clusters it will generate.
  // The larger, the fewer clusters it will generate.
  //
  // SherpaOnnxCreateOfflineSpeakerDiarization() falls back to 0.5 if it is
  // left unset.
  float threshold;
} SherpaOnnxFastClusteringConfig;
  1420 +
// Top-level config passed to SherpaOnnxCreateOfflineSpeakerDiarization().
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationConfig {
  SherpaOnnxOfflineSpeakerSegmentationModelConfig segmentation;
  SherpaOnnxSpeakerEmbeddingExtractorConfig embedding;
  SherpaOnnxFastClusteringConfig clustering;

  // if a segment is less than this value, then it is discarded
  //
  // SherpaOnnxCreateOfflineSpeakerDiarization() falls back to 0.3 if it is
  // left unset.
  float min_duration_on;  // in seconds

  // if the gap between two segments of the same speaker is less than this
  // value, then these two segments are merged into a single segment.
  // We do this recursively.
  //
  // SherpaOnnxCreateOfflineSpeakerDiarization() falls back to 0.5 if it is
  // left unset.
  float min_duration_off;  // in seconds
} SherpaOnnxOfflineSpeakerDiarizationConfig;
  1434 +
  1435 +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarization
  1436 + SherpaOnnxOfflineSpeakerDiarization;
  1437 +
  1438 +// The user has to invoke SherpaOnnxDestroyOfflineSpeakerDiarization()
  1439 +// to free the returned pointer to avoid memory leak
  1440 +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarization *
  1441 +SherpaOnnxCreateOfflineSpeakerDiarization(
  1442 + const SherpaOnnxOfflineSpeakerDiarizationConfig *config);
  1443 +
  1444 +// Free the pointer returned by SherpaOnnxCreateOfflineSpeakerDiarization()
  1445 +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineSpeakerDiarization(
  1446 + const SherpaOnnxOfflineSpeakerDiarization *sd);
  1447 +
  1448 +// Expected sample rate of the input audio samples
  1449 +SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(
  1450 + const SherpaOnnxOfflineSpeakerDiarization *sd);
  1451 +
  1452 +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationResult
  1453 + SherpaOnnxOfflineSpeakerDiarizationResult;
  1454 +
// A single segment of a diarization result, i.e., an interval of the audio
// assigned to one speaker.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationSegment {
  float start;      // start time of the segment (presumably in seconds)
  float end;        // end time of the segment (presumably in seconds)
  int32_t speaker;  // integer index of the speaker of this segment
} SherpaOnnxOfflineSpeakerDiarizationSegment;
  1460 +
  1461 +SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers(
  1462 + const SherpaOnnxOfflineSpeakerDiarizationResult *r);
  1463 +
  1464 +SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(
  1465 + const SherpaOnnxOfflineSpeakerDiarizationResult *r);
  1466 +
  1467 +// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroySegment()
  1468 +// to free the returned pointer to avoid memory leak.
  1469 +//
  1470 +// The returned pointer is the start address of an array.
  1471 +// The number of entries in the array equals the value
  1472 +// returned by SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments()
  1473 +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationSegment *
  1474 +SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(
  1475 + const SherpaOnnxOfflineSpeakerDiarizationResult *r);
  1476 +
  1477 +SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(
  1478 + const SherpaOnnxOfflineSpeakerDiarizationSegment *s);
  1479 +
// Type of the progress callback accepted by
// SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback().
//
// num_processed_chunk: number of chunks processed so far
// num_total_chunks: total number of chunks to process
// arg: the `void *arg` passed to
//      SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback()
//
// NOTE(review): the C example in this repository states that the return
// value is currently ignored -- confirm against the implementation.
typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallback)(
    int32_t num_processed_chunk, int32_t num_total_chunks, void *arg);
  1482 +
  1483 +// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult()
  1484 +// to free the returned pointer to avoid memory leak.
  1485 +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult *
  1486 +SherpaOnnxOfflineSpeakerDiarizationProcess(
  1487 + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
  1488 + int32_t n);
  1489 +
  1490 +// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult()
  1491 +// to free the returned pointer to avoid memory leak.
  1492 +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult *
  1493 +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(
  1494 + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples,
  1495 + int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback,
  1496 + void *arg);
  1497 +
  1498 +SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(
  1499 + const SherpaOnnxOfflineSpeakerDiarizationResult *r);
  1500 +
1387 #if defined(__GNUC__) 1501 #if defined(__GNUC__)
1388 #pragma GCC diagnostic pop 1502 #pragma GCC diagnostic pop
1389 #endif 1503 #endif
@@ -20,8 +20,8 @@ struct FastClusteringConfig { @@ -20,8 +20,8 @@ struct FastClusteringConfig {
20 20
21 // distance threshold. 21 // distance threshold.
22 // 22 //
23 - // The lower, the more clusters it will generate.  
24 - // The higher, the fewer clusters it will generate. 23 + // The smaller, the more clusters it will generate.
  24 + // The larger, the fewer clusters it will generate.
25 float threshold = 0.5; 25 float threshold = 0.5;
26 26
27 FastClusteringConfig() = default; 27 FastClusteringConfig() = default;
@@ -43,6 +43,16 @@ bool OfflineSpeakerDiarizationConfig::Validate() const { @@ -43,6 +43,16 @@ bool OfflineSpeakerDiarizationConfig::Validate() const {
43 return false; 43 return false;
44 } 44 }
45 45
  46 + if (min_duration_on < 0) {
  47 + SHERPA_ONNX_LOGE("min_duration_on %.3f is negative", min_duration_on);
  48 + return false;
  49 + }
  50 +
  51 + if (min_duration_off < 0) {
  52 + SHERPA_ONNX_LOGE("min_duration_off %.3f is negative", min_duration_off);
  53 + return false;
  54 + }
  55 +
46 return true; 56 return true;
47 } 57 }
48 58
@@ -7,7 +7,7 @@ @@ -7,7 +7,7 @@
7 #include "sherpa-onnx/csrc/wave-reader.h" 7 #include "sherpa-onnx/csrc/wave-reader.h"
8 8
9 static int32_t ProgressCallback(int32_t processed_chunks, int32_t num_chunks, 9 static int32_t ProgressCallback(int32_t processed_chunks, int32_t num_chunks,
10 - void *arg) { 10 + void *) {
11 float progress = 100.0 * processed_chunks / num_chunks; 11 float progress = 100.0 * processed_chunks / num_chunks;
12 fprintf(stderr, "progress %.2f%%\n", progress); 12 fprintf(stderr, "progress %.2f%%\n", progress);
13 13