正在显示
9 个修改的文件
包含
418 行增加
和
7 行删除
| 1 | ### Supported functions | 1 | ### Supported functions |
| 2 | 2 | ||
| 3 | -|Speech recognition| Speech synthesis | Speaker verification | Speaker identification | | ||
| 4 | -|------------------|------------------|----------------------|------------------------| | ||
| 5 | -| ✔️ | ✔️ | ✔️ | ✔️ | | 3 | +|Speech recognition| Speech synthesis | |
| 4 | +|------------------|------------------| | ||
| 5 | +| ✔️ | ✔️ | | ||
| 6 | + | ||
| 7 | +|Speaker identification| Speaker diarization | Speaker verification | | ||
| 8 | +|----------------------|-------------------- |------------------------| | ||
| 9 | +| ✔️ | ✔️ | ✔️ | | ||
| 6 | 10 | ||
| 7 | | Spoken Language identification | Audio tagging | Voice activity detection | | 11 | | Spoken Language identification | Audio tagging | Voice activity detection | |
| 8 | |--------------------------------|---------------|--------------------------| | 12 | |--------------------------------|---------------|--------------------------| |
| @@ -47,6 +51,7 @@ This repository supports running the following functions **locally** | @@ -47,6 +51,7 @@ This repository supports running the following functions **locally** | ||
| 47 | 51 | ||
| 48 | - Speech-to-text (i.e., ASR); both streaming and non-streaming are supported | 52 | - Speech-to-text (i.e., ASR); both streaming and non-streaming are supported |
| 49 | - Text-to-speech (i.e., TTS) | 53 | - Text-to-speech (i.e., TTS) |
| 54 | + - Speaker diarization | ||
| 50 | - Speaker identification | 55 | - Speaker identification |
| 51 | - Speaker verification | 56 | - Speaker verification |
| 52 | - Spoken language identification | 57 | - Spoken language identification |
| @@ -9,6 +9,11 @@ if(SHERPA_ONNX_ENABLE_TTS) | @@ -9,6 +9,11 @@ if(SHERPA_ONNX_ENABLE_TTS) | ||
| 9 | target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) | 9 | target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) |
| 10 | endif() | 10 | endif() |
| 11 | 11 | ||
| 12 | +if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION) | ||
| 13 | + add_executable(offline-speaker-diarization-c-api offline-speaker-diarization-c-api.c) | ||
| 14 | + target_link_libraries(offline-speaker-diarization-c-api sherpa-onnx-c-api) | ||
| 15 | +endif() | ||
| 16 | + | ||
| 12 | add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c) | 17 | add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c) |
| 13 | target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) | 18 | target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) |
| 14 | 19 |
| 1 | +// c-api-examples/offline-speaker-diarization-c-api.c | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// | ||
| 6 | +// This file demonstrates how to implement speaker diarization with | ||
| 7 | +// sherpa-onnx's C API. | ||
| 8 | + | ||
| 9 | +// clang-format off | ||
| 10 | +/* | ||
| 11 | +Usage: | ||
| 12 | + | ||
| 13 | +Step 1: Download a speaker segmentation model | ||
| 14 | + | ||
| 15 | +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models | ||
| 16 | +for a list of available models. The following is an example | ||
| 17 | + | ||
| 18 | + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 19 | + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 20 | + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 | ||
| 21 | + | ||
| 22 | +Step 2: Download a speaker embedding extractor model | ||
| 23 | + | ||
| 24 | +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 25 | +for a list of available models. The following is an example | ||
| 26 | + | ||
| 27 | + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx | ||
| 28 | + | ||
| 29 | +Step 3. Download test wave files | ||
| 30 | + | ||
| 31 | +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models | ||
| 32 | +for a list of available test wave files. The following is an example | ||
| 33 | + | ||
| 34 | + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav | ||
| 35 | + | ||
| 36 | +Step 4. Run it | ||
| 37 | + | ||
| 38 | + */ | ||
| 39 | +// clang-format on | ||
| 40 | + | ||
| 41 | +#include <stdio.h> | ||
| 42 | +#include <string.h> | ||
| 43 | + | ||
| 44 | +#include "sherpa-onnx/c-api/c-api.h" | ||
| 45 | + | ||
| 46 | +static int32_t ProgressCallback(int32_t num_processed_chunks, | ||
| 47 | + int32_t num_total_chunks, void *arg) { | ||
| 48 | + float progress = 100.0 * num_processed_chunks / num_total_chunks; | ||
| 49 | + fprintf(stderr, "progress %.2f%%\n", progress); | ||
| 50 | + | ||
| 51 | + // the return value is currently ignored | ||
| 52 | + return 0; | ||
| 53 | +} | ||
| 54 | + | ||
| 55 | +int main() { | ||
| 56 | +  // Please see the comments at the start of this file for how to download | ||
| 57 | +  // the .onnx file and .wav files below | ||
| 58 | +  const char *segmentation_model = | ||
| 59 | +      "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"; | ||
| 60 | + | ||
| 61 | +  const char *embedding_extractor_model = | ||
| 62 | +      "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"; | ||
| 63 | + | ||
| 64 | +  const char *wav_filename = "./0-four-speakers-zh.wav"; | ||
| 65 | + | ||
| 66 | +  // Everything released at the `failed` label is declared and | ||
| 67 | +  // NULL-initialized here, before the first `goto`, so the cleanup code | ||
| 68 | +  // never reads an indeterminate pointer. The destroy functions are | ||
| 69 | +  // implemented with delete / delete[], which are no-ops for NULL. | ||
| 70 | +  const SherpaOnnxOfflineSpeakerDiarization *sd = NULL; | ||
| 71 | +  const SherpaOnnxOfflineSpeakerDiarizationResult *result = NULL; | ||
| 72 | +  const SherpaOnnxOfflineSpeakerDiarizationSegment *segments = NULL; | ||
| 73 | +  int32_t num_segments = 0; | ||
| 74 | +  int32_t exit_code = -1;  // becomes 0 only if every step succeeds | ||
| 75 | + | ||
| 76 | +  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); | ||
| 77 | +  if (wave == NULL) { | ||
| 78 | +    // Nothing has been allocated yet, so a plain return is fine here. | ||
| 79 | +    fprintf(stderr, "Failed to read %s\n", wav_filename); | ||
| 80 | +    return -1; | ||
| 81 | +  } | ||
| 82 | + | ||
| 83 | +  SherpaOnnxOfflineSpeakerDiarizationConfig config; | ||
| 84 | +  memset(&config, 0, sizeof(config)); | ||
| 85 | + | ||
| 86 | +  config.segmentation.pyannote.model = segmentation_model; | ||
| 87 | +  config.embedding.model = embedding_extractor_model; | ||
| 88 | + | ||
| 89 | +  // the test wave ./0-four-speakers-zh.wav has 4 speakers, so | ||
| 90 | +  // we set num_clusters to 4 | ||
| 91 | +  // | ||
| 92 | +  config.clustering.num_clusters = 4; | ||
| 93 | +  // If you don't know the number of speakers in the test wave file, please | ||
| 94 | +  // use | ||
| 95 | +  // config.clustering.threshold = 0.5; // You need to tune this threshold | ||
| 96 | + | ||
| 97 | +  sd = SherpaOnnxCreateOfflineSpeakerDiarization(&config); | ||
| 98 | +  if (!sd) { | ||
| 99 | +    fprintf(stderr, "Failed to initialize offline speaker diarization\n"); | ||
| 100 | +    goto failed;  // the wave read above still has to be freed | ||
| 101 | +  } | ||
| 102 | + | ||
| 103 | +  if (SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd) != | ||
| 104 | +      wave->sample_rate) { | ||
| 105 | +    fprintf( | ||
| 106 | +        stderr, | ||
| 107 | +        "Expected sample rate: %d. Actual sample rate from the wave file: %d\n", | ||
| 108 | +        SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd), | ||
| 109 | +        wave->sample_rate); | ||
| 110 | +    goto failed; | ||
| 111 | +  } | ||
| 112 | + | ||
| 113 | +  result = SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( | ||
| 114 | +      sd, wave->samples, wave->num_samples, ProgressCallback, NULL); | ||
| 115 | +  if (!result) { | ||
| 116 | +    fprintf(stderr, "Failed to do speaker diarization\n"); | ||
| 117 | +    goto failed; | ||
| 118 | +  } | ||
| 119 | + | ||
| 120 | +  num_segments = | ||
| 121 | +      SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result); | ||
| 122 | + | ||
| 123 | +  // NULL is returned when there are no segments; the loop below then does | ||
| 124 | +  // not execute. | ||
| 125 | +  segments = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result); | ||
| 126 | + | ||
| 127 | +  for (int32_t i = 0; i != num_segments; ++i) { | ||
| 128 | +    fprintf(stderr, "%.3f -- %.3f speaker_%02d\n", segments[i].start, | ||
| 129 | +            segments[i].end, segments[i].speaker); | ||
| 130 | +  } | ||
| 131 | + | ||
| 132 | +  exit_code = 0; | ||
| 133 | + | ||
| 134 | +failed: | ||
| 135 | +  // Shared cleanup for both the success and the failure paths. | ||
| 136 | +  SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments); | ||
| 137 | +  SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result); | ||
| 138 | +  SherpaOnnxDestroyOfflineSpeakerDiarization(sd); | ||
| 139 | +  SherpaOnnxFreeWave(wave); | ||
| 140 | + | ||
| 141 | +  return exit_code; | ||
| 142 | +} |
| @@ -31,6 +31,10 @@ | @@ -31,6 +31,10 @@ | ||
| 31 | #include "sherpa-onnx/csrc/offline-tts.h" | 31 | #include "sherpa-onnx/csrc/offline-tts.h" |
| 32 | #endif | 32 | #endif |
| 33 | 33 | ||
| 34 | +#if SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION == 1 | ||
| 35 | +#include "sherpa-onnx/csrc/offline-speaker-diarization.h" | ||
| 36 | +#endif | ||
| 37 | + | ||
| 34 | struct SherpaOnnxOnlineRecognizer { | 38 | struct SherpaOnnxOnlineRecognizer { |
| 35 | std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl; | 39 | std::unique_ptr<sherpa_onnx::OnlineRecognizer> impl; |
| 36 | }; | 40 | }; |
| @@ -1670,3 +1674,144 @@ void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { | @@ -1670,3 +1674,144 @@ void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { | ||
| 1670 | int32_t SherpaOnnxFileExists(const char *filename) { | 1674 | int32_t SherpaOnnxFileExists(const char *filename) { |
| 1671 | return sherpa_onnx::FileExists(filename); | 1675 | return sherpa_onnx::FileExists(filename); |
| 1672 | } | 1676 | } |
| 1677 | + | ||
| 1678 | +#if SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION == 1 | ||
| 1679 | + | ||
| 1680 | +struct SherpaOnnxOfflineSpeakerDiarization { | ||
| 1681 | + std::unique_ptr<sherpa_onnx::OfflineSpeakerDiarization> impl; | ||
| 1682 | +}; | ||
| 1683 | + | ||
| 1684 | +struct SherpaOnnxOfflineSpeakerDiarizationResult { | ||
| 1685 | + sherpa_onnx::OfflineSpeakerDiarizationResult impl; | ||
| 1686 | +}; | ||
| 1687 | + | ||
| 1688 | +const SherpaOnnxOfflineSpeakerDiarization * | ||
| 1689 | +SherpaOnnxCreateOfflineSpeakerDiarization( | ||
| 1690 | + const SherpaOnnxOfflineSpeakerDiarizationConfig *config) { | ||
| 1691 | + sherpa_onnx::OfflineSpeakerDiarizationConfig sd_config; | ||
| 1692 | + | ||
| 1693 | + sd_config.segmentation.pyannote.model = | ||
| 1694 | + SHERPA_ONNX_OR(config->segmentation.pyannote.model, ""); | ||
| 1695 | + sd_config.segmentation.num_threads = | ||
| 1696 | + SHERPA_ONNX_OR(config->segmentation.num_threads, 1); | ||
| 1697 | + sd_config.segmentation.debug = config->segmentation.debug; | ||
| 1698 | + sd_config.segmentation.provider = | ||
| 1699 | + SHERPA_ONNX_OR(config->segmentation.provider, "cpu"); | ||
| 1700 | + if (sd_config.segmentation.provider.empty()) { | ||
| 1701 | + sd_config.segmentation.provider = "cpu"; | ||
| 1702 | + } | ||
| 1703 | + | ||
| 1704 | + sd_config.embedding.model = SHERPA_ONNX_OR(config->embedding.model, ""); | ||
| 1705 | + sd_config.embedding.num_threads = | ||
| 1706 | + SHERPA_ONNX_OR(config->embedding.num_threads, 1); | ||
| 1707 | + sd_config.embedding.debug = config->embedding.debug; | ||
| 1708 | + sd_config.embedding.provider = | ||
| 1709 | + SHERPA_ONNX_OR(config->embedding.provider, "cpu"); | ||
| 1710 | + if (sd_config.embedding.provider.empty()) { | ||
| 1711 | + sd_config.embedding.provider = "cpu"; | ||
| 1712 | + } | ||
| 1713 | + | ||
| 1714 | + sd_config.clustering.num_clusters = | ||
| 1715 | + SHERPA_ONNX_OR(config->clustering.num_clusters, -1); | ||
| 1716 | + | ||
| 1717 | + sd_config.clustering.threshold = | ||
| 1718 | + SHERPA_ONNX_OR(config->clustering.threshold, 0.5); | ||
| 1719 | + | ||
| 1720 | + sd_config.min_duration_on = SHERPA_ONNX_OR(config->min_duration_on, 0.3); | ||
| 1721 | + | ||
| 1722 | + sd_config.min_duration_off = SHERPA_ONNX_OR(config->min_duration_off, 0.5); | ||
| 1723 | + | ||
| 1724 | + if (!sd_config.Validate()) { | ||
| 1725 | + SHERPA_ONNX_LOGE("Errors in config"); | ||
| 1726 | + return nullptr; | ||
| 1727 | + } | ||
| 1728 | + | ||
| 1729 | + SherpaOnnxOfflineSpeakerDiarization *sd = | ||
| 1730 | + new SherpaOnnxOfflineSpeakerDiarization; | ||
| 1731 | + | ||
| 1732 | + sd->impl = | ||
| 1733 | + std::make_unique<sherpa_onnx::OfflineSpeakerDiarization>(sd_config); | ||
| 1734 | + | ||
| 1735 | + if (sd_config.segmentation.debug || sd_config.embedding.debug) { | ||
| 1736 | + SHERPA_ONNX_LOGE("%s\n", sd_config.ToString().c_str()); | ||
| 1737 | + } | ||
| 1738 | + | ||
| 1739 | + return sd; | ||
| 1740 | +} | ||
| 1741 | + | ||
| 1742 | +void SherpaOnnxDestroyOfflineSpeakerDiarization( | ||
| 1743 | + const SherpaOnnxOfflineSpeakerDiarization *sd) { | ||
| 1744 | + delete sd; | ||
| 1745 | +} | ||
| 1746 | + | ||
| 1747 | +int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate( | ||
| 1748 | + const SherpaOnnxOfflineSpeakerDiarization *sd) { | ||
| 1749 | + return sd->impl->SampleRate(); | ||
| 1750 | +} | ||
| 1751 | + | ||
| 1752 | +int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers( | ||
| 1753 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r) { | ||
| 1754 | + return r->impl.NumSpeakers(); | ||
| 1755 | +} | ||
| 1756 | + | ||
| 1757 | +int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments( | ||
| 1758 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r) { | ||
| 1759 | + return r->impl.NumSegments(); | ||
| 1760 | +} | ||
| 1761 | + | ||
| 1762 | +const SherpaOnnxOfflineSpeakerDiarizationSegment * | ||
| 1763 | +SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime( | ||
| 1764 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r) { | ||
| 1765 | + if (r->impl.NumSegments() == 0) { | ||
| 1766 | + return nullptr; | ||
| 1767 | + } | ||
| 1768 | + | ||
| 1769 | + auto segments = r->impl.SortByStartTime(); | ||
| 1770 | + | ||
| 1771 | + int32_t n = segments.size(); | ||
| 1772 | + SherpaOnnxOfflineSpeakerDiarizationSegment *ans = | ||
| 1773 | + new SherpaOnnxOfflineSpeakerDiarizationSegment[n]; | ||
| 1774 | + | ||
| 1775 | + for (int32_t i = 0; i != n; ++i) { | ||
| 1776 | + const auto &s = segments[i]; | ||
| 1777 | + | ||
| 1778 | + ans[i].start = s.Start(); | ||
| 1779 | + ans[i].end = s.End(); | ||
| 1780 | + ans[i].speaker = s.Speaker(); | ||
| 1781 | + } | ||
| 1782 | + | ||
| 1783 | + return ans; | ||
| 1784 | +} | ||
| 1785 | + | ||
| 1786 | +void SherpaOnnxOfflineSpeakerDiarizationDestroySegment( | ||
| 1787 | + const SherpaOnnxOfflineSpeakerDiarizationSegment *s) { | ||
| 1788 | + delete[] s; | ||
| 1789 | +} | ||
| 1790 | + | ||
| 1791 | +const SherpaOnnxOfflineSpeakerDiarizationResult * | ||
| 1792 | +SherpaOnnxOfflineSpeakerDiarizationProcess( | ||
| 1793 | + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, | ||
| 1794 | + int32_t n) { | ||
| 1795 | + auto ans = new SherpaOnnxOfflineSpeakerDiarizationResult; | ||
| 1796 | + ans->impl = sd->impl->Process(samples, n); | ||
| 1797 | + | ||
| 1798 | + return ans; | ||
| 1799 | +} | ||
| 1800 | + | ||
| 1801 | +void SherpaOnnxOfflineSpeakerDiarizationDestroyResult( | ||
| 1802 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r) { | ||
| 1803 | + delete r; | ||
| 1804 | +} | ||
| 1805 | + | ||
| 1806 | +const SherpaOnnxOfflineSpeakerDiarizationResult * | ||
| 1807 | +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( | ||
| 1808 | + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, | ||
| 1809 | + int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback, | ||
| 1810 | + void *arg) { | ||
| 1811 | + auto ans = new SherpaOnnxOfflineSpeakerDiarizationResult; | ||
| 1812 | + ans->impl = sd->impl->Process(samples, n, callback, arg); | ||
| 1813 | + | ||
| 1814 | + return ans; | ||
| 1815 | +} | ||
| 1816 | + | ||
| 1817 | +#endif |
| @@ -927,7 +927,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; | @@ -927,7 +927,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; | ||
| 927 | SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( | 927 | SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( |
| 928 | const SherpaOnnxOfflineTtsConfig *config); | 928 | const SherpaOnnxOfflineTtsConfig *config); |
| 929 | 929 | ||
| 930 | -// Free the pointer returned by CreateOfflineTts() | 930 | +// Free the pointer returned by SherpaOnnxCreateOfflineTts() |
| 931 | SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); | 931 | SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); |
| 932 | 932 | ||
| 933 | // Return the sample rate of the current TTS object | 933 | // Return the sample rate of the current TTS object |
| @@ -954,6 +954,11 @@ SherpaOnnxOfflineTtsGenerateWithCallback( | @@ -954,6 +954,11 @@ SherpaOnnxOfflineTtsGenerateWithCallback( | ||
| 954 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, | 954 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, |
| 955 | SherpaOnnxGeneratedAudioCallback callback); | 955 | SherpaOnnxGeneratedAudioCallback callback); |
| 956 | 956 | ||
| 957 | +SHERPA_ONNX_API const SherpaOnnxGeneratedAudio * | ||
| 958 | +SherpaOnnxOfflineTtsGenerateWithProgressCallback( | ||
| 959 | + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, | ||
| 960 | + SherpaOnnxGeneratedAudioProgressCallback callback); | ||
| 961 | + | ||
| 957 | // Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional | 962 | // Same as SherpaOnnxGeneratedAudioCallback but you can pass an additional |
| 958 | // `void* arg` to the callback. | 963 | // `void* arg` to the callback. |
| 959 | SHERPA_ONNX_API const SherpaOnnxGeneratedAudio * | 964 | SHERPA_ONNX_API const SherpaOnnxGeneratedAudio * |
| @@ -1384,6 +1389,115 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( | @@ -1384,6 +1389,115 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( | ||
| 1384 | // Return 1 if the file exists; return 0 if the file does not exist. | 1389 | // Return 1 if the file exists; return 0 if the file does not exist. |
| 1385 | SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); | 1390 | SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); |
| 1386 | 1391 | ||
| 1392 | +// ========================================================================= | ||
| 1393 | +// For offline speaker diarization (i.e., non-streaming speaker diarization) | ||
| 1394 | +// ========================================================================= | ||
| 1395 | +SHERPA_ONNX_API typedef struct | ||
| 1396 | + SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig { | ||
| 1397 | + const char *model; | ||
| 1398 | +} SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig; | ||
| 1399 | + | ||
| 1400 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerSegmentationModelConfig { | ||
| 1401 | + SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig pyannote; | ||
| 1402 | + int32_t num_threads; // 1 | ||
| 1403 | + int32_t debug; // false | ||
| 1404 | + const char *provider; // "cpu" | ||
| 1405 | +} SherpaOnnxOfflineSpeakerSegmentationModelConfig; | ||
| 1406 | + | ||
| 1407 | +SHERPA_ONNX_API typedef struct SherpaOnnxFastClusteringConfig { | ||
| 1408 | + // If greater than 0, then threshold is ignored. | ||
| 1409 | + // | ||
| 1410 | + // We strongly recommend that you set it if you know the number of clusters | ||
| 1411 | + // in advance | ||
| 1412 | + int32_t num_clusters; | ||
| 1413 | + | ||
| 1414 | + // distance threshold. | ||
| 1415 | + // | ||
| 1416 | + // The smaller, the more clusters it will generate. | ||
| 1417 | + // The larger, the fewer clusters it will generate. | ||
| 1418 | + float threshold; | ||
| 1419 | +} SherpaOnnxFastClusteringConfig; | ||
| 1420 | + | ||
| 1421 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationConfig { | ||
| 1422 | + SherpaOnnxOfflineSpeakerSegmentationModelConfig segmentation; | ||
| 1423 | + SherpaOnnxSpeakerEmbeddingExtractorConfig embedding; | ||
| 1424 | + SherpaOnnxFastClusteringConfig clustering; | ||
| 1425 | + | ||
| 1426 | + // if a segment is less than this value, then it is discarded | ||
| 1427 | + float min_duration_on; // in seconds | ||
| 1428 | + | ||
| 1429 | + // if the gap between two segments of the same speaker is less than this value, | ||
| 1430 | + // then these two segments are merged into a single segment. | ||
| 1431 | + // We do this recursively. | ||
| 1432 | + float min_duration_off; // in seconds | ||
| 1433 | +} SherpaOnnxOfflineSpeakerDiarizationConfig; | ||
| 1434 | + | ||
| 1435 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarization | ||
| 1436 | + SherpaOnnxOfflineSpeakerDiarization; | ||
| 1437 | + | ||
| 1438 | +// The user has to invoke SherpaOnnxDestroyOfflineSpeakerDiarization() | ||
| 1439 | +// to free the returned pointer to avoid memory leak | ||
| 1440 | +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarization * | ||
| 1441 | +SherpaOnnxCreateOfflineSpeakerDiarization( | ||
| 1442 | + const SherpaOnnxOfflineSpeakerDiarizationConfig *config); | ||
| 1443 | + | ||
| 1444 | +// Free the pointer returned by SherpaOnnxCreateOfflineSpeakerDiarization() | ||
| 1445 | +SHERPA_ONNX_API void SherpaOnnxDestroyOfflineSpeakerDiarization( | ||
| 1446 | + const SherpaOnnxOfflineSpeakerDiarization *sd); | ||
| 1447 | + | ||
| 1448 | +// Expected sample rate of the input audio samples | ||
| 1449 | +SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationGetSampleRate( | ||
| 1450 | + const SherpaOnnxOfflineSpeakerDiarization *sd); | ||
| 1451 | + | ||
| 1452 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationResult | ||
| 1453 | + SherpaOnnxOfflineSpeakerDiarizationResult; | ||
| 1454 | + | ||
| 1455 | +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSpeakerDiarizationSegment { | ||
| 1456 | + float start; | ||
| 1457 | + float end; | ||
| 1458 | + int32_t speaker; | ||
| 1459 | +} SherpaOnnxOfflineSpeakerDiarizationSegment; | ||
| 1460 | + | ||
| 1461 | +SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers( | ||
| 1462 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r); | ||
| 1463 | + | ||
| 1464 | +SHERPA_ONNX_API int32_t SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments( | ||
| 1465 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r); | ||
| 1466 | + | ||
| 1467 | +// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroySegment() | ||
| 1468 | +// to free the returned pointer to avoid memory leak. | ||
| 1469 | +// | ||
| 1470 | +// The returned pointer is the start address of an array. | ||
| 1471 | +// The number of entries in the array equals the value | ||
| 1472 | +// returned by SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments() | ||
| 1473 | +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationSegment * | ||
| 1474 | +SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime( | ||
| 1475 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r); | ||
| 1476 | + | ||
| 1477 | +SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroySegment( | ||
| 1478 | + const SherpaOnnxOfflineSpeakerDiarizationSegment *s); | ||
| 1479 | + | ||
| 1480 | +typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallback)( | ||
| 1481 | + int32_t num_processed_chunk, int32_t num_total_chunks, void *arg); | ||
| 1482 | + | ||
| 1483 | +// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult() | ||
| 1484 | +// to free the returned pointer to avoid memory leak. | ||
| 1485 | +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * | ||
| 1486 | +SherpaOnnxOfflineSpeakerDiarizationProcess( | ||
| 1487 | + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, | ||
| 1488 | + int32_t n); | ||
| 1489 | + | ||
| 1490 | +// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult() | ||
| 1491 | +// to free the returned pointer to avoid memory leak. | ||
| 1492 | +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * | ||
| 1493 | +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( | ||
| 1494 | + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, | ||
| 1495 | + int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback, | ||
| 1496 | + void *arg); | ||
| 1497 | + | ||
| 1498 | +SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroyResult( | ||
| 1499 | + const SherpaOnnxOfflineSpeakerDiarizationResult *r); | ||
| 1500 | + | ||
| 1387 | #if defined(__GNUC__) | 1501 | #if defined(__GNUC__) |
| 1388 | #pragma GCC diagnostic pop | 1502 | #pragma GCC diagnostic pop |
| 1389 | #endif | 1503 | #endif |
| @@ -20,8 +20,8 @@ struct FastClusteringConfig { | @@ -20,8 +20,8 @@ struct FastClusteringConfig { | ||
| 20 | 20 | ||
| 21 | // distance threshold. | 21 | // distance threshold. |
| 22 | // | 22 | // |
| 23 | - // The lower, the more clusters it will generate. | ||
| 24 | - // The higher, the fewer clusters it will generate. | 23 | + // The smaller, the more clusters it will generate. |
| 24 | + // The larger, the fewer clusters it will generate. | ||
| 25 | float threshold = 0.5; | 25 | float threshold = 0.5; |
| 26 | 26 | ||
| 27 | FastClusteringConfig() = default; | 27 | FastClusteringConfig() = default; |
| @@ -43,6 +43,16 @@ bool OfflineSpeakerDiarizationConfig::Validate() const { | @@ -43,6 +43,16 @@ bool OfflineSpeakerDiarizationConfig::Validate() const { | ||
| 43 | return false; | 43 | return false; |
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | + if (min_duration_on < 0) { | ||
| 47 | + SHERPA_ONNX_LOGE("min_duration_on %.3f is negative", min_duration_on); | ||
| 48 | + return false; | ||
| 49 | + } | ||
| 50 | + | ||
| 51 | + if (min_duration_off < 0) { | ||
| 52 | + SHERPA_ONNX_LOGE("min_duration_off %.3f is negative", min_duration_off); | ||
| 53 | + return false; | ||
| 54 | + } | ||
| 55 | + | ||
| 46 | return true; | 56 | return true; |
| 47 | } | 57 | } |
| 48 | 58 |
| @@ -7,7 +7,7 @@ | @@ -7,7 +7,7 @@ | ||
| 7 | #include "sherpa-onnx/csrc/wave-reader.h" | 7 | #include "sherpa-onnx/csrc/wave-reader.h" |
| 8 | 8 | ||
| 9 | static int32_t ProgressCallback(int32_t processed_chunks, int32_t num_chunks, | 9 | static int32_t ProgressCallback(int32_t processed_chunks, int32_t num_chunks, |
| 10 | - void *arg) { | 10 | + void *) { |
| 11 | float progress = 100.0 * processed_chunks / num_chunks; | 11 | float progress = 100.0 * processed_chunks / num_chunks; |
| 12 | fprintf(stderr, "progress %.2f%%\n", progress); | 12 | fprintf(stderr, "progress %.2f%%\n", progress); |
| 13 | 13 |
-
请 注册 或 登录 后发表评论