Committed by
GitHub
Add C API for speaker embedding extractor. (#711)
正在显示
23 个修改的文件
包含
739 行增加
和
80 行删除
| 1 | #!/usr/bin/env bash | 1 | #!/usr/bin/env bash |
| 2 | 2 | ||
| 3 | -set -e | 3 | +set -ex |
| 4 | 4 | ||
| 5 | log() { | 5 | log() { |
| 6 | # This function is from espnet | 6 | # This function is from espnet |
| @@ -9,6 +9,7 @@ log() { | @@ -9,6 +9,7 @@ log() { | ||
| 9 | } | 9 | } |
| 10 | 10 | ||
| 11 | echo "SLID_EXE is $SLID_EXE" | 11 | echo "SLID_EXE is $SLID_EXE" |
| 12 | +echo "SID_EXE is $SID_EXE" | ||
| 12 | echo "PATH: $PATH" | 13 | echo "PATH: $PATH" |
| 13 | 14 | ||
| 14 | 15 | ||
| @@ -24,3 +25,15 @@ rm sherpa-onnx-whisper-tiny.tar.bz2 | @@ -24,3 +25,15 @@ rm sherpa-onnx-whisper-tiny.tar.bz2 | ||
| 24 | $SLID_EXE | 25 | $SLID_EXE |
| 25 | 26 | ||
| 26 | rm -rf sherpa-onnx-whisper-tiny* | 27 | rm -rf sherpa-onnx-whisper-tiny* |
| 28 | + | ||
| 29 | +log "------------------------------------------------------------" | ||
| 30 | +log "Download file for speaker identification and verification " | ||
| 31 | +log "------------------------------------------------------------" | ||
| 32 | + | ||
| 33 | +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx | ||
| 34 | +git clone https://github.com/csukuangfj/sr-data | ||
| 35 | + | ||
| 36 | +$SID_EXE | ||
| 37 | + | ||
| 38 | +rm -fv *.onnx | ||
| 39 | +rm -rf sr-data |
| @@ -124,11 +124,12 @@ jobs: | @@ -124,11 +124,12 @@ jobs: | ||
| 124 | name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} | 124 | name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} |
| 125 | path: build/bin/* | 125 | path: build/bin/* |
| 126 | 126 | ||
| 127 | - - name: Test spoken language identification (C API) | 127 | + - name: Test C API |
| 128 | shell: bash | 128 | shell: bash |
| 129 | run: | | 129 | run: | |
| 130 | export PATH=$PWD/build/bin:$PATH | 130 | export PATH=$PWD/build/bin:$PATH |
| 131 | export SLID_EXE=spoken-language-identification-c-api | 131 | export SLID_EXE=spoken-language-identification-c-api |
| 132 | + export SID_EXE=speaker-identification-c-api | ||
| 132 | 133 | ||
| 133 | .github/scripts/test-c-api.sh | 134 | .github/scripts/test-c-api.sh |
| 134 | 135 |
| @@ -103,11 +103,12 @@ jobs: | @@ -103,11 +103,12 @@ jobs: | ||
| 103 | otool -L build/bin/sherpa-onnx | 103 | otool -L build/bin/sherpa-onnx |
| 104 | otool -l build/bin/sherpa-onnx | 104 | otool -l build/bin/sherpa-onnx |
| 105 | 105 | ||
| 106 | - - name: Test spoken language identification (C API) | 106 | + - name: Test C API |
| 107 | shell: bash | 107 | shell: bash |
| 108 | run: | | 108 | run: | |
| 109 | export PATH=$PWD/build/bin:$PATH | 109 | export PATH=$PWD/build/bin:$PATH |
| 110 | export SLID_EXE=spoken-language-identification-c-api | 110 | export SLID_EXE=spoken-language-identification-c-api |
| 111 | + export SID_EXE=speaker-identification-c-api | ||
| 111 | 112 | ||
| 112 | .github/scripts/test-c-api.sh | 113 | .github/scripts/test-c-api.sh |
| 113 | 114 |
| @@ -70,11 +70,12 @@ jobs: | @@ -70,11 +70,12 @@ jobs: | ||
| 70 | 70 | ||
| 71 | ls -lh ./bin/Release/sherpa-onnx.exe | 71 | ls -lh ./bin/Release/sherpa-onnx.exe |
| 72 | 72 | ||
| 73 | - - name: Test spoken language identification (C API) | 73 | + - name: Test C API |
| 74 | shell: bash | 74 | shell: bash |
| 75 | run: | | 75 | run: | |
| 76 | export PATH=$PWD/build/bin/Release:$PATH | 76 | export PATH=$PWD/build/bin/Release:$PATH |
| 77 | export SLID_EXE=spoken-language-identification-c-api.exe | 77 | export SLID_EXE=spoken-language-identification-c-api.exe |
| 78 | + export SID_EXE=speaker-identification-c-api.exe | ||
| 78 | 79 | ||
| 79 | .github/scripts/test-c-api.sh | 80 | .github/scripts/test-c-api.sh |
| 80 | 81 |
| @@ -12,6 +12,9 @@ endif() | @@ -12,6 +12,9 @@ endif() | ||
| 12 | add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c) | 12 | add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c) |
| 13 | target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) | 13 | target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api) |
| 14 | 14 | ||
| 15 | +add_executable(speaker-identification-c-api speaker-identification-c-api.c) | ||
| 16 | +target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api) | ||
| 17 | + | ||
| 15 | if(SHERPA_ONNX_HAS_ALSA) | 18 | if(SHERPA_ONNX_HAS_ALSA) |
| 16 | add_subdirectory(./asr-microphone-example) | 19 | add_subdirectory(./asr-microphone-example) |
| 17 | elseif((UNIX AND NOT APPLE) OR LINUX) | 20 | elseif((UNIX AND NOT APPLE) OR LINUX) |
| @@ -188,10 +188,11 @@ int32_t main(int32_t argc, char *argv[]) { | @@ -188,10 +188,11 @@ int32_t main(int32_t argc, char *argv[]) { | ||
| 188 | } | 188 | } |
| 189 | } | 189 | } |
| 190 | 190 | ||
| 191 | - SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config); | ||
| 192 | - SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); | 191 | + const SherpaOnnxOnlineRecognizer *recognizer = |
| 192 | + CreateOnlineRecognizer(&config); | ||
| 193 | + const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); | ||
| 193 | 194 | ||
| 194 | - SherpaOnnxDisplay *display = CreateDisplay(50); | 195 | + const SherpaOnnxDisplay *display = CreateDisplay(50); |
| 195 | int32_t segment_id = 0; | 196 | int32_t segment_id = 0; |
| 196 | 197 | ||
| 197 | const char *device_name = argv[context.index]; | 198 | const char *device_name = argv[context.index]; |
| @@ -162,10 +162,11 @@ int32_t main(int32_t argc, char *argv[]) { | @@ -162,10 +162,11 @@ int32_t main(int32_t argc, char *argv[]) { | ||
| 162 | } | 162 | } |
| 163 | } | 163 | } |
| 164 | 164 | ||
| 165 | - SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config); | ||
| 166 | - SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); | 165 | + const SherpaOnnxOnlineRecognizer *recognizer = |
| 166 | + CreateOnlineRecognizer(&config); | ||
| 167 | + const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); | ||
| 167 | 168 | ||
| 168 | - SherpaOnnxDisplay *display = CreateDisplay(50); | 169 | + const SherpaOnnxDisplay *display = CreateDisplay(50); |
| 169 | int32_t segment_id = 0; | 170 | int32_t segment_id = 0; |
| 170 | 171 | ||
| 171 | const char *wav_filename = argv[context.index]; | 172 | const char *wav_filename = argv[context.index]; |
| 1 | +// c-api-examples/speaker-identification-c-api.c | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// We assume you have pre-downloaded the speaker embedding extractor model | ||
| 6 | +// from | ||
| 7 | +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 8 | +// | ||
| 9 | +// An example command to download | ||
| 10 | +// "3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx" | ||
| 11 | +// is given below: | ||
| 12 | +// | ||
| 13 | +// clang-format off | ||
| 14 | +// | ||
| 15 | +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx | ||
| 16 | +// | ||
| 17 | +// clang-format on | ||
| 18 | +// | ||
| 19 | +// Also, please download the test wave files from | ||
| 20 | +// | ||
| 21 | +// https://github.com/csukuangfj/sr-data | ||
| 22 | + | ||
| 23 | +#include <stdio.h> | ||
| 24 | +#include <stdlib.h> | ||
| 25 | +#include <string.h> | ||
| 26 | + | ||
| 27 | +#include "sherpa-onnx/c-api/c-api.h" | ||
| 28 | + | ||
| 29 | +static const float *ComputeEmbedding( | ||
| 30 | + const SherpaOnnxSpeakerEmbeddingExtractor *ex, const char *wav_filename) { | ||
| 31 | + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); | ||
| 32 | + if (wave == NULL) { | ||
| 33 | + fprintf(stderr, "Failed to read %s\n", wav_filename); | ||
| 34 | + exit(-1); | ||
| 35 | + } | ||
| 36 | + | ||
| 37 | + const SherpaOnnxOnlineStream *stream = | ||
| 38 | + SherpaOnnxSpeakerEmbeddingExtractorCreateStream(ex); | ||
| 39 | + | ||
| 40 | + AcceptWaveform(stream, wave->sample_rate, wave->samples, wave->num_samples); | ||
| 41 | + InputFinished(stream); | ||
| 42 | + | ||
| 43 | + if (!SherpaOnnxSpeakerEmbeddingExtractorIsReady(ex, stream)) { | ||
| 44 | + fprintf(stderr, "The input wave file %s is too short!\n", wav_filename); | ||
| 45 | + exit(-1); | ||
| 46 | + } | ||
| 47 | + | ||
| 48 | + // we will free `v` outside of this function | ||
| 49 | + const float *v = | ||
| 50 | + SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding(ex, stream); | ||
| 51 | + | ||
| 52 | + DestroyOnlineStream(stream); | ||
| 53 | + SherpaOnnxFreeWave(wave); | ||
| 54 | + | ||
| 55 | + // Remember to free v to avoid memory leak | ||
| 56 | + return v; | ||
| 57 | +} | ||
| 58 | + | ||
| 59 | +int32_t main() { | ||
| 60 | + SherpaOnnxSpeakerEmbeddingExtractorConfig config; | ||
| 61 | + | ||
| 62 | + memset(&config, 0, sizeof(config)); | ||
| 63 | + | ||
| 64 | + // please download the model from | ||
| 65 | + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models | ||
| 66 | + config.model = "./3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"; | ||
| 67 | + | ||
| 68 | + config.num_threads = 1; | ||
| 69 | + config.debug = 0; | ||
| 70 | + config.provider = "cpu"; | ||
| 71 | + | ||
| 72 | + const SherpaOnnxSpeakerEmbeddingExtractor *ex = | ||
| 73 | + SherpaOnnxCreateSpeakerEmbeddingExtractor(&config); | ||
| 74 | + if (!ex) { | ||
| 75 | + fprintf(stderr, "Failed to create speaker embedding extractor"); | ||
| 76 | + return -1; | ||
| 77 | + } | ||
| 78 | + | ||
| 79 | + int32_t dim = SherpaOnnxSpeakerEmbeddingExtractorDim(ex); | ||
| 80 | + | ||
| 81 | + const SherpaOnnxSpeakerEmbeddingManager *manager = | ||
| 82 | + SherpaOnnxCreateSpeakerEmbeddingManager(dim); | ||
| 83 | + | ||
| 84 | + // Please download the test data from | ||
| 85 | + // https://github.com/csukuangfj/sr-data | ||
| 86 | + const char *spk1_1 = "./sr-data/enroll/fangjun-sr-1.wav"; | ||
| 87 | + const char *spk1_2 = "./sr-data/enroll/fangjun-sr-2.wav"; | ||
| 88 | + const char *spk1_3 = "./sr-data/enroll/fangjun-sr-3.wav"; | ||
| 89 | + | ||
| 90 | + const char *spk2_1 = "./sr-data/enroll/leijun-sr-1.wav"; | ||
| 91 | + const char *spk2_2 = "./sr-data/enroll/leijun-sr-2.wav"; | ||
| 92 | + | ||
| 93 | + const float *spk1_vec[4] = {NULL}; | ||
| 94 | + spk1_vec[0] = ComputeEmbedding(ex, spk1_1); | ||
| 95 | + spk1_vec[1] = ComputeEmbedding(ex, spk1_2); | ||
| 96 | + spk1_vec[2] = ComputeEmbedding(ex, spk1_3); | ||
| 97 | + | ||
| 98 | + const float *spk2_vec[3] = {NULL}; | ||
| 99 | + spk2_vec[0] = ComputeEmbedding(ex, spk2_1); | ||
| 100 | + spk2_vec[1] = ComputeEmbedding(ex, spk2_2); | ||
| 101 | + | ||
| 102 | + if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "fangjun", spk1_vec)) { | ||
| 103 | + fprintf(stderr, "Failed to register fangjun\n"); | ||
| 104 | + exit(-1); | ||
| 105 | + } | ||
| 106 | + | ||
| 107 | + if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "fangjun")) { | ||
| 108 | + fprintf(stderr, "Failed to find fangjun\n"); | ||
| 109 | + exit(-1); | ||
| 110 | + } | ||
| 111 | + | ||
| 112 | + if (!SherpaOnnxSpeakerEmbeddingManagerAddList(manager, "leijun", spk2_vec)) { | ||
| 113 | + fprintf(stderr, "Failed to register leijun\n"); | ||
| 114 | + exit(-1); | ||
| 115 | + } | ||
| 116 | + | ||
| 117 | + if (!SherpaOnnxSpeakerEmbeddingManagerContains(manager, "leijun")) { | ||
| 118 | + fprintf(stderr, "Failed to find leijun\n"); | ||
| 119 | + exit(-1); | ||
| 120 | + } | ||
| 121 | + | ||
| 122 | + if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 2) { | ||
| 123 | + fprintf(stderr, "There should be two speakers: fangjun and leijun\n"); | ||
| 124 | + exit(-1); | ||
| 125 | + } | ||
| 126 | + | ||
| 127 | + const char *const *all_speakers = | ||
| 128 | + SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager); | ||
| 129 | + const char *const *p = all_speakers; | ||
| 130 | + fprintf(stderr, "list of registered speakers\n-----\n"); | ||
| 131 | + while (p[0]) { | ||
| 132 | + fprintf(stderr, "speaker: %s\n", p[0]); | ||
| 133 | + ++p; | ||
| 134 | + } | ||
| 135 | + fprintf(stderr, "----\n"); | ||
| 136 | + | ||
| 137 | + SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers); | ||
| 138 | + | ||
| 139 | + const char *test1 = "./sr-data/test/fangjun-test-sr-1.wav"; | ||
| 140 | + const char *test2 = "./sr-data/test/leijun-test-sr-1.wav"; | ||
| 141 | + const char *test3 = "./sr-data/test/liudehua-test-sr-1.wav"; | ||
| 142 | + | ||
| 143 | + const float *v1 = ComputeEmbedding(ex, test1); | ||
| 144 | + const float *v2 = ComputeEmbedding(ex, test2); | ||
| 145 | + const float *v3 = ComputeEmbedding(ex, test3); | ||
| 146 | + | ||
| 147 | + float threshold = 0.6; | ||
| 148 | + | ||
| 149 | + const char *name1 = | ||
| 150 | + SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold); | ||
| 151 | + if (name1) { | ||
| 152 | + fprintf(stderr, "%s: Found %s\n", test1, name1); | ||
| 153 | + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1); | ||
| 154 | + } else { | ||
| 155 | + fprintf(stderr, "%s: Not found\n", test1); | ||
| 156 | + } | ||
| 157 | + | ||
| 158 | + const char *name2 = | ||
| 159 | + SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold); | ||
| 160 | + if (name2) { | ||
| 161 | + fprintf(stderr, "%s: Found %s\n", test2, name2); | ||
| 162 | + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2); | ||
| 163 | + } else { | ||
| 164 | + fprintf(stderr, "%s: Not found\n", test2); | ||
| 165 | + } | ||
| 166 | + | ||
| 167 | + const char *name3 = | ||
| 168 | + SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v3, threshold); | ||
| 169 | + if (name3) { | ||
| 170 | + fprintf(stderr, "%s: Found %s\n", test3, name3); | ||
| 171 | + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name3); | ||
| 172 | + } else { | ||
| 173 | + fprintf(stderr, "%s: Not found\n", test3); | ||
| 174 | + } | ||
| 175 | + | ||
| 176 | + int32_t ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v1, | ||
| 177 | + threshold); | ||
| 178 | + if (ok) { | ||
| 179 | + fprintf(stderr, "%s matches fangjun\n", test1); | ||
| 180 | + } else { | ||
| 181 | + fprintf(stderr, "%s does NOT match fangjun\n", test1); | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + ok = SherpaOnnxSpeakerEmbeddingManagerVerify(manager, "fangjun", v2, | ||
| 185 | + threshold); | ||
| 186 | + if (ok) { | ||
| 187 | + fprintf(stderr, "%s matches fangjun\n", test2); | ||
| 188 | + } else { | ||
| 189 | + fprintf(stderr, "%s does NOT match fangjun\n", test2); | ||
| 190 | + } | ||
| 191 | + | ||
| 192 | + fprintf(stderr, "Removing fangjun\n"); | ||
| 193 | + if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "fangjun")) { | ||
| 194 | + fprintf(stderr, "Failed to remove fangjun\n"); | ||
| 195 | + exit(-1); | ||
| 196 | + } | ||
| 197 | + | ||
| 198 | + if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 1) { | ||
| 199 | + fprintf(stderr, "There should be only 1 speaker left\n"); | ||
| 200 | + exit(-1); | ||
| 201 | + } | ||
| 202 | + | ||
| 203 | + name1 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v1, threshold); | ||
| 204 | + if (name1) { | ||
| 205 | + fprintf(stderr, "%s: Found %s\n", test1, name1); | ||
| 206 | + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name1); | ||
| 207 | + } else { | ||
| 208 | + fprintf(stderr, "%s: Not found\n", test1); | ||
| 209 | + } | ||
| 210 | + | ||
| 211 | + fprintf(stderr, "Removing leijun\n"); | ||
| 212 | + if (!SherpaOnnxSpeakerEmbeddingManagerRemove(manager, "leijun")) { | ||
| 213 | + fprintf(stderr, "Failed to remove leijun\n"); | ||
| 214 | + exit(-1); | ||
| 215 | + } | ||
| 216 | + | ||
| 217 | + if (SherpaOnnxSpeakerEmbeddingManagerNumSpeakers(manager) != 0) { | ||
| 218 | + fprintf(stderr, "There should be no speakers left\n"); | ||
| 219 | + exit(-1); | ||
| 220 | + } | ||
| 221 | + | ||
| 222 | + name2 = SherpaOnnxSpeakerEmbeddingManagerSearch(manager, v2, threshold); | ||
| 223 | + if (name2) { | ||
| 224 | + fprintf(stderr, "%s: Found %s\n", test2, name2); | ||
| 225 | + SherpaOnnxSpeakerEmbeddingManagerFreeSearch(name2); | ||
| 226 | + } else { | ||
| 227 | + fprintf(stderr, "%s: Not found\n", test2); | ||
| 228 | + } | ||
| 229 | + | ||
| 230 | + all_speakers = SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(manager); | ||
| 231 | + | ||
| 232 | + p = all_speakers; | ||
| 233 | + fprintf(stderr, "list of registered speakers\n-----\n"); | ||
| 234 | + while (p[0]) { | ||
| 235 | + fprintf(stderr, "speaker: %s\n", p[0]); | ||
| 236 | + ++p; | ||
| 237 | + } | ||
| 238 | + fprintf(stderr, "----\n"); | ||
| 239 | + | ||
| 240 | + SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(all_speakers); | ||
| 241 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v1); | ||
| 242 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v2); | ||
| 243 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(v3); | ||
| 244 | + | ||
| 245 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[0]); | ||
| 246 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[1]); | ||
| 247 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk1_vec[2]); | ||
| 248 | + | ||
| 249 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[0]); | ||
| 250 | + SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(spk2_vec[1]); | ||
| 251 | + | ||
| 252 | + SherpaOnnxDestroySpeakerEmbeddingManager(manager); | ||
| 253 | + SherpaOnnxDestroySpeakerEmbeddingExtractor(ex); | ||
| 254 | + | ||
| 255 | + return 0; | ||
| 256 | +} |
| 1 | +// c-api-examples/spoken-language-identification-c-api.c | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2024 Xiaomi Corporation | ||
| 1 | 4 | ||
| 2 | // We assume you have pre-downloaded the whisper multi-lingual models | 5 | // We assume you have pre-downloaded the whisper multi-lingual models |
| 3 | // from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | 6 | // from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models |
| @@ -83,7 +83,7 @@ class ViewController: UIViewController { | @@ -83,7 +83,7 @@ class ViewController: UIViewController { | ||
| 83 | // Please select one model that is best suitable for you. | 83 | // Please select one model that is best suitable for you. |
| 84 | // | 84 | // |
| 85 | // You can also modify Model.swift to add new pre-trained models from | 85 | // You can also modify Model.swift to add new pre-trained models from |
| 86 | - // https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html | 86 | + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html |
| 87 | 87 | ||
| 88 | // let modelConfig = getBilingualStreamZhEnZipformer20230220() | 88 | // let modelConfig = getBilingualStreamZhEnZipformer20230220() |
| 89 | // let modelConfig = getZhZipformer20230615() | 89 | // let modelConfig = getZhZipformer20230615() |
| @@ -4,7 +4,7 @@ | @@ -4,7 +4,7 @@ | ||
| 4 | // | 4 | // |
| 5 | // Created by fangjun on 2023/11/23. | 5 | // Created by fangjun on 2023/11/23. |
| 6 | // | 6 | // |
| 7 | -// Speech-to-text with Next-gen Kaldi on iOS without Internet connection | 7 | +// Text-to-speech with Next-gen Kaldi on iOS without Internet connection |
| 8 | 8 | ||
| 9 | import SwiftUI | 9 | import SwiftUI |
| 10 | import AVFoundation | 10 | import AVFoundation |
| @@ -183,7 +183,7 @@ event = threading.Event() | @@ -183,7 +183,7 @@ event = threading.Event() | ||
| 183 | first_message_time = None | 183 | first_message_time = None |
| 184 | 184 | ||
| 185 | 185 | ||
| 186 | -def generated_audio_callback(samples: np.ndarray): | 186 | +def generated_audio_callback(samples: np.ndarray, progress: float): |
| 187 | """This function is called whenever max_num_sentences sentences | 187 | """This function is called whenever max_num_sentences sentences |
| 188 | have been processed. | 188 | have been processed. |
| 189 | 189 |
| @@ -16,6 +16,8 @@ | @@ -16,6 +16,8 @@ | ||
| 16 | #include "sherpa-onnx/csrc/macros.h" | 16 | #include "sherpa-onnx/csrc/macros.h" |
| 17 | #include "sherpa-onnx/csrc/offline-recognizer.h" | 17 | #include "sherpa-onnx/csrc/offline-recognizer.h" |
| 18 | #include "sherpa-onnx/csrc/online-recognizer.h" | 18 | #include "sherpa-onnx/csrc/online-recognizer.h" |
| 19 | +#include "sherpa-onnx/csrc/speaker-embedding-extractor.h" | ||
| 20 | +#include "sherpa-onnx/csrc/speaker-embedding-manager.h" | ||
| 19 | #include "sherpa-onnx/csrc/spoken-language-identification.h" | 21 | #include "sherpa-onnx/csrc/spoken-language-identification.h" |
| 20 | #include "sherpa-onnx/csrc/voice-activity-detector.h" | 22 | #include "sherpa-onnx/csrc/voice-activity-detector.h" |
| 21 | #include "sherpa-onnx/csrc/wave-reader.h" | 23 | #include "sherpa-onnx/csrc/wave-reader.h" |
| @@ -114,7 +116,7 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( | @@ -114,7 +116,7 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( | ||
| 114 | return recognizer; | 116 | return recognizer; |
| 115 | } | 117 | } |
| 116 | 118 | ||
| 117 | -void DestroyOnlineRecognizer(SherpaOnnxOnlineRecognizer *recognizer) { | 119 | +void DestroyOnlineRecognizer(const SherpaOnnxOnlineRecognizer *recognizer) { |
| 118 | delete recognizer; | 120 | delete recognizer; |
| 119 | } | 121 | } |
| 120 | 122 | ||
| @@ -132,25 +134,28 @@ SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords( | @@ -132,25 +134,28 @@ SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords( | ||
| 132 | return stream; | 134 | return stream; |
| 133 | } | 135 | } |
| 134 | 136 | ||
| 135 | -void DestroyOnlineStream(SherpaOnnxOnlineStream *stream) { delete stream; } | 137 | +void DestroyOnlineStream(const SherpaOnnxOnlineStream *stream) { |
| 138 | + delete stream; | ||
| 139 | +} | ||
| 136 | 140 | ||
| 137 | -void AcceptWaveform(SherpaOnnxOnlineStream *stream, int32_t sample_rate, | 141 | +void AcceptWaveform(const SherpaOnnxOnlineStream *stream, int32_t sample_rate, |
| 138 | const float *samples, int32_t n) { | 142 | const float *samples, int32_t n) { |
| 139 | stream->impl->AcceptWaveform(sample_rate, samples, n); | 143 | stream->impl->AcceptWaveform(sample_rate, samples, n); |
| 140 | } | 144 | } |
| 141 | 145 | ||
| 142 | -int32_t IsOnlineStreamReady(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 143 | - SherpaOnnxOnlineStream *stream) { | 146 | +int32_t IsOnlineStreamReady(const SherpaOnnxOnlineRecognizer *recognizer, |
| 147 | + const SherpaOnnxOnlineStream *stream) { | ||
| 144 | return recognizer->impl->IsReady(stream->impl.get()); | 148 | return recognizer->impl->IsReady(stream->impl.get()); |
| 145 | } | 149 | } |
| 146 | 150 | ||
| 147 | -void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 148 | - SherpaOnnxOnlineStream *stream) { | 151 | +void DecodeOnlineStream(const SherpaOnnxOnlineRecognizer *recognizer, |
| 152 | + const SherpaOnnxOnlineStream *stream) { | ||
| 149 | recognizer->impl->DecodeStream(stream->impl.get()); | 153 | recognizer->impl->DecodeStream(stream->impl.get()); |
| 150 | } | 154 | } |
| 151 | 155 | ||
| 152 | -void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 153 | - SherpaOnnxOnlineStream **streams, int32_t n) { | 156 | +void DecodeMultipleOnlineStreams(const SherpaOnnxOnlineRecognizer *recognizer, |
| 157 | + const SherpaOnnxOnlineStream **streams, | ||
| 158 | + int32_t n) { | ||
| 154 | std::vector<sherpa_onnx::OnlineStream *> ss(n); | 159 | std::vector<sherpa_onnx::OnlineStream *> ss(n); |
| 155 | for (int32_t i = 0; i != n; ++i) { | 160 | for (int32_t i = 0; i != n; ++i) { |
| 156 | ss[i] = streams[i]->impl.get(); | 161 | ss[i] = streams[i]->impl.get(); |
| @@ -159,7 +164,8 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer, | @@ -159,7 +164,8 @@ void DecodeMultipleOnlineStreams(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 159 | } | 164 | } |
| 160 | 165 | ||
| 161 | const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( | 166 | const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( |
| 162 | - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream) { | 167 | + const SherpaOnnxOnlineRecognizer *recognizer, |
| 168 | + const SherpaOnnxOnlineStream *stream) { | ||
| 163 | sherpa_onnx::OnlineRecognizerResult result = | 169 | sherpa_onnx::OnlineRecognizerResult result = |
| 164 | recognizer->impl->GetResult(stream->impl.get()); | 170 | recognizer->impl->GetResult(stream->impl.get()); |
| 165 | const auto &text = result.text; | 171 | const auto &text = result.text; |
| @@ -232,29 +238,30 @@ void DestroyOnlineRecognizerResult(const SherpaOnnxOnlineRecognizerResult *r) { | @@ -232,29 +238,30 @@ void DestroyOnlineRecognizerResult(const SherpaOnnxOnlineRecognizerResult *r) { | ||
| 232 | } | 238 | } |
| 233 | } | 239 | } |
| 234 | 240 | ||
| 235 | -void Reset(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 236 | - SherpaOnnxOnlineStream *stream) { | 241 | +void Reset(const SherpaOnnxOnlineRecognizer *recognizer, |
| 242 | + const SherpaOnnxOnlineStream *stream) { | ||
| 237 | recognizer->impl->Reset(stream->impl.get()); | 243 | recognizer->impl->Reset(stream->impl.get()); |
| 238 | } | 244 | } |
| 239 | 245 | ||
| 240 | -void InputFinished(SherpaOnnxOnlineStream *stream) { | 246 | +void InputFinished(const SherpaOnnxOnlineStream *stream) { |
| 241 | stream->impl->InputFinished(); | 247 | stream->impl->InputFinished(); |
| 242 | } | 248 | } |
| 243 | 249 | ||
| 244 | -int32_t IsEndpoint(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 245 | - SherpaOnnxOnlineStream *stream) { | 250 | +int32_t IsEndpoint(const SherpaOnnxOnlineRecognizer *recognizer, |
| 251 | + const SherpaOnnxOnlineStream *stream) { | ||
| 246 | return recognizer->impl->IsEndpoint(stream->impl.get()); | 252 | return recognizer->impl->IsEndpoint(stream->impl.get()); |
| 247 | } | 253 | } |
| 248 | 254 | ||
| 249 | -SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line) { | 255 | +const SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line) { |
| 250 | SherpaOnnxDisplay *ans = new SherpaOnnxDisplay; | 256 | SherpaOnnxDisplay *ans = new SherpaOnnxDisplay; |
| 251 | ans->impl = std::make_unique<sherpa_onnx::Display>(max_word_per_line); | 257 | ans->impl = std::make_unique<sherpa_onnx::Display>(max_word_per_line); |
| 252 | return ans; | 258 | return ans; |
| 253 | } | 259 | } |
| 254 | 260 | ||
| 255 | -void DestroyDisplay(SherpaOnnxDisplay *display) { delete display; } | 261 | +void DestroyDisplay(const SherpaOnnxDisplay *display) { delete display; } |
| 256 | 262 | ||
| 257 | -void SherpaOnnxPrint(SherpaOnnxDisplay *display, int32_t idx, const char *s) { | 263 | +void SherpaOnnxPrint(const SherpaOnnxDisplay *display, int32_t idx, |
| 264 | + const char *s) { | ||
| 258 | display->impl->Print(idx, s); | 265 | display->impl->Print(idx, s); |
| 259 | } | 266 | } |
| 260 | 267 | ||
| @@ -808,9 +815,8 @@ int32_t SherpaOnnxOfflineTtsNumSpeakers(const SherpaOnnxOfflineTts *tts) { | @@ -808,9 +815,8 @@ int32_t SherpaOnnxOfflineTtsNumSpeakers(const SherpaOnnxOfflineTts *tts) { | ||
| 808 | } | 815 | } |
| 809 | 816 | ||
| 810 | static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal( | 817 | static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal( |
| 811 | - const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, | ||
| 812 | - float speed, std::function<void(const float *, int32_t, float)> callback) | ||
| 813 | -{ | 818 | + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, |
| 819 | + std::function<void(const float *, int32_t, float)> callback) { | ||
| 814 | sherpa_onnx::GeneratedAudio audio = | 820 | sherpa_onnx::GeneratedAudio audio = |
| 815 | tts->impl->Generate(text, sid, speed, callback); | 821 | tts->impl->Generate(text, sid, speed, callback); |
| 816 | 822 | ||
| @@ -833,36 +839,37 @@ static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal( | @@ -833,36 +839,37 @@ static const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateInternal( | ||
| 833 | const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( | 839 | const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( |
| 834 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, | 840 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, |
| 835 | float speed) { | 841 | float speed) { |
| 836 | - return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, nullptr ); | 842 | + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, nullptr); |
| 837 | } | 843 | } |
| 838 | 844 | ||
| 839 | const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallback( | 845 | const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallback( |
| 840 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, | 846 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, |
| 841 | SherpaOnnxGeneratedAudioCallback callback) { | 847 | SherpaOnnxGeneratedAudioCallback callback) { |
| 842 | - auto wrapper = [callback](const float *samples, int32_t n, float /*progress*/) { | ||
| 843 | - callback(samples, n ); | ||
| 844 | - }; | 848 | + auto wrapper = [callback](const float *samples, int32_t n, |
| 849 | + float /*progress*/) { callback(samples, n); }; | ||
| 845 | 850 | ||
| 846 | - return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper ); | 851 | + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper); |
| 847 | } | 852 | } |
| 848 | 853 | ||
| 849 | -const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithProgressCallback( | 854 | +const SherpaOnnxGeneratedAudio * |
| 855 | +SherpaOnnxOfflineTtsGenerateWithProgressCallback( | ||
| 850 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, | 856 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, |
| 851 | SherpaOnnxGeneratedAudioProgressCallback callback) { | 857 | SherpaOnnxGeneratedAudioProgressCallback callback) { |
| 852 | auto wrapper = [callback](const float *samples, int32_t n, float progress) { | 858 | auto wrapper = [callback](const float *samples, int32_t n, float progress) { |
| 853 | - callback(samples, n, progress ); | 859 | + callback(samples, n, progress); |
| 854 | }; | 860 | }; |
| 855 | - return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper ); | 861 | + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper); |
| 856 | } | 862 | } |
| 857 | 863 | ||
| 858 | const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallbackWithArg( | 864 | const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerateWithCallbackWithArg( |
| 859 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, | 865 | const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, float speed, |
| 860 | SherpaOnnxGeneratedAudioCallbackWithArg callback, void *arg) { | 866 | SherpaOnnxGeneratedAudioCallbackWithArg callback, void *arg) { |
| 861 | - auto wrapper = [callback, arg](const float *samples, int32_t n, float /*progress*/) { | 867 | + auto wrapper = [callback, arg](const float *samples, int32_t n, |
| 868 | + float /*progress*/) { | ||
| 862 | callback(samples, n, arg); | 869 | callback(samples, n, arg); |
| 863 | }; | 870 | }; |
| 864 | 871 | ||
| 865 | - return SherpaOnnxOfflineTtsGenerateInternal( tts, text, sid, speed, wrapper ); | 872 | + return SherpaOnnxOfflineTtsGenerateInternal(tts, text, sid, speed, wrapper); |
| 866 | } | 873 | } |
| 867 | 874 | ||
| 868 | void SherpaOnnxDestroyOfflineTtsGeneratedAudio( | 875 | void SherpaOnnxDestroyOfflineTtsGeneratedAudio( |
| @@ -972,3 +979,200 @@ void SherpaOnnxDestroySpokenLanguageIdentificationResult( | @@ -972,3 +979,200 @@ void SherpaOnnxDestroySpokenLanguageIdentificationResult( | ||
| 972 | delete r; | 979 | delete r; |
| 973 | } | 980 | } |
| 974 | } | 981 | } |
| 982 | + | ||
| 983 | +struct SherpaOnnxSpeakerEmbeddingExtractor { | ||
| 984 | + std::unique_ptr<sherpa_onnx::SpeakerEmbeddingExtractor> impl; | ||
| 985 | +}; | ||
| 986 | + | ||
| 987 | +const SherpaOnnxSpeakerEmbeddingExtractor * | ||
| 988 | +SherpaOnnxCreateSpeakerEmbeddingExtractor( | ||
| 989 | + const SherpaOnnxSpeakerEmbeddingExtractorConfig *config) { | ||
| 990 | + sherpa_onnx::SpeakerEmbeddingExtractorConfig c; | ||
| 991 | + c.model = SHERPA_ONNX_OR(config->model, ""); | ||
| 992 | + | ||
| 993 | + c.num_threads = SHERPA_ONNX_OR(config->num_threads, 1); | ||
| 994 | + c.debug = SHERPA_ONNX_OR(config->debug, 0); | ||
| 995 | + c.provider = SHERPA_ONNX_OR(config->provider, "cpu"); | ||
| 996 | + | ||
| 997 | + if (config->debug) { | ||
| 998 | + SHERPA_ONNX_LOGE("%s\n", c.ToString().c_str()); | ||
| 999 | + } | ||
| 1000 | + | ||
| 1001 | + if (!c.Validate()) { | ||
| 1002 | + SHERPA_ONNX_LOGE("Errors in config!"); | ||
| 1003 | + return nullptr; | ||
| 1004 | + } | ||
| 1005 | + | ||
| 1006 | + auto p = new SherpaOnnxSpeakerEmbeddingExtractor; | ||
| 1007 | + | ||
| 1008 | + p->impl = std::make_unique<sherpa_onnx::SpeakerEmbeddingExtractor>(c); | ||
| 1009 | + | ||
| 1010 | + return p; | ||
| 1011 | +} | ||
| 1012 | + | ||
| 1013 | +void SherpaOnnxDestroySpeakerEmbeddingExtractor( | ||
| 1014 | + const SherpaOnnxSpeakerEmbeddingExtractor *p) { | ||
| 1015 | + delete p; | ||
| 1016 | +} | ||
| 1017 | + | ||
| 1018 | +int32_t SherpaOnnxSpeakerEmbeddingExtractorDim( | ||
| 1019 | + const SherpaOnnxSpeakerEmbeddingExtractor *p) { | ||
| 1020 | + return p->impl->Dim(); | ||
| 1021 | +} | ||
| 1022 | + | ||
| 1023 | +const SherpaOnnxOnlineStream *SherpaOnnxSpeakerEmbeddingExtractorCreateStream( | ||
| 1024 | + const SherpaOnnxSpeakerEmbeddingExtractor *p) { | ||
| 1025 | + SherpaOnnxOnlineStream *stream = | ||
| 1026 | + new SherpaOnnxOnlineStream(p->impl->CreateStream()); | ||
| 1027 | + return stream; | ||
| 1028 | +} | ||
| 1029 | + | ||
| 1030 | +int32_t SherpaOnnxSpeakerEmbeddingExtractorIsReady( | ||
| 1031 | + const SherpaOnnxSpeakerEmbeddingExtractor *p, | ||
| 1032 | + const SherpaOnnxOnlineStream *s) { | ||
| 1033 | + return p->impl->IsReady(s->impl.get()); | ||
| 1034 | +} | ||
| 1035 | + | ||
| 1036 | +const float *SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding( | ||
| 1037 | + const SherpaOnnxSpeakerEmbeddingExtractor *p, | ||
| 1038 | + const SherpaOnnxOnlineStream *s) { | ||
| 1039 | + std::vector<float> v = p->impl->Compute(s->impl.get()); | ||
| 1040 | + float *ans = new float[v.size()]; | ||
| 1041 | + std::copy(v.begin(), v.end(), ans); | ||
| 1042 | + return ans; | ||
| 1043 | +} | ||
| 1044 | + | ||
| 1045 | +void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding(const float *v) { | ||
| 1046 | + delete[] v; | ||
| 1047 | +} | ||
| 1048 | + | ||
| 1049 | +struct SherpaOnnxSpeakerEmbeddingManager { | ||
| 1050 | + std::unique_ptr<sherpa_onnx::SpeakerEmbeddingManager> impl; | ||
| 1051 | +}; | ||
| 1052 | + | ||
| 1053 | +const SherpaOnnxSpeakerEmbeddingManager * | ||
| 1054 | +SherpaOnnxCreateSpeakerEmbeddingManager(int32_t dim) { | ||
| 1055 | + auto p = new SherpaOnnxSpeakerEmbeddingManager; | ||
| 1056 | + p->impl = std::make_unique<sherpa_onnx::SpeakerEmbeddingManager>(dim); | ||
| 1057 | + return p; | ||
| 1058 | +} | ||
| 1059 | + | ||
| 1060 | +void SherpaOnnxDestroySpeakerEmbeddingManager( | ||
| 1061 | + const SherpaOnnxSpeakerEmbeddingManager *p) { | ||
| 1062 | + delete p; | ||
| 1063 | +} | ||
| 1064 | + | ||
| 1065 | +int32_t SherpaOnnxSpeakerEmbeddingManagerAdd( | ||
| 1066 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, | ||
| 1067 | + const float *v) { | ||
| 1068 | + return p->impl->Add(name, v); | ||
| 1069 | +} | ||
| 1070 | + | ||
| 1071 | +int32_t SherpaOnnxSpeakerEmbeddingManagerAddList( | ||
| 1072 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, | ||
| 1073 | + const float **v) { | ||
| 1074 | + int32_t n = 0; | ||
| 1075 | + auto q = v; | ||
| 1076 | + while (q && q[0]) { | ||
| 1077 | + ++n; | ||
| 1078 | + ++q; | ||
| 1079 | + } | ||
| 1080 | + | ||
| 1081 | + if (n == 0) { | ||
| 1082 | + SHERPA_ONNX_LOGE("Empty embedding!"); | ||
| 1083 | + return 0; | ||
| 1084 | + } | ||
| 1085 | + | ||
| 1086 | + std::vector<std::vector<float>> vec(n); | ||
| 1087 | + int32_t dim = p->impl->Dim(); | ||
| 1088 | + | ||
| 1089 | + for (int32_t i = 0; i != n; ++i) { | ||
| 1090 | + vec[i] = std::vector<float>(v[i], v[i] + dim); | ||
| 1091 | + } | ||
| 1092 | + | ||
| 1093 | + return p->impl->Add(name, vec); | ||
| 1094 | +} | ||
| 1095 | + | ||
| 1096 | +int32_t SherpaOnnxSpeakerEmbeddingManagerAddListFlattened( | ||
| 1097 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, | ||
| 1098 | + const float *v, int32_t n) { | ||
| 1099 | + std::vector<std::vector<float>> vec(n); | ||
| 1100 | + | ||
| 1101 | + int32_t dim = p->impl->Dim(); | ||
| 1102 | + | ||
| 1103 | + for (int32_t i = 0; i != n; ++i, v += dim) { | ||
| 1104 | + vec[i] = std::vector<float>(v, v + dim); | ||
| 1105 | + } | ||
| 1106 | + | ||
| 1107 | + return p->impl->Add(name, vec); | ||
| 1108 | +} | ||
| 1109 | + | ||
| 1110 | +int32_t SherpaOnnxSpeakerEmbeddingManagerRemove( | ||
| 1111 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name) { | ||
| 1112 | + return p->impl->Remove(name); | ||
| 1113 | +} | ||
| 1114 | + | ||
| 1115 | +const char *SherpaOnnxSpeakerEmbeddingManagerSearch( | ||
| 1116 | + const SherpaOnnxSpeakerEmbeddingManager *p, const float *v, | ||
| 1117 | + float threshold) { | ||
| 1118 | + auto r = p->impl->Search(v, threshold); | ||
| 1119 | + if (r.empty()) { | ||
| 1120 | + return nullptr; | ||
| 1121 | + } | ||
| 1122 | + | ||
| 1123 | + char *name = new char[r.size() + 1]; | ||
| 1124 | + std::copy(r.begin(), r.end(), name); | ||
| 1125 | + name[r.size()] = '\0'; | ||
| 1126 | + | ||
| 1127 | + return name; | ||
| 1128 | +} | ||
| 1129 | + | ||
| 1130 | +void SherpaOnnxSpeakerEmbeddingManagerFreeSearch(const char *name) { | ||
| 1131 | + delete[] name; | ||
| 1132 | +} | ||
| 1133 | + | ||
| 1134 | +int32_t SherpaOnnxSpeakerEmbeddingManagerVerify( | ||
| 1135 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, | ||
| 1136 | + const float *v, float threshold) { | ||
| 1137 | + return p->impl->Verify(name, v, threshold); | ||
| 1138 | +} | ||
| 1139 | + | ||
| 1140 | +int32_t SherpaOnnxSpeakerEmbeddingManagerContains( | ||
| 1141 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name) { | ||
| 1142 | + return p->impl->Contains(name); | ||
| 1143 | +} | ||
| 1144 | + | ||
| 1145 | +int32_t SherpaOnnxSpeakerEmbeddingManagerNumSpeakers( | ||
| 1146 | + const SherpaOnnxSpeakerEmbeddingManager *p) { | ||
| 1147 | + return p->impl->NumSpeakers(); | ||
| 1148 | +} | ||
| 1149 | + | ||
| 1150 | +const char *const *SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers( | ||
| 1151 | + const SherpaOnnxSpeakerEmbeddingManager *manager) { | ||
| 1152 | + std::vector<std::string> all_speakers = manager->impl->GetAllSpeakers(); | ||
| 1153 | + int32_t num_speakers = all_speakers.size(); | ||
| 1154 | + char **p = new char *[num_speakers + 1]; | ||
| 1155 | + p[num_speakers] = nullptr; | ||
| 1156 | + | ||
| 1157 | + int32_t i = 0; | ||
| 1158 | + for (const auto &name : all_speakers) { | ||
| 1159 | + p[i] = new char[name.size() + 1]; | ||
| 1160 | + std::copy(name.begin(), name.end(), p[i]); | ||
| 1161 | + p[i][name.size()] = '\0'; | ||
| 1162 | + | ||
| 1163 | + i += 1; | ||
| 1164 | + } | ||
| 1165 | + return p; | ||
| 1166 | +} | ||
| 1167 | + | ||
| 1168 | +void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers( | ||
| 1169 | + const char *const *names) { | ||
| 1170 | + auto p = names; | ||
| 1171 | + | ||
| 1172 | + while (p && p[0]) { | ||
| 1173 | + delete[] p[0]; | ||
| 1174 | + ++p; | ||
| 1175 | + } | ||
| 1176 | + | ||
| 1177 | + delete[] names; | ||
| 1178 | +} |
| @@ -186,7 +186,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( | @@ -186,7 +186,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( | ||
| 186 | /// | 186 | /// |
| 187 | /// @param p A pointer returned by CreateOnlineRecognizer() | 187 | /// @param p A pointer returned by CreateOnlineRecognizer() |
| 188 | SHERPA_ONNX_API void DestroyOnlineRecognizer( | 188 | SHERPA_ONNX_API void DestroyOnlineRecognizer( |
| 189 | - SherpaOnnxOnlineRecognizer *recognizer); | 189 | + const SherpaOnnxOnlineRecognizer *recognizer); |
| 190 | 190 | ||
| 191 | /// Create an online stream for accepting wave samples. | 191 | /// Create an online stream for accepting wave samples. |
| 192 | /// | 192 | /// |
| @@ -208,7 +208,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords( | @@ -208,7 +208,7 @@ SHERPA_ONNX_API SherpaOnnxOnlineStream *CreateOnlineStreamWithHotwords( | ||
| 208 | /// Destroy an online stream. | 208 | /// Destroy an online stream. |
| 209 | /// | 209 | /// |
| 210 | /// @param stream A pointer returned by CreateOnlineStream() | 210 | /// @param stream A pointer returned by CreateOnlineStream() |
| 211 | -SHERPA_ONNX_API void DestroyOnlineStream(SherpaOnnxOnlineStream *stream); | 211 | +SHERPA_ONNX_API void DestroyOnlineStream(const SherpaOnnxOnlineStream *stream); |
| 212 | 212 | ||
| 213 | /// Accept input audio samples and compute the features. | 213 | /// Accept input audio samples and compute the features. |
| 214 | /// The user has to invoke DecodeOnlineStream() to run the neural network and | 214 | /// The user has to invoke DecodeOnlineStream() to run the neural network and |
| @@ -221,7 +221,7 @@ SHERPA_ONNX_API void DestroyOnlineStream(SherpaOnnxOnlineStream *stream); | @@ -221,7 +221,7 @@ SHERPA_ONNX_API void DestroyOnlineStream(SherpaOnnxOnlineStream *stream); | ||
| 221 | /// @param samples A pointer to a 1-D array containing audio samples. | 221 | /// @param samples A pointer to a 1-D array containing audio samples. |
| 222 | /// The range of samples has to be normalized to [-1, 1]. | 222 | /// The range of samples has to be normalized to [-1, 1]. |
| 223 | /// @param n Number of elements in the samples array. | 223 | /// @param n Number of elements in the samples array. |
| 224 | -SHERPA_ONNX_API void AcceptWaveform(SherpaOnnxOnlineStream *stream, | 224 | +SHERPA_ONNX_API void AcceptWaveform(const SherpaOnnxOnlineStream *stream, |
| 225 | int32_t sample_rate, const float *samples, | 225 | int32_t sample_rate, const float *samples, |
| 226 | int32_t n); | 226 | int32_t n); |
| 227 | 227 | ||
| @@ -230,8 +230,9 @@ SHERPA_ONNX_API void AcceptWaveform(SherpaOnnxOnlineStream *stream, | @@ -230,8 +230,9 @@ SHERPA_ONNX_API void AcceptWaveform(SherpaOnnxOnlineStream *stream, | ||
| 230 | /// | 230 | /// |
| 231 | /// @param recognizer A pointer returned by CreateOnlineRecognizer | 231 | /// @param recognizer A pointer returned by CreateOnlineRecognizer |
| 232 | /// @param stream A pointer returned by CreateOnlineStream | 232 | /// @param stream A pointer returned by CreateOnlineStream |
| 233 | -SHERPA_ONNX_API int32_t IsOnlineStreamReady( | ||
| 234 | - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream); | 233 | +SHERPA_ONNX_API int32_t |
| 234 | +IsOnlineStreamReady(const SherpaOnnxOnlineRecognizer *recognizer, | ||
| 235 | + const SherpaOnnxOnlineStream *stream); | ||
| 235 | 236 | ||
| 236 | /// Call this function to run the neural network model and decoding. | 237 | /// Call this function to run the neural network model and decoding. |
| 237 | // | 238 | // |
| @@ -243,8 +244,9 @@ SHERPA_ONNX_API int32_t IsOnlineStreamReady( | @@ -243,8 +244,9 @@ SHERPA_ONNX_API int32_t IsOnlineStreamReady( | ||
| 243 | /// DecodeOnlineStream(recognizer, stream); | 244 | /// DecodeOnlineStream(recognizer, stream); |
| 244 | /// } | 245 | /// } |
| 245 | /// | 246 | /// |
| 246 | -SHERPA_ONNX_API void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 247 | - SherpaOnnxOnlineStream *stream); | 247 | +SHERPA_ONNX_API void DecodeOnlineStream( |
| 248 | + const SherpaOnnxOnlineRecognizer *recognizer, | ||
| 249 | + const SherpaOnnxOnlineStream *stream); | ||
| 248 | 250 | ||
| 249 | /// This function is similar to DecodeOnlineStream(). It decodes multiple | 251 | /// This function is similar to DecodeOnlineStream(). It decodes multiple |
| 250 | /// OnlineStream in parallel. | 252 | /// OnlineStream in parallel. |
| @@ -257,8 +259,8 @@ SHERPA_ONNX_API void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer, | @@ -257,8 +259,8 @@ SHERPA_ONNX_API void DecodeOnlineStream(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 257 | /// CreateOnlineRecognizer() | 259 | /// CreateOnlineRecognizer() |
| 258 | /// @param n Number of elements in the given streams array. | 260 | /// @param n Number of elements in the given streams array. |
| 259 | SHERPA_ONNX_API void DecodeMultipleOnlineStreams( | 261 | SHERPA_ONNX_API void DecodeMultipleOnlineStreams( |
| 260 | - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream **streams, | ||
| 261 | - int32_t n); | 262 | + const SherpaOnnxOnlineRecognizer *recognizer, |
| 263 | + const SherpaOnnxOnlineStream **streams, int32_t n); | ||
| 262 | 264 | ||
| 263 | /// Get the decoding results so far for an OnlineStream. | 265 | /// Get the decoding results so far for an OnlineStream. |
| 264 | /// | 266 | /// |
| @@ -268,7 +270,8 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams( | @@ -268,7 +270,8 @@ SHERPA_ONNX_API void DecodeMultipleOnlineStreams( | ||
| 268 | /// DestroyOnlineRecognizerResult() to free the returned pointer to | 270 | /// DestroyOnlineRecognizerResult() to free the returned pointer to |
| 269 | /// avoid memory leak. | 271 | /// avoid memory leak. |
| 270 | SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( | 272 | SHERPA_ONNX_API const SherpaOnnxOnlineRecognizerResult *GetOnlineStreamResult( |
| 271 | - SherpaOnnxOnlineRecognizer *recognizer, SherpaOnnxOnlineStream *stream); | 273 | + const SherpaOnnxOnlineRecognizer *recognizer, |
| 274 | + const SherpaOnnxOnlineStream *stream); | ||
| 272 | 275 | ||
| 273 | /// Destroy the pointer returned by GetOnlineStreamResult(). | 276 | /// Destroy the pointer returned by GetOnlineStreamResult(). |
| 274 | /// | 277 | /// |
| @@ -281,35 +284,36 @@ SHERPA_ONNX_API void DestroyOnlineRecognizerResult( | @@ -281,35 +284,36 @@ SHERPA_ONNX_API void DestroyOnlineRecognizerResult( | ||
| 281 | /// | 284 | /// |
| 282 | /// @param recognizer A pointer returned by CreateOnlineRecognizer(). | 285 | /// @param recognizer A pointer returned by CreateOnlineRecognizer(). |
| 283 | /// @param stream A pointer returned by CreateOnlineStream | 286 | /// @param stream A pointer returned by CreateOnlineStream |
| 284 | -SHERPA_ONNX_API void Reset(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 285 | - SherpaOnnxOnlineStream *stream); | 287 | +SHERPA_ONNX_API void Reset(const SherpaOnnxOnlineRecognizer *recognizer, |
| 288 | + const SherpaOnnxOnlineStream *stream); | ||
| 286 | 289 | ||
| 287 | /// Signal that no more audio samples would be available. | 290 | /// Signal that no more audio samples would be available. |
| 288 | /// After this call, you cannot call AcceptWaveform() any more. | 291 | /// After this call, you cannot call AcceptWaveform() any more. |
| 289 | /// | 292 | /// |
| 290 | /// @param stream A pointer returned by CreateOnlineStream() | 293 | /// @param stream A pointer returned by CreateOnlineStream() |
| 291 | -SHERPA_ONNX_API void InputFinished(SherpaOnnxOnlineStream *stream); | 294 | +SHERPA_ONNX_API void InputFinished(const SherpaOnnxOnlineStream *stream); |
| 292 | 295 | ||
| 293 | /// Return 1 if an endpoint has been detected. | 296 | /// Return 1 if an endpoint has been detected. |
| 294 | /// | 297 | /// |
| 295 | /// @param recognizer A pointer returned by CreateOnlineRecognizer() | 298 | /// @param recognizer A pointer returned by CreateOnlineRecognizer() |
| 296 | /// @param stream A pointer returned by CreateOnlineStream() | 299 | /// @param stream A pointer returned by CreateOnlineStream() |
| 297 | /// @return Return 1 if an endpoint is detected. Return 0 otherwise. | 300 | /// @return Return 1 if an endpoint is detected. Return 0 otherwise. |
| 298 | -SHERPA_ONNX_API int32_t IsEndpoint(SherpaOnnxOnlineRecognizer *recognizer, | ||
| 299 | - SherpaOnnxOnlineStream *stream); | 301 | +SHERPA_ONNX_API int32_t IsEndpoint(const SherpaOnnxOnlineRecognizer *recognizer, |
| 302 | + const SherpaOnnxOnlineStream *stream); | ||
| 300 | 303 | ||
| 301 | // for displaying results on Linux/macOS. | 304 | // for displaying results on Linux/macOS. |
| 302 | SHERPA_ONNX_API typedef struct SherpaOnnxDisplay SherpaOnnxDisplay; | 305 | SHERPA_ONNX_API typedef struct SherpaOnnxDisplay SherpaOnnxDisplay; |
| 303 | 306 | ||
| 304 | /// Create a display object. Must be freed using DestroyDisplay to avoid | 307 | /// Create a display object. Must be freed using DestroyDisplay to avoid |
| 305 | /// memory leak. | 308 | /// memory leak. |
| 306 | -SHERPA_ONNX_API SherpaOnnxDisplay *CreateDisplay(int32_t max_word_per_line); | 309 | +SHERPA_ONNX_API const SherpaOnnxDisplay *CreateDisplay( |
| 310 | + int32_t max_word_per_line); | ||
| 307 | 311 | ||
| 308 | -SHERPA_ONNX_API void DestroyDisplay(SherpaOnnxDisplay *display); | 312 | +SHERPA_ONNX_API void DestroyDisplay(const SherpaOnnxDisplay *display); |
| 309 | 313 | ||
| 310 | /// Print the result. | 314 | /// Print the result. |
| 311 | -SHERPA_ONNX_API void SherpaOnnxPrint(SherpaOnnxDisplay *display, int32_t idx, | ||
| 312 | - const char *s); | 315 | +SHERPA_ONNX_API void SherpaOnnxPrint(const SherpaOnnxDisplay *display, |
| 316 | + int32_t idx, const char *s); | ||
| 313 | // ============================================================ | 317 | // ============================================================ |
| 314 | // For offline ASR (i.e., non-streaming ASR) | 318 | // For offline ASR (i.e., non-streaming ASR) |
| 315 | // ============================================================ | 319 | // ============================================================ |
| @@ -769,7 +773,7 @@ typedef void (*SherpaOnnxGeneratedAudioCallbackWithArg)(const float *samples, | @@ -769,7 +773,7 @@ typedef void (*SherpaOnnxGeneratedAudioCallbackWithArg)(const float *samples, | ||
| 769 | int32_t n, void *arg); | 773 | int32_t n, void *arg); |
| 770 | 774 | ||
| 771 | typedef void (*SherpaOnnxGeneratedAudioProgressCallback)(const float *samples, | 775 | typedef void (*SherpaOnnxGeneratedAudioProgressCallback)(const float *samples, |
| 772 | - int32_t n, float p); | 776 | + int32_t n, float p); |
| 773 | 777 | ||
| 774 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; | 778 | SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTts SherpaOnnxOfflineTts; |
| 775 | 779 | ||
| @@ -839,7 +843,9 @@ SHERPA_ONNX_API const SherpaOnnxWave *SherpaOnnxReadWave(const char *filename); | @@ -839,7 +843,9 @@ SHERPA_ONNX_API const SherpaOnnxWave *SherpaOnnxReadWave(const char *filename); | ||
| 839 | 843 | ||
| 840 | SHERPA_ONNX_API void SherpaOnnxFreeWave(const SherpaOnnxWave *wave); | 844 | SHERPA_ONNX_API void SherpaOnnxFreeWave(const SherpaOnnxWave *wave); |
| 841 | 845 | ||
| 842 | -// Spoken language identification | 846 | +// ============================================================ |
| 847 | +// For spoken language identification | ||
| 848 | +// ============================================================ | ||
| 843 | 849 | ||
| 844 | SHERPA_ONNX_API typedef struct | 850 | SHERPA_ONNX_API typedef struct |
| 845 | SherpaOnnxSpokenLanguageIdentificationWhisperConfig { | 851 | SherpaOnnxSpokenLanguageIdentificationWhisperConfig { |
| @@ -893,6 +899,169 @@ SherpaOnnxSpokenLanguageIdentificationCompute( | @@ -893,6 +899,169 @@ SherpaOnnxSpokenLanguageIdentificationCompute( | ||
| 893 | SHERPA_ONNX_API void SherpaOnnxDestroySpokenLanguageIdentificationResult( | 899 | SHERPA_ONNX_API void SherpaOnnxDestroySpokenLanguageIdentificationResult( |
| 894 | const SherpaOnnxSpokenLanguageIdentificationResult *r); | 900 | const SherpaOnnxSpokenLanguageIdentificationResult *r); |
| 895 | 901 | ||
| 902 | +// ============================================================ | ||
| 903 | +// For speaker embedding extraction | ||
| 904 | +// ============================================================ | ||
| 905 | +SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingExtractorConfig { | ||
| 906 | + const char *model; | ||
| 907 | + int32_t num_threads; | ||
| 908 | + int32_t debug; | ||
| 909 | + const char *provider; | ||
| 910 | +} SherpaOnnxSpeakerEmbeddingExtractorConfig; | ||
| 911 | + | ||
| 912 | +SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingExtractor | ||
| 913 | + SherpaOnnxSpeakerEmbeddingExtractor; | ||
| 914 | + | ||
| 915 | +// The user has to invoke SherpaOnnxDestroySpeakerEmbeddingExtractor() | ||
| 916 | +// to free the returned pointer to avoid memory leak | ||
| 917 | +SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingExtractor * | ||
| 918 | +SherpaOnnxCreateSpeakerEmbeddingExtractor( | ||
| 919 | + const SherpaOnnxSpeakerEmbeddingExtractorConfig *config); | ||
| 920 | + | ||
| 921 | +SHERPA_ONNX_API void SherpaOnnxDestroySpeakerEmbeddingExtractor( | ||
| 922 | + const SherpaOnnxSpeakerEmbeddingExtractor *p); | ||
| 923 | + | ||
| 924 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingExtractorDim( | ||
| 925 | + const SherpaOnnxSpeakerEmbeddingExtractor *p); | ||
| 926 | + | ||
| 927 | +// The user has to invoke DestroyOnlineStream() to free the returned pointer | ||
| 928 | +// to avoid memory leak | ||
| 929 | +SHERPA_ONNX_API const SherpaOnnxOnlineStream * | ||
| 930 | +SherpaOnnxSpeakerEmbeddingExtractorCreateStream( | ||
| 931 | + const SherpaOnnxSpeakerEmbeddingExtractor *p); | ||
| 932 | + | ||
| 933 | +// Return 1 if the stream has enough feature frames for computing embeddings. | ||
| 934 | +// Return 0 otherwise. | ||
| 935 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingExtractorIsReady( | ||
| 936 | + const SherpaOnnxSpeakerEmbeddingExtractor *p, | ||
| 937 | + const SherpaOnnxOnlineStream *s); | ||
| 938 | + | ||
| 939 | +// Compute the embedding of the stream. | ||
| 940 | +// | ||
| 941 | +// @return Return a pointer pointing to an array containing the embedding. | ||
| 942 | +// The length of the array is `dim` as returned by | ||
| 943 | +// SherpaOnnxSpeakerEmbeddingExtractorDim(p) | ||
| 944 | +// | ||
| 945 | +// The user has to invoke SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding() | ||
| 946 | +// to free the returned pointer to avoid memory leak. | ||
| 947 | +SHERPA_ONNX_API const float * | ||
| 948 | +SherpaOnnxSpeakerEmbeddingExtractorComputeEmbedding( | ||
| 949 | + const SherpaOnnxSpeakerEmbeddingExtractor *p, | ||
| 950 | + const SherpaOnnxOnlineStream *s); | ||
| 951 | + | ||
| 952 | +SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingExtractorDestroyEmbedding( | ||
| 953 | + const float *v); | ||
| 954 | + | ||
| 955 | +SHERPA_ONNX_API typedef struct SherpaOnnxSpeakerEmbeddingManager | ||
| 956 | + SherpaOnnxSpeakerEmbeddingManager; | ||
| 957 | + | ||
| 958 | +// The user has to invoke SherpaOnnxDestroySpeakerEmbeddingManager() | ||
| 959 | +// to free the returned pointer to avoid memory leak | ||
| 960 | +SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingManager * | ||
| 961 | +SherpaOnnxCreateSpeakerEmbeddingManager(int32_t dim); | ||
| 962 | + | ||
| 963 | +SHERPA_ONNX_API void SherpaOnnxDestroySpeakerEmbeddingManager( | ||
| 964 | + const SherpaOnnxSpeakerEmbeddingManager *p); | ||
| 965 | + | ||
| 966 | +// Register the embedding of a user | ||
| 967 | +// | ||
| 968 | +// @param name The name of the user | ||
| 969 | +// @param v Pointer to an array containing the embeddings. The length of the | ||
| 970 | +// array must be equal to `dim` used to construct the manager `p`. | ||
| 971 | +// | ||
| 972 | +// @return Return 1 if added successfully. Return 0 on error | ||
| 973 | +SHERPA_ONNX_API int32_t | ||
| 974 | +SherpaOnnxSpeakerEmbeddingManagerAdd(const SherpaOnnxSpeakerEmbeddingManager *p, | ||
| 975 | + const char *name, const float *v); | ||
| 976 | + | ||
| 977 | +// @param v Pointer to an array of embeddings. If there are n embeddings, then | ||
| 978 | +// v[0] is the pointer to the 0-th array containing the embeddings | ||
| 979 | +// v[1] is the pointer to the 1-st array containing the embeddings | ||
| 980 | +// v[n-1] is the pointer to the last array containing the embeddings | ||
| 981 | +// v[n] is a NULL pointer | ||
| 982 | +// @return Return 1 if added successfully. Return 0 on error | ||
| 983 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerAddList( | ||
| 984 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, | ||
| 985 | + const float **v); | ||
| 986 | + | ||
| 987 | +// Similar to SherpaOnnxSpeakerEmbeddingManagerAddList() but the memory | ||
| 988 | +// is flattened. | ||
| 989 | +// | ||
| 990 | +// The length of the input array should be `n * dim`. | ||
| 991 | +// | ||
| 992 | +// @return Return 1 if added successfully. Return 0 on error | ||
| 993 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerAddListFlattened( | ||
| 994 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, | ||
| 995 | + const float *v, int32_t n); | ||
| 996 | + | ||
| 997 | +// Remove a user. | ||
| 998 | +// @param name The name of the user to remove. | ||
| 999 | +// @return Return 1 if removed successfully; return 0 on error. | ||
| 1000 | +// | ||
| 1001 | +// Note if the user does not exist, it also returns 0. | ||
| 1002 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerRemove( | ||
| 1003 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name); | ||
| 1004 | + | ||
| 1005 | +// Search for an existing user whose embedding matches the given one. | ||
| 1006 | +// | ||
| 1007 | +// @param v Pointer to an array containing the embedding. The length | ||
| 1008 | +// of the array must be equal to `dim` used to construct the manager `p`. | ||
| 1009 | +// @param threshold A value between 0 and 1. If the similarity score exceeds | ||
| 1010 | +// this threshold, we say a match is found. | ||
| 1011 | +// @return Returns the name of the user if found. Return NULL if not found. | ||
| 1012 | +// If not NULL, the caller has to invoke | ||
| 1013 | +// SherpaOnnxSpeakerEmbeddingManagerFreeSearch() to free the returned | ||
| 1014 | +// pointer to avoid memory leak. | ||
| 1015 | +SHERPA_ONNX_API const char *SherpaOnnxSpeakerEmbeddingManagerSearch( | ||
| 1016 | + const SherpaOnnxSpeakerEmbeddingManager *p, const float *v, | ||
| 1017 | + float threshold); | ||
| 1018 | + | ||
| 1019 | +SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeSearch( | ||
| 1020 | + const char *name); | ||
| 1021 | + | ||
| 1022 | +// Check whether the input embedding matches the embedding of the input | ||
| 1023 | +// speaker. | ||
| 1024 | +// | ||
| 1025 | +// It is for speaker verification. | ||
| 1026 | +// | ||
| 1027 | +// @param name The target speaker name. | ||
| 1028 | +// @param v The input embedding to check. | ||
| 1029 | +// @param threshold A value between 0 and 1. | ||
| 1030 | +// @return Return 1 if it matches. Otherwise, it returns 0. | ||
| 1031 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerVerify( | ||
| 1032 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name, | ||
| 1033 | + const float *v, float threshold); | ||
| 1034 | + | ||
| 1035 | +// Return 1 if the user with the name is in the manager. | ||
| 1036 | +// Return 0 if the user does not exist. | ||
| 1037 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerContains( | ||
| 1038 | + const SherpaOnnxSpeakerEmbeddingManager *p, const char *name); | ||
| 1039 | + | ||
| 1040 | +// Return number of speakers in the manager. | ||
| 1041 | +SHERPA_ONNX_API int32_t SherpaOnnxSpeakerEmbeddingManagerNumSpeakers( | ||
| 1042 | + const SherpaOnnxSpeakerEmbeddingManager *p); | ||
| 1043 | + | ||
| 1044 | +// Return the name of all speakers in the manager. | ||
| 1045 | +// | ||
| 1046 | +// @return Return an array of pointers `ans`. If there are n speakers, then | ||
| 1047 | +// - ans[0] contains the name of the 0-th speaker | ||
| 1048 | +// - ans[1] contains the name of the 1-st speaker | ||
| 1049 | +// - ans[n-1] contains the name of the last speaker | ||
| 1050 | +// - ans[n] is NULL | ||
| 1051 | +// If there are no users at all, then ans[0] is NULL. In any case, | ||
| 1052 | +// `ans` is not NULL. | ||
| 1053 | +// | ||
| 1054 | +// Each name is NULL-terminated | ||
| 1055 | +// | ||
| 1056 | +// The caller has to invoke SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers() | ||
| 1057 | +// to free the returned pointer to avoid memory leak. | ||
| 1058 | +SHERPA_ONNX_API const char *const * | ||
| 1059 | +SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers( | ||
| 1060 | + const SherpaOnnxSpeakerEmbeddingManager *p); | ||
| 1061 | + | ||
| 1062 | +SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers( | ||
| 1063 | + const char *const *names); | ||
| 1064 | + | ||
| 896 | #if defined(__GNUC__) | 1065 | #if defined(__GNUC__) |
| 897 | #pragma GCC diagnostic pop | 1066 | #pragma GCC diagnostic pop |
| 898 | #endif | 1067 | #endif |
| @@ -168,7 +168,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | @@ -168,7 +168,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { | ||
| 168 | ans.samples.insert(ans.samples.end(), audio.samples.begin(), | 168 | ans.samples.insert(ans.samples.end(), audio.samples.begin(), |
| 169 | audio.samples.end()); | 169 | audio.samples.end()); |
| 170 | if (callback) { | 170 | if (callback) { |
| 171 | - callback(audio.samples.data(), audio.samples.size(), b * 1.0 / num_batches); | 171 | + callback(audio.samples.data(), audio.samples.size(), |
| 172 | + b * 1.0 / num_batches); | ||
| 172 | // Caution(fangjun): audio is freed when the callback returns, so users | 173 | // Caution(fangjun): audio is freed when the callback returns, so users |
| 173 | // should copy the data if they want to access the data after | 174 | // should copy the data if they want to access the data after |
| 174 | // the callback returns to avoid segmentation fault. | 175 | // the callback returns to avoid segmentation fault. |
| @@ -54,8 +54,8 @@ struct GeneratedAudio { | @@ -54,8 +54,8 @@ struct GeneratedAudio { | ||
| 54 | 54 | ||
| 55 | class OfflineTtsImpl; | 55 | class OfflineTtsImpl; |
| 56 | 56 | ||
| 57 | -using GeneratedAudioCallback = | ||
| 58 | - std::function<void(const float * /*samples*/, int32_t /*n*/, float /*progress*/)>; | 57 | +using GeneratedAudioCallback = std::function<void( |
| 58 | + const float * /*samples*/, int32_t /*n*/, float /*progress*/)>; | ||
| 59 | 59 | ||
| 60 | class OfflineTts { | 60 | class OfflineTts { |
| 61 | public: | 61 | public: |
| @@ -44,7 +44,8 @@ static void Handler(int32_t /*sig*/) { | @@ -44,7 +44,8 @@ static void Handler(int32_t /*sig*/) { | ||
| 44 | fprintf(stderr, "\nCaught Ctrl + C. Exiting\n"); | 44 | fprintf(stderr, "\nCaught Ctrl + C. Exiting\n"); |
| 45 | } | 45 | } |
| 46 | 46 | ||
| 47 | -static void AudioGeneratedCallback(const float *s, int32_t n) { | 47 | +static void AudioGeneratedCallback(const float *s, int32_t n, |
| 48 | + float /*progress*/) { | ||
| 48 | if (n > 0) { | 49 | if (n > 0) { |
| 49 | std::lock_guard<std::mutex> lock(g_buffer.mutex); | 50 | std::lock_guard<std::mutex> lock(g_buffer.mutex); |
| 50 | g_buffer.samples.push({s, s + n}); | 51 | g_buffer.samples.push({s, s + n}); |
| @@ -47,7 +47,8 @@ static void Handler(int32_t /*sig*/) { | @@ -47,7 +47,8 @@ static void Handler(int32_t /*sig*/) { | ||
| 47 | fprintf(stderr, "\nCaught Ctrl + C. Exiting\n"); | 47 | fprintf(stderr, "\nCaught Ctrl + C. Exiting\n"); |
| 48 | } | 48 | } |
| 49 | 49 | ||
| 50 | -static void AudioGeneratedCallback(const float *s, int32_t n, float /*progress*/) { | 50 | +static void AudioGeneratedCallback(const float *s, int32_t n, |
| 51 | + float /*progress*/) { | ||
| 51 | if (n > 0) { | 52 | if (n > 0) { |
| 52 | Samples samples; | 53 | Samples samples; |
| 53 | samples.data = std::vector<float>{s, s + n}; | 54 | samples.data = std::vector<float>{s, s + n}; |
| @@ -9,9 +9,8 @@ | @@ -9,9 +9,8 @@ | ||
| 9 | #include "sherpa-onnx/csrc/parse-options.h" | 9 | #include "sherpa-onnx/csrc/parse-options.h" |
| 10 | #include "sherpa-onnx/csrc/wave-writer.h" | 10 | #include "sherpa-onnx/csrc/wave-writer.h" |
| 11 | 11 | ||
| 12 | -void audioCallback(const float *samples, int32_t n, float progress) | ||
| 13 | -{ | ||
| 14 | - printf( "sample=%d, progress=%f\n", n, progress ); | 12 | +void audioCallback(const float *samples, int32_t n, float progress) { |
| 13 | + printf("sample=%d, progress=%f\n", n, progress); | ||
| 15 | } | 14 | } |
| 16 | 15 | ||
| 17 | int main(int32_t argc, char *argv[]) { | 16 | int main(int32_t argc, char *argv[]) { |
| @@ -93,7 +93,7 @@ class SpeakerEmbeddingManager::Impl { | @@ -93,7 +93,7 @@ class SpeakerEmbeddingManager::Impl { | ||
| 93 | int32_t num_rows = embedding_matrix_.rows(); | 93 | int32_t num_rows = embedding_matrix_.rows(); |
| 94 | 94 | ||
| 95 | if (row_idx < num_rows - 1) { | 95 | if (row_idx < num_rows - 1) { |
| 96 | - embedding_matrix_.block(row_idx, 0, num_rows - -1 - row_idx, dim_) = | 96 | + embedding_matrix_.block(row_idx, 0, num_rows - 1 - row_idx, dim_) = |
| 97 | embedding_matrix_.bottomRows(num_rows - 1 - row_idx); | 97 | embedding_matrix_.bottomRows(num_rows - 1 - row_idx); |
| 98 | } | 98 | } |
| 99 | 99 |
| @@ -795,9 +795,10 @@ class SherpaOnnxOfflineTts { | @@ -795,9 +795,10 @@ class SherpaOnnxOfflineTts { | ||
| 795 | explicit SherpaOnnxOfflineTts(const OfflineTtsConfig &config) | 795 | explicit SherpaOnnxOfflineTts(const OfflineTtsConfig &config) |
| 796 | : tts_(config) {} | 796 | : tts_(config) {} |
| 797 | 797 | ||
| 798 | - GeneratedAudio Generate( | ||
| 799 | - const std::string &text, int64_t sid = 0, float speed = 1.0, | ||
| 800 | - std::function<void(const float *, int32_t, float)> callback = nullptr) const { | 798 | + GeneratedAudio Generate(const std::string &text, int64_t sid = 0, |
| 799 | + float speed = 1.0, | ||
| 800 | + std::function<void(const float *, int32_t, float)> | ||
| 801 | + callback = nullptr) const { | ||
| 801 | return tts_.Generate(text, sid, speed, callback); | 802 | return tts_.Generate(text, sid, speed, callback); |
| 802 | } | 803 | } |
| 803 | 804 |
| @@ -55,14 +55,16 @@ void PybindOfflineTts(py::module *m) { | @@ -55,14 +55,16 @@ void PybindOfflineTts(py::module *m) { | ||
| 55 | .def( | 55 | .def( |
| 56 | "generate", | 56 | "generate", |
| 57 | [](const PyClass &self, const std::string &text, int64_t sid, | 57 | [](const PyClass &self, const std::string &text, int64_t sid, |
| 58 | - float speed, std::function<void(py::array_t<float>, float)> callback) | 58 | + float speed, |
| 59 | + std::function<void(py::array_t<float>, float)> callback) | ||
| 59 | -> GeneratedAudio { | 60 | -> GeneratedAudio { |
| 60 | if (!callback) { | 61 | if (!callback) { |
| 61 | return self.Generate(text, sid, speed); | 62 | return self.Generate(text, sid, speed); |
| 62 | } | 63 | } |
| 63 | 64 | ||
| 64 | - std::function<void(const float *, int32_t, float)> callback_wrapper = | ||
| 65 | - [callback](const float *samples, int32_t n, float progress) { | 65 | + std::function<void(const float *, int32_t, float)> |
| 66 | + callback_wrapper = [callback](const float *samples, int32_t n, | ||
| 67 | + float progress) { | ||
| 66 | // CAUTION(fangjun): we have to copy samples since it is | 68 | // CAUTION(fangjun): we have to copy samples since it is |
| 67 | // freed once the call back returns. | 69 | // freed once the call back returns. |
| 68 | 70 |
-
请 注册 或 登录 后发表评论