Fangjun Kuang
Committed by GitHub

Use alsa to read microphone in speaker identification demo. (#605)

@@ -167,6 +167,7 @@ class BuildExtension(build_ext): @@ -167,6 +167,7 @@ class BuildExtension(build_ext):
167 if enable_alsa(): 167 if enable_alsa():
168 binaries += ["sherpa-onnx-alsa"] 168 binaries += ["sherpa-onnx-alsa"]
169 binaries += ["sherpa-onnx-offline-tts-play-alsa"] 169 binaries += ["sherpa-onnx-offline-tts-play-alsa"]
  170 + binaries += ["sherpa-onnx-alsa-offline-speaker-identification"]
170 171
171 if is_windows(): 172 if is_windows():
172 binaries += ["kaldi-native-fbank-core.dll"] 173 binaries += ["kaldi-native-fbank-core.dll"]
@@ -60,6 +60,7 @@ def get_binaries_to_install(): @@ -60,6 +60,7 @@ def get_binaries_to_install():
60 if enable_alsa(): 60 if enable_alsa():
61 binaries += ["sherpa-onnx-alsa"] 61 binaries += ["sherpa-onnx-alsa"]
62 binaries += ["sherpa-onnx-offline-tts-play-alsa"] 62 binaries += ["sherpa-onnx-offline-tts-play-alsa"]
  63 + binaries += ["sherpa-onnx-alsa-offline-speaker-identification"]
63 64
64 if is_windows(): 65 if is_windows():
65 binaries += ["kaldi-native-fbank-core.dll"] 66 binaries += ["kaldi-native-fbank-core.dll"]
@@ -231,10 +231,12 @@ endif() @@ -231,10 +231,12 @@ endif()
231 if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY) 231 if(SHERPA_ONNX_HAS_ALSA AND SHERPA_ONNX_ENABLE_BINARY)
232 add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc) 232 add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
233 add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc) 233 add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
  234 + add_executable(sherpa-onnx-alsa-offline-speaker-identification sherpa-onnx-alsa-offline-speaker-identification.cc alsa.cc)
234 235
235 set(exes 236 set(exes
236 sherpa-onnx-alsa 237 sherpa-onnx-alsa
237 sherpa-onnx-offline-tts-play-alsa 238 sherpa-onnx-offline-tts-play-alsa
  239 + sherpa-onnx-alsa-offline-speaker-identification
238 ) 240 )
239 foreach(exe IN LISTS exes) 241 foreach(exe IN LISTS exes)
240 target_link_libraries(${exe} sherpa-onnx-core) 242 target_link_libraries(${exe} sherpa-onnx-core)
  1 +// sherpa-onnx/csrc/sherpa-onnx-alsa-offline-speaker-identification.cc
  2 +//
  3 +// Copyright (c) 2024 Xiaomi Corporation
  4 +
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <algorithm>
#include <atomic>
#include <chrono>  // NOLINT
#include <fstream>
#include <mutex>  // NOLINT
#include <sstream>
#include <string>
#include <thread>  // NOLINT
#include <unordered_map>
#include <vector>

#include "sherpa-onnx/csrc/alsa.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/microphone.h"
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
#include "sherpa-onnx/csrc/speaker-embedding-manager.h"
#include "sherpa-onnx/csrc/wave-reader.h"
  21 +
// The demo is a small state machine advanced by the Enter key:
//
//   kIdle -> kRecording -> kComputing -> kIdle
enum class State {
  kIdle,       // waiting for Enter to start a recording
  kRecording,  // recording; the next Enter moves to kComputing
  kComputing,  // an embedding is being computed; Enter is ignored
};

// Current state of the demo. It is shared between several threads, so it
// is atomic to make the unsynchronized reads and writes well defined.
std::atomic<State> state{State::kIdle};

// true to stop the program and exit.
// It is set from the SIGINT handler and polled by every thread; an atomic
// keeps those concurrent accesses free of data races.
std::atomic<bool> stop{false};

// Audio samples captured from the microphone.
// samples_mutex must be held whenever `samples` is read or modified.
std::vector<float> samples;
std::mutex samples_mutex;
  35 +
  36 +static void DetectKeyPress() {
  37 + SHERPA_ONNX_LOGE("\nPress Enter to start");
  38 + int32_t key;
  39 + while (!stop && (key = getchar())) {
  40 + if (key != 0x0a) {
  41 + continue;
  42 + }
  43 +
  44 + switch (state) {
  45 + case State::kIdle:
  46 + SHERPA_ONNX_LOGE("\nStart recording. Press Enter to stop recording");
  47 + state = State::kRecording;
  48 + {
  49 + std::lock_guard<std::mutex> lock(samples_mutex);
  50 + samples.clear();
  51 + }
  52 + break;
  53 + case State::kRecording:
  54 + SHERPA_ONNX_LOGE("\nStop recording. Computing ...");
  55 + state = State::kComputing;
  56 + break;
  57 + case State::kComputing:
  58 + break;
  59 + }
  60 + }
  61 +}
  62 +
  63 +static void Record(const char *device_name, int32_t expected_sample_rate) {
  64 + sherpa_onnx::Alsa alsa(device_name);
  65 +
  66 + if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
  67 + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
  68 + expected_sample_rate);
  69 + exit(-1);
  70 + }
  71 +
  72 + int32_t chunk = 0.1 * alsa.GetActualSampleRate();
  73 + while (!stop) {
  74 + std::lock_guard<std::mutex> lock(samples_mutex);
  75 + const std::vector<float> &s = alsa.Read(chunk);
  76 + samples.insert(samples.end(), s.begin(), s.end());
  77 + }
  78 +}
  79 +
  80 +static void Handler(int32_t sig) {
  81 + stop = true;
  82 + fprintf(stderr, "\nCaught Ctrl + C. Press Enter to exit\n");
  83 +}
  84 +
  85 +static std::vector<std::vector<float>> ComputeEmbeddings(
  86 + const std::vector<std::string> &filenames,
  87 + sherpa_onnx::SpeakerEmbeddingExtractor *extractor) {
  88 + std::vector<std::vector<float>> embedding_list;
  89 + embedding_list.reserve(filenames.size());
  90 +
  91 + for (const auto &f : filenames) {
  92 + int32_t sampling_rate = -1;
  93 +
  94 + bool is_ok = false;
  95 + const std::vector<float> samples =
  96 + sherpa_onnx::ReadWave(f, &sampling_rate, &is_ok);
  97 +
  98 + if (!is_ok) {
  99 + fprintf(stderr, "Failed to read %s\n", f.c_str());
  100 + exit(-1);
  101 + }
  102 +
  103 + auto s = extractor->CreateStream();
  104 + s->AcceptWaveform(sampling_rate, samples.data(), samples.size());
  105 + s->InputFinished();
  106 + auto embedding = extractor->Compute(s.get());
  107 + embedding_list.push_back(embedding);
  108 + }
  109 + return embedding_list;
  110 +}
  111 +
// Parses a speaker list file.
//
// Every non-blank line must contain exactly two whitespace-separated
// columns: the speaker name followed by the path of a wave file. A speaker
// may appear on several lines. Blank lines, trailing whitespace, and
// Windows line endings are tolerated.
//
// @param filename  Path of the text file to read.
// @return A map from speaker name to the wave files listed for that
//         speaker, in the order they appear in the file.
//
// The process exits with -1 if the file cannot be opened or if a line does
// not have exactly two columns.
static std::unordered_map<std::string, std::vector<std::string>>
ReadSpeakerFile(const std::string &filename) {
  std::unordered_map<std::string, std::vector<std::string>> ans;

  std::ifstream is(filename);
  if (!is) {
    fprintf(stderr, "Failed to open %s\n", filename.c_str());
    exit(-1);  // a missing speaker file is an error, not a success
  }

  std::string line;
  while (std::getline(is, line)) {
    std::istringstream iss(line);
    std::string name;
    std::string path;
    std::string extra;

    if (!(iss >> name)) {
      // Blank or whitespace-only line.
      continue;
    }

    // Exactly two columns are allowed: reading the path must succeed and
    // reading a third token must fail.
    if (!(iss >> path) || (iss >> extra)) {
      fprintf(stderr, "Invalid line: %s\n", line.c_str());
      exit(-1);
    }

    ans[name].push_back(path);
  }

  return ans;
}
  141 +
  142 +int32_t main(int32_t argc, char *argv[]) {
  143 + signal(SIGINT, Handler);
  144 +
  145 + const char *kUsageMessage = R"usage(
  146 +This program shows how to use non-streaming speaker identification.
  147 +Usage:
  148 +
  149 +(1) Prepare a text file containing speaker related files.
  150 +
  151 +Each line in the text file contains two columns. The first column is the
  152 +speaker name, while the second column contains the wave file of the speaker.
  153 +
  154 +If the text file contains multiple wave files for the same speaker, then the
  155 +embeddings of these files are averaged.
  156 +
  157 +An example text file is given below:
  158 +
  159 + foo /path/to/a.wav
  160 + bar /path/to/b.wav
  161 + foo /path/to/c.wav
  162 + foobar /path/to/d.wav
  163 +
  164 +Each wave file should contain only a single channel; the sample format
  165 +should be int16_t; the sample rate can be arbitrary.
  166 +
  167 +(2) Download a model for computing speaker embeddings
  168 +
  169 +Please visit
  170 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
  171 +to download a model. An example is given below:
  172 +
  173 + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_zh_cnceleb_resnet34.onnx
  174 +
  175 +Note that `zh` means Chinese, while `en` means English.
  176 +
  177 +(3) Run it !
  178 +
  179 + ./bin/sherpa-onnx-alsa-offline-speaker-identification \
  180 + --model=/path/to/your-model.onnx \
  181 + --speaker-file=/path/to/speaker.txt \
  182 + device_name
  183 +
  184 +The device name specifies which microphone to use in case there are several
  185 +on your system. You can use
  186 +
  187 + arecord -l
  188 +
  189 +to find all available microphones on your computer. For instance, if it outputs
  190 +
  191 +**** List of CAPTURE Hardware Devices ****
  192 +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  193 + Subdevices: 1/1
  194 + Subdevice #0: subdevice #0
  195 +
  196 +and if you want to select card 3 and the device 0 on that card, please use:
  197 + plughw:3,0
  198 +as the device_name.
  199 +
  200 +)usage";
  201 +
  202 + sherpa_onnx::ParseOptions po(kUsageMessage);
  203 + float threshold = 0.5;
  204 + std::string speaker_file;
  205 +
  206 + po.Register("threshold", &threshold,
  207 + "Threshold for comparing embedding scores.");
  208 +
  209 + po.Register("speaker-file", &speaker_file, "Path to speaker.txt");
  210 +
  211 + sherpa_onnx::SpeakerEmbeddingExtractorConfig config;
  212 + config.Register(&po);
  213 +
  214 + po.Read(argc, argv);
  215 + if (po.NumArgs() != 1) {
  216 + fprintf(stderr, "Please provide only 1 argument: the device name\n");
  217 + po.PrintUsage();
  218 + exit(EXIT_FAILURE);
  219 + }
  220 +
  221 + fprintf(stderr, "%s\n", config.ToString().c_str());
  222 +
  223 + if (!config.Validate()) {
  224 + fprintf(stderr, "Errors in config! Please use --help to view the usage.\n");
  225 + return -1;
  226 + }
  227 +
  228 + SHERPA_ONNX_LOGE("\nCreating extractor ...");
  229 + sherpa_onnx::SpeakerEmbeddingExtractor extractor(config);
  230 + SHERPA_ONNX_LOGE("\nextractor created!");
  231 +
  232 + sherpa_onnx::SpeakerEmbeddingManager manager(extractor.Dim());
  233 +
  234 + auto name2files = ReadSpeakerFile(speaker_file);
  235 + for (const auto &p : name2files) {
  236 + SHERPA_ONNX_LOGE("\nProcessing speaker %s", p.first.c_str());
  237 + auto embedding_list = ComputeEmbeddings(p.second, &extractor);
  238 + manager.Add(p.first, embedding_list);
  239 + }
  240 +
  241 + std::string device_name = po.GetArg(1);
  242 + fprintf(stderr, "Use recording device: %s\n", device_name.c_str());
  243 + int32_t sample_rate = 16000;
  244 +
  245 + std::thread t(DetectKeyPress);
  246 + std::thread t2(Record, device_name.c_str(), sample_rate);
  247 +
  248 + while (!stop) {
  249 + switch (state) {
  250 + case State::kIdle:
  251 + break;
  252 + case State::kRecording:
  253 + break;
  254 + case State::kComputing: {
  255 + std::vector<float> buf;
  256 + {
  257 + std::lock_guard<std::mutex> lock(samples_mutex);
  258 + buf = std::move(samples);
  259 + }
  260 +
  261 + auto s = extractor.CreateStream();
  262 + s->AcceptWaveform(sample_rate, buf.data(), buf.size());
  263 + s->InputFinished();
  264 + auto embedding = extractor.Compute(s.get());
  265 + auto name = manager.Search(embedding.data(), threshold);
  266 +
  267 + if (name.empty()) {
  268 + name = "--Unknown--";
  269 + }
  270 +
  271 + SHERPA_ONNX_LOGE("\nDone!\nDetected speaker is: %s", name.c_str());
  272 +
  273 + state = State::kIdle;
  274 + SHERPA_ONNX_LOGE("\nPress Enter to start");
  275 + break;
  276 + }
  277 + }
  278 +
  279 + using namespace std::chrono_literals;
  280 + std::this_thread::sleep_for(20ms); // sleep for 20ms
  281 + }
  282 +
  283 + t.join();
  284 + t2.join();
  285 +
  286 + return 0;
  287 +}
@@ -34,14 +34,14 @@ Usage: @@ -34,14 +34,14 @@ Usage:
34 --provider=cpu \ 34 --provider=cpu \
35 --num-threads=2 \ 35 --num-threads=2 \
36 --decoding-method=greedy_search \ 36 --decoding-method=greedy_search \
37 - device_name \ 37 + device_name
38 38
39 Please refer to 39 Please refer to
40 https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html 40 https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
41 for a list of pre-trained models to download. 41 for a list of pre-trained models to download.
42 42
43 The device name specifies which microphone to use in case there are several 43 The device name specifies which microphone to use in case there are several
44 -on you system. You can use 44 +on your system. You can use
45 45
46 arecord -l 46 arecord -l
47 47