Committed by
GitHub
Support playing generated audio while it is being generated for MFC. (#462)
* Support playing generated audio while it is being generated for MFC. * Support espeak-ng-data
正在显示
5 个修改的文件
包含
246 行增加
和
35 行删除
| @@ -214,7 +214,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() { | @@ -214,7 +214,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() { | ||
| 214 | param.sampleFormat = paFloat32; | 214 | param.sampleFormat = paFloat32; |
| 215 | param.suggestedLatency = info->defaultLowInputLatency; | 215 | param.suggestedLatency = info->defaultLowInputLatency; |
| 216 | param.hostApiSpecificStreamInfo = nullptr; | 216 | param.hostApiSpecificStreamInfo = nullptr; |
| 217 | - float sample_rate = config_.feat_config.sample_rate; | 217 | + float sample_rate = static_cast<float>(config_.feat_config.sample_rate); |
| 218 | pa_stream_ = nullptr; | 218 | pa_stream_ = nullptr; |
| 219 | PaError err = | 219 | PaError err = |
| 220 | Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */ | 220 | Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */ |
| @@ -259,7 +259,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() { | @@ -259,7 +259,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() { | ||
| 259 | SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_); | 259 | SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_); |
| 260 | 260 | ||
| 261 | AcceptWaveformOffline(stream, config_.feat_config.sample_rate, | 261 | AcceptWaveformOffline(stream, config_.feat_config.sample_rate, |
| 262 | - samples_.data(), samples_.size()); | 262 | + samples_.data(), static_cast<int32_t>(samples_.size())); |
| 263 | DecodeOfflineStream(recognizer_, stream); | 263 | DecodeOfflineStream(recognizer_, stream); |
| 264 | auto r = GetOfflineStreamResult(stream); | 264 | auto r = GetOfflineStreamResult(stream); |
| 265 | results_.emplace_back(r->text); | 265 | results_.emplace_back(r->text); |
| @@ -9,14 +9,184 @@ | @@ -9,14 +9,184 @@ | ||
| 9 | #include "afxdialogex.h" | 9 | #include "afxdialogex.h" |
| 10 | 10 | ||
| 11 | #include <fstream> | 11 | #include <fstream> |
| 12 | +#include <mutex> // NOLINT | ||
| 13 | +#include <queue> | ||
| 12 | #include <stdexcept> | 14 | #include <stdexcept> |
| 13 | #include <string> | 15 | #include <string> |
| 16 | +#include <thread> // NOLINT | ||
| 14 | #include <vector> | 17 | #include <vector> |
| 15 | 18 | ||
| 16 | #ifdef _DEBUG | 19 | #ifdef _DEBUG |
| 17 | #define new DEBUG_NEW | 20 | #define new DEBUG_NEW |
| 18 | #endif | 21 | #endif |
| 19 | 22 | ||
| 23 | +Microphone::Microphone() { | ||
| 24 | + PaError err = Pa_Initialize(); | ||
| 25 | + if (err != paNoError) { | ||
| 26 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 27 | + exit(-2); | ||
| 28 | + } | ||
| 29 | +} | ||
| 30 | + | ||
| 31 | +Microphone::~Microphone() { | ||
| 32 | + PaError err = Pa_Terminate(); | ||
| 33 | + if (err != paNoError) { | ||
| 34 | + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); | ||
| 35 | + exit(-2); | ||
| 36 | + } | ||
| 37 | +} | ||
| 38 | + | ||
| 39 | +// NOTE(fangjun): Code is copied from | ||
| 40 | +// https://github.com/k2-fsa/sherpa-onnx/blob/master/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc#L22 | ||
| 41 | +static std::condition_variable g_cv; | ||
| 42 | +static std::mutex g_cv_m; | ||
| 43 | + | ||
| 44 | +struct Samples { | ||
| 45 | + std::vector<float> data; | ||
| 46 | + int32_t consumed = 0; | ||
| 47 | +}; | ||
| 48 | + | ||
| 49 | +struct Buffer { | ||
| 50 | + std::queue<Samples> samples; | ||
| 51 | + std::mutex mutex; | ||
| 52 | +}; | ||
| 53 | + | ||
| 54 | +static Buffer g_buffer; | ||
| 55 | + | ||
| 56 | +static bool g_started = false; | ||
| 57 | +static bool g_stopped = false; | ||
| 58 | +static bool g_killed = false; | ||
| 59 | + | ||
| 60 | +static void AudioGeneratedCallback(const float *s, int32_t n) { | ||
| 61 | + if (n > 0) { | ||
| 62 | + Samples samples; | ||
| 63 | + samples.data = std::vector<float>{s, s + n}; | ||
| 64 | + | ||
| 65 | + std::lock_guard<std::mutex> lock(g_buffer.mutex); | ||
| 66 | + g_buffer.samples.push(std::move(samples)); | ||
| 67 | + g_started = true; | ||
| 68 | + } | ||
| 69 | +} | ||
| 70 | + | ||
| 71 | +static int PlayCallback(const void * /*in*/, void *out, | ||
| 72 | + unsigned long _n, // NOLINT | ||
| 73 | + const PaStreamCallbackTimeInfo * /*time_info*/, | ||
| 74 | + PaStreamCallbackFlags /*status_flags*/, | ||
| 75 | + void * /*user_data*/) { | ||
| 76 | + int32_t n = static_cast<int32_t>(_n); | ||
| 77 | + if (g_killed) { | ||
| 78 | + return paComplete; | ||
| 79 | + } | ||
| 80 | + | ||
| 81 | + float *pout = reinterpret_cast<float *>(out); | ||
| 82 | + std::lock_guard<std::mutex> lock(g_buffer.mutex); | ||
| 83 | + | ||
| 84 | + if (g_buffer.samples.empty()) { | ||
| 85 | + if (g_stopped) { | ||
| 86 | + // no more data is available and we have processed all of the samples | ||
| 87 | + return paComplete; | ||
| 88 | + } | ||
| 89 | + | ||
| 90 | + // The current sentence is so long, though very unlikely, that | ||
| 91 | + // the model has not finished processing it yet. | ||
| 92 | + std::fill_n(pout, n, 0); | ||
| 93 | + | ||
| 94 | + return paContinue; | ||
| 95 | + } | ||
| 96 | + | ||
| 97 | + int32_t k = 0; | ||
| 98 | + for (; k < n && !g_buffer.samples.empty();) { | ||
| 99 | + int32_t this_block = n - k; | ||
| 100 | + | ||
| 101 | + auto &p = g_buffer.samples.front(); | ||
| 102 | + | ||
| 103 | + int32_t remaining = static_cast<int32_t>(p.data.size()) - p.consumed; | ||
| 104 | + | ||
| 105 | + if (this_block <= remaining) { | ||
| 106 | + std::copy(p.data.begin() + p.consumed, | ||
| 107 | + p.data.begin() + p.consumed + this_block, pout + k); | ||
| 108 | + p.consumed += this_block; | ||
| 109 | + | ||
| 110 | + k = n; | ||
| 111 | + | ||
| 112 | + if (p.consumed == p.data.size()) { | ||
| 113 | + g_buffer.samples.pop(); | ||
| 114 | + } | ||
| 115 | + break; | ||
| 116 | + } | ||
| 117 | + | ||
| 118 | + std::copy(p.data.begin() + p.consumed, p.data.end(), pout + k); | ||
| 119 | + k += static_cast<int32_t>(p.data.size()) - p.consumed; | ||
| 120 | + g_buffer.samples.pop(); | ||
| 121 | + } | ||
| 122 | + | ||
| 123 | + if (k < n) { | ||
| 124 | + std::fill_n(pout + k, n - k, 0); | ||
| 125 | + } | ||
| 126 | + | ||
| 127 | + if (g_stopped && g_buffer.samples.empty()) { | ||
| 128 | + return paComplete; | ||
| 129 | + } | ||
| 130 | + | ||
| 131 | + return paContinue; | ||
| 132 | +} | ||
| 133 | + | ||
| 134 | +static void PlayCallbackFinished(void *userData) { g_cv.notify_all(); } | ||
| 135 | + | ||
| 136 | +static void StartPlayback(int32_t sample_rate) { | ||
| 137 | + int32_t frames_per_buffer = 1024; | ||
| 138 | + PaStreamParameters outputParameters; | ||
| 139 | + PaStream *stream; | ||
| 140 | + PaError err; | ||
| 141 | + | ||
| 142 | + outputParameters.device = | ||
| 143 | + Pa_GetDefaultOutputDevice(); /* default output device */ | ||
| 144 | + | ||
| 145 | + outputParameters.channelCount = 1; /* mono output */ | ||
| 146 | + outputParameters.sampleFormat = paFloat32; /* 32 bit floating point output */ | ||
| 147 | + outputParameters.suggestedLatency = | ||
| 148 | + Pa_GetDeviceInfo(outputParameters.device)->defaultLowOutputLatency; | ||
| 149 | + outputParameters.hostApiSpecificStreamInfo = nullptr; | ||
| 150 | + | ||
| 151 | + err = Pa_OpenStream(&stream, nullptr, /* no input */ | ||
| 152 | + &outputParameters, sample_rate, frames_per_buffer, | ||
| 153 | + paClipOff, // we won't output out of range samples so | ||
| 154 | + // don't bother clipping them | ||
| 155 | + PlayCallback, nullptr); | ||
| 156 | + if (err != paNoError) { | ||
| 157 | + fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err)); | ||
| 158 | + return; | ||
| 159 | + } | ||
| 160 | + | ||
| 161 | + err = Pa_SetStreamFinishedCallback(stream, &PlayCallbackFinished); | ||
| 162 | + if (err != paNoError) { | ||
| 163 | + fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err)); | ||
| 164 | + return; | ||
| 165 | + } | ||
| 166 | + | ||
| 167 | + err = Pa_StartStream(stream); | ||
| 168 | + if (err != paNoError) { | ||
| 169 | + fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err)); | ||
| 170 | + return; | ||
| 171 | + } | ||
| 172 | + | ||
| 173 | + std::unique_lock<std::mutex> lock(g_cv_m); | ||
| 174 | + while (!g_killed && !g_stopped && | ||
| 175 | + (!g_started || (g_started && !g_buffer.samples.empty()))) { | ||
| 176 | + g_cv.wait(lock); | ||
| 177 | + } | ||
| 178 | + | ||
| 179 | + err = Pa_StopStream(stream); | ||
| 180 | + if (err != paNoError) { | ||
| 181 | + return; | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + err = Pa_CloseStream(stream); | ||
| 185 | + if (err != paNoError) { | ||
| 186 | + return; | ||
| 187 | + } | ||
| 188 | +} | ||
| 189 | + | ||
| 20 | 190 | ||
| 21 | // CAboutDlg dialog used for App About | 191 | // CAboutDlg dialog used for App About |
| 22 | 192 | ||
| @@ -261,8 +431,8 @@ void CNonStreamingTextToSpeechDlg::Init() { | @@ -261,8 +431,8 @@ void CNonStreamingTextToSpeechDlg::Init() { | ||
| 261 | ok = false; | 431 | ok = false; |
| 262 | } | 432 | } |
| 263 | 433 | ||
| 264 | - if (!Exists("./lexicon.txt")) { | ||
| 265 | - error_message += "Cannot find ./lexicon.txt\r\n"; | 434 | + if (!Exists("./lexicon.txt") && !Exists("./espeak-ng-data/phontab")) { |
| 435 | + error_message += "Cannot find espeak-ng-data directory or ./lexicon.txt\r\n"; | ||
| 266 | ok = false; | 436 | ok = false; |
| 267 | } | 437 | } |
| 268 | 438 | ||
| @@ -275,21 +445,17 @@ void CNonStreamingTextToSpeechDlg::Init() { | @@ -275,21 +445,17 @@ void CNonStreamingTextToSpeechDlg::Init() { | ||
| 275 | generate_btn_.EnableWindow(FALSE); | 445 | generate_btn_.EnableWindow(FALSE); |
| 276 | error_message += | 446 | error_message += |
| 277 | "\r\nPlease refer to\r\n" | 447 | "\r\nPlease refer to\r\n" |
| 278 | - "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html"; | 448 | + "https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models"; |
| 279 | error_message += "\r\nto download models.\r\n"; | 449 | error_message += "\r\nto download models.\r\n"; |
| 280 | - error_message += "\r\nWe given an example below\r\n\r\n"; | ||
| 281 | - error_message += | ||
| 282 | - "wget -O model.onnx " | ||
| 283 | - "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" | ||
| 284 | - "vits-aishell3.onnx\r\n"; | 450 | + error_message += "\r\nWe give an example below\r\n\r\n"; |
| 285 | error_message += | 451 | error_message += |
| 286 | - "wget " | ||
| 287 | - "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" | ||
| 288 | - "lexicon.txt\r\n"; | ||
| 289 | - error_message += | ||
| 290 | - "wget " | ||
| 291 | - "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" | ||
| 292 | - "tokens.txt\r\n"; | 452 | + "1. Download vits-piper-en_US-amy-low.tar.bz2 from the following URL\r\n\r\n" |
| 453 | + "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2\r\n\r\n" | ||
| 454 | + "2. Uncompress it and you will get a directory vits-piper-en_US-amy-low \r\n\r\n" | ||
| 455 | + "3. Switch to the directory vits-piper-en_US-amy-low \r\n\r\n" | ||
| 456 | + "4. Rename en_US-amy-low.onnx to model.onnx \r\n\r\n" | ||
| 457 | + "5. Copy the current exe to the directory vits-piper-en_US-amy-low\r\n\r\n" | ||
| 458 | + "6. Done! You can now run the exe in the directory vits-piper-en_US-amy-low\r\n\r\n"; | ||
| 293 | 459 | ||
| 294 | AppendLineToMultilineEditCtrl(my_hint_, error_message); | 460 | AppendLineToMultilineEditCtrl(my_hint_, error_message); |
| 295 | return; | 461 | return; |
| @@ -299,10 +465,14 @@ void CNonStreamingTextToSpeechDlg::Init() { | @@ -299,10 +465,14 @@ void CNonStreamingTextToSpeechDlg::Init() { | ||
| 299 | SherpaOnnxOfflineTtsConfig config; | 465 | SherpaOnnxOfflineTtsConfig config; |
| 300 | memset(&config, 0, sizeof(config)); | 466 | memset(&config, 0, sizeof(config)); |
| 301 | config.model.debug = 0; | 467 | config.model.debug = 0; |
| 302 | - config.model.num_threads = 1; | 468 | + config.model.num_threads = 2; |
| 303 | config.model.provider = "cpu"; | 469 | config.model.provider = "cpu"; |
| 304 | config.model.vits.model = "./model.onnx"; | 470 | config.model.vits.model = "./model.onnx"; |
| 305 | - config.model.vits.lexicon = "./lexicon.txt"; | 471 | + if (Exists("./espeak-ng-data/phontab")) { |
| 472 | + config.model.vits.data_dir = "./espeak-ng-data"; | ||
| 473 | + } else { | ||
| 474 | + config.model.vits.lexicon = "./lexicon.txt"; | ||
| 475 | + } | ||
| 306 | config.model.vits.tokens = "./tokens.txt"; | 476 | config.model.vits.tokens = "./tokens.txt"; |
| 307 | 477 | ||
| 308 | tts_ = SherpaOnnxCreateOfflineTts(&config); | 478 | tts_ = SherpaOnnxCreateOfflineTts(&config); |
| @@ -321,7 +491,6 @@ void CNonStreamingTextToSpeechDlg::Init() { | @@ -321,7 +491,6 @@ void CNonStreamingTextToSpeechDlg::Init() { | ||
| 321 | } | 491 | } |
| 322 | 492 | ||
| 323 | void CNonStreamingTextToSpeechDlg::OnBnClickedOk() { | 493 | void CNonStreamingTextToSpeechDlg::OnBnClickedOk() { |
| 324 | - // TODO: Add your control notification handler code here | ||
| 325 | CString s; | 494 | CString s; |
| 326 | speaker_id_.GetWindowText(s); | 495 | speaker_id_.GetWindowText(s); |
| 327 | int speaker_id = _ttoi(s); | 496 | int speaker_id = _ttoi(s); |
| @@ -338,25 +507,51 @@ void CNonStreamingTextToSpeechDlg::OnBnClickedOk() { | @@ -338,25 +507,51 @@ void CNonStreamingTextToSpeechDlg::OnBnClickedOk() { | ||
| 338 | } | 507 | } |
| 339 | 508 | ||
| 340 | my_text_.GetWindowText(s); | 509 | my_text_.GetWindowText(s); |
| 510 | + | ||
| 341 | std::string ss = ToString(s); | 511 | std::string ss = ToString(s); |
| 342 | if (ss.empty()) { | 512 | if (ss.empty()) { |
| 343 | AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK); | 513 | AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK); |
| 344 | return; | 514 | return; |
| 345 | } | 515 | } |
| 346 | 516 | ||
| 517 | + if (play_thread_) { | ||
| 518 | + g_killed = true; | ||
| 519 | + g_stopped = true; | ||
| 520 | + if (play_thread_->joinable()) { | ||
| 521 | + play_thread_->join(); | ||
| 522 | + } | ||
| 523 | + } | ||
| 524 | + | ||
| 525 | + g_killed = false; | ||
| 526 | + g_stopped = false; | ||
| 527 | + g_started = false; | ||
| 528 | + g_buffer.samples = {}; | ||
| 529 | + | ||
| 530 | + // Caution(fangjun): It is not efficient to re-create the thread. We use this approach | ||
| 531 | + // for simplicity | ||
| 532 | + play_thread_ = std::make_unique<std::thread>(StartPlayback, SherpaOnnxOfflineTtsSampleRate(tts_)); | ||
| 533 | + | ||
| 534 | + generate_btn_.EnableWindow(FALSE); | ||
| 535 | + | ||
| 347 | const SherpaOnnxGeneratedAudio *audio = | 536 | const SherpaOnnxGeneratedAudio *audio = |
| 348 | - SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed); | 537 | + SherpaOnnxOfflineTtsGenerateWithCallback(tts_, ss.c_str(), speaker_id, speed, &AudioGeneratedCallback); |
| 538 | + | ||
| 539 | + generate_btn_.EnableWindow(TRUE); | ||
| 540 | + | ||
| 349 | output_filename_.GetWindowText(s); | 541 | output_filename_.GetWindowText(s); |
| 350 | std::string filename = ToString(s); | 542 | std::string filename = ToString(s); |
| 543 | + | ||
| 351 | int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, | 544 | int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, |
| 352 | filename.c_str()); | 545 | filename.c_str()); |
| 353 | 546 | ||
| 354 | SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); | 547 | SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); |
| 355 | 548 | ||
| 356 | if (ok) { | 549 | if (ok) { |
| 357 | - AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK); | 550 | + // AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK); |
| 551 | + AppendLineToMultilineEditCtrl(my_hint_, std::string("Saved to ") + filename + " successfully"); | ||
| 358 | } else { | 552 | } else { |
| 359 | - AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK); | 553 | + // AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK); |
| 554 | + AppendLineToMultilineEditCtrl(my_hint_, std::string("Failed to saved to ") + filename); | ||
| 360 | } | 555 | } |
| 361 | 556 | ||
| 362 | //CDialogEx::OnOK(); | 557 | //CDialogEx::OnOK(); |
| @@ -6,6 +6,16 @@ | @@ -6,6 +6,16 @@ | ||
| 6 | 6 | ||
| 7 | #include "sherpa-onnx/c-api/c-api.h" | 7 | #include "sherpa-onnx/c-api/c-api.h" |
| 8 | 8 | ||
| 9 | +#include <memory> | ||
| 10 | +#include <thread> | ||
| 11 | + | ||
| 12 | +#include "portaudio.h" | ||
| 13 | + | ||
| 14 | +class Microphone { | ||
| 15 | + public: | ||
| 16 | + Microphone(); | ||
| 17 | + ~Microphone(); | ||
| 18 | +}; | ||
| 9 | 19 | ||
| 10 | // CNonStreamingTextToSpeechDlg dialog | 20 | // CNonStreamingTextToSpeechDlg dialog |
| 11 | class CNonStreamingTextToSpeechDlg : public CDialogEx | 21 | class CNonStreamingTextToSpeechDlg : public CDialogEx |
| @@ -34,16 +44,21 @@ protected: | @@ -34,16 +44,21 @@ protected: | ||
| 34 | afx_msg void OnPaint(); | 44 | afx_msg void OnPaint(); |
| 35 | afx_msg HCURSOR OnQueryDragIcon(); | 45 | afx_msg HCURSOR OnQueryDragIcon(); |
| 36 | DECLARE_MESSAGE_MAP() | 46 | DECLARE_MESSAGE_MAP() |
| 37 | - public: | ||
| 38 | - CEdit my_hint_; | ||
| 39 | - CEdit speaker_id_; | ||
| 40 | - CEdit speed_; | ||
| 41 | - void Init(); | ||
| 42 | - void InitHint(); | ||
| 43 | - CButton generate_btn_; | ||
| 44 | - afx_msg void OnBnClickedOk(); | ||
| 45 | - | ||
| 46 | - SherpaOnnxOfflineTts *tts_; | ||
| 47 | - CEdit my_text_; | ||
| 48 | - CEdit output_filename_; | 47 | +public: |
| 48 | + CEdit my_hint_; | ||
| 49 | + CEdit speaker_id_; | ||
| 50 | + CEdit speed_; | ||
| 51 | + void Init(); | ||
| 52 | + void InitHint(); | ||
| 53 | + CButton generate_btn_; | ||
| 54 | + afx_msg void OnBnClickedOk(); | ||
| 55 | + | ||
| 56 | + SherpaOnnxOfflineTts *tts_ = nullptr; | ||
| 57 | + CEdit my_text_; | ||
| 58 | + CEdit output_filename_; | ||
| 59 | + | ||
| 60 | +private: | ||
| 61 | + Microphone mic_; | ||
| 62 | + std::unique_ptr<std::thread> play_thread_; | ||
| 63 | + | ||
| 49 | }; | 64 | }; |
-
请 注册 或 登录 后发表评论