Fangjun Kuang
Committed by GitHub

Support playing generated audio as it is generating for MFC. (#462)

* Support playing generated audio as it is generating for MFC.

* support espeak-ng-data
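
For orientation before the diff itself, here is a minimal, console-style sketch of the pattern this PR adds: the TTS engine pushes each newly generated chunk into a shared queue from a callback, while a PortAudio output stream drains that queue from its own callback, so audio starts playing before synthesis of the full text has finished. This is a hedged sketch, not code from the PR: the model/token paths and the text are placeholders, error handling is omitted, partial-chunk bookkeeping is simplified compared to the Samples::consumed counter used in the dialog code below, and it assumes the sherpa-onnx C API calls that the diff itself uses (plus SherpaOnnxDestroyOfflineTts for cleanup).

// Sketch only (not part of the PR): stream TTS output to the sound card as it
// is generated, using a queue shared between the generation callback and the
// PortAudio playback callback.
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <mutex>
#include <queue>
#include <vector>

#include "portaudio.h"
#include "sherpa-onnx/c-api/c-api.h"

static std::queue<std::vector<float>> g_chunks;  // generated, not yet played
static std::mutex g_mutex;
static std::atomic<bool> g_done{false};  // set once generation has finished

// Called by the TTS engine whenever a block of samples is ready.
static void OnAudioGenerated(const float *samples, int32_t n) {
  std::lock_guard<std::mutex> lock(g_mutex);
  g_chunks.emplace(samples, samples + n);
}

// Called by PortAudio whenever the output device needs more samples.
static int OnPlay(const void * /*in*/, void *out, unsigned long n,
                  const PaStreamCallbackTimeInfo * /*time_info*/,
                  PaStreamCallbackFlags /*flags*/, void * /*user_data*/) {
  float *p = reinterpret_cast<float *>(out);
  std::lock_guard<std::mutex> lock(g_mutex);
  size_t k = 0;
  while (k < n && !g_chunks.empty()) {  // drain queued chunks
    std::vector<float> &front = g_chunks.front();
    size_t m = std::min<size_t>(n - k, front.size());
    std::copy(front.begin(), front.begin() + m, p + k);
    k += m;
    if (m == front.size()) {
      g_chunks.pop();
    } else {
      front.erase(front.begin(), front.begin() + m);  // keep the remainder
    }
  }
  std::fill(p + k, p + n, 0.0f);  // pad with silence if we ran dry
  return (g_done && g_chunks.empty()) ? paComplete : paContinue;
}

int main() {
  SherpaOnnxOfflineTtsConfig config = {};      // zero-init, as the dialog code does
  config.model.vits.model = "./model.onnx";    // placeholder paths
  config.model.vits.tokens = "./tokens.txt";
  config.model.num_threads = 2;
  config.model.provider = "cpu";
  SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);

  Pa_Initialize();
  PaStream *stream = nullptr;
  Pa_OpenDefaultStream(&stream, 0, 1, paFloat32,
                       SherpaOnnxOfflineTtsSampleRate(tts), 1024, OnPlay,
                       nullptr);
  Pa_StartStream(stream);

  // Generation blocks until the whole text has been synthesized, but playback
  // is already running off the queue filled by OnAudioGenerated.
  const SherpaOnnxGeneratedAudio *audio = SherpaOnnxOfflineTtsGenerateWithCallback(
      tts, "Hello from sherpa-onnx", /*sid=*/0, /*speed=*/1.0f, &OnAudioGenerated);
  g_done = true;

  while (Pa_IsStreamActive(stream) == 1) {  // wait for the queue to drain
    Pa_Sleep(100);
  }
  Pa_StopStream(stream);
  Pa_CloseStream(stream);
  Pa_Terminate();

  SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
  SherpaOnnxDestroyOfflineTts(tts);
  return 0;
}
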
@@ -80,3 +80,4 @@ jslint.mjs
 vits-piper-en_US-amy-low
 vits-piper-*-*-*
 log
+*.exe
@@ -214,7 +214,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
   param.sampleFormat = paFloat32;
   param.suggestedLatency = info->defaultLowInputLatency;
   param.hostApiSpecificStreamInfo = nullptr;
-  float sample_rate = config_.feat_config.sample_rate;
+  float sample_rate = static_cast<float>(config_.feat_config.sample_rate);
   pa_stream_ = nullptr;
   PaError err =
       Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
@@ -259,7 +259,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
   SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_);

   AcceptWaveformOffline(stream, config_.feat_config.sample_rate,
-                        samples_.data(), samples_.size());
+                        samples_.data(), static_cast<int32_t>(samples_.size()));
   DecodeOfflineStream(recognizer_, stream);
   auto r = GetOfflineStreamResult(stream);
   results_.emplace_back(r->text);
@@ -9,14 +9,184 @@
 #include "afxdialogex.h"

 #include <fstream>
+#include <mutex>  // NOLINT
+#include <queue>
 #include <stdexcept>
 #include <string>
+#include <thread>  // NOLINT
 #include <vector>

 #ifdef _DEBUG
 #define new DEBUG_NEW
 #endif

+Microphone::Microphone() {
+  PaError err = Pa_Initialize();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(-2);
+  }
+}
+
+Microphone::~Microphone() {
+  PaError err = Pa_Terminate();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(-2);
+  }
+}
+
+// NOTE(fangjun): Code is copied from
+// https://github.com/k2-fsa/sherpa-onnx/blob/master/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc#L22
+static std::condition_variable g_cv;
+static std::mutex g_cv_m;
+
+struct Samples {
+  std::vector<float> data;
+  int32_t consumed = 0;
+};
+
+struct Buffer {
+  std::queue<Samples> samples;
+  std::mutex mutex;
+};
+
+static Buffer g_buffer;
+
+static bool g_started = false;
+static bool g_stopped = false;
+static bool g_killed = false;
+
+static void AudioGeneratedCallback(const float *s, int32_t n) {
+  if (n > 0) {
+    Samples samples;
+    samples.data = std::vector<float>{s, s + n};
+
+    std::lock_guard<std::mutex> lock(g_buffer.mutex);
+    g_buffer.samples.push(std::move(samples));
+    g_started = true;
+  }
+}
+
+static int PlayCallback(const void * /*in*/, void *out,
+                        unsigned long _n,  // NOLINT
+                        const PaStreamCallbackTimeInfo * /*time_info*/,
+                        PaStreamCallbackFlags /*status_flags*/,
+                        void * /*user_data*/) {
+  int32_t n = static_cast<int32_t>(_n);
+  if (g_killed) {
+    return paComplete;
+  }
+
+  float *pout = reinterpret_cast<float *>(out);
+  std::lock_guard<std::mutex> lock(g_buffer.mutex);
+
+  if (g_buffer.samples.empty()) {
+    if (g_stopped) {
+      // no more data is available and we have processed all of the samples
+      return paComplete;
+    }
+
+    // The current sentence is so long, though very unlikely, that
+    // the model has not finished processing it yet.
+    std::fill_n(pout, n, 0);
+
+    return paContinue;
+  }
+
+  int32_t k = 0;
+  for (; k < n && !g_buffer.samples.empty();) {
+    int32_t this_block = n - k;
+
+    auto &p = g_buffer.samples.front();
+
+    int32_t remaining = static_cast<int32_t>(p.data.size()) - p.consumed;
+
+    if (this_block <= remaining) {
+      std::copy(p.data.begin() + p.consumed,
+                p.data.begin() + p.consumed + this_block, pout + k);
+      p.consumed += this_block;
+
+      k = n;
+
+      if (p.consumed == p.data.size()) {
+        g_buffer.samples.pop();
+      }
+      break;
+    }
+
+    std::copy(p.data.begin() + p.consumed, p.data.end(), pout + k);
+    k += static_cast<int32_t>(p.data.size()) - p.consumed;
+    g_buffer.samples.pop();
+  }
+
+  if (k < n) {
+    std::fill_n(pout + k, n - k, 0);
+  }
+
+  if (g_stopped && g_buffer.samples.empty()) {
+    return paComplete;
+  }
+
+  return paContinue;
+}
+
+static void PlayCallbackFinished(void *userData) { g_cv.notify_all(); }
+
+static void StartPlayback(int32_t sample_rate) {
+  int32_t frames_per_buffer = 1024;
+  PaStreamParameters outputParameters;
+  PaStream *stream;
+  PaError err;
+
+  outputParameters.device =
+      Pa_GetDefaultOutputDevice(); /* default output device */
+
+  outputParameters.channelCount = 1;         /* mono output */
+  outputParameters.sampleFormat = paFloat32; /* 32 bit floating point output */
+  outputParameters.suggestedLatency =
+      Pa_GetDeviceInfo(outputParameters.device)->defaultLowOutputLatency;
+  outputParameters.hostApiSpecificStreamInfo = nullptr;
+
+  err = Pa_OpenStream(&stream, nullptr, /* no input */
+                      &outputParameters, sample_rate, frames_per_buffer,
+                      paClipOff,  // we won't output out of range samples so
+                                  // don't bother clipping them
+                      PlayCallback, nullptr);
+  if (err != paNoError) {
+    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
+    return;
+  }
+
+  err = Pa_SetStreamFinishedCallback(stream, &PlayCallbackFinished);
+  if (err != paNoError) {
+    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
+    return;
+  }
+
+  err = Pa_StartStream(stream);
+  if (err != paNoError) {
+    fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
+    return;
+  }
+
+  std::unique_lock<std::mutex> lock(g_cv_m);
+  while (!g_killed && !g_stopped &&
+         (!g_started || (g_started && !g_buffer.samples.empty()))) {
+    g_cv.wait(lock);
+  }
+
+  err = Pa_StopStream(stream);
+  if (err != paNoError) {
+    return;
+  }
+
+  err = Pa_CloseStream(stream);
+  if (err != paNoError) {
+    return;
+  }
+}
+

 // CAboutDlg dialog used for App About

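A note for readers following the playback logic above: StartPlayback is meant to run on its own std::thread. It opens an output-only PortAudio stream fed by PlayCallback and then blocks on g_cv until playback is over; PlayCallbackFinished, registered via Pa_SetStreamFinishedCallback, wakes it once PlayCallback returns paComplete. The three flags divide responsibilities: g_started means at least one chunk has arrived, g_stopped means generation has finished (drain what is queued, then stop), and g_killed means abandon playback immediately. Below is a hypothetical helper, not part of this PR, sketching how a caller is expected to drive that lifecycle; it assumes access to the play_thread_ member added in the header further down, and note that std::condition_variable formally lives in <condition_variable>, which the includes above rely on being pulled in transitively.

// Hypothetical helper (not in this PR): stop any previous playback and start a
// new one. OnBnClickedOk() in the diff below inlines the same steps.
void CNonStreamingTextToSpeechDlg::RestartPlayback(int32_t sample_rate) {
  if (play_thread_) {
    g_killed = true;   // PlayCallback returns paComplete on the next buffer
    g_stopped = true;
    if (play_thread_->joinable()) {
      play_thread_->join();  // StartPlayback returns once the stream is closed
    }
  }

  g_killed = false;
  g_stopped = false;
  g_started = false;
  g_buffer.samples = {};  // discard audio left over from the previous text

  play_thread_ = std::make_unique<std::thread>(StartPlayback, sample_rate);
}
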
@@ -261,8 +431,8 @@ void CNonStreamingTextToSpeechDlg::Init() {
     ok = false;
   }

-  if (!Exists("./lexicon.txt")) {
-    error_message += "Cannot find ./lexicon.txt\r\n";
+  if (!Exists("./lexicon.txt") && !Exists("./espeak-ng-data/phontab")) {
+    error_message += "Cannot find espeak-ng-data directory or ./lexicon.txt\r\n";
     ok = false;
   }

@@ -275,21 +445,17 @@ void CNonStreamingTextToSpeechDlg::Init() {
     generate_btn_.EnableWindow(FALSE);
     error_message +=
         "\r\nPlease refer to\r\n"
-        "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html";
+        "https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models";
     error_message += "\r\nto download models.\r\n";
-    error_message += "\r\nWe given an example below\r\n\r\n";
-    error_message +=
-        "wget -O model.onnx "
-        "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
-        "vits-aishell3.onnx\r\n";
+    error_message += "\r\nWe give an example below\r\n\r\n";
     error_message +=
-        "wget "
-        "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
-        "lexicon.txt\r\n";
-    error_message +=
-        "wget "
-        "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
-        "tokens.txt\r\n";
+        "1. Download vits-piper-en_US-amy-low.tar.bz2 from the following URL\r\n\r\n"
+        "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2\r\n\r\n"
+        "2. Uncompress it and you will get a directory vits-piper-en_US-amy-low\r\n\r\n"
+        "3. Switch to the directory vits-piper-en_US-amy-low\r\n\r\n"
+        "4. Rename en_US-amy-low.onnx to model.onnx\r\n\r\n"
+        "5. Copy the current exe to the directory vits-piper-en_US-amy-low\r\n\r\n"
+        "6. Done! You can now run the exe in the directory vits-piper-en_US-amy-low\r\n\r\n";

     AppendLineToMultilineEditCtrl(my_hint_, error_message);
     return;
@@ -299,10 +465,14 @@ void CNonStreamingTextToSpeechDlg::Init() {
   SherpaOnnxOfflineTtsConfig config;
   memset(&config, 0, sizeof(config));
   config.model.debug = 0;
-  config.model.num_threads = 1;
+  config.model.num_threads = 2;
   config.model.provider = "cpu";
   config.model.vits.model = "./model.onnx";
-  config.model.vits.lexicon = "./lexicon.txt";
+  if (Exists("./espeak-ng-data/phontab")) {
+    config.model.vits.data_dir = "./espeak-ng-data";
+  } else {
+    config.model.vits.lexicon = "./lexicon.txt";
+  }
   config.model.vits.tokens = "./tokens.txt";

   tts_ = SherpaOnnxCreateOfflineTts(&config);
@@ -321,7 +491,6 @@ void CNonStreamingTextToSpeechDlg::Init() {
 }

 void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
-  // TODO: Add your control notification handler code here
   CString s;
   speaker_id_.GetWindowText(s);
   int speaker_id = _ttoi(s);
@@ -338,25 +507,51 @@ void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
   }

   my_text_.GetWindowText(s);
+
   std::string ss = ToString(s);
   if (ss.empty()) {
     AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK);
     return;
   }

+  if (play_thread_) {
+    g_killed = true;
+    g_stopped = true;
+    if (play_thread_->joinable()) {
+      play_thread_->join();
+    }
+  }
+
+  g_killed = false;
+  g_stopped = false;
+  g_started = false;
+  g_buffer.samples = {};
+
+  // Caution(fangjun): It is not efficient to re-create the thread. We use this approach
+  // for simplicity
+  play_thread_ = std::make_unique<std::thread>(StartPlayback, SherpaOnnxOfflineTtsSampleRate(tts_));
+
+  generate_btn_.EnableWindow(FALSE);
+
   const SherpaOnnxGeneratedAudio *audio =
-      SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed);
+      SherpaOnnxOfflineTtsGenerateWithCallback(tts_, ss.c_str(), speaker_id, speed, &AudioGeneratedCallback);
+
+  generate_btn_.EnableWindow(TRUE);
+
   output_filename_.GetWindowText(s);
   std::string filename = ToString(s);
+
   int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
                                filename.c_str());

   SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);

   if (ok) {
-    AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
+    // AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
+    AppendLineToMultilineEditCtrl(my_hint_, std::string("Saved to ") + filename + " successfully");
   } else {
-    AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
+    // AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
+    AppendLineToMultilineEditCtrl(my_hint_, std::string("Failed to save to ") + filename);
   }

   //CDialogEx::OnOK();
@@ -6,6 +6,16 @@

 #include "sherpa-onnx/c-api/c-api.h"

+#include <memory>
+#include <thread>
+
+#include "portaudio.h"
+
+class Microphone {
+ public:
+  Microphone();
+  ~Microphone();
+};

 // CNonStreamingTextToSpeechDlg dialog
 class CNonStreamingTextToSpeechDlg : public CDialogEx
@@ -34,16 +44,21 @@ protected:
   afx_msg void OnPaint();
   afx_msg HCURSOR OnQueryDragIcon();
   DECLARE_MESSAGE_MAP()
- public:
-  CEdit my_hint_;
-  CEdit speaker_id_;
-  CEdit speed_;
-  void Init();
-  void InitHint();
-  CButton generate_btn_;
-  afx_msg void OnBnClickedOk();
-
-  SherpaOnnxOfflineTts *tts_;
-  CEdit my_text_;
-  CEdit output_filename_;
+public:
+  CEdit my_hint_;
+  CEdit speaker_id_;
+  CEdit speed_;
+  void Init();
+  void InitHint();
+  CButton generate_btn_;
+  afx_msg void OnBnClickedOk();
+
+  SherpaOnnxOfflineTts *tts_ = nullptr;
+  CEdit my_text_;
+  CEdit output_filename_;
+
+private:
+  Microphone mic_;
+  std::unique_ptr<std::thread> play_thread_;
+
 };
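
A note on the new Microphone member above: despite the name, in this dialog it only manages the PortAudio library itself. It is an RAII wrapper whose constructor calls Pa_Initialize() and whose destructor calls Pa_Terminate(), so holding a Microphone mic_ in the dialog keeps PortAudio initialized for the dialog's whole lifetime. The following is a small hedged illustration of the same idiom outside MFC; the class and variable names are invented for the example.

#include <cstdio>

#include "portaudio.h"

// Same RAII idea as the Microphone class above, shown standalone.
class PortAudioSession {
 public:
  PortAudioSession() : ok_(Pa_Initialize() == paNoError) {
    if (!ok_) {
      std::fprintf(stderr, "Pa_Initialize() failed\n");
    }
  }
  ~PortAudioSession() {
    if (ok_) {
      Pa_Terminate();  // pair every successful Pa_Initialize() with one Pa_Terminate()
    }
  }

  // Non-copyable, so initialize/terminate cannot get out of balance.
  PortAudioSession(const PortAudioSession &) = delete;
  PortAudioSession &operator=(const PortAudioSession &) = delete;

 private:
  bool ok_ = false;
};

int main() {
  PortAudioSession session;  // Pa_Initialize() runs here
  // ... open streams, play audio ...
  return 0;
}  // Pa_Terminate() runs here, even on an early return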