Fangjun Kuang
Committed by GitHub

Support playing generated audio as it is generating for MFC. (#462)

* Support playing generated audio as it is generating for MFC.

* support espeak-ng-data
... ... @@ -80,3 +80,4 @@ jslint.mjs
vits-piper-en_US-amy-low
vits-piper-*-*-*
log
*.exe
... ...
... ... @@ -214,7 +214,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
param.sampleFormat = paFloat32;
param.suggestedLatency = info->defaultLowInputLatency;
param.hostApiSpecificStreamInfo = nullptr;
float sample_rate = config_.feat_config.sample_rate;
float sample_rate = static_cast<float>(config_.feat_config.sample_rate);
pa_stream_ = nullptr;
PaError err =
Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
... ... @@ -259,7 +259,7 @@ void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_);
AcceptWaveformOffline(stream, config_.feat_config.sample_rate,
samples_.data(), samples_.size());
samples_.data(), static_cast<int32_t>(samples_.size()));
DecodeOfflineStream(recognizer_, stream);
auto r = GetOfflineStreamResult(stream);
results_.emplace_back(r->text);
... ...
// Microsoft Visual C++ generated resource script.
... ...
... ... @@ -9,14 +9,184 @@
#include "afxdialogex.h"
#include <fstream>
#include <mutex> // NOLINT
#include <queue>
#include <stdexcept>
#include <string>
#include <thread> // NOLINT
#include <vector>
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
Microphone::Microphone() {
PaError err = Pa_Initialize();
if (err != paNoError) {
fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
exit(-2);
}
}
Microphone::~Microphone() {
PaError err = Pa_Terminate();
if (err != paNoError) {
fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
exit(-2);
}
}
// NOTE(fangjun): Code is copied from
// https://github.com/k2-fsa/sherpa-onnx/blob/master/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play.cc#L22
static std::condition_variable g_cv;
static std::mutex g_cv_m;
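// A minimal producer/consumer buffer: the TTS generation callback pushes
// chunks of samples into g_buffer while the PortAudio playback callback pops
// and plays them, so playback can start before the whole text is synthesized.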
struct Samples {
std::vector<float> data;
int32_t consumed = 0;
};
struct Buffer {
std::queue<Samples> samples;
std::mutex mutex;
};
static Buffer g_buffer;
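// Flags shared between the generation callback, the playback thread, and the
// UI thread: g_started means the first audio chunk has arrived, g_stopped
// means no more chunks will be produced, and g_killed aborts playback.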
static bool g_started = false;
static bool g_stopped = false;
static bool g_killed = false;
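// Invoked by the TTS engine whenever a chunk of n samples has been generated.
// The chunk is copied into the shared buffer for the playback callback.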
static void AudioGeneratedCallback(const float *s, int32_t n) {
if (n > 0) {
Samples samples;
samples.data = std::vector<float>{s, s + n};
std::lock_guard<std::mutex> lock(g_buffer.mutex);
g_buffer.samples.push(std::move(samples));
g_started = true;
}
}
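// PortAudio output callback. It runs on the audio thread and fills the output
// buffer with up to n samples taken from the shared queue, padding with zeros
// whenever the queue is temporarily empty.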
static int PlayCallback(const void * /*in*/, void *out,
unsigned long _n, // NOLINT
const PaStreamCallbackTimeInfo * /*time_info*/,
PaStreamCallbackFlags /*status_flags*/,
void * /*user_data*/) {
int32_t n = static_cast<int32_t>(_n);
if (g_killed) {
return paComplete;
}
float *pout = reinterpret_cast<float *>(out);
std::lock_guard<std::mutex> lock(g_buffer.mutex);
if (g_buffer.samples.empty()) {
if (g_stopped) {
// no more data is available and we have processed all of the samples
return paComplete;
}
// Very unlikely, but possible: the current sentence is so long that the
// model has not finished generating samples for it yet.
std::fill_n(pout, n, 0);
return paContinue;
}
int32_t k = 0;
for (; k < n && !g_buffer.samples.empty();) {
int32_t this_block = n - k;
auto &p = g_buffer.samples.front();
int32_t remaining = static_cast<int32_t>(p.data.size()) - p.consumed;
if (this_block <= remaining) {
std::copy(p.data.begin() + p.consumed,
p.data.begin() + p.consumed + this_block, pout + k);
p.consumed += this_block;
k = n;
if (p.consumed == p.data.size()) {
g_buffer.samples.pop();
}
break;
}
std::copy(p.data.begin() + p.consumed, p.data.end(), pout + k);
k += static_cast<int32_t>(p.data.size()) - p.consumed;
g_buffer.samples.pop();
}
if (k < n) {
std::fill_n(pout + k, n - k, 0);
}
if (g_stopped && g_buffer.samples.empty()) {
return paComplete;
}
return paContinue;
}
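// Called by PortAudio once the stream is marked as finished; it wakes up
// StartPlayback() so the stream can be stopped and closed.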
static void PlayCallbackFinished(void *userData) { g_cv.notify_all(); }
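// Runs on the playback thread: opens a mono float32 output stream at the
// given sample rate and blocks until playback is finished or killed.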
static void StartPlayback(int32_t sample_rate) {
int32_t frames_per_buffer = 1024;
PaStreamParameters outputParameters;
PaStream *stream;
PaError err;
outputParameters.device =
Pa_GetDefaultOutputDevice(); /* default output device */
outputParameters.channelCount = 1; /* mono output */
outputParameters.sampleFormat = paFloat32; /* 32 bit floating point output */
outputParameters.suggestedLatency =
Pa_GetDeviceInfo(outputParameters.device)->defaultLowOutputLatency;
outputParameters.hostApiSpecificStreamInfo = nullptr;
err = Pa_OpenStream(&stream, nullptr, /* no input */
&outputParameters, sample_rate, frames_per_buffer,
paClipOff, // we won't output out of range samples so
// don't bother clipping them
PlayCallback, nullptr);
if (err != paNoError) {
fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
return;
}
err = Pa_SetStreamFinishedCallback(stream, &PlayCallbackFinished);
if (err != paNoError) {
fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
return;
}
err = Pa_StartStream(stream);
if (err != paNoError) {
fprintf(stderr, "%d portaudio error: %s\n", __LINE__, Pa_GetErrorText(err));
return;
}
std::unique_lock<std::mutex> lock(g_cv_m);
while (!g_killed && !g_stopped &&
(!g_started || (g_started && !g_buffer.samples.empty()))) {
g_cv.wait(lock);
}
err = Pa_StopStream(stream);
if (err != paNoError) {
return;
}
err = Pa_CloseStream(stream);
if (err != paNoError) {
return;
}
}
// CAboutDlg dialog used for App About
... ... @@ -261,8 +431,8 @@ void CNonStreamingTextToSpeechDlg::Init() {
ok = false;
}
if (!Exists("./lexicon.txt")) {
error_message += "Cannot find ./lexicon.txt\r\n";
if (!Exists("./lexicon.txt") && !Exists("./espeak-ng-data/phontab")) {
error_message += "Cannot find espeak-ng-data directory or ./lexicon.txt\r\n";
ok = false;
}
... ... @@ -275,21 +445,17 @@ void CNonStreamingTextToSpeechDlg::Init() {
generate_btn_.EnableWindow(FALSE);
error_message +=
"\r\nPlease refer to\r\n"
"https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html";
"https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models";
error_message += "\r\nto download models.\r\n";
error_message += "\r\nWe given an example below\r\n\r\n";
error_message +=
"wget -O model.onnx "
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
"vits-aishell3.onnx\r\n";
error_message += "\r\nWe give an example below\r\n\r\n";
error_message +=
"wget "
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
"lexicon.txt\r\n";
error_message +=
"wget "
"https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
"tokens.txt\r\n";
"1. Download vits-piper-en_US-amy-low.tar.bz2 from the following URL\r\n\r\n"
"https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2\r\n\r\n"
"2. Uncompress it and you will get a directory vits-piper-en_US-amy-low \r\n\r\n"
"3. Switch to the directory vits-piper-en_US-amy-low \r\n\r\n"
"4. Rename en_US-amy-low.onnx to model.onnx \r\n\r\n"
"5. Copy the current exe to the directory vits-piper-en_US-amy-low\r\n\r\n"
"6. Done! You can now run the exe in the directory vits-piper-en_US-amy-low\r\n\r\n";
AppendLineToMultilineEditCtrl(my_hint_, error_message);
return;
... ... @@ -299,10 +465,14 @@ void CNonStreamingTextToSpeechDlg::Init() {
SherpaOnnxOfflineTtsConfig config;
memset(&config, 0, sizeof(config));
config.model.debug = 0;
config.model.num_threads = 1;
config.model.num_threads = 2;
config.model.provider = "cpu";
config.model.vits.model = "./model.onnx";
config.model.vits.lexicon = "./lexicon.txt";
if (Exists("./espeak-ng-data/phontab")) {
config.model.vits.data_dir = "./espeak-ng-data";
} else {
config.model.vits.lexicon = "./lexicon.txt";
}
config.model.vits.tokens = "./tokens.txt";
tts_ = SherpaOnnxCreateOfflineTts(&config);
... ... @@ -321,7 +491,6 @@ void CNonStreamingTextToSpeechDlg::Init() {
}
void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
// TODO: Add your control notification handler code here
CString s;
speaker_id_.GetWindowText(s);
int speaker_id = _ttoi(s);
... ... @@ -338,25 +507,51 @@ void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
}
my_text_.GetWindowText(s);
std::string ss = ToString(s);
if (ss.empty()) {
AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK);
return;
}
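// Stop and join the previous playback thread, if any, before starting a new
// one for this request.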
if (play_thread_) {
g_killed = true;
g_stopped = true;
if (play_thread_->joinable()) {
play_thread_->join();
}
}
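// Reset the shared playback state for the new request.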
g_killed = false;
g_stopped = false;
g_started = false;
g_buffer.samples = {};
// Caution(fangjun): It is not efficient to re-create the playback thread for
// every request. We use this approach for simplicity.
play_thread_ = std::make_unique<std::thread>(StartPlayback, SherpaOnnxOfflineTtsSampleRate(tts_));
generate_btn_.EnableWindow(FALSE);
const SherpaOnnxGeneratedAudio *audio =
SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed);
SherpaOnnxOfflineTtsGenerateWithCallback(tts_, ss.c_str(), speaker_id, speed, &AudioGeneratedCallback);
generate_btn_.EnableWindow(TRUE);
output_filename_.GetWindowText(s);
std::string filename = ToString(s);
int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
filename.c_str());
SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
if (ok) {
AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
// AfxMessageBox(Utf8ToUtf16(std::string("Saved to ") + filename + " successfully").c_str(), MB_OK);
AppendLineToMultilineEditCtrl(my_hint_, std::string("Saved to ") + filename + " successfully");
} else {
AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
// AfxMessageBox(Utf8ToUtf16(std::string("Failed to save to ") + filename).c_str(), MB_OK);
AppendLineToMultilineEditCtrl(my_hint_, std::string("Failed to saved to ") + filename);
}
//CDialogEx::OnOK();
... ...
... ... @@ -6,6 +6,16 @@
#include "sherpa-onnx/c-api/c-api.h"
#include <memory>
#include <thread>
#include "portaudio.h"
class Microphone {
public:
Microphone();
~Microphone();
};
// CNonStreamingTextToSpeechDlg dialog
class CNonStreamingTextToSpeechDlg : public CDialogEx
... ... @@ -34,16 +44,21 @@ protected:
afx_msg void OnPaint();
afx_msg HCURSOR OnQueryDragIcon();
DECLARE_MESSAGE_MAP()
public:
CEdit my_hint_;
CEdit speaker_id_;
CEdit speed_;
void Init();
void InitHint();
CButton generate_btn_;
afx_msg void OnBnClickedOk();
SherpaOnnxOfflineTts *tts_;
CEdit my_text_;
CEdit output_filename_;
public:
CEdit my_hint_;
CEdit speaker_id_;
CEdit speed_;
void Init();
void InitHint();
CButton generate_btn_;
afx_msg void OnBnClickedOk();
SherpaOnnxOfflineTts *tts_ = nullptr;
CEdit my_text_;
CEdit output_filename_;
private:
Microphone mic_;
std::unique_ptr<std::thread> play_thread_;
};
... ...