Play generated audio using alsa for TTS (#482)

Fangjun Kuang · GitHub
Commit b18812ceff83b9560f96709b1c1521c4f8eda50b b18812ce 1 parent 9829d7c4
CMakeLists.txt
cmake/cmake_extension.py
setup.py
sherpa-onnx/csrc/CMakeLists.txt
sherpa-onnx/csrc/alsa-play.cc
sherpa-onnx/csrc/alsa-play.h
sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
--- a/CMakeLists.txt
查看文件 @b18812c
+++ b/CMakeLists.txt
查看文件 @b18812c
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)
 
- set(SHERPA_ONNX_VERSION "1.9.3")
+ set(SHERPA_ONNX_VERSION "1.9.4")
 
 # Disable warning about
 #
@@ -106,10 +106,23 @@ endif()
 set(CMAKE_CXX_EXTENSIONS OFF)
 message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
 
+ 
 include(CheckIncludeFileCXX)
- check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
- if(SHERPA_ONNX_HAS_ALSA)
-   add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
+ 
+ # ALSA is probed only on non-Apple UNIX systems. When the header is found,
+ # SHERPA_ONNX_ENABLE_ALSA is defined; otherwise a warning explains how to
+ # install it.
+ if(UNIX AND NOT APPLE)
+   check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
+   if(SHERPA_ONNX_HAS_ALSA)
+     add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
+   else()
+     message(WARNING "\
+ Could not find alsa/asoundlib.h !
+ We won't build sherpa-onnx-alsa and sherpa-onnx-offline-tts-play-alsa
+ To fix that, please do:
+   (1) sudo apt-get install alsa-utils libasound2-dev
+   (2) rm -rf build
+   (3) re-try
+   ")
+   endif()
 endif()
 
 check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H)
--- a/cmake/cmake_extension.py
查看文件 @b18812c
+++ b/cmake/cmake_extension.py
查看文件 @b18812c
@@ -144,6 +144,8 @@ class BuildExtension(build_ext):
         binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
         binaries += ["sherpa-onnx-offline-tts"]
         binaries += ["sherpa-onnx-offline-tts-play"]
+         binaries += ["sherpa-onnx-alsa"]
+         binaries += ["sherpa-onnx-offline-tts-play-alsa"]
 
         if is_windows():
             binaries += ["kaldi-native-fbank-core.dll"]
@@ -165,6 +167,11 @@ class BuildExtension(build_ext):
                 src_file = install_dir / "lib" / (f + suffix)
             if not src_file.is_file():
                 src_file = install_dir / ".." / (f + suffix)
+ 
+             if not src_file.is_file() and 'alsa' in f:
+                 print(f'Skipping {f}')
+                 continue
+ 
             print(f"Copying {src_file} to {out_bin_dir}/")
             shutil.copy(f"{src_file}", f"{out_bin_dir}/")
 
--- a/setup.py
查看文件 @b18812c
+++ b/setup.py
查看文件 @b18812c
@@ -60,6 +60,8 @@ def get_binaries_to_install():
     binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
     binaries += ["sherpa-onnx-offline-tts"]
     binaries += ["sherpa-onnx-offline-tts-play"]
+     binaries += ["sherpa-onnx-alsa"]
+     binaries += ["sherpa-onnx-offline-tts-play-alsa"]
     if is_windows():
         binaries += ["kaldi-native-fbank-core.dll"]
         binaries += ["sherpa-onnx-c-api.dll"]
--- a/sherpa-onnx/csrc/CMakeLists.txt
查看文件 @b18812c
+++ b/sherpa-onnx/csrc/CMakeLists.txt
查看文件 @b18812c
@@ -207,14 +207,42 @@ install(
 
 if(SHERPA_ONNX_HAS_ALSA)
   add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
-   target_link_libraries(sherpa-onnx-alsa sherpa-onnx-core)
+   add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
 
-   if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
-     target_link_libraries(sherpa-onnx-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
-   else()
-     target_link_libraries(sherpa-onnx-alsa asound)
+   set(exes
+     sherpa-onnx-alsa
+     sherpa-onnx-offline-tts-play-alsa
+   )
+   foreach(exe IN LISTS exes)
+     target_link_libraries(${exe} sherpa-onnx-core)
+   endforeach()
+ 
+   foreach(exe IN LISTS exes)
+     if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
+       target_link_libraries(${exe} -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
+     else()
+       target_link_libraries(${exe} asound)
+     endif()
+   endforeach()
+ 
+   if(NOT WIN32)
+     foreach(exe IN LISTS exes)
+       target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
+       target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
+     endforeach()
+ 
+     if(SHERPA_ONNX_ENABLE_PYTHON)
+       foreach(exe IN LISTS exes)
+         target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
+       endforeach()
+     endif()
   endif()
-   install(TARGETS sherpa-onnx-alsa DESTINATION bin)
+ 
+   install(
+     TARGETS ${exes}
+     DESTINATION
+       bin
+   )
 endif()
 
 if(SHERPA_ONNX_ENABLE_PORTAUDIO)
--- a/sherpa-onnx/csrc/alsa-play.cc 0 → 100644
查看文件 @b18812c
+++ b/sherpa-onnx/csrc/alsa-play.cc 0 → 100644
查看文件 @b18812c
+ // sherpa-onnx/csrc/alsa-play.cc
+ //
+ // Copyright (c)  2022-2023  Xiaomi Corporation
+ 
+ #ifdef SHERPA_ONNX_ENABLE_ALSA
+ 
+ #include "sherpa-onnx/csrc/alsa-play.h"
+ 
+ #include <algorithm>
+ 
+ namespace sherpa_onnx {
+ 
+ // Opens the PCM device `device_name` for playback and then configures it for
+ // `sample_rate` via SetParameters(). If the device cannot be opened, an error
+ // is printed to stderr and the process exits.
+ AlsaPlay::AlsaPlay(const char *device_name, int32_t sample_rate) {
+   const int32_t status =
+       snd_pcm_open(&handle_, device_name, SND_PCM_STREAM_PLAYBACK, 0);
+ 
+   if (status != 0) {
+     fprintf(stderr, "Unable to open: %s. %s\n", device_name,
+             snd_strerror(status));
+     exit(-1);
+   }
+ 
+   SetParameters(sample_rate);
+ }
+ 
+ // Closes the PCM handle if one is held. A failure to close is only reported;
+ // nothing else is done about it.
+ AlsaPlay::~AlsaPlay() {
+   if (handle_ == nullptr) {
+     return;
+   }
+ 
+   const int32_t status = snd_pcm_close(handle_);
+   if (status < 0) {
+     printf("Failed to close pcm: %s\n", snd_strerror(status));
+   }
+ }
+ 
+ // Configures the opened PCM device with the following hardware parameters:
+ //   1. access: interleaved read/write
+ //   2. sample format: int16_t (little endian)
+ //   3. num_channels: 1
+ //   4. sample_rate: `sample_rate`, or the nearest rate the device accepts
+ //
+ // If the device ends up with a rate different from `sample_rate`, a
+ // LinearResample is created so that Play() can convert the samples.
+ // buf_ is sized to one period; Play() writes at most that many frames per
+ // call to snd_pcm_writei().
+ //
+ // A failure that would make playback incorrect terminates the process.
+ void AlsaPlay::SetParameters(int32_t sample_rate) {
+   snd_pcm_hw_params_t *params;
+   snd_pcm_hw_params_alloca(&params);
+ 
+   int32_t err = snd_pcm_hw_params_any(handle_, params);
+   if (err < 0) {
+     printf("Can't get the hardware configuration space: %s\n",
+            snd_strerror(err));
+     exit(-1);
+   }
+ 
+   err = snd_pcm_hw_params_set_access(handle_, params,
+                                      SND_PCM_ACCESS_RW_INTERLEAVED);
+   if (err < 0) {
+     printf("SND_PCM_ACCESS_RW_INTERLEAVED is not supported: %s\n",
+            snd_strerror(err));
+     exit(-1);
+   }
+ 
+   err = snd_pcm_hw_params_set_format(handle_, params, SND_PCM_FORMAT_S16_LE);
+   if (err < 0) {
+     printf("Can't set format to 16-bit: %s\n", snd_strerror(err));
+     exit(-1);
+   }
+ 
+   err = snd_pcm_hw_params_set_channels(handle_, params, 1);
+   if (err < 0) {
+     // Play() fills exactly one int16_t per frame. With more than one channel
+     // snd_pcm_writei() would read past the end of buf_, so this is fatal.
+     printf("Can't set channel number to 1: %s\n", snd_strerror(err));
+     exit(-1);
+   }
+ 
+   uint32_t rate = sample_rate;
+   err = snd_pcm_hw_params_set_rate_near(handle_, params, &rate, 0);
+   if (err < 0) {
+     // Not fatal: the rate that is actually installed is queried below and a
+     // resampler bridges any difference.
+     printf("Can't set rate to %d. %s\n", sample_rate, snd_strerror(err));
+   }
+ 
+   err = snd_pcm_hw_params(handle_, params);
+   if (err < 0) {
+     printf("Can't set hardware parameters. %s\n", snd_strerror(err));
+     exit(-1);
+   }
+ 
+   // Fall back to the rate negotiated above if the query fails, instead of
+   // reading an uninitialized value.
+   uint32_t tmp = rate;
+   if (snd_pcm_hw_params_get_rate(params, &tmp, 0) < 0) {
+     tmp = rate;
+   }
+ 
+   int32_t actual_sample_rate = tmp;
+   if (actual_sample_rate != sample_rate) {
+     fprintf(stderr,
+             "Creating a resampler:\n"
+             "   in_sample_rate: %d\n"
+             "   output_sample_rate: %d\n",
+             sample_rate, actual_sample_rate);
+ 
+     float min_freq = std::min(actual_sample_rate, sample_rate);
+     float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+ 
+     int32_t lowpass_filter_width = 6;
+     resampler_ = std::make_unique<LinearResample>(
+         sample_rate, actual_sample_rate, lowpass_cutoff, lowpass_filter_width);
+   }
+ 
+   // buf_ only determines how many frames Play() hands to ALSA per write, so
+   // any positive size works. Use a fixed size if the period size is
+   // unavailable; an empty buf_ would leave Play() with nothing to write into.
+   snd_pcm_uframes_t frames = 0;
+   err = snd_pcm_hw_params_get_period_size(params, &frames, 0);
+   if (err < 0 || frames == 0) {
+     frames = 1024;
+   }
+   buf_.resize(frames);
+ }
+ 
+ // Writes `samples` to the PCM device.
+ //
+ // When a resampler exists (the device rate differs from the rate passed to
+ // the constructor), the samples are resampled first. Each sample is clamped
+ // to [-1, 1] before it is scaled by 32767, because converting a float that
+ // does not fit into int16_t is undefined behavior. The data is handed to
+ // ALSA in chunks of at most buf_.size() frames.
+ //
+ // An underrun (-EPIPE) is reported and the device is re-prepared; any other
+ // write error terminates the process.
+ void AlsaPlay::Play(const std::vector<float> &samples) {
+   std::vector<float> resampled;
+   const float *p = samples.data();
+   int32_t num_samples = samples.size();
+   if (resampler_) {
+     resampler_->Resample(samples.data(), samples.size(), false, &resampled);
+     p = resampled.data();
+     num_samples = resampled.size();
+   }
+ 
+   const int32_t frames = buf_.size();
+   if (frames <= 0) {
+     // Without a staging buffer the loop below could never make progress.
+     return;
+   }
+ 
+   for (int32_t i = 0; i < num_samples; i += frames) {
+     // The last chunk may be shorter than a full buffer.
+     const int32_t n = std::min(frames, num_samples - i);
+ 
+     for (int32_t k = 0; k != n; ++k) {
+       const float s = std::max(-1.0f, std::min(1.0f, p[i + k]));
+       buf_[k] = static_cast<int16_t>(s * 32767);
+     }
+ 
+     int32_t err = snd_pcm_writei(handle_, buf_.data(), n);
+     if (err == -EPIPE) {
+       printf("XRUN.\n");
+       snd_pcm_prepare(handle_);
+     } else if (err < 0) {
+       printf("Can't write to PCM device: %s\n", snd_strerror(err));
+       exit(-1);
+     }
+   }
+ }
+ 
+ // Calls snd_pcm_drain() on the device so that pending samples get played.
+ // A failure is only reported.
+ void AlsaPlay::Drain() {
+   const int32_t status = snd_pcm_drain(handle_);
+   if (status >= 0) {
+     return;
+   }
+ 
+   printf("Failed to drain pcm. %s\n", snd_strerror(status));
+ }
+ 
+ }  // namespace sherpa_onnx
+ 
+ #endif  // SHERPA_ONNX_ENABLE_ALSA
--- a/sherpa-onnx/csrc/alsa-play.h 0 → 100644
查看文件 @b18812c
+++ b/sherpa-onnx/csrc/alsa-play.h 0 → 100644
查看文件 @b18812c
+ // sherpa-onnx/csrc/alsa-play.h
+ //
+ // Copyright (c)  2022-2023  Xiaomi Corporation
+ 
+ #ifndef SHERPA_ONNX_CSRC_ALSA_PLAY_H_
+ #define SHERPA_ONNX_CSRC_ALSA_PLAY_H_
+ 
+ #include <cstdint>
+ #include <memory>
+ #include <vector>
+ 
+ #include "alsa/asoundlib.h"
+ #include "sherpa-onnx/csrc/resample.h"
+ 
+ namespace sherpa_onnx {
+ 
+ // Plays mono float samples on an ALSA PCM playback device.
+ //
+ // The object owns the underlying snd_pcm_t handle and closes it in its
+ // destructor. Copying is disabled because two copies would close the same
+ // handle twice.
+ class AlsaPlay {
+  public:
+   // Opens `device_name` for playback of audio with the given `sample_rate`.
+   AlsaPlay(const char *device_name, int32_t sample_rate);
+   ~AlsaPlay();
+ 
+   AlsaPlay(const AlsaPlay &) = delete;
+   AlsaPlay &operator=(const AlsaPlay &) = delete;
+ 
+   // Converts `samples` to 16-bit PCM (resampling them first if needed) and
+   // writes them to the device.
+   void Play(const std::vector<float> &samples);
+ 
+   // wait for all the samples to be played
+   void Drain();
+ 
+  private:
+   // Sets access mode, sample format, channel count and sample rate on the
+   // device; creates resampler_ and sizes buf_.
+   void SetParameters(int32_t sample_rate);
+ 
+  private:
+   // PCM handle returned by snd_pcm_open(); nullptr if not opened.
+   snd_pcm_t *handle_ = nullptr;
+   // Only set when the device rate differs from the requested sample rate.
+   std::unique_ptr<LinearResample> resampler_;
+   // Staging buffer for the int16_t samples handed to snd_pcm_writei().
+   std::vector<int16_t> buf_;
+ };
+ 
+ }  // namespace sherpa_onnx
+ 
+ #endif  // SHERPA_ONNX_CSRC_ALSA_PLAY_H_
--- a/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc 0 → 100644
查看文件 @b18812c
+++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc 0 → 100644
查看文件 @b18812c
+ // sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
+ //
+ // Copyright (c)  2022-2023  Xiaomi Corporation
+ 
+ // see https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
+ // https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m___h_w___params.html
+ // https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
+ 
+ #include <signal.h>
+ 
+ #include <algorithm>
+ #include <atomic>
+ #include <chrono>              // NOLINT
+ #include <condition_variable>  // NOLINT
+ #include <fstream>
+ #include <mutex>  // NOLINT
+ #include <queue>
+ #include <string>
+ #include <thread>  // NOLINT
+ #include <utility>
+ #include <vector>
+ 
+ #include "sherpa-onnx/csrc/alsa-play.h"
+ #include "sherpa-onnx/csrc/offline-tts.h"
+ #include "sherpa-onnx/csrc/parse-options.h"
+ #include "sherpa-onnx/csrc/wave-writer.h"
+ 
+ // Notified whenever a new chunk is queued or generation has finished.
+ // The playback thread waits on it while holding g_buffer.mutex.
+ static std::condition_variable g_cv;
+ 
+ // Queue of generated audio chunks shared between the thread that generates
+ // audio (producer) and the playback thread (consumer).
+ // `mutex` must be held whenever `samples` is accessed.
+ struct Buffer {
+   std::queue<std::vector<float>> samples;
+   std::mutex mutex;
+ };
+ 
+ static Buffer g_buffer;
+ 
+ // Set by main() once generation has finished: the playback thread plays what
+ // is left in the queue and then returns.
+ static std::atomic<bool> g_stopped{false};
+ 
+ // Set by the SIGINT handler: the playback thread returns without draining.
+ // Both flags are atomic because they are written by one thread (or the signal
+ // handler) and read by another.
+ static std::atomic<bool> g_killed{false};
+ 
+ // SIGINT handler. The first Ctrl + C asks the playback thread to stop; a
+ // second one terminates the process.
+ //
+ // NOTE(review): fprintf() and exit() are not async-signal-safe; they are kept
+ // to preserve the existing user-visible behavior.
+ static void Handler(int32_t /*sig*/) {
+   if (g_killed) {
+     exit(0);
+   }
+ 
+   g_killed = true;
+   fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
+ }
+ 
+ // Passed to OfflineTts::Generate() in main(). Copies the `n` samples starting
+ // at `s` into the shared queue and wakes up the playback thread.
+ static void AudioGeneratedCallback(const float *s, int32_t n) {
+   if (n <= 0) {
+     return;
+   }
+ 
+   {
+     std::lock_guard<std::mutex> lock(g_buffer.mutex);
+     g_buffer.samples.emplace(s, s + n);
+   }
+ 
+   g_cv.notify_all();
+ }
+ 
+ // Body of the playback thread.
+ //
+ // Opens `device_name` and plays queued chunks until either
+ //   - g_killed is set: returns immediately without draining, or
+ //   - g_stopped is set and the queue is empty: drains the device and returns.
+ //
+ // The queue is only touched while g_buffer.mutex is held, and the (blocking)
+ // ALSA write happens outside the lock so the producer is never stalled by
+ // playback. The wait uses a timeout because neither the signal handler nor
+ // main() holds the mutex when changing the flags, so a notification can be
+ // missed; the timeout bounds how long that goes unnoticed.
+ static void StartPlayback(const std::string &device_name, int32_t sample_rate) {
+   sherpa_onnx::AlsaPlay alsa(device_name.c_str(), sample_rate);
+ 
+   std::vector<float> chunk;
+   while (!g_killed) {
+     {
+       std::unique_lock<std::mutex> lock(g_buffer.mutex);
+       if (g_buffer.samples.empty()) {
+         if (g_stopped) {
+           // Generation is done and everything queued has been played.
+           break;
+         }
+ 
+         g_cv.wait_for(lock, std::chrono::milliseconds(100));
+         continue;
+       }
+ 
+       chunk = std::move(g_buffer.samples.front());
+       g_buffer.samples.pop();
+     }
+ 
+     alsa.Play(chunk);
+   }
+ 
+   if (g_killed) {
+     return;
+   }
+ 
+   alsa.Drain();
+ }
+ 
+ // Entry point.
+ //
+ // Parses the command line, loads the TTS model, starts a playback thread and
+ // generates audio for the single positional argument. Generated chunks are
+ // handed to the playback thread through AudioGeneratedCallback while
+ // generation is still running. Afterwards the complete audio is written to
+ // --output-filename and the playback thread is joined.
+ //
+ // Returns 0 on success; exits with EXIT_FAILURE on invalid arguments, an
+ // invalid config, a generation failure, or a failure to write the wave file.
+ int main(int32_t argc, char *argv[]) {
+   signal(SIGINT, Handler);
+ 
+   const char *kUsageMessage = R"usage(
+ Offline text-to-speech with sherpa-onnx.
+ 
+ It plays the generated audio as the model is processing.
+ 
+ Note that it is alsa so it works only on **Linux**. For instance, you can
+ use it on Raspberry Pi.
+ 
+ Usage example:
+ 
+ wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
+ tar xf vits-piper-en_US-amy-low.tar.bz2
+ 
+ ./bin/sherpa-onnx-offline-tts-play-alsa \
+  --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
+  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
+  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
+  --output-filename=./generated.wav \
+  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+ 
+ It will generate a file ./generated.wav as specified by --output-filename.
+ 
+ You can find more models at
+ https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
+ 
+ Please see
+ https://k2-fsa.github.io/sherpa/onnx/tts/index.html
+ for details.
+ )usage";
+ 
+   sherpa_onnx::ParseOptions po(kUsageMessage);
+   std::string device_name = "default";
+   std::string output_filename = "./generated.wav";
+   int32_t sid = 0;
+ 
+   po.Register("output-filename", &output_filename,
+               "Path to save the generated audio");
+ 
+   po.Register("device-name", &device_name,
+               "Name of the device to play the generated audio");
+ 
+   po.Register("sid", &sid,
+               "Speaker ID. Used only for multi-speaker models, e.g., models "
+               "trained using the VCTK dataset. Not used for single-speaker "
+               "models, e.g., models trained using the LJSpeech dataset");
+ 
+   sherpa_onnx::OfflineTtsConfig config;
+ 
+   config.Register(&po);
+   po.Read(argc, argv);
+ 
+   // Exactly one positional argument (the text) is accepted.
+   if (po.NumArgs() == 0) {
+     fprintf(stderr, "Error: Please provide the text to generate audio.\n\n");
+     po.PrintUsage();
+     exit(EXIT_FAILURE);
+   }
+ 
+   if (po.NumArgs() > 1) {
+     fprintf(stderr,
+             "Error: Accept only one positional argument. Please use single "
+             "quotes to wrap your text\n");
+     po.PrintUsage();
+     exit(EXIT_FAILURE);
+   }
+ 
+   if (!config.Validate()) {
+     fprintf(stderr, "Errors in config!\n");
+     exit(EXIT_FAILURE);
+   }
+ 
+   // This program always runs with max_num_sentences == 1, whatever the user
+   // passed on the command line.
+   if (config.max_num_sentences != 1) {
+     fprintf(stderr, "Setting config.max_num_sentences to 1\n");
+     config.max_num_sentences = 1;
+   }
+ 
+   fprintf(stderr, "Loading the model\n");
+   sherpa_onnx::OfflineTts tts(config);
+ 
+   fprintf(stderr, "Start the playback thread\n");
+   std::thread playback_thread(StartPlayback, device_name, tts.SampleRate());
+ 
+   float speed = 1.0;
+ 
+   fprintf(stderr, "Generating ...\n");
+   const auto begin = std::chrono::steady_clock::now();
+   auto audio = tts.Generate(po.GetArg(1), sid, speed, AudioGeneratedCallback);
+   const auto end = std::chrono::steady_clock::now();
+ 
+   // Tell the playback thread that no more chunks will arrive.
+   g_stopped = true;
+   g_cv.notify_all();
+   fprintf(stderr, "Generating done!\n");
+   if (audio.samples.empty()) {
+     fprintf(
+         stderr,
+         "Error in generating audio. Please read previous error messages.\n");
+     exit(EXIT_FAILURE);
+   }
+ 
+   float elapsed_seconds =
+       std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
+           .count() /
+       1000.;
+   float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
+ 
+   // Real-time factor: time spent generating divided by the audio duration.
+   float rtf = elapsed_seconds / duration;
+   fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
+   fprintf(stderr, "Audio duration: %.3f s\n", duration);
+   fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
+           duration, rtf);
+ 
+   bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate,
+                                    audio.samples.data(), audio.samples.size());
+   if (!ok) {
+     fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str());
+     exit(EXIT_FAILURE);
+   }
+ 
+   fprintf(stderr, "The text is: %s. Speaker ID: %d\n\n", po.GetArg(1).c_str(),
+           sid);
+   fprintf(stderr, "\n**** Saved to %s successfully! ****\n",
+           output_filename.c_str());
+ 
+   fprintf(stderr, "\n");
+   fprintf(
+       stderr,
+       "Wait for the playback to finish. You can safely press ctrl + C to stop "
+       "the playback.\n");
+   playback_thread.join();
+ 
+   fprintf(stderr, "Done!\n");
+ 
+   return 0;
+ }