Fangjun Kuang
Committed by GitHub

Play generated audio using alsa for TTS (#482)

1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 1 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 project(sherpa-onnx) 2 project(sherpa-onnx)
3 3
4 -set(SHERPA_ONNX_VERSION "1.9.3") 4 +set(SHERPA_ONNX_VERSION "1.9.4")
5 5
6 # Disable warning about 6 # Disable warning about
7 # 7 #
@@ -106,10 +106,23 @@ endif() @@ -106,10 +106,23 @@ endif()
106 set(CMAKE_CXX_EXTENSIONS OFF) 106 set(CMAKE_CXX_EXTENSIONS OFF)
107 message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}") 107 message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
108 108
  109 +
include(CheckIncludeFileCXX)

# ALSA is only probed on non-Apple UNIX platforms. When the header is found,
# SHERPA_ONNX_ENABLE_ALSA=1 is defined for all targets and the ALSA-based
# executables are built; otherwise a warning explains how to install it.
if(UNIX AND NOT APPLE)
  check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
  if(SHERPA_ONNX_HAS_ALSA)
    add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
  else()
    message(WARNING "\
Could not find alsa/asoundlib.h !
We won't build sherpa-onnx-alsa and sherpa-onnx-offline-tts-play-alsa
To fix that, please do:
  (1) sudo apt-get install alsa-utils libasound2-dev
  (2) rm -rf build
  (3) re-try
  ")
  endif()
endif()
114 127
115 check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H) 128 check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H)
@@ -144,6 +144,8 @@ class BuildExtension(build_ext): @@ -144,6 +144,8 @@ class BuildExtension(build_ext):
144 binaries += ["sherpa-onnx-vad-microphone-offline-asr"] 144 binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
145 binaries += ["sherpa-onnx-offline-tts"] 145 binaries += ["sherpa-onnx-offline-tts"]
146 binaries += ["sherpa-onnx-offline-tts-play"] 146 binaries += ["sherpa-onnx-offline-tts-play"]
  147 + binaries += ["sherpa-onnx-alsa"]
  148 + binaries += ["sherpa-onnx-offline-tts-play-alsa"]
147 149
148 if is_windows(): 150 if is_windows():
149 binaries += ["kaldi-native-fbank-core.dll"] 151 binaries += ["kaldi-native-fbank-core.dll"]
@@ -165,6 +167,11 @@ class BuildExtension(build_ext): @@ -165,6 +167,11 @@ class BuildExtension(build_ext):
165 src_file = install_dir / "lib" / (f + suffix) 167 src_file = install_dir / "lib" / (f + suffix)
166 if not src_file.is_file(): 168 if not src_file.is_file():
167 src_file = install_dir / ".." / (f + suffix) 169 src_file = install_dir / ".." / (f + suffix)
  170 +
  171 + if not src_file.is_file() and 'alsa' in f:
  172 + print(f'Skipping {f}')
  173 + continue
  174 +
168 print(f"Copying {src_file} to {out_bin_dir}/") 175 print(f"Copying {src_file} to {out_bin_dir}/")
169 shutil.copy(f"{src_file}", f"{out_bin_dir}/") 176 shutil.copy(f"{src_file}", f"{out_bin_dir}/")
170 177
@@ -60,6 +60,8 @@ def get_binaries_to_install(): @@ -60,6 +60,8 @@ def get_binaries_to_install():
60 binaries += ["sherpa-onnx-vad-microphone-offline-asr"] 60 binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
61 binaries += ["sherpa-onnx-offline-tts"] 61 binaries += ["sherpa-onnx-offline-tts"]
62 binaries += ["sherpa-onnx-offline-tts-play"] 62 binaries += ["sherpa-onnx-offline-tts-play"]
  63 + binaries += ["sherpa-onnx-alsa"]
  64 + binaries += ["sherpa-onnx-offline-tts-play-alsa"]
63 if is_windows(): 65 if is_windows():
64 binaries += ["kaldi-native-fbank-core.dll"] 66 binaries += ["kaldi-native-fbank-core.dll"]
65 binaries += ["sherpa-onnx-c-api.dll"] 67 binaries += ["sherpa-onnx-c-api.dll"]
@@ -207,14 +207,42 @@ install( @@ -207,14 +207,42 @@ install(
207 207
if(SHERPA_ONNX_HAS_ALSA)
  # Command-line tools that talk to the sound card through ALSA.
  add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
  add_executable(sherpa-onnx-offline-tts-play-alsa
    sherpa-onnx-offline-tts-play-alsa.cc
    alsa-play.cc
  )

  set(exes
    sherpa-onnx-alsa
    sherpa-onnx-offline-tts-play-alsa
  )

  # Every ALSA executable gets the same link line, in this order:
  # the core library, libasound, then the run-time search paths.
  foreach(alsa_exe IN LISTS exes)
    target_link_libraries(${alsa_exe} sherpa-onnx-core)

    # SHERPA_ONNX_ALSA_LIB_DIR (environment, read at configure time) lets
    # the user point at a non-default location of libasound.
    if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
      target_link_libraries(${alsa_exe} -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
    else()
      target_link_libraries(${alsa_exe} asound)
    endif()

    if(NOT WIN32)
      target_link_libraries(${alsa_exe}
        "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib"
        "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib"
      )

      if(SHERPA_ONNX_ENABLE_PYTHON)
        target_link_libraries(${alsa_exe}
          "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib"
        )
      endif()
    endif()
  endforeach()

  install(TARGETS ${exes} DESTINATION bin)
endif()
219 247
220 if(SHERPA_ONNX_ENABLE_PORTAUDIO) 248 if(SHERPA_ONNX_ENABLE_PORTAUDIO)
  1 +// sherpa-onnx/csrc/alsa-play.cc
  2 +//
  3 +// Copyright (c) 2022-2023 Xiaomi Corporation
  4 +
  5 +#ifdef SHERPA_ONNX_ENABLE_ALSA
  6 +
  7 +#include "sherpa-onnx/csrc/alsa-play.h"
  8 +
  9 +#include <algorithm>
  10 +
  11 +namespace sherpa_onnx {
  12 +
  13 +AlsaPlay::AlsaPlay(const char *device_name, int32_t sample_rate) {
  14 + int32_t err = snd_pcm_open(&handle_, device_name, SND_PCM_STREAM_PLAYBACK, 0);
  15 +
  16 + if (err) {
  17 + fprintf(stderr, "Unable to open: %s. %s\n", device_name, snd_strerror(err));
  18 + exit(-1);
  19 + }
  20 +
  21 + SetParameters(sample_rate);
  22 +}
  23 +
  24 +AlsaPlay::~AlsaPlay() {
  25 + if (handle_) {
  26 + int32_t err = snd_pcm_close(handle_);
  27 + if (err < 0) {
  28 + printf("Failed to close pcm: %s\n", snd_strerror(err));
  29 + }
  30 + }
  31 +}
  32 +
  33 +void AlsaPlay::SetParameters(int32_t sample_rate) {
  34 + // set the following parameters
  35 + // 1. sample_rate
  36 + // 2. sample format: int16_t
  37 + // 3. num_channels: 1
  38 + snd_pcm_hw_params_t *params;
  39 + snd_pcm_hw_params_alloca(&params);
  40 + snd_pcm_hw_params_any(handle_, params);
  41 +
  42 + int32_t err = snd_pcm_hw_params_set_access(handle_, params,
  43 + SND_PCM_ACCESS_RW_INTERLEAVED);
  44 + if (err < 0) {
  45 + printf("SND_PCM_ACCESS_RW_INTERLEAVED is not supported: %s\n",
  46 + snd_strerror(err));
  47 + exit(-1);
  48 + }
  49 +
  50 + err = snd_pcm_hw_params_set_format(handle_, params, SND_PCM_FORMAT_S16_LE);
  51 +
  52 + if (err < 0) {
  53 + printf("Can't set format to 16-bit: %s\n", snd_strerror(err));
  54 + exit(-1);
  55 + }
  56 +
  57 + err = snd_pcm_hw_params_set_channels(handle_, params, 1);
  58 +
  59 + if (err < 0) {
  60 + printf("Can't set channel number to 1: %s\n", snd_strerror(err));
  61 + }
  62 +
  63 + uint32_t rate = sample_rate;
  64 + err = snd_pcm_hw_params_set_rate_near(handle_, params, &rate, 0);
  65 + if (err < 0) {
  66 + printf("Can't set rate to %d. %s\n", rate, snd_strerror(err));
  67 + }
  68 +
  69 + err = snd_pcm_hw_params(handle_, params);
  70 + if (err < 0) {
  71 + printf("Can't set hardware parameters. %s\n", snd_strerror(err));
  72 + exit(-1);
  73 + }
  74 +
  75 + uint32_t tmp;
  76 + snd_pcm_hw_params_get_rate(params, &tmp, 0);
  77 + int32_t actual_sample_rate = tmp;
  78 + if (actual_sample_rate != sample_rate) {
  79 + fprintf(stderr,
  80 + "Creating a resampler:\n"
  81 + " in_sample_rate: %d\n"
  82 + " output_sample_rate: %d\n",
  83 + sample_rate, actual_sample_rate);
  84 +
  85 + float min_freq = std::min(actual_sample_rate, sample_rate);
  86 + float lowpass_cutoff = 0.99 * 0.5 * min_freq;
  87 +
  88 + int32_t lowpass_filter_width = 6;
  89 + resampler_ = std::make_unique<LinearResample>(
  90 + sample_rate, actual_sample_rate, lowpass_cutoff, lowpass_filter_width);
  91 + }
  92 +
  93 + snd_pcm_uframes_t frames;
  94 + snd_pcm_hw_params_get_period_size(params, &frames, 0);
  95 + buf_.resize(frames);
  96 +}
  97 +
  98 +void AlsaPlay::Play(const std::vector<float> &samples) {
  99 + std::vector<float> tmp;
  100 + const float *p = samples.data();
  101 + int32_t num_samples = samples.size();
  102 + if (resampler_) {
  103 + resampler_->Resample(samples.data(), samples.size(), false, &tmp);
  104 + p = tmp.data();
  105 + num_samples = tmp.size();
  106 + }
  107 +
  108 + int32_t frames = buf_.size();
  109 + int32_t i = 0;
  110 + for (; i + frames < num_samples; i += frames) {
  111 + for (int32_t k = 0; k != frames; ++k) {
  112 + buf_[k] = p[i + k] * 32767;
  113 + }
  114 +
  115 + int32_t err = snd_pcm_writei(handle_, buf_.data(), frames);
  116 + if (err == -EPIPE) {
  117 + printf("XRUN.\n");
  118 + snd_pcm_prepare(handle_);
  119 + } else if (err < 0) {
  120 + printf("Can't write to PCM device: %s\n", snd_strerror(err));
  121 + exit(-1);
  122 + }
  123 + }
  124 +
  125 + if (i < num_samples) {
  126 + for (int32_t k = 0; k + i < num_samples; ++k) {
  127 + buf_[k] = p[i + k] * 32767;
  128 + }
  129 +
  130 + int32_t err = snd_pcm_writei(handle_, buf_.data(), num_samples - i);
  131 + if (err == -EPIPE) {
  132 + printf("XRUN.\n");
  133 + snd_pcm_prepare(handle_);
  134 + } else if (err < 0) {
  135 + printf("Can't write to PCM device: %s\n", snd_strerror(err));
  136 + exit(-1);
  137 + }
  138 + }
  139 +}
  140 +
  141 +void AlsaPlay::Drain() {
  142 + int32_t err = snd_pcm_drain(handle_);
  143 + if (err < 0) {
  144 + printf("Failed to drain pcm. %s\n", snd_strerror(err));
  145 + }
  146 +}
  147 +
  148 +} // namespace sherpa_onnx
  149 +
  150 +#endif // SHERPA_ONNX_ENABLE_ALSA
  1 +// sherpa-onnx/csrc/alsa-play.h
  2 +//
  3 +// Copyright (c) 2022-2023 Xiaomi Corporation
  4 +
  5 +#ifndef SHERPA_ONNX_CSRC_ALSA_PLAY_H_
  6 +#define SHERPA_ONNX_CSRC_ALSA_PLAY_H_
  7 +
  8 +#include <cstdint>
  9 +#include <memory>
  10 +#include <vector>
  11 +
  12 +#include "alsa/asoundlib.h"
  13 +#include "sherpa-onnx/csrc/resample.h"
  14 +
  15 +namespace sherpa_onnx {
  16 +
  17 +class AlsaPlay {
  18 + public:
  19 + AlsaPlay(const char *device_name, int32_t sample_rate);
  20 + ~AlsaPlay();
  21 + void Play(const std::vector<float> &samples);
  22 +
  23 + // wait for all the samples to be played
  24 + void Drain();
  25 +
  26 + private:
  27 + void SetParameters(int32_t sample_rate);
  28 +
  29 + private:
  30 + snd_pcm_t *handle_ = nullptr;
  31 + std::unique_ptr<LinearResample> resampler_;
  32 + std::vector<int16_t> buf_;
  33 +};
  34 +
  35 +} // namespace sherpa_onnx
  36 +
  37 +#endif // SHERPA_ONNX_CSRC_ALSA_PLAY_H_
  1 +// sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
  2 +//
  3 +// Copyright (c) 2022-2023 Xiaomi Corporation
  4 +
  5 +// see https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
  6 +// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m___h_w___params.html
  7 +// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
  8 +
#include <signal.h>
#include <unistd.h>

#include <algorithm>
#include <atomic>
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <mutex>  // NOLINT
#include <queue>
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "sherpa-onnx/csrc/alsa-play.h"
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/wave-writer.h"
  24 +
// Notified whenever a chunk of audio is queued and once more when generation
// has finished; the playback thread waits on it.
static std::condition_variable g_cv;
static std::mutex g_cv_m;

// Queue of generated audio chunks shared by the generating thread (producer)
// and the playback thread (consumer). `mutex` guards `samples`.
struct Buffer {
  std::queue<std::vector<float>> samples;
  std::mutex mutex;
};

static Buffer g_buffer;

// Both flags are written by one thread (or by the signal handler) and read
// by another, so they are atomic rather than plain bools.
// g_stopped: set once audio generation has finished.
// g_killed: set when the user presses Ctrl + C.
static std::atomic<bool> g_stopped{false};
static std::atomic<bool> g_killed{false};
  37 +
  38 +static void Handler(int32_t /*sig*/) {
  39 + if (g_killed) {
  40 + exit(0);
  41 + }
  42 +
  43 + g_killed = true;
  44 + fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
  45 +}
  46 +
  47 +static void AudioGeneratedCallback(const float *s, int32_t n) {
  48 + if (n > 0) {
  49 + std::lock_guard<std::mutex> lock(g_buffer.mutex);
  50 + g_buffer.samples.push({s, s + n});
  51 + g_cv.notify_all();
  52 + }
  53 +}
  54 +
  55 +static void StartPlayback(const std::string &device_name, int32_t sample_rate) {
  56 + sherpa_onnx::AlsaPlay alsa(device_name.c_str(), sample_rate);
  57 +
  58 + std::unique_lock<std::mutex> lock(g_cv_m);
  59 + while (!g_killed && !g_stopped) {
  60 + while (!g_buffer.samples.empty()) {
  61 + auto &p = g_buffer.samples.front();
  62 + alsa.Play(p);
  63 + g_buffer.samples.pop();
  64 + }
  65 +
  66 + g_cv.wait(lock);
  67 + }
  68 +
  69 + if (g_killed) {
  70 + return;
  71 + }
  72 +
  73 + if (g_stopped) {
  74 + while (!g_buffer.samples.empty()) {
  75 + auto &p = g_buffer.samples.front();
  76 + alsa.Play(p);
  77 + g_buffer.samples.pop();
  78 + }
  79 + }
  80 +
  81 + alsa.Drain();
  82 +}
  83 +
  84 +int main(int32_t argc, char *argv[]) {
  85 + signal(SIGINT, Handler);
  86 +
  87 + const char *kUsageMessage = R"usage(
  88 +Offline text-to-speech with sherpa-onnx.
  89 +
  90 +It plays the generated audio as the model is processing.
  91 +
  92 +Note that it is alsa so it works only on **Linux**. For instance, you can
  93 +use it on Raspberry Pi.
  94 +
  95 +Usage example:
  96 +
  97 +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
  98 +tar xf vits-piper-en_US-amy-low.tar.bz2
  99 +
  100 +./bin/sherpa-onnx-offline-tts-play-alsa \
  101 + --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
  102 + --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
  103 + --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
  104 + --output-filename=./generated.wav \
  105 + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
  106 +
  107 +It will generate a file ./generated.wav as specified by --output-filename.
  108 +
  109 +You can find more models at
  110 +https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  111 +
  112 +Please see
  113 +https://k2-fsa.github.io/sherpa/onnx/tts/index.html
  114 +or details.
  115 +)usage";
  116 +
  117 + sherpa_onnx::ParseOptions po(kUsageMessage);
  118 + std::string device_name = "default";
  119 + std::string output_filename = "./generated.wav";
  120 + int32_t sid = 0;
  121 +
  122 + po.Register("output-filename", &output_filename,
  123 + "Path to save the generated audio");
  124 +
  125 + po.Register("device-name", &device_name,
  126 + "Name of the device to play the generated audio");
  127 +
  128 + po.Register("sid", &sid,
  129 + "Speaker ID. Used only for multi-speaker models, e.g., models "
  130 + "trained using the VCTK dataset. Not used for single-speaker "
  131 + "models, e.g., models trained using the LJSpeech dataset");
  132 +
  133 + sherpa_onnx::OfflineTtsConfig config;
  134 +
  135 + config.Register(&po);
  136 + po.Read(argc, argv);
  137 +
  138 + if (po.NumArgs() == 0) {
  139 + fprintf(stderr, "Error: Please provide the text to generate audio.\n\n");
  140 + po.PrintUsage();
  141 + exit(EXIT_FAILURE);
  142 + }
  143 +
  144 + if (po.NumArgs() > 1) {
  145 + fprintf(stderr,
  146 + "Error: Accept only one positional argument. Please use single "
  147 + "quotes to wrap your text\n");
  148 + po.PrintUsage();
  149 + exit(EXIT_FAILURE);
  150 + }
  151 +
  152 + if (!config.Validate()) {
  153 + fprintf(stderr, "Errors in config!\n");
  154 + exit(EXIT_FAILURE);
  155 + }
  156 +
  157 + if (config.max_num_sentences != 1) {
  158 + fprintf(stderr, "Setting config.max_num_sentences to 1\n");
  159 + config.max_num_sentences = 1;
  160 + }
  161 +
  162 + fprintf(stderr, "Loading the model\n");
  163 + sherpa_onnx::OfflineTts tts(config);
  164 +
  165 + fprintf(stderr, "Start the playback thread\n");
  166 + std::thread playback_thread(StartPlayback, device_name, tts.SampleRate());
  167 +
  168 + float speed = 1.0;
  169 +
  170 + fprintf(stderr, "Generating ...\n");
  171 + const auto begin = std::chrono::steady_clock::now();
  172 + auto audio = tts.Generate(po.GetArg(1), sid, speed, AudioGeneratedCallback);
  173 + const auto end = std::chrono::steady_clock::now();
  174 + g_stopped = true;
  175 + g_cv.notify_all();
  176 + fprintf(stderr, "Generating done!\n");
  177 + if (audio.samples.empty()) {
  178 + fprintf(
  179 + stderr,
  180 + "Error in generating audio. Please read previous error messages.\n");
  181 + exit(EXIT_FAILURE);
  182 + }
  183 +
  184 + float elapsed_seconds =
  185 + std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
  186 + .count() /
  187 + 1000.;
  188 + float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
  189 +
  190 + float rtf = elapsed_seconds / duration;
  191 + fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
  192 + fprintf(stderr, "Audio duration: %.3f s\n", duration);
  193 + fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
  194 + duration, rtf);
  195 +
  196 + bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate,
  197 + audio.samples.data(), audio.samples.size());
  198 + if (!ok) {
  199 + fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str());
  200 + exit(EXIT_FAILURE);
  201 + }
  202 +
  203 + fprintf(stderr, "The text is: %s. Speaker ID: %d\n\n", po.GetArg(1).c_str(),
  204 + sid);
  205 + fprintf(stderr, "\n**** Saved to %s successfully! ****\n",
  206 + output_filename.c_str());
  207 +
  208 + fprintf(stderr, "\n");
  209 + fprintf(
  210 + stderr,
  211 + "Wait for the playback to finish. You can safely press ctrl + C to stop "
  212 + "the playback.\n");
  213 + playback_thread.join();
  214 +
  215 + fprintf(stderr, "Done!\n");
  216 +
  217 + return 0;
  218 +}