Fangjun Kuang
Committed by GitHub

Play generated audio using alsa for TTS (#482)

cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)
set(SHERPA_ONNX_VERSION "1.9.3")
set(SHERPA_ONNX_VERSION "1.9.4")
# Disable warning about
#
... ... @@ -106,10 +106,23 @@ endif()
set(CMAKE_CXX_EXTENSIONS OFF)
message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
include(CheckIncludeFileCXX)
check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
if(SHERPA_ONNX_HAS_ALSA)
add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
# ALSA only exists on Linux, so probe for its header only on non-Apple Unix.
# When found, SHERPA_ONNX_HAS_ALSA enables the ALSA executables and the
# SHERPA_ONNX_ENABLE_ALSA macro enables the ALSA-specific C++ sources.
if(UNIX AND NOT APPLE)
  check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
  if(SHERPA_ONNX_HAS_ALSA)
    add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
  else()
    # The header is optional: warn and continue without the ALSA binaries.
    message(WARNING "\
Could not find alsa/asoundlib.h !
We won't build sherpa-onnx-alsa and sherpa-onnx-offline-tts-play-alsa
To fix that, please do:
(1) sudo apt-get install alsa-utils libasound2-dev
(2) rm -rf build
(3) re-try
")
  endif()
endif()
check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H)
... ...
... ... @@ -144,6 +144,8 @@ class BuildExtension(build_ext):
binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
binaries += ["sherpa-onnx-offline-tts"]
binaries += ["sherpa-onnx-offline-tts-play"]
binaries += ["sherpa-onnx-alsa"]
binaries += ["sherpa-onnx-offline-tts-play-alsa"]
if is_windows():
binaries += ["kaldi-native-fbank-core.dll"]
... ... @@ -165,6 +167,11 @@ class BuildExtension(build_ext):
src_file = install_dir / "lib" / (f + suffix)
if not src_file.is_file():
src_file = install_dir / ".." / (f + suffix)
if not src_file.is_file() and 'alsa' in f:
print(f'Skipping {f}')
continue
print(f"Copying {src_file} to {out_bin_dir}/")
shutil.copy(f"{src_file}", f"{out_bin_dir}/")
... ...
... ... @@ -60,6 +60,8 @@ def get_binaries_to_install():
binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
binaries += ["sherpa-onnx-offline-tts"]
binaries += ["sherpa-onnx-offline-tts-play"]
binaries += ["sherpa-onnx-alsa"]
binaries += ["sherpa-onnx-offline-tts-play-alsa"]
if is_windows():
binaries += ["kaldi-native-fbank-core.dll"]
binaries += ["sherpa-onnx-c-api.dll"]
... ...
... ... @@ -207,14 +207,42 @@ install(
if(SHERPA_ONNX_HAS_ALSA)
add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
target_link_libraries(sherpa-onnx-alsa sherpa-onnx-core)
add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
target_link_libraries(sherpa-onnx-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
else()
target_link_libraries(sherpa-onnx-alsa asound)
# Executables that depend on ALSA; they all share the same link settings.
set(exes
  sherpa-onnx-alsa
  sherpa-onnx-offline-tts-play-alsa
)

# Configure every ALSA executable in a single pass. The order of the
# target_link_libraries() calls per target is: core library, ALSA library,
# then the runtime search paths.
foreach(exe IN LISTS exes)
  target_link_libraries(${exe} sherpa-onnx-core)

  # SHERPA_ONNX_ALSA_LIB_DIR (read from the environment at configure time)
  # lets the user point the linker at a non-default libasound location.
  if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
    target_link_libraries(${exe} -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
  else()
    target_link_libraries(${exe} asound)
  endif()

  if(NOT WIN32)
    # Look for shared libraries relative to the executable's own location.
    target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
    target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")

    if(SHERPA_ONNX_ENABLE_PYTHON)
      target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
    endif()
  endif()
endforeach()
install(TARGETS sherpa-onnx-alsa DESTINATION bin)
install(
TARGETS ${exes}
DESTINATION
bin
)
endif()
if(SHERPA_ONNX_ENABLE_PORTAUDIO)
... ...
// sherpa-onnx/csrc/alsa-play.cc
//
// Copyright (c) 2022-2023 Xiaomi Corporation
#ifdef SHERPA_ONNX_ENABLE_ALSA
#include "sherpa-onnx/csrc/alsa-play.h"
#include <algorithm>
namespace sherpa_onnx {
// Opens the PCM device `device_name` as a playback stream and then configures
// it for `sample_rate` via SetParameters().
//
// Terminates the process with exit(-1) if the device cannot be opened.
AlsaPlay::AlsaPlay(const char *device_name, int32_t sample_rate) {
  const int32_t open_status =
      snd_pcm_open(&handle_, device_name, SND_PCM_STREAM_PLAYBACK, 0);

  if (open_status != 0) {
    fprintf(stderr, "Unable to open: %s. %s\n", device_name,
            snd_strerror(open_status));
    exit(-1);
  }

  SetParameters(sample_rate);
}
// Closes the PCM handle if one was opened. A failure to close is reported on
// stdout but otherwise ignored.
AlsaPlay::~AlsaPlay() {
  if (!handle_) {
    return;
  }

  const int32_t close_status = snd_pcm_close(handle_);
  if (close_status < 0) {
    printf("Failed to close pcm: %s\n", snd_strerror(close_status));
  }
}
// Configures the opened PCM device for playback:
//   1. access:       interleaved read/write
//   2. format:       signed 16-bit little endian
//   3. num_channels: 1
//   4. sample rate:  as close as possible to `sample_rate`
//
// If the device ends up with a rate different from `sample_rate`, a
// LinearResample is created so that Play() can convert the input. Finally
// buf_ is sized to one period of the device.
//
// Terminates the process with exit(-1) when a mandatory parameter cannot be
// set.
void AlsaPlay::SetParameters(int32_t sample_rate) {
  snd_pcm_hw_params_t *params;
  snd_pcm_hw_params_alloca(&params);

  // Start from the full configuration space of the device.
  int32_t err = snd_pcm_hw_params_any(handle_, params);
  if (err < 0) {
    printf("Can't get the hardware parameter space: %s\n", snd_strerror(err));
    exit(-1);
  }

  err = snd_pcm_hw_params_set_access(handle_, params,
                                     SND_PCM_ACCESS_RW_INTERLEAVED);
  if (err < 0) {
    printf("SND_PCM_ACCESS_RW_INTERLEAVED is not supported: %s\n",
           snd_strerror(err));
    exit(-1);
  }

  err = snd_pcm_hw_params_set_format(handle_, params, SND_PCM_FORMAT_S16_LE);
  if (err < 0) {
    printf("Can't set format to 16-bit: %s\n", snd_strerror(err));
    exit(-1);
  }

  err = snd_pcm_hw_params_set_channels(handle_, params, 1);
  if (err < 0) {
    printf("Can't set channel number to 1: %s\n", snd_strerror(err));
    // Play() writes exactly one int16_t per frame. With any other channel
    // count the device would read past the end of buf_, so this is fatal.
    exit(-1);
  }

  // The rate is best effort: a mismatch is handled below with a resampler.
  uint32_t rate = sample_rate;
  err = snd_pcm_hw_params_set_rate_near(handle_, params, &rate, 0);
  if (err < 0) {
    printf("Can't set rate to %u. %s\n", rate, snd_strerror(err));
  }

  err = snd_pcm_hw_params(handle_, params);
  if (err < 0) {
    printf("Can't set hardware parameters. %s\n", snd_strerror(err));
    exit(-1);
  }

  // Query the rate that was actually configured. Fall back to the value
  // reported by snd_pcm_hw_params_set_rate_near() if the query fails, so we
  // never read an uninitialized variable.
  uint32_t tmp = 0;
  err = snd_pcm_hw_params_get_rate(params, &tmp, 0);
  if (err < 0 || tmp == 0) {
    printf("Can't get the actual rate. Assume it is %u\n", rate);
    tmp = rate;
  }

  int32_t actual_sample_rate = tmp;

  if (actual_sample_rate != sample_rate) {
    fprintf(stderr,
            "Creating a resampler:\n"
            " in_sample_rate: %d\n"
            " output_sample_rate: %d\n",
            sample_rate, actual_sample_rate);

    float min_freq = std::min(actual_sample_rate, sample_rate);
    float lowpass_cutoff = 0.99 * 0.5 * min_freq;

    int32_t lowpass_filter_width = 6;
    resampler_ = std::make_unique<LinearResample>(
        sample_rate, actual_sample_rate, lowpass_cutoff, lowpass_filter_width);
  }

  // buf_ holds one period of samples. Never leave it empty: Play() advances
  // through its input in steps of buf_.size().
  snd_pcm_uframes_t frames = 0;
  err = snd_pcm_hw_params_get_period_size(params, &frames, 0);
  if (err < 0 || frames == 0) {
    frames = 1024;
    printf("Can't get the period size. Use %d frames per write\n",
           static_cast<int32_t>(frames));
  }

  buf_.resize(frames);
}
void AlsaPlay::Play(const std::vector<float> &samples) {
std::vector<float> tmp;
const float *p = samples.data();
int32_t num_samples = samples.size();
if (resampler_) {
resampler_->Resample(samples.data(), samples.size(), false, &tmp);
p = tmp.data();
num_samples = tmp.size();
}
int32_t frames = buf_.size();
int32_t i = 0;
for (; i + frames < num_samples; i += frames) {
for (int32_t k = 0; k != frames; ++k) {
buf_[k] = p[i + k] * 32767;
}
int32_t err = snd_pcm_writei(handle_, buf_.data(), frames);
if (err == -EPIPE) {
printf("XRUN.\n");
snd_pcm_prepare(handle_);
} else if (err < 0) {
printf("Can't write to PCM device: %s\n", snd_strerror(err));
exit(-1);
}
}
if (i < num_samples) {
for (int32_t k = 0; k + i < num_samples; ++k) {
buf_[k] = p[i + k] * 32767;
}
int32_t err = snd_pcm_writei(handle_, buf_.data(), num_samples - i);
if (err == -EPIPE) {
printf("XRUN.\n");
snd_pcm_prepare(handle_);
} else if (err < 0) {
printf("Can't write to PCM device: %s\n", snd_strerror(err));
exit(-1);
}
}
}
// Calls snd_pcm_drain() on the device so that pending samples get played.
// A failure is reported on stdout and otherwise ignored.
void AlsaPlay::Drain() {
  const int32_t drain_status = snd_pcm_drain(handle_);
  if (drain_status >= 0) {
    return;
  }

  printf("Failed to drain pcm. %s\n", snd_strerror(drain_status));
}
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_ENABLE_ALSA
... ...
// sherpa-onnx/csrc/alsa-play.h
//
// Copyright (c) 2022-2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_ALSA_PLAY_H_
#define SHERPA_ONNX_CSRC_ALSA_PLAY_H_
#include <cstdint>
#include <memory>
#include <vector>
#include "alsa/asoundlib.h"
#include "sherpa-onnx/csrc/resample.h"
namespace sherpa_onnx {
// Plays float audio samples through an ALSA PCM playback device.
//
// The device is opened in the constructor and closed in the destructor.
class AlsaPlay {
 public:
  // Opens `device_name` for playback and configures it for `sample_rate`.
  // The process exits if the device cannot be opened.
  AlsaPlay(const char *device_name, int32_t sample_rate);

  // Closes the device if it was opened.
  ~AlsaPlay();

  // Writes `samples` to the device, resampling first when a resampler exists.
  void Play(const std::vector<float> &samples);

  // Waits until all the samples written so far have been played.
  void Drain();

 private:
  // Sets access mode, sample format, channel count and sample rate of the
  // device; creates resampler_ if needed and sizes buf_.
  void SetParameters(int32_t sample_rate);

  // PCM handle returned by snd_pcm_open(); nullptr until opened.
  snd_pcm_t *handle_ = nullptr;

  // Only set when the device rate differs from the requested sample rate.
  std::unique_ptr<LinearResample> resampler_;

  // Scratch buffer of int16_t samples handed to snd_pcm_writei().
  std::vector<int16_t> buf_;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_ALSA_PLAY_H_
... ...
// sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
//
// Copyright (c) 2022-2023 Xiaomi Corporation
// see https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m___h_w___params.html
// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
#include <signal.h>
#include <unistd.h>

#include <algorithm>
#include <atomic>
#include <chrono>  // NOLINT
#include <condition_variable>  // NOLINT
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <mutex>  // NOLINT
#include <queue>
#include <string>
#include <thread>  // NOLINT
#include <utility>
#include <vector>

#include "sherpa-onnx/csrc/alsa-play.h"
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/csrc/parse-options.h"
#include "sherpa-onnx/csrc/wave-writer.h"
// Signalled whenever new audio is queued or generation has finished.
static std::condition_variable g_cv;
// Mutex the playback thread pairs with g_cv while waiting.
static std::mutex g_cv_m;

// Queue of generated audio chunks handed from the generator to the player.
struct Buffer {
  std::queue<std::vector<float>> samples;
  // Protects `samples`.
  std::mutex mutex;
};

static Buffer g_buffer;

// Both flags are written by one thread (or the signal handler) and read by
// another one, so they must be atomic to avoid a data race.

// Set by main() once all the audio has been generated.
static std::atomic<bool> g_stopped{false};
// Set by the SIGINT handler when the user presses Ctrl + C.
static std::atomic<bool> g_killed{false};
// SIGINT handler.
//
// The first Ctrl + C sets g_killed so that the playback thread can stop.
// A second Ctrl + C terminates the process immediately.
//
// Only async-signal-safe functions are used here: write() instead of
// fprintf(), and _exit() instead of exit(). exit() would run static
// destructors while other threads may still be using those objects.
static void Handler(int32_t /*sig*/) {
  if (g_killed) {
    _exit(0);
  }

  g_killed = true;

  static const char kMessage[] = "\nCaught Ctrl + C. Exiting\n";
  // Nothing useful can be done if the write fails inside a signal handler.
  ssize_t ignored = write(STDERR_FILENO, kMessage, sizeof(kMessage) - 1);
  (void)ignored;
}
// Passed to OfflineTts::Generate() in main(). Copies the `n` samples starting
// at `s` into the shared queue and notifies waiters on g_cv. Calls with
// n <= 0 are ignored.
static void AudioGeneratedCallback(const float *s, int32_t n) {
  if (n <= 0) {
    return;
  }

  std::lock_guard<std::mutex> guard(g_buffer.mutex);
  g_buffer.samples.emplace(s, s + n);
  g_cv.notify_all();
}
static void StartPlayback(const std::string &device_name, int32_t sample_rate) {
sherpa_onnx::AlsaPlay alsa(device_name.c_str(), sample_rate);
std::unique_lock<std::mutex> lock(g_cv_m);
while (!g_killed && !g_stopped) {
while (!g_buffer.samples.empty()) {
auto &p = g_buffer.samples.front();
alsa.Play(p);
g_buffer.samples.pop();
}
g_cv.wait(lock);
}
if (g_killed) {
return;
}
if (g_stopped) {
while (!g_buffer.samples.empty()) {
auto &p = g_buffer.samples.front();
alsa.Play(p);
g_buffer.samples.pop();
}
}
alsa.Drain();
}
int main(int32_t argc, char *argv[]) {
signal(SIGINT, Handler);
const char *kUsageMessage = R"usage(
Offline text-to-speech with sherpa-onnx.
It plays the generated audio as the model is processing.
Note that it is alsa so it works only on **Linux**. For instance, you can
use it on Raspberry Pi.
Usage example:
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
./bin/sherpa-onnx-offline-tts-play-alsa \
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
--output-filename=./generated.wav \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
It will generate a file ./generated.wav as specified by --output-filename.
You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
Please see
https://k2-fsa.github.io/sherpa/onnx/tts/index.html
or details.
)usage";
sherpa_onnx::ParseOptions po(kUsageMessage);
std::string device_name = "default";
std::string output_filename = "./generated.wav";
int32_t sid = 0;
po.Register("output-filename", &output_filename,
"Path to save the generated audio");
po.Register("device-name", &device_name,
"Name of the device to play the generated audio");
po.Register("sid", &sid,
"Speaker ID. Used only for multi-speaker models, e.g., models "
"trained using the VCTK dataset. Not used for single-speaker "
"models, e.g., models trained using the LJSpeech dataset");
sherpa_onnx::OfflineTtsConfig config;
config.Register(&po);
po.Read(argc, argv);
if (po.NumArgs() == 0) {
fprintf(stderr, "Error: Please provide the text to generate audio.\n\n");
po.PrintUsage();
exit(EXIT_FAILURE);
}
if (po.NumArgs() > 1) {
fprintf(stderr,
"Error: Accept only one positional argument. Please use single "
"quotes to wrap your text\n");
po.PrintUsage();
exit(EXIT_FAILURE);
}
if (!config.Validate()) {
fprintf(stderr, "Errors in config!\n");
exit(EXIT_FAILURE);
}
if (config.max_num_sentences != 1) {
fprintf(stderr, "Setting config.max_num_sentences to 1\n");
config.max_num_sentences = 1;
}
fprintf(stderr, "Loading the model\n");
sherpa_onnx::OfflineTts tts(config);
fprintf(stderr, "Start the playback thread\n");
std::thread playback_thread(StartPlayback, device_name, tts.SampleRate());
float speed = 1.0;
fprintf(stderr, "Generating ...\n");
const auto begin = std::chrono::steady_clock::now();
auto audio = tts.Generate(po.GetArg(1), sid, speed, AudioGeneratedCallback);
const auto end = std::chrono::steady_clock::now();
g_stopped = true;
g_cv.notify_all();
fprintf(stderr, "Generating done!\n");
if (audio.samples.empty()) {
fprintf(
stderr,
"Error in generating audio. Please read previous error messages.\n");
exit(EXIT_FAILURE);
}
float elapsed_seconds =
std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
.count() /
1000.;
float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
float rtf = elapsed_seconds / duration;
fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
fprintf(stderr, "Audio duration: %.3f s\n", duration);
fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
duration, rtf);
bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate,
audio.samples.data(), audio.samples.size());
if (!ok) {
fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str());
exit(EXIT_FAILURE);
}
fprintf(stderr, "The text is: %s. Speaker ID: %d\n\n", po.GetArg(1).c_str(),
sid);
fprintf(stderr, "\n**** Saved to %s successfully! ****\n",
output_filename.c_str());
fprintf(stderr, "\n");
fprintf(
stderr,
"Wait for the playback to finish. You can safely press ctrl + C to stop "
"the playback.\n");
playback_thread.join();
fprintf(stderr, "Done!\n");
return 0;
}
... ...