Committed by
GitHub
Play generated audio using alsa for TTS (#482)
正在显示
7 个修改的文件
包含
465 行增加
和
10 行删除
| 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) | 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) |
| 2 | project(sherpa-onnx) | 2 | project(sherpa-onnx) |
| 3 | 3 | ||
| 4 | -set(SHERPA_ONNX_VERSION "1.9.3") | 4 | +set(SHERPA_ONNX_VERSION "1.9.4") |
| 5 | 5 | ||
| 6 | # Disable warning about | 6 | # Disable warning about |
| 7 | # | 7 | # |
| @@ -106,10 +106,23 @@ endif() | @@ -106,10 +106,23 @@ endif() | ||
| 106 | set(CMAKE_CXX_EXTENSIONS OFF) | 106 | set(CMAKE_CXX_EXTENSIONS OFF) |
| 107 | message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}") | 107 | message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}") |
| 108 | 108 | ||
| 109 | + | ||
| 109 | include(CheckIncludeFileCXX) | 110 | include(CheckIncludeFileCXX) |
| 110 | -check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA) | ||
| 111 | -if(SHERPA_ONNX_HAS_ALSA) | ||
| 112 | - add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1) | 111 | + |
| 112 | +if(UNIX AND NOT APPLE) | ||
| 113 | + check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA) | ||
| 114 | + if(SHERPA_ONNX_HAS_ALSA) | ||
| 115 | + add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1) | ||
| 116 | + else() | ||
| 117 | + message(WARNING "\ | ||
| 118 | +Could not find alsa/asoundlib.h ! | ||
| 119 | +We won't build sherpa-ncnn-alsa | ||
| 120 | +To fix that, please do: | ||
| 121 | + (1) sudo apt-get install alsa-utils libasound2-dev | ||
| 122 | + (2) rm -rf build | ||
| 123 | + (3) re-try | ||
| 124 | + ") | ||
| 125 | + endif() | ||
| 113 | endif() | 126 | endif() |
| 114 | 127 | ||
| 115 | check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H) | 128 | check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H) |
| @@ -144,6 +144,8 @@ class BuildExtension(build_ext): | @@ -144,6 +144,8 @@ class BuildExtension(build_ext): | ||
| 144 | binaries += ["sherpa-onnx-vad-microphone-offline-asr"] | 144 | binaries += ["sherpa-onnx-vad-microphone-offline-asr"] |
| 145 | binaries += ["sherpa-onnx-offline-tts"] | 145 | binaries += ["sherpa-onnx-offline-tts"] |
| 146 | binaries += ["sherpa-onnx-offline-tts-play"] | 146 | binaries += ["sherpa-onnx-offline-tts-play"] |
| 147 | + binaries += ["sherpa-onnx-alsa"] | ||
| 148 | + binaries += ["sherpa-onnx-offline-tts-play-alsa"] | ||
| 147 | 149 | ||
| 148 | if is_windows(): | 150 | if is_windows(): |
| 149 | binaries += ["kaldi-native-fbank-core.dll"] | 151 | binaries += ["kaldi-native-fbank-core.dll"] |
| @@ -165,6 +167,11 @@ class BuildExtension(build_ext): | @@ -165,6 +167,11 @@ class BuildExtension(build_ext): | ||
| 165 | src_file = install_dir / "lib" / (f + suffix) | 167 | src_file = install_dir / "lib" / (f + suffix) |
| 166 | if not src_file.is_file(): | 168 | if not src_file.is_file(): |
| 167 | src_file = install_dir / ".." / (f + suffix) | 169 | src_file = install_dir / ".." / (f + suffix) |
| 170 | + | ||
| 171 | + if not src_file.is_file() and 'alsa' in f: | ||
| 172 | + print(f'Skipping {f}') | ||
| 173 | + continue | ||
| 174 | + | ||
| 168 | print(f"Copying {src_file} to {out_bin_dir}/") | 175 | print(f"Copying {src_file} to {out_bin_dir}/") |
| 169 | shutil.copy(f"{src_file}", f"{out_bin_dir}/") | 176 | shutil.copy(f"{src_file}", f"{out_bin_dir}/") |
| 170 | 177 |
| @@ -60,6 +60,8 @@ def get_binaries_to_install(): | @@ -60,6 +60,8 @@ def get_binaries_to_install(): | ||
| 60 | binaries += ["sherpa-onnx-vad-microphone-offline-asr"] | 60 | binaries += ["sherpa-onnx-vad-microphone-offline-asr"] |
| 61 | binaries += ["sherpa-onnx-offline-tts"] | 61 | binaries += ["sherpa-onnx-offline-tts"] |
| 62 | binaries += ["sherpa-onnx-offline-tts-play"] | 62 | binaries += ["sherpa-onnx-offline-tts-play"] |
| 63 | + binaries += ["sherpa-onnx-alsa"] | ||
| 64 | + binaries += ["sherpa-onnx-offline-tts-play-alsa"] | ||
| 63 | if is_windows(): | 65 | if is_windows(): |
| 64 | binaries += ["kaldi-native-fbank-core.dll"] | 66 | binaries += ["kaldi-native-fbank-core.dll"] |
| 65 | binaries += ["sherpa-onnx-c-api.dll"] | 67 | binaries += ["sherpa-onnx-c-api.dll"] |
| @@ -207,14 +207,42 @@ install( | @@ -207,14 +207,42 @@ install( | ||
| 207 | 207 | ||
| 208 | if(SHERPA_ONNX_HAS_ALSA) | 208 | if(SHERPA_ONNX_HAS_ALSA) |
| 209 | add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc) | 209 | add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc) |
| 210 | - target_link_libraries(sherpa-onnx-alsa sherpa-onnx-core) | 210 | + add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc) |
| 211 | 211 | ||
| 212 | - if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) | ||
| 213 | - target_link_libraries(sherpa-onnx-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) | ||
| 214 | - else() | ||
| 215 | - target_link_libraries(sherpa-onnx-alsa asound) | 212 | + set(exes |
| 213 | + sherpa-onnx-alsa | ||
| 214 | + sherpa-onnx-offline-tts-play-alsa | ||
| 215 | + ) | ||
| 216 | + foreach(exe IN LISTS exes) | ||
| 217 | + target_link_libraries(${exe} sherpa-onnx-core) | ||
| 218 | + endforeach() | ||
| 219 | + | ||
| 220 | + foreach(exe IN LISTS exes) | ||
| 221 | + if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) | ||
| 222 | + target_link_libraries(${exe} -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) | ||
| 223 | + else() | ||
| 224 | + target_link_libraries(${exe} asound) | ||
| 225 | + endif() | ||
| 226 | + endforeach() | ||
| 227 | + | ||
| 228 | + if(NOT WIN32) | ||
| 229 | + foreach(exe IN LISTS exes) | ||
| 230 | + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib") | ||
| 231 | + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib") | ||
| 232 | + endforeach() | ||
| 233 | + | ||
| 234 | + if(SHERPA_ONNX_ENABLE_PYTHON) | ||
| 235 | + foreach(exe IN LISTS exes) | ||
| 236 | + target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib") | ||
| 237 | + endforeach() | ||
| 238 | + endif() | ||
| 216 | endif() | 239 | endif() |
| 217 | - install(TARGETS sherpa-onnx-alsa DESTINATION bin) | 240 | + |
| 241 | + install( | ||
| 242 | + TARGETS ${exes} | ||
| 243 | + DESTINATION | ||
| 244 | + bin | ||
| 245 | + ) | ||
| 218 | endif() | 246 | endif() |
| 219 | 247 | ||
| 220 | if(SHERPA_ONNX_ENABLE_PORTAUDIO) | 248 | if(SHERPA_ONNX_ENABLE_PORTAUDIO) |
sherpa-onnx/csrc/alsa-play.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/alsa-play.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifdef SHERPA_ONNX_ENABLE_ALSA | ||
| 6 | + | ||
| 7 | +#include "sherpa-onnx/csrc/alsa-play.h" | ||
| 8 | + | ||
| 9 | +#include <algorithm> | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +AlsaPlay::AlsaPlay(const char *device_name, int32_t sample_rate) { | ||
| 14 | + int32_t err = snd_pcm_open(&handle_, device_name, SND_PCM_STREAM_PLAYBACK, 0); | ||
| 15 | + | ||
| 16 | + if (err) { | ||
| 17 | + fprintf(stderr, "Unable to open: %s. %s\n", device_name, snd_strerror(err)); | ||
| 18 | + exit(-1); | ||
| 19 | + } | ||
| 20 | + | ||
| 21 | + SetParameters(sample_rate); | ||
| 22 | +} | ||
| 23 | + | ||
| 24 | +AlsaPlay::~AlsaPlay() { | ||
| 25 | + if (handle_) { | ||
| 26 | + int32_t err = snd_pcm_close(handle_); | ||
| 27 | + if (err < 0) { | ||
| 28 | + printf("Failed to close pcm: %s\n", snd_strerror(err)); | ||
| 29 | + } | ||
| 30 | + } | ||
| 31 | +} | ||
| 32 | + | ||
| 33 | +void AlsaPlay::SetParameters(int32_t sample_rate) { | ||
| 34 | + // set the following parameters | ||
| 35 | + // 1. sample_rate | ||
| 36 | + // 2. sample format: int16_t | ||
| 37 | + // 3. num_channels: 1 | ||
| 38 | + snd_pcm_hw_params_t *params; | ||
| 39 | + snd_pcm_hw_params_alloca(¶ms); | ||
| 40 | + snd_pcm_hw_params_any(handle_, params); | ||
| 41 | + | ||
| 42 | + int32_t err = snd_pcm_hw_params_set_access(handle_, params, | ||
| 43 | + SND_PCM_ACCESS_RW_INTERLEAVED); | ||
| 44 | + if (err < 0) { | ||
| 45 | + printf("SND_PCM_ACCESS_RW_INTERLEAVED is not supported: %s\n", | ||
| 46 | + snd_strerror(err)); | ||
| 47 | + exit(-1); | ||
| 48 | + } | ||
| 49 | + | ||
| 50 | + err = snd_pcm_hw_params_set_format(handle_, params, SND_PCM_FORMAT_S16_LE); | ||
| 51 | + | ||
| 52 | + if (err < 0) { | ||
| 53 | + printf("Can't set format to 16-bit: %s\n", snd_strerror(err)); | ||
| 54 | + exit(-1); | ||
| 55 | + } | ||
| 56 | + | ||
| 57 | + err = snd_pcm_hw_params_set_channels(handle_, params, 1); | ||
| 58 | + | ||
| 59 | + if (err < 0) { | ||
| 60 | + printf("Can't set channel number to 1: %s\n", snd_strerror(err)); | ||
| 61 | + } | ||
| 62 | + | ||
| 63 | + uint32_t rate = sample_rate; | ||
| 64 | + err = snd_pcm_hw_params_set_rate_near(handle_, params, &rate, 0); | ||
| 65 | + if (err < 0) { | ||
| 66 | + printf("Can't set rate to %d. %s\n", rate, snd_strerror(err)); | ||
| 67 | + } | ||
| 68 | + | ||
| 69 | + err = snd_pcm_hw_params(handle_, params); | ||
| 70 | + if (err < 0) { | ||
| 71 | + printf("Can't set hardware parameters. %s\n", snd_strerror(err)); | ||
| 72 | + exit(-1); | ||
| 73 | + } | ||
| 74 | + | ||
| 75 | + uint32_t tmp; | ||
| 76 | + snd_pcm_hw_params_get_rate(params, &tmp, 0); | ||
| 77 | + int32_t actual_sample_rate = tmp; | ||
| 78 | + if (actual_sample_rate != sample_rate) { | ||
| 79 | + fprintf(stderr, | ||
| 80 | + "Creating a resampler:\n" | ||
| 81 | + " in_sample_rate: %d\n" | ||
| 82 | + " output_sample_rate: %d\n", | ||
| 83 | + sample_rate, actual_sample_rate); | ||
| 84 | + | ||
| 85 | + float min_freq = std::min(actual_sample_rate, sample_rate); | ||
| 86 | + float lowpass_cutoff = 0.99 * 0.5 * min_freq; | ||
| 87 | + | ||
| 88 | + int32_t lowpass_filter_width = 6; | ||
| 89 | + resampler_ = std::make_unique<LinearResample>( | ||
| 90 | + sample_rate, actual_sample_rate, lowpass_cutoff, lowpass_filter_width); | ||
| 91 | + } | ||
| 92 | + | ||
| 93 | + snd_pcm_uframes_t frames; | ||
| 94 | + snd_pcm_hw_params_get_period_size(params, &frames, 0); | ||
| 95 | + buf_.resize(frames); | ||
| 96 | +} | ||
| 97 | + | ||
| 98 | +void AlsaPlay::Play(const std::vector<float> &samples) { | ||
| 99 | + std::vector<float> tmp; | ||
| 100 | + const float *p = samples.data(); | ||
| 101 | + int32_t num_samples = samples.size(); | ||
| 102 | + if (resampler_) { | ||
| 103 | + resampler_->Resample(samples.data(), samples.size(), false, &tmp); | ||
| 104 | + p = tmp.data(); | ||
| 105 | + num_samples = tmp.size(); | ||
| 106 | + } | ||
| 107 | + | ||
| 108 | + int32_t frames = buf_.size(); | ||
| 109 | + int32_t i = 0; | ||
| 110 | + for (; i + frames < num_samples; i += frames) { | ||
| 111 | + for (int32_t k = 0; k != frames; ++k) { | ||
| 112 | + buf_[k] = p[i + k] * 32767; | ||
| 113 | + } | ||
| 114 | + | ||
| 115 | + int32_t err = snd_pcm_writei(handle_, buf_.data(), frames); | ||
| 116 | + if (err == -EPIPE) { | ||
| 117 | + printf("XRUN.\n"); | ||
| 118 | + snd_pcm_prepare(handle_); | ||
| 119 | + } else if (err < 0) { | ||
| 120 | + printf("Can't write to PCM device: %s\n", snd_strerror(err)); | ||
| 121 | + exit(-1); | ||
| 122 | + } | ||
| 123 | + } | ||
| 124 | + | ||
| 125 | + if (i < num_samples) { | ||
| 126 | + for (int32_t k = 0; k + i < num_samples; ++k) { | ||
| 127 | + buf_[k] = p[i + k] * 32767; | ||
| 128 | + } | ||
| 129 | + | ||
| 130 | + int32_t err = snd_pcm_writei(handle_, buf_.data(), num_samples - i); | ||
| 131 | + if (err == -EPIPE) { | ||
| 132 | + printf("XRUN.\n"); | ||
| 133 | + snd_pcm_prepare(handle_); | ||
| 134 | + } else if (err < 0) { | ||
| 135 | + printf("Can't write to PCM device: %s\n", snd_strerror(err)); | ||
| 136 | + exit(-1); | ||
| 137 | + } | ||
| 138 | + } | ||
| 139 | +} | ||
| 140 | + | ||
| 141 | +void AlsaPlay::Drain() { | ||
| 142 | + int32_t err = snd_pcm_drain(handle_); | ||
| 143 | + if (err < 0) { | ||
| 144 | + printf("Failed to drain pcm. %s\n", snd_strerror(err)); | ||
| 145 | + } | ||
| 146 | +} | ||
| 147 | + | ||
| 148 | +} // namespace sherpa_onnx | ||
| 149 | + | ||
| 150 | +#endif // SHERPA_ONNX_ENABLE_ALSA |
sherpa-onnx/csrc/alsa-play.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/alsa-play.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_ALSA_PLAY_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_ALSA_PLAY_H_ | ||
| 7 | + | ||
| 8 | +#include <cstdint> | ||
| 9 | +#include <memory> | ||
| 10 | +#include <vector> | ||
| 11 | + | ||
| 12 | +#include "alsa/asoundlib.h" | ||
| 13 | +#include "sherpa-onnx/csrc/resample.h" | ||
| 14 | + | ||
| 15 | +namespace sherpa_onnx { | ||
| 16 | + | ||
| 17 | +class AlsaPlay { | ||
| 18 | + public: | ||
| 19 | + AlsaPlay(const char *device_name, int32_t sample_rate); | ||
| 20 | + ~AlsaPlay(); | ||
| 21 | + void Play(const std::vector<float> &samples); | ||
| 22 | + | ||
| 23 | + // wait for all the samples to be played | ||
| 24 | + void Drain(); | ||
| 25 | + | ||
| 26 | + private: | ||
| 27 | + void SetParameters(int32_t sample_rate); | ||
| 28 | + | ||
| 29 | + private: | ||
| 30 | + snd_pcm_t *handle_ = nullptr; | ||
| 31 | + std::unique_ptr<LinearResample> resampler_; | ||
| 32 | + std::vector<int16_t> buf_; | ||
| 33 | +}; | ||
| 34 | + | ||
| 35 | +} // namespace sherpa_onnx | ||
| 36 | + | ||
| 37 | +#endif // SHERPA_ONNX_CSRC_ALSA_PLAY_H_ |
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2022-2023 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +// see https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html | ||
| 6 | +// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m___h_w___params.html | ||
| 7 | +// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html | ||
| 8 | + | ||
| 9 | +#include <signal.h> | ||
| 10 | + | ||
| 11 | +#include <algorithm> | ||
| 12 | +#include <chrono> // NOLINT | ||
| 13 | +#include <condition_variable> // NOLINT | ||
| 14 | +#include <fstream> | ||
| 15 | +#include <mutex> // NOLINT | ||
| 16 | +#include <queue> | ||
| 17 | +#include <thread> // NOLINT | ||
| 18 | +#include <vector> | ||
| 19 | + | ||
| 20 | +#include "sherpa-onnx/csrc/alsa-play.h" | ||
| 21 | +#include "sherpa-onnx/csrc/offline-tts.h" | ||
| 22 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 23 | +#include "sherpa-onnx/csrc/wave-writer.h" | ||
| 24 | + | ||
| 25 | +static std::condition_variable g_cv; | ||
| 26 | +static std::mutex g_cv_m; | ||
| 27 | + | ||
| 28 | +struct Buffer { | ||
| 29 | + std::queue<std::vector<float>> samples; | ||
| 30 | + std::mutex mutex; | ||
| 31 | +}; | ||
| 32 | + | ||
| 33 | +static Buffer g_buffer; | ||
| 34 | + | ||
| 35 | +static bool g_stopped = false; | ||
| 36 | +static bool g_killed = false; | ||
| 37 | + | ||
| 38 | +static void Handler(int32_t /*sig*/) { | ||
| 39 | + if (g_killed) { | ||
| 40 | + exit(0); | ||
| 41 | + } | ||
| 42 | + | ||
| 43 | + g_killed = true; | ||
| 44 | + fprintf(stderr, "\nCaught Ctrl + C. Exiting\n"); | ||
| 45 | +} | ||
| 46 | + | ||
| 47 | +static void AudioGeneratedCallback(const float *s, int32_t n) { | ||
| 48 | + if (n > 0) { | ||
| 49 | + std::lock_guard<std::mutex> lock(g_buffer.mutex); | ||
| 50 | + g_buffer.samples.push({s, s + n}); | ||
| 51 | + g_cv.notify_all(); | ||
| 52 | + } | ||
| 53 | +} | ||
| 54 | + | ||
| 55 | +static void StartPlayback(const std::string &device_name, int32_t sample_rate) { | ||
| 56 | + sherpa_onnx::AlsaPlay alsa(device_name.c_str(), sample_rate); | ||
| 57 | + | ||
| 58 | + std::unique_lock<std::mutex> lock(g_cv_m); | ||
| 59 | + while (!g_killed && !g_stopped) { | ||
| 60 | + while (!g_buffer.samples.empty()) { | ||
| 61 | + auto &p = g_buffer.samples.front(); | ||
| 62 | + alsa.Play(p); | ||
| 63 | + g_buffer.samples.pop(); | ||
| 64 | + } | ||
| 65 | + | ||
| 66 | + g_cv.wait(lock); | ||
| 67 | + } | ||
| 68 | + | ||
| 69 | + if (g_killed) { | ||
| 70 | + return; | ||
| 71 | + } | ||
| 72 | + | ||
| 73 | + if (g_stopped) { | ||
| 74 | + while (!g_buffer.samples.empty()) { | ||
| 75 | + auto &p = g_buffer.samples.front(); | ||
| 76 | + alsa.Play(p); | ||
| 77 | + g_buffer.samples.pop(); | ||
| 78 | + } | ||
| 79 | + } | ||
| 80 | + | ||
| 81 | + alsa.Drain(); | ||
| 82 | +} | ||
| 83 | + | ||
| 84 | +int main(int32_t argc, char *argv[]) { | ||
| 85 | + signal(SIGINT, Handler); | ||
| 86 | + | ||
| 87 | + const char *kUsageMessage = R"usage( | ||
| 88 | +Offline text-to-speech with sherpa-onnx. | ||
| 89 | + | ||
| 90 | +It plays the generated audio as the model is processing. | ||
| 91 | + | ||
| 92 | +Note that it is alsa so it works only on **Linux**. For instance, you can | ||
| 93 | +use it on Raspberry Pi. | ||
| 94 | + | ||
| 95 | +Usage example: | ||
| 96 | + | ||
| 97 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 | ||
| 98 | +tar xf vits-piper-en_US-amy-low.tar.bz2 | ||
| 99 | + | ||
| 100 | +./bin/sherpa-onnx-offline-tts-play-alsa \ | ||
| 101 | + --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ | ||
| 102 | + --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ | ||
| 103 | + --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ | ||
| 104 | + --output-filename=./generated.wav \ | ||
| 105 | + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." | ||
| 106 | + | ||
| 107 | +It will generate a file ./generated.wav as specified by --output-filename. | ||
| 108 | + | ||
| 109 | +You can find more models at | ||
| 110 | +https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | ||
| 111 | + | ||
| 112 | +Please see | ||
| 113 | +https://k2-fsa.github.io/sherpa/onnx/tts/index.html | ||
| 114 | +or details. | ||
| 115 | +)usage"; | ||
| 116 | + | ||
| 117 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 118 | + std::string device_name = "default"; | ||
| 119 | + std::string output_filename = "./generated.wav"; | ||
| 120 | + int32_t sid = 0; | ||
| 121 | + | ||
| 122 | + po.Register("output-filename", &output_filename, | ||
| 123 | + "Path to save the generated audio"); | ||
| 124 | + | ||
| 125 | + po.Register("device-name", &device_name, | ||
| 126 | + "Name of the device to play the generated audio"); | ||
| 127 | + | ||
| 128 | + po.Register("sid", &sid, | ||
| 129 | + "Speaker ID. Used only for multi-speaker models, e.g., models " | ||
| 130 | + "trained using the VCTK dataset. Not used for single-speaker " | ||
| 131 | + "models, e.g., models trained using the LJSpeech dataset"); | ||
| 132 | + | ||
| 133 | + sherpa_onnx::OfflineTtsConfig config; | ||
| 134 | + | ||
| 135 | + config.Register(&po); | ||
| 136 | + po.Read(argc, argv); | ||
| 137 | + | ||
| 138 | + if (po.NumArgs() == 0) { | ||
| 139 | + fprintf(stderr, "Error: Please provide the text to generate audio.\n\n"); | ||
| 140 | + po.PrintUsage(); | ||
| 141 | + exit(EXIT_FAILURE); | ||
| 142 | + } | ||
| 143 | + | ||
| 144 | + if (po.NumArgs() > 1) { | ||
| 145 | + fprintf(stderr, | ||
| 146 | + "Error: Accept only one positional argument. Please use single " | ||
| 147 | + "quotes to wrap your text\n"); | ||
| 148 | + po.PrintUsage(); | ||
| 149 | + exit(EXIT_FAILURE); | ||
| 150 | + } | ||
| 151 | + | ||
| 152 | + if (!config.Validate()) { | ||
| 153 | + fprintf(stderr, "Errors in config!\n"); | ||
| 154 | + exit(EXIT_FAILURE); | ||
| 155 | + } | ||
| 156 | + | ||
| 157 | + if (config.max_num_sentences != 1) { | ||
| 158 | + fprintf(stderr, "Setting config.max_num_sentences to 1\n"); | ||
| 159 | + config.max_num_sentences = 1; | ||
| 160 | + } | ||
| 161 | + | ||
| 162 | + fprintf(stderr, "Loading the model\n"); | ||
| 163 | + sherpa_onnx::OfflineTts tts(config); | ||
| 164 | + | ||
| 165 | + fprintf(stderr, "Start the playback thread\n"); | ||
| 166 | + std::thread playback_thread(StartPlayback, device_name, tts.SampleRate()); | ||
| 167 | + | ||
| 168 | + float speed = 1.0; | ||
| 169 | + | ||
| 170 | + fprintf(stderr, "Generating ...\n"); | ||
| 171 | + const auto begin = std::chrono::steady_clock::now(); | ||
| 172 | + auto audio = tts.Generate(po.GetArg(1), sid, speed, AudioGeneratedCallback); | ||
| 173 | + const auto end = std::chrono::steady_clock::now(); | ||
| 174 | + g_stopped = true; | ||
| 175 | + g_cv.notify_all(); | ||
| 176 | + fprintf(stderr, "Generating done!\n"); | ||
| 177 | + if (audio.samples.empty()) { | ||
| 178 | + fprintf( | ||
| 179 | + stderr, | ||
| 180 | + "Error in generating audio. Please read previous error messages.\n"); | ||
| 181 | + exit(EXIT_FAILURE); | ||
| 182 | + } | ||
| 183 | + | ||
| 184 | + float elapsed_seconds = | ||
| 185 | + std::chrono::duration_cast<std::chrono::milliseconds>(end - begin) | ||
| 186 | + .count() / | ||
| 187 | + 1000.; | ||
| 188 | + float duration = audio.samples.size() / static_cast<float>(audio.sample_rate); | ||
| 189 | + | ||
| 190 | + float rtf = elapsed_seconds / duration; | ||
| 191 | + fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds); | ||
| 192 | + fprintf(stderr, "Audio duration: %.3f s\n", duration); | ||
| 193 | + fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds, | ||
| 194 | + duration, rtf); | ||
| 195 | + | ||
| 196 | + bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate, | ||
| 197 | + audio.samples.data(), audio.samples.size()); | ||
| 198 | + if (!ok) { | ||
| 199 | + fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str()); | ||
| 200 | + exit(EXIT_FAILURE); | ||
| 201 | + } | ||
| 202 | + | ||
| 203 | + fprintf(stderr, "The text is: %s. Speaker ID: %d\n\n", po.GetArg(1).c_str(), | ||
| 204 | + sid); | ||
| 205 | + fprintf(stderr, "\n**** Saved to %s successfully! ****\n", | ||
| 206 | + output_filename.c_str()); | ||
| 207 | + | ||
| 208 | + fprintf(stderr, "\n"); | ||
| 209 | + fprintf( | ||
| 210 | + stderr, | ||
| 211 | + "Wait for the playback to finish. You can safely press ctrl + C to stop " | ||
| 212 | + "the playback.\n"); | ||
| 213 | + playback_thread.join(); | ||
| 214 | + | ||
| 215 | + fprintf(stderr, "Done!\n"); | ||
| 216 | + | ||
| 217 | + return 0; | ||
| 218 | +} |
-
请 注册 或 登录 后发表评论