Support writing generated audio samples to wave files (#363)

Fangjun Kuang · GitHub
Commit 1ac2232e149d3998eeee9e02107fcbb4be53af46 1ac2232e 1 parent 536d5804
sherpa-onnx/csrc/CMakeLists.txt
sherpa-onnx/csrc/lexicon.cc
sherpa-onnx/csrc/lexicon.h
sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc
sherpa-onnx/csrc/wave-writer.cc
sherpa-onnx/csrc/wave-writer.h
--- a/sherpa-onnx/csrc/CMakeLists.txt
查看文件 @1ac2232
+++ b/sherpa-onnx/csrc/CMakeLists.txt
查看文件 @1ac2232
@@ -93,6 +93,7 @@ list(APPEND sources
   offline-tts-vits-model-config.cc
   offline-tts-vits-model.cc
   offline-tts.cc
+   wave-writer.cc
 )
 
 if(SHERPA_ONNX_ENABLE_CHECK)
--- a/sherpa-onnx/csrc/lexicon.cc
查看文件 @1ac2232
+++ b/sherpa-onnx/csrc/lexicon.cc
查看文件 @1ac2232
@@ -53,7 +53,7 @@ static std::unordered_map<std::string, int32_t> ReadTokens(
       exit(-1);
     }
 #endif
-     token2id.insert({sym, id});
+     token2id.insert({std::move(sym), id});
   }
 
   return token2id;
@@ -78,6 +78,7 @@ static std::vector<int32_t> ConvertTokensToIds(
 Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
                  const std::string &punctuations) {
   token2id_ = ReadTokens(tokens);
+   blank_ = token2id_.at(" ");
   std::ifstream is(lexicon);
 
   std::string word;
@@ -149,6 +150,11 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIds(
     ans.insert(ans.end(), prefix.begin(), prefix.end());
     ans.insert(ans.end(), token_ids.begin(), token_ids.end());
     ans.insert(ans.end(), suffix.rbegin(), suffix.rend());
+     ans.push_back(blank_);
+   }
+ 
+   if (!ans.empty()) {
+     ans.resize(ans.size() - 1);
   }
 
   return ans;
--- a/sherpa-onnx/csrc/lexicon.h
查看文件 @1ac2232
+++ b/sherpa-onnx/csrc/lexicon.h
查看文件 @1ac2232
@@ -24,6 +24,7 @@ class Lexicon {
   std::unordered_map<std::string, std::vector<int32_t>> word2ids_;
   std::unordered_set<std::string> punctuations_;
   std::unordered_map<std::string, int32_t> token2id_;
+   int32_t blank_;  // ID for the blank token
 };
 
 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc
查看文件 @1ac2232
+++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc
查看文件 @1ac2232
@@ -6,6 +6,7 @@
 
 #include "sherpa-onnx/csrc/offline-tts.h"
 #include "sherpa-onnx/csrc/parse-options.h"
+ #include "sherpa-onnx/csrc/wave-writer.h"
 
 int main(int32_t argc, char *argv[]) {
   const char *kUsageMessage = R"usage(
@@ -15,13 +16,34 @@ Offline text-to-speech with sherpa-onnx
  --vits-model /path/to/model.onnx \
  --vits-lexicon /path/to/lexicon.txt \
 --vits-tokens /path/to/tokens.txt \
+  --output-filename ./generated.wav \
  'some text within single quotes'
 
- It will generate a file test.wav.
+ It will generate a file ./generated.wav as specified by --output-filename.
+ 
+ You can download a test model from
+ https://huggingface.co/csukuangfj/vits-ljs
+ 
+ For instance, you can use:
+ wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
+ wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
+ wget https://huggingface.co/csukuangfj/vits-ljs/resolve/main/tokens.txt
+ 
+ ./bin/sherpa-onnx-offline-tts \
+   --vits-model=./vits-ljs.onnx \
+   --vits-lexicon=./lexicon.txt \
+   --vits-tokens=./tokens.txt \
+   --output-filename=./generated.wav \
+   'liliana, the most beautiful and lovely assistant of our team!'
 )usage";
 
   sherpa_onnx::ParseOptions po(kUsageMessage);
+   std::string output_filename = "./generated.wav";
+   po.Register("output-filename", &output_filename,
+               "Path to save the generated audio");
+ 
   sherpa_onnx::OfflineTtsConfig config;
+ 
   config.Register(&po);
   po.Read(argc, argv);
 
@@ -47,11 +69,15 @@ It will generate a file test.wav.
   sherpa_onnx::OfflineTts tts(config);
   auto audio = tts.Generate(po.GetArg(1));
 
-   std::ofstream os("t.pcm", std::ios::binary);
-   os.write(reinterpret_cast<const char *>(audio.samples.data()),
-            sizeof(float) * audio.samples.size());
+   bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate,
+                                    audio.samples.data(), audio.samples.size());
+   if (!ok) {
+     fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str());
+     exit(EXIT_FAILURE);
+   }
 
-   // sox -t raw -r 22050 -b 32 -e floating-point -c 1 ./t.pcm ./t.wav
+   fprintf(stderr, "The text is: %s\n", po.GetArg(1).c_str());
+   fprintf(stderr, "Saved to %s successfully!\n", output_filename.c_str());
 
   return 0;
 }
--- a/sherpa-onnx/csrc/wave-writer.cc 0 → 100644
查看文件 @1ac2232
+++ b/sherpa-onnx/csrc/wave-writer.cc 0 → 100644
查看文件 @1ac2232
+ // sherpa-onnx/csrc/wave-writer.cc
+ //
+ // Copyright (c)  2023  Xiaomi Corporation
+ 
+ #include "sherpa-onnx/csrc/wave-writer.h"
+ 
+ #include <fstream>
+ #include <string>
+ #include <vector>
+ 
+ #include "sherpa-onnx/csrc/macros.h"
+ 
+ namespace sherpa_onnx {
+ namespace {
+ 
+ // In-memory image of the canonical PCM wave file header.
+ // See http://soundfile.sapp.org/doc/WaveFormat/
+ //
+ // WriteWave() writes this struct to disk byte-for-byte, so the order and the
+ // width of the fields below must not be changed. The fields add up to 44
+ // bytes; NOTE(review): this relies on the compiler inserting no padding
+ // between members -- confirm on every supported toolchain.
+ //
+ // Note: We assume little endian here
+ // TODO(fangjun): Support big endian
+ struct WaveHeader {
+   int32_t chunk_id;        // tag of the outer chunk ("RIFF")
+   int32_t chunk_size;      // 36 + subchunk2_size
+   int32_t format;          // tag of the container format ("WAVE")
+   int32_t subchunk1_id;    // tag of the format subchunk ("fmt ")
+   int32_t subchunk1_size;  // 16 for PCM
+   int16_t audio_format;    // 1 means PCM
+   int16_t num_channels;
+   int32_t sample_rate;
+   int32_t byte_rate;       // sample_rate * num_channels * bits_per_sample / 8
+   int16_t block_align;     // num_channels * bits_per_sample / 8
+   int16_t bits_per_sample;
+   int32_t subchunk2_id;    // a tag of this chunk
+   int32_t subchunk2_size;  // size of subchunk2
+ };
+ 
+ }  // namespace
+ 
+ // Write `n` single-channel float samples to `filename` as a 16-bit PCM wave
+ // file.
+ //
+ // Each sample is clamped to [-1, 1] (NaN is written as silence) and then
+ // multiplied by 32767 before being truncated to int16_t, so out-of-range
+ // input can never trigger an out-of-range float-to-integer conversion.
+ //
+ // @param filename Path of the wave file to create.
+ // @param sampling_rate Sample rate stored in the header.
+ // @param samples Pointer to `n` samples. May be null only if n is 0.
+ // @param n Number of samples. Must be non-negative and small enough for the
+ //          data size to fit into the 32-bit size fields of the header.
+ // @return Return true if the file is written successfully; return false if
+ //         the arguments are invalid or if creating/writing the file fails.
+ bool WriteWave(const std::string &filename, int32_t sampling_rate,
+                const float *samples, int32_t n) {
+   if (n < 0 || (n > 0 && samples == nullptr)) {
+     SHERPA_ONNX_LOGE("Invalid samples given when writing %s",
+                      filename.c_str());
+     return false;
+   }
+ 
+   const int32_t num_channels = 1;
+   const int32_t bits_per_sample = 16;  // int16_t
+   const int32_t block_align = num_channels * bits_per_sample / 8;
+ 
+   // chunk_size = 36 + subchunk2_size must fit into a signed 32-bit field.
+   // Check before multiplying so that the arithmetic below cannot overflow.
+   const int32_t max_data_bytes = 0x7fffffff - 36;
+   if (n > max_data_bytes / block_align) {
+     SHERPA_ONNX_LOGE("Too many samples to save in %s", filename.c_str());
+     return false;
+   }
+ 
+   WaveHeader header;
+   header.chunk_id = 0x46464952;      // FFIR
+   header.format = 0x45564157;        // EVAW
+   header.subchunk1_id = 0x20746d66;  // "fmt "
+   header.subchunk1_size = 16;        // 16 for PCM
+   header.audio_format = 1;           // PCM =1
+   header.num_channels = num_channels;
+   header.sample_rate = sampling_rate;
+   header.byte_rate = sampling_rate * block_align;
+   header.block_align = block_align;
+   header.bits_per_sample = bits_per_sample;
+   header.subchunk2_id = 0x61746164;  // atad
+   header.subchunk2_size = n * block_align;
+   header.chunk_size = 36 + header.subchunk2_size;
+ 
+   // Open the file first so that we do not convert anything if it cannot be
+   // created.
+   std::ofstream os(filename, std::ios::binary);
+   if (!os) {
+     SHERPA_ONNX_LOGE("Failed to create %s", filename.c_str());
+     return false;
+   }
+ 
+   std::vector<int16_t> samples_int16(n);
+   for (int32_t i = 0; i != n; ++i) {
+     float s = samples[i];
+     if (s != s) {
+       s = 0.0f;  // NaN: write silence instead of invoking undefined behavior
+     } else if (s > 1.0f) {
+       s = 1.0f;
+     } else if (s < -1.0f) {
+       s = -1.0f;
+     }
+ 
+     samples_int16[i] = static_cast<int16_t>(s * 32767);
+   }
+ 
+   os.write(reinterpret_cast<const char *>(&header), sizeof(header));
+   os.write(reinterpret_cast<const char *>(samples_int16.data()),
+            samples_int16.size() * sizeof(int16_t));
+ 
+   if (!os) {
+     SHERPA_ONNX_LOGE("Write %s failed", filename.c_str());
+     return false;
+   }
+ 
+   return true;
+ }
+ 
+ }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/wave-writer.h 0 → 100644
查看文件 @1ac2232
+++ b/sherpa-onnx/csrc/wave-writer.h 0 → 100644
查看文件 @1ac2232
+ // sherpa-onnx/csrc/wave-writer.h
+ //
+ // Copyright (c)  2023  Xiaomi Corporation
+ 
+ #ifndef SHERPA_ONNX_CSRC_WAVE_WRITER_H_
+ #define SHERPA_ONNX_CSRC_WAVE_WRITER_H_
+ 
+ #include <cstdint>
+ #include <string>
+ 
+ namespace sherpa_onnx {
+ 
+ // Write a single-channel wave file containing 16-bit samples.
+ //
+ // The input samples are expected to be in the range [-1, 1]. They are scaled
+ // to the int16_t range (nominally multiplied by 32767) and saved in int16_t
+ // format in the wave file.
+ //
+ // @param filename Path to save the samples.
+ // @param sampling_rate Sample rate of the samples.
+ // @param samples Pointer to the samples. It should point to at least n
+ //                floats.
+ // @param n Number of samples
+ // @return Return true if the write succeeds; return false otherwise.
+ bool WriteWave(const std::string &filename, int32_t sampling_rate,
+                const float *samples, int32_t n);
+ 
+ }  // namespace sherpa_onnx
+ 
+ #endif  // SHERPA_ONNX_CSRC_WAVE_WRITER_H_