Add Zipvoice (#2487)
Committed by GitHub
Co-authored-by: yaozengwei <yaozengwei@outlook.com>
Showing 33 changed files with 2301 additions and 18 deletions
CMakeLists.txt
| @@ -372,6 +372,7 @@ endif() | @@ -372,6 +372,7 @@ endif() | ||
| 372 | include(kaldi-native-fbank) | 372 | include(kaldi-native-fbank) |
| 373 | include(kaldi-decoder) | 373 | include(kaldi-decoder) |
| 374 | include(onnxruntime) | 374 | include(onnxruntime) |
| 375 | +include(cppinyin) | ||
| 375 | include(simple-sentencepiece) | 376 | include(simple-sentencepiece) |
| 376 | set(ONNXRUNTIME_DIR ${onnxruntime_SOURCE_DIR}) | 377 | set(ONNXRUNTIME_DIR ${onnxruntime_SOURCE_DIR}) |
| 377 | message(STATUS "ONNXRUNTIME_DIR: ${ONNXRUNTIME_DIR}") | 378 | message(STATUS "ONNXRUNTIME_DIR: ${ONNXRUNTIME_DIR}") |
cmake/cppinyin.cmake
0 → 100644
| 1 | +function(download_cppinyin) | ||
| 2 | + include(FetchContent) | ||
| 3 | + | ||
| 4 | + set(cppinyin_URL "https://github.com/pkufool/cppinyin/archive/refs/tags/v0.10.tar.gz") | ||
| 5 | + set(cppinyin_URL2 "https://gh-proxy.com/https://github.com/pkufool/cppinyin/archive/refs/tags/v0.10.tar.gz") | ||
| 6 | + set(cppinyin_HASH "SHA256=abe6584d7ee56829e8f4b5fbda3b50ecdf49a13be8e413a78d1b0d5d5c019982") | ||
| 7 | + | ||
| 8 | + # If you don't have access to the Internet, | ||
| 9 | + # please pre-download cppinyin | ||
| 10 | + set(possible_file_locations | ||
| 11 | + $ENV{HOME}/Downloads/cppinyin-0.10.tar.gz | ||
| 12 | + ${CMAKE_SOURCE_DIR}/cppinyin-0.10.tar.gz | ||
| 13 | + ${CMAKE_BINARY_DIR}/cppinyin-0.10.tar.gz | ||
| 14 | + /tmp/cppinyin-0.10.tar.gz | ||
| 15 | + /star-fj/fangjun/download/github/cppinyin-0.10.tar.gz | ||
| 16 | + ) | ||
| 17 | + | ||
| 18 | + foreach(f IN LISTS possible_file_locations) | ||
| 19 | + if(EXISTS ${f}) | ||
| 20 | + set(cppinyin_URL "${f}") | ||
| 21 | + file(TO_CMAKE_PATH "${cppinyin_URL}" cppinyin_URL) | ||
| 22 | + message(STATUS "Found local downloaded cppinyin: ${cppinyin_URL}") | ||
| 23 | + set(cppinyin_URL2) | ||
| 24 | + break() | ||
| 25 | + endif() | ||
| 26 | + endforeach() | ||
| 27 | + | ||
| 28 | + set(CPPINYIN_ENABLE_TESTS OFF CACHE BOOL "" FORCE) | ||
| 29 | + set(CPPINYIN_BUILD_PYTHON OFF CACHE BOOL "" FORCE) | ||
| 30 | + | ||
| 31 | + FetchContent_Declare(cppinyin | ||
| 32 | + URL | ||
| 33 | + ${cppinyin_URL} | ||
| 34 | + ${cppinyin_URL2} | ||
| 35 | + URL_HASH | ||
| 36 | + ${cppinyin_HASH} | ||
| 37 | + ) | ||
| 38 | + | ||
| 39 | + FetchContent_GetProperties(cppinyin) | ||
| 40 | + if(NOT cppinyin_POPULATED) | ||
| 41 | + message(STATUS "Downloading cppinyin ${cppinyin_URL}") | ||
| 42 | + FetchContent_Populate(cppinyin) | ||
| 43 | + endif() | ||
| 44 | + message(STATUS "cppinyin is downloaded to ${cppinyin_SOURCE_DIR}") | ||
| 45 | + | ||
| 46 | + if(BUILD_SHARED_LIBS) | ||
| 47 | + set(_build_shared_libs_bak ${BUILD_SHARED_LIBS}) | ||
| 48 | + set(BUILD_SHARED_LIBS OFF) | ||
| 49 | + endif() | ||
| 50 | + | ||
| 51 | + add_subdirectory(${cppinyin_SOURCE_DIR} ${cppinyin_BINARY_DIR} EXCLUDE_FROM_ALL) | ||
| 52 | + | ||
| 53 | + if(_build_shared_libs_bak) | ||
| 54 | + set_target_properties(cppinyin_core | ||
| 55 | + PROPERTIES | ||
| 56 | + POSITION_INDEPENDENT_CODE ON | ||
| 57 | + C_VISIBILITY_PRESET hidden | ||
| 58 | + CXX_VISIBILITY_PRESET hidden | ||
| 59 | + ) | ||
| 60 | + set(BUILD_SHARED_LIBS ON) | ||
| 61 | + endif() | ||
| 62 | + | ||
| 63 | + target_include_directories(cppinyin_core | ||
| 64 | + PUBLIC | ||
| 65 | + ${cppinyin_SOURCE_DIR}/ | ||
| 66 | + ) | ||
| 67 | + | ||
| 68 | + if(NOT BUILD_SHARED_LIBS) | ||
| 69 | + install(TARGETS cppinyin_core DESTINATION lib) | ||
| 70 | + endif() | ||
| 71 | + | ||
| 72 | +endfunction() | ||
| 73 | + | ||
| 74 | +download_cppinyin() |
| 1 | #!/usr/bin/env python3 | 1 | #!/usr/bin/env python3 |
| 2 | # | 2 | # |
| 3 | -# Copyright (c) 2023 Xiaomi Corporation | 3 | +# Copyright (c) 2023-2025 Xiaomi Corporation |
| 4 | 4 | ||
| 5 | """ | 5 | """ |
| 6 | This file demonstrates how to use sherpa-onnx Python API to generate audio | 6 | This file demonstrates how to use sherpa-onnx Python API to generate audio |
| @@ -453,7 +453,9 @@ def main(): | @@ -453,7 +453,9 @@ def main(): | ||
| 453 | end = time.time() | 453 | end = time.time() |
| 454 | 454 | ||
| 455 | if len(audio.samples) == 0: | 455 | if len(audio.samples) == 0: |
| 456 | - print("Error in generating audios. Please read previous error messages.") | 456 | + print( |
| 457 | + "Error in generating audios. Please read previous error messages." | ||
| 458 | + ) | ||
| 457 | return | 459 | return |
| 458 | 460 | ||
| 459 | elapsed_seconds = end - start | 461 | elapsed_seconds = end - start |
| @@ -470,7 +472,9 @@ def main(): | @@ -470,7 +472,9 @@ def main(): | ||
| 470 | print(f"The text is '{args.text}'") | 472 | print(f"The text is '{args.text}'") |
| 471 | print(f"Elapsed seconds: {elapsed_seconds:.3f}") | 473 | print(f"Elapsed seconds: {elapsed_seconds:.3f}") |
| 472 | print(f"Audio duration in seconds: {audio_duration:.3f}") | 474 | print(f"Audio duration in seconds: {audio_duration:.3f}") |
| 473 | - print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") | 475 | + print( |
| 476 | + f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}" | ||
| 477 | + ) | ||
| 474 | 478 | ||
| 475 | 479 | ||
| 476 | if __name__ == "__main__": | 480 | if __name__ == "__main__": |
python-api-examples/offline-zeroshot-tts.py
0 → 100755
| 1 | +#!/usr/bin/env python3 | ||
| 2 | +# | ||
| 3 | +# Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +""" | ||
| 6 | +This file demonstrates how to use sherpa-onnx Python API to generate audio | ||
| 7 | +from text with a prompt, i.e., zero-shot text-to-speech. | ||
| 8 | + | ||
| 9 | +Usage: | ||
| 10 | + | ||
| 11 | +Example (zipvoice) | ||
| 12 | + | ||
| 13 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-zipvoice-distill-zh-en-emilia.tar.bz2 | ||
| 14 | +tar xf sherpa-onnx-zipvoice-distill-zh-en-emilia.tar.bz2 | ||
| 15 | + | ||
| 16 | +python3 ./python-api-examples/offline-zeroshot-tts.py \ | ||
| 17 | + --zipvoice-flow-matching-model sherpa-onnx-zipvoice-distill-zh-en-emilia/fm_decoder.onnx \ | ||
| 18 | + --zipvoice-text-model sherpa-onnx-zipvoice-distill-zh-en-emilia/text_encoder.onnx \ | ||
| 19 | + --zipvoice-data-dir sherpa-onnx-zipvoice-distill-zh-en-emilia/espeak-ng-data \ | ||
| 20 | + --zipvoice-pinyin-dict sherpa-onnx-zipvoice-distill-zh-en-emilia/pinyin.raw \ | ||
| 21 | + --zipvoice-tokens sherpa-onnx-zipvoice-distill-zh-en-emilia/tokens.txt \ | ||
| 22 | + --zipvoice-vocoder sherpa-onnx-zipvoice-distill-zh-en-emilia/vocos_24khz.onnx \ | ||
| 23 | + --prompt-audio sherpa-onnx-zipvoice-distill-zh-en-emilia/prompt.wav \ | ||
| 24 | + --zipvoice-num-steps 4 \ | ||
| 25 | + --num-threads 4 \ | ||
| 26 | + --prompt-text "周日被我射熄火了,所以今天是周一。" \ | ||
| 27 | + "我是中国人民的儿子,我爱我的祖国。我得祖国是一个伟大的国家,拥有五千年的文明史。" | ||
| 28 | +""" | ||
| 29 | + | ||
| 30 | +import argparse | ||
| 31 | +import time | ||
| 32 | +import wave | ||
| 33 | +import numpy as np | ||
| 34 | + | ||
| 35 | +from typing import Tuple | ||
| 36 | + | ||
| 37 | +import sherpa_onnx | ||
| 38 | +import soundfile as sf | ||
| 39 | + | ||
| 40 | + | ||
| 41 | +def add_zipvoice_args(parser): | ||
| 42 | + parser.add_argument( | ||
| 43 | + "--zipvoice-tokens", | ||
| 44 | + type=str, | ||
| 45 | + default="", | ||
| 46 | + help="Path to tokens.txt for Zipvoice models.", | ||
| 47 | + ) | ||
| 48 | + | ||
| 49 | + parser.add_argument( | ||
| 50 | + "--zipvoice-text-model", | ||
| 51 | + type=str, | ||
| 52 | + default="", | ||
| 53 | + help="Path to zipvoice text model.", | ||
| 54 | + ) | ||
| 55 | + | ||
| 56 | + parser.add_argument( | ||
| 57 | + "--zipvoice-flow-matching-model", | ||
| 58 | + type=str, | ||
| 59 | + default="", | ||
| 60 | + help="Path to zipvoice flow matching model.", | ||
| 61 | + ) | ||
| 62 | + | ||
| 63 | + parser.add_argument( | ||
| 64 | + "--zipvoice-data-dir", | ||
| 65 | + type=str, | ||
| 66 | + default="", | ||
| 67 | + help="Path to the espeak-ng data directory.", | ||
| 68 | + ) | ||
| 69 | + | ||
| 70 | + parser.add_argument( | ||
| 71 | + "--zipvoice-pinyin-dict", | ||
| 72 | + type=str, | ||
| 73 | + default="", | ||
| 74 | + help="Path to the pinyin dictionary.", | ||
| 75 | + ) | ||
| 76 | + | ||
| 77 | + parser.add_argument( | ||
| 78 | + "--zipvoice-vocoder", | ||
| 79 | + type=str, | ||
| 80 | + default="", | ||
| 81 | + help="Path to the vocos vocoder.", | ||
| 82 | + ) | ||
| 83 | + | ||
| 84 | + parser.add_argument( | ||
| 85 | + "--zipvoice-num-steps", | ||
| 86 | + type=int, | ||
| 87 | + default=4, | ||
| 88 | + help="Number of steps for Zipvoice.", | ||
| 89 | + ) | ||
| 90 | + | ||
| 91 | + parser.add_argument( | ||
| 92 | + "--zipvoice-feat-scale", | ||
| 93 | + type=float, | ||
| 94 | + default=0.1, | ||
| 95 | + help="Scale factor for Zipvoice features.", | ||
| 96 | + ) | ||
| 97 | + | ||
| 98 | + parser.add_argument( | ||
| 99 | + "--zipvoice-t-shift", | ||
| 100 | + type=float, | ||
| 101 | + default=0.5, | ||
| 102 | + help="Shift t to smaller ones if t-shift < 1.0.", | ||
| 103 | + ) | ||
| 104 | + | ||
| 105 | + parser.add_argument( | ||
| 106 | + "--zipvoice-target-rms", | ||
| 107 | + type=float, | ||
| 108 | + default=0.1, | ||
| 109 | + help="Target speech normalization RMS value for Zipvoice.", | ||
| 110 | + ) | ||
| 111 | + | ||
| 112 | + parser.add_argument( | ||
| 113 | + "--zipvoice-guidance-scale", | ||
| 114 | + type=float, | ||
| 115 | + default=1.0, | ||
| 116 | + help="The scale of classifier-free guidance during inference for Zipvoice.", | ||
| 117 | + ) | ||
| 118 | + | ||
| 119 | + | ||
| 120 | +def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]: | ||
| 121 | + """ | ||
| 122 | + Args: | ||
| 123 | + wave_filename: | ||
| 124 | + Path to a wave file. It should be single channel and each sample should | ||
| 125 | + be 16-bit. Its sample rate does not need to be 16kHz. | ||
| 126 | + Returns: | ||
| 127 | + Return a tuple containing: | ||
| 128 | + - A 1-D array of dtype np.float32 containing the samples, which are | ||
| 129 | + normalized to the range [-1, 1]. | ||
| 130 | + - sample rate of the wave file | ||
| 131 | + """ | ||
| 132 | + | ||
| 133 | + with wave.open(wave_filename) as f: | ||
| 134 | + assert f.getnchannels() == 1, f.getnchannels() | ||
| 135 | + assert f.getsampwidth() == 2, f.getsampwidth() # it is in bytes | ||
| 136 | + num_samples = f.getnframes() | ||
| 137 | + samples = f.readframes(num_samples) | ||
| 138 | + samples_int16 = np.frombuffer(samples, dtype=np.int16) | ||
| 139 | + samples_float32 = samples_int16.astype(np.float32) | ||
| 140 | + | ||
| 141 | + samples_float32 = samples_float32 / 32768 | ||
| 142 | + return samples_float32, f.getframerate() | ||
| 143 | + | ||
| 144 | + | ||
| 145 | +def get_args(): | ||
| 146 | + parser = argparse.ArgumentParser( | ||
| 147 | + formatter_class=argparse.ArgumentDefaultsHelpFormatter | ||
| 148 | + ) | ||
| 149 | + | ||
| 150 | + add_zipvoice_args(parser) | ||
| 151 | + | ||
| 152 | + parser.add_argument( | ||
| 153 | + "--tts-rule-fsts", | ||
| 154 | + type=str, | ||
| 155 | + default="", | ||
| 156 | + help="Path to rule.fst", | ||
| 157 | + ) | ||
| 158 | + | ||
| 159 | + parser.add_argument( | ||
| 160 | + "--max-num-sentences", | ||
| 161 | + type=int, | ||
| 162 | + default=1, | ||
| 163 | + help="""Max number of sentences in a batch to avoid OOM if the input | ||
| 164 | + text is very long. Set it to -1 to process all the sentences in a | ||
| 165 | + single batch. A smaller value does not mean it is slower compared | ||
| 166 | + to a larger one on CPU. | ||
| 167 | + """, | ||
| 168 | + ) | ||
| 169 | + | ||
| 170 | + parser.add_argument( | ||
| 171 | + "--output-filename", | ||
| 172 | + type=str, | ||
| 173 | + default="./generated.wav", | ||
| 174 | + help="Path to save generated wave", | ||
| 175 | + ) | ||
| 176 | + | ||
| 177 | + parser.add_argument( | ||
| 178 | + "--debug", | ||
| 179 | + type=bool, | ||
| 180 | + default=False, | ||
| 181 | + help="True to show debug messages", | ||
| 182 | + ) | ||
| 183 | + | ||
| 184 | + parser.add_argument( | ||
| 185 | + "--provider", | ||
| 186 | + type=str, | ||
| 187 | + default="cpu", | ||
| 188 | + help="valid values: cpu, cuda, coreml", | ||
| 189 | + ) | ||
| 190 | + | ||
| 191 | + parser.add_argument( | ||
| 192 | + "--num-threads", | ||
| 193 | + type=int, | ||
| 194 | + default=1, | ||
| 195 | + help="Number of threads for neural network computation", | ||
| 196 | + ) | ||
| 197 | + | ||
| 198 | + parser.add_argument( | ||
| 199 | + "--speed", | ||
| 200 | + type=float, | ||
| 201 | + default=1.0, | ||
| 202 | + help="Speech speed. Larger->faster; smaller->slower", | ||
| 203 | + ) | ||
| 204 | + | ||
| 205 | + parser.add_argument( | ||
| 206 | + "--prompt-text", | ||
| 207 | + type=str, | ||
| 208 | + required=True, | ||
| 209 | + help="The transcription of prompt audio (Zipvoice)", | ||
| 210 | + ) | ||
| 211 | + | ||
| 212 | + parser.add_argument( | ||
| 213 | + "--prompt-audio", | ||
| 214 | + type=str, | ||
| 215 | + required=True, | ||
| 216 | + help="The path to prompt audio (Zipvoice).", | ||
| 217 | + ) | ||
| 218 | + | ||
| 219 | + parser.add_argument( | ||
| 220 | + "text", | ||
| 221 | + type=str, | ||
| 222 | + help="The input text to generate audio for", | ||
| 223 | + ) | ||
| 224 | + | ||
| 225 | + return parser.parse_args() | ||
| 226 | + | ||
| 227 | + | ||
| 228 | +def main(): | ||
| 229 | + args = get_args() | ||
| 230 | + print(args) | ||
| 231 | + | ||
| 232 | + tts_config = sherpa_onnx.OfflineTtsConfig( | ||
| 233 | + model=sherpa_onnx.OfflineTtsModelConfig( | ||
| 234 | + zipvoice=sherpa_onnx.OfflineTtsZipvoiceModelConfig( | ||
| 235 | + tokens=args.zipvoice_tokens, | ||
| 236 | + text_model=args.zipvoice_text_model, | ||
| 237 | + flow_matching_model=args.zipvoice_flow_matching_model, | ||
| 238 | + data_dir=args.zipvoice_data_dir, | ||
| 239 | + pinyin_dict=args.zipvoice_pinyin_dict, | ||
| 240 | + vocoder=args.zipvoice_vocoder, | ||
| 241 | + feat_scale=args.zipvoice_feat_scale, | ||
| 242 | + t_shift=args.zipvoice_t_shift, | ||
| 243 | + target_rms=args.zipvoice_target_rms, | ||
| 244 | + guidance_scale=args.zipvoice_guidance_scale, | ||
| 245 | + ), | ||
| 246 | + provider=args.provider, | ||
| 247 | + debug=args.debug, | ||
| 248 | + num_threads=args.num_threads, | ||
| 249 | + ), | ||
| 250 | + rule_fsts=args.tts_rule_fsts, | ||
| 251 | + max_num_sentences=args.max_num_sentences, | ||
| 252 | + ) | ||
| 253 | + if not tts_config.validate(): | ||
| 254 | + raise ValueError("Please check your config") | ||
| 255 | + | ||
| 256 | + tts = sherpa_onnx.OfflineTts(tts_config) | ||
| 257 | + | ||
| 258 | + start = time.time() | ||
| 259 | + prompt_samples, sample_rate = read_wave(args.prompt_audio) | ||
| 260 | + audio = tts.generate( | ||
| 261 | + args.text, | ||
| 262 | + args.prompt_text, | ||
| 263 | + prompt_samples, | ||
| 264 | + sample_rate, | ||
| 265 | + speed=args.speed, | ||
| 266 | + num_steps=args.zipvoice_num_steps, | ||
| 267 | + ) | ||
| 268 | + end = time.time() | ||
| 269 | + | ||
| 270 | + if len(audio.samples) == 0: | ||
| 271 | + print( | ||
| 272 | + "Error in generating audios. Please read previous error messages." | ||
| 273 | + ) | ||
| 274 | + return | ||
| 275 | + | ||
| 276 | + elapsed_seconds = end - start | ||
| 277 | + audio_duration = len(audio.samples) / audio.sample_rate | ||
| 278 | + real_time_factor = elapsed_seconds / audio_duration | ||
| 279 | + | ||
| 280 | + sf.write( | ||
| 281 | + args.output_filename, | ||
| 282 | + audio.samples, | ||
| 283 | + samplerate=audio.sample_rate, | ||
| 284 | + subtype="PCM_16", | ||
| 285 | + ) | ||
| 286 | + print(f"Saved to {args.output_filename}") | ||
| 287 | + print(f"The text is '{args.text}'") | ||
| 288 | + print(f"Elapsed seconds: {elapsed_seconds:.3f}") | ||
| 289 | + print(f"Audio duration in seconds: {audio_duration:.3f}") | ||
| 290 | + print( | ||
| 291 | + f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}" | ||
| 292 | + ) | ||
| 293 | + | ||
| 294 | + | ||
| 295 | +if __name__ == "__main__": | ||
| 296 | + main() |
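Note on prompt audio: read_wave() in the example above requires a single-channel, 16-bit wave file (its sample rate is arbitrary; the C++ implementation resamples the prompt to the model's sample rate before computing its mel spectrogram). A small, hypothetical helper, not part of this PR, that converts an arbitrary recording into that format using the already-imported soundfile package:

    import soundfile as sf

    # Downmix to mono and save as 16-bit PCM so that read_wave() in
    # offline-zeroshot-tts.py can load it.
    # "my_recording.wav" is a placeholder input file name.
    data, sr = sf.read("my_recording.wav", dtype="float32")
    if data.ndim > 1:
        data = data.mean(axis=1)
    sf.write("prompt.wav", data, sr, subtype="PCM_16")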
sherpa-onnx/csrc/CMakeLists.txt
| @@ -201,6 +201,9 @@ if(SHERPA_ONNX_ENABLE_TTS) | @@ -201,6 +201,9 @@ if(SHERPA_ONNX_ENABLE_TTS) | ||
| 201 | offline-tts-model-config.cc | 201 | offline-tts-model-config.cc |
| 202 | offline-tts-vits-model-config.cc | 202 | offline-tts-vits-model-config.cc |
| 203 | offline-tts-vits-model.cc | 203 | offline-tts-vits-model.cc |
| 204 | + offline-tts-zipvoice-frontend.cc | ||
| 205 | + offline-tts-zipvoice-model.cc | ||
| 206 | + offline-tts-zipvoice-model-config.cc | ||
| 204 | offline-tts.cc | 207 | offline-tts.cc |
| 205 | piper-phonemize-lexicon.cc | 208 | piper-phonemize-lexicon.cc |
| 206 | vocoder.cc | 209 | vocoder.cc |
| @@ -265,6 +268,7 @@ if(ANDROID_NDK) | @@ -265,6 +268,7 @@ if(ANDROID_NDK) | ||
| 265 | endif() | 268 | endif() |
| 266 | 269 | ||
| 267 | target_link_libraries(sherpa-onnx-core | 270 | target_link_libraries(sherpa-onnx-core |
| 271 | + cppinyin_core | ||
| 268 | kaldi-native-fbank-core | 272 | kaldi-native-fbank-core |
| 269 | kaldi-decoder-core | 273 | kaldi-decoder-core |
| 270 | ssentencepiece_core | 274 | ssentencepiece_core |
| @@ -348,6 +352,7 @@ if(SHERPA_ONNX_ENABLE_BINARY) | @@ -348,6 +352,7 @@ if(SHERPA_ONNX_ENABLE_BINARY) | ||
| 348 | 352 | ||
| 349 | if(SHERPA_ONNX_ENABLE_TTS) | 353 | if(SHERPA_ONNX_ENABLE_TTS) |
| 350 | add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc) | 354 | add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc) |
| 355 | + add_executable(sherpa-onnx-offline-zeroshot-tts sherpa-onnx-offline-zeroshot-tts.cc) | ||
| 351 | endif() | 356 | endif() |
| 352 | 357 | ||
| 353 | if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION) | 358 | if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION) |
| @@ -370,6 +375,7 @@ if(SHERPA_ONNX_ENABLE_BINARY) | @@ -370,6 +375,7 @@ if(SHERPA_ONNX_ENABLE_BINARY) | ||
| 370 | if(SHERPA_ONNX_ENABLE_TTS) | 375 | if(SHERPA_ONNX_ENABLE_TTS) |
| 371 | list(APPEND main_exes | 376 | list(APPEND main_exes |
| 372 | sherpa-onnx-offline-tts | 377 | sherpa-onnx-offline-tts |
| 378 | + sherpa-onnx-offline-zeroshot-tts | ||
| 373 | ) | 379 | ) |
| 374 | endif() | 380 | endif() |
| 375 | 381 | ||
| @@ -667,6 +673,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) | @@ -667,6 +673,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) | ||
| 667 | if(SHERPA_ONNX_ENABLE_TTS) | 673 | if(SHERPA_ONNX_ENABLE_TTS) |
| 668 | list(APPEND sherpa_onnx_test_srcs | 674 | list(APPEND sherpa_onnx_test_srcs |
| 669 | cppjieba-test.cc | 675 | cppjieba-test.cc |
| 676 | + offline-tts-zipvoice-frontend-test.cc | ||
| 670 | piper-phonemize-test.cc | 677 | piper-phonemize-test.cc |
| 671 | ) | 678 | ) |
| 672 | endif() | 679 | endif() |
sherpa-onnx/csrc/offline-tts-impl.cc
| @@ -20,6 +20,7 @@ | @@ -20,6 +20,7 @@ | ||
| 20 | #include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h" | 20 | #include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h" |
| 21 | #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h" | 21 | #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h" |
| 22 | #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" | 22 | #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" |
| 23 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-impl.h" | ||
| 23 | 24 | ||
| 24 | namespace sherpa_onnx { | 25 | namespace sherpa_onnx { |
| 25 | 26 | ||
| @@ -41,6 +42,9 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | @@ -41,6 +42,9 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | ||
| 41 | return std::make_unique<OfflineTtsVitsImpl>(config); | 42 | return std::make_unique<OfflineTtsVitsImpl>(config); |
| 42 | } else if (!config.model.matcha.acoustic_model.empty()) { | 43 | } else if (!config.model.matcha.acoustic_model.empty()) { |
| 43 | return std::make_unique<OfflineTtsMatchaImpl>(config); | 44 | return std::make_unique<OfflineTtsMatchaImpl>(config); |
| 45 | + } else if (!config.model.zipvoice.text_model.empty() && | ||
| 46 | + !config.model.zipvoice.flow_matching_model.empty()) { | ||
| 47 | + return std::make_unique<OfflineTtsZipvoiceImpl>(config); | ||
| 44 | } else if (!config.model.kokoro.model.empty()) { | 48 | } else if (!config.model.kokoro.model.empty()) { |
| 45 | return std::make_unique<OfflineTtsKokoroImpl>(config); | 49 | return std::make_unique<OfflineTtsKokoroImpl>(config); |
| 46 | } else if (!config.model.kitten.model.empty()) { | 50 | } else if (!config.model.kitten.model.empty()) { |
| @@ -59,6 +63,9 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | @@ -59,6 +63,9 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create( | ||
| 59 | return std::make_unique<OfflineTtsVitsImpl>(mgr, config); | 63 | return std::make_unique<OfflineTtsVitsImpl>(mgr, config); |
| 60 | } else if (!config.model.matcha.acoustic_model.empty()) { | 64 | } else if (!config.model.matcha.acoustic_model.empty()) { |
| 61 | return std::make_unique<OfflineTtsMatchaImpl>(mgr, config); | 65 | return std::make_unique<OfflineTtsMatchaImpl>(mgr, config); |
| 66 | + } else if (!config.model.zipvoice.text_model.empty() && | ||
| 67 | + !config.model.zipvoice.flow_matching_model.empty()) { | ||
| 68 | + return std::make_unique<OfflineTtsZipvoiceImpl>(mgr, config); | ||
| 62 | } else if (!config.model.kokoro.model.empty()) { | 69 | } else if (!config.model.kokoro.model.empty()) { |
| 63 | return std::make_unique<OfflineTtsKokoroImpl>(mgr, config); | 70 | return std::make_unique<OfflineTtsKokoroImpl>(mgr, config); |
| 64 | } else if (!config.model.kitten.model.empty()) { | 71 | } else if (!config.model.kitten.model.empty()) { |
sherpa-onnx/csrc/offline-tts-impl.h
| @@ -6,9 +6,11 @@ | @@ -6,9 +6,11 @@ | ||
| 6 | #define SHERPA_ONNX_CSRC_OFFLINE_TTS_IMPL_H_ | 6 | #define SHERPA_ONNX_CSRC_OFFLINE_TTS_IMPL_H_ |
| 7 | 7 | ||
| 8 | #include <memory> | 8 | #include <memory> |
| 9 | +#include <stdexcept> | ||
| 9 | #include <string> | 10 | #include <string> |
| 10 | #include <vector> | 11 | #include <vector> |
| 11 | 12 | ||
| 13 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 12 | #include "sherpa-onnx/csrc/offline-tts.h" | 14 | #include "sherpa-onnx/csrc/offline-tts.h" |
| 13 | 15 | ||
| 14 | namespace sherpa_onnx { | 16 | namespace sherpa_onnx { |
| @@ -25,14 +27,29 @@ class OfflineTtsImpl { | @@ -25,14 +27,29 @@ class OfflineTtsImpl { | ||
| 25 | 27 | ||
| 26 | virtual GeneratedAudio Generate( | 28 | virtual GeneratedAudio Generate( |
| 27 | const std::string &text, int64_t sid = 0, float speed = 1.0, | 29 | const std::string &text, int64_t sid = 0, float speed = 1.0, |
| 28 | - GeneratedAudioCallback callback = nullptr) const = 0; | 30 | + GeneratedAudioCallback callback = nullptr) const { |
| 31 | + throw std::runtime_error( | ||
| 32 | + "OfflineTtsImpl backend does not support non-zero-shot Generate()"); | ||
| 33 | + } | ||
| 34 | + | ||
| 35 | + virtual GeneratedAudio Generate( | ||
| 36 | + const std::string &text, const std::string &prompt_text, | ||
| 37 | + const std::vector<float> &prompt_samples, int32_t sample_rate, | ||
| 38 | + float speed = 1.0, int32_t num_step = 4, | ||
| 39 | + GeneratedAudioCallback callback = nullptr) const { | ||
| 40 | + throw std::runtime_error( | ||
| 41 | + "OfflineTtsImpl backend does not support zero-shot Generate()"); | ||
| 42 | + } | ||
| 29 | 43 | ||
| 30 | // Return the sample rate of the generated audio | 44 | // Return the sample rate of the generated audio |
| 31 | virtual int32_t SampleRate() const = 0; | 45 | virtual int32_t SampleRate() const = 0; |
| 32 | 46 | ||
| 33 | // Number of supported speakers. | 47 | // Number of supported speakers. |
| 34 | // If it supports only a single speaker, then it return 0 or 1. | 48 | // If it supports only a single speaker, then it return 0 or 1. |
| 35 | - virtual int32_t NumSpeakers() const = 0; | 49 | + virtual int32_t NumSpeakers() const { |
| 50 | + throw std::runtime_error( | ||
| 51 | + "Zero-shot OfflineTts does not support NumSpeakers()"); | ||
| 52 | + } | ||
| 36 | 53 | ||
| 37 | std::vector<int64_t> AddBlank(const std::vector<int64_t> &x, | 54 | std::vector<int64_t> AddBlank(const std::vector<int64_t> &x, |
| 38 | int32_t blank_id = 0) const; | 55 | int32_t blank_id = 0) const; |
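The hunk above replaces the two previously pure-virtual methods with throwing defaults and adds a zero-shot Generate() overload that takes the prompt transcription, the prompt samples, and their sample rate. Through the Python bindings this corresponds to the call used in python-api-examples/offline-zeroshot-tts.py; a condensed sketch is shown below (model file names are placeholders, and soundfile replaces the example's wave-based reader):

    import sherpa_onnx
    import soundfile as sf

    config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            zipvoice=sherpa_onnx.OfflineTtsZipvoiceModelConfig(
                tokens="tokens.txt",                  # placeholder paths
                text_model="text_encoder.onnx",
                flow_matching_model="fm_decoder.onnx",
                data_dir="espeak-ng-data",
                pinyin_dict="pinyin.raw",
                vocoder="vocos_24khz.onnx",
            ),
            num_threads=2,
        ),
    )
    tts = sherpa_onnx.OfflineTts(config)

    # The prompt must be mono float32 samples; its transcription goes into prompt_text.
    prompt_samples, prompt_sr = sf.read("prompt.wav", dtype="float32")
    audio = tts.generate(
        "text to synthesize",       # text
        "transcription of prompt",  # prompt_text
        prompt_samples,
        prompt_sr,
        speed=1.0,
        num_steps=4,
    )
    sf.write("generated.wav", audio.samples, samplerate=audio.sample_rate)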
sherpa-onnx/csrc/offline-tts-matcha-impl.h
| @@ -140,8 +140,8 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | @@ -140,8 +140,8 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { | ||
| 140 | tn_list_.push_back( | 140 | tn_list_.push_back( |
| 141 | std::make_unique<kaldifst::TextNormalizer>(std::move(r))); | 141 | std::make_unique<kaldifst::TextNormalizer>(std::move(r))); |
| 142 | } // for (; !reader->Done(); reader->Next()) | 142 | } // for (; !reader->Done(); reader->Next()) |
| 143 | - } // for (const auto &f : files) | ||
| 144 | - } // if (!config.rule_fars.empty()) | 143 | + } // for (const auto &f : files) |
| 144 | + } // if (!config.rule_fars.empty()) | ||
| 145 | } | 145 | } |
| 146 | 146 | ||
| 147 | int32_t SampleRate() const override { | 147 | int32_t SampleRate() const override { |
sherpa-onnx/csrc/offline-tts-model-config.cc
| @@ -12,6 +12,7 @@ void OfflineTtsModelConfig::Register(ParseOptions *po) { | @@ -12,6 +12,7 @@ void OfflineTtsModelConfig::Register(ParseOptions *po) { | ||
| 12 | vits.Register(po); | 12 | vits.Register(po); |
| 13 | matcha.Register(po); | 13 | matcha.Register(po); |
| 14 | kokoro.Register(po); | 14 | kokoro.Register(po); |
| 15 | + zipvoice.Register(po); | ||
| 15 | kitten.Register(po); | 16 | kitten.Register(po); |
| 16 | 17 | ||
| 17 | po->Register("num-threads", &num_threads, | 18 | po->Register("num-threads", &num_threads, |
| @@ -38,6 +39,10 @@ bool OfflineTtsModelConfig::Validate() const { | @@ -38,6 +39,10 @@ bool OfflineTtsModelConfig::Validate() const { | ||
| 38 | return matcha.Validate(); | 39 | return matcha.Validate(); |
| 39 | } | 40 | } |
| 40 | 41 | ||
| 42 | + if (!zipvoice.flow_matching_model.empty()) { | ||
| 43 | + return zipvoice.Validate(); | ||
| 44 | + } | ||
| 45 | + | ||
| 41 | if (!kokoro.model.empty()) { | 46 | if (!kokoro.model.empty()) { |
| 42 | return kokoro.Validate(); | 47 | return kokoro.Validate(); |
| 43 | } | 48 | } |
| @@ -58,6 +63,7 @@ std::string OfflineTtsModelConfig::ToString() const { | @@ -58,6 +63,7 @@ std::string OfflineTtsModelConfig::ToString() const { | ||
| 58 | os << "vits=" << vits.ToString() << ", "; | 63 | os << "vits=" << vits.ToString() << ", "; |
| 59 | os << "matcha=" << matcha.ToString() << ", "; | 64 | os << "matcha=" << matcha.ToString() << ", "; |
| 60 | os << "kokoro=" << kokoro.ToString() << ", "; | 65 | os << "kokoro=" << kokoro.ToString() << ", "; |
| 66 | + os << "zipvoice=" << zipvoice.ToString() << ", "; | ||
| 61 | os << "kitten=" << kitten.ToString() << ", "; | 67 | os << "kitten=" << kitten.ToString() << ", "; |
| 62 | os << "num_threads=" << num_threads << ", "; | 68 | os << "num_threads=" << num_threads << ", "; |
| 63 | os << "debug=" << (debug ? "True" : "False") << ", "; | 69 | os << "debug=" << (debug ? "True" : "False") << ", "; |
sherpa-onnx/csrc/offline-tts-model-config.h
| @@ -11,6 +11,7 @@ | @@ -11,6 +11,7 @@ | ||
| 11 | #include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" | 11 | #include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" |
| 12 | #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" | 12 | #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" |
| 13 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" | 13 | #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" |
| 14 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model-config.h" | ||
| 14 | #include "sherpa-onnx/csrc/parse-options.h" | 15 | #include "sherpa-onnx/csrc/parse-options.h" |
| 15 | 16 | ||
| 16 | namespace sherpa_onnx { | 17 | namespace sherpa_onnx { |
| @@ -19,6 +20,7 @@ struct OfflineTtsModelConfig { | @@ -19,6 +20,7 @@ struct OfflineTtsModelConfig { | ||
| 19 | OfflineTtsVitsModelConfig vits; | 20 | OfflineTtsVitsModelConfig vits; |
| 20 | OfflineTtsMatchaModelConfig matcha; | 21 | OfflineTtsMatchaModelConfig matcha; |
| 21 | OfflineTtsKokoroModelConfig kokoro; | 22 | OfflineTtsKokoroModelConfig kokoro; |
| 23 | + OfflineTtsZipvoiceModelConfig zipvoice; | ||
| 22 | OfflineTtsKittenModelConfig kitten; | 24 | OfflineTtsKittenModelConfig kitten; |
| 23 | 25 | ||
| 24 | int32_t num_threads = 1; | 26 | int32_t num_threads = 1; |
| @@ -30,12 +32,14 @@ struct OfflineTtsModelConfig { | @@ -30,12 +32,14 @@ struct OfflineTtsModelConfig { | ||
| 30 | OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, | 32 | OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, |
| 31 | const OfflineTtsMatchaModelConfig &matcha, | 33 | const OfflineTtsMatchaModelConfig &matcha, |
| 32 | const OfflineTtsKokoroModelConfig &kokoro, | 34 | const OfflineTtsKokoroModelConfig &kokoro, |
| 35 | + const OfflineTtsZipvoiceModelConfig &zipvoice, | ||
| 33 | const OfflineTtsKittenModelConfig &kitten, | 36 | const OfflineTtsKittenModelConfig &kitten, |
| 34 | int32_t num_threads, bool debug, | 37 | int32_t num_threads, bool debug, |
| 35 | const std::string &provider) | 38 | const std::string &provider) |
| 36 | : vits(vits), | 39 | : vits(vits), |
| 37 | matcha(matcha), | 40 | matcha(matcha), |
| 38 | kokoro(kokoro), | 41 | kokoro(kokoro), |
| 42 | + zipvoice(zipvoice), | ||
| 39 | kitten(kitten), | 43 | kitten(kitten), |
| 40 | num_threads(num_threads), | 44 | num_threads(num_threads), |
| 41 | debug(debug), | 45 | debug(debug), |
sherpa-onnx/csrc/offline-tts-zipvoice-frontend-test.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-frontend-test.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-frontend.h" | ||
| 6 | + | ||
| 7 | +#include "espeak-ng/speak_lib.h" | ||
| 8 | +#include "gtest/gtest.h" | ||
| 9 | +#include "phoneme_ids.hpp" | ||
| 10 | +#include "phonemize.hpp" | ||
| 11 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 12 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 13 | + | ||
| 14 | +namespace sherpa_onnx { | ||
| 15 | + | ||
| 16 | +TEST(ZipVoiceFrontend, Case1) { | ||
| 17 | + std::string data_dir = "../zipvoice/espeak-ng-data"; | ||
| 18 | + if (!FileExists(data_dir + "/en_dict")) { | ||
| 19 | + SHERPA_ONNX_LOGE("%s/en_dict does not exist. Skipping test", | ||
| 20 | + data_dir.c_str()); | ||
| 21 | + return; | ||
| 22 | + } | ||
| 23 | + | ||
| 24 | + if (!FileExists(data_dir + "/phontab")) { | ||
| 25 | + SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test", | ||
| 26 | + data_dir.c_str()); | ||
| 27 | + return; | ||
| 28 | + } | ||
| 29 | + | ||
| 30 | + if (!FileExists(data_dir + "/phonindex")) { | ||
| 31 | + SHERPA_ONNX_LOGE("%s/phonindex does not exist. Skipping test", | ||
| 32 | + data_dir.c_str()); | ||
| 33 | + return; | ||
| 34 | + } | ||
| 35 | + | ||
| 36 | + if (!FileExists(data_dir + "/phondata")) { | ||
| 37 | + SHERPA_ONNX_LOGE("%s/phondata does not exist. Skipping test", | ||
| 38 | + data_dir.c_str()); | ||
| 39 | + return; | ||
| 40 | + } | ||
| 41 | + | ||
| 42 | + if (!FileExists(data_dir + "/intonations")) { | ||
| 43 | + SHERPA_ONNX_LOGE("%s/intonations does not exist. Skipping test", | ||
| 44 | + data_dir.c_str()); | ||
| 45 | + return; | ||
| 46 | + } | ||
| 47 | + | ||
| 48 | + std::string pinyin_dict = data_dir + "/../pinyin.dict"; | ||
| 49 | + if (!FileExists(pinyin_dict)) { | ||
| 50 | + SHERPA_ONNX_LOGE("%s does not exist. Skipping test", pinyin_dict.c_str()); | ||
| 51 | + return; | ||
| 52 | + } | ||
| 53 | + | ||
| 54 | + std::string tokens_file = data_dir + "/../tokens.txt"; | ||
| 55 | + if (!FileExists(tokens_file)) { | ||
| 56 | + SHERPA_ONNX_LOGE("%s does not exist. Skipping test", tokens_file.c_str()); | ||
| 57 | + return; | ||
| 58 | + } | ||
| 59 | + | ||
| 60 | + auto frontend = OfflineTtsZipvoiceFrontend( | ||
| 61 | + tokens_file, data_dir, pinyin_dict, | ||
| 62 | + OfflineTtsZipvoiceModelMetaData{.use_espeak = true, .use_pinyin = true}, | ||
| 63 | + true); | ||
| 64 | + | ||
| 65 | + std::string text = "how are you doing?"; | ||
| 66 | + std::vector<sherpa_onnx::TokenIDs> ans = | ||
| 67 | + frontend.ConvertTextToTokenIds(text, "en-us"); | ||
| 68 | + | ||
| 69 | + text = "这是第一句。这是第二句。"; | ||
| 70 | + ans = frontend.ConvertTextToTokenIds(text, "en-us"); | ||
| 71 | + | ||
| 72 | + text = | ||
| 73 | + "这是第一句。这是第二句。<pin1><yin2>测试 [S1]and hello " | ||
| 74 | + "world[S2]这是第三句。"; | ||
| 75 | + ans = frontend.ConvertTextToTokenIds(text, "en-us"); | ||
| 76 | +} | ||
| 77 | + | ||
| 78 | +} // namespace sherpa_onnx |
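The last test string exercises the inline markup understood by the frontend (implemented in offline-tts-zipvoice-frontend.cc below): <...> spans such as <pin1> force a specific pinyin pronunciation, [...] spans such as [S1] are passed through as upper-case tags, Chinese runs are converted with cppinyin, and Latin runs go through espeak-ng. A rough, illustration-only Python sketch of the part splitting and classification performed by ConvertTextToTokenIds() (the C++ code additionally merges adjacent parts of compatible types before tokenizing):

    import re

    # Mirrors the regex in ConvertTextToTokenIds(): match <...>, [...], or a single character.
    PART_PATTERN = re.compile(r"[<\[].*?[>\]]|.", re.DOTALL)

    def classify(part: str) -> str:
        if len(part) == 1 and part.isascii() and part.isalpha():
            return "en"
        if len(part) > 1 and part[0] == "<" and part[-1] == ">":
            return "pinyin"          # e.g. <ha3>
        if len(part) > 1 and part[0] == "[" and part[-1] == "]":
            return "tag"             # e.g. [S1]
        if any("\u4e00" <= ch <= "\u9fff" for ch in part):  # simplified CJK test
            return "zh"
        return "other"               # numbers, punctuation, spaces, ...

    text = "这是第一句。<pin1><yin2>测试 [S1]and hello world[S2]"
    print([(p, classify(p)) for p in PART_PATTERN.findall(text)])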
sherpa-onnx/csrc/offline-tts-zipvoice-frontend.cc
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-frontend.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <algorithm> | ||
| 6 | +#include <cctype> | ||
| 7 | +#include <codecvt> | ||
| 8 | +#include <fstream> | ||
| 9 | +#include <locale> | ||
| 10 | +#include <regex> // NOLINT | ||
| 11 | +#include <sstream> | ||
| 12 | +#include <strstream> | ||
| 13 | +#include <utility> | ||
| 14 | + | ||
| 15 | +#if __ANDROID_API__ >= 9 | ||
| 16 | +#include "android/asset_manager.h" | ||
| 17 | +#include "android/asset_manager_jni.h" | ||
| 18 | +#endif | ||
| 19 | + | ||
| 20 | +#if __OHOS__ | ||
| 21 | +#include "rawfile/raw_file_manager.h" | ||
| 22 | +#endif | ||
| 23 | + | ||
| 24 | +#include "cppinyin/csrc/cppinyin.h" | ||
| 25 | +#include "espeak-ng/speak_lib.h" | ||
| 26 | +#include "phoneme_ids.hpp" | ||
| 27 | +#include "phonemize.hpp" | ||
| 28 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 29 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 30 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-frontend.h" | ||
| 31 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 32 | + | ||
| 33 | +namespace sherpa_onnx { | ||
| 34 | + | ||
| 35 | +void CallPhonemizeEspeak(const std::string &text, | ||
| 36 | + piper::eSpeakPhonemeConfig &config, // NOLINT | ||
| 37 | + std::vector<std::vector<piper::Phoneme>> *phonemes); | ||
| 38 | + | ||
| 39 | +static std::unordered_map<std::string, int32_t> ReadTokens(std::istream &is) { | ||
| 40 | + std::unordered_map<std::string, int32_t> token2id; | ||
| 41 | + | ||
| 42 | + std::string line; | ||
| 43 | + std::string sym; | ||
| 44 | + int32_t id = 0; | ||
| 45 | + while (std::getline(is, line)) { | ||
| 46 | + std::istringstream iss(line); | ||
| 47 | + iss >> sym; | ||
| 48 | + if (iss.eof()) { | ||
| 49 | + id = atoi(sym.c_str()); | ||
| 50 | + sym = " "; | ||
| 51 | + } else { | ||
| 52 | + iss >> id; | ||
| 53 | + } | ||
| 54 | + // eat the trailing \r\n on windows | ||
| 55 | + iss >> std::ws; | ||
| 56 | + if (!iss.eof()) { | ||
| 57 | + SHERPA_ONNX_LOGE("Error when reading tokens: %s", line.c_str()); | ||
| 58 | + exit(-1); | ||
| 59 | + } | ||
| 60 | + | ||
| 61 | + if (token2id.count(sym)) { | ||
| 62 | + SHERPA_ONNX_LOGE("Duplicated token %s. Line %s. Existing ID: %d", | ||
| 63 | + sym.c_str(), line.c_str(), token2id.at(sym)); | ||
| 64 | + exit(-1); | ||
| 65 | + } | ||
| 66 | + token2id.insert({sym, id}); | ||
| 67 | + } | ||
| 68 | + return token2id; | ||
| 69 | +} | ||
| 70 | + | ||
| 71 | +static std::string MapPunctuations( | ||
| 72 | + const std::string &text, | ||
| 73 | + const std::unordered_map<std::string, std::string> &punct_map) { | ||
| 74 | + std::string result = text; | ||
| 75 | + for (const auto &kv : punct_map) { | ||
| 76 | + // Replace all occurrences of kv.first with kv.second | ||
| 77 | + size_t pos = 0; | ||
| 78 | + while ((pos = result.find(kv.first, pos)) != std::string::npos) { | ||
| 79 | + result.replace(pos, kv.first.length(), kv.second); | ||
| 80 | + pos += kv.second.length(); | ||
| 81 | + } | ||
| 82 | + } | ||
| 83 | + return result; | ||
| 84 | +} | ||
| 85 | + | ||
| 86 | +static void ProcessPinyin( | ||
| 87 | + const std::string &pinyin, const cppinyin::PinyinEncoder *pinyin_encoder, | ||
| 88 | + const std::unordered_map<std::string, int32_t> &token2id, | ||
| 89 | + std::vector<int64_t> *tokens_ids, std::vector<std::string> *tokens) { | ||
| 90 | + auto initial = pinyin_encoder->ToInitial(pinyin); | ||
| 91 | + if (!initial.empty()) { | ||
| 92 | + // append '0' to fix the conflict with espeak token | ||
| 93 | + initial = initial + "0"; | ||
| 94 | + if (token2id.count(initial)) { | ||
| 95 | + tokens_ids->push_back(token2id.at(initial)); | ||
| 96 | + tokens->push_back(initial); | ||
| 97 | + } else { | ||
| 98 | + SHERPA_ONNX_LOGE("Skip unknown initial %s", initial.c_str()); | ||
| 99 | + } | ||
| 100 | + } | ||
| 101 | + auto final_t = pinyin_encoder->ToFinal(pinyin); | ||
| 102 | + if (!final_t.empty()) { | ||
| 103 | + if (!std::isdigit(final_t.back())) { | ||
| 104 | + final_t = final_t + "5"; // use 5 for neutral tone | ||
| 105 | + } | ||
| 106 | + if (token2id.count(final_t)) { | ||
| 107 | + tokens_ids->push_back(token2id.at(final_t)); | ||
| 108 | + tokens->push_back(final_t); | ||
| 109 | + } else { | ||
| 110 | + SHERPA_ONNX_LOGE("Skip unknown final %s", final_t.c_str()); | ||
| 111 | + } | ||
| 112 | + } | ||
| 113 | +} | ||
| 114 | + | ||
| 115 | +static void TokenizeZh(const std::string &words, | ||
| 116 | + const cppinyin::PinyinEncoder *pinyin_encoder, | ||
| 117 | + const std::unordered_map<std::string, int32_t> &token2id, | ||
| 118 | + std::vector<int64_t> *token_ids, | ||
| 119 | + std::vector<std::string> *tokens) { | ||
| 120 | + std::vector<std::string> pinyins; | ||
| 121 | + pinyin_encoder->Encode(words, &pinyins, "number" /*tone*/, false /*partial*/); | ||
| 122 | + for (const auto &pinyin : pinyins) { | ||
| 123 | + if (pinyin_encoder->ValidPinyin(pinyin, "number" /*tone*/)) { | ||
| 124 | + ProcessPinyin(pinyin, pinyin_encoder, token2id, token_ids, tokens); | ||
| 125 | + } else { | ||
| 126 | + auto wstext = ToWideString(pinyin); | ||
| 127 | + for (auto &wc : wstext) { | ||
| 128 | + auto c = ToString(std::wstring(1, wc)); | ||
| 129 | + if (token2id.count(c)) { | ||
| 130 | + token_ids->push_back(token2id.at(c)); | ||
| 131 | + tokens->push_back(c); | ||
| 132 | + } else { | ||
| 133 | + SHERPA_ONNX_LOGE("Skip unknown character %s", c.c_str()); | ||
| 134 | + } | ||
| 135 | + } | ||
| 136 | + } | ||
| 137 | + } | ||
| 138 | +} | ||
| 139 | + | ||
| 140 | +static void TokenizeEn(const std::string &words, | ||
| 141 | + const std::unordered_map<std::string, int32_t> &token2id, | ||
| 142 | + const std::string &voice, | ||
| 143 | + std::vector<int64_t> *token_ids, | ||
| 144 | + std::vector<std::string> *tokens) { | ||
| 145 | + piper::eSpeakPhonemeConfig config; | ||
| 146 | + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices | ||
| 147 | + // to list available voices | ||
| 148 | + config.voice = voice; // e.g., voice is en-us | ||
| 149 | + | ||
| 150 | + std::vector<std::vector<piper::Phoneme>> phonemes; | ||
| 151 | + | ||
| 152 | + CallPhonemizeEspeak(words, config, &phonemes); | ||
| 153 | + | ||
| 154 | + for (const auto &p : phonemes) { | ||
| 155 | + for (const auto &ph : p) { | ||
| 156 | + auto token = Utf32ToUtf8(std::u32string(1, ph)); | ||
| 157 | + if (token2id.count(token)) { | ||
| 158 | + token_ids->push_back(token2id.at(token)); | ||
| 159 | + tokens->push_back(token); | ||
| 160 | + } else { | ||
| 161 | + SHERPA_ONNX_LOGE("Skip unknown phoneme %s", token.c_str()); | ||
| 162 | + } | ||
| 163 | + } | ||
| 164 | + } | ||
| 165 | +} | ||
| 166 | + | ||
| 167 | +static void TokenizeTag( | ||
| 168 | + const std::string &words, | ||
| 169 | + const std::unordered_map<std::string, int32_t> &token2id, | ||
| 170 | + std::vector<int64_t> *tokens_ids, std::vector<std::string> *tokens) { | ||
| 171 | + // in zipvoice tags are all in upper case | ||
| 172 | + std::string tag = ToUpperAscii(words); | ||
| 173 | + if (token2id.count(tag)) { | ||
| 174 | + tokens_ids->push_back(token2id.at(tag)); | ||
| 175 | + tokens->push_back(tag); | ||
| 176 | + } else { | ||
| 177 | + SHERPA_ONNX_LOGE("Skip unknown tag %s", tag.c_str()); | ||
| 178 | + } | ||
| 179 | +} | ||
| 180 | + | ||
| 181 | +static void TokenizePinyin( | ||
| 182 | + const std::string &words, const cppinyin::PinyinEncoder *pinyin_encoder, | ||
| 183 | + const std::unordered_map<std::string, int32_t> &token2id, | ||
| 184 | + std::vector<int64_t> *tokens_ids, std::vector<std::string> *tokens) { | ||
| 185 | + // words are in the form of <ha3>, <ha4> | ||
| 186 | + std::string pinyin = words.substr(1, words.size() - 2); | ||
| 187 | + if (!pinyin.empty()) { | ||
| 188 | + if (pinyin[pinyin.size() - 1] == '5') { | ||
| 189 | + pinyin = pinyin.substr(0, pinyin.size() - 1); // remove the tone | ||
| 190 | + } | ||
| 191 | + if (pinyin_encoder->ValidPinyin(pinyin, "number" /*tone*/)) { | ||
| 192 | + ProcessPinyin(pinyin, pinyin_encoder, token2id, tokens_ids, tokens); | ||
| 193 | + } else { | ||
| 194 | + SHERPA_ONNX_LOGE("Invalid pinyin %s", pinyin.c_str()); | ||
| 195 | + } | ||
| 196 | + } | ||
| 197 | +} | ||
| 198 | + | ||
| 199 | +OfflineTtsZipvoiceFrontend::OfflineTtsZipvoiceFrontend( | ||
| 200 | + const std::string &tokens, const std::string &data_dir, | ||
| 201 | + const std::string &pinyin_dict, | ||
| 202 | + const OfflineTtsZipvoiceModelMetaData &meta_data, bool debug) | ||
| 203 | + : debug_(debug), meta_data_(meta_data) { | ||
| 204 | + std::ifstream is(tokens); | ||
| 205 | + token2id_ = ReadTokens(is); | ||
| 206 | + if (meta_data_.use_pinyin) { | ||
| 207 | + pinyin_encoder_ = std::make_unique<cppinyin::PinyinEncoder>(pinyin_dict); | ||
| 208 | + } else { | ||
| 209 | + pinyin_encoder_ = nullptr; | ||
| 210 | + } | ||
| 211 | + if (meta_data_.use_espeak) { | ||
| 212 | + // We should copy the directory of espeak-ng-data from the asset to | ||
| 213 | + // some internal or external storage and then pass the directory to | ||
| 214 | + // data_dir. | ||
| 215 | + InitEspeak(data_dir); | ||
| 216 | + } | ||
| 217 | +} | ||
| 218 | + | ||
| 219 | +template <typename Manager> | ||
| 220 | +OfflineTtsZipvoiceFrontend::OfflineTtsZipvoiceFrontend( | ||
| 221 | + Manager *mgr, const std::string &tokens, const std::string &data_dir, | ||
| 222 | + const std::string &pinyin_dict, | ||
| 223 | + const OfflineTtsZipvoiceModelMetaData &meta_data, bool debug) | ||
| 224 | + : debug_(debug), meta_data_(meta_data) { | ||
| 225 | + auto buf = ReadFile(mgr, tokens); | ||
| 226 | + std::istrstream is(buf.data(), buf.size()); | ||
| 227 | + token2id_ = ReadTokens(is); | ||
| 228 | + if (meta_data_.use_pinyin) { | ||
| 229 | + auto buf = ReadFile(mgr, pinyin_dict); | ||
| 230 | + std::istringstream iss(std::string(buf.begin(), buf.end())); | ||
| 231 | + pinyin_encoder_ = std::make_unique<cppinyin::PinyinEncoder>(iss); | ||
| 232 | + } else { | ||
| 233 | + pinyin_encoder_ = nullptr; | ||
| 234 | + } | ||
| 235 | + if (meta_data_.use_espeak) { | ||
| 236 | + // We should copy the directory of espeak-ng-data from the asset to | ||
| 237 | + // some internal or external storage and then pass the directory to | ||
| 238 | + // data_dir. | ||
| 239 | + InitEspeak(data_dir); | ||
| 240 | + } | ||
| 241 | +} | ||
| 242 | + | ||
| 243 | +std::vector<TokenIDs> OfflineTtsZipvoiceFrontend::ConvertTextToTokenIds( | ||
| 244 | + const std::string &_text, const std::string &voice) const { | ||
| 245 | + std::string text = _text; | ||
| 246 | + if (meta_data_.use_espeak) { | ||
| 247 | + text = ToLowerAscii(_text); | ||
| 248 | + } | ||
| 249 | + | ||
| 250 | + text = MapPunctuations(text, punct_map_); | ||
| 251 | + | ||
| 252 | + auto wstext = ToWideString(text); | ||
| 253 | + | ||
| 254 | + std::vector<std::string> parts; | ||
| 255 | + // Match <...>, [...], or single character | ||
| 256 | + std::wregex part_pattern(LR"([<\[].*?[>\]]|.)"); | ||
| 257 | + auto words_begin = | ||
| 258 | + std::wsregex_iterator(wstext.begin(), wstext.end(), part_pattern); | ||
| 259 | + auto words_end = std::wsregex_iterator(); | ||
| 260 | + for (std::wsregex_iterator i = words_begin; i != words_end; ++i) { | ||
| 261 | + parts.push_back(ToString(i->str())); | ||
| 262 | + } | ||
| 263 | + | ||
| 264 | + // types are en, zh, tag, pinyin, other | ||
| 265 | + // tag is [...] | ||
| 266 | + // pinyin is <...> | ||
| 267 | + // other is any other text that does not match the above, typically numbers | ||
| 268 | + // and punctuation | ||
| 269 | + std::vector<std::string> types; | ||
| 270 | + for (auto &word : parts) { | ||
| 271 | + if (word.size() == 1 && std::isalpha(word[0])) { | ||
| 272 | + // single character, e.g., 'a', 'b', 'c' | ||
| 273 | + types.push_back("en"); | ||
| 274 | + } else if (word.size() > 1 && word[0] == '<' && word.back() == '>') { | ||
| 275 | + // e.g., <ha3>, <ha4> | ||
| 276 | + types.push_back("pinyin"); | ||
| 277 | + } else if (word.size() > 1 && word[0] == '[' && word.back() == ']') { | ||
| 278 | + types.push_back("tag"); | ||
| 279 | + } else if (ContainsCJK(word)) { // word contains at least one CJK character | ||
| 280 | + types.push_back("zh"); | ||
| 281 | + } else { | ||
| 282 | + types.push_back("other"); | ||
| 283 | + } | ||
| 284 | + } | ||
| 285 | + | ||
| 286 | + std::vector<std::pair<std::string, std::string>> parts_with_types; | ||
| 287 | + std::ostringstream oss; | ||
| 288 | + std::string t_lang; | ||
| 289 | + oss.str(""); | ||
| 290 | + std::ostringstream debug_oss; | ||
| 291 | + if (debug_) { | ||
| 292 | + debug_oss << "Text : " << _text << ", Parts with types: \n"; | ||
| 293 | + } | ||
| 294 | + for (int32_t i = 0; i < types.size(); ++i) { | ||
| 295 | + if (i == 0) { | ||
| 296 | + oss << parts[i]; | ||
| 297 | + t_lang = types[i]; | ||
| 298 | + } else { | ||
| 299 | + if (t_lang == "other" && (types[i] != "tag" && types[i] != "pinyin")) { | ||
| 300 | + // combine into current type if the previous part is "other" | ||
| 301 | + // do not combine with "tag" or "pinyin" | ||
| 302 | + oss << parts[i]; | ||
| 303 | + t_lang = types[i]; | ||
| 304 | + } else { | ||
| 305 | + if ((t_lang == types[i] || types[i] == "other") && t_lang != "pinyin" && | ||
| 306 | + t_lang != "tag") { | ||
| 307 | + // same language or other, continue | ||
| 308 | + // do not combine other into "pinyin" or "tag" | ||
| 309 | + oss << parts[i]; | ||
| 310 | + } else { | ||
| 311 | + // different language, start a new sentence | ||
| 312 | + std::string part = oss.str(); | ||
| 313 | + oss.str(""); | ||
| 314 | + parts_with_types.emplace_back(part, t_lang); | ||
| 315 | + if (debug_) { | ||
| 316 | + debug_oss << "(" << part << ", " << t_lang << "),"; | ||
| 317 | + } | ||
| 318 | + oss << parts[i]; | ||
| 319 | + t_lang = types[i]; | ||
| 320 | + } | ||
| 321 | + } | ||
| 322 | + } | ||
| 323 | + } | ||
| 324 | + | ||
| 325 | + std::string part = oss.str(); | ||
| 326 | + oss.str(""); | ||
| 327 | + parts_with_types.emplace_back(part, t_lang); | ||
| 328 | + if (debug_) { | ||
| 329 | + debug_oss << "(" << part << ", " << t_lang << ")\n"; | ||
| 330 | + SHERPA_ONNX_LOGE("%s", debug_oss.str().c_str()); | ||
| 331 | + debug_oss.str(""); | ||
| 332 | + } | ||
| 333 | + | ||
| 334 | + std::vector<int64_t> token_ids; | ||
| 335 | + std::vector<std::string> tokens; // for debugging | ||
| 336 | + for (const auto &pt : parts_with_types) { | ||
| 337 | + if (pt.second == "zh") { | ||
| 338 | + TokenizeZh(pt.first, pinyin_encoder_.get(), token2id_, &token_ids, | ||
| 339 | + &tokens); | ||
| 340 | + } else if (pt.second == "en") { | ||
| 341 | + TokenizeEn(pt.first, token2id_, voice, &token_ids, &tokens); | ||
| 342 | + } else if (pt.second == "pinyin") { | ||
| 343 | + TokenizePinyin(pt.first, pinyin_encoder_.get(), token2id_, &token_ids, | ||
| 344 | + &tokens); | ||
| 345 | + } else if (pt.second == "tag") { | ||
| 346 | + TokenizeTag(pt.first, token2id_, &token_ids, &tokens); | ||
| 347 | + } else { | ||
| 348 | + SHERPA_ONNX_LOGE("Unexpected type: %s", pt.second.c_str()); | ||
| 349 | + exit(-1); | ||
| 350 | + } | ||
| 351 | + } | ||
| 352 | + if (debug_) { | ||
| 353 | + debug_oss << "Tokens and IDs: \n"; | ||
| 354 | + for (int32_t i = 0; i < tokens.size(); i++) { | ||
| 355 | + debug_oss << "(" << tokens[i] << ", " << token_ids[i] << "),"; | ||
| 356 | + } | ||
| 357 | + debug_oss << "\n"; | ||
| 358 | + SHERPA_ONNX_LOGE("%s", debug_oss.str().c_str()); | ||
| 359 | + } | ||
| 360 | + | ||
| 361 | + std::vector<TokenIDs> ans; | ||
| 362 | + ans.push_back(TokenIDs(std::move(token_ids))); | ||
| 363 | + return ans; | ||
| 364 | +} | ||
| 365 | + | ||
| 366 | +#if __ANDROID_API__ >= 9 | ||
| 367 | +template OfflineTtsZipvoiceFrontend::OfflineTtsZipvoiceFrontend( | ||
| 368 | + AAssetManager *mgr, const std::string &tokens, const std::string &data_dir, | ||
| 369 | + const std::string &pinyin_dict, | ||
| 370 | + const OfflineTtsZipvoiceModelMetaData &meta_data, bool debug); | ||
| 371 | + | ||
| 372 | +#endif | ||
| 373 | + | ||
| 374 | +#if __OHOS__ | ||
| 375 | +template OfflineTtsZipvoiceFrontend::OfflineTtsZipvoiceFrontend( | ||
| 376 | + NativeResourceManager *mgr, const std::string &tokens, | ||
| 377 | + const std::string &data_dir, const std::string &pinyin_dict, | ||
| 378 | + const OfflineTtsZipvoiceModelMetaData &meta_data, bool debug); | ||
| 379 | + | ||
| 380 | +#endif | ||
| 381 | + | ||
| 382 | +} // namespace sherpa_onnx |
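For reference, ReadTokens() above implies the tokens.txt layout expected by --zipvoice-tokens: one "symbol id" pair per line, with a line that carries only an ID denoting the space symbol. A hedged Python equivalent of the parser, shown only to document the format:

    def read_tokens(path: str) -> dict:
        token2id = {}
        with open(path, encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if len(fields) == 1:
                    sym, idx = " ", int(fields[0])  # ID only: the symbol is a space
                elif len(fields) == 2:
                    sym, idx = fields[0], int(fields[1])
                else:
                    raise ValueError(f"Bad line in tokens.txt: {line!r}")
                if sym in token2id:
                    raise ValueError(f"Duplicated token: {sym!r}")
                token2id[sym] = idx
        return token2id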
sherpa-onnx/csrc/offline-tts-zipvoice-frontend.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-frontend.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_FRONTEND_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_FRONTEND_H_ | ||
| 7 | +#include <cstdint> | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | +#include <unordered_map> | ||
| 11 | +#include <vector> | ||
| 12 | + | ||
| 13 | +#include "cppinyin/csrc/cppinyin.h" | ||
| 14 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 15 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model-meta-data.h" | ||
| 16 | + | ||
| 17 | +namespace sherpa_onnx { | ||
| 18 | + | ||
| 19 | +class OfflineTtsZipvoiceFrontend : public OfflineTtsFrontend { | ||
| 20 | + public: | ||
| 21 | + OfflineTtsZipvoiceFrontend(const std::string &tokens, | ||
| 22 | + const std::string &data_dir, | ||
| 23 | + const std::string &pinyin_dict, | ||
| 24 | + const OfflineTtsZipvoiceModelMetaData &meta_data, | ||
| 25 | + bool debug = false); | ||
| 26 | + | ||
| 27 | + template <typename Manager> | ||
| 28 | + OfflineTtsZipvoiceFrontend(Manager *mgr, const std::string &tokens, | ||
| 29 | + const std::string &data_dir, | ||
| 30 | + const std::string &pinyin_dict, | ||
| 31 | + const OfflineTtsZipvoiceModelMetaData &meta_data, | ||
| 32 | + bool debug = false); | ||
| 33 | + | ||
| 34 | + /** Convert a string to token IDs. | ||
| 35 | + * | ||
| 36 | + * @param text The input text. | ||
| 37 | + * Example 1: "This is the first sample sentence; this is the | ||
| 38 | + * second one." Example 2: "这是第一句。这是第二句。" | ||
| 39 | + * @param voice Optional. It is for espeak-ng. | ||
| 40 | + * | ||
| 41 | + * @return Return a vector-of-vector of token IDs. Each subvector contains | ||
| 42 | + * a sentence that can be processed independently. | ||
| 43 | + * If a frontend does not support splitting the text into | ||
| 44 | + * sentences, the resulting vector contains only one subvector. | ||
| 45 | + */ | ||
| 46 | + std::vector<TokenIDs> ConvertTextToTokenIds( | ||
| 47 | + const std::string &text, const std::string &voice = "") const override; | ||
| 48 | + | ||
| 49 | + private: | ||
| 50 | + bool debug_ = false; | ||
| 51 | + std::unordered_map<std::string, int32_t> token2id_; | ||
| 52 | + const std::unordered_map<std::string, std::string> punct_map_ = { | ||
| 53 | + {",", ","}, {"。", "."}, {"!", "!"}, {"?", "?"}, {";", ";"}, | ||
| 54 | + {":", ":"}, {"、", ","}, {"‘", "'"}, {"“", "\""}, {"”", "\""}, | ||
| 55 | + {"’", "'"}, {"⋯", "…"}, {"···", "…"}, {"・・・", "…"}, {"...", "…"}}; | ||
| 56 | + OfflineTtsZipvoiceModelMetaData meta_data_; | ||
| 57 | + std::unique_ptr<cppinyin::PinyinEncoder> pinyin_encoder_; | ||
| 58 | +}; | ||
| 59 | + | ||
| 60 | +} // namespace sherpa_onnx | ||
| 61 | + | ||
| 62 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_FRONTEND_H_ |
sherpa-onnx/csrc/offline-tts-zipvoice-impl.h
0 → 100644
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-impl.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_IMPL_H_ | ||
| 5 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_IMPL_H_ | ||
| 6 | + | ||
| 7 | +#include <cmath> | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | +#include <strstream> | ||
| 11 | +#include <utility> | ||
| 12 | +#include <vector> | ||
| 13 | + | ||
| 14 | +#include "kaldi-native-fbank/csrc/mel-computations.h" | ||
| 15 | +#include "kaldi-native-fbank/csrc/stft.h" | ||
| 16 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 17 | +#include "sherpa-onnx/csrc/offline-tts-frontend.h" | ||
| 18 | +#include "sherpa-onnx/csrc/offline-tts-impl.h" | ||
| 19 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-frontend.h" | ||
| 20 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model-config.h" | ||
| 21 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model.h" | ||
| 22 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 23 | +#include "sherpa-onnx/csrc/resample.h" | ||
| 24 | +#include "sherpa-onnx/csrc/vocoder.h" | ||
| 25 | + | ||
| 26 | +namespace sherpa_onnx { | ||
| 27 | + | ||
| 28 | +class OfflineTtsZipvoiceImpl : public OfflineTtsImpl { | ||
| 29 | + public: | ||
| 30 | + explicit OfflineTtsZipvoiceImpl(const OfflineTtsConfig &config) | ||
| 31 | + : config_(config), | ||
| 32 | + model_(std::make_unique<OfflineTtsZipvoiceModel>(config.model)), | ||
| 33 | + vocoder_(Vocoder::Create(config.model)) { | ||
| 34 | + InitFrontend(); | ||
| 35 | + } | ||
| 36 | + | ||
| 37 | + template <typename Manager> | ||
| 38 | + OfflineTtsZipvoiceImpl(Manager *mgr, const OfflineTtsConfig &config) | ||
| 39 | + : config_(config), | ||
| 40 | + model_(std::make_unique<OfflineTtsZipvoiceModel>(mgr, config.model)), | ||
| 41 | + vocoder_(Vocoder::Create(mgr, config.model)) { | ||
| 42 | + InitFrontend(mgr); | ||
| 43 | + } | ||
| 44 | + | ||
| 45 | + int32_t SampleRate() const override { | ||
| 46 | + return model_->GetMetaData().sample_rate; | ||
| 47 | + } | ||
| 48 | + | ||
| 49 | + GeneratedAudio Generate( | ||
| 50 | + const std::string &text, const std::string &prompt_text, | ||
| 51 | + const std::vector<float> &prompt_samples, int32_t sample_rate, | ||
| 52 | + float speed, int32_t num_steps, | ||
| 53 | + GeneratedAudioCallback callback = nullptr) const override { | ||
| 54 | + std::vector<TokenIDs> text_token_ids = | ||
| 55 | + frontend_->ConvertTextToTokenIds(text); | ||
| 56 | + | ||
| 57 | + std::vector<TokenIDs> prompt_token_ids = | ||
| 58 | + frontend_->ConvertTextToTokenIds(prompt_text); | ||
| 59 | + | ||
| 60 | + if (text_token_ids.empty() || | ||
| 61 | + (text_token_ids.size() == 1 && text_token_ids[0].tokens.empty())) { | ||
| 62 | +#if __OHOS__ | ||
| 63 | + SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs", | ||
| 64 | + text.c_str()); | ||
| 65 | +#else | ||
| 66 | + SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str()); | ||
| 67 | +#endif | ||
| 68 | + return {}; | ||
| 69 | + } | ||
| 70 | + | ||
| 71 | + if (prompt_token_ids.empty() || | ||
| 72 | + (prompt_token_ids.size() == 1 && prompt_token_ids[0].tokens.empty())) { | ||
| 73 | +#if __OHOS__ | ||
| 74 | + SHERPA_ONNX_LOGE( | ||
| 75 | + "Failed to convert prompt text '%{public}s' to token IDs", | ||
| 76 | + prompt_text.c_str()); | ||
| 77 | +#else | ||
| 78 | + SHERPA_ONNX_LOGE("Failed to convert prompt text '%s' to token IDs", | ||
| 79 | + prompt_text.c_str()); | ||
| 80 | +#endif | ||
| 81 | + return {}; | ||
| 82 | + } | ||
| 83 | + | ||
| 84 | + // we assume batch size is 1 | ||
| 85 | + std::vector<int64_t> tokens = text_token_ids[0].tokens; | ||
| 86 | + std::vector<int64_t> prompt_tokens = prompt_token_ids[0].tokens; | ||
| 87 | + | ||
| 88 | + return Process(tokens, prompt_tokens, prompt_samples, sample_rate, speed, | ||
| 89 | + num_steps); | ||
| 90 | + } | ||
| 91 | + | ||
| 92 | + private: | ||
| 93 | + template <typename Manager> | ||
| 94 | + void InitFrontend(Manager *mgr) { | ||
| 95 | + const auto &meta_data = model_->GetMetaData(); | ||
| 96 | + frontend_ = std::make_unique<OfflineTtsZipvoiceFrontend>( | ||
| 97 | + mgr, config_.model.zipvoice.tokens, config_.model.zipvoice.data_dir, | ||
| 98 | + config_.model.zipvoice.pinyin_dict, meta_data, config_.model.debug); | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + void InitFrontend() { | ||
| 102 | + const auto &meta_data = model_->GetMetaData(); | ||
| 103 | + | ||
| 104 | + if (meta_data.use_pinyin && config_.model.zipvoice.pinyin_dict.empty()) { | ||
| 105 | + SHERPA_ONNX_LOGE( | ||
| 106 | + "Please provide --zipvoice-pinyin-dict for converting Chinese into " | ||
| 107 | + "pinyin."); | ||
| 108 | + exit(-1); | ||
| 109 | + } | ||
| 110 | + if (meta_data.use_espeak && config_.model.zipvoice.data_dir.empty()) { | ||
| 111 | + SHERPA_ONNX_LOGE("Please provide --zipvoice-data-dir for espeak-ng."); | ||
| 112 | + exit(-1); | ||
| 113 | + } | ||
| 114 | + frontend_ = std::make_unique<OfflineTtsZipvoiceFrontend>( | ||
| 115 | + config_.model.zipvoice.tokens, config_.model.zipvoice.data_dir, | ||
| 116 | + config_.model.zipvoice.pinyin_dict, meta_data, config_.model.debug); | ||
| 117 | + } | ||
| 118 | + | ||
| 119 | + std::vector<int32_t> ComputeMelSpectrogram( | ||
| 120 | + const std::vector<float> &_samples, int32_t sample_rate, | ||
| 121 | + std::vector<float> *prompt_features) const { | ||
| 122 | + const auto &meta = model_->GetMetaData(); | ||
| 123 | + if (sample_rate != meta.sample_rate) { | ||
| 124 | + SHERPA_ONNX_LOGE( | ||
| 125 | + "Creating a resampler:\n" | ||
| 126 | + " in_sample_rate: %d\n" | ||
| 127 | + " output_sample_rate: %d\n", | ||
| 128 | + sample_rate, static_cast<int32_t>(meta.sample_rate)); | ||
| 129 | + | ||
| 130 | + float min_freq = std::min<int32_t>(sample_rate, meta.sample_rate); | ||
| 131 | + float lowpass_cutoff = 0.99 * 0.5 * min_freq; | ||
| 132 | + | ||
| 133 | + int32_t lowpass_filter_width = 6; | ||
| 134 | + auto resampler = std::make_unique<LinearResample>( | ||
| 135 | + sample_rate, meta.sample_rate, lowpass_cutoff, lowpass_filter_width); | ||
| 136 | + std::vector<float> samples; | ||
| 137 | + resampler->Resample(_samples.data(), _samples.size(), true, &samples); | ||
| 138 | + return ComputeMelSpectrogram(samples, prompt_features); | ||
| 139 | + } else { | ||
| 140 | + // Use the original samples if the sample rate matches | ||
| 141 | + return ComputeMelSpectrogram(_samples, prompt_features); | ||
| 142 | + } | ||
| 143 | + } | ||
| 144 | + | ||
| 145 | + std::vector<int32_t> ComputeMelSpectrogram( | ||
| 146 | + const std::vector<float> &samples, | ||
| 147 | + std::vector<float> *prompt_features) const { | ||
| 148 | + const auto &meta = model_->GetMetaData(); | ||
| 149 | + | ||
| 150 | + int32_t sample_rate = meta.sample_rate; | ||
| 151 | + int32_t n_fft = meta.n_fft; | ||
| 152 | + int32_t hop_length = meta.hop_length; | ||
| 153 | + int32_t win_length = meta.window_length; | ||
| 154 | + int32_t num_mels = meta.num_mels; | ||
| 155 | + | ||
| 156 | + knf::StftConfig stft_config; | ||
| 157 | + stft_config.n_fft = n_fft; | ||
| 158 | + stft_config.hop_length = hop_length; | ||
| 159 | + stft_config.win_length = win_length; | ||
| 160 | + stft_config.window_type = "hann"; | ||
| 161 | + stft_config.center = true; | ||
| 162 | + | ||
| 163 | + knf::Stft stft(stft_config); | ||
| 164 | + auto stft_result = stft.Compute(samples.data(), samples.size()); | ||
| 165 | + int32_t num_frames = stft_result.num_frames; | ||
| 166 | + int32_t fft_bins = n_fft / 2 + 1; | ||
| 167 | + | ||
| 168 | + knf::FrameExtractionOptions frame_opts; | ||
| 169 | + frame_opts.samp_freq = sample_rate; | ||
| 170 | + frame_opts.frame_length_ms = win_length * 1000.0f / sample_rate; | ||
| 171 | + frame_opts.frame_shift_ms = hop_length * 1000.0f / sample_rate; | ||
| 172 | + frame_opts.window_type = "hanning"; | ||
| 173 | + | ||
| 174 | + knf::MelBanksOptions mel_opts; | ||
| 175 | + mel_opts.num_bins = num_mels; | ||
| 176 | + mel_opts.low_freq = 0; | ||
| 177 | + mel_opts.high_freq = sample_rate / 2; | ||
| 178 | + mel_opts.is_librosa = true; | ||
| 179 | + mel_opts.use_slaney_mel_scale = false; | ||
| 180 | + mel_opts.norm = ""; | ||
| 181 | + | ||
| 182 | + knf::MelBanks mel_banks(mel_opts, frame_opts, 1.0f); | ||
| 183 | + | ||
| 184 | + prompt_features->clear(); | ||
| 185 | + prompt_features->reserve(num_frames * num_mels); | ||
| 186 | + | ||
| 187 | + for (int32_t i = 0; i < num_frames; ++i) { | ||
| 188 | + std::vector<float> magnitude_spectrum(fft_bins); | ||
| 189 | + for (int32_t k = 0; k < fft_bins; ++k) { | ||
| 190 | + float real = stft_result.real[i * fft_bins + k]; | ||
| 191 | + float imag = stft_result.imag[i * fft_bins + k]; | ||
| 192 | + magnitude_spectrum[k] = std::sqrt(real * real + imag * imag); | ||
| 193 | + } | ||
| 194 | + std::vector<float> mel_features(num_mels, 0.0f); | ||
| 195 | + mel_banks.Compute(magnitude_spectrum.data(), mel_features.data()); | ||
| 196 | + for (auto &v : mel_features) { | ||
| 197 | + v = std::log(v + 1e-10f); | ||
| 198 | + } | ||
| 199 | + // Append this frame's mel features to the flattened output buffer | ||
| 200 | + prompt_features->insert(prompt_features->end(), mel_features.begin(), | ||
| 201 | + mel_features.end()); | ||
| 202 | + } | ||
| 203 | + if (num_frames == 0) { | ||
| 204 | + SHERPA_ONNX_LOGE("No frames extracted from the prompt audio"); | ||
| 205 | + return {0, 0}; | ||
| 206 | + } else { | ||
| 207 | + return {num_frames, num_mels}; | ||
| 208 | + } | ||
| 209 | + } | ||
| 210 | + | ||
| 211 | + GeneratedAudio Process(const std::vector<int64_t> &tokens, | ||
| 212 | + const std::vector<int64_t> &prompt_tokens, | ||
| 213 | + const std::vector<float> &prompt_samples, | ||
| 214 | + int32_t sample_rate, float speed, | ||
| 215 | + int32_t num_steps) const { | ||
| 216 | + auto memory_info = | ||
| 217 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 218 | + | ||
| 219 | + std::array<int64_t, 2> tokens_shape = {1, | ||
| 220 | + static_cast<int64_t>(tokens.size())}; | ||
| 221 | + Ort::Value tokens_tensor = Ort::Value::CreateTensor( | ||
| 222 | + memory_info, const_cast<int64_t *>(tokens.data()), tokens.size(), | ||
| 223 | + tokens_shape.data(), tokens_shape.size()); | ||
| 224 | + | ||
| 225 | + std::array<int64_t, 2> prompt_tokens_shape = { | ||
| 226 | + 1, static_cast<int64_t>(prompt_tokens.size())}; | ||
| 227 | + Ort::Value prompt_tokens_tensor = Ort::Value::CreateTensor( | ||
| 228 | + memory_info, const_cast<int64_t *>(prompt_tokens.data()), | ||
| 229 | + prompt_tokens.size(), prompt_tokens_shape.data(), | ||
| 230 | + prompt_tokens_shape.size()); | ||
| 231 | + | ||
| 232 | + float target_rms = config_.model.zipvoice.target_rms; | ||
| 233 | + float feat_scale = config_.model.zipvoice.feat_scale; | ||
| 234 | + | ||
| 235 | + // Scale prompt_samples | ||
| 236 | + std::vector<float> prompt_samples_scaled = prompt_samples; | ||
| 237 | + float prompt_rms = 0.0f; | ||
| 238 | + double sum_sq = 0.0; | ||
| 239 | + // Compute RMS of prompt_samples | ||
| 240 | + for (float s : prompt_samples_scaled) { | ||
| 241 | + sum_sq += s * s; | ||
| 242 | + } | ||
| 243 | + prompt_rms = std::sqrt(sum_sq / prompt_samples_scaled.size()); | ||
| 244 | + if (prompt_rms < target_rms && prompt_rms > 0.0f) { | ||
| 245 | + float scale = target_rms / static_cast<float>(prompt_rms); | ||
| 246 | + for (auto &s : prompt_samples_scaled) { | ||
| 247 | + s *= scale; | ||
| 248 | + } | ||
| 249 | + } | ||
| 250 | + | ||
| 251 | + std::vector<float> prompt_features; | ||
| 252 | + auto res_shape = ComputeMelSpectrogram(prompt_samples_scaled, sample_rate, | ||
| 253 | + &prompt_features); | ||
| 254 | + | ||
| 255 | + int32_t num_frames = res_shape[0]; | ||
| 256 | + int32_t mel_dim = res_shape[1]; | ||
| 257 | + | ||
| 258 | + if (feat_scale != 1.0f) { | ||
| 259 | + for (auto &item : prompt_features) { | ||
| 260 | + item *= feat_scale; | ||
| 261 | + } | ||
| 262 | + } | ||
| 263 | + | ||
| 264 | + std::array<int64_t, 3> shape = {1, num_frames, mel_dim}; | ||
| 265 | + auto prompt_features_tensor = Ort::Value::CreateTensor( | ||
| 266 | + memory_info, prompt_features.data(), prompt_features.size(), | ||
| 267 | + shape.data(), shape.size()); | ||
| 268 | + | ||
| 269 | + Ort::Value mel = | ||
| 270 | + model_->Run(std::move(tokens_tensor), std::move(prompt_tokens_tensor), | ||
| 271 | + std::move(prompt_features_tensor), speed, num_steps); | ||
| 272 | + | ||
| 273 | + // Assume mel_shape = {1, T, C} | ||
| 274 | + std::vector<int64_t> mel_shape = mel.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 275 | + int64_t T = mel_shape[1], C = mel_shape[2]; | ||
| 276 | + | ||
| 277 | + float *mel_data = mel.GetTensorMutableData<float>(); | ||
| 278 | + std::vector<float> mel_permuted(C * T); | ||
| 279 | + | ||
| 280 | + for (int64_t c = 0; c < C; ++c) { | ||
| 281 | + for (int64_t t = 0; t < T; ++t) { | ||
| 282 | + int64_t src_idx = t * C + c; // src: [T, C] (row major) | ||
| 283 | + int64_t dst_idx = c * T + t; // dst: [C, T] (row major) | ||
| 284 | + mel_permuted[dst_idx] = mel_data[src_idx] / feat_scale; | ||
| 285 | + } | ||
| 286 | + } | ||
| 287 | + | ||
| 288 | + std::array<int64_t, 3> new_shape = {1, C, T}; | ||
| 289 | + Ort::Value mel_new = Ort::Value::CreateTensor<float>( | ||
| 290 | + memory_info, mel_permuted.data(), mel_permuted.size(), new_shape.data(), | ||
| 291 | + new_shape.size()); | ||
| 292 | + | ||
| 293 | + GeneratedAudio ans; | ||
| 294 | + ans.samples = vocoder_->Run(std::move(mel_new)); | ||
| 295 | + ans.sample_rate = model_->GetMetaData().sample_rate; | ||
| 296 | + | ||
| 297 | + if (prompt_rms < target_rms && prompt_rms > 0.0f) { | ||
| 298 | + float scale = prompt_rms / target_rms; | ||
| 299 | + for (auto &s : ans.samples) { | ||
| 300 | + s *= scale; | ||
| 301 | + } | ||
| 302 | + } | ||
| 303 | + return ans; | ||
| 304 | + } | ||
| 305 | + | ||
| 306 | + private: | ||
| 307 | + OfflineTtsConfig config_; | ||
| 308 | + std::unique_ptr<OfflineTtsZipvoiceModel> model_; | ||
| 309 | + std::unique_ptr<Vocoder> vocoder_; | ||
| 310 | + std::unique_ptr<OfflineTtsFrontend> frontend_; | ||
| 311 | +}; | ||
| 312 | + | ||
| 313 | +} // namespace sherpa_onnx | ||
| 314 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_IMPL_H_ |
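A note on the loudness handling in `Process` above (a restatement of the code, not an addition to it): the prompt is normalized towards `target_rms` before mel extraction, and the inverse scale is applied to the generated samples so the output keeps the prompt's original level. With $r$ the measured prompt RMS and $r_t$ the configured `--zipvoice-target-rms`, for $0 < r < r_t$:

$$\tilde p[n] = \frac{r_t}{r}\, p[n], \qquad \hat y[n] = \frac{r}{r_t}\, y[n],$$

where $p$ is the prompt passed to `ComputeMelSpectrogram`, $y$ is the vocoder output, and $\hat y$ is the audio returned to the caller; if $r \ge r_t$, neither scaling is applied.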
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <vector> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 10 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 11 | + | ||
| 12 | +namespace sherpa_onnx { | ||
| 13 | + | ||
| 14 | +void OfflineTtsZipvoiceModelConfig::Register(ParseOptions *po) { | ||
| 15 | + po->Register("zipvoice-tokens", &tokens, | ||
| 16 | + "Path to tokens.txt for ZipVoice models"); | ||
| 17 | + po->Register("zipvoice-data-dir", &data_dir, | ||
| 18 | + "Path to the directory containing dict for espeak-ng."); | ||
| 19 | + po->Register("zipvoice-pinyin-dict", &pinyin_dict, | ||
| 20 | + "Path to the pinyin dictionary for cppinyin (i.e converting " | ||
| 21 | + "Chinese into phones)."); | ||
| 22 | + po->Register("zipvoice-text-model", &text_model, | ||
| 23 | + "Path to zipvoice text model"); | ||
| 24 | + po->Register("zipvoice-flow-matching-model", &flow_matching_model, | ||
| 25 | + "Path to zipvoice flow-matching model"); | ||
| 26 | + po->Register("zipvoice-vocoder", &vocoder, "Path to zipvoice vocoder"); | ||
| 27 | + po->Register("zipvoice-feat-scale", &feat_scale, | ||
| 28 | + "Feature scale for ZipVoice (default: 0.1)"); | ||
| 29 | + po->Register("zipvoice-t-shift", &t_shift, | ||
| 30 | + "Shift t to smaller ones if t_shift < 1.0 (default: 0.5)"); | ||
| 31 | + po->Register( | ||
| 32 | + "zipvoice-target-rms", &target_rms, | ||
| 33 | + "Target speech normalization rms value for ZipVoice (default: 0.1)"); | ||
| 34 | + po->Register( | ||
| 35 | + "zipvoice-guidance-scale", &guidance_scale, | ||
| 36 | + "The scale of classifier-free guidance during inference for ZipVoice " | ||
| 37 | + "(default: 1.0)"); | ||
| 38 | +} | ||
| 39 | + | ||
| 40 | +bool OfflineTtsZipvoiceModelConfig::Validate() const { | ||
| 41 | + if (tokens.empty()) { | ||
| 42 | + SHERPA_ONNX_LOGE("Please provide --zipvoice-tokens"); | ||
| 43 | + return false; | ||
| 44 | + } | ||
| 45 | + if (!FileExists(tokens)) { | ||
| 46 | + SHERPA_ONNX_LOGE("--zipvoice-tokens: '%s' does not exist", tokens.c_str()); | ||
| 47 | + return false; | ||
| 48 | + } | ||
| 49 | + | ||
| 50 | + if (text_model.empty()) { | ||
| 51 | + SHERPA_ONNX_LOGE("Please provide --zipvoice-text-model"); | ||
| 52 | + return false; | ||
| 53 | + } | ||
| 54 | + if (!FileExists(text_model)) { | ||
| 55 | + SHERPA_ONNX_LOGE("--zipvoice-text-model: '%s' does not exist", | ||
| 56 | + text_model.c_str()); | ||
| 57 | + return false; | ||
| 58 | + } | ||
| 59 | + | ||
| 60 | + if (flow_matching_model.empty()) { | ||
| 61 | + SHERPA_ONNX_LOGE("Please provide --zipvoice-flow-matching-model"); | ||
| 62 | + return false; | ||
| 63 | + } | ||
| 64 | + if (!FileExists(flow_matching_model)) { | ||
| 65 | + SHERPA_ONNX_LOGE("--zipvoice-flow-matching-model: '%s' does not exist", | ||
| 66 | + flow_matching_model.c_str()); | ||
| 67 | + return false; | ||
| 68 | + } | ||
| 69 | + | ||
| 70 | + if (vocoder.empty()) { | ||
| 71 | + SHERPA_ONNX_LOGE("Please provide --zipvoice-vocoder"); | ||
| 72 | + return false; | ||
| 73 | + } | ||
| 74 | + | ||
| 75 | + if (!FileExists(vocoder)) { | ||
| 76 | + SHERPA_ONNX_LOGE("--zipvoice-vocoder: '%s' does not exist", | ||
| 77 | + vocoder.c_str()); | ||
| 78 | + return false; | ||
| 79 | + } | ||
| 80 | + | ||
| 81 | + if (!data_dir.empty()) { | ||
| 82 | + std::vector<std::string> required_files = { | ||
| 83 | + "phontab", | ||
| 84 | + "phonindex", | ||
| 85 | + "phondata", | ||
| 86 | + "intonations", | ||
| 87 | + }; | ||
| 88 | + for (const auto &f : required_files) { | ||
| 89 | + if (!FileExists(data_dir + "/" + f)) { | ||
| 90 | + SHERPA_ONNX_LOGE( | ||
| 91 | + "'%s/%s' does not exist. Please check zipvoice-data-dir", | ||
| 92 | + data_dir.c_str(), f.c_str()); | ||
| 93 | + return false; | ||
| 94 | + } | ||
| 95 | + } | ||
| 96 | + } | ||
| 97 | + | ||
| 98 | + if (!pinyin_dict.empty() && !FileExists(pinyin_dict)) { | ||
| 99 | + SHERPA_ONNX_LOGE("--zipvoice-pinyin-dict: '%s' does not exist", | ||
| 100 | + pinyin_dict.c_str()); | ||
| 101 | + return false; | ||
| 102 | + } | ||
| 103 | + | ||
| 104 | + if (feat_scale <= 0) { | ||
| 105 | + SHERPA_ONNX_LOGE("--zipvoice-feat-scale must be positive. Given: %f", | ||
| 106 | + feat_scale); | ||
| 107 | + return false; | ||
| 108 | + } | ||
| 109 | + | ||
| 110 | + if (t_shift < 0) { | ||
| 111 | + SHERPA_ONNX_LOGE("--zipvoice-t-shift must be non-negative. Given: %f", | ||
| 112 | + t_shift); | ||
| 113 | + return false; | ||
| 114 | + } | ||
| 115 | + | ||
| 116 | + if (target_rms <= 0) { | ||
| 117 | + SHERPA_ONNX_LOGE("--zipvoice-target-rms must be positive. Given: %f", | ||
| 118 | + target_rms); | ||
| 119 | + return false; | ||
| 120 | + } | ||
| 121 | + | ||
| 122 | + if (guidance_scale <= 0) { | ||
| 123 | + SHERPA_ONNX_LOGE("--zipvoice-guidance-scale must be positive. Given: %f", | ||
| 124 | + guidance_scale); | ||
| 125 | + return false; | ||
| 126 | + } | ||
| 127 | + | ||
| 128 | + return true; | ||
| 129 | +} | ||
| 130 | + | ||
| 131 | +std::string OfflineTtsZipvoiceModelConfig::ToString() const { | ||
| 132 | + std::ostringstream os; | ||
| 133 | + | ||
| 134 | + os << "OfflineTtsZipvoiceModelConfig("; | ||
| 135 | + os << "tokens=\"" << tokens << "\", "; | ||
| 136 | + os << "text_model=\"" << text_model << "\", "; | ||
| 137 | + os << "flow_matching_model=\"" << flow_matching_model << "\", "; | ||
| 138 | + os << "vocoder=\"" << vocoder << "\", "; | ||
| 139 | + os << "data_dir=\"" << data_dir << "\", "; | ||
| 140 | + os << "pinyin_dict=\"" << pinyin_dict << "\", "; | ||
| 141 | + os << "feat_scale=" << feat_scale << ", "; | ||
| 142 | + os << "t_shift=" << t_shift << ", "; | ||
| 143 | + os << "target_rms=" << target_rms << ", "; | ||
| 144 | + os << "guidance_scale=" << guidance_scale << ")"; | ||
| 145 | + | ||
| 146 | + return os.str(); | ||
| 147 | +} | ||
| 148 | + | ||
| 149 | +} // namespace sherpa_onnx |
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include <cstdint> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 12 | + | ||
| 13 | +namespace sherpa_onnx { | ||
| 14 | + | ||
| 15 | +struct OfflineTtsZipvoiceModelConfig { | ||
| 16 | + std::string tokens; | ||
| 17 | + std::string text_model; | ||
| 18 | + std::string flow_matching_model; | ||
| 19 | + std::string vocoder; | ||
| 20 | + | ||
| 21 | + // Path to the espeak-ng data directory. It is required when the | ||
| 22 | + // model metadata has use_espeak == 1. | ||
| 23 | + std::string data_dir; | ||
| 24 | + | ||
| 25 | + // Used for converting Chinese characters to pinyin | ||
| 26 | + std::string pinyin_dict; | ||
| 27 | + | ||
| 28 | + float feat_scale = 0.1; | ||
| 29 | + float t_shift = 0.5; | ||
| 30 | + float target_rms = 0.1; | ||
| 31 | + float guidance_scale = 1.0; | ||
| 32 | + | ||
| 33 | + OfflineTtsZipvoiceModelConfig() = default; | ||
| 34 | + | ||
| 35 | + OfflineTtsZipvoiceModelConfig( | ||
| 36 | + const std::string &tokens, const std::string &text_model, | ||
| 37 | + const std::string &flow_matching_model, const std::string &vocoder, | ||
| 38 | + const std::string &data_dir, const std::string &pinyin_dict, | ||
| 39 | + float feat_scale = 0.1, float t_shift = 0.5, float target_rms = 0.1, | ||
| 40 | + float guidance_scale = 1.0) | ||
| 41 | + : tokens(tokens), | ||
| 42 | + text_model(text_model), | ||
| 43 | + flow_matching_model(flow_matching_model), | ||
| 44 | + vocoder(vocoder), | ||
| 45 | + data_dir(data_dir), | ||
| 46 | + pinyin_dict(pinyin_dict), | ||
| 47 | + feat_scale(feat_scale), | ||
| 48 | + t_shift(t_shift), | ||
| 49 | + target_rms(target_rms), | ||
| 50 | + guidance_scale(guidance_scale) {} | ||
| 51 | + | ||
| 52 | + void Register(ParseOptions *po); | ||
| 53 | + bool Validate() const; | ||
| 54 | + | ||
| 55 | + std::string ToString() const; | ||
| 56 | +}; | ||
| 57 | + | ||
| 58 | +} // namespace sherpa_onnx | ||
| 59 | + | ||
| 60 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_CONFIG_H_ |
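The struct above is normally filled in from the command-line flags registered in `offline-tts-zipvoice-model-config.cc`, but it can also be populated directly. A minimal sketch follows; the helper name is illustrative and the file paths are assumptions taken from the usage example of `sherpa-onnx-offline-zeroshot-tts.cc` later in this PR:

```cpp
#include "sherpa-onnx/csrc/offline-tts.h"

// Illustrative helper; adjust the paths to wherever the released
// sherpa-onnx-zipvoice-distill-zh-en-emilia model has been unpacked.
sherpa_onnx::OfflineTtsConfig MakeZipvoiceConfig() {
  sherpa_onnx::OfflineTtsConfig config;
  auto &z = config.model.zipvoice;
  z.tokens = "sherpa-onnx-zipvoice-distill-zh-en-emilia/tokens.txt";
  z.text_model = "sherpa-onnx-zipvoice-distill-zh-en-emilia/text_encoder.onnx";
  z.flow_matching_model =
      "sherpa-onnx-zipvoice-distill-zh-en-emilia/fm_decoder.onnx";
  z.vocoder = "sherpa-onnx-zipvoice-distill-zh-en-emilia/vocos_24khz.onnx";
  z.data_dir = "sherpa-onnx-zipvoice-distill-zh-en-emilia/espeak-ng-data";
  z.pinyin_dict = "sherpa-onnx-zipvoice-distill-zh-en-emilia/pinyin.raw";
  config.model.num_threads = 4;
  return config;
}
```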
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-model-meta-data.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_META_DATA_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_META_DATA_H_ | ||
| 7 | + | ||
| 8 | +#include <cstdint> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +// If you are not sure what each field means, please | ||
| 14 | +// have a look at the Python file in the model directory that | ||
| 15 | +// you have downloaded. | ||
| 16 | +struct OfflineTtsZipvoiceModelMetaData { | ||
| 17 | + int32_t version = 1; | ||
| 18 | + int32_t feat_dim = 100; | ||
| 19 | + int32_t sample_rate = 24000; | ||
| 20 | + int32_t n_fft = 1024; | ||
| 21 | + int32_t hop_length = 256; | ||
| 22 | + int32_t window_length = 1024; | ||
| 23 | + int32_t num_mels = 100; | ||
| 24 | + int32_t use_espeak = 1; | ||
| 25 | + int32_t use_pinyin = 1; | ||
| 26 | +}; | ||
| 27 | + | ||
| 28 | +} // namespace sherpa_onnx | ||
| 29 | + | ||
| 30 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_META_DATA_H_ |
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-model.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model.h" | ||
| 6 | + | ||
| 7 | +#include <algorithm> | ||
| 8 | +#include <cstring> | ||
| 9 | +#include <random> | ||
| 10 | +#include <string> | ||
| 11 | +#include <utility> | ||
| 12 | +#include <vector> | ||
| 13 | + | ||
| 14 | +#if __ANDROID_API__ >= 9 | ||
| 15 | +#include "android/asset_manager.h" | ||
| 16 | +#include "android/asset_manager_jni.h" | ||
| 17 | +#endif | ||
| 18 | + | ||
| 19 | +#if __OHOS__ | ||
| 20 | +#include "rawfile/raw_file_manager.h" | ||
| 21 | +#endif | ||
| 22 | + | ||
| 23 | +#include "sherpa-onnx/csrc/file-utils.h" | ||
| 24 | +#include "sherpa-onnx/csrc/macros.h" | ||
| 25 | +#include "sherpa-onnx/csrc/onnx-utils.h" | ||
| 26 | +#include "sherpa-onnx/csrc/session.h" | ||
| 27 | +#include "sherpa-onnx/csrc/text-utils.h" | ||
| 28 | + | ||
| 29 | +namespace sherpa_onnx { | ||
| 30 | + | ||
| 31 | +class OfflineTtsZipvoiceModel::Impl { | ||
| 32 | + public: | ||
| 33 | + explicit Impl(const OfflineTtsModelConfig &config) | ||
| 34 | + : config_(config), | ||
| 35 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 36 | + sess_opts_(GetSessionOptions(config)), | ||
| 37 | + allocator_{} { | ||
| 38 | + auto text_buf = ReadFile(config.zipvoice.text_model); | ||
| 39 | + auto fm_buf = ReadFile(config.zipvoice.flow_matching_model); | ||
| 40 | + Init(text_buf.data(), text_buf.size(), fm_buf.data(), fm_buf.size()); | ||
| 41 | + } | ||
| 42 | + | ||
| 43 | + template <typename Manager> | ||
| 44 | + Impl(Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 45 | + : config_(config), | ||
| 46 | + env_(ORT_LOGGING_LEVEL_ERROR), | ||
| 47 | + sess_opts_(GetSessionOptions(config)), | ||
| 48 | + allocator_{} { | ||
| 49 | + auto text_buf = ReadFile(mgr, config.zipvoice.text_model); | ||
| 50 | + auto fm_buf = ReadFile(mgr, config.zipvoice.flow_matching_model); | ||
| 51 | + Init(text_buf.data(), text_buf.size(), fm_buf.data(), fm_buf.size()); | ||
| 52 | + } | ||
| 53 | + | ||
| 54 | + const OfflineTtsZipvoiceModelMetaData &GetMetaData() const { | ||
| 55 | + return meta_data_; | ||
| 56 | + } | ||
| 57 | + | ||
| 58 | + Ort::Value Run(Ort::Value tokens, Ort::Value prompt_tokens, | ||
| 59 | + Ort::Value prompt_features, float speed, int32_t num_steps) { | ||
| 60 | + auto memory_info = | ||
| 61 | + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); | ||
| 62 | + | ||
| 63 | + std::vector<int64_t> tokens_shape = | ||
| 64 | + tokens.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 65 | + int64_t batch_size = tokens_shape[0]; | ||
| 66 | + if (batch_size != 1) { | ||
| 67 | + SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d", | ||
| 68 | + static_cast<int32_t>(batch_size)); | ||
| 69 | + exit(-1); | ||
| 70 | + } | ||
| 71 | + | ||
| 72 | + std::vector<int64_t> prompt_feat_shape = | ||
| 73 | + prompt_features.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 74 | + | ||
| 75 | + int64_t prompt_feat_len = prompt_feat_shape[1]; | ||
| 76 | + int64_t prompt_feat_len_shape = 1; | ||
| 77 | + Ort::Value prompt_feat_len_tensor = Ort::Value::CreateTensor<int64_t>( | ||
| 78 | + memory_info, &prompt_feat_len, 1, &prompt_feat_len_shape, 1); | ||
| 79 | + | ||
| 80 | + int64_t speed_shape = 1; | ||
| 81 | + Ort::Value speed_tensor = Ort::Value::CreateTensor<float>( | ||
| 82 | + memory_info, &speed, 1, &speed_shape, 1); | ||
| 83 | + | ||
| 84 | + std::vector<Ort::Value> text_inputs; | ||
| 85 | + text_inputs.reserve(4); | ||
| 86 | + text_inputs.push_back(std::move(tokens)); | ||
| 87 | + text_inputs.push_back(std::move(prompt_tokens)); | ||
| 88 | + text_inputs.push_back(std::move(prompt_feat_len_tensor)); | ||
| 89 | + text_inputs.push_back(std::move(speed_tensor)); | ||
| 90 | + | ||
| 91 | + // forward text-encoder | ||
| 92 | + auto text_out = | ||
| 93 | + text_sess_->Run({}, text_input_names_ptr_.data(), text_inputs.data(), | ||
| 94 | + text_inputs.size(), text_output_names_ptr_.data(), | ||
| 95 | + text_output_names_ptr_.size()); | ||
| 96 | + | ||
| 97 | + Ort::Value &text_condition = text_out[0]; | ||
| 98 | + | ||
| 99 | + std::vector<int64_t> text_cond_shape = | ||
| 100 | + text_condition.GetTensorTypeAndShapeInfo().GetShape(); | ||
| 101 | + int64_t num_frames = text_cond_shape[1]; | ||
| 102 | + | ||
| 103 | + int64_t feat_dim = meta_data_.feat_dim; | ||
| 104 | + | ||
| 105 | + std::vector<float> x_data(batch_size * num_frames * feat_dim); | ||
| 106 | + std::default_random_engine rng(std::random_device{}()); | ||
| 107 | + std::normal_distribution<float> norm(0, 1); | ||
| 108 | + for (auto &v : x_data) v = norm(rng); | ||
| 109 | + std::vector<int64_t> x_shape = {batch_size, num_frames, feat_dim}; | ||
| 110 | + Ort::Value x = Ort::Value::CreateTensor<float>( | ||
| 111 | + memory_info, x_data.data(), x_data.size(), x_shape.data(), | ||
| 112 | + x_shape.size()); | ||
| 113 | + | ||
| 114 | + std::vector<float> speech_cond_data(batch_size * num_frames * feat_dim, | ||
| 115 | + 0.0f); | ||
| 116 | + const float *src = prompt_features.GetTensorData<float>(); | ||
| 117 | + float *dst = speech_cond_data.data(); | ||
| 118 | + std::memcpy(dst, src, | ||
| 119 | + batch_size * prompt_feat_len * feat_dim * sizeof(float)); | ||
| 120 | + std::vector<int64_t> speech_cond_shape = {batch_size, num_frames, feat_dim}; | ||
| 121 | + Ort::Value speech_condition = Ort::Value::CreateTensor<float>( | ||
| 122 | + memory_info, speech_cond_data.data(), speech_cond_data.size(), | ||
| 123 | + speech_cond_shape.data(), speech_cond_shape.size()); | ||
| 124 | + | ||
| 125 | + float t_shift = config_.zipvoice.t_shift; | ||
| 126 | + float guidance_scale = config_.zipvoice.guidance_scale; | ||
| 127 | + | ||
| 128 | + std::vector<float> timesteps(num_steps + 1); | ||
| 129 | + for (int32_t i = 0; i <= num_steps; ++i) { | ||
| 130 | + float t = static_cast<float>(i) / num_steps; | ||
| 131 | + timesteps[i] = t_shift * t / (1.0f + (t_shift - 1.0f) * t); | ||
| 132 | + } | ||
| 133 | + | ||
| 134 | + int64_t guidance_scale_shape = 1; | ||
| 135 | + Ort::Value guidance_scale_tensor = Ort::Value::CreateTensor<float>( | ||
| 136 | + memory_info, &guidance_scale, 1, &guidance_scale_shape, 1); | ||
| 137 | + | ||
| 138 | + std::vector<Ort::Value> fm_inputs; | ||
| 139 | + fm_inputs.reserve(5); | ||
| 140 | + // fm_inputs[0] is the t tensor; it is set inside the loop below | ||
| 141 | + fm_inputs.emplace_back(nullptr); | ||
| 142 | + fm_inputs.push_back(std::move(x)); | ||
| 143 | + fm_inputs.push_back(std::move(text_condition)); | ||
| 144 | + fm_inputs.push_back(std::move(speech_condition)); | ||
| 145 | + fm_inputs.push_back(std::move(guidance_scale_tensor)); | ||
| 146 | + | ||
| 147 | + for (int32_t step = 0; step < num_steps; ++step) { | ||
| 148 | + float t_val = timesteps[step]; | ||
| 149 | + int64_t t_shape = 1; | ||
| 150 | + Ort::Value t_tensor = | ||
| 151 | + Ort::Value::CreateTensor<float>(memory_info, &t_val, 1, &t_shape, 1); | ||
| 152 | + fm_inputs[0] = std::move(t_tensor); | ||
| 153 | + auto fm_out = fm_sess_->Run( | ||
| 154 | + {}, fm_input_names_ptr_.data(), fm_inputs.data(), fm_inputs.size(), | ||
| 155 | + fm_output_names_ptr_.data(), fm_output_names_ptr_.size()); | ||
| 156 | + Ort::Value &v = fm_out[0]; | ||
| 157 | + | ||
| 158 | + float delta_t = timesteps[step + 1] - timesteps[step]; | ||
| 159 | + float *x_ptr = fm_inputs[1].GetTensorMutableData<float>(); | ||
| 160 | + const float *v_ptr = v.GetTensorData<float>(); | ||
| 161 | + int64_t N = batch_size * num_frames * feat_dim; | ||
| 162 | + for (int64_t i = 0; i < N; ++i) { | ||
| 163 | + x_ptr[i] += v_ptr[i] * delta_t; | ||
| 164 | + } | ||
| 165 | + } | ||
| 166 | + | ||
| 167 | + int64_t keep_frames = num_frames - prompt_feat_len; | ||
| 168 | + x = std::move(fm_inputs[1]); | ||
| 169 | + const float *x_ptr = x.GetTensorData<float>(); | ||
| 170 | + std::vector<int64_t> out_shape = {batch_size, keep_frames, feat_dim}; | ||
| 171 | + // Allocate via allocator_ so that the returned tensor owns its memory | ||
| 172 | + Ort::Value ans = Ort::Value::CreateTensor<float>(allocator_, out_shape.data(), out_shape.size()); | ||
| 173 | + float *out_ptr = ans.GetTensorMutableData<float>(); | ||
| 174 | + for (int64_t b = 0; b < batch_size; ++b) { | ||
| 175 | + std::memcpy(out_ptr + b * keep_frames * feat_dim, | ||
| 176 | + x_ptr + (b * num_frames + prompt_feat_len) * feat_dim, | ||
| 177 | + keep_frames * feat_dim * sizeof(float)); | ||
| 178 | + } | ||
| 179 | + return ans; | ||
| 180 | + } | ||
| 181 | + | ||
| 182 | + private: | ||
| 183 | + void Init(void *text_model_data, size_t text_model_data_length, | ||
| 184 | + void *fm_model_data, size_t fm_model_data_length) { | ||
| 185 | + // Init text-encoder model | ||
| 186 | + text_sess_ = std::make_unique<Ort::Session>( | ||
| 187 | + env_, text_model_data, text_model_data_length, sess_opts_); | ||
| 188 | + GetInputNames(text_sess_.get(), &text_input_names_, &text_input_names_ptr_); | ||
| 189 | + GetOutputNames(text_sess_.get(), &text_output_names_, | ||
| 190 | + &text_output_names_ptr_); | ||
| 191 | + | ||
| 192 | + // Init flow-matching model | ||
| 193 | + fm_sess_ = std::make_unique<Ort::Session>(env_, fm_model_data, | ||
| 194 | + fm_model_data_length, sess_opts_); | ||
| 195 | + GetInputNames(fm_sess_.get(), &fm_input_names_, &fm_input_names_ptr_); | ||
| 196 | + GetOutputNames(fm_sess_.get(), &fm_output_names_, &fm_output_names_ptr_); | ||
| 197 | + | ||
| 198 | + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below | ||
| 199 | + | ||
| 200 | + Ort::ModelMetadata meta_data = text_sess_->GetModelMetadata(); | ||
| 201 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_espeak, "use_espeak", | ||
| 202 | + 1); | ||
| 203 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.use_pinyin, "use_pinyin", | ||
| 204 | + 1); | ||
| 205 | + | ||
| 206 | + meta_data = fm_sess_->GetModelMetadata(); | ||
| 207 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); | ||
| 208 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.feat_dim, "feat_dim", | ||
| 209 | + 100); | ||
| 210 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.sample_rate, | ||
| 211 | + "sample_rate", 24000); | ||
| 212 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.n_fft, "n_fft", 1024); | ||
| 213 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.hop_length, "hop_length", | ||
| 214 | + 256); | ||
| 215 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.window_length, | ||
| 216 | + "window_length", 1024); | ||
| 217 | + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.num_mels, "num_mels", | ||
| 218 | + 100); | ||
| 219 | + | ||
| 220 | + if (config_.debug) { | ||
| 221 | + std::ostringstream os; | ||
| 222 | + | ||
| 223 | + os << "---zipvoice text-encoder model---\n"; | ||
| 224 | + Ort::ModelMetadata text_meta_data = text_sess_->GetModelMetadata(); | ||
| 225 | + PrintModelMetadata(os, text_meta_data); | ||
| 226 | + | ||
| 227 | + os << "----------input names----------\n"; | ||
| 228 | + int32_t i = 0; | ||
| 229 | + for (const auto &s : text_input_names_) { | ||
| 230 | + os << i << " " << s << "\n"; | ||
| 231 | + ++i; | ||
| 232 | + } | ||
| 233 | + os << "----------output names----------\n"; | ||
| 234 | + i = 0; | ||
| 235 | + for (const auto &s : text_output_names_) { | ||
| 236 | + os << i << " " << s << "\n"; | ||
| 237 | + ++i; | ||
| 238 | + } | ||
| 239 | + | ||
| 240 | + os << "---zipvoice flow-matching model---\n"; | ||
| 241 | + PrintModelMetadata(os, meta_data); | ||
| 242 | + | ||
| 243 | + os << "----------input names----------\n"; | ||
| 244 | + i = 0; | ||
| 245 | + for (const auto &s : fm_input_names_) { | ||
| 246 | + os << i << " " << s << "\n"; | ||
| 247 | + ++i; | ||
| 248 | + } | ||
| 249 | + os << "----------output names----------\n"; | ||
| 250 | + i = 0; | ||
| 251 | + for (const auto &s : fm_output_names_) { | ||
| 252 | + os << i << " " << s << "\n"; | ||
| 253 | + ++i; | ||
| 254 | + } | ||
| 255 | + | ||
| 256 | +#if __OHOS__ | ||
| 257 | + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); | ||
| 258 | +#else | ||
| 259 | + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); | ||
| 260 | +#endif | ||
| 261 | + } | ||
| 262 | + } | ||
| 263 | + | ||
| 264 | + private: | ||
| 265 | + OfflineTtsModelConfig config_; | ||
| 266 | + Ort::Env env_; | ||
| 267 | + Ort::SessionOptions sess_opts_; | ||
| 268 | + Ort::AllocatorWithDefaultOptions allocator_; | ||
| 269 | + | ||
| 270 | + std::unique_ptr<Ort::Session> text_sess_; | ||
| 271 | + std::unique_ptr<Ort::Session> fm_sess_; | ||
| 272 | + | ||
| 273 | + std::vector<std::string> text_input_names_; | ||
| 274 | + std::vector<const char *> text_input_names_ptr_; | ||
| 275 | + | ||
| 276 | + std::vector<std::string> text_output_names_; | ||
| 277 | + std::vector<const char *> text_output_names_ptr_; | ||
| 278 | + | ||
| 279 | + std::vector<std::string> fm_input_names_; | ||
| 280 | + std::vector<const char *> fm_input_names_ptr_; | ||
| 281 | + | ||
| 282 | + std::vector<std::string> fm_output_names_; | ||
| 283 | + std::vector<const char *> fm_output_names_ptr_; | ||
| 284 | + | ||
| 285 | + OfflineTtsZipvoiceModelMetaData meta_data_; | ||
| 286 | +}; | ||
| 287 | + | ||
| 288 | +OfflineTtsZipvoiceModel::OfflineTtsZipvoiceModel( | ||
| 289 | + const OfflineTtsModelConfig &config) | ||
| 290 | + : impl_(std::make_unique<Impl>(config)) {} | ||
| 291 | + | ||
| 292 | +template <typename Manager> | ||
| 293 | +OfflineTtsZipvoiceModel::OfflineTtsZipvoiceModel( | ||
| 294 | + Manager *mgr, const OfflineTtsModelConfig &config) | ||
| 295 | + : impl_(std::make_unique<Impl>(mgr, config)) {} | ||
| 296 | + | ||
| 297 | +OfflineTtsZipvoiceModel::~OfflineTtsZipvoiceModel() = default; | ||
| 298 | + | ||
| 299 | +const OfflineTtsZipvoiceModelMetaData &OfflineTtsZipvoiceModel::GetMetaData() | ||
| 300 | + const { | ||
| 301 | + return impl_->GetMetaData(); | ||
| 302 | +} | ||
| 303 | + | ||
| 304 | +Ort::Value OfflineTtsZipvoiceModel::Run(Ort::Value tokens, | ||
| 305 | + Ort::Value prompt_tokens, | ||
| 306 | + Ort::Value prompt_features, | ||
| 307 | + float speed /*= 1.0*/, | ||
| 308 | + int32_t num_steps /*= 16*/) const { | ||
| 309 | + return impl_->Run(std::move(tokens), std::move(prompt_tokens), | ||
| 310 | + std::move(prompt_features), speed, num_steps); | ||
| 311 | +} | ||
| 312 | + | ||
| 313 | +#if __ANDROID_API__ >= 9 | ||
| 314 | +template OfflineTtsZipvoiceModel::OfflineTtsZipvoiceModel( | ||
| 315 | + AAssetManager *mgr, const OfflineTtsModelConfig &config); | ||
| 316 | +#endif | ||
| 317 | + | ||
| 318 | +#if __OHOS__ | ||
| 319 | +template OfflineTtsZipvoiceModel::OfflineTtsZipvoiceModel( | ||
| 320 | + NativeResourceManager *mgr, const OfflineTtsModelConfig &config); | ||
| 321 | +#endif | ||
| 322 | + | ||
| 323 | +} // namespace sherpa_onnx |
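For reference, the sampling loop in `Impl::Run` above is a plain Euler integration over a shifted timestep schedule; the equations below restate the loop (interpreting the flow-matching model's output as a guided velocity field $v_\theta$ is an assumption based on this code, not on separate ZipVoice documentation):

$$\tau_i = \frac{i}{N}, \qquad t_i = \frac{s\,\tau_i}{1 + (s - 1)\,\tau_i}, \qquad i = 0, \dots, N,$$

$$x_0 \sim \mathcal{N}(0, I), \qquad x_{i+1} = x_i + v_\theta(x_i, t_i;\ \text{text},\ \text{speech})\,(t_{i+1} - t_i),$$

where $N$ is `num_steps`, $s$ is `--zipvoice-t-shift`, and the classifier-free guidance scale is passed to the model as an extra input rather than applied in this loop. The prompt frames are then stripped from $x_N$ before the mel is returned.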
| 1 | +// sherpa-onnx/csrc/offline-tts-zipvoice-model.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_H_ | ||
| 6 | +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_H_ | ||
| 7 | + | ||
| 8 | +#include <memory> | ||
| 9 | +#include <string> | ||
| 10 | + | ||
| 11 | +#include "onnxruntime_cxx_api.h" // NOLINT | ||
| 12 | +#include "sherpa-onnx/csrc/offline-tts-model-config.h" | ||
| 13 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model-meta-data.h" | ||
| 14 | + | ||
| 15 | +namespace sherpa_onnx { | ||
| 16 | + | ||
| 17 | +class OfflineTtsZipvoiceModel { | ||
| 18 | + public: | ||
| 19 | + ~OfflineTtsZipvoiceModel(); | ||
| 20 | + | ||
| 21 | + explicit OfflineTtsZipvoiceModel(const OfflineTtsModelConfig &config); | ||
| 22 | + | ||
| 23 | + template <typename Manager> | ||
| 24 | + OfflineTtsZipvoiceModel(Manager *mgr, const OfflineTtsModelConfig &config); | ||
| 25 | + | ||
| 26 | + // Return a float32 tensor containing the mel of the generated part | ||
| 27 | + // (prompt frames excluded), of shape (batch_size, num_frames, mel_dim) | ||
| 28 | + Ort::Value Run(Ort::Value tokens, Ort::Value prompt_tokens, | ||
| 29 | + Ort::Value prompt_features, float speed, | ||
| 30 | + int32_t num_steps) const; | ||
| 31 | + | ||
| 32 | + const OfflineTtsZipvoiceModelMetaData &GetMetaData() const; | ||
| 33 | + | ||
| 34 | + private: | ||
| 35 | + class Impl; | ||
| 36 | + std::unique_ptr<Impl> impl_; | ||
| 37 | +}; | ||
| 38 | + | ||
| 39 | +} // namespace sherpa_onnx | ||
| 40 | + | ||
| 41 | +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_H_ |
| @@ -196,6 +196,46 @@ GeneratedAudio OfflineTts::Generate( | @@ -196,6 +196,46 @@ GeneratedAudio OfflineTts::Generate( | ||
| 196 | #endif | 196 | #endif |
| 197 | } | 197 | } |
| 198 | 198 | ||
| 199 | +GeneratedAudio OfflineTts::Generate( | ||
| 200 | + const std::string &text, const std::string &prompt_text, | ||
| 201 | + const std::vector<float> &prompt_samples, int32_t sample_rate, | ||
| 202 | + float speed /*=1.0*/, int32_t num_steps /*=4*/, | ||
| 203 | + GeneratedAudioCallback callback /*=nullptr*/) const { | ||
| 204 | +#if !defined(_WIN32) | ||
| 205 | + return impl_->Generate(text, prompt_text, prompt_samples, sample_rate, speed, | ||
| 206 | + num_steps, std::move(callback)); | ||
| 207 | +#else | ||
| 208 | + static bool printed = false; | ||
| 209 | + auto utf8_text = text; | ||
| 210 | + if (IsGB2312(text)) { | ||
| 211 | + utf8_text = Gb2312ToUtf8(text); | ||
| 212 | + if (!printed) { | ||
| 213 | + SHERPA_ONNX_LOGE("Detected GB2312 encoded text! Converting it to UTF8."); | ||
| 214 | + printed = true; | ||
| 215 | + } | ||
| 216 | + } | ||
| 217 | + auto utf8_prompt_text = prompt_text; | ||
| 218 | + if (IsGB2312(prompt_text)) { | ||
| 219 | + utf8_prompt_text = Gb2312ToUtf8(prompt_text); | ||
| 220 | + if (!printed) { | ||
| 221 | + SHERPA_ONNX_LOGE( | ||
| 222 | + "Detected GB2312 encoded prompt text! Converting it to UTF8."); | ||
| 223 | + printed = true; | ||
| 224 | + } | ||
| 225 | + } | ||
| 226 | + if (IsUtf8(utf8_text) && IsUtf8(utf8_prompt_text)) { | ||
| 227 | + return impl_->Generate(utf8_text, utf8_prompt_text, prompt_samples, | ||
| 228 | + sample_rate, speed, num_steps, std::move(callback)); | ||
| 229 | + } else { | ||
| 230 | + SHERPA_ONNX_LOGE( | ||
| 231 | + "Non UTF8 encoded string is received. You would not get expected " | ||
| 232 | + "results!"); | ||
| 233 | + return impl_->Generate(utf8_text, utf8_prompt_text, prompt_samples, | ||
| 234 | + sample_rate, speed, num_steps, std::move(callback)); | ||
| 235 | + } | ||
| 236 | +#endif | ||
| 237 | +} | ||
| 238 | + | ||
| 199 | int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); } | 239 | int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); } |
| 200 | 240 | ||
| 201 | int32_t OfflineTts::NumSpeakers() const { return impl_->NumSpeakers(); } | 241 | int32_t OfflineTts::NumSpeakers() const { return impl_->NumSpeakers(); } |
| @@ -95,6 +95,26 @@ class OfflineTts { | @@ -95,6 +95,26 @@ class OfflineTts { | ||
| 95 | float speed = 1.0, | 95 | float speed = 1.0, |
| 96 | GeneratedAudioCallback callback = nullptr) const; | 96 | GeneratedAudioCallback callback = nullptr) const; |
| 97 | 97 | ||
| 98 | + // @param text The string to be synthesized. | ||
| 99 | + // @param prompt_text The transcript of `prompt_samples`. | ||
| 100 | + // @param prompt_samples The prompt audio samples (mono PCM floats in [-1,1]). | ||
| 101 | + // @param sample_rate The sample rate of `prompt_samples` in Hz. | ||
| 102 | + // @param speed The speed for the generated speech. E.g., 2 means 2x faster. | ||
| 103 | + // @param num_steps The number of flow-matching steps used to generate the audio. | ||
| 104 | + // @param callback If not NULL, it is called whenever config.max_num_sentences | ||
| 105 | + // sentences have been processed. Note that the passed | ||
| 106 | + // pointer `samples` for the callback might be invalidated | ||
| 107 | + // after the callback returns, so the caller should not | ||
| 108 | + // keep a reference to it. The caller can copy the data if | ||
| 109 | + // they want to access the samples after the callback | ||
| 110 | + // returns. The callback is called in the current thread. | ||
| 111 | + GeneratedAudio Generate(const std::string &text, | ||
| 112 | + const std::string &prompt_text, | ||
| 113 | + const std::vector<float> &prompt_samples, | ||
| 114 | + int32_t sample_rate, float speed = 1.0, | ||
| 115 | + int32_t num_steps = 4, | ||
| 116 | + GeneratedAudioCallback callback = nullptr) const; | ||
| 117 | + | ||
| 98 | // Return the sample rate of the generated audio | 118 | // Return the sample rate of the generated audio |
| 99 | int32_t SampleRate() const; | 119 | int32_t SampleRate() const; |
| 100 | 120 |
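A minimal calling sketch for the zero-shot overload declared above (not part of the diff; it simply strings together the APIs added in this PR, and the helper name is illustrative). It assumes `config` has been filled in as in the sketch after `offline-tts-zipvoice-model-config.h` and that the prompt transcript matches the prompt wave:

```cpp
#include "sherpa-onnx/csrc/offline-tts.h"
#include "sherpa-onnx/csrc/wave-reader.h"
#include "sherpa-onnx/csrc/wave-writer.h"

void Synthesize(const sherpa_onnx::OfflineTtsConfig &config) {
  sherpa_onnx::OfflineTts tts(config);

  int32_t prompt_sample_rate = -1;
  bool is_ok = false;
  std::vector<float> prompt_samples =
      sherpa_onnx::ReadWave("prompt.wav", &prompt_sample_rate, &is_ok);
  if (!is_ok) {
    return;
  }

  auto audio = tts.Generate(
      "我爱我的祖国。",                      // text to synthesize
      "周日被我射熄火了,所以今天是周一。",  // transcript of prompt.wav
      prompt_samples, prompt_sample_rate,
      /*speed=*/1.0f, /*num_steps=*/4);

  sherpa_onnx::WriteWave("generated.wav", audio.sample_rate,
                         audio.samples.data(), audio.samples.size());
}
```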
| 1 | +// sherpa-onnx/csrc/sherpa-onnx-offline-zeroshot-tts.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include <chrono> // NOLINT | ||
| 6 | +#include <fstream> | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/csrc/offline-tts.h" | ||
| 9 | +#include "sherpa-onnx/csrc/parse-options.h" | ||
| 10 | +#include "sherpa-onnx/csrc/wave-reader.h" | ||
| 11 | +#include "sherpa-onnx/csrc/wave-writer.h" | ||
| 12 | + | ||
| 13 | +static int32_t AudioCallback(const float * /*samples*/, int32_t n, | ||
| 14 | + float progress) { | ||
| 15 | + printf("sample=%d, progress=%f\n", n, progress); | ||
| 16 | + return 1; | ||
| 17 | +} | ||
| 18 | + | ||
| 19 | +int main(int32_t argc, char *argv[]) { | ||
| 20 | + const char *kUsageMessage = R"usage( | ||
| 21 | +Offline/Non-streaming zero-shot text-to-speech with sherpa-onnx | ||
| 22 | + | ||
| 23 | +Usage example: | ||
| 24 | + | ||
| 25 | +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-zipvoice-distill-zh-en-emilia.tar.bz2 | ||
| 26 | +tar xf sherpa-onnx-zipvoice-distill-zh-en-emilia.tar.bz2 | ||
| 27 | + | ||
| 28 | +./bin/sherpa-onnx-offline-zeroshot-tts \ | ||
| 29 | + --zipvoice-flow-matching-model=sherpa-onnx-zipvoice-distill-zh-en-emilia/fm_decoder.onnx \ | ||
| 30 | + --zipvoice-text-model=sherpa-onnx-zipvoice-distill-zh-en-emilia/text_encoder.onnx \ | ||
| 31 | + --zipvoice-data-dir=sherpa-onnx-zipvoice-distill-zh-en-emilia/espeak-ng-data \ | ||
| 32 | + --zipvoice-pinyin-dict=sherpa-onnx-zipvoice-distill-zh-en-emilia/pinyin.raw \ | ||
| 33 | + --zipvoice-tokens=sherpa-onnx-zipvoice-distill-zh-en-emilia/tokens.txt \ | ||
| 34 | + --zipvoice-vocoder=sherpa-onnx-zipvoice-distill-zh-en-emilia/vocos_24khz.onnx \ | ||
| 35 | + --prompt-audio=sherpa-onnx-zipvoice-distill-zh-en-emilia/prompt.wav \ | ||
| 36 | + --num-steps=4 \ | ||
| 37 | + --num-threads=4 \ | ||
| 38 | + --prompt-text="周日被我射熄火了,所以今天是周一。" \ | ||
| 39 | + "我是中国人民的儿子,我爱我的祖国。我得祖国是一个伟大的国家,拥有五千年的文明史。" | ||
| 40 | + | ||
| 41 | +It will generate a file ./generated.wav as specified by --output-filename. | ||
| 42 | +)usage"; | ||
| 43 | + | ||
| 44 | + sherpa_onnx::ParseOptions po(kUsageMessage); | ||
| 45 | + std::string output_filename = "./generated.wav"; | ||
| 46 | + | ||
| 47 | + int32_t num_steps = 4; | ||
| 48 | + float speed = 1.0; | ||
| 49 | + std::string prompt_text; | ||
| 50 | + std::string prompt_audio; | ||
| 51 | + | ||
| 52 | + po.Register("output-filename", &output_filename, | ||
| 53 | + "Path to save the generated audio"); | ||
| 54 | + | ||
| 55 | + po.Register("num-steps", &num_steps, | ||
| 56 | + "Number of inference steps for ZipVoice (default: 4)"); | ||
| 57 | + | ||
| 58 | + po.Register("speed", &speed, | ||
| 59 | + "Speech speed for ZipVoice (default: 1.0, larger=faster, " | ||
| 60 | + "smaller=slower)"); | ||
| 61 | + | ||
| 62 | + po.Register("prompt-text", &prompt_text, "The transcribe of prompt_samples."); | ||
| 63 | + | ||
| 64 | + po.Register("prompt-audio", &prompt_audio, | ||
| 65 | + "The prompt audio file, single channel pcm. "); | ||
| 66 | + | ||
| 67 | + sherpa_onnx::OfflineTtsConfig config; | ||
| 68 | + | ||
| 69 | + config.Register(&po); | ||
| 70 | + po.Read(argc, argv); | ||
| 71 | + | ||
| 72 | + if (po.NumArgs() == 0) { | ||
| 73 | + fprintf(stderr, "Error: Please provide the text to generate audio.\n\n"); | ||
| 74 | + po.PrintUsage(); | ||
| 75 | + exit(EXIT_FAILURE); | ||
| 76 | + } | ||
| 77 | + | ||
| 78 | + if (po.NumArgs() > 1) { | ||
| 79 | + fprintf(stderr, | ||
| 80 | + "Error: Accept only one positional argument. Please use single " | ||
| 81 | + "quotes to wrap your text\n"); | ||
| 82 | + po.PrintUsage(); | ||
| 83 | + exit(EXIT_FAILURE); | ||
| 84 | + } | ||
| 85 | + | ||
| 86 | + if (config.model.debug) { | ||
| 87 | + fprintf(stderr, "%s\n", config.model.ToString().c_str()); | ||
| 88 | + } | ||
| 89 | + | ||
| 90 | + if (!config.Validate()) { | ||
| 91 | + fprintf(stderr, "Errors in config!\n"); | ||
| 92 | + exit(EXIT_FAILURE); | ||
| 93 | + } | ||
| 94 | + | ||
| 95 | + if (prompt_text.empty() || prompt_audio.empty()) { | ||
| 96 | + fprintf(stderr, "Please provide both --prompt-text and --prompt-audio\n"); | ||
| 97 | + exit(EXIT_FAILURE); | ||
| 98 | + } | ||
| 99 | + | ||
| 100 | + sherpa_onnx::OfflineTts tts(config); | ||
| 101 | + | ||
| 102 | + int32_t sample_rate = -1; | ||
| 103 | + bool is_ok = false; | ||
| 104 | + const std::vector<float> prompt_samples = | ||
| 105 | + sherpa_onnx::ReadWave(prompt_audio, &sample_rate, &is_ok); | ||
| 106 | + | ||
| 107 | + if (!is_ok) { | ||
| 108 | + fprintf(stderr, "Failed to read '%s'\n", prompt_audio.c_str()); | ||
| 109 | + return -1; | ||
| 110 | + } | ||
| 111 | + | ||
| 112 | + const auto begin = std::chrono::steady_clock::now(); | ||
| 113 | + auto audio = tts.Generate(po.GetArg(1), prompt_text, prompt_samples, | ||
| 114 | + sample_rate, speed, num_steps, AudioCallback); | ||
| 115 | + const auto end = std::chrono::steady_clock::now(); | ||
| 116 | + | ||
| 117 | + if (audio.samples.empty()) { | ||
| 118 | + fprintf( | ||
| 119 | + stderr, | ||
| 120 | + "Error in generating audio. Please read previous error messages.\n"); | ||
| 121 | + exit(EXIT_FAILURE); | ||
| 122 | + } | ||
| 123 | + | ||
| 124 | + float elapsed_seconds = | ||
| 125 | + std::chrono::duration_cast<std::chrono::milliseconds>(end - begin) | ||
| 126 | + .count() / | ||
| 127 | + 1000.; | ||
| 128 | + float duration = audio.samples.size() / static_cast<float>(audio.sample_rate); | ||
| 129 | + | ||
| 130 | + float rtf = elapsed_seconds / duration; | ||
| 131 | + fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds); | ||
| 132 | + fprintf(stderr, "Audio duration: %.3f s\n", duration); | ||
| 133 | + fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds, | ||
| 134 | + duration, rtf); | ||
| 135 | + | ||
| 136 | + bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate, | ||
| 137 | + audio.samples.data(), audio.samples.size()); | ||
| 138 | + if (!ok) { | ||
| 139 | + fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str()); | ||
| 140 | + exit(EXIT_FAILURE); | ||
| 141 | + } | ||
| 142 | + | ||
| 143 | + fprintf(stderr, "The text is: %s.\n", po.GetArg(1).c_str()); | ||
| 144 | + fprintf(stderr, "Saved to %s successfully!\n", output_filename.c_str()); | ||
| 145 | + | ||
| 146 | + return 0; | ||
| 147 | +} |
| @@ -4,6 +4,9 @@ | @@ -4,6 +4,9 @@ | ||
| 4 | 4 | ||
| 5 | #include "sherpa-onnx/csrc/text-utils.h" | 5 | #include "sherpa-onnx/csrc/text-utils.h" |
| 6 | 6 | ||
| 7 | +#include <regex> | ||
| 8 | +#include <sstream> | ||
| 9 | + | ||
| 7 | #include "gtest/gtest.h" | 10 | #include "gtest/gtest.h" |
| 8 | 11 | ||
| 9 | namespace sherpa_onnx { | 12 | namespace sherpa_onnx { |
| @@ -55,7 +58,6 @@ TEST(RemoveInvalidUtf8Sequences, Case1) { | @@ -55,7 +58,6 @@ TEST(RemoveInvalidUtf8Sequences, Case1) { | ||
| 55 | EXPECT_EQ(s.size() + 4, v.size()); | 58 | EXPECT_EQ(s.size() + 4, v.size()); |
| 56 | } | 59 | } |
| 57 | 60 | ||
| 58 | - | ||
| 59 | // Tests for sanitizeUtf8 | 61 | // Tests for sanitizeUtf8 |
| 60 | TEST(RemoveInvalidUtf8Sequences, ValidUtf8StringPassesUnchanged) { | 62 | TEST(RemoveInvalidUtf8Sequences, ValidUtf8StringPassesUnchanged) { |
| 61 | std::string input = "Valid UTF-8 🌍"; | 63 | std::string input = "Valid UTF-8 🌍"; |
| @@ -82,7 +84,7 @@ TEST(RemoveInvalidUtf8Sequences, MultipleInvalidBytes) { | @@ -82,7 +84,7 @@ TEST(RemoveInvalidUtf8Sequences, MultipleInvalidBytes) { | ||
| 82 | 84 | ||
| 83 | TEST(RemoveInvalidUtf8Sequences, BreakingCase_SpaceFollowedByInvalidByte) { | 85 | TEST(RemoveInvalidUtf8Sequences, BreakingCase_SpaceFollowedByInvalidByte) { |
| 84 | std::string input = "\x20\xC4"; // Space followed by an invalid byte | 86 | std::string input = "\x20\xC4"; // Space followed by an invalid byte |
| 85 | - std::string expected = " "; // 0xC4 removed | 87 | + std::string expected = " "; // 0xC4 removed |
| 86 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); | 88 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); |
| 87 | } | 89 | } |
| 88 | 90 | ||
| @@ -99,19 +101,19 @@ TEST(RemoveInvalidUtf8Sequences, MixedValidAndInvalidBytes) { | @@ -99,19 +101,19 @@ TEST(RemoveInvalidUtf8Sequences, MixedValidAndInvalidBytes) { | ||
| 99 | 101 | ||
| 100 | TEST(RemoveInvalidUtf8Sequences, SpaceFollowedByInvalidByte) { | 102 | TEST(RemoveInvalidUtf8Sequences, SpaceFollowedByInvalidByte) { |
| 101 | std::string input = "\x20\xC4"; // Space (0x20) followed by invalid (0xC4) | 103 | std::string input = "\x20\xC4"; // Space (0x20) followed by invalid (0xC4) |
| 102 | - std::string expected = " "; // Space remains, 0xC4 is removed | 104 | + std::string expected = " "; // Space remains, 0xC4 is removed |
| 103 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); | 105 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); |
| 104 | } | 106 | } |
| 105 | 107 | ||
| 106 | TEST(RemoveInvalidUtf8Sequences, RemoveTruncatedC4) { | 108 | TEST(RemoveInvalidUtf8Sequences, RemoveTruncatedC4) { |
| 107 | std::string input = "Hello \xc4 world"; // Invalid `0xC4` | 109 | std::string input = "Hello \xc4 world"; // Invalid `0xC4` |
| 108 | - std::string expected = "Hello world"; // `0xC4` should be removed | 110 | + std::string expected = "Hello world"; // `0xC4` should be removed |
| 109 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); | 111 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); |
| 110 | } | 112 | } |
| 111 | 113 | ||
| 112 | TEST(RemoveInvalidUtf8Sequences, SpaceFollowedByInvalidByte_Breaking) { | 114 | TEST(RemoveInvalidUtf8Sequences, SpaceFollowedByInvalidByte_Breaking) { |
| 113 | std::string input = "\x20\xc4"; // Space followed by invalid `0xc4` | 115 | std::string input = "\x20\xc4"; // Space followed by invalid `0xc4` |
| 114 | - std::string expected = " "; // `0xc4` should be removed, space remains | 116 | + std::string expected = " "; // `0xc4` should be removed, space remains |
| 115 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); | 117 | EXPECT_EQ(RemoveInvalidUtf8Sequences(input), expected); |
| 116 | } | 118 | } |
| 117 | 119 |
| @@ -724,4 +724,62 @@ std::vector<std::string> SplitString(const std::string &s, int32_t chunk_size) { | @@ -724,4 +724,62 @@ std::vector<std::string> SplitString(const std::string &s, int32_t chunk_size) { | ||
| 724 | return ans; | 724 | return ans; |
| 725 | } | 725 | } |
| 726 | 726 | ||
| 727 | +std::u32string Utf8ToUtf32(const std::string &str) { | ||
| 728 | + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; | ||
| 729 | + return conv.from_bytes(str); | ||
| 730 | +} | ||
| 731 | + | ||
| 732 | +std::string Utf32ToUtf8(const std::u32string &str) { | ||
| 733 | + std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv; | ||
| 734 | + return conv.to_bytes(str); | ||
| 735 | +} | ||
| 736 | + | ||
| 737 | +// Helper: Convert ASCII chars in a std::string to uppercase (leaves non-ASCII | ||
| 738 | +// unchanged) | ||
| 739 | +std::string ToUpperAscii(const std::string &str) { | ||
| 740 | + std::string out = str; | ||
| 741 | + for (char &c : out) { | ||
| 742 | + unsigned char uc = static_cast<unsigned char>(c); | ||
| 743 | + if (uc >= 'a' && uc <= 'z') { | ||
| 744 | + c = static_cast<char>(uc - 'a' + 'A'); | ||
| 745 | + } | ||
| 746 | + } | ||
| 747 | + return out; | ||
| 748 | +} | ||
| 749 | + | ||
| 750 | +// Helper: Convert ASCII chars in a std::string to lowercase (leaves non-ASCII | ||
| 751 | +// unchanged) | ||
| 752 | +std::string ToLowerAscii(const std::string &str) { | ||
| 753 | + std::string out = str; | ||
| 754 | + for (char &c : out) { | ||
| 755 | + unsigned char uc = static_cast<unsigned char>(c); | ||
| 756 | + if (uc >= 'A' && uc <= 'Z') { | ||
| 757 | + c = static_cast<char>(uc - 'A' + 'a'); | ||
| 758 | + } | ||
| 759 | + } | ||
| 760 | + return out; | ||
| 761 | +} | ||
| 762 | + | ||
| 763 | +// Detect if a codepoint is a CJK character | ||
| 764 | +bool IsCJK(char32_t cp) { | ||
| 765 | + return (cp >= 0x1100 && cp <= 0x11FF) || (cp >= 0x2E80 && cp <= 0xA4CF) || | ||
| 766 | + (cp >= 0xA840 && cp <= 0xD7AF) || (cp >= 0xF900 && cp <= 0xFAFF) || | ||
| 767 | + (cp >= 0xFE30 && cp <= 0xFE4F) || (cp >= 0xFF65 && cp <= 0xFFDC) || | ||
| 768 | + (cp >= 0x20000 && cp <= 0x2FFFF); | ||
| 769 | +} | ||
| 770 | + | ||
| 771 | +bool ContainsCJK(const std::string &text) { | ||
| 772 | + std::u32string utf32_text = Utf8ToUtf32(text); | ||
| 773 | + return ContainsCJK(utf32_text); | ||
| 774 | +} | ||
| 775 | + | ||
| 776 | +bool ContainsCJK(const std::u32string &text) { | ||
| 777 | + for (char32_t cp : text) { | ||
| 778 | + if (IsCJK(cp)) { | ||
| 779 | + return true; | ||
| 780 | + } | ||
| 781 | + } | ||
| 782 | + return false; | ||
| 783 | +} | ||
| 784 | + | ||
| 727 | } // namespace sherpa_onnx | 785 | } // namespace sherpa_onnx |
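A small usage sketch for the helpers added above (illustration only; the function name is made up here, and how the ZipVoice frontend actually routes text between the pinyin and espeak paths is not shown in this hunk):

```cpp
#include <string>

#include "sherpa-onnx/csrc/text-utils.h"

void InspectText(const std::string &text) {
  if (sherpa_onnx::ContainsCJK(text)) {
    // Work on code points rather than bytes for the CJK portion
    std::u32string u32 = sherpa_onnx::Utf8ToUtf32(text);
    // ... per-character handling would go here ...
    std::string round_trip = sherpa_onnx::Utf32ToUtf8(u32);
    (void)round_trip;
  } else {
    // Pure ASCII text can be case-normalized directly
    std::string lower = sherpa_onnx::ToLowerAscii(text);
    (void)lower;
  }
}
```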
| @@ -149,6 +149,29 @@ bool EndsWith(const std::string &haystack, const std::string &needle); | @@ -149,6 +149,29 @@ bool EndsWith(const std::string &haystack, const std::string &needle); | ||
| 149 | 149 | ||
| 150 | std::vector<std::string> SplitString(const std::string &s, int32_t chunk_size); | 150 | std::vector<std::string> SplitString(const std::string &s, int32_t chunk_size); |
| 151 | 151 | ||
| 152 | +// Converts a UTF-8 std::string to a UTF-32 std::u32string | ||
| 153 | +std::u32string Utf8ToUtf32(const std::string &str); | ||
| 154 | + | ||
| 155 | +// Converts a UTF-32 std::u32string to a UTF-8 std::string | ||
| 156 | +std::string Utf32ToUtf8(const std::u32string &str); | ||
| 157 | + | ||
| 158 | +// Helper: Convert ASCII chars in a std::string to uppercase (leaves non-ASCII | ||
| 159 | +// unchanged) | ||
| 160 | +std::string ToUpperAscii(const std::string &str); | ||
| 161 | + | ||
| 162 | +// Helper: Convert ASCII chars in a std::string to lowercase (leaves non-ASCII | ||
| 163 | +// unchanged) | ||
| 164 | +std::string ToLowerAscii(const std::string &str); | ||
| 165 | + | ||
| 166 | +// Detect if a codepoint is a CJK character | ||
| 167 | +bool IsCJK(char32_t cp); | ||
| 168 | + | ||
| 169 | +bool ContainsCJK(const std::string &text); | ||
| 170 | + | ||
| 171 | +bool ContainsCJK(const std::u32string &text); | ||
| 172 | + | ||
| 173 | +bool StringToBool(const std::string &s); | ||
| 174 | + | ||
| 152 | } // namespace sherpa_onnx | 175 | } // namespace sherpa_onnx |
| 153 | 176 | ||
| 154 | #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ | 177 | #endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_ |
| @@ -74,7 +74,18 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, | @@ -74,7 +74,18 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, | ||
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | std::unique_ptr<Vocoder> Vocoder::Create(const OfflineTtsModelConfig &config) { | 76 | std::unique_ptr<Vocoder> Vocoder::Create(const OfflineTtsModelConfig &config) { |
| 77 | - auto buffer = ReadFile(config.matcha.vocoder); | 77 | + std::vector<char> buffer; |
| 78 | + if (!config.matcha.vocoder.empty()) { | ||
| 79 | + SHERPA_ONNX_LOGE("Using matcha vocoder: %s", config.matcha.vocoder.c_str()); | ||
| 80 | + buffer = ReadFile(config.matcha.vocoder); | ||
| 81 | + } else if (!config.zipvoice.vocoder.empty()) { | ||
| 82 | + SHERPA_ONNX_LOGE("Using zipvoice vocoder: %s", | ||
| 83 | + config.zipvoice.vocoder.c_str()); | ||
| 84 | + buffer = ReadFile(config.zipvoice.vocoder); | ||
| 85 | + } else { | ||
| 86 | + SHERPA_ONNX_LOGE("No vocoder model provided in the config!"); | ||
| 87 | + exit(-1); | ||
| 88 | + } | ||
| 78 | auto model_type = GetModelType(buffer.data(), buffer.size(), config.debug); | 89 | auto model_type = GetModelType(buffer.data(), buffer.size(), config.debug); |
| 79 | 90 | ||
| 80 | switch (model_type) { | 91 | switch (model_type) { |
| @@ -94,7 +105,19 @@ std::unique_ptr<Vocoder> Vocoder::Create(const OfflineTtsModelConfig &config) { | @@ -94,7 +105,19 @@ std::unique_ptr<Vocoder> Vocoder::Create(const OfflineTtsModelConfig &config) { | ||
| 94 | template <typename Manager> | 105 | template <typename Manager> |
| 95 | std::unique_ptr<Vocoder> Vocoder::Create(Manager *mgr, | 106 | std::unique_ptr<Vocoder> Vocoder::Create(Manager *mgr, |
| 96 | const OfflineTtsModelConfig &config) { | 107 | const OfflineTtsModelConfig &config) { |
| 97 | - auto buffer = ReadFile(mgr, config.matcha.vocoder); | 108 | + std::vector<char> buffer; |
| 109 | + if (!config.matcha.vocoder.empty()) { | ||
| 110 | + SHERPA_ONNX_LOGE("Using matcha vocoder: %s", config.matcha.vocoder.c_str()); | ||
| 111 | + buffer = ReadFile(mgr, config.matcha.vocoder); | ||
| 112 | + } else if (!config.zipvoice.vocoder.empty()) { | ||
| 113 | + SHERPA_ONNX_LOGE("Using zipvoice vocoder: %s", | ||
| 114 | + config.zipvoice.vocoder.c_str()); | ||
| 115 | + buffer = ReadFile(mgr, config.zipvoice.vocoder); | ||
| 116 | + } else { | ||
| 117 | + SHERPA_ONNX_LOGE("No vocoder model provided in the config!"); | ||
| 118 | + return nullptr; | ||
| 119 | + } | ||
| 120 | + | ||
| 98 | auto model_type = GetModelType(buffer.data(), buffer.size(), config.debug); | 121 | auto model_type = GetModelType(buffer.data(), buffer.size(), config.debug); |
| 99 | 122 | ||
| 100 | switch (model_type) { | 123 | switch (model_type) { |
| @@ -42,8 +42,16 @@ class VocosVocoder::Impl { | @@ -42,8 +42,16 @@ class VocosVocoder::Impl { | ||
| 42 | env_(ORT_LOGGING_LEVEL_ERROR), | 42 | env_(ORT_LOGGING_LEVEL_ERROR), |
| 43 | sess_opts_(GetSessionOptions(config.num_threads, config.provider)), | 43 | sess_opts_(GetSessionOptions(config.num_threads, config.provider)), |
| 44 | allocator_{} { | 44 | allocator_{} { |
| 45 | - auto buf = ReadFile(config.matcha.vocoder); | ||
| 46 | - Init(buf.data(), buf.size()); | 45 | + std::vector<char> buffer; |
| 46 | + if (!config.matcha.vocoder.empty()) { | ||
| 47 | + buffer = ReadFile(config.matcha.vocoder); | ||
| 48 | + } else if (!config.zipvoice.vocoder.empty()) { | ||
| 49 | + buffer = ReadFile(config.zipvoice.vocoder); | ||
| 50 | + } else { | ||
| 51 | + SHERPA_ONNX_LOGE("No vocoder model provided in the config!"); | ||
| 52 | + exit(-1); | ||
| 53 | + } | ||
| 54 | + Init(buffer.data(), buffer.size()); | ||
| 47 | } | 55 | } |
| 48 | 56 | ||
| 49 | template <typename Manager> | 57 | template <typename Manager> |
| @@ -52,8 +60,16 @@ class VocosVocoder::Impl { | @@ -52,8 +60,16 @@ class VocosVocoder::Impl { | ||
| 52 | env_(ORT_LOGGING_LEVEL_ERROR), | 60 | env_(ORT_LOGGING_LEVEL_ERROR), |
| 53 | sess_opts_(GetSessionOptions(config.num_threads, config.provider)), | 61 | sess_opts_(GetSessionOptions(config.num_threads, config.provider)), |
| 54 | allocator_{} { | 62 | allocator_{} { |
| 55 | - auto buf = ReadFile(mgr, config.matcha.vocoder); | ||
| 56 | - Init(buf.data(), buf.size()); | 63 | + std::vector<char> buffer; |
| 64 | + if (!config.matcha.vocoder.empty()) { | ||
| 65 | + buffer = ReadFile(mgr, config.matcha.vocoder); | ||
| 66 | + } else if (!config.zipvoice.vocoder.empty()) { | ||
| 67 | + buffer = ReadFile(mgr, config.zipvoice.vocoder); | ||
| 68 | + } else { | ||
| 69 | + SHERPA_ONNX_LOGE("No vocoder model provided in the config!"); | ||
| 70 | + exit(-1); | ||
| 71 | + } | ||
| 72 | + Init(buffer.data(), buffer.size()); | ||
| 57 | } | 73 | } |
| 58 | 74 | ||
| 59 | std::vector<float> Run(Ort::Value mel) const { | 75 | std::vector<float> Run(Ort::Value mel) const { |
| @@ -72,6 +72,7 @@ if(SHERPA_ONNX_ENABLE_TTS) | @@ -72,6 +72,7 @@ if(SHERPA_ONNX_ENABLE_TTS) | ||
| 72 | offline-tts-matcha-model-config.cc | 72 | offline-tts-matcha-model-config.cc |
| 73 | offline-tts-model-config.cc | 73 | offline-tts-model-config.cc |
| 74 | offline-tts-vits-model-config.cc | 74 | offline-tts-vits-model-config.cc |
| 75 | + offline-tts-zipvoice-model-config.cc | ||
| 75 | offline-tts.cc | 76 | offline-tts.cc |
| 76 | ) | 77 | ) |
| 77 | endif() | 78 | endif() |
| @@ -11,6 +11,7 @@ | @@ -11,6 +11,7 @@ | ||
| 11 | #include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h" | 11 | #include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h" |
| 12 | #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" | 12 | #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" |
| 13 | #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" | 13 | #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" |
| 14 | +#include "sherpa-onnx/python/csrc/offline-tts-zipvoice-model-config.h" | ||
| 14 | 15 | ||
| 15 | namespace sherpa_onnx { | 16 | namespace sherpa_onnx { |
| 16 | 17 | ||
| @@ -18,6 +19,7 @@ void PybindOfflineTtsModelConfig(py::module *m) { | @@ -18,6 +19,7 @@ void PybindOfflineTtsModelConfig(py::module *m) { | ||
| 18 | PybindOfflineTtsVitsModelConfig(m); | 19 | PybindOfflineTtsVitsModelConfig(m); |
| 19 | PybindOfflineTtsMatchaModelConfig(m); | 20 | PybindOfflineTtsMatchaModelConfig(m); |
| 20 | PybindOfflineTtsKokoroModelConfig(m); | 21 | PybindOfflineTtsKokoroModelConfig(m); |
| 22 | + PybindOfflineTtsZipvoiceModelConfig(m); | ||
| 21 | PybindOfflineTtsKittenModelConfig(m); | 23 | PybindOfflineTtsKittenModelConfig(m); |
| 22 | 24 | ||
| 23 | using PyClass = OfflineTtsModelConfig; | 25 | using PyClass = OfflineTtsModelConfig; |
| @@ -27,17 +29,20 @@ void PybindOfflineTtsModelConfig(py::module *m) { | @@ -27,17 +29,20 @@ void PybindOfflineTtsModelConfig(py::module *m) { | ||
| 27 | .def(py::init<const OfflineTtsVitsModelConfig &, | 29 | .def(py::init<const OfflineTtsVitsModelConfig &, |
| 28 | const OfflineTtsMatchaModelConfig &, | 30 | const OfflineTtsMatchaModelConfig &, |
| 29 | const OfflineTtsKokoroModelConfig &, | 31 | const OfflineTtsKokoroModelConfig &, |
| 32 | + const OfflineTtsZipvoiceModelConfig &, | ||
| 30 | const OfflineTtsKittenModelConfig &, int32_t, bool, | 33 | const OfflineTtsKittenModelConfig &, int32_t, bool, |
| 31 | const std::string &>(), | 34 | const std::string &>(), |
| 32 | py::arg("vits") = OfflineTtsVitsModelConfig{}, | 35 | py::arg("vits") = OfflineTtsVitsModelConfig{}, |
| 33 | py::arg("matcha") = OfflineTtsMatchaModelConfig{}, | 36 | py::arg("matcha") = OfflineTtsMatchaModelConfig{}, |
| 34 | py::arg("kokoro") = OfflineTtsKokoroModelConfig{}, | 37 | py::arg("kokoro") = OfflineTtsKokoroModelConfig{}, |
| 38 | + py::arg("zipvoice") = OfflineTtsZipvoiceModelConfig{}, | ||
| 35 | py::arg("kitten") = OfflineTtsKittenModelConfig{}, | 39 | py::arg("kitten") = OfflineTtsKittenModelConfig{}, |
| 36 | py::arg("num_threads") = 1, py::arg("debug") = false, | 40 | py::arg("num_threads") = 1, py::arg("debug") = false, |
| 37 | py::arg("provider") = "cpu") | 41 | py::arg("provider") = "cpu") |
| 38 | .def_readwrite("vits", &PyClass::vits) | 42 | .def_readwrite("vits", &PyClass::vits) |
| 39 | .def_readwrite("matcha", &PyClass::matcha) | 43 | .def_readwrite("matcha", &PyClass::matcha) |
| 40 | .def_readwrite("kokoro", &PyClass::kokoro) | 44 | .def_readwrite("kokoro", &PyClass::kokoro) |
| 45 | + .def_readwrite("zipvoice", &PyClass::zipvoice) | ||
| 41 | .def_readwrite("kitten", &PyClass::kitten) | 46 | .def_readwrite("kitten", &PyClass::kitten) |
| 42 | .def_readwrite("num_threads", &PyClass::num_threads) | 47 | .def_readwrite("num_threads", &PyClass::num_threads) |
| 43 | .def_readwrite("debug", &PyClass::debug) | 48 | .def_readwrite("debug", &PyClass::debug) |
| 1 | +// sherpa-onnx/python/csrc/offline-tts-zipvoice-model-config.cc | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#include "sherpa-onnx/python/csrc/offline-tts-zipvoice-model-config.h" | ||
| 6 | + | ||
| 7 | +#include <string> | ||
| 8 | + | ||
| 9 | +#include "sherpa-onnx/csrc/offline-tts-zipvoice-model-config.h" | ||
| 10 | + | ||
| 11 | +namespace sherpa_onnx { | ||
| 12 | + | ||
| 13 | +void PybindOfflineTtsZipvoiceModelConfig(py::module *m) { | ||
| 14 | + using PyClass = OfflineTtsZipvoiceModelConfig; | ||
| 15 | + | ||
| 16 | + py::class_<PyClass>(*m, "OfflineTtsZipvoiceModelConfig") | ||
| 17 | + .def(py::init<>()) | ||
| 18 | + .def(py::init<const std::string &, const std::string &, | ||
| 19 | + const std::string &, const std::string &, | ||
| 20 | + const std::string &, const std::string &, float, float, | ||
| 21 | + float, float>(), | ||
| 22 | + py::arg("tokens"), py::arg("text_model"), | ||
| 23 | + py::arg("flow_matching_model"), py::arg("vocoder"), | ||
| 24 | + py::arg("data_dir") = "", py::arg("pinyin_dict") = "", | ||
| 25 | + py::arg("feat_scale") = 0.1, py::arg("t_shift") = 0.5, | ||
| 26 | + py::arg("target_rms") = 0.1, py::arg("guidance_scale") = 1.0) | ||
| 27 | + .def_readwrite("tokens", &PyClass::tokens) | ||
| 28 | + .def_readwrite("text_model", &PyClass::text_model) | ||
| 29 | + .def_readwrite("flow_matching_model", &PyClass::flow_matching_model) | ||
| 30 | + .def_readwrite("vocoder", &PyClass::vocoder) | ||
| 31 | + .def_readwrite("data_dir", &PyClass::data_dir) | ||
| 32 | + .def_readwrite("pinyin_dict", &PyClass::pinyin_dict) | ||
| 33 | + .def_readwrite("feat_scale", &PyClass::feat_scale) | ||
| 34 | + .def_readwrite("t_shift", &PyClass::t_shift) | ||
| 35 | + .def_readwrite("target_rms", &PyClass::target_rms) | ||
| 36 | + .def_readwrite("guidance_scale", &PyClass::guidance_scale) | ||
| 37 | + .def("__str__", &PyClass::ToString) | ||
| 38 | + .def("validate", &PyClass::Validate); | ||
| 39 | +} | ||
| 40 | + | ||
| 41 | +} // namespace sherpa_onnx |
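The constructor bound above takes four required model paths plus optional parameters whose defaults are given in the `py::arg` list. A construction sketch from Python (paths are placeholders; with non-existent files `validate()` is expected to fail):

```python
import sherpa_onnx

zipvoice = sherpa_onnx.OfflineTtsZipvoiceModelConfig(
    tokens="tokens.txt",                       # required, placeholder path
    text_model="text_model.onnx",              # required, placeholder path
    flow_matching_model="flow_matching.onnx",  # required, placeholder path
    vocoder="vocos.onnx",                      # required, placeholder path
    data_dir="",            # optional, defaults to ""
    pinyin_dict="",         # optional, defaults to ""
    feat_scale=0.1,         # default
    t_shift=0.5,            # default
    target_rms=0.1,         # default
    guidance_scale=1.0,     # default
)
print(zipvoice)           # __str__ is bound to ToString()
ok = zipvoice.validate()  # bound to Validate(); checks the configured files
```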
| 1 | +// sherpa-onnx/python/csrc/offline-tts-zipvoice-model-config.h | ||
| 2 | +// | ||
| 3 | +// Copyright (c) 2025 Xiaomi Corporation | ||
| 4 | + | ||
| 5 | +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_CONFIG_H_ | ||
| 6 | +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_CONFIG_H_ | ||
| 7 | + | ||
| 8 | +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" | ||
| 9 | + | ||
| 10 | +namespace sherpa_onnx { | ||
| 11 | + | ||
| 12 | +void PybindOfflineTtsZipvoiceModelConfig(py::module *m); | ||
| 13 | + | ||
| 14 | +}  // namespace sherpa_onnx | ||
| 15 | + | ||
| 16 | +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_ZIPVOICE_MODEL_CONFIG_H_ |
| @@ -84,6 +84,41 @@ void PybindOfflineTts(py::module *m) { | @@ -84,6 +84,41 @@ void PybindOfflineTts(py::module *m) { | ||
| 84 | }, | 84 | }, |
| 85 | py::arg("text"), py::arg("sid") = 0, py::arg("speed") = 1.0, | 85 | py::arg("text"), py::arg("sid") = 0, py::arg("speed") = 1.0, |
| 86 | py::arg("callback") = py::none(), | 86 | py::arg("callback") = py::none(), |
| 87 | + py::call_guard<py::gil_scoped_release>()) | ||
| 88 | + .def( | ||
| 89 | + "generate", | ||
| 90 | + [](const PyClass &self, const std::string &text, | ||
| 91 | + const std::string &prompt_text, | ||
| 92 | + const std::vector<float> &prompt_samples, int32_t sample_rate, | ||
| 93 | + float speed, int32_t num_steps, | ||
| 94 | + std::function<int32_t(py::array_t<float>, float)> callback) | ||
| 95 | + -> GeneratedAudio { | ||
| 96 | + if (!callback) { | ||
| 97 | + return self.Generate(text, prompt_text, prompt_samples, | ||
| 98 | + sample_rate, speed, num_steps); | ||
| 99 | + } | ||
| 100 | + | ||
| 101 | + std::function<int32_t(const float *, int32_t, float)> | ||
| 102 | + callback_wrapper = [callback](const float *samples, int32_t n, | ||
| 103 | + float progress) { | ||
| 104 | + // CAUTION(fangjun): we have to copy samples since it is | ||
| 105 | + // freed once the call back returns. | ||
| 106 | + | ||
| 107 | + pybind11::gil_scoped_acquire acquire; | ||
| 108 | + | ||
| 109 | + pybind11::array_t<float> array(n); | ||
| 110 | + py::buffer_info buf = array.request(); | ||
| 111 | + auto p = static_cast<float *>(buf.ptr); | ||
| 112 | + std::copy(samples, samples + n, p); | ||
| 113 | + return callback(array, progress); | ||
| 114 | + }; | ||
| 115 | + | ||
| 116 | + return self.Generate(text, prompt_text, prompt_samples, sample_rate, | ||
| 117 | + speed, num_steps, callback_wrapper); | ||
| 118 | + }, | ||
| 119 | + py::arg("text"), py::arg("prompt_text"), py::arg("prompt_samples"), | ||
| 120 | + py::arg("sample_rate"), py::arg("speed") = 1.0, | ||
| 121 | + py::arg("num_steps") = 4, py::arg("callback") = py::none(), | ||
| 87 | py::call_guard<py::gil_scoped_release>()); | 122 | py::call_guard<py::gil_scoped_release>()); |
| 88 | } | 123 | } |
| 89 | 124 |
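The second `generate` overload bound above is the ZipVoice-style prompted path: alongside the text to synthesize, it takes the transcript of a prompt recording, the prompt samples, and their sample rate, and can stream audio chunks through an optional callback. A usage sketch under the assumption that `tts` is an `OfflineTts` already configured with ZipVoice models (soundfile is used here only to load the prompt wav):

```python
import sherpa_onnx
import soundfile as sf

# Assumption: tts was created earlier as sherpa_onnx.OfflineTts(...) with a
# model config pointing at ZipVoice models; prompt.wav is assumed to be mono.
prompt_samples, prompt_sample_rate = sf.read("prompt.wav", dtype="float32")

def on_audio(samples, progress):
    # samples is a copy (the binding above copies before the buffer is freed);
    # in other sherpa-onnx TTS callbacks returning 0 stops generation and 1
    # continues, which is assumed to hold here as well.
    print(f"got {len(samples)} samples, progress={progress:.2f}")
    return 1

audio = tts.generate(
    text="Text to synthesize in the prompt speaker's voice.",
    prompt_text="Transcript of prompt.wav.",
    prompt_samples=prompt_samples,
    sample_rate=prompt_sample_rate,
    speed=1.0,    # default
    num_steps=4,  # default number of flow-matching steps
    callback=on_audio,
)
```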
| @@ -49,6 +49,7 @@ from sherpa_onnx.lib._sherpa_onnx import ( | @@ -49,6 +49,7 @@ from sherpa_onnx.lib._sherpa_onnx import ( | ||
| 49 | OfflineTtsMatchaModelConfig, | 49 | OfflineTtsMatchaModelConfig, |
| 50 | OfflineTtsModelConfig, | 50 | OfflineTtsModelConfig, |
| 51 | OfflineTtsVitsModelConfig, | 51 | OfflineTtsVitsModelConfig, |
| 52 | + OfflineTtsZipvoiceModelConfig, | ||
| 52 | OfflineWenetCtcModelConfig, | 53 | OfflineWenetCtcModelConfig, |
| 53 | OfflineWhisperModelConfig, | 54 | OfflineWhisperModelConfig, |
| 54 | OfflineZipformerAudioTaggingModelConfig, | 55 | OfflineZipformerAudioTaggingModelConfig, |