Add HarmonyOS examples for MatchaTTS. (#1678)
Committed by GitHub
Showing 9 changed files with 140 additions and 14 deletions.
@@ -1,7 +1,7 @@
 /**
  * Use these variables when you tailor your ArkTS code. They must be of the const type.
  */
-export const HAR_VERSION = '1.10.35';
+export const HAR_VERSION = '1.10.37';
 export const BUILD_MODE_NAME = 'debug';
 export const DEBUG = true;
 export const TARGET_NAME = 'default';
@@ -1,11 +1,6 @@
 export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so";

-export { CircularBuffer,
-         SileroVadConfig,
-         SpeechSegment,
-         Vad,
-         VadConfig,
-} from './src/main/ets/components/Vad';
+export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad';


 export { Samples,
@@ -36,7 +31,8 @@ export { OnlineStream,
   OnlineRecognizer,
 } from './src/main/ets/components/StreamingAsr';

-export { OfflineTtsVitsModelConfig,
+export { OfflineTtsMatchaModelConfig,
+         OfflineTtsVitsModelConfig,
   OfflineTtsModelConfig,
   OfflineTtsConfig,
   OfflineTts,
@@ -17,8 +17,20 @@ export class OfflineTtsVitsModelConfig {
   public lengthScale: number = 1.0;
 }

+export class OfflineTtsMatchaModelConfig {
+  public acousticModel: string = '';
+  public vocoder: string = '';
+  public lexicon: string = '';
+  public tokens: string = '';
+  public dataDir: string = '';
+  public dictDir: string = '';
+  public noiseScale: number = 0.667;
+  public lengthScale: number = 1.0;
+}
+
 export class OfflineTtsModelConfig {
   public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig();
+  public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig();
   public numThreads: number = 1;
   public debug: boolean = false;
   public provider: string = 'cpu';
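For orientation, here is a minimal ArkTS sketch (not part of this diff) of filling the new Matcha fields by hand. It only uses the classes and fields shown above; the import path assumes the library is consumed as the sherpa_onnx HAR, and the model file paths are placeholders.

// Sketch: configure a Matcha acoustic model + vocoder instead of VITS.
// All file paths are placeholders; point them at files copied from
// rawfile to disk as required by your setup.
import { OfflineTtsMatchaModelConfig, OfflineTtsModelConfig } from 'sherpa_onnx';

const matcha = new OfflineTtsMatchaModelConfig();
matcha.acousticModel = 'matcha-icefall-zh-baker/model-steps-3.onnx';
matcha.vocoder = 'matcha-icefall-zh-baker/hifigan_v2.onnx';
matcha.lexicon = 'matcha-icefall-zh-baker/lexicon.txt';
matcha.tokens = 'matcha-icefall-zh-baker/tokens.txt';
matcha.dictDir = 'dict';          // jieba dict directory, copied to disk
matcha.lengthScale = 1.0;         // > 1.0 produces slower speech

const modelConfig = new OfflineTtsModelConfig();
modelConfig.matcha = matcha;      // leave modelConfig.vits at its defaults
modelConfig.numThreads = 2;
modelConfig.debug = true;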
@@ -73,7 +73,16 @@ function initTts(context: Context): OfflineTts {
   // for details

   let modelDir = '';
+
+  // for VITS begin
   let modelName = '';
+  // for VITS end
+
+  // for Matcha begin
+  let acousticModelName = '';
+  let vocoder = '';
+  // for Matcha end
+
   let ruleFsts = '';
   let ruleFars = '';
   let lexicon = '';
@@ -134,15 +143,47 @@ function initTts(context: Context): OfflineTts {
   // dictDir = 'dict';
   // ruleFsts = `date.fst,phone.fst,number.fst`;

+  // Example 8
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
+  // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
+  // modelDir = 'matcha-icefall-zh-baker'
+  // acousticModelName = 'model-steps-3.onnx'
+  // vocoder = 'hifigan_v2.onnx'
+  // lexicon = 'lexicon.txt'
+  // dictDir = 'dict';
+  // ruleFsts = `date.fst,phone.fst,number.fst`;
+
+  // Example 9
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
+  // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
+  // modelDir = 'matcha-icefall-en_US-ljspeech'
+  // acousticModelName = 'model-steps-3.onnx'
+  // vocoder = 'hifigan_v2.onnx'
+  // dataDir = 'espeak-ng-data';
+
   // ============================================================
   // Please don't change the remaining part of this function
   // ============================================================

-  if (modelName == '') {
+  if (modelName == '' && acousticModelName == '' && vocoder == '') {
     throw new Error('You are supposed to select a model by changing the code before you run the app');
   }

+  if (modelName != '' && acousticModelName != '') {
+    throw new Error('Please select either VITS or Matcha, not both');
+  }
+
+  if (acousticModelName != '' && vocoder == '') {
+    throw new Error('Please provide a vocoder for Matcha TTS models');
+  }
+
+  if (modelName != '') {
     modelName = modelDir + '/' + modelName;
+  }
+
+  if (acousticModelName != '') {
+    acousticModelName = modelDir + '/' + acousticModelName;
+  }

   if (ruleFsts != '') {
     let fsts = ruleFsts.split(',')
@@ -186,6 +227,14 @@ function initTts(context: Context): OfflineTts {
   config.model.vits.tokens = tokens;
   config.model.vits.dataDir = dataDir;
   config.model.vits.dictDir = dictDir;
+
+  config.model.matcha.acousticModel = acousticModelName;
+  config.model.matcha.vocoder = vocoder;
+  config.model.matcha.lexicon = lexicon;
+  config.model.matcha.tokens = tokens;
+  config.model.matcha.dataDir = dataDir;
+  config.model.matcha.dictDir = dictDir;
+
   config.model.numThreads = 2;
   config.model.debug = true;
   config.ruleFsts = ruleFsts;
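To contrast the two new Matcha examples (a sketch, not part of this diff): the Chinese model in Example 8 uses the jieba front end, so it needs lexicon and dictDir (plus optional rule FSTs), while the English model in Example 9 uses the espeak-ng front end, so it only needs dataDir. The interface and constant names below are hypothetical; the values mirror the commented-out examples in initTts().

// Sketch: which fields each Matcha example sets before the config is built.
interface MatchaExample {
  modelDir: string;
  acousticModelName: string;
  vocoder: string;
  lexicon?: string;
  dictDir?: string;
  dataDir?: string;
  ruleFsts?: string;
}

// Example 8: matcha-icefall-zh-baker, jieba front end (lexicon + dictDir)
const zhBaker: MatchaExample = {
  modelDir: 'matcha-icefall-zh-baker',
  acousticModelName: 'model-steps-3.onnx',
  vocoder: 'hifigan_v2.onnx',
  lexicon: 'lexicon.txt',
  dictDir: 'dict',
  ruleFsts: 'date.fst,phone.fst,number.fst',
};

// Example 9: matcha-icefall-en_US-ljspeech, espeak-ng front end (dataDir only)
const ljspeech: MatchaExample = {
  modelDir: 'matcha-icefall-en_US-ljspeech',
  acousticModelName: 'model-steps-3.onnx',
  vocoder: 'hifigan_v2.onnx',
  dataDir: 'espeak-ng-data',
};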
@@ -2098,7 +2098,7 @@ SherpaOnnxCreateSpeakerEmbeddingExtractorOHOS(
 }

 #if SHERPA_ONNX_ENABLE_TTS == 1
-SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS(
+const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS(
     const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr) {
   if (!mgr) {
     return SherpaOnnxCreateOfflineTts(config);
@@ -1618,7 +1618,7 @@ SherpaOnnxCreateVoiceActivityDetectorOHOS(
     const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds,
     NativeResourceManager *mgr);

-SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS(
+SHERPA_ONNX_API const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS(
     const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr);

 SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingExtractor *
@@ -6,12 +6,23 @@

 #include <fstream>
 #include <regex>  // NOLINT
+#include <strstream>
 #include <unordered_set>
 #include <utility>

+#if __ANDROID_API__ >= 9
+#include "android/asset_manager.h"
+#include "android/asset_manager_jni.h"
+#endif
+
+#if __OHOS__
+#include "rawfile/raw_file_manager.h"
+#endif
+
 #include "cppjieba/Jieba.hpp"
 #include "sherpa-onnx/csrc/file-utils.h"
 #include "sherpa-onnx/csrc/macros.h"
+#include "sherpa-onnx/csrc/onnx-utils.h"
 #include "sherpa-onnx/csrc/symbol-table.h"
 #include "sherpa-onnx/csrc/text-utils.h"

@@ -56,6 +67,39 @@ class JiebaLexicon::Impl {
     }
   }

+  template <typename Manager>
+  Impl(Manager *mgr, const std::string &lexicon, const std::string &tokens,
+       const std::string &dict_dir, bool debug)
+      : debug_(debug) {
+    std::string dict = dict_dir + "/jieba.dict.utf8";
+    std::string hmm = dict_dir + "/hmm_model.utf8";
+    std::string user_dict = dict_dir + "/user.dict.utf8";
+    std::string idf = dict_dir + "/idf.utf8";
+    std::string stop_word = dict_dir + "/stop_words.utf8";
+
+    AssertFileExists(dict);
+    AssertFileExists(hmm);
+    AssertFileExists(user_dict);
+    AssertFileExists(idf);
+    AssertFileExists(stop_word);
+
+    jieba_ =
+        std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
+
+    {
+      auto buf = ReadFile(mgr, tokens);
+      std::istrstream is(buf.data(), buf.size());
+
+      InitTokens(is);
+    }
+
+    {
+      auto buf = ReadFile(mgr, lexicon);
+      std::istrstream is(buf.data(), buf.size());
+      InitLexicon(is);
+    }
+  }
+
   std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &text) const {
     // see
     // https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/text/mandarin.py#L244
@@ -279,9 +323,29 @@ JiebaLexicon::JiebaLexicon(const std::string &lexicon,
                            const std::string &dict_dir, bool debug)
     : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, debug)) {}

+template <typename Manager>
+JiebaLexicon::JiebaLexicon(Manager *mgr, const std::string &lexicon,
+                           const std::string &tokens,
+                           const std::string &dict_dir, bool debug)
+    : impl_(std::make_unique<Impl>(mgr, lexicon, tokens, dict_dir, debug)) {}
+
 std::vector<TokenIDs> JiebaLexicon::ConvertTextToTokenIds(
     const std::string &text, const std::string & /*unused_voice = ""*/) const {
   return impl_->ConvertTextToTokenIds(text);
 }

+#if __ANDROID_API__ >= 9
+template JiebaLexicon::JiebaLexicon(AAssetManager *mgr,
+                                    const std::string &lexicon,
+                                    const std::string &tokens,
+                                    const std::string &dict_dir, bool debug);
+#endif
+
+#if __OHOS__
+template JiebaLexicon::JiebaLexicon(NativeResourceManager *mgr,
+                                    const std::string &lexicon,
+                                    const std::string &tokens,
+                                    const std::string &dict_dir, bool debug);
+#endif
+
 }  // namespace sherpa_onnx
@@ -17,9 +17,15 @@ namespace sherpa_onnx {
 class JiebaLexicon : public OfflineTtsFrontend {
  public:
   ~JiebaLexicon() override;
+
   JiebaLexicon(const std::string &lexicon, const std::string &tokens,
                const std::string &dict_dir, bool debug);

+  template <typename Manager>
+  JiebaLexicon(Manager *mgr, const std::string &lexicon,
+               const std::string &tokens, const std::string &dict_dir,
+               bool debug);
+
   std::vector<TokenIDs> ConvertTextToTokenIds(
       const std::string &text,
       const std::string &unused_voice = "") const override;
@@ -327,13 +327,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
   // from assets to disk
   //
   // for jieba
-  // we require that you copy tokens.txt, lexicon.txt and dict
-  // from assets to disk
+  // we require that you copy dict from assets to disk
   const auto &meta_data = model_->GetMetaData();

   if (meta_data.jieba && !meta_data.has_espeak) {
     frontend_ = std::make_unique<JiebaLexicon>(
-        config_.model.matcha.lexicon, config_.model.matcha.tokens,
+        mgr, config_.model.matcha.lexicon, config_.model.matcha.tokens,
         config_.model.matcha.dict_dir, config_.model.debug);
   } else if (meta_data.has_espeak && !meta_data.jieba) {
     frontend_ = std::make_unique<PiperPhonemizeLexicon>(
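With this change, JiebaLexicon can read tokens.txt and lexicon.txt through the platform resource manager, so only the jieba dict directory still has to be copied from rawfile to disk. Below is a hypothetical ArkTS helper (not part of this diff) sketching one way to do that copy; the helper name, destination layout, and file list are assumptions, the file list being inferred from the names JiebaLexicon::Impl checks above.

// Hypothetical helper: copy rawfile/dict/* into the app sandbox so the
// Matcha jieba front end can open the dict directory from disk.
import fs from '@ohos.file.fs';
import common from '@ohos.app.ability.common';

function copyJiebaDictToSandbox(context: common.Context): string {
  const files: string[] = [
    'jieba.dict.utf8', 'hmm_model.utf8', 'user.dict.utf8',
    'idf.utf8', 'stop_words.utf8',
  ];

  const dstDir = `${context.filesDir}/dict`;
  if (!fs.accessSync(dstDir)) {
    fs.mkdirSync(dstDir);
  }

  for (const name of files) {
    // Read rawfile/dict/<name> into memory, then write it to the sandbox.
    const data: Uint8Array = context.resourceManager.getRawFileContentSync(`dict/${name}`);
    const file = fs.openSync(`${dstDir}/${name}`, fs.OpenMode.READ_WRITE | fs.OpenMode.CREATE);
    fs.writeSync(file.fd, data.buffer);
    fs.closeSync(file);
  }

  return dstDir;  // use this path as dictDir in the Matcha config
}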