Fangjun Kuang
Committed by GitHub

Add HarmonyOS examples for MatchaTTS. (#1678)

1 /** 1 /**
2 * Use these variables when you tailor your ArkTS code. They must be of the const type. 2 * Use these variables when you tailor your ArkTS code. They must be of the const type.
3 */ 3 */
4 -export const HAR_VERSION = '1.10.35'; 4 +export const HAR_VERSION = '1.10.37';
5 export const BUILD_MODE_NAME = 'debug'; 5 export const BUILD_MODE_NAME = 'debug';
6 export const DEBUG = true; 6 export const DEBUG = true;
7 export const TARGET_NAME = 'default'; 7 export const TARGET_NAME = 'default';
1 export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so"; 1 export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so";
2 2
3 -export { CircularBuffer,  
4 - SileroVadConfig,  
5 - SpeechSegment,  
6 - Vad,  
7 - VadConfig,  
8 -} from './src/main/ets/components/Vad'; 3 +export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad';
9 4
10 5
11 export { Samples, 6 export { Samples,
@@ -36,7 +31,8 @@ export { OnlineStream, @@ -36,7 +31,8 @@ export { OnlineStream,
36 OnlineRecognizer, 31 OnlineRecognizer,
37 } from './src/main/ets/components/StreamingAsr'; 32 } from './src/main/ets/components/StreamingAsr';
38 33
39 -export { OfflineTtsVitsModelConfig, 34 +export { OfflineTtsMatchaModelConfig,
  35 + OfflineTtsVitsModelConfig,
40 OfflineTtsModelConfig, 36 OfflineTtsModelConfig,
41 OfflineTtsConfig, 37 OfflineTtsConfig,
42 OfflineTts, 38 OfflineTts,
@@ -17,8 +17,20 @@ export class OfflineTtsVitsModelConfig { @@ -17,8 +17,20 @@ export class OfflineTtsVitsModelConfig {
17 public lengthScale: number = 1.0; 17 public lengthScale: number = 1.0;
18 } 18 }
19 19
  20 +export class OfflineTtsMatchaModelConfig {
  21 + public acousticModel: string = '';
  22 + public vocoder: string = '';
  23 + public lexicon: string = '';
  24 + public tokens: string = '';
  25 + public dataDir: string = '';
  26 + public dictDir: String = '';
  27 + public noiseScale: number = 0.667;
  28 + public lengthScale: number = 1.0;
  29 +}
  30 +
20 export class OfflineTtsModelConfig { 31 export class OfflineTtsModelConfig {
21 public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig(); 32 public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig();
  33 + public matcha: OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig();
22 public numThreads: number = 1; 34 public numThreads: number = 1;
23 public debug: boolean = false; 35 public debug: boolean = false;
24 public provider: string = 'cpu'; 36 public provider: string = 'cpu';
@@ -73,7 +73,16 @@ function initTts(context: Context): OfflineTts { @@ -73,7 +73,16 @@ function initTts(context: Context): OfflineTts {
73 // for details 73 // for details
74 74
75 let modelDir = ''; 75 let modelDir = '';
  76 +
  77 + // for VITS begin
76 let modelName = ''; 78 let modelName = '';
  79 + // for VITS end
  80 +
  81 + // for Matcha begin
  82 + let acousticModelName = '';
  83 + let vocoder = '';
  84 + // for Matcha end
  85 +
77 let ruleFsts = ''; 86 let ruleFsts = '';
78 let ruleFars = ''; 87 let ruleFars = '';
79 let lexicon = ''; 88 let lexicon = '';
@@ -134,15 +143,47 @@ function initTts(context: Context): OfflineTts { @@ -134,15 +143,47 @@ function initTts(context: Context): OfflineTts {
134 // dictDir = 'dict'; 143 // dictDir = 'dict';
135 // ruleFsts = `date.fst,phone.fst,number.fst`; 144 // ruleFsts = `date.fst,phone.fst,number.fst`;
136 145
  146 + // Example 8
  147 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  148 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
  149 + // modelDir = 'matcha-icefall-zh-baker'
  150 + // acousticModelName = 'model-steps-3.onnx'
  151 + // vocoder = 'hifigan_v2.onnx'
  152 + // lexicon = 'lexicon.txt'
  153 + // dictDir = 'dict';
  154 + // ruleFsts = `date.fst,phone.fst,number.fst`;
  155 +
  156 + // Example 9
  157 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
  158 + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
  159 + // modelDir = 'matcha-icefall-en_US-ljspeech'
  160 + // acousticModelName = 'model-steps-3.onnx'
  161 + // vocoder = 'hifigan_v2.onnx'
  162 + // dataDir = 'espeak-ng-data';
  163 +
137 // ============================================================ 164 // ============================================================
138 // Please don't change the remaining part of this function 165 // Please don't change the remaining part of this function
139 // ============================================================ 166 // ============================================================
140 167
141 - if (modelName == '') { 168 + if (modelName == '' && acousticModelName == '' && vocoder == '') {
142 throw new Error('You are supposed to select a model by changing the code before you run the app'); 169 throw new Error('You are supposed to select a model by changing the code before you run the app');
143 } 170 }
144 171
  172 + if (modelName != '' && acousticModelName != '') {
  173 + throw new Error('Please select either VITS or Matcha, not both');
  174 + }
  175 +
  176 + if (acousticModelName != '' && vocoder == '') {
  177 + throw new Error('Please provider vocoder for matcha tts models');
  178 + }
  179 +
  180 + if (modelName != '') {
145 modelName = modelDir + '/' + modelName; 181 modelName = modelDir + '/' + modelName;
  182 + }
  183 +
  184 + if (acousticModelName != '') {
  185 + acousticModelName = modelDir + '/' + acousticModelName;
  186 + }
146 187
147 if (ruleFsts != '') { 188 if (ruleFsts != '') {
148 let fsts = ruleFsts.split(',') 189 let fsts = ruleFsts.split(',')
@@ -186,6 +227,14 @@ function initTts(context: Context): OfflineTts { @@ -186,6 +227,14 @@ function initTts(context: Context): OfflineTts {
186 config.model.vits.tokens = tokens; 227 config.model.vits.tokens = tokens;
187 config.model.vits.dataDir = dataDir; 228 config.model.vits.dataDir = dataDir;
188 config.model.vits.dictDir = dictDir; 229 config.model.vits.dictDir = dictDir;
  230 +
  231 + config.model.matcha.acousticModel = acousticModelName;
  232 + config.model.matcha.vocoder = vocoder;
  233 + config.model.matcha.lexicon = lexicon;
  234 + config.model.matcha.tokens = tokens;
  235 + config.model.matcha.dataDir = dataDir;
  236 + config.model.matcha.dictDir = dictDir;
  237 +
189 config.model.numThreads = 2; 238 config.model.numThreads = 2;
190 config.model.debug = true; 239 config.model.debug = true;
191 config.ruleFsts = ruleFsts; 240 config.ruleFsts = ruleFsts;
@@ -2098,7 +2098,7 @@ SherpaOnnxCreateSpeakerEmbeddingExtractorOHOS( @@ -2098,7 +2098,7 @@ SherpaOnnxCreateSpeakerEmbeddingExtractorOHOS(
2098 } 2098 }
2099 2099
2100 #if SHERPA_ONNX_ENABLE_TTS == 1 2100 #if SHERPA_ONNX_ENABLE_TTS == 1
2101 -SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( 2101 +const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS(
2102 const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr) { 2102 const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr) {
2103 if (!mgr) { 2103 if (!mgr) {
2104 return SherpaOnnxCreateOfflineTts(config); 2104 return SherpaOnnxCreateOfflineTts(config);
@@ -1618,7 +1618,7 @@ SherpaOnnxCreateVoiceActivityDetectorOHOS( @@ -1618,7 +1618,7 @@ SherpaOnnxCreateVoiceActivityDetectorOHOS(
1618 const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds, 1618 const SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds,
1619 NativeResourceManager *mgr); 1619 NativeResourceManager *mgr);
1620 1620
1621 -SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( 1621 +SHERPA_ONNX_API const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS(
1622 const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr); 1622 const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr);
1623 1623
1624 SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingExtractor * 1624 SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingExtractor *
@@ -6,12 +6,23 @@ @@ -6,12 +6,23 @@
6 6
7 #include <fstream> 7 #include <fstream>
8 #include <regex> // NOLINT 8 #include <regex> // NOLINT
  9 +#include <strstream>
9 #include <unordered_set> 10 #include <unordered_set>
10 #include <utility> 11 #include <utility>
11 12
  13 +#if __ANDROID_API__ >= 9
  14 +#include "android/asset_manager.h"
  15 +#include "android/asset_manager_jni.h"
  16 +#endif
  17 +
  18 +#if __OHOS__
  19 +#include "rawfile/raw_file_manager.h"
  20 +#endif
  21 +
12 #include "cppjieba/Jieba.hpp" 22 #include "cppjieba/Jieba.hpp"
13 #include "sherpa-onnx/csrc/file-utils.h" 23 #include "sherpa-onnx/csrc/file-utils.h"
14 #include "sherpa-onnx/csrc/macros.h" 24 #include "sherpa-onnx/csrc/macros.h"
  25 +#include "sherpa-onnx/csrc/onnx-utils.h"
15 #include "sherpa-onnx/csrc/symbol-table.h" 26 #include "sherpa-onnx/csrc/symbol-table.h"
16 #include "sherpa-onnx/csrc/text-utils.h" 27 #include "sherpa-onnx/csrc/text-utils.h"
17 28
@@ -56,6 +67,39 @@ class JiebaLexicon::Impl { @@ -56,6 +67,39 @@ class JiebaLexicon::Impl {
56 } 67 }
57 } 68 }
58 69
  70 + template <typename Manager>  // Manager: resource-manager type handed to ReadFile() for tokens and lexicon
  71 + Impl(Manager *mgr, const std::string &lexicon, const std::string &tokens,
  72 + const std::string &dict_dir, bool debug)
  73 + : debug_(debug) {
  74 + std::string dict = dict_dir + "/jieba.dict.utf8";  // the five jieba resource files are located inside dict_dir
  75 + std::string hmm = dict_dir + "/hmm_model.utf8";
  76 + std::string user_dict = dict_dir + "/user.dict.utf8";
  77 + std::string idf = dict_dir + "/idf.utf8";
  78 + std::string stop_word = dict_dir + "/stop_words.utf8";
  79 +
  80 + AssertFileExists(dict);  // NOTE(review): checked by plain path, not via mgr -- presumably dict_dir must already be on disk; confirm
  81 + AssertFileExists(hmm);
  82 + AssertFileExists(user_dict);
  83 + AssertFileExists(idf);
  84 + AssertFileExists(stop_word);
  85 +
  86 + jieba_ =
  87 + std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
  88 +
  89 + {
  90 + auto buf = ReadFile(mgr, tokens);  // tokens content is loaded through mgr instead of being opened by path
  91 + std::istrstream is(buf.data(), buf.size());  // input stream over the in-memory buffer; buf stays alive for this scope
  92 +
  93 + InitTokens(is);
  94 + }
  95 +
  96 + {
  97 + auto buf = ReadFile(mgr, lexicon);  // lexicon is loaded the same way as tokens
  98 + std::istrstream is(buf.data(), buf.size());
  99 + InitLexicon(is);
  100 + }
  101 + }
  102 +
59 std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &text) const { 103 std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &text) const {
60 // see 104 // see
61 // https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/text/mandarin.py#L244 105 // https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/text/mandarin.py#L244
@@ -279,9 +323,29 @@ JiebaLexicon::JiebaLexicon(const std::string &lexicon, @@ -279,9 +323,29 @@ JiebaLexicon::JiebaLexicon(const std::string &lexicon,
279 const std::string &dict_dir, bool debug) 323 const std::string &dict_dir, bool debug)
280 : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, debug)) {} 324 : impl_(std::make_unique<Impl>(lexicon, tokens, dict_dir, debug)) {}
281 325
  326 +template <typename Manager>  // manager-based constructor: forwards all arguments unchanged to Impl
  327 +JiebaLexicon::JiebaLexicon(Manager *mgr, const std::string &lexicon,
  328 + const std::string &tokens,
  329 + const std::string &dict_dir, bool debug)
  330 + : impl_(std::make_unique<Impl>(mgr, lexicon, tokens, dict_dir, debug)) {}
  331 +
282 std::vector<TokenIDs> JiebaLexicon::ConvertTextToTokenIds( 332 std::vector<TokenIDs> JiebaLexicon::ConvertTextToTokenIds(
283 const std::string &text, const std::string & /*unused_voice = ""*/) const { 333 const std::string &text, const std::string & /*unused_voice = ""*/) const {
284 return impl_->ConvertTextToTokenIds(text); 334 return impl_->ConvertTextToTokenIds(text);
285 } 335 }
286 336
  337 +#if __ANDROID_API__ >= 9
  338 +template JiebaLexicon::JiebaLexicon(AAssetManager *mgr,  // explicit instantiation of the templated constructor for AAssetManager
  339 + const std::string &lexicon,
  340 + const std::string &tokens,
  341 + const std::string &dict_dir, bool debug);
  342 +#endif
  343 +
  344 +#if __OHOS__
  345 +template JiebaLexicon::JiebaLexicon(NativeResourceManager *mgr,  // explicit instantiation of the templated constructor for NativeResourceManager
  346 + const std::string &lexicon,
  347 + const std::string &tokens,
  348 + const std::string &dict_dir, bool debug);
  349 +#endif
  350 +
287 } // namespace sherpa_onnx 351 } // namespace sherpa_onnx
@@ -17,9 +17,15 @@ namespace sherpa_onnx { @@ -17,9 +17,15 @@ namespace sherpa_onnx {
17 class JiebaLexicon : public OfflineTtsFrontend { 17 class JiebaLexicon : public OfflineTtsFrontend {
18 public: 18 public:
19 ~JiebaLexicon() override; 19 ~JiebaLexicon() override;
  20 +
20 JiebaLexicon(const std::string &lexicon, const std::string &tokens, 21 JiebaLexicon(const std::string &lexicon, const std::string &tokens,
21 const std::string &dict_dir, bool debug); 22 const std::string &dict_dir, bool debug);
22 23
  24 + template <typename Manager>  // overload taking a resource manager pointer in addition to the path-based arguments
  25 + JiebaLexicon(Manager *mgr, const std::string &lexicon,
  26 + const std::string &tokens, const std::string &dict_dir,
  27 + bool debug);
  28 +
23 std::vector<TokenIDs> ConvertTextToTokenIds( 29 std::vector<TokenIDs> ConvertTextToTokenIds(
24 const std::string &text, 30 const std::string &text,
25 const std::string &unused_voice = "") const override; 31 const std::string &unused_voice = "") const override;
@@ -327,13 +327,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { @@ -327,13 +327,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
327 // from assets to disk 327 // from assets to disk
328 // 328 //
329 // for jieba 329 // for jieba
330 - // we require that you copy tokens.txt, lexicon.txt and dict  
331 - // from assets to disk 330 + // we require that you copy dict from assets to disk
332 const auto &meta_data = model_->GetMetaData(); 331 const auto &meta_data = model_->GetMetaData();
333 332
334 if (meta_data.jieba && !meta_data.has_espeak) { 333 if (meta_data.jieba && !meta_data.has_espeak) {
335 frontend_ = std::make_unique<JiebaLexicon>( 334 frontend_ = std::make_unique<JiebaLexicon>(
336 - config_.model.matcha.lexicon, config_.model.matcha.tokens, 335 + mgr, config_.model.matcha.lexicon, config_.model.matcha.tokens,
337 config_.model.matcha.dict_dir, config_.model.debug); 336 config_.model.matcha.dict_dir, config_.model.debug);
338 } else if (meta_data.has_espeak && !meta_data.jieba) { 337 } else if (meta_data.has_espeak && !meta_data.jieba) {
339 frontend_ = std::make_unique<PiperPhonemizeLexicon>( 338 frontend_ = std::make_unique<PiperPhonemizeLexicon>(