// sherpa-onnx/csrc/spoken-language-identification.cc
//
// Copyright (c) 2024 Xiaomi Corporation

#include "sherpa-onnx/csrc/spoken-language-identification.h"

#include <sstream>
#include <string>

#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/spoken-language-identification-impl.h"

namespace sherpa_onnx {
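
// Registers the whisper-specific command-line options with the given
// ParseOptions; parsed values are written back into this config's fields.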
void SpokenLanguageIdentificationWhisperConfig::Register(ParseOptions *po) {
  po->Register(
      "whisper-encoder", &encoder,
      "Path to the encoder of a whisper multilingual model. Supports only "
      "tiny, base, small, medium, large.");

  po->Register(
      "whisper-decoder", &decoder,
      "Path to the decoder of a whisper multilingual model. Supports only "
      "tiny, base, small, medium, large.");

  po->Register(
      "whisper-tail-paddings", &tail_paddings,
      "Suggested value: 300 for multilingual models. "
      "Since we have removed the 30-second constraint, we need to add some "
      "tail padding frames so that whisper can detect the eot token. "
      "Leave it at -1 to use 1000.");
}
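
// For illustration only: with these options registered, a command line for a
// program that uses this config might look like the following (the binary
// name and model paths are hypothetical):
//
//   ./slid-example \
//     --whisper-encoder=/path/to/tiny-encoder.onnx \
//     --whisper-decoder=/path/to/tiny-decoder.onnx \
//     --whisper-tail-paddings=300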

bool SpokenLanguageIdentificationWhisperConfig::Validate() const {
  if (encoder.empty()) {
    SHERPA_ONNX_LOGE("Please provide --whisper-encoder");
    return false;
  }

  if (!FileExists(encoder)) {
    SHERPA_ONNX_LOGE("whisper encoder file %s does not exist",
                     encoder.c_str());
    return false;
  }

  if (decoder.empty()) {
    SHERPA_ONNX_LOGE("Please provide --whisper-decoder");
    return false;
  }

  if (!FileExists(decoder)) {
    SHERPA_ONNX_LOGE("whisper decoder file %s does not exist",
                     decoder.c_str());
    return false;
  }

  return true;
}
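
// Returns a one-line, human-readable dump of this config for debug logging.
// (Note: Validate() above only checks that the model files exist on disk; it
// does not verify that they are actually valid whisper models.)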
std::string SpokenLanguageIdentificationWhisperConfig::ToString() const {
  std::ostringstream os;

  os << "SpokenLanguageIdentificationWhisperConfig(";
  os << "encoder=\"" << encoder << "\", ";
  os << "decoder=\"" << decoder << "\", ";
  os << "tail_paddings=" << tail_paddings << ")";

  return os.str();
}
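
// Registers the nested whisper options along with the generic runtime
// options: thread count, debug logging, and the onnxruntime execution
// provider.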
void SpokenLanguageIdentificationConfig::Register(ParseOptions *po) {
  whisper.Register(po);

  po->Register("num-threads", &num_threads,
               "Number of threads to run the neural network");

  po->Register("debug", &debug,
               "true to print model information while loading it.");

  po->Register("provider", &provider,
               "Specify a provider to use: cpu, cuda, coreml");
}

bool SpokenLanguageIdentificationConfig::Validate() const {
  if (!whisper.Validate()) {
    return false;
  }

  return true;
}

std::string SpokenLanguageIdentificationConfig::ToString() const {
  std::ostringstream os;

  os << "SpokenLanguageIdentificationConfig(";
  os << "whisper=" << whisper.ToString() << ", ";
  os << "num_threads=" << num_threads << ", ";
  os << "debug=" << (debug ? "True" : "False") << ", ";
  os << "provider=\"" << provider << "\")";

  return os.str();
}
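
// The public class is a thin pimpl wrapper: SpokenLanguageIdentificationImpl
// selects the concrete implementation based on the config, and each public
// method below simply forwards to it.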
SpokenLanguageIdentification::SpokenLanguageIdentification(
    const SpokenLanguageIdentificationConfig &config)
    : impl_(SpokenLanguageIdentificationImpl::Create(config)) {}

SpokenLanguageIdentification::~SpokenLanguageIdentification() = default;

std::unique_ptr<OfflineStream> SpokenLanguageIdentification::CreateStream()
    const {
  return impl_->CreateStream();
}

std::string SpokenLanguageIdentification::Compute(OfflineStream *s) const {
  return impl_->Compute(s);
}

}  // namespace sherpa_onnx
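
// A minimal usage sketch of the API defined above (not part of this file).
// It assumes OfflineStream::AcceptWaveform(sample_rate, samples, n) from
// sherpa-onnx; the model paths and 16 kHz mono samples are illustrative:
//
//   sherpa_onnx::SpokenLanguageIdentificationConfig config;
//   config.whisper.encoder = "./tiny-encoder.onnx";  // hypothetical path
//   config.whisper.decoder = "./tiny-decoder.onnx";  // hypothetical path
//   if (!config.Validate()) { /* report the error and exit */ }
//
//   sherpa_onnx::SpokenLanguageIdentification slid(config);
//   auto stream = slid.CreateStream();
//   stream->AcceptWaveform(16000, samples.data(), samples.size());
//   std::string lang = slid.Compute(stream.get());  // e.g., "en"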