Fangjun Kuang
Committed by GitHub

Update kaldi-native-fbank. (#2259)

Now it supports FFT sizes that are any even number, not only powers of 2.
function(download_kaldi_native_fbank)
include(FetchContent)
set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.1.tar.gz")
set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.1.tar.gz")
set(kaldi_native_fbank_HASH "SHA256=37c1aa230b00fe062791d800d8fc50aa3de215918d3dce6440699e67275d859e")
set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.2.tar.gz")
set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.2.tar.gz")
set(kaldi_native_fbank_HASH "SHA256=f4bd7d53fe8aeaecc4eda9680c72696bb86bf74e86371d81aacacd6f4ca3914d")
set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
... ... @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
# If you don't have access to the Internet,
# please pre-download kaldi-native-fbank
set(possible_file_locations
$ENV{HOME}/Downloads/kaldi-native-fbank-1.21.1.tar.gz
${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.1.tar.gz
${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.1.tar.gz
/tmp/kaldi-native-fbank-1.21.1.tar.gz
/star-fj/fangjun/download/github/kaldi-native-fbank-1.21.1.tar.gz
$ENV{HOME}/Downloads/kaldi-native-fbank-1.21.2.tar.gz
${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.2.tar.gz
${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.2.tar.gz
/tmp/kaldi-native-fbank-1.21.2.tar.gz
/star-fj/fangjun/download/github/kaldi-native-fbank-1.21.2.tar.gz
)
foreach(f IN LISTS possible_file_locations)
... ...
... ... @@ -22,4 +22,4 @@ Cflags: -I"${includedir}"
# Note: -lcargs is required only for the following file
# https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c
# We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c
Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lkissfft-float -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
... ...
... ... @@ -22,4 +22,4 @@ Cflags: -I"${includedir}"
# Note: -lcargs is required only for the following file
# https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c
# We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c
Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lkissfft-float -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
... ...
... ... @@ -18,9 +18,7 @@ def create_fbank():
opts.frame_opts.preemph_coeff = 0
opts.frame_opts.window_type = "hann"
# Even though GigaAM uses 400 for fft, here we use 512
# since kaldi-native-fbank only supports fft for power of 2.
opts.frame_opts.round_to_power_of_two = True
opts.frame_opts.round_to_power_of_two = False
opts.mel_opts.low_freq = 0
opts.mel_opts.high_freq = 8000
... ...
... ... @@ -19,9 +19,7 @@ def create_fbank():
opts.frame_opts.preemph_coeff = 0
opts.frame_opts.window_type = "hann"
# Even though GigaAM uses 400 for fft, here we use 512
# since kaldi-native-fbank only supports fft for power of 2.
opts.frame_opts.round_to_power_of_two = True
opts.frame_opts.round_to_power_of_two = False
opts.mel_opts.low_freq = 0
opts.mel_opts.high_freq = 8000
... ...
... ... @@ -197,6 +197,7 @@ class FeatureExtractor::Impl {
opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset;
opts_.frame_opts.preemph_coeff = config_.preemph_coeff;
opts_.frame_opts.window_type = config_.window_type;
opts_.frame_opts.round_to_power_of_two = config_.round_to_power_of_two;
opts_.mel_opts.num_bins = config_.feature_dim;
... ... @@ -216,6 +217,7 @@ class FeatureExtractor::Impl {
mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset;
mfcc_opts_.frame_opts.preemph_coeff = config_.preemph_coeff;
mfcc_opts_.frame_opts.window_type = config_.window_type;
mfcc_opts_.frame_opts.round_to_power_of_two = config_.round_to_power_of_two;
mfcc_opts_.mel_opts.num_bins = config_.feature_dim;
... ...
... ... @@ -79,6 +79,8 @@ struct FeatureExtractorConfig {
bool is_mfcc = false;
bool round_to_power_of_two = true;
std::string ToString() const;
void Register(ParseOptions *po);
... ...
... ... @@ -109,6 +109,12 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
config_.feat_config.preemph_coeff = 0;
config_.feat_config.window_type = "hann";
config_.feat_config.feature_dim = 64;
// see
// https://github.com/salute-developers/GigaAM/blob/main/gigaam/preprocess.py#L68
//
// GigaAM uses n_fft = 400, which is not a power of 2, so the FFT size
// must not be rounded up to the next power of 2.
config_.feat_config.round_to_power_of_two = false;
} else {
config_.feat_config.low_freq = 0;
config_.feat_config.high_freq = 0;
... ...
... ... @@ -156,6 +156,12 @@ class OfflineRecognizerTransducerNeMoImpl : public OfflineRecognizerImpl {
config_.feat_config.preemph_coeff = 0;
config_.feat_config.window_type = "hann";
config_.feat_config.feature_dim = 64;
// see
// https://github.com/salute-developers/GigaAM/blob/main/gigaam/preprocess.py#L68
//
// GigaAM uses n_fft = 400, which is not a power of 2, so the FFT size
// must not be rounded up to the next power of 2.
config_.feat_config.round_to_power_of_two = false;
} else {
config_.feat_config.low_freq = 0;
// config_.feat_config.high_freq = 8000;
... ...