Fangjun Kuang
Committed by GitHub

Update kaldi-native-fbank. (#2259)

kaldi-native-fbank now supports FFT sizes that are any even number, not only powers of 2.
1 function(download_kaldi_native_fbank) 1 function(download_kaldi_native_fbank)
2 include(FetchContent) 2 include(FetchContent)
3 3
4 - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.1.tar.gz")  
5 - set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.1.tar.gz")  
6 - set(kaldi_native_fbank_HASH "SHA256=37c1aa230b00fe062791d800d8fc50aa3de215918d3dce6440699e67275d859e") 4 + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.2.tar.gz")
  5 + set(kaldi_native_fbank_URL2 "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.2.tar.gz")
  6 + set(kaldi_native_fbank_HASH "SHA256=f4bd7d53fe8aeaecc4eda9680c72696bb86bf74e86371d81aacacd6f4ca3914d")
7 7
8 set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) 8 set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
9 set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) 9 set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
@@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
12 # If you don't have access to the Internet, 12 # If you don't have access to the Internet,
13 # please pre-download kaldi-native-fbank 13 # please pre-download kaldi-native-fbank
14 set(possible_file_locations 14 set(possible_file_locations
15 - $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.1.tar.gz  
16 - ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.1.tar.gz  
17 - ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.1.tar.gz  
18 - /tmp/kaldi-native-fbank-1.21.1.tar.gz  
19 - /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.1.tar.gz 15 + $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.2.tar.gz
  16 + ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.2.tar.gz
  17 + ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.2.tar.gz
  18 + /tmp/kaldi-native-fbank-1.21.2.tar.gz
  19 + /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.2.tar.gz
20 ) 20 )
21 21
22 foreach(f IN LISTS possible_file_locations) 22 foreach(f IN LISTS possible_file_locations)
@@ -22,4 +22,4 @@ Cflags: -I"${includedir}" @@ -22,4 +22,4 @@ Cflags: -I"${includedir}"
22 # Note: -lcargs is required only for the following file 22 # Note: -lcargs is required only for the following file
23 # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c 23 # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c
24 # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c 24 # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c
25 -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ 25 +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lkissfft-float -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
@@ -22,4 +22,4 @@ Cflags: -I"${includedir}" @@ -22,4 +22,4 @@ Cflags: -I"${includedir}"
22 # Note: -lcargs is required only for the following file 22 # Note: -lcargs is required only for the following file
23 # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c 23 # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c
24 # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c 24 # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c
25 -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ 25 +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lkaldi-native-fbank-core -lkissfft-float -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -lssentencepiece_core -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_WITH_CARGS@ @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@
@@ -18,9 +18,7 @@ def create_fbank(): @@ -18,9 +18,7 @@ def create_fbank():
18 opts.frame_opts.preemph_coeff = 0 18 opts.frame_opts.preemph_coeff = 0
19 opts.frame_opts.window_type = "hann" 19 opts.frame_opts.window_type = "hann"
20 20
21 - # Even though GigaAM uses 400 for fft, here we use 512  
22 - # since kaldi-native-fbank only supports fft for power of 2.  
23 - opts.frame_opts.round_to_power_of_two = True 21 + opts.frame_opts.round_to_power_of_two = False
24 22
25 opts.mel_opts.low_freq = 0 23 opts.mel_opts.low_freq = 0
26 opts.mel_opts.high_freq = 8000 24 opts.mel_opts.high_freq = 8000
@@ -19,9 +19,7 @@ def create_fbank(): @@ -19,9 +19,7 @@ def create_fbank():
19 opts.frame_opts.preemph_coeff = 0 19 opts.frame_opts.preemph_coeff = 0
20 opts.frame_opts.window_type = "hann" 20 opts.frame_opts.window_type = "hann"
21 21
22 - # Even though GigaAM uses 400 for fft, here we use 512  
23 - # since kaldi-native-fbank only supports fft for power of 2.  
24 - opts.frame_opts.round_to_power_of_two = True 22 + opts.frame_opts.round_to_power_of_two = False
25 23
26 opts.mel_opts.low_freq = 0 24 opts.mel_opts.low_freq = 0
27 opts.mel_opts.high_freq = 8000 25 opts.mel_opts.high_freq = 8000
@@ -197,6 +197,7 @@ class FeatureExtractor::Impl { @@ -197,6 +197,7 @@ class FeatureExtractor::Impl {
197 opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; 197 opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset;
198 opts_.frame_opts.preemph_coeff = config_.preemph_coeff; 198 opts_.frame_opts.preemph_coeff = config_.preemph_coeff;
199 opts_.frame_opts.window_type = config_.window_type; 199 opts_.frame_opts.window_type = config_.window_type;
  200 + opts_.frame_opts.round_to_power_of_two = config_.round_to_power_of_two;
200 201
201 opts_.mel_opts.num_bins = config_.feature_dim; 202 opts_.mel_opts.num_bins = config_.feature_dim;
202 203
@@ -216,6 +217,7 @@ class FeatureExtractor::Impl { @@ -216,6 +217,7 @@ class FeatureExtractor::Impl {
216 mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; 217 mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset;
217 mfcc_opts_.frame_opts.preemph_coeff = config_.preemph_coeff; 218 mfcc_opts_.frame_opts.preemph_coeff = config_.preemph_coeff;
218 mfcc_opts_.frame_opts.window_type = config_.window_type; 219 mfcc_opts_.frame_opts.window_type = config_.window_type;
  220 + mfcc_opts_.frame_opts.round_to_power_of_two = config_.round_to_power_of_two;
219 221
220 mfcc_opts_.mel_opts.num_bins = config_.feature_dim; 222 mfcc_opts_.mel_opts.num_bins = config_.feature_dim;
221 223
@@ -79,6 +79,8 @@ struct FeatureExtractorConfig { @@ -79,6 +79,8 @@ struct FeatureExtractorConfig {
79 79
80 bool is_mfcc = false; 80 bool is_mfcc = false;
81 81
  82 + bool round_to_power_of_two = true;
  83 +
82 std::string ToString() const; 84 std::string ToString() const;
83 85
84 void Register(ParseOptions *po); 86 void Register(ParseOptions *po);
@@ -109,6 +109,12 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { @@ -109,6 +109,12 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
109 config_.feat_config.preemph_coeff = 0; 109 config_.feat_config.preemph_coeff = 0;
110 config_.feat_config.window_type = "hann"; 110 config_.feat_config.window_type = "hann";
111 config_.feat_config.feature_dim = 64; 111 config_.feat_config.feature_dim = 64;
  112 +
  113 + // see
  114 + // https://github.com/salute-developers/GigaAM/blob/main/gigaam/preprocess.py#L68
  115 + //
  116 + // GigaAM uses n_fft 400
  117 + config_.feat_config.round_to_power_of_two = false;
112 } else { 118 } else {
113 config_.feat_config.low_freq = 0; 119 config_.feat_config.low_freq = 0;
114 config_.feat_config.high_freq = 0; 120 config_.feat_config.high_freq = 0;
@@ -156,6 +156,12 @@ class OfflineRecognizerTransducerNeMoImpl : public OfflineRecognizerImpl { @@ -156,6 +156,12 @@ class OfflineRecognizerTransducerNeMoImpl : public OfflineRecognizerImpl {
156 config_.feat_config.preemph_coeff = 0; 156 config_.feat_config.preemph_coeff = 0;
157 config_.feat_config.window_type = "hann"; 157 config_.feat_config.window_type = "hann";
158 config_.feat_config.feature_dim = 64; 158 config_.feat_config.feature_dim = 64;
  159 +
  160 + // see
  161 + // https://github.com/salute-developers/GigaAM/blob/main/gigaam/preprocess.py#L68
  162 + //
  163 + // GigaAM uses n_fft 400
  164 + config_.feat_config.round_to_power_of_two = false;
159 } else { 165 } else {
160 config_.feat_config.low_freq = 0; 166 config_.feat_config.low_freq = 0;
161 // config_.feat_config.high_freq = 8000; 167 // config_.feat_config.high_freq = 8000;