intrin_avx512.hpp 156.7 KB

原文件审查历史永久链接

// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
#define OPENCV_HAL_INTRIN_AVX512_HPP

#if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
# pragma warning(disable:4146)  // unary minus operator applied to unsigned type, result still unsigned
# pragma warning(disable:4309)  // 'argument': truncation of constant value
# pragma warning(disable:4310)  // cast truncates constant value
#endif

#define CVT_ROUND_MODES_IMPLEMENTED 0

#define CV_SIMD512 1
#define CV_SIMD512_64F 1
#define CV_SIMD512_FP16 0  // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)

#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
#define _v512_set_epu32(a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
        _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
                         ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                        a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
        _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
                        ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
                        ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
                        ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                       a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
        _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
                        ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
                        ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
                        ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
                        ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
                        ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
                        ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
                        ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
                       a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
        _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
                       (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
                       (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
                       (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
                       (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
                       (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
                       (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
                       (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))

#ifndef _mm512_cvtpd_pslo
#ifdef _mm512_zextsi256_si512
#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
#else
//if preferred way to extend with zeros is unavailable
#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
#endif
#endif
///////// Utils ////////////

namespace
{

inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi)
{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); }

inline __m512 _v512_combine(const __m256& lo, const __m256& hi)
{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); }

inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi)
{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); }

inline int _v_cvtsi512_si32(const __m512i& a)
{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); }

inline __m256i _v512_extract_high(const __m512i& v)
{ return _mm512_extracti32x8_epi32(v, 1); }

inline __m256  _v512_extract_high(const __m512& v)
{ return _mm512_extractf32x8_ps(v, 1); }

inline __m256d _v512_extract_high(const __m512d& v)
{ return _mm512_extractf64x4_pd(v, 1); }

inline __m256i _v512_extract_low(const __m512i& v)
{ return _mm512_castsi512_si256(v); }

inline __m256  _v512_extract_low(const __m512& v)
{ return _mm512_castps512_ps256(v); }

inline __m256d _v512_extract_low(const __m512d& v)
{ return _mm512_castpd512_pd256(v); }

inline __m512i _v512_insert(const __m512i& a, const __m256i& b)
{ return _mm512_inserti32x8(a, b, 0); }

inline __m512 _v512_insert(const __m512& a, const __m256& b)
{ return _mm512_insertf32x8(a, b, 0); }

inline __m512d _v512_insert(const __m512d& a, const __m256d& b)
{ return _mm512_insertf64x4(a, b, 0); }

}

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

///////// Types ////////////

struct v_uint8x64
{
    typedef uchar lane_type;
    enum { nlanes = 64 };
    __m512i val;

    explicit v_uint8x64(__m512i v) : val(v) {}
    v_uint8x64(uchar v0,  uchar v1,  uchar v2,  uchar v3,
               uchar v4,  uchar v5,  uchar v6,  uchar v7,
               uchar v8,  uchar v9,  uchar v10, uchar v11,
               uchar v12, uchar v13, uchar v14, uchar v15,
               uchar v16, uchar v17, uchar v18, uchar v19,
               uchar v20, uchar v21, uchar v22, uchar v23,
               uchar v24, uchar v25, uchar v26, uchar v27,
               uchar v28, uchar v29, uchar v30, uchar v31,
               uchar v32, uchar v33, uchar v34, uchar v35,
               uchar v36, uchar v37, uchar v38, uchar v39,
               uchar v40, uchar v41, uchar v42, uchar v43,
               uchar v44, uchar v45, uchar v46, uchar v47,
               uchar v48, uchar v49, uchar v50, uchar v51,
               uchar v52, uchar v53, uchar v54, uchar v55,
               uchar v56, uchar v57, uchar v58, uchar v59,
               uchar v60, uchar v61, uchar v62, uchar v63)
    {
        val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0);
    }
    v_uint8x64() {}

    static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }

    uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }
};

struct v_int8x64
{
    typedef schar lane_type;
    enum { nlanes = 64 };
    __m512i val;

    explicit v_int8x64(__m512i v) : val(v) {}
    v_int8x64(schar v0,  schar v1,  schar v2,  schar v3,
              schar v4,  schar v5,  schar v6,  schar v7,
              schar v8,  schar v9,  schar v10, schar v11,
              schar v12, schar v13, schar v14, schar v15,
              schar v16, schar v17, schar v18, schar v19,
              schar v20, schar v21, schar v22, schar v23,
              schar v24, schar v25, schar v26, schar v27,
              schar v28, schar v29, schar v30, schar v31,
              schar v32, schar v33, schar v34, schar v35,
              schar v36, schar v37, schar v38, schar v39,
              schar v40, schar v41, schar v42, schar v43,
              schar v44, schar v45, schar v46, schar v47,
              schar v48, schar v49, schar v50, schar v51,
              schar v52, schar v53, schar v54, schar v55,
              schar v56, schar v57, schar v58, schar v59,
              schar v60, schar v61, schar v62, schar v63)
    {
        val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                             v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0);
    }
    v_int8x64() {}

    static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }

    schar get0() const { return (schar)_v_cvtsi512_si32(val); }
};

struct v_uint16x32
{
    typedef ushort lane_type;
    enum { nlanes = 32 };
    __m512i val;

    explicit v_uint16x32(__m512i v) : val(v) {}
    v_uint16x32(ushort v0,  ushort v1,  ushort v2,  ushort v3,
                ushort v4,  ushort v5,  ushort v6,  ushort v7,
                ushort v8,  ushort v9,  ushort v10, ushort v11,
                ushort v12, ushort v13, ushort v14, ushort v15,
                ushort v16, ushort v17, ushort v18, ushort v19,
                ushort v20, ushort v21, ushort v22, ushort v23,
                ushort v24, ushort v25, ushort v26, ushort v27,
                ushort v28, ushort v29, ushort v30, ushort v31)
    {
        val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
                              v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0);
    }
    v_uint16x32() {}

    static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }

    ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }
};

struct v_int16x32
{
    typedef short lane_type;
    enum { nlanes = 32 };
    __m512i val;

    explicit v_int16x32(__m512i v) : val(v) {}
    v_int16x32(short v0,  short v1,  short v2,  short v3,  short v4,  short v5,  short v6,  short v7,
               short v8,  short v9,  short v10, short v11, short v12, short v13, short v14, short v15,
               short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
               short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
    {
        val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
                              (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
                              (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
                              (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
    }
    v_int16x32() {}

    static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }

    short get0() const { return (short)_v_cvtsi512_si32(val); }
};

struct v_uint32x16
{
    typedef unsigned lane_type;
    enum { nlanes = 16 };
    __m512i val;

    explicit v_uint32x16(__m512i v) : val(v) {}
    v_uint32x16(unsigned v0,  unsigned v1,  unsigned v2,  unsigned v3,
                unsigned v4,  unsigned v5,  unsigned v6,  unsigned v7,
                unsigned v8,  unsigned v9,  unsigned v10, unsigned v11,
                unsigned v12, unsigned v13, unsigned v14, unsigned v15)
    {
        val = _mm512_setr_epi32((int)v0,  (int)v1,  (int)v2,  (int)v3, (int)v4,  (int)v5,  (int)v6,  (int)v7,
                                (int)v8,  (int)v9,  (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15);
    }
    v_uint32x16() {}

    static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }

    unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); }
};

struct v_int32x16
{
    typedef int lane_type;
    enum { nlanes = 16 };
    __m512i val;

    explicit v_int32x16(__m512i v) : val(v) {}
    v_int32x16(int v0, int v1, int v2,  int v3,  int v4,  int v5,  int v6,  int v7,
               int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
    {
        val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_int32x16() {}

    static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }

    int get0() const { return _v_cvtsi512_si32(val); }
};

struct v_float32x16
{
    typedef float lane_type;
    enum { nlanes = 16 };
    __m512 val;

    explicit v_float32x16(__m512 v) : val(v) {}
    v_float32x16(float v0, float v1, float v2,  float v3,  float v4,  float v5,  float v6,  float v7,
                 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
    {
        val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
    }
    v_float32x16() {}

    static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }

    float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
};

struct v_uint64x8
{
    typedef uint64 lane_type;
    enum { nlanes = 8 };
    __m512i val;

    explicit v_uint64x8(__m512i v) : val(v) {}
    v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)
    { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7); }
    v_uint64x8() {}

    static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }

    uint64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    #endif
    }
};

struct v_int64x8
{
    typedef int64 lane_type;
    enum { nlanes = 8 };
    __m512i val;

    explicit v_int64x8(__m512i v) : val(v) {}
    v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)
    { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); }
    v_int64x8() {}

    static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }

    int64 get0() const
    {
    #if defined __x86_64__ || defined _M_X64
        return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val));
    #else
        int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
        int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    #endif
    }
};

struct v_float64x8
{
    typedef double lane_type;
    enum { nlanes = 8 };
    __m512d val;

    explicit v_float64x8(__m512d v) : val(v) {}
    v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)
    { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); }
    v_float64x8() {}

    static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }

    double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }
};

//////////////// Load and store operations ///////////////

#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp)                    \
    inline _Tpvec v512_load(const _Tp* ptr)                           \
    { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); }       \
    inline _Tpvec v512_load_aligned(const _Tp* ptr)                   \
    { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); }        \
    inline _Tpvec v512_load_low(const _Tp* ptr)                       \
    {                                                                 \
        __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr);       \
        return _Tpvec(_mm512_castsi256_si512(v256));                  \
    }                                                                 \
    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1)  \
    {                                                                 \
        __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0);       \
        __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1);       \
        return _Tpvec(_v512_combine(vlo, vhi));                       \
    }                                                                 \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                    \
    { _mm512_storeu_si512((__m512i*)ptr, a.val); }                    \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)            \
    { _mm512_store_si512((__m512i*)ptr, a.val); }                     \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)    \
    { _mm512_stream_si512((__m512i*)ptr, a.val); }                    \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            _mm512_storeu_si512((__m512i*)ptr, a.val); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
            _mm512_stream_si512((__m512i*)ptr, a.val); \
        else \
            _mm512_store_si512((__m512i*)ptr, a.val); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); }    \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)               \
    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64,  uchar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64,   schar)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32,  short)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16,  unsigned)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16,   int)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8,  uint64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8,   int64)

#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg)   \
    inline _Tpvec v512_load(const _Tp* ptr)                               \
    { return _Tpvec(_mm512_loadu_##suffix(ptr)); }                        \
    inline _Tpvec v512_load_aligned(const _Tp* ptr)                       \
    { return _Tpvec(_mm512_load_##suffix(ptr)); }                         \
    inline _Tpvec v512_load_low(const _Tp* ptr)                           \
    {                                                                     \
        return _Tpvec(_mm512_cast##suffix##256_##suffix##512              \
                     (_mm256_loadu_##suffix(ptr)));                       \
    }                                                                     \
    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
    {                                                                     \
        halfreg vlo = _mm256_loadu_##suffix(ptr0);                        \
        halfreg vhi = _mm256_loadu_##suffix(ptr1);                        \
        return _Tpvec(_v512_combine(vlo, vhi));                           \
    }                                                                     \
    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
    { _mm512_storeu_##suffix(ptr, a.val); }                               \
    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
    { _mm512_store_##suffix(ptr, a.val); }                                \
    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
    { _mm512_stream_##suffix(ptr, a.val); }                               \
    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
    { \
        if( mode == hal::STORE_UNALIGNED ) \
            _mm512_storeu_##suffix(ptr, a.val); \
        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
            _mm512_stream_##suffix(ptr, a.val); \
        else \
            _mm512_store_##suffix(ptr, a.val); \
    } \
    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
    { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); }            \
    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
    { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }

OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float,  ps, __m256)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)

#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
    { return _Tpvec(cast(a.val)); }

#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)         \
    inline _Tpvec v512_setzero_##suffix()                                          \
    { return _Tpvec(_mm512_setzero_si512()); }                                     \
    inline _Tpvec v512_setall_##suffix(_Tp v)                                      \
    { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); }                          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,   suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,    suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32,  suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,   suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16,  suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,   suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,   suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,    suffix, OPENCV_HAL_NOP)      \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8,  suffix, _mm512_castpd_si512)

OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64,  uchar,    u8,  epi8,   char)
OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64,   schar,    s8,  epi8,   char)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort,   u16, epi16,  short)
OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32,  short,    s16, epi16,  short)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32,  int)
OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16,  int,      s32, epi32,  int)
OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8,  uint64,   u64, epi64,  int64)
OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8,   int64,    s64, epi64,  int64)

#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
    inline _Tpvec v512_setzero_##suffix()                                   \
    { return _Tpvec(_mm512_setzero_##zsuffix()); }                          \
    inline _Tpvec v512_setall_##suffix(_Tp v)                               \
    { return _Tpvec(_mm512_set1_##zsuffix(v)); }                            \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,   suffix, cast)          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast)          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast)          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,  suffix, cast)          \
    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,   suffix, cast)

OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float,  f32, ps, _mm512_castsi512_ps)
OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8,  double, f64, pd, _mm512_castsi512_pd)

inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
{ return a; }
inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_castpd_ps(a.val)); }

inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
{ return a; }
inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_castps_pd(a.val)); }

// FP16
inline v_float32x16 v512_load_expand(const float16_t* ptr)
{
    return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
}

inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
{
    __m256i ah = _mm512_cvtps_ph(a.val, 0);
    _mm256_storeu_si256((__m256i*)ptr, ah);
}

/* Recombine & ZIP */
inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
{
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8( 95,  31,  94,  30,  93,  29,  92,  28,  91,  27,  90,  26,  89,  25,  88,  24,
                                    87,  23,  86,  22,  85,  21,  84,  20,  83,  19,  82,  18,  81,  17,  80,  16,
                                    79,  15,  78,  14,  77,  13,  76,  12,  75,  11,  74,  10,  73,   9,  72,   8,
                                    71,   7,  70,   6,  69,   5,  68,   4,  67,   3,  66,   2,  65,   1,  64,   0);
    ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu8(127,  63, 126,  62, 125,  61, 124,  60, 123,  59, 122,  58, 121,  57, 120,  56,
                                   119,  55, 118,  54, 117,  53, 116,  52, 115,  51, 114,  50, 113,  49, 112,  48,
                                   111,  47, 110,  46, 109,  45, 108,  44, 107,  43, 106,  42, 105,  41, 104,  40,
                                   103,  39, 102,  38, 101,  37, 100,  36,  99,  35,  98,  34,  97,  33,  96,  32);
    ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val));
#else
    __m512i low  = _mm512_unpacklo_epi8(a.val, b.val);
    __m512i high = _mm512_unpackhi_epi8(a.val, b.val);
    ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2,  9,  8, 1, 0), high));
    ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
#endif
}
inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
{
    __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41,  9, 40,  8,
                                    39,  7, 38,  6, 37,  5, 36,  4, 35,  3, 34,  2, 33,  1, 32,  0);
    ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
                                    55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
    ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val));
}
inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
{
    __m512i mask0 = _v512_set_epu32(23,  7, 22,  6, 21,  5, 20,  4, 19,  3, 18,  2, 17, 1, 16, 0);
    ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
    ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val));
}
inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
{
    __m512i mask0 = _v512_set_epu64(11, 3, 10, 2,  9, 1,  8, 0);
    ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val));
    __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
    ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val));
}

inline void v_zip(const v_uint8x64&  a, const v_uint8x64&  b, v_uint8x64& ab0, v_uint8x64& ab1)
{
    v_int8x64 i0, i1;
    v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1);
    ab0 = v_reinterpret_as_u8(i0);
    ab1 = v_reinterpret_as_u8(i1);
}
inline void v_zip(const v_uint16x32&  a, const v_uint16x32&  b, v_uint16x32& ab0, v_uint16x32& ab1)
{
    v_int16x32 i0, i1;
    v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1);
    ab0 = v_reinterpret_as_u16(i0);
    ab1 = v_reinterpret_as_u16(i1);
}
inline void v_zip(const v_uint32x16&  a, const v_uint32x16&  b, v_uint32x16& ab0, v_uint32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_u32(i0);
    ab1 = v_reinterpret_as_u32(i1);
}
inline void v_zip(const v_uint64x8&  a, const v_uint64x8&  b, v_uint64x8& ab0, v_uint64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_u64(i0);
    ab1 = v_reinterpret_as_u64(i1);
}
inline void v_zip(const v_float32x16&  a, const v_float32x16&  b, v_float32x16& ab0, v_float32x16& ab1)
{
    v_int32x16 i0, i1;
    v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1);
    ab0 = v_reinterpret_as_f32(i0);
    ab1 = v_reinterpret_as_f32(i1);
}
inline void v_zip(const v_float64x8&  a, const v_float64x8&  b, v_float64x8& ab0, v_float64x8& ab1)
{
    v_int64x8 i0, i1;
    v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1);
    ab0 = v_reinterpret_as_f64(i0);
    ab1 = v_reinterpret_as_f64(i1);
}

#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix)                                    \
    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)                         \
    { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)                        \
    { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); }                    \
    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,                             \
                                  _Tpvec& c, _Tpvec& d)                                   \
    {                                                                                     \
        c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val));         \
        d.val = _v512_insert(b.val,_v512_extract_high(a.val));                            \
    }


OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64,   epi8)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64,    epi8)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32,  epi16)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32,   epi16)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16,   epi32)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8,   epi64)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8,    epi64)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8,  pd)

////////// Arithmetic, bitwise and comparison operations /////////

/* Element-wise binary and unary operations */

/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)      \
    { return _Tpvec(intrin(a.val, b.val)); }

OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)

inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i ad = _mm512_srai_epi16(a.val, 8);
    __m512i bd = _mm512_srai_epi16(b.val, 8);
    __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even
    __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd
    return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1));
}
inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
{
    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
}

#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin)            \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)     \
    { return _Tpvec(intrin(a.val, b.val)); }                             \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)       \
    { a.val = intrin(a.val, b.val); return a; }

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)

OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)

/** Saturating arithmetics **/
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64,  _mm512_adds_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64,  _mm512_subs_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64,   _mm512_adds_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64,   _mm512_subs_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32,  _mm512_adds_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32,  _mm512_subs_epi16)

OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)

// saturating multiply
inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
{
    v_uint16x32 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
{
    v_int16x32 c, d;
    v_mul_expand(a, b, c, d);
    return v_pack(c, d);
}
inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epu16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);

    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m)));
}
inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
{
    __m512i pl = _mm512_mullo_epi16(a.val, b.val);
    __m512i ph = _mm512_mulhi_epi16(a.val, b.val);
    __m512i p0 = _mm512_unpacklo_epi16(pl, ph);
    __m512i p1 = _mm512_unpackhi_epi16(pl, ph);
    return v_int16x32(_mm512_packs_epi32(p0, p1));
}

inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
{ a = a * b; return a; }
inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
{ a = a * b; return a; }
inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
{ a = a * b; return a; }
inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
{ a = a * b; return a; }

inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); }
inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); }

//  Multiply and expand
inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
                         v_uint16x32& c, v_uint16x32& d)
{
    v_uint16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
                         v_int16x32& c, v_int16x32& d)
{
    v_int16x32 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
                         v_int32x16& c, v_int32x16& d)
{
    v_int16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);

    c = v_reinterpret_as_s32(v0);
    d = v_reinterpret_as_s32(v1);
}

inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
                         v_uint32x16& c, v_uint32x16& d)
{
    v_uint16x32 v0, v1;
    v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1);

    c = v_reinterpret_as_u32(v0);
    d = v_reinterpret_as_u32(v1);
}

inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
                         v_uint64x8& c, v_uint64x8& d)
{
    v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)),
          v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}

inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
    v_int64x8& c, v_int64x8& d)
{
    v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)),
          v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d);
}

/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm)        \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); }         \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm)        \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); }         \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)        \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); }         \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)        \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }         \
    template<int imm>                                             \
    inline _Tpuvec v_shl(const _Tpuvec& a)                        \
    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); }         \
    template<int imm>                                             \
    inline _Tpsvec v_shl(const _Tpsvec& a)                        \
    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); }         \
    template<int imm>                                             \
    inline _Tpuvec v_shr(const _Tpuvec& a)                        \
    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); }         \
    template<int imm>                                             \
    inline _Tpsvec v_shr(const _Tpsvec& a)                        \
    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }

OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8,  v_int64x8,  epi64)


/** Bitwise logic **/
#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix)  \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix)   \
    OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix)  \
    inline _Tpvec operator ~ (const _Tpvec& a)                     \
    { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }

OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64,    si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32,  si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16,  si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16,   si512, _mm512_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8,   si512, _mm512_set1_epi64(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8,    si512, _mm512_set1_epi64(-1))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps,    _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8,  pd,    _mm512_castsi512_pd(_mm512_set1_epi32(-1)))

/** Select **/
#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf)                      \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }

OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64,   epi8, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64,    epi8, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32,  epi16, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16,  epi32, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8,  epi64, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8,   epi64, si512)
OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16,   ps,    ps)
OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8,    pd,    pd)

/** Comparison **/
#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)               \
    { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }

#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval)              \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(<,  _MM_CMPINT_LT,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(>,  _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)

OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64,   epu8,  epi8, (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64,    epi8,  epi8, (char)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32,  epi16, epi16, (short)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16,  epi32, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8,  epu64, epi64, (int64)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8,   epi64, epi64, (int64)-1)

#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)               \
    { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); }

#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval)           \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, sufcmp, sufset, tval) \
    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, sufcmp, sufset, tval)

OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8,  pd, epi64, (int64)-1)

inline v_float32x16 v_not_nan(const v_float32x16& a)
{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); }
inline v_float64x8 v_not_nan(const v_float64x8& a)
{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); }

/** min/max **/
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64,   _mm512_min_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64,   _mm512_max_epu8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64,    _mm512_min_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64,    _mm512_max_epi8)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32,  _mm512_min_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32,  _mm512_max_epu16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32,   _mm512_min_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32,   _mm512_max_epi16)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16,  _mm512_min_epu32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16,  _mm512_max_epu32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16,   _mm512_min_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16,   _mm512_max_epi32)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8,   _mm512_min_epu64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8,   _mm512_max_epu64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8,    _mm512_min_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8,    _mm512_max_epi64)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8,  _mm512_min_pd)
OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8,  _mm512_max_pd)

/** Rotate **/
namespace {
    template<bool prec, int imm4, bool part, int imm32>
    struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
    template<int imm4, int imm32>
    struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
    {
        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32    ),    imm4 *8),
                                         _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8)));
    }};
    template<int imm4>
    struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
    {
        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15),    imm4 *8),
                                         _mm512_slli_epi32(                                b.val, (4-imm4)*8)));
    }};
    template<int imm4, int imm32>
    struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
    {
        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16),    imm4 *8),
                                         _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
    }};
    template<int imm4>
    struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
    { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
    template<int imm32>
    struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b)
    { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
    template<>
    struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }};
    template<int imm32>
    struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b)
    { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
    template<>
    struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }};
    template<>
    struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }};
}
template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
{
    return imm >= 128 ? v_int8x64() :
#if CV_AVX_512VBMI
    v_int8x64(_mm512_permutex2var_epi8(a.val,
    _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
                   0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
                   0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
                   0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
                   0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
                   0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
                   0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
                   0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
#else
    _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
{
    if (imm == 0) return a;
    if (imm == 64) return b;
    if (imm >= 128) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_permutex2var_epi8(b.val,
           _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
                          0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
                          0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
                          0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
                          0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
                          0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
                          0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
                          0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
#else
    return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b);
#endif
}
template<int imm>
inline v_int8x64 v_rotate_right(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm,
           _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
                          0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
                          0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
                          0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
                          0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
                          0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
                          0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
                          0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
#else
    return v_rotate_right<imm>(a, v512_setzero_s8());
#endif
}
template<int imm>
inline v_int8x64 v_rotate_left(const v_int8x64& a)
{
    if (imm == 0) return a;
    if (imm >= 64) return v_int8x64();
#if CV_AVX_512VBMI
    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
           _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
                          0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
                          0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
                          0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
                          0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
                          0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
                          0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
                          0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
#else
    return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
#endif
}

#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix)                                                                                   \
template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                                                            \
{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); }      \
template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                                                           \
{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); }     \
template<int imm> inline _Tpvec v_rotate_left(const _Tpvec& a)                                                                             \
{ return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }                              \
template<int imm> inline _Tpvec v_rotate_right(const _Tpvec& a)                                                                            \
{ return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(v_reinterpret_as_s8(a))); }

#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix)                                                                                   \
template<int imm>                                                                                                                          \
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                                                                              \
{                                                                                                                                          \
    enum { SHIFT2 = (_Tpvec::nlanes - imm) };                                                                                              \
    enum { MASK = ((1 << _Tpvec::nlanes) - 1) };                                                                                           \
    if (imm == 0) return a;                                                                                                                \
    if (imm == _Tpvec::nlanes) return b;                                                                                                   \
    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero();                                                                                    \
    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
}                                                                                                                                          \
template<int imm>                                                                                                                          \
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                                                                             \
{                                                                                                                                          \
    enum { SHIFT2 = (_Tpvec::nlanes - imm) };                                                                                              \
    enum { MASK = ((1 << _Tpvec::nlanes) - 1) };                                                                                           \
    if (imm == 0) return a;                                                                                                                \
    if (imm == _Tpvec::nlanes) return b;                                                                                                   \
    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero();                                                                                    \
    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
}                                                                                                                                          \
template<int imm>                                                                                                                          \
inline _Tpvec v_rotate_left(const _Tpvec& a)                                                                                               \
{                                                                                                                                          \
    if (imm == 0) return a;                                                                                                                \
    if (imm >= _Tpvec::nlanes) return _Tpvec::zero();                                                                                      \
    return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val));                                              \
}                                                                                                                                          \
template<int imm>                                                                                                                          \
inline _Tpvec v_rotate_right(const _Tpvec& a)                                                                                              \
{                                                                                                                                          \
    if (imm == 0) return a;                                                                                                                \
    if (imm >= _Tpvec::nlanes) return _Tpvec::zero();                                                                                      \
    return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val));                                            \
}

OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64,   u8)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32,  u16)
OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32,   s16)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16,   epi32)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8,   epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8,    epi64)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8,  pd)

/** Reverse **/
inline v_uint8x64 v_reverse(const v_uint8x64 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
            0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
            0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
    return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint8x64(_mm512_permutexvar_epi64(perm, vec));
#endif
}

inline v_int8x64 v_reverse(const v_int8x64 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }

inline v_uint16x32 v_reverse(const v_uint16x32 &a)
{
#if CV_AVX_512VBMI
    static const __m512i perm = _mm512_set_epi32(
            0x00000001, 0x00020003, 0x00040005, 0x00060007,
            0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
            0x00100011, 0x00120013, 0x00140015, 0x00160017,
            0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
    return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val));
#else
    static const __m512i shuf = _mm512_set_epi32(
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
    static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
    __m512i vec = _mm512_shuffle_epi8(a.val, shuf);
    return v_uint16x32(_mm512_permutexvar_epi64(perm, vec));
#endif
}

inline v_int16x32 v_reverse(const v_int16x32 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }

inline v_uint32x16 v_reverse(const v_uint32x16 &a)
{
    static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

inline v_int32x16 v_reverse(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_float32x16 v_reverse(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x8 v_reverse(const v_uint64x8 &a)
{
    static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val));
}

inline v_int64x8 v_reverse(const v_int64x8 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }

inline v_float64x8 v_reverse(const v_float64x8 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }

////////// Reduce /////////

/** Reduce **/
#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b
#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop)                                          \
    inline sctype v_reduce_##func(const _Tpvec& a)                                                                  \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val));                           \
      sctype CV_DECL_ALIGNED(64) idx[2];                                                                            \
      _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \
      return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  min, v_int64x8,  min_epi64, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  max, v_int64x8,  max_epi64, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  sum, v_int64x8,  add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)

#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop)                                         \
    inline double v_reduce_##func(const v_float64x8& a)                                             \
    { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val));           \
      double CV_DECL_ALIGNED(64) idx[2];                                                            \
      _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \
      return scop(idx[0], idx[1]); }
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)

#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc)                                 \
    inline sctype v_reduce_##func(const _Tpvec& a)                                                    \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val));             \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8));                                     \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4));                                     \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  min, v_int32x16,  min_epi32)
OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  max, v_int32x16,  max_epi32)

#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc)                                            \
    inline float v_reduce_##func(const v_float32x16& a)                                           \
    { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val));          \
      __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2)));           \
      quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1)));           \
      return _mm_cvtss_f32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)

inline float v_reduce_sum(const v_float32x16& a)
{
    __m256 half = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
    __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1));
    quarter = _mm_hadd_ps(quarter, quarter);
    return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter));
}
inline int v_reduce_sum(const v_int32x16& a)
{
    __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    quarter = _mm_hadd_epi32(quarter, quarter);
    return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
}
inline uint v_reduce_sum(const v_uint32x16& a)
{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); }

#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc)                                 \
    inline sctype v_reduce_##func(const _Tpvec& a)                                                    \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val));             \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8));                                     \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4));                                     \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2));                                     \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  min, v_int16x32,  min_epi16)
OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  max, v_int16x32,  max_epi16)

inline int v_reduce_sum(const v_int16x32& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline uint v_reduce_sum(const v_uint16x32& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }

#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc)                                 \
    inline sctype v_reduce_##func(const _Tpvec& a)                                                    \
    { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val));             \
      __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8));                                     \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4));                                     \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2));                                     \
      quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1));                                     \
      return (sctype)_mm_cvtsi128_si32(quarter); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64,  min_epi8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64,  max_epi8)

#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix)                                    \
    inline sctype v_reduce_sum(const _Tpvec& a)                                                         \
    {   __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)),            \
                                       _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val)));          \
        a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \
        __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16));                 \
        __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1));        \
        a4 = _mm_hadd_epi32(a4, a4);                                                                    \
        return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); }
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int,  v_int8x64,  epi8)

inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
                                  const v_float32x16& c, const v_float32x16& d)
{
    __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
    __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
    __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
    __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
    return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh)));
}

inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i val = _mm512_sad_epu8(a.val, b.val);
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
{
    __m512i val = _mm512_set1_epi8(-128);
    val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val));
    __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val));
    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
{ return v_reduce_sum(v_add_wrap(a - b, b - a)); }
inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); }
inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); }
inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); }
inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); }
inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); }

/** Popcount **/
inline v_uint8x64 v_popcount(const v_int8x64& a)
{
#if CV_AVX_512BITALG
    return v_uint8x64(_mm512_popcnt_epi8(a.val));
#elif CV_AVX_512VBMI
    __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
                                            4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
    __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
    return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
#else
    __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
    __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);

    return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(                  a.val,     _popcnt_mask)),
                                      _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
#endif
}
inline v_uint16x32 v_popcount(const v_int16x32& a)
{
#if CV_AVX_512BITALG
    return v_uint16x32(_mm512_popcnt_epi16(a.val));
#elif CV_AVX_512VPOPCNTDQ
    __m512i zero = _mm512_setzero_si512();
    return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)),
                                          _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero))));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff);
#endif
}
inline v_uint32x16 v_popcount(const v_int32x16& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint32x16(_mm512_popcnt_epi32(a.val));
#else
    v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a));
    p += v_rotate_right<1>(p);
    p += v_rotate_right<2>(p);
    return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff);
#endif
}
inline v_uint64x8 v_popcount(const v_int64x8& a)
{
#if CV_AVX_512VPOPCNTDQ
    return v_uint64x8(_mm512_popcnt_epi64(a.val));
#else
    return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512()));
#endif
}


inline v_uint8x64  v_popcount(const v_uint8x64&  a) { return v_popcount(v_reinterpret_as_s8 (a)); }
inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); }
inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); }
inline v_uint64x8  v_popcount(const v_uint64x8&  a) { return v_popcount(v_reinterpret_as_s64(a)); }


////////// Other math /////////

/** Some frequent operations **/
#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix)                         \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }            \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); }            \
    inline _Tpvec v_sqrt(const _Tpvec& x)                                     \
    { return _Tpvec(_mm512_sqrt_##suffix(x.val)); }                           \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)           \
    { return v_fma(a, a, b * b); }                                            \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)               \
    { return v_sqrt(v_fma(a, a, b * b)); }

OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8,  pd)

inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return a * b + c; }
inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
{ return v_fma(a, b, c); }

inline v_float32x16 v_invsqrt(const v_float32x16& x)
{
#if CV_AVX_512ER
    return v_float32x16(_mm512_rsqrt28_ps(x.val));
#else
    v_float32x16 half = x * v512_setall_f32(0.5);
    v_float32x16 t  = v_float32x16(_mm512_rsqrt14_ps(x.val));
    t *= v512_setall_f32(1.5) - ((t * t) * half);
    return t;
#endif
}

inline v_float64x8 v_invsqrt(const v_float64x8& x)
{
#if CV_AVX_512ER
    return v_float64x8(_mm512_rsqrt28_pd(x.val));
#else
    return v512_setall_f64(1.) / v_sqrt(x);
//    v_float64x8 half = x * v512_setall_f64(0.5);
//    v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val));
//    t *= v512_setall_f64(1.5) - ((t * t) * half);
//    t *= v512_setall_f64(1.5) - ((t * t) * half);
//    return t;
#endif
}

/** Absolute values **/
#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
    inline _Tpuvec v_abs(const _Tpvec& x)                   \
    { return _Tpuvec(_mm512_abs_##suffix(x.val)); }

OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64,    v_uint8x64,    epi8)
OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32,   v_uint16x32,  epi16)
OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16,   v_uint32x16,  epi32)
OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8,    v_uint64x8,   epi64)

inline v_float32x16 v_abs(const v_float32x16& x)
{
#ifdef _mm512_abs_pd
    return v_float32x16(_mm512_abs_ps(x.val));
#else
    return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
                        _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
                                        0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
#endif
}

inline v_float64x8 v_abs(const v_float64x8& x)
{
#ifdef _mm512_abs_pd
    #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
        // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476
        return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
    #else
        return v_float64x8(_mm512_abs_pd(x.val));
    #endif
#else
    return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
                       _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
                                       0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
#endif
}

/** Absolute difference **/
inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
{ return v_add_wrap(a - b,  b - a); }
inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
{ return v_add_wrap(a - b,  b - a); }
inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
{ return v_max(a, b) - v_min(a, b); }

inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = v_sub_wrap(a, b);
    v_int8x64 m = a < b;
    return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
}

inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }

inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 d = a - b;
    v_int32x16 m = a < b;
    return v_reinterpret_as_u32((d ^ m) - m);
}

inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
{ return v_abs(a - b); }

inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
{ return v_abs(a - b); }

/** Saturating absolute difference **/
inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
{
    v_int8x64 d = a - b;
    v_int8x64 m = a < b;
    return (d ^ m) - m;
}
inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
{ return v_max(a, b) - v_min(a, b); }

////////// Conversions /////////

/** Rounding **/
inline v_int32x16 v_round(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(a.val)); }

inline v_int32x16 v_round(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); }

inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), _mm512_cvtpd_epi32(b.val))); }

inline v_int32x16 v_trunc(const v_float32x16& a)
{ return v_int32x16(_mm512_cvttps_epi32(a.val)); }

inline v_int32x16 v_trunc(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); }

#if CVT_ROUND_MODES_IMPLEMENTED
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); }
#else
inline v_int32x16 v_floor(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); }

inline v_int32x16 v_floor(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); }

inline v_int32x16 v_ceil(const v_float32x16& a)
{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); }

inline v_int32x16 v_ceil(const v_float64x8& a)
{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); }
#endif

/** To float **/
inline v_float32x16 v_cvt_f32(const v_int32x16& a)
{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a)
{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }

inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b)
{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }

inline v_float64x8 v_cvt_f64(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_int32x16& a)
{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }

inline v_float64x8 v_cvt_f64(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }

inline v_float64x8 v_cvt_f64_high(const v_float32x16& a)
{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }

// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x8 v_cvt_f64(const v_int64x8& v)
{
#if CV_AVX_512DQ
    return v_float64x8(_mm512_cvtepi64_pd(v.val));
#else
    // constants encoded as floating-point
    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
    __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);

    // Blend the 32 lowest significant bits of v with magic_int_lo
    __m512i v_lo         = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
    // Extract the 32 most significant bits of v
    __m512i v_hi         = _mm512_srli_epi64(v.val, 32);
    // Flip the msb of v_hi and blend with 0x45300000
            v_hi         = _mm512_xor_si512(v_hi, magic_i_hi32);
    // Compute in double precision
    __m512d v_hi_dbl     = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
    // (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition
    __m512d result       = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
    return v_float64x8(result);
#endif
}

////////////// Lookup table access ////////////////////

inline v_int8x64 v512_lut(const schar* tab, const int* idx)
{
    __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
    __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
    return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3));
}
inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
    return v_int8x64(_v512_combine(p0, p1));
}
inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
}
inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); }

inline v_int16x32 v512_lut(const short* tab, const int* idx)
{
    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 2));
    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
    return v_int16x32(_v512_combine(p0, p1));
}
inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
}
inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
#else
    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
#endif
}
inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); }
inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); }

inline v_int32x16 v512_lut(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
#else
    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
#endif
}
inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
{
    return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                          _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                          _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                          _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                          _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); }
inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); }

inline v_int64x8 v512_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
#else
    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
#endif
}
inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
{
    return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
                         _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
                         _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
                         _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
                         _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
}
inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); }
inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); }

inline v_float32x16 v512_lut(const float* tab, const int* idx)
{
    return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
}
inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); }
inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); }

inline v_float64x8 v512_lut(const double* tab, const int* idx)
{
    return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
}
inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
{
        return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
                               _mm_loadu_pd(tab + idx[0])),
                               _mm_loadu_pd(tab + idx[1]), 1),
                               _mm_loadu_pd(tab + idx[2]), 2),
                               _mm_loadu_pd(tab + idx[3]), 3));
}

inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
{
    return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
}

inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
{
    return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
}

inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
{
    return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
}

inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
{
    x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
    y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
{
    x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
    y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8);
}

inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
{
    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
}
inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
{
    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
}
inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
{
    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
}
inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
{
    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
}
inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
{
    return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD));
}
inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }

inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
{
    return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
                                              _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
}
inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
{
    return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
                                                               0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
}
inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
{
    return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                               0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}
inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); }
inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
{
    return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
}

////////// Matrix operations /////////

//////// Dot Product ////////

// 16 >> 32
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b) + c; }

// 32 >> 64
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
{
    __m512i even = _mm512_mul_epi32(a.val, b.val);
    __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32));
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b) + c; }

// 8 >> 32
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
{
    __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512());
    __m512i odd_a  = _mm512_srli_epi16(a.val, 8);

    __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512());
    __m512i odd_b  = _mm512_srli_epi16(b.val, 8);

    __m512i prod0  = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1  = _mm512_madd_epi16(odd_a, odd_b);
    return v_uint32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
{
    __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
    __m512i odd_a  = _mm512_srai_epi16(a.val, 8);

    __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
    __m512i odd_b  = _mm512_srai_epi16(b.val, 8);

    __m512i prod0  = _mm512_madd_epi16(even_a, even_b);
    __m512i prod1  = _mm512_madd_epi16(odd_a, odd_b);
    return v_int32x16(_mm512_add_epi32(prod0, prod1));
}
inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 64
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0  = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1  = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02   = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13   = _mm512_srli_epi64(mul0, 32);
    __m512i p46   = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57   = _mm512_srli_epi64(mul1, 32);

    __m512i p15_  = _mm512_add_epi64(p02, p13);
    __m512i p9d_  = _mm512_add_epi64(p46, p57);

    return v_uint64x8(_mm512_add_epi64(
        _mm512_unpacklo_epi64(p15_, p9d_),
        _mm512_unpackhi_epi64(p15_, p9d_)
    ));
}
inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
{
    __m512i prod = _mm512_madd_epi16(a.val, b.val);
    __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32);
    __m512i odd  = _mm512_srai_epi64(prod, 32);
    return v_int64x8(_mm512_add_epi64(even, odd));
}
inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b) + c; }

//////// Fast Dot Product ////////

// 16 >> 32
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod(a, b); }
inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod(a, b); }
inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
{ return v_dotprod(a, b, c); }

// 8 >> 32
inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
{ return v_dotprod_expand(a, b, c); }

inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
{ return v_dotprod_expand(a, b); }
inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
{
    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);
    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);
    __m512i mul0  = _mm512_unpacklo_epi16(mullo, mulhi);
    __m512i mul1  = _mm512_unpackhi_epi16(mullo, mulhi);

    __m512i p02   = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512());
    __m512i p13   = _mm512_srli_epi64(mul0, 32);
    __m512i p46   = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
    __m512i p57   = _mm512_srli_epi64(mul1, 32);

    __m512i p15_  = _mm512_add_epi64(p02, p13);
    __m512i p9d_  = _mm512_add_epi64(p46, p57);
    return v_uint64x8(_mm512_add_epi64(p15_, p9d_));
}
inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
{ return v_dotprod_expand_fast(a, b) + c; }

inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b)
{ return v_dotprod_expand(a, b); }
inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
{ return v_dotprod_expand(a, b, c); }

// 32 >> 64f
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c)
{ return v_dotprod_expand(a, b) + c; }


#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
    v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))

inline v_float32x16 v_matmul(const v_float32x16& v,
                             const v_float32x16& m0, const v_float32x16& m1,
                             const v_float32x16& m2, const v_float32x16& m3)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
}

inline v_float32x16 v_matmuladd(const v_float32x16& v,
                                const v_float32x16& m0, const v_float32x16& m1,
                                const v_float32x16& m2, const v_float32x16& a)
{
    v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
    v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
    v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
    return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
}

#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
                               const _Tpvec& a2, const _Tpvec& a3,              \
                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
    {                                                                           \
        __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val));       \
        __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val));       \
        __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val));       \
        __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val));       \
        b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1));                        \
        b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1));                        \
        b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3));                        \
        b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3));                        \
    }

OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps)

//////////////// Value reordering ///////////////

/* Expand */
#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    {                                                               \
        b0.val = intrin(_v512_extract_low(a.val));                  \
        b1.val = intrin(_v512_extract_high(a.val));                 \
    }                                                               \
    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
    { return _Tpwvec(intrin(_v512_extract_low(a.val))); }           \
    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
    { return _Tpwvec(intrin(_v512_extract_high(a.val))); }          \
    inline _Tpwvec v512_load_expand(const _Tp* ptr)                 \
    {                                                               \
        __m256i a = _mm256_loadu_si256((const __m256i*)ptr);        \
        return _Tpwvec(intrin(a));                                  \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64,  v_uint16x32, uchar,    _mm512_cvtepu8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64,   v_int16x32,  schar,    _mm512_cvtepi8_epi16)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort,   _mm512_cvtepu16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32,  v_int32x16,  short,    _mm512_cvtepi16_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8,  unsigned, _mm512_cvtepu32_epi64)
OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16,  v_int64x8,   int,      _mm512_cvtepi32_epi64)

#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
    inline _Tpvec v512_load_expand_q(const _Tp* ptr)         \
    {                                                        \
        __m128i a = _mm_loadu_si128((const __m128i*)ptr);    \
        return _Tpvec(intrin(a));                            \
    }

OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32)
OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16,  schar, _mm512_cvtepi8_epi32)

/* pack */
// 16
inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b)
{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    const __m512i t = _mm512_set1_epi16(255);
    return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
}

inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
}

inline void v_pack_store(schar* ptr, const v_int16x32& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
{
    const __m512i m = _mm512_set1_epi16(255);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
}

inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
{ v_store_low(ptr, v_pack_u(a, a)); }

template<int n> inline
v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
                    v_reinterpret_as_s16((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
{
    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
}

template<int n> inline
v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
{
    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
    v_pack_store(ptr, (a + delta) >> n);
}

// 32
inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }

inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    const __m512i m = _mm512_set1_epi32(65535);
    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
}

inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }

inline void v_pack_store(short* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack(a, a)); }

inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
{
    const __m512i m = _mm512_set1_epi32(65535);
    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
{ v_store_low(ptr, v_pack_u(a, a)); }


template<int n> inline
v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
                    v_reinterpret_as_s32((b + delta) >> n));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
{
    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
}

template<int n> inline
v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack_u((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_u_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x16& a)
{
    v_int32x16 delta = v512_setall_s32(1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

// 64
// Non-saturating pack
inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }

inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }

inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }

inline void v_pack_store(int* ptr, const v_int64x8& b)
{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }

template<int n> inline
v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
{
    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

template<int n> inline
v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    return v_pack((a + delta) >> n, (b + delta) >> n);
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x8& a)
{
    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
    v_pack_store(ptr, (a + delta) >> n);
}

// pack boolean
inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }

inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
                           const v_uint32x16& c, const v_uint32x16& d)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);

    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
}

inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
                           const v_uint64x8& g, const v_uint64x8& h)
{
    __m512i ab = _mm512_packs_epi32(a.val, b.val);
    __m512i cd = _mm512_packs_epi32(c.val, d.val);
    __m512i ef = _mm512_packs_epi32(e.val, f.val);
    __m512i gh = _mm512_packs_epi32(g.val, h.val);

    __m512i abcd = _mm512_packs_epi32(ab, cd);
    __m512i efgh = _mm512_packs_epi32(ef, gh);

    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
                                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17,  9, 1, 24, 16,  8, 0), _mm512_packs_epi16(abcd, efgh)));
}

/* Recombine */
// its up there with load and store operations

/* Extract */
#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec)                \
    template<int s>                                           \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
    { return v_rotate_right<s>(a, b); }

OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)

#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }

OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)

template<int i>
inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
{
    static const __m512i perm = _mm512_set1_epi32((char)i);
    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
}

template<int i>
inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }

template<int i>
inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }


///////////////////// load deinterleave /////////////////////////////

inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);
    a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
    b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
#else
    __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
    __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
    __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
    __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
    b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
#endif
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);
    a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
    b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1));
    b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1));
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b )
{
    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
    a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1));
    b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1));
}

inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));

#if CV_AVX_512VBMI2
    __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102,  99,  96,  93,  90,  87,  84, 81,
                                    78,  75,  72,  69,  66,  63,  60,  57,  54,  51,  48,  45,  42,  39,  36, 33,
                                    30,  27,  24,  21,  18,  15,  12,   9,   6,   3,   0,  62,  59,  56,  53, 50,
                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,  2);
    __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1);
    __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
    __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
                    _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83, 80,
                                    77,  74,  71,  68,  65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
                                    94,  91,  88,  85,  82,  79,  76,  73,  70,  67,  64,  61,  58,  55,  52, 49,
                                    46,  43,  40,  37,  34,  31,  28,  25,  22,  19,  16,  13,  10,   7,   4,  1), bgr2);
    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01));
    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
#elif CV_AVX_512VBMI
    __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0);
    __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
    __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83,  80,
                                                                    77,  74,  71,  68,  65,  63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,
                                                                    46,  45,  43,  42,  40,  39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,
                                                                    23,  21,  20,  18,  17,  15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0), bgr2));
    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,  46,  45,  43,  42,  40,
                                                                    39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,  23,  21,  20,  18,  17,
                                                                    15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0, 126, 123, 120, 117, 114,
                                                                   111, 108, 105, 102,  99,  96,  93,  90,  87,  84,  81,  78,  75,  72,  69,  66), bgr0));
    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63,  60,  57,  54,  51,  48,  45,  42,  39,  36,  33,  30,  27,  24,  21,  18,
                                                                    15,  12,   9,   6,   3,   0, 125, 122, 119, 116, 113, 110, 107, 104, 101,  98,
                                                                    95,  92,  89,  86,  83,  80,  77,  74,  71,  68,  65,  62,  59,  56,  53,  50,
                                                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,   2), bgr1));
#else
    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
    __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                   14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
    __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1));
    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)));
#endif
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));

    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);

    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2));
    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
                                                                    14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11));
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));

    __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0);
    __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1);
    __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
    __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);

    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2));
    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11));
    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c )
{
    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));

    __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0);
    __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1);
    __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
    __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);

    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2));
    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6));
    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
}

inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192));

#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);

    __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3);

    a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23));
#else
    __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
    __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask);
    __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask);
    __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask);
    __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask);

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1);
    __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1);
    __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3);
    __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3);

    a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23));
#endif
}

inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96));

    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);

    __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3);

    a = v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23));
    c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23));
    b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23));
    d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23));
}

inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48));

    __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3);

    a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23));
    c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23));
    b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23));
    d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23));
}

inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d )
{
    __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr);
    __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
    __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
    __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24));

    __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
    __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);

    __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1);
    __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1);
    __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3);
    __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3);

    a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23));
    c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23));
    b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23));
    d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23));
}

///////////////////////////// store interleave /////////////////////////////////////

inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 64), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), high.val);
    }
}

inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 32), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), high.val);
    }
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 16), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), high.val);
    }
}

inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 low, high;
    v_zip(x, y, low, high);
    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, low.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), high.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, low.val);
        _mm512_store_si512((__m512i*)(ptr + 8), high.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, low.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), high.val);
    }
}

inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
#if CV_AVX_512VBMI
    __m512i mask0 = _v512_set_epu8(127,  84,  20, 126,  83,  19, 125,  82,  18, 124,  81,  17, 123,  80,  16, 122,
                                    79,  15, 121,  78,  14, 120,  77,  13, 119,  76,  12, 118,  75,  11, 117,  74,
                                    10, 116,  73,   9, 115,  72,   8, 114,  71,   7, 113,  70,   6, 112,  69,   5,
                                   111,  68,   4, 110,  67,   3, 109,  66,   2, 108,  65,   1, 107,  64,   0, 106);
    __m512i mask1 = _v512_set_epu8( 21,  42, 105,  20,  41, 104,  19,  40, 103,  18,  39, 102,  17,  38, 101,  16,
                                    37, 100,  15,  36,  99,  14,  35,  98,  13,  34,  97,  12,  33,  96,  11,  32,
                                    95,  10,  31,  94,   9,  30,  93,   8,  29,  92,   7,  28,  91,   6,  27,  90,
                                     5,  26,  89,   4,  25,  88,   3,  24,  87,   2,  23,  86,   1,  22,  85,   0);
    __m512i mask2 = _v512_set_epu8(106, 127,  63, 105, 126,  62, 104, 125,  61, 103, 124,  60, 102, 123,  59, 101,
                                   122,  58, 100, 121,  57,  99, 120,  56,  98, 119,  55,  97, 118,  54,  96, 117,
                                    53,  95, 116,  52,  94, 115,  51,  93, 114,  50,  92, 113,  49,  91, 112,  48,
                                    90, 111,  47,  89, 110,  46,  88, 109,  45,  87, 108,  44,  86, 107,  43,  85);
    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val);
    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);

    __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1);
    __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
#else
    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001));
    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0);
    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);

    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41,  9, 30, 40,  8, 29, 39,  7, 28, 38,  6, 27, 37,
                                     5, 26, 36,  4, 25, 35,  3, 24, 34,  2, 23, 33,  1, 22, 32,  0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
#endif

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
    }
}

inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41,  9, 30, 40,  8, 29, 39,  7, 28, 38,  6, 27, 37,
                                     5, 26, 36,  4, 25, 35,  3, 24, 34,  2, 23, 33,  1, 22, 32,  0);
    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val);
    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);

    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
    }
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
    __m512i mask1 = _v512_set_epu32(31, 10, 25, 30,  9, 24, 29,  8, 23, 28,  7, 22, 27,  6, 21, 26);
    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2);
    __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
    }
}

inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    __m512i mask0 = _v512_set_epu64( 5, 12,  7,  4, 11,  6,  3, 10);
    __m512i mask1 = _v512_set_epu64(15,  7,  4, 14,  6,  3, 13,  5);
    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);

    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val);
    __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2);
    __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgr0);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgr0);
        _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgr0);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
    }
}

inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
                                const v_uint8x64& c, const v_uint8x64& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint8x64 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint8x64 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val);
    }
}

inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
                                const v_uint16x32& c, const v_uint16x32& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint16x32 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint16x32 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val);
    }
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
                                const v_uint32x16& c, const v_uint32x16& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint32x16 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint32x16 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val);
    }
}

inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
                                const v_uint64x8& c, const v_uint64x8& d,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    v_uint64x8 br01, br23, ga01, ga23;
    v_zip(a, c, br01, br23);
    v_zip(b, d, ga01, ga23);
    v_uint64x8 bgra0, bgra1, bgra2, bgra3;
    v_zip(br01, ga01, bgra0, bgra1);
    v_zip(br23, ga23, bgra2, bgra3);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        _mm512_stream_si512((__m512i*)ptr, bgra0.val);
        _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        _mm512_store_si512((__m512i*)ptr, bgra0.val);
        _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val);
    }
    else
    {
        _mm512_storeu_si512((__m512i*)ptr, bgra0.val);
        _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val);
        _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val);
        _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val);
    }
}

#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)

////////// Mask and checks /////////

/** Mask **/
inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); }
inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }

inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); }
inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); }
inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }

/** Checks **/
inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); }
inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }
inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); }
inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); }

inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }

inline int v_scan_forward(const v_int8x64& a)
{
    int64 mask = _mm512_movepi8_mask(a.val);
    int mask32 = (int)mask;
    return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0;
}
inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; }
inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }
inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; }

inline void v512_cleanup() { _mm256_zeroall(); }

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

} // cv::

#endif // OPENCV_HAL_INTRIN_AVX_HPP