From 4ecbcf0885472631b21a459b722403f6a7efed04 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Thu, 16 Jan 2020 15:06:34 +0300
Subject: [PATCH 1/4] imgproc: copy sumpixels.simd.hpp

---
 modules/imgproc/src/{sumpixels.cpp => sumpixels.simd.hpp} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename modules/imgproc/src/{sumpixels.cpp => sumpixels.simd.hpp} (100%)
 mode change 100755 => 100644

diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.simd.hpp
old mode 100755
new mode 100644
similarity index 100%
rename from modules/imgproc/src/sumpixels.cpp
rename to modules/imgproc/src/sumpixels.simd.hpp

From c6a622542d59b8f949812038b8b24f664e580729 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Thu, 16 Jan 2020 15:07:48 +0300
Subject: [PATCH 2/4] imgproc: copy sumpixels.dispatch.cpp

---
 modules/imgproc/src/{sumpixels.cpp => sumpixels.dispatch.cpp} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename modules/imgproc/src/{sumpixels.cpp => sumpixels.dispatch.cpp} (100%)

diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.dispatch.cpp
similarity index 100%
rename from modules/imgproc/src/sumpixels.cpp
rename to modules/imgproc/src/sumpixels.dispatch.cpp

From b4316af83496e9e4dbfdf680244fa84bd0bc174e Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Fri, 17 Jan 2020 16:49:46 +0300
Subject: [PATCH 3/4] imgproc: rename sumpixels.avx512_skx.{cpp,hpp}

---
 .../src/{sumpixels.avx512_skx.cpp => sumpixels.avx512_skx.hpp}    | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename modules/imgproc/src/{sumpixels.avx512_skx.cpp => sumpixels.avx512_skx.hpp} (100%)

diff --git a/modules/imgproc/src/sumpixels.avx512_skx.cpp b/modules/imgproc/src/sumpixels.avx512_skx.hpp
similarity index 100%
rename from modules/imgproc/src/sumpixels.avx512_skx.cpp
rename to modules/imgproc/src/sumpixels.avx512_skx.hpp

From 09b3383a7e1072ac831eaf71fc56a33941de4db9 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.alekhin@intel.com>
Date: Fri, 17 Jan 2020 16:54:08 +0300
Subject: [PATCH 4/4] imgproc: dispatch sumpixels (integral)

---
 modules/imgproc/CMakeLists.txt               |   1 +
 modules/imgproc/src/sumpixels.avx512_skx.hpp |  19 +-
 modules/imgproc/src/sumpixels.dispatch.cpp   | 479 ++++++-------------
 modules/imgproc/src/sumpixels.hpp            |  25 -
 modules/imgproc/src/sumpixels.simd.hpp       | 459 ++----------------
 5 files changed, 202 insertions(+), 781 deletions(-)
 delete mode 100644 modules/imgproc/src/sumpixels.hpp

diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index f26ea0b3bf..a74c883cd3 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -9,5 +9,6 @@ ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2)
 ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2)
+ocv_add_dispatched_file(sumpixels SSE2 AVX2 AVX512_SKX)
 ocv_add_dispatched_file(undistort SSE2 AVX2)
 ocv_define_module(imgproc opencv_core WRAP java python js)
diff --git a/modules/imgproc/src/sumpixels.avx512_skx.hpp b/modules/imgproc/src/sumpixels.avx512_skx.hpp
index 804b48d8c5..3c9c90c658 100644
--- a/modules/imgproc/src/sumpixels.avx512_skx.hpp
+++ b/modules/imgproc/src/sumpixels.avx512_skx.hpp
@@ -2,14 +2,13 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2019, Intel Corporation, all rights reserved.
-#include "precomp.hpp"
-#include "sumpixels.hpp"
+// Copyright (C) 2019-2020, Intel Corporation, all rights reserved.
 
 #include "opencv2/core/hal/intrin.hpp"
 
+namespace cv { namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
 
-namespace cv {
 namespace { // Anonymous namespace to avoid exposing the implementation classes
 
 //
@@ -432,16 +431,14 @@ __m512d IntegralCalculator < 4 > ::calculate_integral(const __m512i src_longs, c
 
 } // end of anonymous namespace
 
-namespace opt_AVX512_SKX {
-
-// This is the implementation for the external callers interface entry point.
-// It should be the only function called into this file from outside
-// Any new implementations should be directed from here
+static
 void calculate_integral_avx512(const uchar *src,   size_t _srcstep,
                                double      *sum,   size_t _sumstep,
                                double      *sqsum, size_t _sqsumstep,
                                int width, int height, int cn)
 {
+    CV_INSTRUMENT_REGION();
+
     switch(cn){
         case 1: {
             IntegralCalculator< 1 > calculator;
@@ -466,5 +463,5 @@ void calculate_integral_avx512(const uchar *src,   size_t _srcstep,
 }
 
 
-} // end namespace opt_AVX512_SXK
-} // end namespace cv
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} // end namespace cv::hal
diff --git a/modules/imgproc/src/sumpixels.dispatch.cpp b/modules/imgproc/src/sumpixels.dispatch.cpp
index 89337f3507..b828ec70c0 100755
--- a/modules/imgproc/src/sumpixels.dispatch.cpp
+++ b/modules/imgproc/src/sumpixels.dispatch.cpp
@@ -10,7 +10,7 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2020 Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
@@ -44,210 +44,157 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "opencv2/core/hal/intrin.hpp"
-#include "sumpixels.hpp"
 
-namespace cv
-{
+#include "sumpixels.simd.hpp"
+#include "sumpixels.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
-template <typename T, typename ST, typename QT>
-struct Integral_SIMD
+
+namespace cv {
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
 {
-    bool operator()(const T *, size_t,
-                    ST *, size_t,
-                    QT *, size_t,
-                    ST *, size_t,
-                    int, int, int) const
-    {
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ( (_src.type() != CV_8UC1) ||
+        !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
         return false;
-    }
-};
 
+    static const int tileSize = 16;
 
-template <>
-struct Integral_SIMD<uchar, double, double> {
-    Integral_SIMD() {};
+    String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
+                                ocl::typeToStr(sdepth), tileSize,
+                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");
 
+    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (kcols.empty())
+        return false;
 
-    bool operator()(const uchar *src, size_t _srcstep,
-                    double *sum,      size_t _sumstep,
-                    double *sqsum,    size_t _sqsumstep,
-                    double *tilted,   size_t _tiltedstep,
-                    int width, int height, int cn) const
+    UMat src = _src.getUMat();
+    Size src_size = src.size();
+    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
+    UMat buf(bufsize, sdepth);
+    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
+    size_t gt = src.cols, lt = tileSize;
+    if (!kcols.run(1, &gt, &lt, false))
+        return false;
+
+    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (krows.empty())
+        return false;
+
+    Size sumsize(src_size.width + 1, src_size.height + 1);
+    _sum.create(sumsize, sdepth);
+    UMat sum = _sum.getUMat();
+
+    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
+    gt = src.rows;
+    return krows.run(1, &gt, &lt, false);
+}
+
+static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
+{
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
+        return false;
+
+    static const int tileSize = 16;
+
+    String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
+                                ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
+                                tileSize,
+                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (kcols.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    Size src_size = src.size();
+    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
+    UMat buf(bufsize, sdepth);
+    UMat buf_sq(bufsize, sqdepth);
+    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
+    size_t gt = src.cols, lt = tileSize;
+    if (!kcols.run(1, &gt, &lt, false))
+        return false;
+
+    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
+    if (krows.empty())
+        return false;
+
+    Size sumsize(src_size.width + 1, src_size.height + 1);
+    _sum.create(sumsize, sdepth);
+    UMat sum = _sum.getUMat();
+    _sqsum.create(sumsize, sqdepth);
+    UMat sum_sq = _sqsum.getUMat();
+
+    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
+    gt = src.rows;
+    return krows.run(1, &gt, &lt, false);
+}
+
+#endif  // HAVE_OPENCL
+
+#ifdef HAVE_IPP
+
+static bool ipp_integral(
+    int depth, int sdepth, int sqdepth,
+    const uchar* src, size_t srcstep,
+    uchar* sum, size_t sumstep,
+    uchar* sqsum, size_t sqsumstep,
+    uchar* tilted, size_t tstep,
+    int width, int height, int cn)
+{
+    CV_INSTRUMENT_REGION_IPP();
+
+    IppiSize size = {width, height};
+
+    if(cn > 1)
+        return false;
+    if(tilted)
     {
-#if CV_TRY_AVX512_SKX
-        CV_UNUSED(_tiltedstep);
-        // TODO:  Add support for 1 channel input (WIP)
-        if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && (cn <= 4)){
-            opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
-                                                      sqsum, _sqsumstep, width, height, cn);
-            return true;
-        }
-#else
-        // Avoid warnings in some builds
-        CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep);
-        CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep);
-        CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn);
-#endif
+        CV_UNUSED(tstep);
         return false;
     }
 
-};
-
-#if CV_SIMD && CV_SIMD_WIDTH <= 64
-
-template <>
-struct Integral_SIMD<uchar, int, double>
-{
-    Integral_SIMD() {}
-
-    bool operator()(const uchar * src, size_t _srcstep,
-                    int * sum, size_t _sumstep,
-                    double * sqsum, size_t,
-                    int * tilted, size_t,
-                    int width, int height, int cn) const
+    if(!sqsum)
     {
-        if (sqsum || tilted || cn != 1)
+        if(depth == CV_8U && sdepth == CV_32S)
+            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, size, 0) >= 0;
+        else if(depth == CV_8UC1 && sdepth == CV_32F)
+            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size, 0) >= 0;
+        else if(depth == CV_32FC1 && sdepth == CV_32F)
+            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_32f_C1R, (const Ipp32f*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size) >= 0;
+        else
             return false;
-
-        // the first iteration
-        memset(sum, 0, (width + 1) * sizeof(int));
-
-        // the others
-        for (int i = 0; i < height; ++i)
-        {
-            const uchar * src_row = src + _srcstep * i;
-            int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1;
-            int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1;
-
-            sum_row[-1] = 0;
-
-            v_int32 prev = vx_setzero_s32();
-            int j = 0;
-            for ( ; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
-            {
-                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
-                v_int32 el4l, el4h;
-#if CV_AVX2 && CV_SIMD_WIDTH == 32
-                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
-                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
-                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
-                __m256i shmask = _mm256_set1_epi32(7);
-                el4l.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_low(vsum)), prev.val);
-                el4h.val = _mm256_add_epi32(_mm256_cvtepi16_epi32(_v256_extract_high(vsum)), _mm256_permutevar8x32_epi32(el4l.val, shmask));
-                prev.val = _mm256_permutevar8x32_epi32(el4h.val, shmask);
-#else
-                el8 += v_rotate_left<1>(el8);
-                el8 += v_rotate_left<2>(el8);
-#if CV_SIMD_WIDTH >= 32
-                el8 += v_rotate_left<4>(el8);
-#if CV_SIMD_WIDTH == 64
-                el8 += v_rotate_left<8>(el8);
-#endif
-#endif
-                v_expand(el8, el4l, el4h);
-                el4l += prev;
-                el4h += el4l;
-
-                prev = v_broadcast_element<v_int32::nlanes - 1>(el4h);
-#endif
-                v_store(sum_row + j                  , el4l + vx_load(prev_sum_row + j                  ));
-                v_store(sum_row + j + v_int32::nlanes, el4h + vx_load(prev_sum_row + j + v_int32::nlanes));
-            }
-
-            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
-                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
-        }
-        vx_cleanup();
-
-        return true;
     }
-};
-
-template <>
-struct Integral_SIMD<uchar, float, double>
-{
-    Integral_SIMD() {}
-
-    bool operator()(const uchar * src, size_t _srcstep,
-        float * sum, size_t _sumstep,
-        double * sqsum, size_t,
-        float * tilted, size_t,
-        int width, int height, int cn) const
+    else
     {
-        if (sqsum || tilted || cn != 1)
+        if(depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S)
+            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp32s*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
+        else if(depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F)
+            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
+        else if(depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F)
+            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32f64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
+        else
             return false;
-
-        // the first iteration
-        memset(sum, 0, (width + 1) * sizeof(int));
-
-        // the others
-        for (int i = 0; i < height; ++i)
-        {
-            const uchar * src_row = src + _srcstep * i;
-            float * prev_sum_row = (float *)((uchar *)sum + _sumstep * i) + 1;
-            float * sum_row = (float *)((uchar *)sum + _sumstep * (i + 1)) + 1;
-
-            sum_row[-1] = 0;
-
-            v_float32 prev = vx_setzero_f32();
-            int j = 0;
-            for (; j + v_uint16::nlanes <= width; j += v_uint16::nlanes)
-            {
-                v_int16 el8 = v_reinterpret_as_s16(vx_load_expand(src_row + j));
-                v_float32 el4l, el4h;
-#if CV_AVX2 && CV_SIMD_WIDTH == 32
-                __m256i vsum = _mm256_add_epi16(el8.val, _mm256_slli_si256(el8.val, 2));
-                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 4));
-                vsum = _mm256_add_epi16(vsum, _mm256_slli_si256(vsum, 8));
-                __m256i shmask = _mm256_set1_epi32(7);
-                el4l.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_low(vsum))), prev.val);
-                el4h.val = _mm256_add_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_v256_extract_high(vsum))), _mm256_permutevar8x32_ps(el4l.val, shmask));
-                prev.val = _mm256_permutevar8x32_ps(el4h.val, shmask);
-#else
-                el8 += v_rotate_left<1>(el8);
-                el8 += v_rotate_left<2>(el8);
-#if CV_SIMD_WIDTH >= 32
-                el8 += v_rotate_left<4>(el8);
-#if CV_SIMD_WIDTH == 64
-                el8 += v_rotate_left<8>(el8);
-#endif
-#endif
-                v_int32 el4li, el4hi;
-                v_expand(el8, el4li, el4hi);
-                el4l = v_cvt_f32(el4li) + prev;
-                el4h = v_cvt_f32(el4hi) + el4l;
-
-                prev = v_broadcast_element<v_float32::nlanes - 1>(el4h);
-#endif
-                v_store(sum_row + j                    , el4l + vx_load(prev_sum_row + j                    ));
-                v_store(sum_row + j + v_float32::nlanes, el4h + vx_load(prev_sum_row + j + v_float32::nlanes));
-            }
-
-            for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
-                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
-        }
-        vx_cleanup();
-
-        return true;
     }
-};
+}
 
-#endif
+#endif  // HAVE_IPP
 
-template<typename T, typename ST, typename QT>
+namespace hal {
+
+template<typename T, typename ST, typename QT> static
 void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
                 QT* sqsum, size_t _sqsumstep, ST* tilted, size_t _tiltedstep,
                 int width, int height, int cn )
 {
     int x, y, k;
 
-    if (Integral_SIMD<T, ST, QT>()(src, _srcstep,
-                                   sum, _sumstep,
-                                   sqsum, _sqsumstep,
-                                   tilted, _tiltedstep,
-                                   width, height, cn))
-        return;
-
     int srcstep = (int)(_srcstep/sizeof(T));
     int sumstep = (int)(_sumstep/sizeof(ST));
     int tiltedstep = (int)(_tiltedstep/sizeof(ST));
@@ -401,157 +348,36 @@ void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
     }
 }
 
-
-#ifdef HAVE_OPENCL
-
-static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
+static bool integral_SIMD(
+        int depth, int sdepth, int sqdepth,
+        const uchar* src, size_t srcstep,
+        uchar* sum, size_t sumstep,
+        uchar* sqsum, size_t sqsumstep,
+        uchar* tilted, size_t tstep,
+        int width, int height, int cn)
 {
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    CV_INSTRUMENT_REGION();
 
-    if ( (_src.type() != CV_8UC1) ||
-        !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
-        return false;
-
-    static const int tileSize = 16;
-
-    String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
-                                ocl::typeToStr(sdepth), tileSize,
-                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
-    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (kcols.empty())
-        return false;
-
-    UMat src = _src.getUMat();
-    Size src_size = src.size();
-    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
-    UMat buf(bufsize, sdepth);
-    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
-    size_t gt = src.cols, lt = tileSize;
-    if (!kcols.run(1, &gt, &lt, false))
-        return false;
-
-    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (krows.empty())
-        return false;
-
-    Size sumsize(src_size.width + 1, src_size.height + 1);
-    _sum.create(sumsize, sdepth);
-    UMat sum = _sum.getUMat();
-
-    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
-    gt = src.rows;
-    return krows.run(1, &gt, &lt, false);
+    CV_CPU_DISPATCH(integral_SIMD, (depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn),
+        CV_CPU_DISPATCH_MODES_ALL);
 }
 
-static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
+void integral(
+        int depth, int sdepth, int sqdepth,
+        const uchar* src, size_t srcstep,
+        uchar* sum, size_t sumstep,
+        uchar* sqsum, size_t sqsumstep,
+        uchar* tilted, size_t tstep,
+        int width, int height, int cn)
 {
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    CV_INSTRUMENT_REGION();
 
-    if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
-        return false;
-
-    static const int tileSize = 16;
-
-    String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
-                                ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
-                                tileSize,
-                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
-    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (kcols.empty())
-        return false;
-
-    UMat src = _src.getUMat();
-    Size src_size = src.size();
-    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
-    UMat buf(bufsize, sdepth);
-    UMat buf_sq(bufsize, sqdepth);
-    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
-    size_t gt = src.cols, lt = tileSize;
-    if (!kcols.run(1, &gt, &lt, false))
-        return false;
-
-    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (krows.empty())
-        return false;
-
-    Size sumsize(src_size.width + 1, src_size.height + 1);
-    _sum.create(sumsize, sdepth);
-    UMat sum = _sum.getUMat();
-    _sqsum.create(sumsize, sqdepth);
-    UMat sum_sq = _sqsum.getUMat();
-
-    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
-    gt = src.rows;
-    return krows.run(1, &gt, &lt, false);
-}
-
-#endif
-
-}
-
-#if defined(HAVE_IPP)
-namespace cv
-{
-static bool ipp_integral(
-    int depth, int sdepth, int sqdepth,
-    const uchar* src, size_t srcstep,
-    uchar* sum, size_t sumstep,
-    uchar* sqsum, size_t sqsumstep,
-    uchar* tilted, size_t tstep,
-    int width, int height, int cn)
-{
-    CV_INSTRUMENT_REGION_IPP();
-
-    IppiSize size = {width, height};
-
-    if(cn > 1)
-        return false;
-    if(tilted)
-    {
-        CV_UNUSED(tstep);
-        return false;
-    }
-
-    if(!sqsum)
-    {
-        if(depth == CV_8U && sdepth == CV_32S)
-            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, size, 0) >= 0;
-        else if(depth == CV_8UC1 && sdepth == CV_32F)
-            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size, 0) >= 0;
-        else if(depth == CV_32FC1 && sdepth == CV_32F)
-            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_32f_C1R, (const Ipp32f*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size) >= 0;
-        else
-            return false;
-    }
-    else
-    {
-        if(depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S)
-            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp32s*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
-        else if(depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F)
-            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
-        else if(depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F)
-            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32f64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
-        else
-            return false;
-    }
-}
-}
-#endif
-
-namespace cv { namespace hal {
-
-void integral(int depth, int sdepth, int sqdepth,
-              const uchar* src, size_t srcstep,
-              uchar* sum, size_t sumstep,
-              uchar* sqsum, size_t sqsumstep,
-              uchar* tilted, size_t tstep,
-              int width, int height, int cn)
-{
     CALL_HAL(integral, cv_hal_integral, depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn);
     CV_IPP_RUN_FAST(ipp_integral(depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn));
 
+    if (integral_SIMD(depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn))
+        return;
+
 #define ONE_CALL(A, B, C) integral_<A, B, C>((const A*)src, srcstep, (B*)sum, sumstep, (C*)sqsum, sqsumstep, (B*)tilted, tstep, width, height, cn)
 
     if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F )
@@ -579,14 +405,14 @@ void integral(int depth, int sdepth, int sqdepth,
     else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F )
         ONE_CALL(double, double, double);
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        CV_Error(Error::StsUnsupportedFormat, "");
 
 #undef ONE_CALL
 }
 
-}} // cv::hal::
+} // namespace hal
 
-void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, OutputArray _tilted, int sdepth, int sqdepth )
+void integral(InputArray _src, OutputArray _sum, OutputArray _sqsum, OutputArray _tilted, int sdepth, int sqdepth )
 {
     CV_INSTRUMENT_REGION();
 
@@ -624,20 +450,21 @@ void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, Output
                   src.cols, src.rows, cn);
 }
 
-void cv::integral( InputArray src, OutputArray sum, int sdepth )
+void integral( InputArray src, OutputArray sum, int sdepth )
 {
     CV_INSTRUMENT_REGION();
 
     integral( src, sum, noArray(), noArray(), sdepth );
 }
 
-void cv::integral( InputArray src, OutputArray sum, OutputArray sqsum, int sdepth, int sqdepth )
+void integral( InputArray src, OutputArray sum, OutputArray sqsum, int sdepth, int sqdepth )
 {
     CV_INSTRUMENT_REGION();
 
     integral( src, sum, sqsum, noArray(), sdepth, sqdepth );
 }
 
+} // namespace
 
 CV_IMPL void
 cvIntegral( const CvArr* image, CvArr* sumImage,
diff --git a/modules/imgproc/src/sumpixels.hpp b/modules/imgproc/src/sumpixels.hpp
deleted file mode 100644
index 8d5ab0a851..0000000000
--- a/modules/imgproc/src/sumpixels.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-//
-// Copyright (C) 2019, Intel Corporation, all rights reserved.
-#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP
-#define OPENCV_IMGPROC_SUM_PIXELS_HPP
-
-namespace cv
-{
-
-namespace opt_AVX512_SKX
-{
-#if CV_TRY_AVX512_SKX
-    void calculate_integral_avx512(
-            const uchar *src, size_t _srcstep,
-            double *sum,      size_t _sumstep,
-            double *sqsum,    size_t _sqsumstep,
-            int width, int height, int cn);
-
-#endif
-} // end namespace opt_AVX512_SKX
-} // end namespace cv
-
-#endif
diff --git a/modules/imgproc/src/sumpixels.simd.hpp b/modules/imgproc/src/sumpixels.simd.hpp
index 89337f3507..c8d60a0040 100644
--- a/modules/imgproc/src/sumpixels.simd.hpp
+++ b/modules/imgproc/src/sumpixels.simd.hpp
@@ -10,7 +10,7 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2020 Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
@@ -41,13 +41,26 @@
 //
 //M*/
 
-#include "precomp.hpp"
-#include "opencl_kernels_imgproc.hpp"
 #include "opencv2/core/hal/intrin.hpp"
-#include "sumpixels.hpp"
 
-namespace cv
-{
+#if CV_AVX512_SKX
+#include "sumpixels.avx512_skx.hpp"
+#endif
+
+namespace cv { namespace hal {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+// forward declarations
+bool integral_SIMD(
+        int depth, int sdepth, int sqdepth,
+        const uchar* src, size_t srcstep,
+        uchar* sum, size_t sumstep,
+        uchar* sqsum, size_t sqsumstep,
+        uchar* tilted, size_t tstep,
+        int width, int height, int cn);
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+namespace {
 
 template <typename T, typename ST, typename QT>
 struct Integral_SIMD
@@ -62,7 +75,7 @@ struct Integral_SIMD
     }
 };
 
-
+#if CV_AVX512_SKX
 template <>
 struct Integral_SIMD<uchar, double, double> {
     Integral_SIMD() {};
@@ -74,24 +87,19 @@ struct Integral_SIMD<uchar, double, double> {
                     double *tilted,   size_t _tiltedstep,
                     int width, int height, int cn) const
     {
-#if CV_TRY_AVX512_SKX
         CV_UNUSED(_tiltedstep);
         // TODO:  Add support for 1 channel input (WIP)
-        if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && (cn <= 4)){
-            opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
-                                                      sqsum, _sqsumstep, width, height, cn);
+        if (!tilted && (cn <= 4))
+        {
+            calculate_integral_avx512(src, _srcstep, sum, _sumstep,
+                                      sqsum, _sqsumstep, width, height, cn);
             return true;
         }
-#else
-        // Avoid warnings in some builds
-        CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep);
-        CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep);
-        CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn);
-#endif
         return false;
     }
 
 };
+#endif
 
 #if CV_SIMD && CV_SIMD_WIDTH <= 64
 
@@ -157,8 +165,6 @@ struct Integral_SIMD<uchar, int, double>
             for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                 sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
         }
-        vx_cleanup();
-
         return true;
     }
 };
@@ -226,333 +232,26 @@ struct Integral_SIMD<uchar, float, double>
             for (float v = sum_row[j - 1] - prev_sum_row[j - 1]; j < width; ++j)
                 sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
         }
-        vx_cleanup();
-
         return true;
     }
 };
 
 #endif
 
-template<typename T, typename ST, typename QT>
-void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
-                QT* sqsum, size_t _sqsumstep, ST* tilted, size_t _tiltedstep,
-                int width, int height, int cn )
+} // namespace anon
+
+bool integral_SIMD(
+        int depth, int sdepth, int sqdepth,
+        const uchar* src, size_t srcstep,
+        uchar* sum, size_t sumstep,
+        uchar* sqsum, size_t sqsumstep,
+        uchar* tilted, size_t tstep,
+        int width, int height, int cn)
 {
-    int x, y, k;
+    CV_INSTRUMENT_REGION();
 
-    if (Integral_SIMD<T, ST, QT>()(src, _srcstep,
-                                   sum, _sumstep,
-                                   sqsum, _sqsumstep,
-                                   tilted, _tiltedstep,
-                                   width, height, cn))
-        return;
-
-    int srcstep = (int)(_srcstep/sizeof(T));
-    int sumstep = (int)(_sumstep/sizeof(ST));
-    int tiltedstep = (int)(_tiltedstep/sizeof(ST));
-    int sqsumstep = (int)(_sqsumstep/sizeof(QT));
-
-    width *= cn;
-
-    memset( sum, 0, (width+cn)*sizeof(sum[0]));
-    sum += sumstep + cn;
-
-    if( sqsum )
-    {
-        memset( sqsum, 0, (width+cn)*sizeof(sqsum[0]));
-        sqsum += sqsumstep + cn;
-    }
-
-    if( tilted )
-    {
-        memset( tilted, 0, (width+cn)*sizeof(tilted[0]));
-        tilted += tiltedstep + cn;
-    }
-
-    if( sqsum == 0 && tilted == 0 )
-    {
-        for( y = 0; y < height; y++, src += srcstep - cn, sum += sumstep - cn )
-        {
-            for( k = 0; k < cn; k++, src++, sum++ )
-            {
-                ST s = sum[-cn] = 0;
-                for( x = 0; x < width; x += cn )
-                {
-                    s += src[x];
-                    sum[x] = sum[x - sumstep] + s;
-                }
-            }
-        }
-    }
-    else if( tilted == 0 )
-    {
-        for( y = 0; y < height; y++, src += srcstep - cn,
-                        sum += sumstep - cn, sqsum += sqsumstep - cn )
-        {
-            for( k = 0; k < cn; k++, src++, sum++, sqsum++ )
-            {
-                ST s = sum[-cn] = 0;
-                QT sq = sqsum[-cn] = 0;
-                for( x = 0; x < width; x += cn )
-                {
-                    T it = src[x];
-                    s += it;
-                    sq += (QT)it*it;
-                    ST t = sum[x - sumstep] + s;
-                    QT tq = sqsum[x - sqsumstep] + sq;
-                    sum[x] = t;
-                    sqsum[x] = tq;
-                }
-            }
-        }
-    }
-    else
-    {
-        AutoBuffer<ST> _buf(width+cn);
-        ST* buf = _buf.data();
-        ST s;
-        QT sq;
-        for( k = 0; k < cn; k++, src++, sum++, tilted++, buf++ )
-        {
-            sum[-cn] = tilted[-cn] = 0;
-
-            for( x = 0, s = 0, sq = 0; x < width; x += cn )
-            {
-                T it = src[x];
-                buf[x] = tilted[x] = it;
-                s += it;
-                sq += (QT)it*it;
-                sum[x] = s;
-                if( sqsum )
-                    sqsum[x] = sq;
-            }
-
-            if( width == cn )
-                buf[cn] = 0;
-
-            if( sqsum )
-            {
-                sqsum[-cn] = 0;
-                sqsum++;
-            }
-        }
-
-        for( y = 1; y < height; y++ )
-        {
-            src += srcstep - cn;
-            sum += sumstep - cn;
-            tilted += tiltedstep - cn;
-            buf += -cn;
-
-            if( sqsum )
-                sqsum += sqsumstep - cn;
-
-            for( k = 0; k < cn; k++, src++, sum++, tilted++, buf++ )
-            {
-                T it = src[0];
-                ST t0 = s = it;
-                QT tq0 = sq = (QT)it*it;
-
-                sum[-cn] = 0;
-                if( sqsum )
-                    sqsum[-cn] = 0;
-                tilted[-cn] = tilted[-tiltedstep];
-
-                sum[0] = sum[-sumstep] + t0;
-                if( sqsum )
-                    sqsum[0] = sqsum[-sqsumstep] + tq0;
-                tilted[0] = tilted[-tiltedstep] + t0 + buf[cn];
-
-                for( x = cn; x < width - cn; x += cn )
-                {
-                    ST t1 = buf[x];
-                    buf[x - cn] = t1 + t0;
-                    t0 = it = src[x];
-                    tq0 = (QT)it*it;
-                    s += t0;
-                    sq += tq0;
-                    sum[x] = sum[x - sumstep] + s;
-                    if( sqsum )
-                        sqsum[x] = sqsum[x - sqsumstep] + sq;
-                    t1 += buf[x + cn] + t0 + tilted[x - tiltedstep - cn];
-                    tilted[x] = t1;
-                }
-
-                if( width > cn )
-                {
-                    ST t1 = buf[x];
-                    buf[x - cn] = t1 + t0;
-                    t0 = it = src[x];
-                    tq0 = (QT)it*it;
-                    s += t0;
-                    sq += tq0;
-                    sum[x] = sum[x - sumstep] + s;
-                    if( sqsum )
-                        sqsum[x] = sqsum[x - sqsumstep] + sq;
-                    tilted[x] = t0 + t1 + tilted[x - tiltedstep - cn];
-                    buf[x] = t0;
-                }
-
-                if( sqsum )
-                    sqsum++;
-            }
-        }
-    }
-}
-
-
-#ifdef HAVE_OPENCL
-
-static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
-{
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-
-    if ( (_src.type() != CV_8UC1) ||
-        !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F)))
-        return false;
-
-    static const int tileSize = 16;
-
-    String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s",
-                                ocl::typeToStr(sdepth), tileSize,
-                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
-    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (kcols.empty())
-        return false;
-
-    UMat src = _src.getUMat();
-    Size src_size = src.size();
-    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
-    UMat buf(bufsize, sdepth);
-    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
-    size_t gt = src.cols, lt = tileSize;
-    if (!kcols.run(1, &gt, &lt, false))
-        return false;
-
-    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (krows.empty())
-        return false;
-
-    Size sumsize(src_size.width + 1, src_size.height + 1);
-    _sum.create(sumsize, sdepth);
-    UMat sum = _sum.getUMat();
-
-    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum));
-    gt = src.rows;
-    return krows.run(1, &gt, &lt, false);
-}
-
-static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth )
-{
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-
-    if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) )
-        return false;
-
-    static const int tileSize = 16;
-
-    String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s",
-                                ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth),
-                                tileSize,
-                                doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
-    ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (kcols.empty())
-        return false;
-
-    UMat src = _src.getUMat();
-    Size src_size = src.size();
-    Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize);
-    UMat buf(bufsize, sdepth);
-    UMat buf_sq(bufsize, sqdepth);
-    kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq));
-    size_t gt = src.cols, lt = tileSize;
-    if (!kcols.run(1, &gt, &lt, false))
-        return false;
-
-    ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt);
-    if (krows.empty())
-        return false;
-
-    Size sumsize(src_size.width + 1, src_size.height + 1);
-    _sum.create(sumsize, sdepth);
-    UMat sum = _sum.getUMat();
-    _sqsum.create(sumsize, sqdepth);
-    UMat sum_sq = _sqsum.getUMat();
-
-    krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq));
-    gt = src.rows;
-    return krows.run(1, &gt, &lt, false);
-}
-
-#endif
-
-}
-
-#if defined(HAVE_IPP)
-namespace cv
-{
-static bool ipp_integral(
-    int depth, int sdepth, int sqdepth,
-    const uchar* src, size_t srcstep,
-    uchar* sum, size_t sumstep,
-    uchar* sqsum, size_t sqsumstep,
-    uchar* tilted, size_t tstep,
-    int width, int height, int cn)
-{
-    CV_INSTRUMENT_REGION_IPP();
-
-    IppiSize size = {width, height};
-
-    if(cn > 1)
-        return false;
-    if(tilted)
-    {
-        CV_UNUSED(tstep);
-        return false;
-    }
-
-    if(!sqsum)
-    {
-        if(depth == CV_8U && sdepth == CV_32S)
-            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, size, 0) >= 0;
-        else if(depth == CV_8UC1 && sdepth == CV_32F)
-            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_8u32f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size, 0) >= 0;
-        else if(depth == CV_32FC1 && sdepth == CV_32F)
-            return CV_INSTRUMENT_FUN_IPP(ippiIntegral_32f_C1R, (const Ipp32f*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, size) >= 0;
-        else
-            return false;
-    }
-    else
-    {
-        if(depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S)
-            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp32s*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
-        else if(depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F)
-            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32s64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32s*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
-        else if(depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F)
-            return CV_INSTRUMENT_FUN_IPP(ippiSqrIntegral_8u32f64f_C1R, (const Ipp8u*)src, (int)srcstep, (Ipp32f*)sum, (int)sumstep, (Ipp64f*)sqsum, (int)sqsumstep, size, 0, 0) >= 0;
-        else
-            return false;
-    }
-}
-}
-#endif
-
-namespace cv { namespace hal {
-
-void integral(int depth, int sdepth, int sqdepth,
-              const uchar* src, size_t srcstep,
-              uchar* sum, size_t sumstep,
-              uchar* sqsum, size_t sqsumstep,
-              uchar* tilted, size_t tstep,
-              int width, int height, int cn)
-{
-    CALL_HAL(integral, cv_hal_integral, depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn);
-    CV_IPP_RUN_FAST(ipp_integral(depth, sdepth, sqdepth, src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tstep, width, height, cn));
-
-#define ONE_CALL(A, B, C) integral_<A, B, C>((const A*)src, srcstep, (B*)sum, sumstep, (C*)sqsum, sqsumstep, (B*)tilted, tstep, width, height, cn)
+#define ONE_CALL(T, ST, QT) \
+    return Integral_SIMD<T, ST, QT>()((const T*)src, srcstep, (ST*)sum, sumstep, (QT*)sqsum, sqsumstep, (ST*)tilted, tstep, width, height, cn)
 
     if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F )
         ONE_CALL(uchar, int, double);
@@ -579,89 +278,11 @@ void integral(int depth, int sdepth, int sqdepth,
     else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F )
         ONE_CALL(double, double, double);
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
+        return false;
 
 #undef ONE_CALL
 }
 
+#endif
+CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // cv::hal::
-
-void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, OutputArray _tilted, int sdepth, int sqdepth )
-{
-    CV_INSTRUMENT_REGION();
-
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    if( sdepth <= 0 )
-        sdepth = depth == CV_8U ? CV_32S : CV_64F;
-    if ( sqdepth <= 0 )
-         sqdepth = CV_64F;
-    sdepth = CV_MAT_DEPTH(sdepth), sqdepth = CV_MAT_DEPTH(sqdepth);
-
-    CV_OCL_RUN(_sum.isUMat() && !_tilted.needed(),
-        (_sqsum.needed() ? ocl_integral(_src, _sum, _sqsum, sdepth, sqdepth) : ocl_integral(_src, _sum, sdepth)));
-
-    Size ssize = _src.size(), isize(ssize.width + 1, ssize.height + 1);
-    _sum.create( isize, CV_MAKETYPE(sdepth, cn) );
-    Mat src = _src.getMat(), sum =_sum.getMat(), sqsum, tilted;
-
-    if( _sqsum.needed() )
-    {
-        _sqsum.create( isize, CV_MAKETYPE(sqdepth, cn) );
-        sqsum = _sqsum.getMat();
-    };
-
-    if( _tilted.needed() )
-    {
-        _tilted.create( isize, CV_MAKETYPE(sdepth, cn) );
-        tilted = _tilted.getMat();
-    }
-
-    hal::integral(depth, sdepth, sqdepth,
-                  src.ptr(), src.step,
-                  sum.ptr(), sum.step,
-                  sqsum.ptr(), sqsum.step,
-                  tilted.ptr(), tilted.step,
-                  src.cols, src.rows, cn);
-}
-
-void cv::integral( InputArray src, OutputArray sum, int sdepth )
-{
-    CV_INSTRUMENT_REGION();
-
-    integral( src, sum, noArray(), noArray(), sdepth );
-}
-
-void cv::integral( InputArray src, OutputArray sum, OutputArray sqsum, int sdepth, int sqdepth )
-{
-    CV_INSTRUMENT_REGION();
-
-    integral( src, sum, sqsum, noArray(), sdepth, sqdepth );
-}
-
-
-CV_IMPL void
-cvIntegral( const CvArr* image, CvArr* sumImage,
-            CvArr* sumSqImage, CvArr* tiltedSumImage )
-{
-    cv::Mat src = cv::cvarrToMat(image), sum = cv::cvarrToMat(sumImage), sum0 = sum;
-    cv::Mat sqsum0, sqsum, tilted0, tilted;
-    cv::Mat *psqsum = 0, *ptilted = 0;
-
-    if( sumSqImage )
-    {
-        sqsum0 = sqsum = cv::cvarrToMat(sumSqImage);
-        psqsum = &sqsum;
-    }
-
-    if( tiltedSumImage )
-    {
-        tilted0 = tilted = cv::cvarrToMat(tiltedSumImage);
-        ptilted = &tilted;
-    }
-    cv::integral( src, sum, psqsum ? cv::_OutputArray(*psqsum) : cv::_OutputArray(),
-                  ptilted ? cv::_OutputArray(*ptilted) : cv::_OutputArray(), sum.depth() );
-
-    CV_Assert( sum.data == sum0.data && sqsum.data == sqsum0.data && tilted.data == tilted0.data );
-}
-
-/* End of file. */