From 7a4c5e942121c7ca7ce7c51dd545acaf9dd78ab1 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Wed, 24 Jan 2018 20:26:16 +0800
Subject: [PATCH 1/4] slice layer ocl support

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/slice_layer.cpp | 49 +++++++++++++++
 modules/dnn/src/opencl/slice.cl        | 87 ++++++++++++++++++++++++++
 modules/dnn/test/test_layers.cpp       | 12 +++-
 3 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 modules/dnn/src/opencl/slice.cl

diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 18758b98bf..aba3af8b0d 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -43,6 +43,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -171,11 +172,59 @@ public:
         }
     }
 
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        if (inputs[0].dims < 4)
+            return false;
+
+        const UMat& inpMat = inputs[0];
+        for (size_t i = 0; i < outputs.size(); i++)
+        {
+            int groups = outputs[i].size[0];
+            int channels = outputs[i].size[1];
+            int rows = outputs[i].size[2];
+            int cols = outputs[i].size[3];
+
+            int number = (cols % 8 == 0) ? 8 : ((cols % 4 == 0) ? 4 : 1);
+            String buildopt = format("-DNUM=%d ", number);
+            String kname = format("slice%d", number);
+            ocl::Kernel kernel(kname.c_str(), ocl::dnn::slice_oclsrc, buildopt);
+            size_t global[] = { (size_t)groups * channels, (size_t)rows * cols / number };
+            int idx = 0;
+            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inpMat));
+            kernel.set(idx++, (int)(inpMat.size[2] * inpMat.size[3]));
+            kernel.set(idx++, (int)inpMat.size[3]);
+            kernel.set(idx++, (int)global[0]);
+            kernel.set(idx++, (int)(rows * cols));
+            kernel.set(idx++, (int)cols);
+            kernel.set(idx++, (int)sliceRanges[i][2].start);
+            kernel.set(idx++, (int)sliceRanges[i][3].start);
+            kernel.set(idx++, ocl::KernelArg::PtrWriteOnly(outputs[i]));
+            bool ret = kernel.run(2, global, NULL, false);
+            if (!ret)
+                return false;
+        }
+
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
diff --git a/modules/dnn/src/opencl/slice.cl b/modules/dnn/src/opencl/slice.cl
new file mode 100644
index 0000000000..81a71489be
--- /dev/null
+++ b/modules/dnn/src/opencl/slice.cl
@@ -0,0 +1,87 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+#define Dtype4 float4
+#define Dtype8 float8
+
+#if NUM == 8
+    #define load(src, index) vload8(0, src + index)
+    #define store(vec, dst, index) vstore8(vec, 0, dst + index)
+    #define vec_type Dtype8
+    #define SLICE slice8
+#elif NUM == 4
+    #define load(src, index) vload4(0, src + index)
+    #define store(vec, dst, index) vstore4(vec, 0, dst + index)
+    #define vec_type Dtype4
+    #define SLICE slice4
+#elif NUM == 1
+    #define load(src, index) src[index]
+    #define store(vec, dst, index) dst[index] = vec
+    #define vec_type Dtype
+    #define SLICE slice1
+#endif
+
+__kernel void SLICE(__global const Dtype* src,
+                    const int src_plane_size,
+                    const int src_cols,
+                    const int channels,
+                    const int dst_plane_size,
+                    const int dst_cols,
+                    const int row_offset,
+                    const int col_offset,
+                    __global Dtype* dst)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * NUM;
+
+    if ((x >= channels) || (y >= dst_plane_size))
+        return;
+
+    int row = y / dst_cols + row_offset;
+    int col = y % dst_cols + col_offset;
+
+    int src_index = x * src_plane_size + row * src_cols + col;
+    int dst_index = x * dst_plane_size + y;
+    vec_type val = load(src, src_index);
+    store(val, dst, dst_index);
+}
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 73f7c3594f..a178add7f8 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -367,11 +367,14 @@ OCL_TEST(Layer_Test_PReLU, Accuracy)
 // );
 //}
 
-static void test_Reshape_Split_Slice_layers()
+static void test_Reshape_Split_Slice_layers(int targetId)
 {
     Net net = readNetFromCaffe(_tf("reshape_and_slice_routines.prototxt"));
     ASSERT_FALSE(net.empty());
 
+    net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+    net.setPreferableTarget(targetId);
+
     Mat input(6, 12, CV_32F);
     RNG rng(0);
     rng.fill(input, RNG::UNIFORM, -1, 1);
@@ -384,7 +387,12 @@ static void test_Reshape_Split_Slice_layers()
 
 TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
 {
-    test_Reshape_Split_Slice_layers();
+    test_Reshape_Split_Slice_layers(DNN_TARGET_CPU);
+}
+
+OCL_TEST(Layer_Test_Reshape_Split_Slice, Accuracy)
+{
+    test_Reshape_Split_Slice_layers(DNN_TARGET_OPENCL);
 }
 
 TEST(Layer_Conv_Elu, Accuracy)

From 54c81cbde43d879fbfc9c272a311e3947a327db3 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Thu, 25 Jan 2018 22:15:41 +0800
Subject: [PATCH 2/4] eltwise layer SUM op update

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/eltwise_layer.cpp | 54 +++++++++----
 modules/dnn/src/opencl/eltwise.cl        | 98 ++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 14 deletions(-)
 create mode 100644 modules/dnn/src/opencl/eltwise.cl

diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 40375734d8..27bd7eedd2 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -43,6 +43,7 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
 #include "op_halide.hpp"
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -271,22 +272,47 @@ public:
         switch (op)
         {
             case SUM:
-                if (coeffs.empty()) {
-                    add(inputs[0], inputs[1], outputs[0]);
-                    for (int i = 2; i < inputs.size(); ++i)
-                        add(outputs[0], inputs[i], outputs[0]);
-                }
-                else
-                {
-                    UMat mul0, mul1;
-                    multiply(coeffs[0], inputs[0], mul0);
-                    multiply(coeffs[1], inputs[1], mul1);
-                    add(mul0, mul1, outputs[0]);
-                    for (int i = 2; i < inputs.size(); ++i)
+                int channels = total(shape(outputs[0]), 0, 2);
+                int plane_size = total(shape(outputs[0]), 2);
+                if (channels % 4 == 0 && plane_size % 4 == 0)
                 {
-                        multiply(coeffs[i], inputs[i], mul0);
-                        add(mul0, outputs[0], outputs[0]);
+                    size_t localsize[] = { 128 };
+                    size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
+
+                    for (int i = 0; i < (inputs.size() - 1); ++i)
+                    {
+                        String buildopt = format("-DLOOP=%d", i);
+                        ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
+                        int idx = 0;
+                        UMat inpMat = (i == 0) ? inputs[0] : UMat();
+                        float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
+                        float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
+                        kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
+                        kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1]));
+                        kernel.set(idx++, (int)plane_size);
+                        kernel.set(idx++, (float)coeff1);
+                        kernel.set(idx++, (float)coeff2);
+                        kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
+                        bool ret = kernel.run(1, globalsize, localsize, false);
+                        if (!ret)
+                            return false;
+                    }
+                }
+                else
+                {
+                    float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
+                    float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
+                    UMat mul0, mul1;
+                    multiply(coeff1, inputs[0], mul0);
+                    multiply(coeff2, inputs[1], mul1);
+                    add(mul0, mul1, outputs[0]);
+                    for (int i = 2; i < inputs.size(); ++i)
+                    {
+                        float coeff = coeffs.empty() ? 1.f : coeffs[i];
+                        multiply(coeff, inputs[i], mul0);
+                        add(mul0, outputs[0], outputs[0]);
+                    }
                 }
             }
             break;
diff --git a/modules/dnn/src/opencl/eltwise.cl b/modules/dnn/src/opencl/eltwise.cl
new file mode 100644
index 0000000000..6f3a374f54
--- /dev/null
+++ b/modules/dnn/src/opencl/eltwise.cl
@@ -0,0 +1,98 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+#define Dtype4 float4
+#define Dtype8 float8
+
+__kernel void op_sum4(__global const Dtype * A,
+                      __global const Dtype * B,
+                      unsigned int A_col_size,
+                      const float coeff1,
+                      const float coeff2,
+                      __global Dtype * C)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
+    const __global Dtype *src1_read = B + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = C + row_gid * 4 * A_col_size;
+
+    Dtype4 a0, a1, a2, a3;
+    Dtype4 dot0, dot1, dot2, dot3;
+    unsigned int i = lid;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 b0 = vload4(i, src1_read);
+        const Dtype4 b1 = vload4(i, src1_read + A_col_size);
+        const Dtype4 b2 = vload4(i, src1_read + 2 * A_col_size);
+        const Dtype4 b3 = vload4(i, src1_read + 3 * A_col_size);
+
+#if LOOP == 0
+        a0 = vload4(i, src0_read);
+        a1 = vload4(i, src0_read + A_col_size);
+        a2 = vload4(i, src0_read + 2 * A_col_size);
+        a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 = a0 * coeff1 + b0 * coeff2;
+        dot1 = a1 * coeff1 + b1 * coeff2;
+        dot2 = a2 * coeff1 + b2 * coeff2;
+        dot3 = a3 * coeff1 + b3 * coeff2;
+#else
+        a0 = vload4(i, dst0_read);
+        a1 = vload4(i, dst0_read + A_col_size);
+        a2 = vload4(i, dst0_read + 2 * A_col_size);
+        a3 = vload4(i, dst0_read + 3 * A_col_size);
+
+        dot0 = a0 + b0 * coeff2;
+        dot1 = a1 + b1 * coeff2;
+        dot2 = a2 + b2 * coeff2;
+        dot3 = a3 + b3 * coeff2;
+#endif
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += get_local_size(0);
+    }
+}

From 83b16ab7b71bec486be7a6f013ddfb77653dd0f8 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Thu, 1 Feb 2018 02:09:13 +0800
Subject: [PATCH 3/4] fix extra spaces in build option

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/batch_norm_layer.cpp | 2 +-
 modules/dnn/src/layers/mvn_layer.cpp        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 8acf8b2477..52ce95f57f 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -144,7 +144,7 @@ public:
             UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
             UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
             int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
-            String buildopt = format("-DNUM=%d ", number);
+            String buildopt = format("-DNUM=%d", number);
            String kname = format("batch_norm%d", number);
            ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
            if (kernel.empty())
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index 1d5e12b2b6..a74bc0e14e 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -132,7 +132,7 @@ public:
 
         int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
         size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
-        String buildopt = format("-DNUM=%d ", number);
+        String buildopt = format("-DNUM=%d", number);
         if (normVariance)
         {
             String kname = format("calc_mean%d", number);
@@ -156,9 +156,9 @@ public:
         }
 
         String kname = format("mvn%d", number);
-        buildopt += format("%s %s %s ", (normVariance) ? "-DNORM_VARIANCE" : "",
-                           (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
-                           (fuse_relu) ? "-DFUSE_RELU" : "");
+        buildopt += format("%s%s%s", (normVariance) ? " -DNORM_VARIANCE" : "",
+                           (fuse_batch_norm) ? " -DFUSE_BATCH_NORM" : "",
+                           (fuse_relu) ? " -DFUSE_RELU" : "");
         ocl::Kernel kernel1(kname.c_str(), ocl::dnn::mvn_oclsrc, buildopt);
         if (kernel1.empty())
             return false;

From 6aec71d7ee02955d781ef57126693ec674976076 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Wed, 31 Jan 2018 19:28:58 +0800
Subject: [PATCH 4/4] mvn layer ocl update

It fuses the ocl kernels to reduce the number of kernel enqueues.

Signed-off-by: Li Peng
---
 modules/dnn/src/layers/mvn_layer.cpp |  76 ++++++++++-
 modules/dnn/src/opencl/mvn.cl        | 180 +++++++++++++++++++++++++++
 2 files changed, 251 insertions(+), 5 deletions(-)

diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index a74bc0e14e..c911b741b4 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -93,6 +93,67 @@ public:
     }
 
 #ifdef HAVE_OPENCL
+    bool fast_forward_ocl(std::vector<UMat> &inputs, std::vector<UMat> &outputs)
+    {
+        if( fuse_batch_norm && scale.empty())
+        {
+            bnorm->getScaleShift(scale, shift);
+            bnorm_weight = scale.getUMat(ACCESS_READ);
+            bnorm_bias = shift.getUMat(ACCESS_READ);
+        }
+
+        int splitDim = (acrossChannels) ? 1 : 2;
+        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
+        {
+            UMat &inpMat = inputs[inpIdx];
+            UMat &outMat = outputs[inpIdx];
+            int newRows = total(shape(inpMat), 0, splitDim);
+
+            MatShape s = shape(newRows, inpMat.total() / newRows);
+            UMat oneMat = UMat::ones(s[1], 1, CV_32F);
+            UMat meanMat = UMat(s[0], 1, CV_32F);
+            UMat tmpMat = UMat(s[0], s[1], CV_32F);
+            float alpha = 1.0f / s[1];
+
+            String buildopt = "-DNUM=4";
+            ocl::Kernel k("mean_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
+            size_t localsize[] = { 128 };
+            size_t globalsize[] = { (size_t)s[0] / 4 * localsize[0] };
+
+            int argId = 0;
+            k.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
+            k.set(argId++, (int)s[1]);
+            k.set(argId++, alpha);
+            k.set(argId++, ocl::KernelArg::PtrWriteOnly(meanMat));
+            k.set(argId++, ocl::KernelArg::PtrWriteOnly(tmpMat));
+            k.set(argId++, NULL, localsize[0] * sizeof(cl_float4));
+            bool ret = k.run(1, globalsize, localsize, false);
+            if (!ret)
+                return false;
+
+            buildopt += format(" %s %s", (fuse_batch_norm) ? "-DFUSE_BATCH_NORM" : "",
+                               (fuse_relu) ? "-DFUSE_RELU" : "");
+
+            ocl::Kernel k1("mvn_fuse4", ocl::dnn::mvn_oclsrc, buildopt);
+            argId = 0;
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(tmpMat));
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(inpMat));
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(meanMat));
+            k1.set(argId++, (int)s[1]);
+            k1.set(argId++, (float)alpha);
+            k1.set(argId++, (float)eps);
+            k1.set(argId++, (float)relu_slope);
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
+            k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
+            k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
+            k1.set(argId++, NULL, localsize[0] * sizeof(cl_float4));
+            ret = k1.run(1, globalsize, localsize, false);
+            if (!ret)
+                return false;
+        }
+        return true;
+    }
+
     bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
     {
         std::vector<UMat> inputs;
@@ -101,6 +162,15 @@
         inputs_.getUMatVector(inputs);
         outputs_.getUMatVector(outputs);
 
+        int splitDim = (acrossChannels) ? 1 : 2;
+        int row_size = total(shape(inputs[0]), 0, splitDim);
+        int plane_size = total(shape(inputs[0]), splitDim);
+        if (normVariance && (row_size % 4 == 0) && (plane_size % 4 == 0))
+        {
+            bool ret = fast_forward_ocl(inputs, outputs);
+            return ret;
+        }
+
         if( fuse_batch_norm && scale.empty())
         {
             bnorm->getScaleShift(scale, shift);
@@ -112,11 +182,7 @@
         {
             UMat &inpMat = inputs[inpIdx];
             UMat &outMat = outputs[inpIdx];
-
-            int splitDim = (acrossChannels) ? 1 : 2;
-            int i, newRows = 1;
-            for( i = 0; i < splitDim; i++ )
-                newRows *= inpMat.size[i];
+            int newRows = total(shape(inpMat), 0, splitDim);
 
             MatShape s = shape(newRows, inpMat.total() / newRows);
             UMat oneMat = UMat::ones(s[1], 1, CV_32F);
diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl
index cc059eeb1a..9f8ab574ca 100644
--- a/modules/dnn/src/opencl/mvn.cl
+++ b/modules/dnn/src/opencl/mvn.cl
@@ -50,18 +50,24 @@
     #define vec_type Dtype8
     #define CALC_MEAN calc_mean8
     #define MVN mvn8
+    #define MEAN_FUSE mean_fuse8
+    #define MVN_FUSE mvn_fuse8
 #elif NUM == 4
     #define load(src, index) vload4(0, src + index)
     #define store(vec, dst, index) vstore4(vec, 0, dst + index)
     #define vec_type Dtype4
     #define CALC_MEAN calc_mean4
     #define MVN mvn4
+    #define MEAN_FUSE mean_fuse4
+    #define MVN_FUSE mvn_fuse4
 #elif NUM == 1
     #define load(src, index) src[index]
     #define store(vec, dst, index) dst[index] = vec
     #define vec_type Dtype
     #define CALC_MEAN calc_mean1
     #define MVN mvn1
+    #define MEAN_FUSE mean_fuse1
+    #define MVN_FUSE mvn_fuse1
 #endif
 
 __kernel void CALC_MEAN(__global const Dtype* src,
@@ -128,3 +134,177 @@ __kernel void MVN(__global const Dtype* src,
 
     store(dst_vec, dst, index);
 }
+
+__kernel void MEAN_FUSE(__global const Dtype * A,
+                        unsigned int A_col_size,
+                        float alpha,
+                        __global Dtype4 * result,
+                        __global Dtype * B,
+                        __local Dtype4 * work)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
+    Dtype4 dot0, dot1, dot2, dot3;
+    dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
+
+    unsigned int i = lid;
+    const Dtype4 b0 = (Dtype4)1.f;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src0_read);
+        const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 += a0;
+        dot1 += a1;
+        dot2 += a2;
+        dot3 += a3;
+
+        i += get_local_size(0);
+    }
+
+    work[lid].s0 = dot(dot0, b0);
+    work[lid].s1 = dot(dot1, b0);
+    work[lid].s2 = dot(dot2, b0);
+    work[lid].s3 = dot(dot3, b0);
+
+    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < stride)
+            work[lid] += work[lid+stride];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(lid == 0)
+    {
+        result[row_gid] = alpha * work[0];
+    }
+
+    Dtype4 sum = work[0] * alpha;
+    i = lid;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src0_read);
+        const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 = native_powr(a0 - (Dtype4)sum.x, 2);
+        dot1 = native_powr(a1 - (Dtype4)sum.y, 2);
+        dot2 = native_powr(a2 - (Dtype4)sum.z, 2);
+        dot3 = native_powr(a3 - (Dtype4)sum.w, 2);
+
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += get_local_size(0);
+    }
+}
+
+__kernel void MVN_FUSE(__global const Dtype * tmp,
+                       __global const Dtype * A,
+                       __global const Dtype4 * mean,
+                       unsigned int A_col_size,
+                       const float alpha_val,
+                       const float eps,
+                       const float relu_slope,
+                       __global const Dtype4 * bnorm_weight,
+                       __global const Dtype4 * bnorm_bias,
+                       __global Dtype * B,
+                       __local Dtype4 * work)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
+    const __global Dtype *src1_read = A + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = B + row_gid * 4 * A_col_size;
+    Dtype4 dot0, dot1, dot2, dot3;
+    dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
+
+    unsigned int i = lid;
+    const Dtype4 b0 = (Dtype4)1.f;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src0_read);
+        const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 += a0;
+        dot1 += a1;
+        dot2 += a2;
+        dot3 += a3;
+
+        i += get_local_size(0);
+    }
+
+    work[lid].s0 = dot(dot0, b0);
+    work[lid].s1 = dot(dot1, b0);
+    work[lid].s2 = dot(dot2, b0);
+    work[lid].s3 = dot(dot3, b0);
+
+    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < stride)
+            work[lid] += work[lid+stride];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    Dtype4 mean_val = mean[row_gid];
+    Dtype4 dev_val = sqrt(work[0] * alpha_val) + (Dtype4)eps;
+    Dtype4 alpha = (Dtype4)1.f / dev_val;
+
+    Dtype4 w = (Dtype4)1.f;
+    Dtype4 b = (Dtype4)0.f;
+#ifdef FUSE_BATCH_NORM
+    w = bnorm_weight[row_gid];
+    b = bnorm_bias[row_gid];
+#endif
+
+    i = lid;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src1_read);
+        const Dtype4 a1 = vload4(i, src1_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src1_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src1_read + 3 * A_col_size);
+
+        dot0 = (a0 - (Dtype4)mean_val.x) * alpha.x;
+        dot1 = (a1 - (Dtype4)mean_val.y) * alpha.y;
+        dot2 = (a2 - (Dtype4)mean_val.z) * alpha.z;
+        dot3 = (a3 - (Dtype4)mean_val.w) * alpha.w;
+
+        dot0 = dot0 * w.x + (Dtype4)b.x;
+        dot1 = dot1 * w.y + (Dtype4)b.y;
+        dot2 = dot2 * w.z + (Dtype4)b.z;
+        dot3 = dot3 * w.w + (Dtype4)b.w;
+
+#ifdef FUSE_RELU
+        Dtype4 new0 = dot0 * relu_slope;
+        dot0 = select(new0, dot0, dot0 > (Dtype4)0.f);
+
+        Dtype4 new1 = dot1 * relu_slope;
+        dot1 = select(new1, dot1, dot1 > (Dtype4)0.f);
+
+        Dtype4 new2 = dot2 * relu_slope;
+        dot2 = select(new2, dot2, dot2 > (Dtype4)0.f);
+
+        Dtype4 new3 = dot3 * relu_slope;
+        dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
+#endif
+
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += get_local_size(0);
+    }
+}
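
A note on the dispatch pattern shared by the host-side changes above (slice, batch_norm and mvn): the widest vector width out of 8, 4 and 1 that divides the innermost dimension is chosen at enqueue time, compiled into the program through -DNUM, and the matching specialized kernel (slice8/slice4/slice1, batch_norm%d, mvn%d) is selected; PATCH 3/4 only tightens how those -D options are concatenated. The standalone C++ sketch below shows that selection logic in isolation; it does not use OpenCV, and the KernelChoice/chooseSliceKernel names are made up for illustration.

// Standalone sketch (no OpenCV dependency) of the width-selection and
// build-option pattern used by the slice/batch_norm/mvn host code above.
// The helper names are hypothetical; only the selection logic mirrors the patches.
#include <cstdio>
#include <string>

struct KernelChoice {
    int num;                 // vector width compiled in via -DNUM=
    std::string buildOpt;    // e.g. "-DNUM=4"
    std::string kernelName;  // e.g. "slice4"
};

// Pick the widest of 8/4/1 that divides the innermost dimension, so the
// kernel can use vload8/vload4 or plain scalar loads without remainder handling.
static KernelChoice chooseSliceKernel(int cols)
{
    int num = (cols % 8 == 0) ? 8 : ((cols % 4 == 0) ? 4 : 1);
    KernelChoice c;
    c.num = num;
    c.buildOpt = "-DNUM=" + std::to_string(num);   // no trailing space (see PATCH 3/4)
    c.kernelName = "slice" + std::to_string(num);
    return c;
}

int main()
{
    for (int cols : {64, 28, 13})
    {
        KernelChoice c = chooseSliceKernel(cols);
        std::printf("cols=%d -> %s, build option \"%s\"\n",
                    cols, c.kernelName.c_str(), c.buildOpt.c_str());
    }
    return 0;
}

With cols = 64 the kernel can use vload8/vstore8, with 13 it falls back to the scalar variant; that is also why the global work size in the slice forward_ocl divides rows * cols by the chosen width.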
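
For the eltwise change in PATCH 2/4, each op_sum4 launch updates the running sum in place: the LOOP == 0 variant writes coeff1*A + coeff2*B into the output, and every later variant adds coeff2*B onto what the output already holds. The scalar reference below mirrors the arithmetic of one launch; opSumReference is a hypothetical name and plain float pointers stand in for the UMat buffers, so this is a model of the kernel's math rather than OpenCV code.

// Scalar reference for one op_sum4 launch from PATCH 2/4 (eltwise.cl):
// when LOOP == 0 the kernel writes coeff1*A + coeff2*B, otherwise it
// accumulates coeff2*B onto what is already stored in C.
#include <cstddef>

void opSumReference(const float* A, const float* B, float* C, size_t n,
                    float coeff1, float coeff2, bool firstPass /* LOOP == 0 */)
{
    for (size_t j = 0; j < n; ++j)
    {
        if (firstPass)
            C[j] = coeff1 * A[j] + coeff2 * B[j];
        else
            C[j] = C[j] + coeff2 * B[j];   // A is ignored; C holds the running sum
    }
}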
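
The two kernels added in PATCH 4/4 split MVN into a mean/squared-deviation pass (mean_fuse4) and a normalization pass (mvn_fuse4) that also folds in the batch-norm scale/shift and the leaky-ReLU slope when those are fused. The scalar reference below reproduces that arithmetic per row, assuming contiguous row-major float data and per-row weight/bias; it ignores the 4-rows-per-work-group layout and the local-memory reduction, and mvnFuseReference is a hypothetical name.

// Scalar reference (assumed row-major float buffers, one "row" per normalization
// group) for the fused MVN path of PATCH 4/4: compute the row mean, the variance
// from squared deviations, then normalise with 1/(sqrt(var)+eps), apply the
// optional folded batch-norm scale/shift and a leaky ReLU.
#include <cmath>
#include <vector>

void mvnFuseReference(const std::vector<float>& src, int rows, int cols,
                      float eps, float reluSlope,
                      const std::vector<float>& weight,  // per-row scale (empty => 1)
                      const std::vector<float>& bias,    // per-row shift (empty => 0)
                      std::vector<float>& dst)
{
    dst.resize(src.size());
    const float alpha = 1.f / cols;
    for (int r = 0; r < rows; ++r)
    {
        const float* x = &src[(size_t)r * cols];
        float* y = &dst[(size_t)r * cols];

        float mean = 0.f;                       // pass 1: mean_fuse
        for (int c = 0; c < cols; ++c) mean += x[c];
        mean *= alpha;

        float var = 0.f;                        // sum of squared deviations
        for (int c = 0; c < cols; ++c) var += (x[c] - mean) * (x[c] - mean);
        const float scale = 1.f / (std::sqrt(var * alpha) + eps);

        const float w = weight.empty() ? 1.f : weight[r];
        const float b = bias.empty()   ? 0.f : bias[r];
        for (int c = 0; c < cols; ++c)          // pass 2: mvn_fuse
        {
            float v = (x[c] - mean) * scale * w + b;
            y[c] = (v > 0.f) ? v : v * reluSlope;   // FUSE_RELU path
        }
    }
}

Such a reference is handy when checking the OCL path against the CPU fallback, since the fused path only triggers when both the row count and the plane size are multiples of 4.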