From d0e612dc368d657ca56101879dbc58c46a4dbb9b Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Fri, 3 Sep 2021 15:32:29 +0300
Subject: [PATCH 01/15] Merge pull request #20647 from
 rogday:resize_concat_optimization

Fix resize+concat optimization

* fix resize+concat optimization

* add comment and fix indentation
---
 modules/dnn/src/layers/resize_layer.cpp | 7 +++++++
 modules/dnn/test/test_tf_importer.cpp   | 5 +++++
 2 files changed, 12 insertions(+)
diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp
index 40c7351984..8b7d802ab2 100644
--- a/modules/dnn/src/layers/resize_layer.cpp
+++ b/modules/dnn/src/layers/resize_layer.cpp
@@ -111,7 +111,14 @@ public:
         internals_arr.getMatVector(internals);
 
         if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3])
+        {
+            // outputs[0] = inputs[0] doesn't work due to BlobManager optimizations
+            if (inputs[0].data != outputs[0].data)
+            {
+                inputs[0].copyTo(outputs[0]);
+            }
             return;
+        }
 
         Mat& inp = inputs[0];
         Mat& out = outputs[0];
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 68d6e88a66..1a2b976eb8 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -1125,6 +1125,11 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_down)
     runTensorFlowNet("resize_bilinear_down");
 }
 
+TEST_P(Test_TensorFlow_layers, resize_concat_optimization)
+{
+    runTensorFlowNet("resize_concat_optimization");
+}
+
 TEST_P(Test_TensorFlow_layers, tf2_dense)
 {
     runTensorFlowNet("tf2_dense");

From 407adc7061c9d2126a5d27c53ff76b56a705f3e2 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sat, 4 Sep 2021 04:35:00 +0000
Subject: [PATCH 02/15] dnn(ocl): fix buffer offsets in IDLF kernel

- drop CreateSubBuffer
- fix FUSED_CONV_ELTWISE mode
---
 modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp   |  2 +-
 .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp  | 86 +++++--------------
 modules/dnn/src/opencl/conv_layer_spatial.cl  | 25 ++++--
 3 files changed, 41 insertions(+), 72 deletions(-)

diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index 7bb277d102..d6fb83becb 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -269,7 +269,7 @@ class OCL4DNNConvSpatial
         void generate_idlf_tuneritems(std::vector< cv::Ptr<tunerParam> > &tunerItems,
                                       int blockM, int blockK, int simd_size);
         void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
-        void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
+        void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx);
 
         int32_t group_;
         bool bias_term_;
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 517a663e46..b4477ebfc4 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -270,17 +270,21 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
 }
 
 template<typename Dtype>
-void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
+void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx)
 {
     if (fused_eltwise)
-        kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
+    {
+        kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_));
+        if (fused_eltwise_offset >= 0)
+            kernel.set(argIdx++, fused_eltwise_offset);
+    }
 
     switch (fused_activ) {
         case OCL4DNN_CONV_FUSED_ACTIV_RELU:
             kernel.set(argIdx++, (float)negative_slope_);
             break;
         case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
-            kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_));
             break;
         case OCL4DNN_CONV_FUSED_ACTIV_POWER:
             kernel.set(argIdx++, (float)power_);
@@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
     if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
         if (!swizzleWeight(weight, config->workItem_output[2], false))
             return false;
+#if 0
         size_t total_bottom_size = bottom_dim_ * numImages;
         size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
         size_t total_bias_size = M_ * group_;
         size_t total_top_size = top_dim_ * numImages;
+#endif
         for (int32_t g = 0; g < group_; ++g) {
             bias_offset = M_ * g;
             int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
@@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
 
             cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+            setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
 
-            UMat img_buffer;
-            if (image_offset)
-            {
-                CreateSubBuffer(bottom, img_buffer, image_offset,
-                                total_bottom_size - image_offset, false);
-                if (img_buffer.empty())
-                    return false;
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            kernel.set(argIdx++, image_offset);
 
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
+            kernel.set(argIdx++, kernel_offset);
 
-            UMat kernel_buffer;
-            if (kernel_offset)
-            {
-                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
-                                total_kernel_size - kernel_offset, false);
-                if (kernel_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
-            }
-
-            UMat bias_buffer;
             if (bias_term_)
             {
-                if (bias_offset)
-                {
-                    CreateSubBuffer(bias, bias_buffer, bias_offset,
-                                    total_bias_size - bias_offset, false);
-                    if (bias_buffer.empty())
-                        return false;
-
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
-                }
-                else
-                {
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
-                }
+                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+                kernel.set(argIdx++, bias_offset);
             }
 
-            UMat out_buffer;
-            if (output_image_offset)
-            {
-                CreateSubBuffer(top, out_buffer, output_image_offset,
-                                total_top_size - output_image_offset, true);
-                if (out_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
-                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-                kernel.set(argIdx++, (int)(top.offset / element_size));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
 
             kernel.set(argIdx++, (uint16_t)width_);
             kernel.set(argIdx++, (uint16_t)height_);
@@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
 
             cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+            setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
 
             UMat img_buffer;
             if (image_offset)
@@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             return false;
 
         cl_uint argIdx = 0;
-        setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+        setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
         kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
         kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
         if (bias_term_)
@@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                     return false;
 
                 cl_uint argIdx = 0;
-                setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
+                setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
                 kernel.set(argIdx++, image_offset);
                 kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index 236e8d029a..55015557a0 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -74,18 +74,22 @@
     (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
 } while(0)
 #define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
+#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset,
 #else
 #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
     const Dtype _x_ = (_data_); \
     (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
 } while(0)
 #define ELTWISE_DATA_ARG
+#define ELTWISE_DATA_ARG_WITH_OFFSET
 #endif
 
 #if APPLY_BIAS
 #define BIAS_KERNEL_ARG __global Dtype * biases_base,
+#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset,
 #else
 #define BIAS_KERNEL_ARG
+#define BIAS_KERNEL_ARG_WITH_OFFSET
 #endif
 
 #define __CAT(x, y) x##y
@@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
 __kernel void
 convolve_simd(
-    ELTWISE_DATA_ARG
+    ELTWISE_DATA_ARG_WITH_OFFSET
     FUSED_ARG
-    __global Dtype* inputs,
-    __global Dtype* weights,
-    BIAS_KERNEL_ARG
-    __global Dtype* outputs_base,
-    const int outputs_offset,
+    __global Dtype* inputs_ptr, const int inputs_offset,
+    __global Dtype* weights_ptr, const int weights_offset,
+    BIAS_KERNEL_ARG_WITH_OFFSET
+    __global Dtype* outputs_base, const int outputs_offset,
     const ushort input_width,
     const ushort input_height,
     const ushort output_width,
     const ushort output_height)
 {
+  __global Dtype* inputs = inputs_ptr + inputs_offset;
+  __global Dtype* weights = weights_ptr + weights_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
   __global Dtype* outputs = outputs_base + outputs_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
   unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
   unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
   unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth

From aaff1256088ce892f977c73ff01817c2e389ac93 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sat, 4 Sep 2021 01:34:02 +0000
Subject: [PATCH 03/15] core(ocl): debug capabilities

---
 modules/core/src/ocl.cpp | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 0c14e7f3e0..daf4fcd280 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -76,8 +76,11 @@
 #undef CV__ALLOCATOR_STATS_LOG
 
 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
+#define CV_OPENCL_SHOW_BUILD_OPTIONS             0
+#define CV_OPENCL_SHOW_BUILD_KERNELS             0
 
 #define CV_OPENCL_SHOW_RUN_KERNELS               0
+#define CV_OPENCL_SYNC_RUN_KERNELS               0
 #define CV_OPENCL_TRACE_CHECK                    0
 
 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
@@ -2983,6 +2986,8 @@ static cv::String dumpValue(size_t sz, const void* p)
 {
     if (!p)
         return "NULL";
+    if (sz == 2)
+        return cv::format("%d / %uu / 0x%04x", *(short*)p, *(unsigned short*)p, *(short*)p);
     if (sz == 4)
         return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
     if (sz == 8)
@@ -3195,6 +3200,10 @@ bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
         return false;  // OpenCV 5.0: raise error
     }
 
+#if CV_OPENCL_SYNC_RUN_KERNELS
+    sync = true;
+#endif
+
     cl_command_queue qq = getQueue(q);
     if (haveTempDstUMats)
         sync = true;
@@ -3625,7 +3634,28 @@ struct Program::Impl
             if (!param_buildExtraOptions.empty())
                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
         }
+#if CV_OPENCL_SHOW_BUILD_OPTIONS
+        CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' options:" << buildflags);
+#endif
         compile(ctx, src_, errmsg);
+#if CV_OPENCL_SHOW_BUILD_KERNELS
+        if (handle)
+        {
+            size_t retsz = 0;
+            char kernels_buffer[4096] = {0};
+            cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
+            CV_OCL_DBG_CHECK_RESULT(result, cv::format("clGetProgramInfo(CL_PROGRAM_KERNEL_NAMES: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
+            if (result == CL_SUCCESS && retsz < sizeof(kernels_buffer))
+            {
+                kernels_buffer[retsz] = 0;
+                CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' kernels: '" << kernels_buffer << "'");
+            }
+            else
+            {
+                CV_LOG_ERROR(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' can't retrieve kernel names!");
+            }
+        }
+#endif
     }
 
     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
@@ -3857,7 +3887,6 @@ struct Program::Impl
                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
             }
 #endif
-
         }
         return handle != NULL;
     }

From 5b2c0168340a8b4e67def8f52885ab21b4bbcadc Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 3 Sep 2021 02:38:53 +0000
Subject: [PATCH 04/15] dnn(ocl): avoid out of buffer access in
 copyWeightsSwizzled

---
 modules/dnn/src/opencl/conv_spatial_helper.cl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/modules/dnn/src/opencl/conv_spatial_helper.cl b/modules/dnn/src/opencl/conv_spatial_helper.cl
index 33d9db57c8..660d085956 100644
--- a/modules/dnn/src/opencl/conv_spatial_helper.cl
+++ b/modules/dnn/src/opencl/conv_spatial_helper.cl
@@ -62,8 +62,8 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
   //Original location
 
   //Output location
-  int outputSublayer = channels / swizzleFactor;
-  int outputSublayerIndex = channels % swizzleFactor;
+  //int outputSublayer = channels / swizzleFactor;
+  //int outputSublayerIndex = channels % swizzleFactor;
 
   int filter = sX / (kernel_w*kernel_h*channels);
   int kernel_X = sX % kernel_w;
@@ -73,6 +73,10 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
   int FP = filter / swizzleFactor;
   int F1 = filter % swizzleFactor;
 
-  weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
-  = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
+  int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1;
+  int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X;
+
+  // idxIn is not valid if (filter >= outputs) - no data for these elements. Output alignment gaps are filled by zeros
+  Dtype v = (filter < outputs) ? weightIn[idxIn] : (Dtype)0;
+  weightOut[idxOut] = v;
 }

From 5578ad5e14fe1fcf4d7171f74c665db2a578187f Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Sat, 4 Sep 2021 16:27:51 +0000
Subject: [PATCH 05/15] dnn(ocl): fix automatic globalsize adjusting

- if kernel code doesn't support that
---
 modules/core/include/opencv2/core/ocl.hpp     | 20 +++++++++++++++++--
 modules/core/src/ocl.cpp                      |  8 ++++++++
 modules/dnn/src/layers/batch_norm_layer.cpp   |  2 +-
 modules/dnn/src/layers/mvn_layer.cpp          |  2 +-
 modules/dnn/src/layers/slice_layer.cpp        |  2 +-
 .../dnn/src/ocl4dnn/src/math_functions.cpp    |  1 +
 .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp  | 19 +++++++++---------
 .../dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp   |  2 +-
 modules/dnn/src/opencl/gemm_image.cl          | 10 ++++++++++
 9 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index f03de180fc..b51b39359d 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -562,7 +562,9 @@ public:
         i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11);
         i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this;
     }
-    /** @brief Run the OpenCL kernel.
+
+    /** @brief Run the OpenCL kernel (globalsize value may be adjusted)
+
     @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
     @param globalsize work items for each dimension. It is not the final globalsize passed to
       OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
@@ -571,12 +573,26 @@ public:
     @param localsize work-group size for each dimension.
     @param sync specify whether to wait for OpenCL computation to finish before return.
     @param q command queue
+
+    @note Use run_() if your kernel code doesn't support adjusted globalsize.
     */
     bool run(int dims, size_t globalsize[],
              size_t localsize[], bool sync, const Queue& q=Queue());
+
+    /** @brief Run the OpenCL kernel
+     *
+     * @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
+     * @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
+     * @param localsize work-group size for each dimension.
+     * @param sync specify whether to wait for OpenCL computation to finish before return.
+     * @param q command queue
+     */
+    bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
+
     bool runTask(bool sync, const Queue& q=Queue());
 
-    /** @brief Similar to synchronized run() call with returning of kernel execution time
+    /** @brief Similar to synchronized run_() call with returning of kernel execution time
+     *
      * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
      * @return Execution time in nanoseconds or negative number on error
      */
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index daf4fcd280..a550c1d91a 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
 }
 
 
+bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[],
+                  bool sync, const Queue& q)
+{
+    CV_Assert(p);
+    return p->run(dims, _globalsize, _localsize, sync, NULL, q);
+}
+
+
 static bool isRaiseErrorOnReuseAsyncKernel()
 {
     static bool initialized = false;
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 42676c7938..dcb4005975 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -231,7 +231,7 @@ public:
                 kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
                 kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
                 kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
-                bool ret = kernel.run(2, global, NULL, false);
+                bool ret = kernel.run_(2, global, NULL, false);
                 if (!ret)
                     return false;
             }
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index 8f06216df1..de2b0d5690 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -191,7 +191,7 @@ public:
             k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight));
             k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias));
             k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat));
-            ret = k1.run(1, globalsize, localsize, false);
+            ret = k1.run_(1, globalsize, localsize, false);
             if (!ret)
                 return false;
         }
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 507964edf9..16f1958879 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -482,7 +482,7 @@ public:
                     ocl::KernelArg::PtrReadOnly(input),
                     ocl::KernelArg::PtrWriteOnly(output)
                 )
-                .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
+                .run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false);
             if (!ret)
                 return false;
         }  // for outputs.size()
diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
index 855a21e08f..c924d66b12 100644
--- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp
+++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp
@@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
                 .args(
                     ocl::KernelArg::PtrReadOnly(buffer),
                     image, offset,
+                    padded_width, padded_height,
                     width, height,
                     ld)
                 .run(2, global_copy, NULL, false);
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index b4477ebfc4..3b73da801c 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
             swizzled_factor
         );
 
-        size_t global_work_size_copy[3] = {
-            (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
+        size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) };
 
-        if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
+        if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false))
         {
-            std::cout << "Swizzle kernel run failed." << std::endl;
+            CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed");
             return false;
         }
     } else {
@@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             kernel.set(argIdx++, (uint16_t)height_);
             kernel.set(argIdx++, (uint16_t)output_w_);
             kernel.set(argIdx++, (uint16_t)output_h_);
-            if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
+            if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
             {
                 std::cout << "IDLF kernel run failed." << std::endl;
                 return false;
@@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
             gy = alignSize(gy, blockK);
             size_t global_size[3] = { gx, gy, config->global_work_size[2] };
 
-            if (!kernel.run(3, global_size, config->local_work_size, false))
+            if (!kernel.run_(3, global_size, config->local_work_size, false))
             {
                 std::cout << "GEMM like kernel run failed." << std::endl;
                 return false;
@@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
         global_size[1] = output_h_;
         global_size[2] = num_output_ * num_;
 
-        if (!kernel.run(3, global_size, NULL, false))
+        if (!kernel.run_(3, global_size, NULL, false))
         {
-            std::cout << "DWCONV kernel run failed." << std::endl;
+            CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed");
             return false;
         }
     } else {
@@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 kernel.set(argIdx++, (uint16_t)output_h_);
                 kernel.set(argIdx++, (uint16_t)pad_w_);
                 kernel.set(argIdx++, (uint16_t)pad_h_);
-                if (!kernel.run(3, config->global_work_size,
+                if (!kernel.run_(3, config->global_work_size,
                                 (config->use_null_local) ? NULL : config->local_work_size,
                                 false))
                 {
-                    std::cout << "Basic kernel run failed." << std::endl;
+                    CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed");
                     return false;
                 }
             }
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
index 78576711a7..7b32189fdc 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp
@@ -127,7 +127,7 @@ bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
             oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
             oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
         }
-        ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
+        ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false);
     }
     return ret;
 }
diff --git a/modules/dnn/src/opencl/gemm_image.cl b/modules/dnn/src/opencl/gemm_image.cl
index 710637a093..f6e0020d82 100644
--- a/modules/dnn/src/opencl/gemm_image.cl
+++ b/modules/dnn/src/opencl/gemm_image.cl
@@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
 {
     const int gidx = get_global_id(0);
     const int gidy = get_global_id(1);
+
+    if (gidx >= width || gidy >= height)
+        return;
+
     int2 coord_dst = (int2)(gidx, gidy);
     __global Dtype* A_off = A + offA;
     Dtype srcA = A_off[gidy * ldA + gidx];
@@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
     __global Dtype* A,
     __write_only image2d_t ImA,
     int offA,
+    int padded_width,
+    int padded_height,
     int width,
     int height,
     int ldA)
 {
     const int gidx = get_global_id(0);
     const int gidy = get_global_id(1);
+
+    if (gidx >= padded_width || gidy >= padded_height)
+        return;
+
     int2 coord_dst = (int2)(gidx, gidy);
 #if TYPE == TYPE_HALF
     if (gidx >= width || gidy >= height) {

From 36cc43170d9e1e244d72c25d69cef8d716599a10 Mon Sep 17 00:00:00 2001
From: Zhuo Zhang <imzhuo@foxmail.com>
Date: Mon, 6 Sep 2021 12:03:59 +0800
Subject: [PATCH 06/15] docs: fix image path for py_fast doc

---
 doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown b/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown
index 4ce61370bd..b1b8a81ca8 100644
--- a/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown
+++ b/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown
@@ -98,7 +98,7 @@ import numpy as np
 import cv2 as cv
 from matplotlib import pyplot as plt
 
-img = cv.imread('simple.jpg',0)
+img = cv.imread('blox.jpg',0) # `<opencv_root>/samples/data/blox.jpg`
 
 # Initiate FAST object with default values
 fast = cv.FastFeatureDetector_create()
@@ -113,17 +113,17 @@ print( "nonmaxSuppression:{}".format(fast.getNonmaxSuppression()) )
 print( "neighborhood: {}".format(fast.getType()) )
 print( "Total Keypoints with nonmaxSuppression: {}".format(len(kp)) )
 
-cv.imwrite('fast_true.png',img2)
+cv.imwrite('fast_true.png', img2)
 
 # Disable nonmaxSuppression
 fast.setNonmaxSuppression(0)
-kp = fast.detect(img,None)
+kp = fast.detect(img, None)
 
 print( "Total Keypoints without nonmaxSuppression: {}".format(len(kp)) )
 
 img3 = cv.drawKeypoints(img, kp, None, color=(255,0,0))
 
-cv.imwrite('fast_false.png',img3)
+cv.imwrite('fast_false.png', img3)
 @endcode
 See the results. First image shows FAST with nonmaxSuppression and second one without
 nonmaxSuppression:

From 35e824c28710ca13b2770dc9d547bfc352293dc5 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Mon, 6 Sep 2021 05:51:37 +0300
Subject: [PATCH 07/15] dnn(ocl): fix out of bound access in GEMM-like kernels

- dropped usage of CreateSubBuffer() - buffers lifetime management issue
- fixed elementwise offset
- avoid out of bounds read access
---
 .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp  | 73 ++++-------------
 modules/dnn/src/opencl/conv_layer_spatial.cl  | 79 ++++++++++++++++---
 2 files changed, 80 insertions(+), 72 deletions(-)

diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 3b73da801c..5eee1da4a0 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -945,9 +945,11 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
     } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
         if (!swizzleWeight(weight, config->workItem_output[1], true))
             return false;
+#if 0
         size_t total_bottom_size = bottom_dim_ * numImages;
         size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
         size_t total_bias_size = M_ * group_;
+#endif
         size_t total_top_size = top_dim_ * numImages;
         for (int32_t g = 0; g < group_; ++g) {
             bias_offset = M_ * g;
@@ -960,72 +962,25 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
                 return false;
 
             cl_uint argIdx = 0;
-            setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx);
+            setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx);
 
-            UMat img_buffer;
-            if (image_offset)
-            {
-                CreateSubBuffer(bottom, img_buffer, image_offset,
-                                total_bottom_size - image_offset, false);
-                if (img_buffer.empty())
-                    return false;
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+            kernel.set(argIdx++, (int)image_offset);
+            kernel.set(argIdx++, (int)(bottom.total() - image_offset));
 
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
+            kernel.set(argIdx++, (int)kernel_offset);
+            kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset));
 
-            UMat kernel_buffer;
-            if (kernel_offset)
-            {
-                CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
-                                total_kernel_size - kernel_offset, false);
-                if (kernel_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
-            }
-
-            UMat bias_buffer;
             if (bias_term_)
             {
-                if (bias_offset)
-                {
-                    CreateSubBuffer(bias, bias_buffer, bias_offset,
-                                    total_bias_size - bias_offset, false);
-                    if (bias_buffer.empty())
-                        return false;
-
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
-                }
-                else
-                {
-                    kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
-                }
+                kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+                kernel.set(argIdx++, (int)bias_offset);
             }
 
-            UMat out_buffer;
-            if (output_image_offset)
-            {
-                CreateSubBuffer(top, out_buffer, output_image_offset,
-                                total_top_size - output_image_offset, true);
-                if (out_buffer.empty())
-                    return false;
-
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
-                kernel.set(argIdx++, (int)(out_buffer.offset / element_size));
-            }
-            else
-            {
-                kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
-                kernel.set(argIdx++, (int)(top.offset / element_size));
-            }
+            kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+            kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset);
+            kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size));
 
             kernel.set(argIdx++, (uint16_t)width_);
             kernel.set(argIdx++, (uint16_t)height_);
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index 55015557a0..e7bbacd4c4 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -401,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 #define ROW_PITCH input_width
 
 #define GEMM_LIKE_KERNEL_ARGS     \
-    ELTWISE_DATA_ARG              \
+    ELTWISE_DATA_ARG_WITH_OFFSET  \
     FUSED_ARG                     \
-    const __global Dtype *src0,   \
-    const __global Dtype *src1,   \
-    BIAS_KERNEL_ARG               \
-    __global Dtype *dst_base,     \
-    const int dst_offset,         \
+    const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \
+    const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \
+    BIAS_KERNEL_ARG_WITH_OFFSET   \
+    __global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \
     const ushort input_width,     \
     const ushort input_height,    \
     const ushort output_width,    \
@@ -437,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -460,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
     }
     typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
 
+// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds)
+// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1)
+#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+  #define OPTIMIZE_READ 1
+#else
+  #define OPTIMIZE_READ 0
+#endif
+
     // True for all threads if filter_width is multiple of TILE_N
     // else, true for all but right-most column of threads.
     if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
@@ -476,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         // atile is M rows x K columns.
         int curr_x = ( global_y % output_width ) * STRIDE_X;
         int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
         int saved_y = curr_y;
 #endif
         const __global Dtype *src0_read = src0
@@ -496,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         do
         {
             int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
             curr_y = saved_y;
 #endif
 
@@ -514,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
                 // ...
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
 
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if OPTIMIZE_READ
   #if KERNEL_WIDTH == 3
                 Dtype_t blockA00 = vload3(0, src0_read);
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
   #else
+    #if 0 // debug
+                if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH)
+                {
+                    printf("CATCH: src0_read-src0: %d   limit=%d   curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x);
+                }
+    #endif
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
   #endif
@@ -639,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         // atile is M rows x K columns.
         int curr_x = ( global_y % output_width ) * STRIDE_X;
         int curr_y = ( global_y / output_width ) * STRIDE_Y;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
         int saved_y = curr_y;
 #endif
         const __global Dtype *src0_read = src0
@@ -659,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
         do
         {
             int patch_row = 0;
-#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
+#if !OPTIMIZE_READ
             curr_y = saved_y;
 #endif
             do
             {
                 // Load atile and interleaved btile.
                 const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
-#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
+#if OPTIMIZE_READ
                 Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
                 Dtype*  pblockA00 = (Dtype*)(&blockA00);
 #else
@@ -803,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
             }
         }
     }
-#endif
+#endif  // TILE_N_LAST > 0
 }
 #endif
 #ifdef GEMM_LIKE_CONV_32_2
@@ -826,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(8)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1388,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);
@@ -1574,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
+    const __global Dtype *src0 = src0_ptr + src0_offset;
+    const __global Dtype *src1 = src1_ptr + src1_offset;
+#if APPLY_BIAS
+  __global Dtype* biases_base = biases_base_ptr + biases_base_offset;
+#endif
+
     __global Dtype *dst = dst_base + dst_offset;
+#ifdef FUSED_CONV_ELTWISE
+  __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset;
+#endif
+
     const int group_x = get_group_id(0);
     const int group_y = get_group_id(1);
     const int global_x = get_global_id(0);

From 4807cd8a6eeaa494a76df435679ccc3b9e6545e6 Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Thu, 9 Sep 2021 14:41:40 +0300
Subject: [PATCH 08/15] Merge pull request #20605 from
 rogday:split_slice_shenanigans

Add Normalize subgraph, fix Slice, Mul and Expand

* Add Normalize subgraph, support for starts<0 and axis<0 in Slice, Mul broadcasting in the middle and fix Expand's unsqueeze

* remove todos

* remove range-based for loop

* address review comments

* change >> to > > in template

* fix indexation

* fix expand that does nothing
---
 modules/dnn/src/layers/slice_layer.cpp        |  59 ++++++--
 .../dnn/src/onnx/onnx_graph_simplifier.cpp    |  22 +++
 modules/dnn/src/onnx/onnx_importer.cpp        | 129 ++++++++++++++----
 modules/dnn/test/test_onnx_importer.cpp       |   6 +
 4 files changed, 173 insertions(+), 43 deletions(-)

diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 16f1958879..a470772813 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -58,6 +58,31 @@ namespace cv
 namespace dnn
 {
 
+void sliceRangesFromShape(const MatShape& inpShape, int& axis, std::vector<std::vector<cv::Range> >& sliceRanges)
+{
+    CV_Assert(inpShape.size() > 0);
+    bool axisNeg = (axis < 0);
+    axis = (axis + static_cast<int>(inpShape.size())) % inpShape.size();
+    int n = inpShape[axis];
+
+    for (size_t i = 0; i < sliceRanges.size(); ++i){
+        std::vector<Range>& ranges = sliceRanges[i];
+        if (axisNeg)
+        {
+            ranges.insert(ranges.begin(), axis, Range::all());
+        }
+        Range& range = ranges.back();
+
+        if (range.start >= 0)
+        {
+            continue;
+        }
+
+        CV_Assert(n != 0);
+        range.start = (n + range.start) % n;
+    }
+}
+
 class SliceLayerImpl : public SliceLayer
 {
 public:
@@ -69,20 +94,22 @@ public:
         num_split = params.get<int>("num_split", 0);
         hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
         shapesInitialized = !hasDynamicShapes;
+
         if (params.has("slice_point"))
         {
             CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end"));
             const DictValue &indicesValue = params.get("slice_point");
+            int size = axis > 0 ? axis + 1 : 1;
             sliceRanges.resize(indicesValue.size() + 1,
-                               std::vector<Range>(axis + 1, Range::all()));
+                               std::vector<Range>(size, Range::all()));
             int prevSlice = 0;
             for (int i = 0; i < indicesValue.size(); ++i)
             {
-                sliceRanges[i][axis].start = prevSlice;
-                sliceRanges[i][axis].end = indicesValue.get<int>(i);
-                prevSlice = sliceRanges[i][axis].end;
+                sliceRanges[i][size - 1].start = prevSlice;
+                sliceRanges[i][size - 1].end = indicesValue.get<int>(i);
+                prevSlice = sliceRanges[i][size - 1].end;
             }
-            sliceRanges.back()[axis].start = prevSlice;
+            sliceRanges.back()[size - 1].start = prevSlice;
         }
         else if (params.has("begin"))
         {
@@ -97,7 +124,6 @@ public:
             {
                 int start = begins.get<int>(i);
                 int sizeOrEnd = sizesOrEnds.get<int>(i);  // It may be negative to reverse indexation.
-                CV_Assert(start >= 0);
 
                 sliceRanges[0][i].start = start;
                 if (params.has("size"))
@@ -154,16 +180,20 @@ public:
         CV_Assert(inputs.size() == 1);
         MatShape inpShape = inputs[0];
 
-        if (!sliceRanges.empty())
+        int axis_rw = axis;
+        std::vector<std::vector<cv::Range> > sliceRanges_rw = sliceRanges;
+        sliceRangesFromShape(inpShape, axis_rw, sliceRanges_rw);
+
+        if (!sliceRanges_rw.empty())
         {
-            outputs.resize(sliceRanges.size(), inpShape);
+            outputs.resize(sliceRanges_rw.size(), inpShape);
             for (int i = 0; i < outputs.size(); ++i)
             {
-                CV_Assert(sliceRanges[i].size() <= inpShape.size());
-                for (int j = 0; j < sliceRanges[i].size(); ++j)
+                CV_Assert(sliceRanges_rw[i].size() <= inpShape.size());
+                for (int j = 0; j < sliceRanges_rw[i].size(); ++j)
                 {
                     if (shapesInitialized || inpShape[j] > 0)
-                        outputs[i][j] = normalize_axis_range(sliceRanges[i][j], inpShape[j]).size();
+                        outputs[i][j] = normalize_axis_range(sliceRanges_rw[i][j], inpShape[j]).size();
 
                     if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1))
                         outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j];
@@ -172,10 +202,10 @@ public:
         }
         else  // Divide input blob on equal parts by axis.
         {
-            CV_Assert(0 <= axis && axis < inpShape.size());
+            CV_Assert(0 <= axis_rw && axis_rw < inpShape.size());
             int splits = num_split ? num_split : requiredOutputs;
-            CV_Assert(splits > 0 && inpShape[axis] % splits == 0);
-            inpShape[axis] /= splits;
+            CV_Assert(splits > 0 && inpShape[axis_rw] % splits == 0);
+            inpShape[axis_rw] /= splits;
             outputs.resize(splits, inpShape);
         }
         return false;
@@ -200,6 +230,7 @@ public:
         CV_Assert(inputs.size() == 1);
         const MatSize& inpShape = inputs[0].size;
 
+        sliceRangesFromShape(shape(inputs[0]), axis, sliceRanges);
         finalSliceRanges = sliceRanges;
 
         if (sliceRanges.empty())
diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
index b81ccf106c..76937e08f3 100644
--- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
+++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp
@@ -231,6 +231,27 @@ public:
     }
 };
 
+class NormalizeSubgraph2_2 : public NormalizeSubgraphBase
+{
+public:
+    NormalizeSubgraph2_2()
+    {
+        int input = addNodeToMatch("");
+        int norm = addNodeToMatch("ReduceL2", input);
+
+        int min = addNodeToMatch("");
+        int max = addNodeToMatch("");
+        int clip = addNodeToMatch("Clip", norm, min, max);
+
+        int shape = addNodeToMatch("");
+        int expand = addNodeToMatch("Expand", clip, shape);
+
+        addNodeToMatch("Div", input, expand);
+
+        setFusedNode("Normalize", input);
+    }
+};
+
 class NormalizeSubgraph3 : public NormalizeSubgraphBase
 {
 public:
@@ -555,6 +576,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net)
     subgraphs.push_back(makePtr<SoftMaxSubgraph>());
     subgraphs.push_back(makePtr<NormalizeSubgraph1>());
     subgraphs.push_back(makePtr<NormalizeSubgraph2>());
+    subgraphs.push_back(makePtr<NormalizeSubgraph2_2>());
     subgraphs.push_back(makePtr<NormalizeSubgraph3>());
     subgraphs.push_back(makePtr<BatchNormalizationSubgraph1>());
     subgraphs.push_back(makePtr<BatchNormalizationSubgraph2>());
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 6583d6cf62..955c79c0fa 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -59,6 +59,8 @@ class ONNXImporter
     void addLayer(LayerParams& layerParams,
                   const opencv_onnx::NodeProto& node_proto);
 
+    void expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
+                   const std::string& input, size_t n);
 public:
 
     ONNXImporter(Net& net, const char *onnxFile)
@@ -427,6 +429,37 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
     }
 }
 
+/** @brief Make N copies of input layer and set them as input to node_proto.
+ * @param prefix prefix of new layers' names
+ * @param node_proto node which will contain all copies as inputs
+ * @param input name of the node to copy
+ * @param n number of copies
+ */
+void ONNXImporter::expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto,
+                             const std::string& input, size_t n)
+{
+    std::vector<std::string> input_names;
+    input_names.reserve(n);
+    for (size_t j = 0; j < n; j++)
+    {
+        LayerParams copyLP;
+        copyLP.name = format("%s/copy_%d", prefix.c_str(), j);
+        copyLP.type = "Identity";
+        CV_Assert((layer_id.find(copyLP.name) == layer_id.end()) &&
+            "Couldn't copy the node: generated name already exists in the graph.");
+        input_names.push_back(copyLP.name);
+
+        node_proto.set_input(0, input);
+        node_proto.set_output(0, copyLP.name);
+        addLayer(copyLP, node_proto);
+    }
+    node_proto.clear_input();
+    for (size_t i = 0; i < input_names.size(); i++)
+    {
+        node_proto.add_input(input_names[i]);
+    }
+}
+
 void ONNXImporter::addConstant(const std::string& name, const Mat& blob)
 {
     constBlobs.insert(std::make_pair(name, blob));
@@ -1288,6 +1321,37 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node
     addLayer(layerParams, node_proto);
 }
 
+void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis)
+{
+    const size_t diff = outShape.size() - broadShape.size();
+
+    // find the first non-one element of the broadcasting shape
+    axis = 0;
+    for (; axis < broadShape.size() && broadShape[axis] == 1; ++axis) {}
+
+    // find the last non-one element of the broadcasting shape
+    size_t endAxis = broadShape.size();
+    for (; endAxis > axis && broadShape[endAxis - 1] == 1; --endAxis) {}
+
+    // find one between axis and endAxis - as it needs to be broadcasted,
+    // dimensions from the left of axis and from the right of endAxis will be handled by Scale layer
+    broadAxis = -1;
+    for (size_t i = axis; i < endAxis; ++i)
+    {
+        size_t outAxis = i + diff;
+        if (outShape[outAxis] == broadShape[i])
+        {
+            continue;
+        }
+
+        // ensure we need to broadcast only 1 dimension in the middle
+        CV_Assert(broadShape[i] == 1 && broadAxis == -1);
+        broadAxis = static_cast<int>(outAxis);
+    }
+
+    axis += diff;
+}
+
 // "Mul" "Div"
 void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
 {
@@ -1410,13 +1474,31 @@ void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodePro
         }
 
         const MatShape& broadShape = outShapes[node_proto.input(1)];
-        const size_t outShapeSize = outShapes[node_proto.input(0)].size();
-        const size_t diff = outShapeSize - broadShape.size();
+        const MatShape& outShape = outShapes[node_proto.input(0)];
 
-        size_t axis;
-        for (axis = diff; axis < broadShape.size() && broadShape[axis - diff] == 1; ++axis) {}
+        size_t axis = 0;
+        int broadAxis = -1;
+        findBroadAxis(broadShape, outShape, axis, broadAxis);
 
-        CV_Assert(axis != outShapeSize);
+        // if there is a one dimension in the middle that should be broadcasted, broadcast it
+        if (broadAxis != -1)
+        {
+            opencv_onnx::NodeProto concat_node_proto = node_proto;
+            const std::string& input1 = concat_node_proto.input(1);
+
+            expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]);
+
+            LayerParams concatLP;
+            concatLP.name = layerParams.name + "/concat";
+            concatLP.set("axis", broadAxis);
+            concatLP.type = "Concat";
+            concat_node_proto.set_output(0, concatLP.name);
+
+            addLayer(concatLP, concat_node_proto);
+            node_proto.set_input(1, concatLP.name);
+        }
+
+        CV_Assert(axis != outShape.size());
         layerParams.set("axis", static_cast<int>(axis));
         layerParams.type = "Scale";
     }
@@ -1685,12 +1767,11 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
     // Unsqueeze and repeat along new axis
     if (targetShape.size() == inpShape.size() + 1)
     {
+        inpShape.insert(inpShape.begin(), targetShape.size() - inpShape.size(), 1);
         for (int i = 0; i < targetShape.size(); i++)
         {
-            if (targetShape[i] == -1 && i < inpShape.size())
+            if (abs(targetShape[i]) == 1)
                 targetShape[i] = inpShape[i];
-            else if (i < inpShape.size() && targetShape[i] != inpShape[i])
-                inpShape.insert(inpShape.begin() + i, 1);
         }
         if (haveVariables)
         {
@@ -1710,14 +1791,19 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
     CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");
 
     std::vector<int> broadcast_axes;
+    // shapes aren't right-aligned here because targetShape.size() == inpShape.size()
     for (int i = 0; i < targetShape.size(); i++)
     {
         if (targetShape[i] != inpShape[i])
         {
             if (inpShape[i] == 1)
+            {
                 broadcast_axes.push_back(i);
-            else
+            }
+            else if (targetShape[i] != 1)
+            {
                 CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
+            }
         }
     }
 
@@ -1756,31 +1842,16 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node
     }
     else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1)
     {
-        String base_name = layerParams.name + "/copy_";
-        std::vector<std::string> input_names;
-        for (int j = 0; j < targetShape[broadcast_axes[0]]; j++)
-        {
-            std::ostringstream ss;
-            ss << j;
-            LayerParams copyLP;
-            copyLP.name = base_name + ss.str();
-            copyLP.type = "Identity";
-            CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
-            input_names.push_back(copyLP.name);
+        expandMid(layerParams.name, node_proto, srcName, targetShape[broadcast_axes[0]]);
 
-            node_proto.set_input(0, srcName);
-            node_proto.set_output(0, copyLP.name);
-            addLayer(copyLP, node_proto);
-        }
-        node_proto.clear_input();
-        for (int i = 0; i < input_names.size(); i++)
-        {
-            node_proto.add_input(input_names[i]);
-        }
         layerParams.set("axis", broadcast_axes[0]);
         layerParams.type = "Concat";
         node_proto.set_output(0, layerParams.name);
     }
+    else if (broadcast_axes.empty())
+    {
+        layerParams.type = "Identity";
+    }
     else
         CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
     addLayer(layerParams, node_proto);
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index bd96729be1..7b94e02d0a 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -285,6 +285,7 @@ TEST_P(Test_ONNX_layers, Scale)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     testONNXModels("scale");
     testONNXModels("scale_broadcast", npy, 0, 0, false, true, 3);
+    testONNXModels("scale_broadcast_mid", npy, 0, 0, false, true, 2);
 }
 
 TEST_P(Test_ONNX_layers, ReduceMean3D)
@@ -471,6 +472,8 @@ TEST_P(Test_ONNX_layers, MatMulAdd)
 
 TEST_P(Test_ONNX_layers, Expand)
 {
+    testONNXModels("expand");
+    testONNXModels("expand_identity");
     testONNXModels("expand_batch");
     testONNXModels("expand_channels");
     testONNXModels("expand_neg_batch");
@@ -611,6 +614,7 @@ TEST_P(Test_ONNX_layers, ReduceL2)
     testONNXModels("reduceL2");
     testONNXModels("reduceL2_subgraph");
     testONNXModels("reduceL2_subgraph_2");
+    testONNXModels("reduceL2_subgraph2_2");
 }
 
 TEST_P(Test_ONNX_layers, Split)
@@ -624,6 +628,7 @@ TEST_P(Test_ONNX_layers, Split)
     testONNXModels("split_3");
     testONNXModels("split_4");
     testONNXModels("split_sizes");
+    testONNXModels("split_neg_axis");
 }
 
 TEST_P(Test_ONNX_layers, Slice)
@@ -632,6 +637,7 @@ TEST_P(Test_ONNX_layers, Slice)
     testONNXModels("slice", npy, 0, 0, false, false);
 #else
     testONNXModels("slice");
+    testONNXModels("slice_neg_starts");
     testONNXModels("slice_opset_11");
 #endif
 }

From 068f33cfdf57ae5009d3e564a37a83bfefda9b22 Mon Sep 17 00:00:00 2001
From: Dale Phurrough <dale@hidale.com>
Date: Thu, 9 Sep 2021 15:20:45 +0200
Subject: [PATCH 09/15] add nodiscard to features2d clone funcs

---
 .../features2d/include/opencv2/features2d.hpp   |  8 ++++----
 modules/python/src2/hdr_parser.py               | 17 ++++++++++++-----
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index cff09170c5..bf193599e1 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -1082,7 +1082,7 @@ public:
     that is, copies both parameters and train data. If emptyTrainData is true, the method creates an
     object copy with the current parameters but with empty train data.
      */
-    CV_WRAP virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
+    CV_WRAP CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
 
     /** @brief Creates a descriptor matcher of a given type with the default parameters (using default
     constructor).
@@ -1142,7 +1142,7 @@ protected:
     static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
     static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );
 
-    static Mat clone_op( Mat m ) { return m.clone(); }
+    CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); }
     void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;
 
     //! Collection of descriptors from train images.
@@ -1183,7 +1183,7 @@ public:
      */
     CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ;
 
-    virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
 protected:
     virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
         InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
@@ -1222,7 +1222,7 @@ public:
 
     CV_WRAP static Ptr<FlannBasedMatcher> create();
 
-    virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
 protected:
     static void convertToDMatches( const DescriptorCollection& descriptors,
                                    const Mat& indices, const Mat& distances,
diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py
index 749a9033ee..951dfe11c3 100755
--- a/modules/python/src2/hdr_parser.py
+++ b/modules/python/src2/hdr_parser.py
@@ -432,11 +432,18 @@ class CppHeaderParser(object):
         # filter off some common prefixes, which are meaningless for Python wrappers.
         # note that we do not strip "static" prefix, which does matter;
         # it means class methods, not instance methods
-        decl_str = self.batch_replace(decl_str, [("static inline", ""), ("inline", ""), ("explicit ", ""),
-                                                 ("CV_EXPORTS_W", ""), ("CV_EXPORTS", ""), ("CV_CDECL", ""),
-                                                 ("CV_WRAP ", " "), ("CV_INLINE", ""),
-                                                 ("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", "")]).strip()
-
+        decl_str = self.batch_replace(decl_str, [("static inline", ""),
+                                                 ("inline", ""),
+                                                 ("explicit ", ""),
+                                                 ("CV_EXPORTS_W", ""),
+                                                 ("CV_EXPORTS", ""),
+                                                 ("CV_CDECL", ""),
+                                                 ("CV_WRAP ", " "),
+                                                 ("CV_INLINE", ""),
+                                                 ("CV_DEPRECATED", ""),
+                                                 ("CV_DEPRECATED_EXTERNAL", ""),
+                                                 ("CV_NODISCARD_STD", ""),
+                                                 ("CV_NODISCARD", "")]).strip()
 
         if decl_str.strip().startswith('virtual'):
             virtual_method = True

From d31b93b513df40f1549127836168a0fb17ca3cec Mon Sep 17 00:00:00 2001
From: rogday <s.e.a.98@yandex.ru>
Date: Fri, 10 Sep 2021 14:07:16 +0300
Subject: [PATCH 10/15] Merge pull request #20674 from rogday:prelu_slope

Fix PReLU negative slope access pattern

* fix prelu negative slope access pattern

* change begin() to ptr()
---
 modules/dnn/src/layers/elementwise_layers.cpp | 2 +-
 modules/dnn/test/test_onnx_importer.cpp       | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index e6cf714bff..6e7fa43a70 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -1542,7 +1542,7 @@ Ptr<Layer> ChannelsPReLULayer::create(const LayerParams& params)
     if (params.blobs[0].total() == 1)
     {
         LayerParams reluParams = params;
-        reluParams.set("negative_slope", params.blobs[0].at<float>(0));
+        reluParams.set("negative_slope", *params.blobs[0].ptr<float>());
         return ReLULayer::create(reluParams);
     }
     Ptr<ChannelsPReLULayer> l(new ElementWiseLayer<ChannelsPReLUFunctor>(ChannelsPReLUFunctor(params.blobs[0])));
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 7b94e02d0a..f55510ec7b 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -250,6 +250,11 @@ TEST_P(Test_ONNX_layers, ReLU)
     testONNXModels("ReLU");
 }
 
+TEST_P(Test_ONNX_layers, PReLU)
+{
+    testONNXModels("PReLU_slope");
+}
+
 TEST_P(Test_ONNX_layers, Clip)
 {
     testONNXModels("clip", npy);

From e3f4f874c5214397a41befa6735f31382e7c39d2 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 10 Sep 2021 15:00:11 +0300
Subject: [PATCH 11/15] Merge pull request #20670 from
 alalek:core_ocl_fix_intel_gpu_gemm_requirements

core(OpenCL): fix intel_gpu_gemm kernel requirements

* core(ocl): fix intel_gpu_gemm integration

- allow bailout to generic OpenCL kernel

* core(ocl): avoid failures of generic OpenCL gemm kernel

* core(ocl): define alignment requirements of intel_gpu_gemm kernels
---
 modules/core/src/intel_gpu_gemm.inl.hpp |  71 +++++++----
 modules/core/src/matmul.dispatch.cpp    | 159 +++++++++++++-----------
 modules/core/test/ocl/test_gemm.cpp     |  34 ++---
 3 files changed, 156 insertions(+), 108 deletions(-)

diff --git a/modules/core/src/intel_gpu_gemm.inl.hpp b/modules/core/src/intel_gpu_gemm.inl.hpp
index fbd567b949..fa66856f5e 100644
--- a/modules/core/src/intel_gpu_gemm.inl.hpp
+++ b/modules/core/src/intel_gpu_gemm.inl.hpp
@@ -24,11 +24,6 @@
 
 #ifdef HAVE_OPENCL
 
-#include <sstream>
-#include "opencl_kernels_core.hpp"
-#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
-#include "opencv2/core/opencl/runtime/opencl_core.hpp"
-
 namespace cv
 {
 
@@ -37,52 +32,79 @@ static bool intel_gpu_gemm(
     UMat B, Size sizeB,
     UMat D, Size sizeD,
     double alpha, double beta,
-    bool atrans, bool btrans)
+    bool atrans, bool btrans,
+    bool& isPropagatedC2D
+)
 {
     CV_UNUSED(sizeB);
 
     int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width);
 
-    std::string kernelName;
-    bool ret = true;
+    if (M < 4 || N < 4 || K < 4)  // vload4
+        return false;
 
-    size_t lx = 8, ly = 4;
-    size_t dx = 4, dy = 8;
+    CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K);
+
+    std::string kernelName;
+
+    unsigned int lx = 8, ly = 4;
+    unsigned int dx = 4, dy = 8;
 
     if(!atrans && !btrans)
     {
-
         if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0)
         {
             kernelName = "intelblas_gemm_buffer_NN_sp";
         }
         else
         {
+            if (M % 2 != 0)
+                return false;
+            // vload4(0, dst_write0) - 4 cols
+            // multiply by lx: 8
+            if (N % (4*8) != 0)
+                return false;
             kernelName = "intelblas_gemm_buffer_NN";
         }
     }
     else if(atrans && !btrans)
     {
+        if (M % 32 != 0)
+            return false;
+        if (N % 32 != 0)
+            return false;
         kernelName = "intelblas_gemm_buffer_TN";
     }
     else if(!atrans && btrans)
     {
+        if (M % 128 != 0)
+            return false;
+        if (N % 8 != 0)
+            return false;
+        if (K % 512 != 0)
+            return false;
         kernelName = "intelblas_gemm_buffer_NT";
         ly = 16;
         dx = 1;
     }
     else
     {
+        if (M % 32 != 0)
+            return false;
+        if (N % 32 != 0)
+            return false;
+        if (K % 16 != 0)
+            return false;
         kernelName = "intelblas_gemm_buffer_TT";
     }
 
-    const size_t gx = (size_t)(N + dx - 1) / dx;
-    const size_t gy = (size_t)(M + dy - 1) / dy;
+    CV_LOG_DEBUG(NULL, "kernel: " << kernelName << "  (M=" << M << " N=" << N << " K=" << K << ")");
+
+    const size_t gx = divUp((size_t)N, dx);
+    const size_t gy = divUp((size_t)M, dy);
 
     size_t local[] = {lx, ly, 1};
-    size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1};
-
-    int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
+    size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1};
 
     ocl::Queue q;
     String errmsg;
@@ -110,10 +132,13 @@ static bool intel_gpu_gemm(
                (int)(D.step / sizeof(float))
         );
 
-        ret = k.run(2, global, local, false, q);
+        bool ret = k.run(2, global, local, false, q);
+        return ret;
     }
     else
     {
+        int stride = (M * N < 1024 * 1024) ? 10000000 : 256;
+
         for(int start_index = 0; start_index < K; start_index += stride)
         {
              ocl::Kernel k(kernelName.c_str(), program);
@@ -132,12 +157,16 @@ static bool intel_gpu_gemm(
                     (int) start_index,                          // 14 start_index
                     stride);
 
-            ret = k.run(2, global, local, false, q);
-            if (!ret) return ret;
+            bool ret = k.run(2, global, local, false, q);
+            if (!ret)
+            {
+                if (start_index != 0)
+                    isPropagatedC2D = false;  // D array content is changed, need to rewrite
+                return false;
+            }
         }
+        return true;
     }
-
-    return ret;
 }
 
 } // namespace cv
diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp
index e81064ec16..a7447330fc 100644
--- a/modules/core/src/matmul.dispatch.cpp
+++ b/modules/core/src/matmul.dispatch.cpp
@@ -42,6 +42,8 @@
 //M*/
 
 #include "precomp.hpp"
+#include <opencv2/core/utils/logger.hpp>
+
 #include "opencl_kernels_core.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
@@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
 static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
                       InputArray matC, double beta, OutputArray matD, int flags )
 {
-    int depth = matA.depth(), cn = matA.channels();
-    int type = CV_MAKETYPE(depth, cn);
+    int type = matA.type();
+    int depth = CV_MAT_DEPTH(type);
+    int cn = CV_MAT_CN(type);
 
-    CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
+    CV_CheckTypeEQ(type, matB.type(), "");
+    CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, "");
 
     const ocl::Device & dev = ocl::Device::getDefault();
     bool doubleSupport = dev.doubleFPConfig() > 0;
@@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
     Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
     bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
 
-    CV_Assert( !haveC || matC.type() == type );
+    if (haveC)
+        CV_CheckTypeEQ(type, matC.type(), "");
+
+    Size sizeD(((btrans) ? sizeB.height : sizeB.width),
+               ((atrans) ? sizeA.width : sizeA.height));
+
+    if (atrans)
+        sizeA = Size(sizeA.height, sizeA.width);
+    if (btrans)
+        sizeB = Size(sizeB.height, sizeB.width);
+    if (haveC && ctrans)
+        sizeC = Size(sizeC.height, sizeC.width);
+
+    CV_CheckEQ(sizeA.width, sizeB.height, "");
+    if (haveC)
+        CV_CheckEQ(sizeC, sizeD, "");
+
+    UMat A = matA.getUMat();
+    UMat B = matB.getUMat();
 
-    Size sizeD(((btrans)? sizeB.height : sizeB.width),
-               ((atrans)? sizeA.width : sizeA.height));
     matD.create(sizeD, type);
+    UMat D = matD.getUMat();
 
-    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
+    bool isPropagatedC2D = false;  // D content is updated with C / C.t()
 
-
-    if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
-    {
-        String opts;
-
-        if (atrans)
-            sizeA = Size(sizeA.height, sizeA.width);
-        if (btrans)
-            sizeB = Size(sizeB.height, sizeB.width);
-        if (haveC && ctrans)
-            sizeC = Size(sizeC.height, sizeC.width);
-
-        CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
-
-        int max_wg_size = (int)dev.maxWorkGroupSize();
-        int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
-
-        if (atrans)
-            A = A.t();
-
-        if (btrans)
-            B = B.t();
-
-        if (haveC)
-            ctrans ? transpose(matC, D) : matC.copyTo(D);
-
-        int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
-        int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
-
-        opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
-                          ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
-                          cn, kercn, block_size,
-                          (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
-                          haveC ? " -D HAVE_C" : "",
-                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
-
-        ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
-        if (k.empty())
-            return false;
-
-        if (depth == CV_64F)
-            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-                   ocl::KernelArg::ReadWrite(D, cn, kercn),
-                   sizeA.width, alpha, beta);
-        else
-            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
-                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
-                   ocl::KernelArg::ReadWrite(D, cn, kercn),
-                   sizeA.width, (float)alpha, (float)beta);
-
-        size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
-        size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
-
-        return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
-    }
-    else
+    if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1)
     {
         if (haveC && beta != 0.0)
         {
             ctrans ? transpose(matC, D) : matC.copyTo(D);
+            isPropagatedC2D = true;
         }
         else
         {
             beta = 0.0;
         }
 
-        return intel_gpu_gemm(A, sizeA,
-                              B, sizeB,
-                              D, sizeD,
-                              alpha,
-                              beta,
-                              atrans, btrans);
+        bool res = intel_gpu_gemm(A, matA.size(),
+                                  B, matB.size(),
+                                  D, sizeD,
+                                  alpha,
+                                  beta,
+                                  atrans, btrans,
+                                  isPropagatedC2D);
+        if (res)
+            return true;
+        // fallback on generic OpenCL code
     }
+
+    if (sizeD.width < 8 || sizeD.height < 8)
+        return false;
+
+    String opts;
+
+    int wg_size = (int)dev.maxWorkGroupSize();
+    int sizeDmin = std::min(sizeD.width, sizeD.height);
+    wg_size = std::min(wg_size, sizeDmin * sizeDmin);
+    int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
+
+    if (atrans)
+        A = A.t();
+
+    if (btrans)
+        B = B.t();
+
+    if (haveC && !isPropagatedC2D)
+        ctrans ? transpose(matC, D) : matC.copyTo(D);
+
+    int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
+    int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
+
+    opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
+                      ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
+                      cn, kercn, block_size,
+                      (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
+                      haveC ? " -D HAVE_C" : "",
+                      doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+
+    ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
+    if (k.empty())
+        return false;
+
+    if (depth == CV_64F)
+        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+               ocl::KernelArg::ReadWrite(D, cn, kercn),
+               sizeA.width, alpha, beta);
+    else
+        k.args(ocl::KernelArg::ReadOnlyNoSize(A),
+               ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
+               ocl::KernelArg::ReadWrite(D, cn, kercn),
+               sizeA.width, (float)alpha, (float)beta);
+
+    size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
+    size_t localsize[2] = { (size_t)block_size, (size_t)block_size};
+
+    return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false);
 }
 #endif
 
diff --git a/modules/core/test/ocl/test_gemm.cpp b/modules/core/test/ocl/test_gemm.cpp
index 825b506780..cb7cb0be1a 100644
--- a/modules/core/test/ocl/test_gemm.cpp
+++ b/modules/core/test/ocl/test_gemm.cpp
@@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm,
 
     double alpha, beta;
 
+    int M, N, K;
+
     TEST_DECLARE_INPUT_PARAMETER(A);
     TEST_DECLARE_INPUT_PARAMETER(B);
     TEST_DECLARE_INPUT_PARAMETER(C);
@@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm,
 
     void generateTestData()
     {
-        // set minimum size to 20, since testing less sizes doesn't make sense
-        Size ARoiSize = randomSize(20, MAX_VALUE);
+        M = (int)randomDoubleLog(1, 100);
+        N = (int)randomDoubleLog(1, 100);
+        K = (int)randomDoubleLog(1, 1200);
+
+        M = roundUp(M, 1);
+        N = roundUp(N, 1);
+        K = roundUp(K, 1);
+
+        Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M);
         Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11);
 
-        if (atrans)
-            ARoiSize = Size(ARoiSize.height, ARoiSize.width);
-
-        Size BRoiSize = randomSize(20, MAX_VALUE);
-        if (btrans)
-            BRoiSize.width = ARoiSize.width;
-        else
-            BRoiSize.height = ARoiSize.width;
-
+        Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K);
         Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11);
 
-        if (btrans)
-            BRoiSize = Size(BRoiSize.height, BRoiSize.width);
-
-        Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width);
+        Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M);
         Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11);
+        randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11);
 
+        Size DRoiSize = Size(N, M);
         Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11);
 
@@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy)
     for (int i = 0; i < test_loop_times; ++i)
     {
         generateTestData();
+        SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K));
 
         OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags));
         OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags));
 
-        double eps = D_roi.size().area() * 1e-4;
+        double eps = D_roi.size().area() * (1e-5 * K);
         OCL_EXPECT_MATS_NEAR(D, eps);
     }
 }

From 9b4ecc96f63d64a07ac043ad06fa44a1fd02b18b Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Tue, 7 Sep 2021 04:39:28 +0000
Subject: [PATCH 12/15] core(ocl): buffer bounds in intelblas_gemm_buffer_NT

---
 modules/core/src/intel_gpu_gemm.inl.hpp |   6 +-
 modules/core/src/opencl/intel_gemm.cl   | 180 +++++++++++-------------
 2 files changed, 83 insertions(+), 103 deletions(-)

diff --git a/modules/core/src/intel_gpu_gemm.inl.hpp b/modules/core/src/intel_gpu_gemm.inl.hpp
index fa66856f5e..28cc4ab9b9 100644
--- a/modules/core/src/intel_gpu_gemm.inl.hpp
+++ b/modules/core/src/intel_gpu_gemm.inl.hpp
@@ -77,11 +77,7 @@ static bool intel_gpu_gemm(
     }
     else if(!atrans && btrans)
     {
-        if (M % 128 != 0)
-            return false;
-        if (N % 8 != 0)
-            return false;
-        if (K % 512 != 0)
+        if (K % 4 != 0)
             return false;
         kernelName = "intelblas_gemm_buffer_NT";
         ly = 16;
diff --git a/modules/core/src/opencl/intel_gemm.cl b/modules/core/src/opencl/intel_gemm.cl
index 6cea8d7efd..53ae790779 100644
--- a/modules/core/src/opencl/intel_gemm.cl
+++ b/modules/core/src/opencl/intel_gemm.cl
@@ -392,6 +392,15 @@ __kernel void intelblas_gemm_buffer_NN(
 #define TILE_N          8
 #define SLM_BLOCK       512
 
+/*
+   A     K         B.t()   K            D     N
+    -----------       -----------        -----------
+    |         |       |         |        |         |
+  M |         |  x  N |         |  =>  M |         |
+    |         |       |         |        |         |
+    -----------       -----------        -----------
+*/
+
 __attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1)))
 __kernel void intelblas_gemm_buffer_NT(
     const __global float *src0, int off0,
@@ -422,59 +431,79 @@ __kernel void intelblas_gemm_buffer_NT(
     float8 dot06 = 0.f;
     float8 dot07 = 0.f;
 
-    float4 brow0;
-    float4 brow1;
-    float4 brow2;
-    float4 brow3;
-    float4 brow4;
-    float4 brow5;
-    float4 brow6;
-    float4 brow7;
+    const int dst_row = (global_y * TILE_M);
+    __global float *dst_write0 = dst + global_x + dst_row * ldC + offd;
 
-    __global float *dst_write0 = dst + local_x * VEC_SIZE + ( group_x * TILE_N ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * ldC + offd;
+    const __global float *src0_read00 = src0 + off0;
+    const int a_row_base = global_y * TILE_M;
+    const int a_col_base = local_x * (TILE_K / 8);  // <= TILE_K - 4
 
-    const __global float *src0_read = src0 + local_x * ( TILE_K / 8 ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M ) * ldA + off0;
-
-    const __global float *src1_read0 = src1 + ( group_x * TILE_N ) * ldB + off1;
+    const __global float *src1_read00 = src1 + off1;
+    const int b_row_base = (group_x * TILE_N);
+    //const int b_col_base = 0;
 
     __local float slm_brow[8 * SLM_BLOCK];
-    __local float* slm_brow0;
 
     int local_index = mad24(local_y, 8, local_x) * 4;
-    int w;
-    for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
+    int w = 0;
+    for (int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK)
+    {
+#define UPDATE_BROW(_row) \
+        { \
+            float4 brow; \
+            int b_row = b_row_base + _row; \
+            int b_col = b_tile + local_index; \
+            if (b_row < N && b_col <= K - 4 /*vload4*/) \
+                brow = vload4(0, src1_read00 + mad24(b_row, ldB, b_col)); \
+            else \
+                brow = (float4)0; \
+            vstore4(brow, 0, slm_brow + mad24(_row, SLM_BLOCK, local_index)); \
+        }
+
         barrier(CLK_LOCAL_MEM_FENCE);
-        vstore4(vload4(0, src1_read0 + mad24(0, ldB, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(1, ldB, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(2, ldB, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(3, ldB, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(4, ldB, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(5, ldB, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(6, ldB, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
-        vstore4(vload4(0, src1_read0 + mad24(7, ldB, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
+        UPDATE_BROW(0);
+        UPDATE_BROW(1);
+        UPDATE_BROW(2);
+        UPDATE_BROW(3);
+        UPDATE_BROW(4);
+        UPDATE_BROW(5);
+        UPDATE_BROW(6);
+        UPDATE_BROW(7);
         barrier(CLK_LOCAL_MEM_FENCE);
+#undef UPDATE_BROW
 
-        slm_brow0 = slm_brow + local_x * (TILE_K / 8);
-        w = b_tile;
-        int end_w = min(b_tile + SLM_BLOCK, K);
-        while( w + TILE_K <= end_w ) {
-            float4 arow;
+        for (int k_tile_offset = 0; k_tile_offset < SLM_BLOCK; k_tile_offset += TILE_K)
+        {
+            int a_col = a_col_base + b_tile + k_tile_offset;
 
-            brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK);
-            brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK);
-            brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK);
-            brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK);
-            brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK);
-            brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK);
-            brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK);
-            brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK);
+            if (a_col > K - 4 /*vload4*/)
+                break;
 
-#define MM_DOT_PRODUCT(_row,_dot)   \
-            arow = vload4(0, src0_read + _row * ldA);                           \
-            _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
-            _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
-            _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
-            _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
+            int slm_brow_col = a_col_base + k_tile_offset;  // <= SLM_BLOCK - 4
+#define READ_SLM_BROW(_row) \
+            float4 brow##_row = vload4(0, slm_brow + mad24(_row, SLM_BLOCK, slm_brow_col));
+
+            READ_SLM_BROW(0);
+            READ_SLM_BROW(1);
+            READ_SLM_BROW(2);
+            READ_SLM_BROW(3);
+            READ_SLM_BROW(4);
+            READ_SLM_BROW(5);
+            READ_SLM_BROW(6);
+            READ_SLM_BROW(7);
+#undef READ_SLM_BROW
+
+#define MM_DOT_PRODUCT(_row,_dot) \
+            { \
+                int a_row = a_row_base + _row; \
+                if (a_row < M) { \
+                    float4 arow = vload4(0, src0_read00 + mad24(a_row, ldA, a_col)); \
+                    _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
+                    _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
+                    _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
+                    _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); \
+                } \
+            }
 
             MM_DOT_PRODUCT(0,dot00);
             MM_DOT_PRODUCT(1,dot01);
@@ -485,53 +514,7 @@ __kernel void intelblas_gemm_buffer_NT(
             MM_DOT_PRODUCT(6,dot06);
             MM_DOT_PRODUCT(7,dot07);
 #undef MM_DOT_PRODUCT
-
-            src0_read += TILE_K;
-            slm_brow0 += TILE_K;
-            w += TILE_K;
         }
-        src1_read0 += SLM_BLOCK;
-    }
-
-    if(w < K) {
-        float4 arow;
-
-#define READ_BROW(_brow,_row) \
-        _brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \
-        _brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \
-        _brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \
-        _brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \
-        _brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f;
-
-        READ_BROW(brow0,0);
-        READ_BROW(brow1,1);
-        READ_BROW(brow2,2);
-        READ_BROW(brow3,3);
-        READ_BROW(brow4,4);
-        READ_BROW(brow5,5);
-        READ_BROW(brow6,6);
-        READ_BROW(brow7,7);
-
-#define MM_DOT_PRODUCT(_row,_dot)   \
-        arow = vload4(0, src0_read + _row * ldA);  \
-        arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \
-        arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \
-        arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \
-        arow.w = (mad24(local_x, 4, w + 3) < K) ? arow.w : 0.0f; \
-        _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \
-        _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \
-        _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \
-        _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot );
-
-        MM_DOT_PRODUCT(0,dot00);
-        MM_DOT_PRODUCT(1,dot01);
-        MM_DOT_PRODUCT(2,dot02);
-        MM_DOT_PRODUCT(3,dot03);
-        MM_DOT_PRODUCT(4,dot04);
-        MM_DOT_PRODUCT(5,dot05);
-        MM_DOT_PRODUCT(6,dot06);
-        MM_DOT_PRODUCT(7,dot07);
-#undef MM_DOT_PRODUCT
     }
 
 #define REDUCE(_dot) \
@@ -572,21 +555,22 @@ __kernel void intelblas_gemm_buffer_NT(
     output = (local_x == 5) ? _dot.s5 : output; \
     output = (local_x == 6) ? _dot.s6 : output; \
     output = (local_x == 7) ? _dot.s7 : output; \
-    if (beta != 0.0) \
+    if (beta != 0.0f) \
         dst_write0[0] = mad(output, (float)alpha, ((float)beta * dst_write0[0])); \
     else \
         dst_write0[0] = output * (float)alpha; \
     dst_write0 += ldC;
 
-    if(global_x < N && global_y * 8 < M) {
-        OUTPUT(dot00);
-        if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); }
-        if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); }
-        if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); }
-        if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); }
-        if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); }
-        if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); }
-        if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); }
+    if (global_x < N && dst_row < M)
+    {
+        /*if (dst_row + 0 < M)*/ { OUTPUT(dot00); }
+        if (dst_row + 1 < M) { OUTPUT(dot01); }
+        if (dst_row + 2 < M) { OUTPUT(dot02); }
+        if (dst_row + 3 < M) { OUTPUT(dot03); }
+        if (dst_row + 4 < M) { OUTPUT(dot04); }
+        if (dst_row + 5 < M) { OUTPUT(dot05); }
+        if (dst_row + 6 < M) { OUTPUT(dot06); }
+        if (dst_row + 7 < M) { OUTPUT(dot07); }
     }
 #undef OUTPUT
 }

From de1a45987964d466a13b3f9ba2e1e59b3be38cca Mon Sep 17 00:00:00 2001
From: Dale Phurrough <dale@hidale.com>
Date: Fri, 10 Sep 2021 17:59:56 +0200
Subject: [PATCH 13/15] fix opencv/opencv#20613

* copy 4.x selectOpenCLDevice() -- it is compatible
* filter platforms rather than trying only first matching
* this works on 3.4 and 4.x master
---
 modules/core/src/ocl.cpp | 59 ++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index a550c1d91a..e93e3094be 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -1670,7 +1670,7 @@ static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
     split(configurationStr, ':', parts);
     if (parts.size() > 3)
     {
-        std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
+        CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
         return false;
     }
     if (parts.size() > 2)
@@ -1687,22 +1687,20 @@ static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
 }
 
 #if defined WINRT || defined _WIN32_WCE
-static cl_device_id selectOpenCLDevice()
+static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
 {
+    CV_UNUSED(configuration)
     return NULL;
 }
 #else
-// std::tolower is int->int
-static char char_tolower(char ch)
-{
-    return (char)std::tolower((int)ch);
-}
-static cl_device_id selectOpenCLDevice()
+static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
 {
     std::string platform, deviceName;
     std::vector<std::string> deviceTypes;
 
-    const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
+    if (!configuration)
+        configuration = getenv("OPENCV_OPENCL_DEVICE");
+
     if (configuration &&
             (strcmp(configuration, "disabled") == 0 ||
              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
@@ -1747,22 +1745,24 @@ static cl_device_id selectOpenCLDevice()
         platforms.resize(numPlatforms);
     }
 
-    int selectedPlatform = -1;
     if (platform.length() > 0)
     {
-        for (size_t i = 0; i < platforms.size(); i++)
+        for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end();)
         {
             std::string name;
-            CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
+            CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, *currentPlatform, CL_PLATFORM_NAME, name));
             if (name.find(platform) != std::string::npos)
             {
-                selectedPlatform = (int)i;
-                break;
+                ++currentPlatform;
+            }
+            else
+            {
+                currentPlatform = platforms.erase(currentPlatform);
             }
         }
-        if (selectedPlatform == -1)
+        if (platforms.size() == 0)
         {
-            std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
+            CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
             goto not_found;
         }
     }
@@ -1781,7 +1781,7 @@ static cl_device_id selectOpenCLDevice()
     {
         int deviceType = 0;
         std::string tempStrDeviceType = deviceTypes[t];
-        std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower);
+        std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
 
         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
             deviceType = Device::TYPE_GPU;
@@ -1793,17 +1793,15 @@ static cl_device_id selectOpenCLDevice()
             deviceType = Device::TYPE_ALL;
         else
         {
-            std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
+            CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
             goto not_found;
         }
 
-        std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
-        for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
-                (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
-                i++)
+        std::vector<cl_device_id> devices;
+        for (std::vector<cl_platform_id>::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end(); ++currentPlatform)
         {
             cl_uint count = 0;
-            cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
+            cl_int status = clGetDeviceIDs(*currentPlatform, deviceType, 0, NULL, &count);
             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
             {
                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
@@ -1812,7 +1810,7 @@ static cl_device_id selectOpenCLDevice()
                 continue;
             size_t base = devices.size();
             devices.resize(base + count);
-            status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
+            status = clGetDeviceIDs(*currentPlatform, deviceType, count, &devices[base], &count);
             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
             {
                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
@@ -1844,13 +1842,16 @@ not_found:
     if (!configuration)
         return NULL; // suppress messages on stderr
 
-    std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl
-            << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
-            << "    Device types: ";
+    std::ostringstream msg;
+    msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
+        << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
+        << "    Device types:";
     for (size_t t = 0; t < deviceTypes.size(); t++)
-        std::cerr << deviceTypes[t] << " ";
+        msg << ' ' << deviceTypes[t];
 
-    std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
+    msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
+
+    CV_LOG_ERROR(NULL, msg.str());
     return NULL;
 }
 #endif

From aa7ba0bc1a99357beed7f4624de353f55ceccb20 Mon Sep 17 00:00:00 2001
From: Alexander Alekhin <alexander.a.alekhin@gmail.com>
Date: Fri, 10 Sep 2021 20:58:37 +0000
Subject: [PATCH 14/15] build: winpack_dldt with dldt 2021.4.1

---
 cmake/OpenCVDetectInferenceEngine.cmake       |   4 +-
 ...-dldt-disable-multidevice-autoplugin.patch |  16 ++
 ...20210630-dldt-disable-unused-targets.patch | 219 ++++++++++++++++++
 .../2021.4.1/20210630-dldt-pdb.patch          |  15 ++
 .../2021.4.1/20210630-dldt-vs-version.patch   |  16 ++
 .../winpack_dldt/2021.4.1/build.config.py     |   1 +
 .../winpack_dldt/2021.4.1/patch.config.py     |   4 +
 .../winpack_dldt/2021.4.1/sysroot.config.py   |  56 +++++
 platforms/winpack_dldt/build_package.py       |   5 +-
 9 files changed, 332 insertions(+), 4 deletions(-)
 create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch
 create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch
 create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch
 create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch
 create mode 100644 platforms/winpack_dldt/2021.4.1/build.config.py
 create mode 100644 platforms/winpack_dldt/2021.4.1/patch.config.py
 create mode 100644 platforms/winpack_dldt/2021.4.1/sysroot.config.py

diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake
index 6308d1b424..41951b710a 100644
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@@ -147,8 +147,8 @@ if(INF_ENGINE_TARGET)
     endif()
   endif()
   if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT)
-    message(WARNING "InferenceEngine version has not been set, 2021.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
-    set(INF_ENGINE_RELEASE_INIT "2021040000")
+    message(WARNING "InferenceEngine version has not been set, 2021.4.1 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+    set(INF_ENGINE_RELEASE_INIT "2021040100")
   elseif(DEFINED INF_ENGINE_RELEASE)
     set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}")
   endif()
diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch
new file mode 100644
index 0000000000..f1e7487442
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch
@@ -0,0 +1,16 @@
+diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
+index 0ba0dd78..7d34e7cb 100644
+--- a/inference-engine/src/CMakeLists.txt
++++ b/inference-engine/src/CMakeLists.txt
+@@ -26,9 +26,9 @@ endif()
+ 
+ add_subdirectory(hetero_plugin)
+ 
+-add_subdirectory(auto_plugin)
++#add_subdirectory(auto_plugin)
+ 
+-add_subdirectory(multi_device)
++#add_subdirectory(multi_device)
+ 
+ add_subdirectory(transformations)
+ 
diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch
new file mode 100644
index 0000000000..9d44cdadc6
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch
@@ -0,0 +1,219 @@
+diff --git a/cmake/developer_package/add_ie_target.cmake b/cmake/developer_package/add_ie_target.cmake
+index d49f16a4d..2726ca787 100644
+--- a/cmake/developer_package/add_ie_target.cmake
++++ b/cmake/developer_package/add_ie_target.cmake
+@@ -92,7 +92,7 @@ function(addIeTarget)
+     if (ARG_TYPE STREQUAL EXECUTABLE)
+         add_executable(${ARG_NAME} ${all_sources})
+     elseif(ARG_TYPE STREQUAL STATIC OR ARG_TYPE STREQUAL SHARED)
+-        add_library(${ARG_NAME} ${ARG_TYPE} ${all_sources})
++        add_library(${ARG_NAME} ${ARG_TYPE} EXCLUDE_FROM_ALL ${all_sources})
+     else()
+         message(SEND_ERROR "Invalid target type ${ARG_TYPE} specified for target name ${ARG_NAME}")
+     endif()
+diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
+index 1ac7fd8bf..df7091e51 100644
+--- a/inference-engine/CMakeLists.txt
++++ b/inference-engine/CMakeLists.txt
+@@ -39,7 +39,7 @@ if(ENABLE_TESTS)
+     add_subdirectory(tests)
+ endif()
+ 
+-add_subdirectory(tools)
++#add_subdirectory(tools)
+ 
+ function(ie_build_samples)
+     # samples should be build with the same flags as from OpenVINO package,
+@@ -58,7 +58,7 @@ endfunction()
+ 
+ # gflags and format_reader targets are kept inside of samples directory and
+ # they must be built even if samples build is disabled (required for tests and tools).
+-ie_build_samples()
++#ie_build_samples()
+ 
+ if(ENABLE_PYTHON)
+     add_subdirectory(ie_bridges/python)
+@@ -142,7 +142,7 @@ endif()
+ # Developer package
+ #
+ 
+-openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
++#openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
+ 
+ # for Template plugin
+ if(NGRAPH_INTERPRETER_ENABLE)
+@@ -166,7 +166,7 @@ function(ie_generate_dev_package_config)
+                 @ONLY)
+ endfunction()
+ 
+-ie_generate_dev_package_config()
++#ie_generate_dev_package_config()
+ 
+ #
+ # Coverage
+diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
+index e8ed1a5c4..1fc9fc3ff 100644
+--- a/inference-engine/src/inference_engine/CMakeLists.txt
++++ b/inference-engine/src/inference_engine/CMakeLists.txt
+@@ -110,7 +110,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS}
+             ${PUBLIC_HEADERS})
+@@ -181,7 +181,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
++add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>
+             ${IE_STATIC_DEPENDENT_FILES})
+diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
+index 8eae82bd2..e0e6745b1 100644
+--- a/inference-engine/src/legacy_api/CMakeLists.txt
++++ b/inference-engine/src/legacy_api/CMakeLists.txt
+@@ -26,7 +26,7 @@ endif()
+ 
+ file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${PUBLIC_HEADERS})
+ 
+diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+index fe57b29dd..07831e2fb 100644
+--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
++++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+@@ -67,7 +67,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ #  add test object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
+ target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
+ 
+ target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
+diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
+index f9548339d..ef962145a 100644
+--- a/inference-engine/src/preprocessing/CMakeLists.txt
++++ b/inference-engine/src/preprocessing/CMakeLists.txt
+@@ -101,7 +101,7 @@ endif()
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS})
+ 
+@@ -153,7 +153,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
++add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>)
+ 
+ set_ie_threading_interface_for(${TARGET_NAME}_s)
+diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
+index 249e47c28..4ddf63049 100644
+--- a/inference-engine/src/vpu/common/CMakeLists.txt
++++ b/inference-engine/src/vpu/common/CMakeLists.txt
+@@ -5,7 +5,7 @@
+ file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
+ 
+ function(add_common_target TARGET_NAME STATIC_IE)
+-    add_library(${TARGET_NAME} STATIC ${SOURCES})
++    add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+     ie_faster_build(${TARGET_NAME}
+         UNITY
+@@ -60,7 +60,7 @@ add_common_target("vpu_common_lib" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_common_target("vpu_common_lib_test_static" TRUE)
++    #add_common_target("vpu_common_lib_test_static" TRUE)
+ else()
+     add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
+ endif()
+diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+index bc73ab5b1..b4c1547fc 100644
+--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
++++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+@@ -5,7 +5,7 @@
+ file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h *.inc)
+ 
+ function(add_graph_transformer_target TARGET_NAME STATIC_IE)
+-    add_library(${TARGET_NAME} STATIC ${SOURCES})
++    add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+     set_ie_threading_interface_for(${TARGET_NAME})
+ 
+@@ -70,7 +70,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
++    #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+ else()
+     add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
+ endif()
+diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+index 8bcb2801a..f7e031c01 100644
+--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
++++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
+ else()
+ 	add_library(pugixml STATIC ${SOURCES})
+ 	if (MSVC)
+-		add_library(pugixml_mt STATIC ${SOURCES})
++                #add_library(pugixml_mt STATIC ${SOURCES})
+ 		#if (WIN32)
+ 		#	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
+ 		#	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
+diff --git a/ngraph/core/builder/CMakeLists.txt b/ngraph/core/builder/CMakeLists.txt
+index ff5c381e7..2797ec9ab 100644
+--- a/ngraph/core/builder/CMakeLists.txt
++++ b/ngraph/core/builder/CMakeLists.txt
+@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
+ source_group("include" FILES ${PUBLIC_HEADERS})
+ 
+ # Create shared library
+-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+ 
+ if(COMMAND ie_faster_build)
+     ie_faster_build(${TARGET_NAME}
+diff --git a/ngraph/core/reference/CMakeLists.txt b/ngraph/core/reference/CMakeLists.txt
+index ef4a764ab..f6d3172e2 100644
+--- a/ngraph/core/reference/CMakeLists.txt
++++ b/ngraph/core/reference/CMakeLists.txt
+@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
+ source_group("include" FILES ${PUBLIC_HEADERS})
+ 
+ # Create shared library
+-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+ 
+ if(COMMAND ie_faster_build)
+     ie_faster_build(${TARGET_NAME}
+diff --git a/openvino/itt/CMakeLists.txt b/openvino/itt/CMakeLists.txt
+index e9f880b8c..c63f4df63 100644
+--- a/openvino/itt/CMakeLists.txt
++++ b/openvino/itt/CMakeLists.txt
+@@ -6,7 +6,7 @@ set(TARGET_NAME itt)
+ 
+ file(GLOB_RECURSE SOURCES "src/*.cpp" "src/*.hpp")
+ 
+-add_library(${TARGET_NAME} STATIC ${SOURCES})
++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+ add_library(openvino::itt ALIAS ${TARGET_NAME})
+ 
diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch
new file mode 100644
index 0000000000..65e6f84dc8
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch
@@ -0,0 +1,15 @@
+iff --git a/CMakeLists.txt b/CMakeLists.txt
+index e0706a72e..9a053b1e4 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -6,6 +6,10 @@ cmake_minimum_required(VERSION 3.13)
+ 
+ project(OpenVINO)
+ 
++set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS")
++set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
++set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
++
+ set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+ set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)
+ 
diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch
new file mode 100644
index 0000000000..36b0068775
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch
@@ -0,0 +1,16 @@
+diff --git a/cmake/developer_package/vs_version/vs_version.cmake b/cmake/developer_package/vs_version/vs_version.cmake
+index 14d4c0e1e..6a44f73b9 100644
+--- a/cmake/developer_package/vs_version/vs_version.cmake
++++ b/cmake/developer_package/vs_version/vs_version.cmake
+@@ -8,9 +8,9 @@ set(IE_VS_VER_FILEVERSION_STR "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERS
+ 
+ set(IE_VS_VER_COMPANY_NAME_STR "Intel Corporation")
+ set(IE_VS_VER_PRODUCTVERSION_STR "${CI_BUILD_NUMBER}")
+-set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit")
++set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit (for OpenCV Windows package)")
+ set(IE_VS_VER_COPYRIGHT_STR "Copyright (C) 2018-2021, Intel Corporation")
+-set(IE_VS_VER_COMMENTS_STR "https://docs.openvinotoolkit.org/")
++set(IE_VS_VER_COMMENTS_STR "https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend")
+ 
+ #
+ # ie_add_vs_version_file(NAME <name>
diff --git a/platforms/winpack_dldt/2021.4.1/build.config.py b/platforms/winpack_dldt/2021.4.1/build.config.py
new file mode 100644
index 0000000000..a643c17928
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4.1/build.config.py
@@ -0,0 +1 @@
+os.environ['CI_BUILD_NUMBER'] = '2021.4.1-opencv_winpack_dldt'
diff --git a/platforms/winpack_dldt/2021.4.1/patch.config.py b/platforms/winpack_dldt/2021.4.1/patch.config.py
new file mode 100644
index 0000000000..7f8715aae2
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4.1/patch.config.py
@@ -0,0 +1,4 @@
+applyPatch('20210630-dldt-disable-unused-targets.patch')
+applyPatch('20210630-dldt-pdb.patch')
+applyPatch('20210630-dldt-disable-multidevice-autoplugin.patch')
+applyPatch('20210630-dldt-vs-version.patch')
diff --git a/platforms/winpack_dldt/2021.4.1/sysroot.config.py b/platforms/winpack_dldt/2021.4.1/sysroot.config.py
new file mode 100644
index 0000000000..fa4281107d
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4.1/sysroot.config.py
@@ -0,0 +1,56 @@
+sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')
+copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')
+#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')
+
+build_config = 'Release' if not self.config.build_debug else 'Debug'
+build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config
+
+def copy_bin(name):
+    global build_bin_dir, sysroot_bin_dir
+    copytree(build_bin_dir / name, sysroot_bin_dir / name)
+
+dll_suffix = 'd' if self.config.build_debug else ''
+def copy_dll(name):
+    global copy_bin, dll_suffix
+    copy_bin(name + dll_suffix + '.dll')
+    copy_bin(name + dll_suffix + '.pdb')
+
+copy_bin('cache.json')
+copy_dll('clDNNPlugin')
+copy_dll('HeteroPlugin')
+copy_dll('inference_engine')
+copy_dll('inference_engine_ir_reader')
+#copy_dll('inference_engine_ir_v7_reader')
+copy_dll('inference_engine_legacy')
+copy_dll('inference_engine_transformations')  # runtime
+copy_dll('inference_engine_lp_transformations')  # runtime
+#copy_dll('inference_engine_preproc')  # runtime
+copy_dll('MKLDNNPlugin')  # runtime
+copy_dll('myriadPlugin')  # runtime
+#copy_dll('MultiDevicePlugin')  # runtime, not used
+copy_dll('ngraph')
+copy_bin('plugins.xml')
+copy_bin('pcie-ma2x8x.elf')
+copy_bin('usb-ma2x8x.mvcmd')
+
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')
+
+sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
+sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')
+
+copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')
+if not self.config.build_debug:
+    copytree(build_bin_dir / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
+    copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
+    copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
+else:
+    copytree(build_bin_dir / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
+    copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
+    copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')
+
+sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')
+copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
+copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')
diff --git a/platforms/winpack_dldt/build_package.py b/platforms/winpack_dldt/build_package.py
index bd4355e1cd..0194323930 100644
--- a/platforms/winpack_dldt/build_package.py
+++ b/platforms/winpack_dldt/build_package.py
@@ -469,7 +469,8 @@ class Builder:
 def main():
 
     dldt_src_url = 'https://github.com/openvinotoolkit/openvino'
-    dldt_src_commit = '2021.4'
+    dldt_src_commit = '2021.4.1'
+    dldt_config = None
     dldt_release = None
 
     build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache')
@@ -503,7 +504,7 @@ def main():
     parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
     parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')
 
-    parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
+    parser.add_argument('--dldt_config', default=dldt_config, help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
 
     parser.add_argument('--override_patch_hashsum', default='', help='(script debug mode)')
 

From 51b03b87e6f9827cbee3feb6dd6dc385c4b77297 Mon Sep 17 00:00:00 2001
From: Zihao Mu <zihaomu@outlook.com>
Date: Fri, 10 Sep 2021 18:15:22 +0800
Subject: [PATCH 15/15] BiasAdd could load Const from second place.

---
 modules/dnn/src/tensorflow/tf_importer.cpp |  7 ++++++-
 modules/dnn/test/test_tf_importer.cpp      | 12 ++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index ca9d7c5e21..521c8ce4c3 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -914,7 +914,12 @@ void TFImporter::parseBias(tensorflow::GraphDef& net, const tensorflow::NodeDef&
         layer_id[name] = id;
 
         // one input only
-        connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+        Pin inp0 = parsePin(layer.input(0));
+        if (layer_id.find(inp0.name) != layer_id.end())
+            // First operand is a constant.
+            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+        else
+            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
     }
     else
     {
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 1a2b976eb8..cdf3794bf1 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -559,6 +559,18 @@ TEST_P(Test_TensorFlow_layers, l2_normalize)
     runTensorFlowNet("l2_normalize");
 }
 
+TEST_P(Test_TensorFlow_layers, BiasAdd)
+{
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
+            && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    runTensorFlowNet("bias_add_1");
+}
+
 // TODO: fix it and add to l2_normalize
 TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
 {