From d0e612dc368d657ca56101879dbc58c46a4dbb9b Mon Sep 17 00:00:00 2001 From: rogday Date: Fri, 3 Sep 2021 15:32:29 +0300 Subject: [PATCH 01/15] Merge pull request #20647 from rogday:resize_concat_optimization Fix resize+concat optimization * fix resize+concat optimization * add comment and fix indentation --- modules/dnn/src/layers/resize_layer.cpp | 7 +++++++ modules/dnn/test/test_tf_importer.cpp | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index 40c7351984..8b7d802ab2 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -111,7 +111,14 @@ public: internals_arr.getMatVector(internals); if (outHeight == inputs[0].size[2] && outWidth == inputs[0].size[3]) + { + // outputs[0] = inputs[0] doesn't work due to BlobManager optimizations + if (inputs[0].data != outputs[0].data) + { + inputs[0].copyTo(outputs[0]); + } return; + } Mat& inp = inputs[0]; Mat& out = outputs[0]; diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 68d6e88a66..1a2b976eb8 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -1125,6 +1125,11 @@ TEST_P(Test_TensorFlow_layers, resize_bilinear_down) runTensorFlowNet("resize_bilinear_down"); } +TEST_P(Test_TensorFlow_layers, resize_concat_optimization) +{ + runTensorFlowNet("resize_concat_optimization"); +} + TEST_P(Test_TensorFlow_layers, tf2_dense) { runTensorFlowNet("tf2_dense"); From 407adc7061c9d2126a5d27c53ff76b56a705f3e2 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 4 Sep 2021 04:35:00 +0000 Subject: [PATCH 02/15] dnn(ocl): fix buffer offsets in IDLF kernel - drop CreateSubBuffer - fix FUSED_CONV_ELTWISE mode --- modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp | 2 +- .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 86 +++++-------------- modules/dnn/src/opencl/conv_layer_spatial.cl | 25 ++++-- 3 files changed, 41 insertions(+), 72 deletions(-) diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index 7bb277d102..d6fb83becb 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -269,7 +269,7 @@ class OCL4DNNConvSpatial void generate_idlf_tuneritems(std::vector< cv::Ptr > &tunerItems, int blockM, int blockK, int simd_size); void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise); - void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx); + void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx); int32_t group_; bool bias_term_; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 517a663e46..b4477ebfc4 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -270,17 +270,21 @@ void OCL4DNNConvSpatial::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, } template -void OCL4DNNConvSpatial::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx) +void OCL4DNNConvSpatial::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, int fused_eltwise_offset, ocl::Kernel &kernel, cl_uint &argIdx) { if (fused_eltwise) - kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ)); + { + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom_data2_)); + if (fused_eltwise_offset >= 0) + kernel.set(argIdx++, fused_eltwise_offset); + } switch (fused_activ) { case OCL4DNN_CONV_FUSED_ACTIV_RELU: kernel.set(argIdx++, (float)negative_slope_); break; case OCL4DNN_CONV_FUSED_ACTIV_PRELU: - kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ)); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(negative_slope_umat_)); break; case OCL4DNN_CONV_FUSED_ACTIV_POWER: kernel.set(argIdx++, (float)power_); @@ -895,10 +899,12 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) { if (!swizzleWeight(weight, config->workItem_output[2], false)) return false; +#if 0 size_t total_bottom_size = bottom_dim_ * numImages; size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; size_t total_bias_size = M_ * group_; size_t total_top_size = top_dim_ * numImages; +#endif for (int32_t g = 0; g < group_; ++g) { bias_offset = M_ * g; int32_t image_offset = width_ * height_ * (channels_ / group_) * g; @@ -910,72 +916,22 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx); - UMat img_buffer; - if (image_offset) - { - CreateSubBuffer(bottom, img_buffer, image_offset, - total_bottom_size - image_offset, false); - if (img_buffer.empty()) - return false; + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + kernel.set(argIdx++, image_offset); - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + kernel.set(argIdx++, kernel_offset); - UMat kernel_buffer; - if (kernel_offset) - { - CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, - total_kernel_size - kernel_offset, false); - if (kernel_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); - } - - UMat bias_buffer; if (bias_term_) { - if (bias_offset) - { - CreateSubBuffer(bias, bias_buffer, bias_offset, - total_bias_size - bias_offset, false); - if (bias_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + kernel.set(argIdx++, bias_offset); } - UMat out_buffer; - if (output_image_offset) - { - CreateSubBuffer(top, out_buffer, output_image_offset, - total_top_size - output_image_offset, true); - if (out_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); - kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); - kernel.set(argIdx++, (int)(top.offset / element_size)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); @@ -1005,7 +961,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); UMat img_buffer; if (image_offset) @@ -1112,7 +1068,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); if (bias_term_) @@ -1152,7 +1108,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, image_offset); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 236e8d029a..55015557a0 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -74,18 +74,22 @@ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ } while(0) #define ELTWISE_DATA_ARG __global Dtype* eltwise_data, +#define ELTWISE_DATA_ARG_WITH_OFFSET __global Dtype* eltwise_ptr, int eltwise_offset, #else #define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \ const Dtype _x_ = (_data_); \ (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \ } while(0) #define ELTWISE_DATA_ARG +#define ELTWISE_DATA_ARG_WITH_OFFSET #endif #if APPLY_BIAS #define BIAS_KERNEL_ARG __global Dtype * biases_base, +#define BIAS_KERNEL_ARG_WITH_OFFSET __global Dtype * biases_base_ptr, int biases_base_offset, #else #define BIAS_KERNEL_ARG +#define BIAS_KERNEL_ARG_WITH_OFFSET #endif #define __CAT(x, y) x##y @@ -223,19 +227,28 @@ __attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) __attribute__((intel_reqd_sub_group_size(SIMD_SIZE))) __kernel void convolve_simd( - ELTWISE_DATA_ARG + ELTWISE_DATA_ARG_WITH_OFFSET FUSED_ARG - __global Dtype* inputs, - __global Dtype* weights, - BIAS_KERNEL_ARG - __global Dtype* outputs_base, - const int outputs_offset, + __global Dtype* inputs_ptr, const int inputs_offset, + __global Dtype* weights_ptr, const int weights_offset, + BIAS_KERNEL_ARG_WITH_OFFSET + __global Dtype* outputs_base, const int outputs_offset, const ushort input_width, const ushort input_height, const ushort output_width, const ushort output_height) { + __global Dtype* inputs = inputs_ptr + inputs_offset; + __global Dtype* weights = weights_ptr + weights_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype* outputs = outputs_base + outputs_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row unsigned int fm = get_global_id(2); // fm = Feature Map = od = Output Depth From aaff1256088ce892f977c73ff01817c2e389ac93 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 4 Sep 2021 01:34:02 +0000 Subject: [PATCH 03/15] core(ocl): debug capabilities --- modules/core/src/ocl.cpp | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 0c14e7f3e0..daf4fcd280 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -76,8 +76,11 @@ #undef CV__ALLOCATOR_STATS_LOG #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0 +#define CV_OPENCL_SHOW_BUILD_OPTIONS 0 +#define CV_OPENCL_SHOW_BUILD_KERNELS 0 #define CV_OPENCL_SHOW_RUN_KERNELS 0 +#define CV_OPENCL_SYNC_RUN_KERNELS 0 #define CV_OPENCL_TRACE_CHECK 0 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1 @@ -2983,6 +2986,8 @@ static cv::String dumpValue(size_t sz, const void* p) { if (!p) return "NULL"; + if (sz == 2) + return cv::format("%d / %uu / 0x%04x", *(short*)p, *(unsigned short*)p, *(short*)p); if (sz == 4) return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p); if (sz == 8) @@ -3195,6 +3200,10 @@ bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[], return false; // OpenCV 5.0: raise error } +#if CV_OPENCL_SYNC_RUN_KERNELS + sync = true; +#endif + cl_command_queue qq = getQueue(q); if (haveTempDstUMats) sync = true; @@ -3625,7 +3634,28 @@ struct Program::Impl if (!param_buildExtraOptions.empty()) buildflags = joinBuildOptions(buildflags, param_buildExtraOptions); } +#if CV_OPENCL_SHOW_BUILD_OPTIONS + CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' options:" << buildflags); +#endif compile(ctx, src_, errmsg); +#if CV_OPENCL_SHOW_BUILD_KERNELS + if (handle) + { + size_t retsz = 0; + char kernels_buffer[4096] = {0}; + cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz); + CV_OCL_DBG_CHECK_RESULT(result, cv::format("clGetProgramInfo(CL_PROGRAM_KERNEL_NAMES: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str()); + if (result == CL_SUCCESS && retsz < sizeof(kernels_buffer)) + { + kernels_buffer[retsz] = 0; + CV_LOG_INFO(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' kernels: '" << kernels_buffer << "'"); + } + else + { + CV_LOG_ERROR(NULL, "OpenCL program '" << sourceModule_ << "/" << sourceName_ << "' can't retrieve kernel names!"); + } + } +#endif } bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg) @@ -3857,7 +3887,6 @@ struct Program::Impl CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'"); } #endif - } return handle != NULL; } From 5b2c0168340a8b4e67def8f52885ab21b4bbcadc Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 3 Sep 2021 02:38:53 +0000 Subject: [PATCH 04/15] dnn(ocl): avoid out of buffer access in copyWeightsSwizzled --- modules/dnn/src/opencl/conv_spatial_helper.cl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/modules/dnn/src/opencl/conv_spatial_helper.cl b/modules/dnn/src/opencl/conv_spatial_helper.cl index 33d9db57c8..660d085956 100644 --- a/modules/dnn/src/opencl/conv_spatial_helper.cl +++ b/modules/dnn/src/opencl/conv_spatial_helper.cl @@ -62,8 +62,8 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype) //Original location //Output location - int outputSublayer = channels / swizzleFactor; - int outputSublayerIndex = channels % swizzleFactor; + //int outputSublayer = channels / swizzleFactor; + //int outputSublayerIndex = channels % swizzleFactor; int filter = sX / (kernel_w*kernel_h*channels); int kernel_X = sX % kernel_w; @@ -73,6 +73,10 @@ __kernel void TEMPLATE(copyWeightsSwizzled, Dtype) int FP = filter / swizzleFactor; int F1 = filter % swizzleFactor; - weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] - = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; + int idxOut = FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1; + int idxIn = filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X; + + // idxIn is not valid if (filter >= outputs) - no data for these elements. Output alignment gaps are filled by zeros + Dtype v = (filter < outputs) ? weightIn[idxIn] : (Dtype)0; + weightOut[idxOut] = v; } From 5578ad5e14fe1fcf4d7171f74c665db2a578187f Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sat, 4 Sep 2021 16:27:51 +0000 Subject: [PATCH 05/15] dnn(ocl): fix automatic globalsize adjusting - if kernel code doesn't support that --- modules/core/include/opencv2/core/ocl.hpp | 20 +++++++++++++++++-- modules/core/src/ocl.cpp | 8 ++++++++ modules/dnn/src/layers/batch_norm_layer.cpp | 2 +- modules/dnn/src/layers/mvn_layer.cpp | 2 +- modules/dnn/src/layers/slice_layer.cpp | 2 +- .../dnn/src/ocl4dnn/src/math_functions.cpp | 1 + .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 19 +++++++++--------- .../dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp | 2 +- modules/dnn/src/opencl/gemm_image.cl | 10 ++++++++++ 9 files changed, 50 insertions(+), 16 deletions(-) diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index f03de180fc..b51b39359d 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -562,7 +562,9 @@ public: i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); i = set(i, a11); i = set(i, a12); i = set(i, a13); i = set(i, a14); set(i, a15); return *this; } - /** @brief Run the OpenCL kernel. + + /** @brief Run the OpenCL kernel (globalsize value may be adjusted) + @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3. @param globalsize work items for each dimension. It is not the final globalsize passed to OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding @@ -571,12 +573,26 @@ public: @param localsize work-group size for each dimension. @param sync specify whether to wait for OpenCL computation to finish before return. @param q command queue + + @note Use run_() if your kernel code doesn't support adjusted globalsize. */ bool run(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue()); + + /** @brief Run the OpenCL kernel + * + * @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3. + * @param globalsize work items for each dimension. This value is passed to OpenCL without changes. + * @param localsize work-group size for each dimension. + * @param sync specify whether to wait for OpenCL computation to finish before return. + * @param q command queue + */ + bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue()); + bool runTask(bool sync, const Queue& q=Queue()); - /** @brief Similar to synchronized run() call with returning of kernel execution time + /** @brief Similar to synchronized run_() call with returning of kernel execution time + * * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE) * @return Execution time in nanoseconds or negative number on error */ diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index daf4fcd280..a550c1d91a 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -3160,6 +3160,14 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], } +bool Kernel::run_(int dims, size_t _globalsize[], size_t _localsize[], + bool sync, const Queue& q) +{ + CV_Assert(p); + return p->run(dims, _globalsize, _localsize, sync, NULL, q); +} + + static bool isRaiseErrorOnReuseAsyncKernel() { static bool initialized = false; diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 42676c7938..dcb4005975 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -231,7 +231,7 @@ public: kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight)); kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias)); kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst)); - bool ret = kernel.run(2, global, NULL, false); + bool ret = kernel.run_(2, global, NULL, false); if (!ret) return false; } diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp index 8f06216df1..de2b0d5690 100644 --- a/modules/dnn/src/layers/mvn_layer.cpp +++ b/modules/dnn/src/layers/mvn_layer.cpp @@ -191,7 +191,7 @@ public: k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_weight)); k1.set(argId++, ocl::KernelArg::PtrReadOnly(bnorm_bias)); k1.set(argId++, ocl::KernelArg::PtrWriteOnly(outMat)); - ret = k1.run(1, globalsize, localsize, false); + ret = k1.run_(1, globalsize, localsize, false); if (!ret) return false; } diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 507964edf9..16f1958879 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -482,7 +482,7 @@ public: ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output) ) - .run(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); + .run_(2, (size_t*)ocl.global_size, (size_t*)ocl.local_size, false); if (!ret) return false; } // for outputs.size() diff --git a/modules/dnn/src/ocl4dnn/src/math_functions.cpp b/modules/dnn/src/ocl4dnn/src/math_functions.cpp index 855a21e08f..c924d66b12 100644 --- a/modules/dnn/src/ocl4dnn/src/math_functions.cpp +++ b/modules/dnn/src/ocl4dnn/src/math_functions.cpp @@ -116,6 +116,7 @@ ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset, .args( ocl::KernelArg::PtrReadOnly(buffer), image, offset, + padded_width, padded_height, width, height, ld) .run(2, global_copy, NULL, false); diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index b4477ebfc4..3b73da801c 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -769,12 +769,11 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, swizzled_factor ); - size_t global_work_size_copy[3] = { - (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; + size_t global_work_size_copy[1] = { (size_t)(alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_) }; - if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false)) + if (!oclk_copy_weight.run_(1, global_work_size_copy, NULL, false)) { - std::cout << "Swizzle kernel run failed." << std::endl; + CV_LOG_ERROR(NULL, "DNN/OpenCL: Swizzle kernel run failed"); return false; } } else { @@ -937,7 +936,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, kernel.set(argIdx++, (uint16_t)height_); kernel.set(argIdx++, (uint16_t)output_w_); kernel.set(argIdx++, (uint16_t)output_h_); - if (!kernel.run(3, config->global_work_size, config->local_work_size, false)) + if (!kernel.run_(3, config->global_work_size, config->local_work_size, false)) { std::cout << "IDLF kernel run failed." << std::endl; return false; @@ -1056,7 +1055,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, gy = alignSize(gy, blockK); size_t global_size[3] = { gx, gy, config->global_work_size[2] }; - if (!kernel.run(3, global_size, config->local_work_size, false)) + if (!kernel.run_(3, global_size, config->local_work_size, false)) { std::cout << "GEMM like kernel run failed." << std::endl; return false; @@ -1085,9 +1084,9 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, global_size[1] = output_h_; global_size[2] = num_output_ * num_; - if (!kernel.run(3, global_size, NULL, false)) + if (!kernel.run_(3, global_size, NULL, false)) { - std::cout << "DWCONV kernel run failed." << std::endl; + CV_LOG_ERROR(NULL, "DNN/OpenCL: DWCONV kernel run failed"); return false; } } else { @@ -1127,11 +1126,11 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, kernel.set(argIdx++, (uint16_t)output_h_); kernel.set(argIdx++, (uint16_t)pad_w_); kernel.set(argIdx++, (uint16_t)pad_h_); - if (!kernel.run(3, config->global_work_size, + if (!kernel.run_(3, config->global_work_size, (config->use_null_local) ? NULL : config->local_work_size, false)) { - std::cout << "Basic kernel run failed." << std::endl; + CV_LOG_ERROR(NULL, "DNN/OpenCL: Basic kernel run failed"); return false; } } diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp index 78576711a7..7b32189fdc 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_softmax.cpp @@ -127,7 +127,7 @@ bool OCL4DNNSoftmax::Forward(const UMat& bottom, UMat& top) oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); } - ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false); + ret = oclk_softmax_forward_kernel.run_(3, global_size, local_size, false); } return ret; } diff --git a/modules/dnn/src/opencl/gemm_image.cl b/modules/dnn/src/opencl/gemm_image.cl index 710637a093..f6e0020d82 100644 --- a/modules/dnn/src/opencl/gemm_image.cl +++ b/modules/dnn/src/opencl/gemm_image.cl @@ -954,6 +954,10 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)( { const int gidx = get_global_id(0); const int gidy = get_global_id(1); + + if (gidx >= width || gidy >= height) + return; + int2 coord_dst = (int2)(gidx, gidy); __global Dtype* A_off = A + offA; Dtype srcA = A_off[gidy * ldA + gidx]; @@ -968,12 +972,18 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)( __global Dtype* A, __write_only image2d_t ImA, int offA, + int padded_width, + int padded_height, int width, int height, int ldA) { const int gidx = get_global_id(0); const int gidy = get_global_id(1); + + if (gidx >= padded_width || gidy >= padded_height) + return; + int2 coord_dst = (int2)(gidx, gidy); #if TYPE == TYPE_HALF if (gidx >= width || gidy >= height) { From 36cc43170d9e1e244d72c25d69cef8d716599a10 Mon Sep 17 00:00:00 2001 From: Zhuo Zhang Date: Mon, 6 Sep 2021 12:03:59 +0800 Subject: [PATCH 06/15] docs: fix image path for py_fast doc --- doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown b/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown index 4ce61370bd..b1b8a81ca8 100644 --- a/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown +++ b/doc/py_tutorials/py_feature2d/py_fast/py_fast.markdown @@ -98,7 +98,7 @@ import numpy as np import cv2 as cv from matplotlib import pyplot as plt -img = cv.imread('simple.jpg',0) +img = cv.imread('blox.jpg',0) # `/samples/data/blox.jpg` # Initiate FAST object with default values fast = cv.FastFeatureDetector_create() @@ -113,17 +113,17 @@ print( "nonmaxSuppression:{}".format(fast.getNonmaxSuppression()) ) print( "neighborhood: {}".format(fast.getType()) ) print( "Total Keypoints with nonmaxSuppression: {}".format(len(kp)) ) -cv.imwrite('fast_true.png',img2) +cv.imwrite('fast_true.png', img2) # Disable nonmaxSuppression fast.setNonmaxSuppression(0) -kp = fast.detect(img,None) +kp = fast.detect(img, None) print( "Total Keypoints without nonmaxSuppression: {}".format(len(kp)) ) img3 = cv.drawKeypoints(img, kp, None, color=(255,0,0)) -cv.imwrite('fast_false.png',img3) +cv.imwrite('fast_false.png', img3) @endcode See the results. First image shows FAST with nonmaxSuppression and second one without nonmaxSuppression: From 35e824c28710ca13b2770dc9d547bfc352293dc5 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 6 Sep 2021 05:51:37 +0300 Subject: [PATCH 07/15] dnn(ocl): fix out of bound access in GEMM-like kernels - dropped usage of CreateSubBuffer() - buffers lifetime management issue - fixed elementwise offset - avoid out of bounds read access --- .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 73 ++++------------- modules/dnn/src/opencl/conv_layer_spatial.cl | 79 ++++++++++++++++--- 2 files changed, 80 insertions(+), 72 deletions(-) diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 3b73da801c..5eee1da4a0 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -945,9 +945,11 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) { if (!swizzleWeight(weight, config->workItem_output[1], true)) return false; +#if 0 size_t total_bottom_size = bottom_dim_ * numImages; size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; size_t total_bias_size = M_ * group_; +#endif size_t total_top_size = top_dim_ * numImages; for (int32_t g = 0; g < group_; ++g) { bias_offset = M_ * g; @@ -960,72 +962,25 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); + setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx); - UMat img_buffer; - if (image_offset) - { - CreateSubBuffer(bottom, img_buffer, image_offset, - total_bottom_size - image_offset, false); - if (img_buffer.empty()) - return false; + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + kernel.set(argIdx++, (int)image_offset); + kernel.set(argIdx++, (int)(bottom.total() - image_offset)); - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + kernel.set(argIdx++, (int)kernel_offset); + kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset)); - UMat kernel_buffer; - if (kernel_offset) - { - CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, - total_kernel_size - kernel_offset, false); - if (kernel_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); - } - - UMat bias_buffer; if (bias_term_) { - if (bias_offset) - { - CreateSubBuffer(bias, bias_buffer, bias_offset, - total_bias_size - bias_offset, false); - if (bias_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + kernel.set(argIdx++, (int)bias_offset); } - UMat out_buffer; - if (output_image_offset) - { - CreateSubBuffer(top, out_buffer, output_image_offset, - total_top_size - output_image_offset, true); - if (out_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); - kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); - kernel.set(argIdx++, (int)(top.offset / element_size)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset); + kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size)); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 55015557a0..e7bbacd4c4 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -401,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. #define ROW_PITCH input_width #define GEMM_LIKE_KERNEL_ARGS \ - ELTWISE_DATA_ARG \ + ELTWISE_DATA_ARG_WITH_OFFSET \ FUSED_ARG \ - const __global Dtype *src0, \ - const __global Dtype *src1, \ - BIAS_KERNEL_ARG \ - __global Dtype *dst_base, \ - const int dst_offset, \ + const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \ + const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \ + BIAS_KERNEL_ARG_WITH_OFFSET \ + __global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \ const ushort input_width, \ const ushort input_height, \ const ushort output_width, \ @@ -437,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. __attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -460,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) } typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t; +// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds) +// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1) +#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 + #define OPTIMIZE_READ 1 +#else + #define OPTIMIZE_READ 0 +#endif + // True for all threads if filter_width is multiple of TILE_N // else, true for all but right-most column of threads. if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N ) @@ -476,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 @@ -496,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) do { int patch_row = 0; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ curr_y = saved_y; #endif @@ -514,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 +#if OPTIMIZE_READ #if KERNEL_WIDTH == 3 Dtype_t blockA00 = vload3(0, src0_read); Dtype* pblockA00 = (Dtype*)(&blockA00); #else + #if 0 // debug + if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH) + { + printf("CATCH: src0_read-src0: %d limit=%d curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x); + } + #endif Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #endif @@ -639,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 @@ -659,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) do { int patch_row = 0; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ curr_y = saved_y; #endif do { // Load atile and interleaved btile. const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 +#if OPTIMIZE_READ Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #else @@ -803,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) } } } -#endif +#endif // TILE_N_LAST > 0 } #endif #ifdef GEMM_LIKE_CONV_32_2 @@ -826,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1388,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1574,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); From 4807cd8a6eeaa494a76df435679ccc3b9e6545e6 Mon Sep 17 00:00:00 2001 From: rogday Date: Thu, 9 Sep 2021 14:41:40 +0300 Subject: [PATCH 08/15] Merge pull request #20605 from rogday:split_slice_shenanigans Add Normalize subgraph, fix Slice, Mul and Expand * Add Normalize subgraph, support for starts<0 and axis<0 in Slice, Mul broadcasting in the middle and fix Expand's unsqueeze * remove todos * remove range-based for loop * address review comments * change >> to > > in template * fix indexation * fix expand that does nothing --- modules/dnn/src/layers/slice_layer.cpp | 59 ++++++-- .../dnn/src/onnx/onnx_graph_simplifier.cpp | 22 +++ modules/dnn/src/onnx/onnx_importer.cpp | 129 ++++++++++++++---- modules/dnn/test/test_onnx_importer.cpp | 6 + 4 files changed, 173 insertions(+), 43 deletions(-) diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 16f1958879..a470772813 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -58,6 +58,31 @@ namespace cv namespace dnn { +void sliceRangesFromShape(const MatShape& inpShape, int& axis, std::vector >& sliceRanges) +{ + CV_Assert(inpShape.size() > 0); + bool axisNeg = (axis < 0); + axis = (axis + static_cast(inpShape.size())) % inpShape.size(); + int n = inpShape[axis]; + + for (size_t i = 0; i < sliceRanges.size(); ++i){ + std::vector& ranges = sliceRanges[i]; + if (axisNeg) + { + ranges.insert(ranges.begin(), axis, Range::all()); + } + Range& range = ranges.back(); + + if (range.start >= 0) + { + continue; + } + + CV_Assert(n != 0); + range.start = (n + range.start) % n; + } +} + class SliceLayerImpl : public SliceLayer { public: @@ -69,20 +94,22 @@ public: num_split = params.get("num_split", 0); hasDynamicShapes = params.get("has_dynamic_shapes", false); shapesInitialized = !hasDynamicShapes; + if (params.has("slice_point")) { CV_Assert(!params.has("begin") && !params.has("size") && !params.has("end")); const DictValue &indicesValue = params.get("slice_point"); + int size = axis > 0 ? axis + 1 : 1; sliceRanges.resize(indicesValue.size() + 1, - std::vector(axis + 1, Range::all())); + std::vector(size, Range::all())); int prevSlice = 0; for (int i = 0; i < indicesValue.size(); ++i) { - sliceRanges[i][axis].start = prevSlice; - sliceRanges[i][axis].end = indicesValue.get(i); - prevSlice = sliceRanges[i][axis].end; + sliceRanges[i][size - 1].start = prevSlice; + sliceRanges[i][size - 1].end = indicesValue.get(i); + prevSlice = sliceRanges[i][size - 1].end; } - sliceRanges.back()[axis].start = prevSlice; + sliceRanges.back()[size - 1].start = prevSlice; } else if (params.has("begin")) { @@ -97,7 +124,6 @@ public: { int start = begins.get(i); int sizeOrEnd = sizesOrEnds.get(i); // It may be negative to reverse indexation. - CV_Assert(start >= 0); sliceRanges[0][i].start = start; if (params.has("size")) @@ -154,16 +180,20 @@ public: CV_Assert(inputs.size() == 1); MatShape inpShape = inputs[0]; - if (!sliceRanges.empty()) + int axis_rw = axis; + std::vector > sliceRanges_rw = sliceRanges; + sliceRangesFromShape(inpShape, axis_rw, sliceRanges_rw); + + if (!sliceRanges_rw.empty()) { - outputs.resize(sliceRanges.size(), inpShape); + outputs.resize(sliceRanges_rw.size(), inpShape); for (int i = 0; i < outputs.size(); ++i) { - CV_Assert(sliceRanges[i].size() <= inpShape.size()); - for (int j = 0; j < sliceRanges[i].size(); ++j) + CV_Assert(sliceRanges_rw[i].size() <= inpShape.size()); + for (int j = 0; j < sliceRanges_rw[i].size(); ++j) { if (shapesInitialized || inpShape[j] > 0) - outputs[i][j] = normalize_axis_range(sliceRanges[i][j], inpShape[j]).size(); + outputs[i][j] = normalize_axis_range(sliceRanges_rw[i][j], inpShape[j]).size(); if (!sliceSteps.empty() && (i < sliceSteps.size()) && (j < sliceSteps[i].size()) && (sliceSteps[i][j] > 1)) outputs[i][j] = (outputs[i][j] + sliceSteps[i][j] - 1) / sliceSteps[i][j]; @@ -172,10 +202,10 @@ public: } else // Divide input blob on equal parts by axis. { - CV_Assert(0 <= axis && axis < inpShape.size()); + CV_Assert(0 <= axis_rw && axis_rw < inpShape.size()); int splits = num_split ? num_split : requiredOutputs; - CV_Assert(splits > 0 && inpShape[axis] % splits == 0); - inpShape[axis] /= splits; + CV_Assert(splits > 0 && inpShape[axis_rw] % splits == 0); + inpShape[axis_rw] /= splits; outputs.resize(splits, inpShape); } return false; @@ -200,6 +230,7 @@ public: CV_Assert(inputs.size() == 1); const MatSize& inpShape = inputs[0].size; + sliceRangesFromShape(shape(inputs[0]), axis, sliceRanges); finalSliceRanges = sliceRanges; if (sliceRanges.empty()) diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index b81ccf106c..76937e08f3 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -231,6 +231,27 @@ public: } }; +class NormalizeSubgraph2_2 : public NormalizeSubgraphBase +{ +public: + NormalizeSubgraph2_2() + { + int input = addNodeToMatch(""); + int norm = addNodeToMatch("ReduceL2", input); + + int min = addNodeToMatch(""); + int max = addNodeToMatch(""); + int clip = addNodeToMatch("Clip", norm, min, max); + + int shape = addNodeToMatch(""); + int expand = addNodeToMatch("Expand", clip, shape); + + addNodeToMatch("Div", input, expand); + + setFusedNode("Normalize", input); + } +}; + class NormalizeSubgraph3 : public NormalizeSubgraphBase { public: @@ -555,6 +576,7 @@ void simplifySubgraphs(opencv_onnx::GraphProto& net) subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); + subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); subgraphs.push_back(makePtr()); diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 6583d6cf62..955c79c0fa 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -59,6 +59,8 @@ class ONNXImporter void addLayer(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto); + void expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto, + const std::string& input, size_t n); public: ONNXImporter(Net& net, const char *onnxFile) @@ -427,6 +429,37 @@ void ONNXImporter::addLayer(LayerParams& layerParams, } } +/** @brief Make N copies of input layer and set them as input to node_proto. + * @param prefix prefix of new layers' names + * @param node_proto node which will contain all copies as inputs + * @param input name of the node to copy + * @param n number of copies + */ +void ONNXImporter::expandMid(const std::string& prefix, opencv_onnx::NodeProto& node_proto, + const std::string& input, size_t n) +{ + std::vector input_names; + input_names.reserve(n); + for (size_t j = 0; j < n; j++) + { + LayerParams copyLP; + copyLP.name = format("%s/copy_%d", prefix.c_str(), j); + copyLP.type = "Identity"; + CV_Assert((layer_id.find(copyLP.name) == layer_id.end()) && + "Couldn't copy the node: generated name already exists in the graph."); + input_names.push_back(copyLP.name); + + node_proto.set_input(0, input); + node_proto.set_output(0, copyLP.name); + addLayer(copyLP, node_proto); + } + node_proto.clear_input(); + for (size_t i = 0; i < input_names.size(); i++) + { + node_proto.add_input(input_names[i]); + } +} + void ONNXImporter::addConstant(const std::string& name, const Mat& blob) { constBlobs.insert(std::make_pair(name, blob)); @@ -1288,6 +1321,37 @@ void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::Node addLayer(layerParams, node_proto); } +void findBroadAxis(const MatShape& broadShape, const MatShape& outShape, size_t& axis, int& broadAxis) +{ + const size_t diff = outShape.size() - broadShape.size(); + + // find the first non-one element of the broadcasting shape + axis = 0; + for (; axis < broadShape.size() && broadShape[axis] == 1; ++axis) {} + + // find the last non-one element of the broadcasting shape + size_t endAxis = broadShape.size(); + for (; endAxis > axis && broadShape[endAxis - 1] == 1; --endAxis) {} + + // find one between axis and endAxis - as it needs to be broadcasted, + // dimensions from the left of axis and from the right of endAxis will be handled by Scale layer + broadAxis = -1; + for (size_t i = axis; i < endAxis; ++i) + { + size_t outAxis = i + diff; + if (outShape[outAxis] == broadShape[i]) + { + continue; + } + + // ensure we need to broadcast only 1 dimension in the middle + CV_Assert(broadShape[i] == 1 && broadAxis == -1); + broadAxis = static_cast(outAxis); + } + + axis += diff; +} + // "Mul" "Div" void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_) { @@ -1410,13 +1474,31 @@ void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodePro } const MatShape& broadShape = outShapes[node_proto.input(1)]; - const size_t outShapeSize = outShapes[node_proto.input(0)].size(); - const size_t diff = outShapeSize - broadShape.size(); + const MatShape& outShape = outShapes[node_proto.input(0)]; - size_t axis; - for (axis = diff; axis < broadShape.size() && broadShape[axis - diff] == 1; ++axis) {} + size_t axis = 0; + int broadAxis = -1; + findBroadAxis(broadShape, outShape, axis, broadAxis); - CV_Assert(axis != outShapeSize); + // if there is a one dimension in the middle that should be broadcasted, broadcast it + if (broadAxis != -1) + { + opencv_onnx::NodeProto concat_node_proto = node_proto; + const std::string& input1 = concat_node_proto.input(1); + + expandMid(layerParams.name, concat_node_proto, input1, outShape[broadAxis]); + + LayerParams concatLP; + concatLP.name = layerParams.name + "/concat"; + concatLP.set("axis", broadAxis); + concatLP.type = "Concat"; + concat_node_proto.set_output(0, concatLP.name); + + addLayer(concatLP, concat_node_proto); + node_proto.set_input(1, concatLP.name); + } + + CV_Assert(axis != outShape.size()); layerParams.set("axis", static_cast(axis)); layerParams.type = "Scale"; } @@ -1685,12 +1767,11 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node // Unsqueeze and repeat along new axis if (targetShape.size() == inpShape.size() + 1) { + inpShape.insert(inpShape.begin(), targetShape.size() - inpShape.size(), 1); for (int i = 0; i < targetShape.size(); i++) { - if (targetShape[i] == -1 && i < inpShape.size()) + if (abs(targetShape[i]) == 1) targetShape[i] = inpShape[i]; - else if (i < inpShape.size() && targetShape[i] != inpShape[i]) - inpShape.insert(inpShape.begin() + i, 1); } if (haveVariables) { @@ -1710,14 +1791,19 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims"); std::vector broadcast_axes; + // shapes aren't right-aligned here because targetShape.size() == inpShape.size() for (int i = 0; i < targetShape.size(); i++) { if (targetShape[i] != inpShape[i]) { if (inpShape[i] == 1) + { broadcast_axes.push_back(i); - else + } + else if (targetShape[i] != 1) + { CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i)); + } } } @@ -1756,31 +1842,16 @@ void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::Node } else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1) { - String base_name = layerParams.name + "/copy_"; - std::vector input_names; - for (int j = 0; j < targetShape[broadcast_axes[0]]; j++) - { - std::ostringstream ss; - ss << j; - LayerParams copyLP; - copyLP.name = base_name + ss.str(); - copyLP.type = "Identity"; - CV_Assert(layer_id.find(copyLP.name) == layer_id.end()); - input_names.push_back(copyLP.name); + expandMid(layerParams.name, node_proto, srcName, targetShape[broadcast_axes[0]]); - node_proto.set_input(0, srcName); - node_proto.set_output(0, copyLP.name); - addLayer(copyLP, node_proto); - } - node_proto.clear_input(); - for (int i = 0; i < input_names.size(); i++) - { - node_proto.add_input(input_names[i]); - } layerParams.set("axis", broadcast_axes[0]); layerParams.type = "Concat"; node_proto.set_output(0, layerParams.name); } + else if (broadcast_axes.empty()) + { + layerParams.type = "Identity"; + } else CV_Error(Error::StsNotImplemented, "Unsupported Expand op"); addLayer(layerParams, node_proto); diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index bd96729be1..7b94e02d0a 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -285,6 +285,7 @@ TEST_P(Test_ONNX_layers, Scale) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER); testONNXModels("scale"); testONNXModels("scale_broadcast", npy, 0, 0, false, true, 3); + testONNXModels("scale_broadcast_mid", npy, 0, 0, false, true, 2); } TEST_P(Test_ONNX_layers, ReduceMean3D) @@ -471,6 +472,8 @@ TEST_P(Test_ONNX_layers, MatMulAdd) TEST_P(Test_ONNX_layers, Expand) { + testONNXModels("expand"); + testONNXModels("expand_identity"); testONNXModels("expand_batch"); testONNXModels("expand_channels"); testONNXModels("expand_neg_batch"); @@ -611,6 +614,7 @@ TEST_P(Test_ONNX_layers, ReduceL2) testONNXModels("reduceL2"); testONNXModels("reduceL2_subgraph"); testONNXModels("reduceL2_subgraph_2"); + testONNXModels("reduceL2_subgraph2_2"); } TEST_P(Test_ONNX_layers, Split) @@ -624,6 +628,7 @@ TEST_P(Test_ONNX_layers, Split) testONNXModels("split_3"); testONNXModels("split_4"); testONNXModels("split_sizes"); + testONNXModels("split_neg_axis"); } TEST_P(Test_ONNX_layers, Slice) @@ -632,6 +637,7 @@ TEST_P(Test_ONNX_layers, Slice) testONNXModels("slice", npy, 0, 0, false, false); #else testONNXModels("slice"); + testONNXModels("slice_neg_starts"); testONNXModels("slice_opset_11"); #endif } From 068f33cfdf57ae5009d3e564a37a83bfefda9b22 Mon Sep 17 00:00:00 2001 From: Dale Phurrough Date: Thu, 9 Sep 2021 15:20:45 +0200 Subject: [PATCH 09/15] add nodiscard to features2d clone funcs --- .../features2d/include/opencv2/features2d.hpp | 8 ++++---- modules/python/src2/hdr_parser.py | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp index cff09170c5..bf193599e1 100644 --- a/modules/features2d/include/opencv2/features2d.hpp +++ b/modules/features2d/include/opencv2/features2d.hpp @@ -1082,7 +1082,7 @@ public: that is, copies both parameters and train data. If emptyTrainData is true, the method creates an object copy with the current parameters but with empty train data. */ - CV_WRAP virtual Ptr clone( bool emptyTrainData=false ) const = 0; + CV_WRAP CV_NODISCARD_STD virtual Ptr clone( bool emptyTrainData=false ) const = 0; /** @brief Creates a descriptor matcher of a given type with the default parameters (using default constructor). @@ -1142,7 +1142,7 @@ protected: static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx ); static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx ); - static Mat clone_op( Mat m ) { return m.clone(); } + CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); } void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const; //! Collection of descriptors from train images. @@ -1183,7 +1183,7 @@ public: */ CV_WRAP static Ptr create( int normType=NORM_L2, bool crossCheck=false ) ; - virtual Ptr clone( bool emptyTrainData=false ) const CV_OVERRIDE; + CV_NODISCARD_STD virtual Ptr clone( bool emptyTrainData=false ) const CV_OVERRIDE; protected: virtual void knnMatchImpl( InputArray queryDescriptors, std::vector >& matches, int k, InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE; @@ -1222,7 +1222,7 @@ public: CV_WRAP static Ptr create(); - virtual Ptr clone( bool emptyTrainData=false ) const CV_OVERRIDE; + CV_NODISCARD_STD virtual Ptr clone( bool emptyTrainData=false ) const CV_OVERRIDE; protected: static void convertToDMatches( const DescriptorCollection& descriptors, const Mat& indices, const Mat& distances, diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index 749a9033ee..951dfe11c3 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -432,11 +432,18 @@ class CppHeaderParser(object): # filter off some common prefixes, which are meaningless for Python wrappers. # note that we do not strip "static" prefix, which does matter; # it means class methods, not instance methods - decl_str = self.batch_replace(decl_str, [("static inline", ""), ("inline", ""), ("explicit ", ""), - ("CV_EXPORTS_W", ""), ("CV_EXPORTS", ""), ("CV_CDECL", ""), - ("CV_WRAP ", " "), ("CV_INLINE", ""), - ("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", "")]).strip() - + decl_str = self.batch_replace(decl_str, [("static inline", ""), + ("inline", ""), + ("explicit ", ""), + ("CV_EXPORTS_W", ""), + ("CV_EXPORTS", ""), + ("CV_CDECL", ""), + ("CV_WRAP ", " "), + ("CV_INLINE", ""), + ("CV_DEPRECATED", ""), + ("CV_DEPRECATED_EXTERNAL", ""), + ("CV_NODISCARD_STD", ""), + ("CV_NODISCARD", "")]).strip() if decl_str.strip().startswith('virtual'): virtual_method = True From d31b93b513df40f1549127836168a0fb17ca3cec Mon Sep 17 00:00:00 2001 From: rogday Date: Fri, 10 Sep 2021 14:07:16 +0300 Subject: [PATCH 10/15] Merge pull request #20674 from rogday:prelu_slope Fix PReLU negative slope access pattern * fix prelu negative slope access pattern * change begin() to ptr() --- modules/dnn/src/layers/elementwise_layers.cpp | 2 +- modules/dnn/test/test_onnx_importer.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index e6cf714bff..6e7fa43a70 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -1542,7 +1542,7 @@ Ptr ChannelsPReLULayer::create(const LayerParams& params) if (params.blobs[0].total() == 1) { LayerParams reluParams = params; - reluParams.set("negative_slope", params.blobs[0].at(0)); + reluParams.set("negative_slope", *params.blobs[0].ptr()); return ReLULayer::create(reluParams); } Ptr l(new ElementWiseLayer(ChannelsPReLUFunctor(params.blobs[0]))); diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 7b94e02d0a..f55510ec7b 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -250,6 +250,11 @@ TEST_P(Test_ONNX_layers, ReLU) testONNXModels("ReLU"); } +TEST_P(Test_ONNX_layers, PReLU) +{ + testONNXModels("PReLU_slope"); +} + TEST_P(Test_ONNX_layers, Clip) { testONNXModels("clip", npy); From e3f4f874c5214397a41befa6735f31382e7c39d2 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 10 Sep 2021 15:00:11 +0300 Subject: [PATCH 11/15] Merge pull request #20670 from alalek:core_ocl_fix_intel_gpu_gemm_requirements core(OpenCL): fix intel_gpu_gemm kernel requirements * core(ocl): fix intel_gpu_gemm integration - allow bailout to generic OpenCL kernel * core(ocl): avoid failures of generic OpenCL gemm kernel * core(ocl): define alignment requirements of intel_gpu_gemm kernels --- modules/core/src/intel_gpu_gemm.inl.hpp | 71 +++++++---- modules/core/src/matmul.dispatch.cpp | 159 +++++++++++++----------- modules/core/test/ocl/test_gemm.cpp | 34 ++--- 3 files changed, 156 insertions(+), 108 deletions(-) diff --git a/modules/core/src/intel_gpu_gemm.inl.hpp b/modules/core/src/intel_gpu_gemm.inl.hpp index fbd567b949..fa66856f5e 100644 --- a/modules/core/src/intel_gpu_gemm.inl.hpp +++ b/modules/core/src/intel_gpu_gemm.inl.hpp @@ -24,11 +24,6 @@ #ifdef HAVE_OPENCL -#include -#include "opencl_kernels_core.hpp" -#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" -#include "opencv2/core/opencl/runtime/opencl_core.hpp" - namespace cv { @@ -37,52 +32,79 @@ static bool intel_gpu_gemm( UMat B, Size sizeB, UMat D, Size sizeD, double alpha, double beta, - bool atrans, bool btrans) + bool atrans, bool btrans, + bool& isPropagatedC2D +) { CV_UNUSED(sizeB); int M = sizeD.height, N = sizeD.width, K = ((atrans)? sizeA.height : sizeA.width); - std::string kernelName; - bool ret = true; + if (M < 4 || N < 4 || K < 4) // vload4 + return false; - size_t lx = 8, ly = 4; - size_t dx = 4, dy = 8; + CV_LOG_VERBOSE(NULL, 0, "M=" << M << " N=" << N << " K=" << K); + + std::string kernelName; + + unsigned int lx = 8, ly = 4; + unsigned int dx = 4, dy = 8; if(!atrans && !btrans) { - if (M % 32 == 0 && N % 32 == 0 && K % 16 == 0) { kernelName = "intelblas_gemm_buffer_NN_sp"; } else { + if (M % 2 != 0) + return false; + // vload4(0, dst_write0) - 4 cols + // multiply by lx: 8 + if (N % (4*8) != 0) + return false; kernelName = "intelblas_gemm_buffer_NN"; } } else if(atrans && !btrans) { + if (M % 32 != 0) + return false; + if (N % 32 != 0) + return false; kernelName = "intelblas_gemm_buffer_TN"; } else if(!atrans && btrans) { + if (M % 128 != 0) + return false; + if (N % 8 != 0) + return false; + if (K % 512 != 0) + return false; kernelName = "intelblas_gemm_buffer_NT"; ly = 16; dx = 1; } else { + if (M % 32 != 0) + return false; + if (N % 32 != 0) + return false; + if (K % 16 != 0) + return false; kernelName = "intelblas_gemm_buffer_TT"; } - const size_t gx = (size_t)(N + dx - 1) / dx; - const size_t gy = (size_t)(M + dy - 1) / dy; + CV_LOG_DEBUG(NULL, "kernel: " << kernelName << " (M=" << M << " N=" << N << " K=" << K << ")"); + + const size_t gx = divUp((size_t)N, dx); + const size_t gy = divUp((size_t)M, dy); size_t local[] = {lx, ly, 1}; - size_t global[] = {(gx + lx - 1) / lx * lx, (gy + ly - 1) / ly * ly, 1}; - - int stride = (M * N < 1024 * 1024) ? 10000000 : 256; + size_t global[] = {roundUp(gx, lx), roundUp(gy, ly), 1}; ocl::Queue q; String errmsg; @@ -110,10 +132,13 @@ static bool intel_gpu_gemm( (int)(D.step / sizeof(float)) ); - ret = k.run(2, global, local, false, q); + bool ret = k.run(2, global, local, false, q); + return ret; } else { + int stride = (M * N < 1024 * 1024) ? 10000000 : 256; + for(int start_index = 0; start_index < K; start_index += stride) { ocl::Kernel k(kernelName.c_str(), program); @@ -132,12 +157,16 @@ static bool intel_gpu_gemm( (int) start_index, // 14 start_index stride); - ret = k.run(2, global, local, false, q); - if (!ret) return ret; + bool ret = k.run(2, global, local, false, q); + if (!ret) + { + if (start_index != 0) + isPropagatedC2D = false; // D array content is changed, need to rewrite + return false; + } } + return true; } - - return ret; } } // namespace cv diff --git a/modules/core/src/matmul.dispatch.cpp b/modules/core/src/matmul.dispatch.cpp index e81064ec16..a7447330fc 100644 --- a/modules/core/src/matmul.dispatch.cpp +++ b/modules/core/src/matmul.dispatch.cpp @@ -42,6 +42,8 @@ //M*/ #include "precomp.hpp" +#include + #include "opencl_kernels_core.hpp" #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" #include "opencv2/core/opencl/runtime/opencl_core.hpp" @@ -155,10 +157,12 @@ static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha, static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, InputArray matC, double beta, OutputArray matD, int flags ) { - int depth = matA.depth(), cn = matA.channels(); - int type = CV_MAKETYPE(depth, cn); + int type = matA.type(); + int depth = CV_MAT_DEPTH(type); + int cn = CV_MAT_CN(type); - CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); + CV_CheckTypeEQ(type, matB.type(), ""); + CV_CheckType(type, type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2, ""); const ocl::Device & dev = ocl::Device::getDefault(); bool doubleSupport = dev.doubleFPConfig() > 0; @@ -170,88 +174,103 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0); bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0; - CV_Assert( !haveC || matC.type() == type ); + if (haveC) + CV_CheckTypeEQ(type, matC.type(), ""); + + Size sizeD(((btrans) ? sizeB.height : sizeB.width), + ((atrans) ? sizeA.width : sizeA.height)); + + if (atrans) + sizeA = Size(sizeA.height, sizeA.width); + if (btrans) + sizeB = Size(sizeB.height, sizeB.width); + if (haveC && ctrans) + sizeC = Size(sizeC.height, sizeC.width); + + CV_CheckEQ(sizeA.width, sizeB.height, ""); + if (haveC) + CV_CheckEQ(sizeC, sizeD, ""); + + UMat A = matA.getUMat(); + UMat B = matB.getUMat(); - Size sizeD(((btrans)? sizeB.height : sizeB.width), - ((atrans)? sizeA.width : sizeA.height)); matD.create(sizeD, type); + UMat D = matD.getUMat(); - UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat(); + bool isPropagatedC2D = false; // D content is updated with C / C.t() - - if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1) - { - String opts; - - if (atrans) - sizeA = Size(sizeA.height, sizeA.width); - if (btrans) - sizeB = Size(sizeB.height, sizeB.width); - if (haveC && ctrans) - sizeC = Size(sizeC.height, sizeC.width); - - CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) ); - - int max_wg_size = (int)dev.maxWorkGroupSize(); - int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; - - if (atrans) - A = A.t(); - - if (btrans) - B = B.t(); - - if (haveC) - ctrans ? transpose(matC, D) : matC.copyTo(D); - - int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; - int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); - - opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s", - ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), - cn, kercn, block_size, - (sizeA.width % block_size !=0) ? " -D NO_MULT" : "", - haveC ? " -D HAVE_C" : "", - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - - ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); - if (k.empty()) - return false; - - if (depth == CV_64F) - k.args(ocl::KernelArg::ReadOnlyNoSize(A), - ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), - ocl::KernelArg::ReadWrite(D, cn, kercn), - sizeA.width, alpha, beta); - else - k.args(ocl::KernelArg::ReadOnlyNoSize(A), - ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), - ocl::KernelArg::ReadWrite(D, cn, kercn), - sizeA.width, (float)alpha, (float)beta); - - size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; - size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; - - return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false); - } - else + if (dev.intelSubgroupsSupport() && (depth == CV_32F) && cn == 1) { if (haveC && beta != 0.0) { ctrans ? transpose(matC, D) : matC.copyTo(D); + isPropagatedC2D = true; } else { beta = 0.0; } - return intel_gpu_gemm(A, sizeA, - B, sizeB, - D, sizeD, - alpha, - beta, - atrans, btrans); + bool res = intel_gpu_gemm(A, matA.size(), + B, matB.size(), + D, sizeD, + alpha, + beta, + atrans, btrans, + isPropagatedC2D); + if (res) + return true; + // fallback on generic OpenCL code } + + if (sizeD.width < 8 || sizeD.height < 8) + return false; + + String opts; + + int wg_size = (int)dev.maxWorkGroupSize(); + int sizeDmin = std::min(sizeD.width, sizeD.height); + wg_size = std::min(wg_size, sizeDmin * sizeDmin); + int block_size = (wg_size / (32*cn) < 32) ? (wg_size / (16*cn) < 16) ? (wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32; + + if (atrans) + A = A.t(); + + if (btrans) + B = B.t(); + + if (haveC && !isPropagatedC2D) + ctrans ? transpose(matC, D) : matC.copyTo(D); + + int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 }; + int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D); + + opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s", + ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)), + cn, kercn, block_size, + (sizeA.width % block_size !=0) ? " -D NO_MULT" : "", + haveC ? " -D HAVE_C" : "", + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + + ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts); + if (k.empty()) + return false; + + if (depth == CV_64F) + k.args(ocl::KernelArg::ReadOnlyNoSize(A), + ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), + ocl::KernelArg::ReadWrite(D, cn, kercn), + sizeA.width, alpha, beta); + else + k.args(ocl::KernelArg::ReadOnlyNoSize(A), + ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn), + ocl::KernelArg::ReadWrite(D, cn, kercn), + sizeA.width, (float)alpha, (float)beta); + + size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height}; + size_t localsize[2] = { (size_t)block_size, (size_t)block_size}; + + return k.run(2, globalsize, block_size !=1 ? localsize : NULL, false); } #endif diff --git a/modules/core/test/ocl/test_gemm.cpp b/modules/core/test/ocl/test_gemm.cpp index 825b506780..cb7cb0be1a 100644 --- a/modules/core/test/ocl/test_gemm.cpp +++ b/modules/core/test/ocl/test_gemm.cpp @@ -67,6 +67,8 @@ PARAM_TEST_CASE(Gemm, double alpha, beta; + int M, N, K; + TEST_DECLARE_INPUT_PARAMETER(A); TEST_DECLARE_INPUT_PARAMETER(B); TEST_DECLARE_INPUT_PARAMETER(C); @@ -90,30 +92,27 @@ PARAM_TEST_CASE(Gemm, void generateTestData() { - // set minimum size to 20, since testing less sizes doesn't make sense - Size ARoiSize = randomSize(20, MAX_VALUE); + M = (int)randomDoubleLog(1, 100); + N = (int)randomDoubleLog(1, 100); + K = (int)randomDoubleLog(1, 1200); + + M = roundUp(M, 1); + N = roundUp(N, 1); + K = roundUp(K, 1); + + Size ARoiSize = (atrans) ? Size(M, K) : Size(K, M); Border ABorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(A, A_roi, ARoiSize, ABorder, type, -11, 11); - if (atrans) - ARoiSize = Size(ARoiSize.height, ARoiSize.width); - - Size BRoiSize = randomSize(20, MAX_VALUE); - if (btrans) - BRoiSize.width = ARoiSize.width; - else - BRoiSize.height = ARoiSize.width; - + Size BRoiSize = (btrans) ? Size(K, N) : Size(N, K); Border BBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(B, B_roi, BRoiSize, BBorder, type, -11, 11); - if (btrans) - BRoiSize = Size(BRoiSize.height, BRoiSize.width); - - Size DRoiSize = Size(BRoiSize.width, ARoiSize.height), CRoiSizeT(DRoiSize.height, DRoiSize.width); + Size CRoiSize = (ctrans) ? Size(M, N) : Size(N, M); Border CBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); - randomSubMat(C, C_roi, ctrans ? CRoiSizeT : DRoiSize, CBorder, type, -11, 11); + randomSubMat(C, C_roi, CRoiSize, CBorder, type, -11, 11); + Size DRoiSize = Size(N, M); Border DBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(D, D_roi, DRoiSize, DBorder, type, -11, 11); @@ -132,11 +131,12 @@ OCL_TEST_P(Gemm, Accuracy) for (int i = 0; i < test_loop_times; ++i) { generateTestData(); + SCOPED_TRACE(cv::format("i=%d: M=%d N=%d K=%d", i, M, N, K)); OCL_OFF(cv::gemm(A_roi, B_roi, alpha, C_roi, beta, D_roi, flags)); OCL_ON(cv::gemm(uA_roi, uB_roi, alpha, uC_roi, beta, uD_roi, flags)); - double eps = D_roi.size().area() * 1e-4; + double eps = D_roi.size().area() * (1e-5 * K); OCL_EXPECT_MATS_NEAR(D, eps); } } From 9b4ecc96f63d64a07ac043ad06fa44a1fd02b18b Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 7 Sep 2021 04:39:28 +0000 Subject: [PATCH 12/15] core(ocl): buffer bounds in intelblas_gemm_buffer_NT --- modules/core/src/intel_gpu_gemm.inl.hpp | 6 +- modules/core/src/opencl/intel_gemm.cl | 180 +++++++++++------------- 2 files changed, 83 insertions(+), 103 deletions(-) diff --git a/modules/core/src/intel_gpu_gemm.inl.hpp b/modules/core/src/intel_gpu_gemm.inl.hpp index fa66856f5e..28cc4ab9b9 100644 --- a/modules/core/src/intel_gpu_gemm.inl.hpp +++ b/modules/core/src/intel_gpu_gemm.inl.hpp @@ -77,11 +77,7 @@ static bool intel_gpu_gemm( } else if(!atrans && btrans) { - if (M % 128 != 0) - return false; - if (N % 8 != 0) - return false; - if (K % 512 != 0) + if (K % 4 != 0) return false; kernelName = "intelblas_gemm_buffer_NT"; ly = 16; diff --git a/modules/core/src/opencl/intel_gemm.cl b/modules/core/src/opencl/intel_gemm.cl index 6cea8d7efd..53ae790779 100644 --- a/modules/core/src/opencl/intel_gemm.cl +++ b/modules/core/src/opencl/intel_gemm.cl @@ -392,6 +392,15 @@ __kernel void intelblas_gemm_buffer_NN( #define TILE_N 8 #define SLM_BLOCK 512 +/* + A K B.t() K D N + ----------- ----------- ----------- + | | | | | | + M | | x N | | => M | | + | | | | | | + ----------- ----------- ----------- +*/ + __attribute__((reqd_work_group_size(8, LWG_HEIGHT, 1))) __kernel void intelblas_gemm_buffer_NT( const __global float *src0, int off0, @@ -422,59 +431,79 @@ __kernel void intelblas_gemm_buffer_NT( float8 dot06 = 0.f; float8 dot07 = 0.f; - float4 brow0; - float4 brow1; - float4 brow2; - float4 brow3; - float4 brow4; - float4 brow5; - float4 brow6; - float4 brow7; + const int dst_row = (global_y * TILE_M); + __global float *dst_write0 = dst + global_x + dst_row * ldC + offd; - __global float *dst_write0 = dst + local_x * VEC_SIZE + ( group_x * TILE_N ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M) * ldC + offd; + const __global float *src0_read00 = src0 + off0; + const int a_row_base = global_y * TILE_M; + const int a_col_base = local_x * (TILE_K / 8); // <= TILE_K - 4 - const __global float *src0_read = src0 + local_x * ( TILE_K / 8 ) + ( group_y * LWG_HEIGHT * TILE_M + local_y * TILE_M ) * ldA + off0; - - const __global float *src1_read0 = src1 + ( group_x * TILE_N ) * ldB + off1; + const __global float *src1_read00 = src1 + off1; + const int b_row_base = (group_x * TILE_N); + //const int b_col_base = 0; __local float slm_brow[8 * SLM_BLOCK]; - __local float* slm_brow0; int local_index = mad24(local_y, 8, local_x) * 4; - int w; - for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) { + int w = 0; + for (int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) + { +#define UPDATE_BROW(_row) \ + { \ + float4 brow; \ + int b_row = b_row_base + _row; \ + int b_col = b_tile + local_index; \ + if (b_row < N && b_col <= K - 4 /*vload4*/) \ + brow = vload4(0, src1_read00 + mad24(b_row, ldB, b_col)); \ + else \ + brow = (float4)0; \ + vstore4(brow, 0, slm_brow + mad24(_row, SLM_BLOCK, local_index)); \ + } + barrier(CLK_LOCAL_MEM_FENCE); - vstore4(vload4(0, src1_read0 + mad24(0, ldB, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index)); - vstore4(vload4(0, src1_read0 + mad24(1, ldB, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index)); - vstore4(vload4(0, src1_read0 + mad24(2, ldB, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index)); - vstore4(vload4(0, src1_read0 + mad24(3, ldB, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index)); - vstore4(vload4(0, src1_read0 + mad24(4, ldB, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index)); - vstore4(vload4(0, src1_read0 + mad24(5, ldB, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index)); - vstore4(vload4(0, src1_read0 + mad24(6, ldB, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index)); - vstore4(vload4(0, src1_read0 + mad24(7, ldB, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index)); + UPDATE_BROW(0); + UPDATE_BROW(1); + UPDATE_BROW(2); + UPDATE_BROW(3); + UPDATE_BROW(4); + UPDATE_BROW(5); + UPDATE_BROW(6); + UPDATE_BROW(7); barrier(CLK_LOCAL_MEM_FENCE); +#undef UPDATE_BROW - slm_brow0 = slm_brow + local_x * (TILE_K / 8); - w = b_tile; - int end_w = min(b_tile + SLM_BLOCK, K); - while( w + TILE_K <= end_w ) { - float4 arow; + for (int k_tile_offset = 0; k_tile_offset < SLM_BLOCK; k_tile_offset += TILE_K) + { + int a_col = a_col_base + b_tile + k_tile_offset; - brow0 = vload4(0, slm_brow0 + 0 * SLM_BLOCK); - brow1 = vload4(0, slm_brow0 + 1 * SLM_BLOCK); - brow2 = vload4(0, slm_brow0 + 2 * SLM_BLOCK); - brow3 = vload4(0, slm_brow0 + 3 * SLM_BLOCK); - brow4 = vload4(0, slm_brow0 + 4 * SLM_BLOCK); - brow5 = vload4(0, slm_brow0 + 5 * SLM_BLOCK); - brow6 = vload4(0, slm_brow0 + 6 * SLM_BLOCK); - brow7 = vload4(0, slm_brow0 + 7 * SLM_BLOCK); + if (a_col > K - 4 /*vload4*/) + break; -#define MM_DOT_PRODUCT(_row,_dot) \ - arow = vload4(0, src0_read + _row * ldA); \ - _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \ - _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \ - _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \ - _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); + int slm_brow_col = a_col_base + k_tile_offset; // <= SLM_BLOCK - 4 +#define READ_SLM_BROW(_row) \ + float4 brow##_row = vload4(0, slm_brow + mad24(_row, SLM_BLOCK, slm_brow_col)); + + READ_SLM_BROW(0); + READ_SLM_BROW(1); + READ_SLM_BROW(2); + READ_SLM_BROW(3); + READ_SLM_BROW(4); + READ_SLM_BROW(5); + READ_SLM_BROW(6); + READ_SLM_BROW(7); +#undef READ_SLM_BROW + +#define MM_DOT_PRODUCT(_row,_dot) \ + { \ + int a_row = a_row_base + _row; \ + if (a_row < M) { \ + float4 arow = vload4(0, src0_read00 + mad24(a_row, ldA, a_col)); \ + _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \ + _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \ + _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \ + _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); \ + } \ + } MM_DOT_PRODUCT(0,dot00); MM_DOT_PRODUCT(1,dot01); @@ -485,53 +514,7 @@ __kernel void intelblas_gemm_buffer_NT( MM_DOT_PRODUCT(6,dot06); MM_DOT_PRODUCT(7,dot07); #undef MM_DOT_PRODUCT - - src0_read += TILE_K; - slm_brow0 += TILE_K; - w += TILE_K; } - src1_read0 += SLM_BLOCK; - } - - if(w < K) { - float4 arow; - -#define READ_BROW(_brow,_row) \ - _brow = vload4(0, slm_brow0 + _row * SLM_BLOCK); \ - _brow.x = (mad24(local_x, 4, w) < K) ? _brow.x : 0.0f; \ - _brow.y = (mad24(local_x, 4, w + 1) < K) ? _brow.y : 0.0f; \ - _brow.z = (mad24(local_x, 4, w + 2) < K) ? _brow.z : 0.0f; \ - _brow.w = (mad24(local_x, 4, w + 3) < K) ? _brow.w : 0.0f; - - READ_BROW(brow0,0); - READ_BROW(brow1,1); - READ_BROW(brow2,2); - READ_BROW(brow3,3); - READ_BROW(brow4,4); - READ_BROW(brow5,5); - READ_BROW(brow6,6); - READ_BROW(brow7,7); - -#define MM_DOT_PRODUCT(_row,_dot) \ - arow = vload4(0, src0_read + _row * ldA); \ - arow.x = (mad24(local_x, 4, w) < K) ? arow.x : 0.0f; \ - arow.y = (mad24(local_x, 4, w + 1) < K) ? arow.y : 0.0f; \ - arow.z = (mad24(local_x, 4, w + 2) < K) ? arow.z : 0.0f; \ - arow.w = (mad24(local_x, 4, w + 3) < K) ? arow.w : 0.0f; \ - _dot = mad( (float8)(arow.x), (float8)(brow0.x, brow1.x, brow2.x, brow3.x, brow4.x, brow5.x, brow6.x, brow7.x), _dot ); \ - _dot = mad( (float8)(arow.y), (float8)(brow0.y, brow1.y, brow2.y, brow3.y, brow4.y, brow5.y, brow6.y, brow7.y), _dot ); \ - _dot = mad( (float8)(arow.z), (float8)(brow0.z, brow1.z, brow2.z, brow3.z, brow4.z, brow5.z, brow6.z, brow7.z), _dot ); \ - _dot = mad( (float8)(arow.w), (float8)(brow0.w, brow1.w, brow2.w, brow3.w, brow4.w, brow5.w, brow6.w, brow7.w), _dot ); - - MM_DOT_PRODUCT(0,dot00); - MM_DOT_PRODUCT(1,dot01); - MM_DOT_PRODUCT(2,dot02); - MM_DOT_PRODUCT(3,dot03); - MM_DOT_PRODUCT(4,dot04); - MM_DOT_PRODUCT(5,dot05); - MM_DOT_PRODUCT(6,dot06); - MM_DOT_PRODUCT(7,dot07); -#undef MM_DOT_PRODUCT } #define REDUCE(_dot) \ @@ -572,21 +555,22 @@ __kernel void intelblas_gemm_buffer_NT( output = (local_x == 5) ? _dot.s5 : output; \ output = (local_x == 6) ? _dot.s6 : output; \ output = (local_x == 7) ? _dot.s7 : output; \ - if (beta != 0.0) \ + if (beta != 0.0f) \ dst_write0[0] = mad(output, (float)alpha, ((float)beta * dst_write0[0])); \ else \ dst_write0[0] = output * (float)alpha; \ dst_write0 += ldC; - if(global_x < N && global_y * 8 < M) { - OUTPUT(dot00); - if(mad24(global_y, 8, 1) < M) { OUTPUT(dot01); } - if(mad24(global_y, 8, 2) < M) { OUTPUT(dot02); } - if(mad24(global_y, 8, 3) < M) { OUTPUT(dot03); } - if(mad24(global_y, 8, 4) < M) { OUTPUT(dot04); } - if(mad24(global_y, 8, 5) < M) { OUTPUT(dot05); } - if(mad24(global_y, 8, 6) < M) { OUTPUT(dot06); } - if(mad24(global_y, 8, 7) < M) { OUTPUT(dot07); } + if (global_x < N && dst_row < M) + { + /*if (dst_row + 0 < M)*/ { OUTPUT(dot00); } + if (dst_row + 1 < M) { OUTPUT(dot01); } + if (dst_row + 2 < M) { OUTPUT(dot02); } + if (dst_row + 3 < M) { OUTPUT(dot03); } + if (dst_row + 4 < M) { OUTPUT(dot04); } + if (dst_row + 5 < M) { OUTPUT(dot05); } + if (dst_row + 6 < M) { OUTPUT(dot06); } + if (dst_row + 7 < M) { OUTPUT(dot07); } } #undef OUTPUT } From de1a45987964d466a13b3f9ba2e1e59b3be38cca Mon Sep 17 00:00:00 2001 From: Dale Phurrough Date: Fri, 10 Sep 2021 17:59:56 +0200 Subject: [PATCH 13/15] fix opencv/opencv#20613 * copy 4.x selectOpenCLDevice() -- it is compatible * filter platforms rather than trying only first matching * this works on 3.4 and 4.x master --- modules/core/src/ocl.cpp | 59 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index a550c1d91a..e93e3094be 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -1670,7 +1670,7 @@ static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr, split(configurationStr, ':', parts); if (parts.size() > 3) { - std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl; + CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr); return false; } if (parts.size() > 2) @@ -1687,22 +1687,20 @@ static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr, } #if defined WINRT || defined _WIN32_WCE -static cl_device_id selectOpenCLDevice() +static cl_device_id selectOpenCLDevice(const char* configuration = NULL) { + CV_UNUSED(configuration) return NULL; } #else -// std::tolower is int->int -static char char_tolower(char ch) -{ - return (char)std::tolower((int)ch); -} -static cl_device_id selectOpenCLDevice() +static cl_device_id selectOpenCLDevice(const char* configuration = NULL) { std::string platform, deviceName; std::vector deviceTypes; - const char* configuration = getenv("OPENCV_OPENCL_DEVICE"); + if (!configuration) + configuration = getenv("OPENCV_OPENCL_DEVICE"); + if (configuration && (strcmp(configuration, "disabled") == 0 || !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName) @@ -1747,22 +1745,24 @@ static cl_device_id selectOpenCLDevice() platforms.resize(numPlatforms); } - int selectedPlatform = -1; if (platform.length() > 0) { - for (size_t i = 0; i < platforms.size(); i++) + for (std::vector::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end();) { std::string name; - CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name)); + CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, *currentPlatform, CL_PLATFORM_NAME, name)); if (name.find(platform) != std::string::npos) { - selectedPlatform = (int)i; - break; + ++currentPlatform; + } + else + { + currentPlatform = platforms.erase(currentPlatform); } } - if (selectedPlatform == -1) + if (platforms.size() == 0) { - std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl; + CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform); goto not_found; } } @@ -1781,7 +1781,7 @@ static cl_device_id selectOpenCLDevice() { int deviceType = 0; std::string tempStrDeviceType = deviceTypes[t]; - std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower); + std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower); if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu") deviceType = Device::TYPE_GPU; @@ -1793,17 +1793,15 @@ static cl_device_id selectOpenCLDevice() deviceType = Device::TYPE_ALL; else { - std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl; + CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]); goto not_found; } - std::vector devices; // TODO Use clReleaseDevice to cleanup - for (int i = selectedPlatform >= 0 ? selectedPlatform : 0; - (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size()); - i++) + std::vector devices; + for (std::vector::iterator currentPlatform = platforms.begin(); currentPlatform != platforms.end(); ++currentPlatform) { cl_uint count = 0; - cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count); + cl_int status = clGetDeviceIDs(*currentPlatform, deviceType, 0, NULL, &count); if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND)) { CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count"); @@ -1812,7 +1810,7 @@ static cl_device_id selectOpenCLDevice() continue; size_t base = devices.size(); devices.resize(base + count); - status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count); + status = clGetDeviceIDs(*currentPlatform, deviceType, count, &devices[base], &count); if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND)) { CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs"); @@ -1844,13 +1842,16 @@ not_found: if (!configuration) return NULL; // suppress messages on stderr - std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl - << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl - << " Device types: "; + std::ostringstream msg; + msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl + << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl + << " Device types:"; for (size_t t = 0; t < deviceTypes.size(); t++) - std::cerr << deviceTypes[t] << " "; + msg << ' ' << deviceTypes[t]; - std::cerr << std::endl << " Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl; + msg << std::endl << " Device name: " << (deviceName.length() == 0 ? "any" : deviceName); + + CV_LOG_ERROR(NULL, msg.str()); return NULL; } #endif From aa7ba0bc1a99357beed7f4624de353f55ceccb20 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 10 Sep 2021 20:58:37 +0000 Subject: [PATCH 14/15] build: winpack_dldt with dldt 2021.4.1 --- cmake/OpenCVDetectInferenceEngine.cmake | 4 +- ...-dldt-disable-multidevice-autoplugin.patch | 16 ++ ...20210630-dldt-disable-unused-targets.patch | 219 ++++++++++++++++++ .../2021.4.1/20210630-dldt-pdb.patch | 15 ++ .../2021.4.1/20210630-dldt-vs-version.patch | 16 ++ .../winpack_dldt/2021.4.1/build.config.py | 1 + .../winpack_dldt/2021.4.1/patch.config.py | 4 + .../winpack_dldt/2021.4.1/sysroot.config.py | 56 +++++ platforms/winpack_dldt/build_package.py | 5 +- 9 files changed, 332 insertions(+), 4 deletions(-) create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch create mode 100644 platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch create mode 100644 platforms/winpack_dldt/2021.4.1/build.config.py create mode 100644 platforms/winpack_dldt/2021.4.1/patch.config.py create mode 100644 platforms/winpack_dldt/2021.4.1/sysroot.config.py diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake index 6308d1b424..41951b710a 100644 --- a/cmake/OpenCVDetectInferenceEngine.cmake +++ b/cmake/OpenCVDetectInferenceEngine.cmake @@ -147,8 +147,8 @@ if(INF_ENGINE_TARGET) endif() endif() if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT) - message(WARNING "InferenceEngine version has not been set, 2021.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.") - set(INF_ENGINE_RELEASE_INIT "2021040000") + message(WARNING "InferenceEngine version has not been set, 2021.4.1 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.") + set(INF_ENGINE_RELEASE_INIT "2021040100") elseif(DEFINED INF_ENGINE_RELEASE) set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}") endif() diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch new file mode 100644 index 0000000000..f1e7487442 --- /dev/null +++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-multidevice-autoplugin.patch @@ -0,0 +1,16 @@ +diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt +index 0ba0dd78..7d34e7cb 100644 +--- a/inference-engine/src/CMakeLists.txt ++++ b/inference-engine/src/CMakeLists.txt +@@ -26,9 +26,9 @@ endif() + + add_subdirectory(hetero_plugin) + +-add_subdirectory(auto_plugin) ++#add_subdirectory(auto_plugin) + +-add_subdirectory(multi_device) ++#add_subdirectory(multi_device) + + add_subdirectory(transformations) + diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch new file mode 100644 index 0000000000..9d44cdadc6 --- /dev/null +++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-disable-unused-targets.patch @@ -0,0 +1,219 @@ +diff --git a/cmake/developer_package/add_ie_target.cmake b/cmake/developer_package/add_ie_target.cmake +index d49f16a4d..2726ca787 100644 +--- a/cmake/developer_package/add_ie_target.cmake ++++ b/cmake/developer_package/add_ie_target.cmake +@@ -92,7 +92,7 @@ function(addIeTarget) + if (ARG_TYPE STREQUAL EXECUTABLE) + add_executable(${ARG_NAME} ${all_sources}) + elseif(ARG_TYPE STREQUAL STATIC OR ARG_TYPE STREQUAL SHARED) +- add_library(${ARG_NAME} ${ARG_TYPE} ${all_sources}) ++ add_library(${ARG_NAME} ${ARG_TYPE} EXCLUDE_FROM_ALL ${all_sources}) + else() + message(SEND_ERROR "Invalid target type ${ARG_TYPE} specified for target name ${ARG_NAME}") + endif() +diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt +index 1ac7fd8bf..df7091e51 100644 +--- a/inference-engine/CMakeLists.txt ++++ b/inference-engine/CMakeLists.txt +@@ -39,7 +39,7 @@ if(ENABLE_TESTS) + add_subdirectory(tests) + endif() + +-add_subdirectory(tools) ++#add_subdirectory(tools) + + function(ie_build_samples) + # samples should be build with the same flags as from OpenVINO package, +@@ -58,7 +58,7 @@ endfunction() + + # gflags and format_reader targets are kept inside of samples directory and + # they must be built even if samples build is disabled (required for tests and tools). +-ie_build_samples() ++#ie_build_samples() + + if(ENABLE_PYTHON) + add_subdirectory(ie_bridges/python) +@@ -142,7 +142,7 @@ endif() + # Developer package + # + +-openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils) ++#openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils) + + # for Template plugin + if(NGRAPH_INTERPRETER_ENABLE) +@@ -166,7 +166,7 @@ function(ie_generate_dev_package_config) + @ONLY) + endfunction() + +-ie_generate_dev_package_config() ++#ie_generate_dev_package_config() + + # + # Coverage +diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt +index e8ed1a5c4..1fc9fc3ff 100644 +--- a/inference-engine/src/inference_engine/CMakeLists.txt ++++ b/inference-engine/src/inference_engine/CMakeLists.txt +@@ -110,7 +110,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr + + # Create object library + +-add_library(${TARGET_NAME}_obj OBJECT ++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL + ${LIBRARY_SRC} + ${LIBRARY_HEADERS} + ${PUBLIC_HEADERS}) +@@ -181,7 +181,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + + # Static library used for unit tests which are always built + +-add_library(${TARGET_NAME}_s STATIC ++add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL + $ + $ + ${IE_STATIC_DEPENDENT_FILES}) +diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt +index 8eae82bd2..e0e6745b1 100644 +--- a/inference-engine/src/legacy_api/CMakeLists.txt ++++ b/inference-engine/src/legacy_api/CMakeLists.txt +@@ -26,7 +26,7 @@ endif() + + file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + +-add_library(${TARGET_NAME}_obj OBJECT ++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL + ${LIBRARY_SRC} + ${PUBLIC_HEADERS}) + +diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt +index fe57b29dd..07831e2fb 100644 +--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt ++++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt +@@ -67,7 +67,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + + # add test object library + +-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS}) ++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS}) + target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn) + + target_include_directories(${TARGET_NAME}_obj PRIVATE $ +diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt +index f9548339d..ef962145a 100644 +--- a/inference-engine/src/preprocessing/CMakeLists.txt ++++ b/inference-engine/src/preprocessing/CMakeLists.txt +@@ -101,7 +101,7 @@ endif() + + # Create object library + +-add_library(${TARGET_NAME}_obj OBJECT ++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL + ${LIBRARY_SRC} + ${LIBRARY_HEADERS}) + +@@ -153,7 +153,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + + # Static library used for unit tests which are always built + +-add_library(${TARGET_NAME}_s STATIC ++add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL + $) + + set_ie_threading_interface_for(${TARGET_NAME}_s) +diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt +index 249e47c28..4ddf63049 100644 +--- a/inference-engine/src/vpu/common/CMakeLists.txt ++++ b/inference-engine/src/vpu/common/CMakeLists.txt +@@ -5,7 +5,7 @@ + file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h) + + function(add_common_target TARGET_NAME STATIC_IE) +- add_library(${TARGET_NAME} STATIC ${SOURCES}) ++ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES}) + + ie_faster_build(${TARGET_NAME} + UNITY +@@ -60,7 +60,7 @@ add_common_target("vpu_common_lib" FALSE) + + # Unit tests support for graph transformer + if(WIN32) +- add_common_target("vpu_common_lib_test_static" TRUE) ++ #add_common_target("vpu_common_lib_test_static" TRUE) + else() + add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib") + endif() +diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt +index bc73ab5b1..b4c1547fc 100644 +--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt ++++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt +@@ -5,7 +5,7 @@ + file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h *.inc) + + function(add_graph_transformer_target TARGET_NAME STATIC_IE) +- add_library(${TARGET_NAME} STATIC ${SOURCES}) ++ add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES}) + + set_ie_threading_interface_for(${TARGET_NAME}) + +@@ -70,7 +70,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE) + + # Unit tests support for graph transformer + if(WIN32) +- add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE) ++ #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE) + else() + add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer") + endif() +diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt +index 8bcb2801a..f7e031c01 100644 +--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt ++++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt +@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS) + else() + add_library(pugixml STATIC ${SOURCES}) + if (MSVC) +- add_library(pugixml_mt STATIC ${SOURCES}) ++ #add_library(pugixml_mt STATIC ${SOURCES}) + #if (WIN32) + # set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + # set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") +diff --git a/ngraph/core/builder/CMakeLists.txt b/ngraph/core/builder/CMakeLists.txt +index ff5c381e7..2797ec9ab 100644 +--- a/ngraph/core/builder/CMakeLists.txt ++++ b/ngraph/core/builder/CMakeLists.txt +@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC}) + source_group("include" FILES ${PUBLIC_HEADERS}) + + # Create shared library +-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS}) ++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS}) + + if(COMMAND ie_faster_build) + ie_faster_build(${TARGET_NAME} +diff --git a/ngraph/core/reference/CMakeLists.txt b/ngraph/core/reference/CMakeLists.txt +index ef4a764ab..f6d3172e2 100644 +--- a/ngraph/core/reference/CMakeLists.txt ++++ b/ngraph/core/reference/CMakeLists.txt +@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC}) + source_group("include" FILES ${PUBLIC_HEADERS}) + + # Create shared library +-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS}) ++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS}) + + if(COMMAND ie_faster_build) + ie_faster_build(${TARGET_NAME} +diff --git a/openvino/itt/CMakeLists.txt b/openvino/itt/CMakeLists.txt +index e9f880b8c..c63f4df63 100644 +--- a/openvino/itt/CMakeLists.txt ++++ b/openvino/itt/CMakeLists.txt +@@ -6,7 +6,7 @@ set(TARGET_NAME itt) + + file(GLOB_RECURSE SOURCES "src/*.cpp" "src/*.hpp") + +-add_library(${TARGET_NAME} STATIC ${SOURCES}) ++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES}) + + add_library(openvino::itt ALIAS ${TARGET_NAME}) + diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch new file mode 100644 index 0000000000..65e6f84dc8 --- /dev/null +++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-pdb.patch @@ -0,0 +1,15 @@ +iff --git a/CMakeLists.txt b/CMakeLists.txt +index e0706a72e..9a053b1e4 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -6,6 +6,10 @@ cmake_minimum_required(VERSION 3.13) + + project(OpenVINO) + ++set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS") ++set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") ++set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") ++ + set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine) + diff --git a/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch b/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch new file mode 100644 index 0000000000..36b0068775 --- /dev/null +++ b/platforms/winpack_dldt/2021.4.1/20210630-dldt-vs-version.patch @@ -0,0 +1,16 @@ +diff --git a/cmake/developer_package/vs_version/vs_version.cmake b/cmake/developer_package/vs_version/vs_version.cmake +index 14d4c0e1e..6a44f73b9 100644 +--- a/cmake/developer_package/vs_version/vs_version.cmake ++++ b/cmake/developer_package/vs_version/vs_version.cmake +@@ -8,9 +8,9 @@ set(IE_VS_VER_FILEVERSION_STR "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERS + + set(IE_VS_VER_COMPANY_NAME_STR "Intel Corporation") + set(IE_VS_VER_PRODUCTVERSION_STR "${CI_BUILD_NUMBER}") +-set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit") ++set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit (for OpenCV Windows package)") + set(IE_VS_VER_COPYRIGHT_STR "Copyright (C) 2018-2021, Intel Corporation") +-set(IE_VS_VER_COMMENTS_STR "https://docs.openvinotoolkit.org/") ++set(IE_VS_VER_COMMENTS_STR "https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend") + + # + # ie_add_vs_version_file(NAME diff --git a/platforms/winpack_dldt/2021.4.1/build.config.py b/platforms/winpack_dldt/2021.4.1/build.config.py new file mode 100644 index 0000000000..a643c17928 --- /dev/null +++ b/platforms/winpack_dldt/2021.4.1/build.config.py @@ -0,0 +1 @@ +os.environ['CI_BUILD_NUMBER'] = '2021.4.1-opencv_winpack_dldt' diff --git a/platforms/winpack_dldt/2021.4.1/patch.config.py b/platforms/winpack_dldt/2021.4.1/patch.config.py new file mode 100644 index 0000000000..7f8715aae2 --- /dev/null +++ b/platforms/winpack_dldt/2021.4.1/patch.config.py @@ -0,0 +1,4 @@ +applyPatch('20210630-dldt-disable-unused-targets.patch') +applyPatch('20210630-dldt-pdb.patch') +applyPatch('20210630-dldt-disable-multidevice-autoplugin.patch') +applyPatch('20210630-dldt-vs-version.patch') diff --git a/platforms/winpack_dldt/2021.4.1/sysroot.config.py b/platforms/winpack_dldt/2021.4.1/sysroot.config.py new file mode 100644 index 0000000000..fa4281107d --- /dev/null +++ b/platforms/winpack_dldt/2021.4.1/sysroot.config.py @@ -0,0 +1,56 @@ +sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin') +copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph') +#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll') + +build_config = 'Release' if not self.config.build_debug else 'Debug' +build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config + +def copy_bin(name): + global build_bin_dir, sysroot_bin_dir + copytree(build_bin_dir / name, sysroot_bin_dir / name) + +dll_suffix = 'd' if self.config.build_debug else '' +def copy_dll(name): + global copy_bin, dll_suffix + copy_bin(name + dll_suffix + '.dll') + copy_bin(name + dll_suffix + '.pdb') + +copy_bin('cache.json') +copy_dll('clDNNPlugin') +copy_dll('HeteroPlugin') +copy_dll('inference_engine') +copy_dll('inference_engine_ir_reader') +#copy_dll('inference_engine_ir_v7_reader') +copy_dll('inference_engine_legacy') +copy_dll('inference_engine_transformations') # runtime +copy_dll('inference_engine_lp_transformations') # runtime +#copy_dll('inference_engine_preproc') # runtime +copy_dll('MKLDNNPlugin') # runtime +copy_dll('myriadPlugin') # runtime +#copy_dll('MultiDevicePlugin') # runtime, not used +copy_dll('ngraph') +copy_bin('plugins.xml') +copy_bin('pcie-ma2x8x.elf') +copy_bin('usb-ma2x8x.mvcmd') + +copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir) +copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb') + +sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine') +sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64') + +copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include') +if not self.config.build_debug: + copytree(build_bin_dir / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib') + copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib') + copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib') + copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib') +else: + copytree(build_bin_dir / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib') + copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib') + copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib') + copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib') + +sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses') +copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE') +copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE') diff --git a/platforms/winpack_dldt/build_package.py b/platforms/winpack_dldt/build_package.py index bd4355e1cd..0194323930 100644 --- a/platforms/winpack_dldt/build_package.py +++ b/platforms/winpack_dldt/build_package.py @@ -469,7 +469,8 @@ class Builder: def main(): dldt_src_url = 'https://github.com/openvinotoolkit/openvino' - dldt_src_commit = '2021.4' + dldt_src_commit = '2021.4.1' + dldt_config = None dldt_release = None build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache') @@ -503,7 +504,7 @@ def main(): parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)') parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)') - parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)') + parser.add_argument('--dldt_config', default=dldt_config, help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)') parser.add_argument('--override_patch_hashsum', default='', help='(script debug mode)') From 51b03b87e6f9827cbee3feb6dd6dc385c4b77297 Mon Sep 17 00:00:00 2001 From: Zihao Mu Date: Fri, 10 Sep 2021 18:15:22 +0800 Subject: [PATCH 15/15] BiasAdd could load Const from second place. --- modules/dnn/src/tensorflow/tf_importer.cpp | 7 ++++++- modules/dnn/test/test_tf_importer.cpp | 12 ++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index ca9d7c5e21..521c8ce4c3 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -914,7 +914,12 @@ void TFImporter::parseBias(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer_id[name] = id; // one input only - connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); + Pin inp0 = parsePin(layer.input(0)); + if (layer_id.find(inp0.name) != layer_id.end()) + // First operand is a constant. + connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0); + else + connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0); } else { diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 1a2b976eb8..cdf3794bf1 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -559,6 +559,18 @@ TEST_P(Test_TensorFlow_layers, l2_normalize) runTensorFlowNet("l2_normalize"); } +TEST_P(Test_TensorFlow_layers, BiasAdd) +{ +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) + if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD + && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X + ) + applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION); +#endif + + runTensorFlowNet("bias_add_1"); +} + // TODO: fix it and add to l2_normalize TEST_P(Test_TensorFlow_layers, l2_normalize_3d) {