diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 065c0c2566..5f75effeb6 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -45,6 +45,7 @@ #include #include #include "../nms.inl.hpp" +#include "opencl_kernels_dnn.hpp" namespace cv { @@ -211,11 +212,160 @@ public: return false; } +#ifdef HAVE_OPENCL + // Decode all bboxes in a batch + bool ocl_DecodeBBoxesAll(UMat& loc_mat, UMat& prior_mat, + const int num, const int numPriors, const bool share_location, + const int num_loc_classes, const int background_label_id, + const cv::String& code_type, const bool variance_encoded_in_target, + const bool clip, std::vector& all_decode_bboxes) + { + UMat outmat = UMat(loc_mat.dims, loc_mat.size, CV_32F); + size_t nthreads = loc_mat.total(); + String kernel_name; + + if (code_type == "CORNER") + kernel_name = "DecodeBBoxesCORNER"; + else if (code_type == "CENTER_SIZE") + kernel_name = "DecodeBBoxesCENTER_SIZE"; + else + return false; + + for (int i = 0; i < num; ++i) + { + ocl::Kernel kernel(kernel_name.c_str(), ocl::dnn::detection_output_oclsrc); + kernel.set(0, (int)nthreads); + kernel.set(1, ocl::KernelArg::PtrReadOnly(loc_mat)); + kernel.set(2, ocl::KernelArg::PtrReadOnly(prior_mat)); + kernel.set(3, (int)variance_encoded_in_target); + kernel.set(4, (int)numPriors); + kernel.set(5, (int)share_location); + kernel.set(6, (int)num_loc_classes); + kernel.set(7, (int)background_label_id); + kernel.set(8, (int)clip); + kernel.set(9, ocl::KernelArg::PtrWriteOnly(outmat)); + + if (!kernel.run(1, &nthreads, NULL, false)) + return false; + } + + all_decode_bboxes.clear(); + all_decode_bboxes.resize(num); + { + Mat mat = outmat.getMat(ACCESS_READ); + const float* decode_data = mat.ptr(); + for (int i = 0; i < num; ++i) + { + LabelBBox& decode_bboxes = all_decode_bboxes[i]; + for (int c = 0; c < num_loc_classes; ++c) + { + int label = share_location ? -1 : c; + decode_bboxes[label].resize(numPriors); + for (int p = 0; p < numPriors; ++p) + { + int startIdx = p * num_loc_classes * 4; + util::NormalizedBBox& bbox = decode_bboxes[label][p]; + bbox.xmin = decode_data[startIdx + c * 4]; + bbox.ymin = decode_data[startIdx + c * 4 + 1]; + bbox.xmax = decode_data[startIdx + c * 4 + 2]; + bbox.ymax = decode_data[startIdx + c * 4 + 3]; + } + } + } + } + return true; + } + + void ocl_GetConfidenceScores(const UMat& inp1, const int num, + const int numPredsPerClass, const int numClasses, + std::vector& confPreds) + { + int shape[] = { numClasses, numPredsPerClass }; + for (int i = 0; i < num; i++) + confPreds.push_back(Mat(2, shape, CV_32F)); + + UMat umat = inp1.reshape(1, num * numPredsPerClass); + for (int i = 0; i < num; ++i) + { + Range ranges[] = { Range(i * numPredsPerClass, (i + 1) * numPredsPerClass), Range::all() }; + transpose(umat(ranges), confPreds[i]); + } + } + + bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) + { + std::vector inputs; + std::vector outputs; + + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + + std::vector allDecodedBBoxes; + std::vector allConfidenceScores; + + int num = inputs[0].size[0]; + + // extract predictions from input layers + { + int numPriors = inputs[2].size[2] / 4; + + // Retrieve all confidences + ocl_GetConfidenceScores(inputs[1], num, numPriors, _numClasses, allConfidenceScores); + + // Decode all loc predictions to bboxes + bool ret = ocl_DecodeBBoxesAll(inputs[0], inputs[2], num, numPriors, + _shareLocation, _numLocClasses, _backgroundLabelId, + _codeType, _varianceEncodedInTarget, false, + allDecodedBBoxes); + if (!ret) + return false; + } + + size_t numKept = 0; + std::vector > > allIndices; + for (int i = 0; i < num; ++i) + { + numKept += processDetections_(allDecodedBBoxes[i], allConfidenceScores[i], allIndices); + } + + if (numKept == 0) + { + // Set confidences to zeros. + Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)}; + outputs[0](ranges).setTo(0); + return true; + } + int outputShape[] = {1, 1, (int)numKept, 7}; + UMat umat = UMat(4, outputShape, CV_32F); + { + Mat mat = umat.getMat(ACCESS_WRITE); + float* outputsData = mat.ptr(); + + size_t count = 0; + for (int i = 0; i < num; ++i) + { + count += outputDetections_(i, &outputsData[count * 7], + allDecodedBBoxes[i], allConfidenceScores[i], + allIndices[i]); + } + CV_Assert(count == numKept); + } + outputs.clear(); + outputs.push_back(umat); + outs.assign(outputs); + return true; + } +#endif + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr); } @@ -225,7 +375,7 @@ public: CV_TRACE_ARG_VALUE(name, "name", name.c_str()); std::vector allDecodedBBoxes; - std::vector > > allConfidenceScores; + std::vector allConfidenceScores; int num = inputs[0]->size[0]; @@ -286,7 +436,7 @@ public: size_t outputDetections_( const int i, float* outputsData, - const LabelBBox& decodeBBoxes, const std::vector >& confidenceScores, + const LabelBBox& decodeBBoxes, Mat& confidenceScores, const std::map >& indicesMap ) { @@ -294,9 +444,9 @@ public: for (std::map >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it) { int label = it->first; - if (confidenceScores.size() <= label) + if (confidenceScores.rows <= label) CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label)); - const std::vector& scores = confidenceScores[label]; + const std::vector& scores = confidenceScores.row(label); int locLabel = _shareLocation ? -1 : label; LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(locLabel); if (label_bboxes == decodeBBoxes.end()) @@ -320,7 +470,7 @@ public: } size_t processDetections_( - const LabelBBox& decodeBBoxes, const std::vector >& confidenceScores, + const LabelBBox& decodeBBoxes, Mat& confidenceScores, std::vector > >& allIndices ) { @@ -330,10 +480,10 @@ public: { if (c == _backgroundLabelId) continue; // Ignore background class. - if (c >= confidenceScores.size()) + if (c >= confidenceScores.rows) CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find confidence predictions for label %d", c)); - const std::vector& scores = confidenceScores[c]; + const std::vector scores = confidenceScores.row(c); int label = _shareLocation ? -1 : c; LabelBBox::const_iterator label_bboxes = decodeBBoxes.find(label); @@ -351,9 +501,9 @@ public: { int label = it->first; const std::vector& labelIndices = it->second; - if (label >= confidenceScores.size()) + if (label >= confidenceScores.rows) CV_ErrorNoReturn_(cv::Error::StsError, ("Could not find location predictions for label %d", label)); - const std::vector& scores = confidenceScores[label]; + const std::vector& scores = confidenceScores.row(label); for (size_t j = 0; j < labelIndices.size(); ++j) { size_t idx = labelIndices[j]; @@ -630,20 +780,20 @@ public: // confidence prediction for an image. static void GetConfidenceScores(const float* confData, const int num, const int numPredsPerClass, const int numClasses, - std::vector > >& confPreds) + std::vector& confPreds) { - confPreds.clear(); confPreds.resize(num); + int shape[] = { numClasses, numPredsPerClass }; + for (int i = 0; i < num; i++) + confPreds.push_back(Mat(2, shape, CV_32F)); + for (int i = 0; i < num; ++i, confData += numPredsPerClass * numClasses) { - std::vector >& labelScores = confPreds[i]; - labelScores.resize(numClasses); + Mat labelScores = confPreds[i]; for (int c = 0; c < numClasses; ++c) { - std::vector& classLabelScores = labelScores[c]; - classLabelScores.resize(numPredsPerClass); for (int p = 0; p < numPredsPerClass; ++p) { - classLabelScores[p] = confData[p * numClasses + c]; + labelScores.at(c, p) = confData[p * numClasses + c]; } } } diff --git a/modules/dnn/src/opencl/detection_output.cl b/modules/dnn/src/opencl/detection_output.cl new file mode 100644 index 0000000000..f5932cc82a --- /dev/null +++ b/modules/dnn/src/opencl/detection_output.cl @@ -0,0 +1,181 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define Dtype float +#define Dtype4 float4 + +__kernel void DecodeBBoxesCORNER(const int nthreads, + __global const Dtype* loc_data, + __global const Dtype* prior_data, + const int variance_encoded_in_target, + const int num_priors, + const int share_location, + const int num_loc_classes, + const int background_label_id, + const int clip_bbox, + __global Dtype* bbox_data) +{ + for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) + { + Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax; + const int i = index % 4; + const int p = ((index / 4 / num_loc_classes) % num_priors) * 4; + + const int c = (index / 4) % num_loc_classes; + int label = share_location ? -1 : c; + if (label == background_label_id) + return; // Ignore background class. + + Dtype4 loc_vec = vload4(0, loc_data + index - i); + Dtype4 bbox_vec, prior_variance; + if (variance_encoded_in_target) + { + bbox_vec = loc_vec; + } else { + const int start_index = num_priors * 4 + p; + prior_variance = vload4(0, prior_data + start_index); + bbox_vec = loc_vec * prior_variance; + } + + bbox_xmin = bbox_vec.x; + bbox_ymin = bbox_vec.y; + bbox_xmax = bbox_vec.z; + bbox_ymax = bbox_vec.w; + + Dtype4 prior_vec = vload4(0, prior_data + p); + Dtype val; + switch (i) + { + case 0: + val = prior_vec.x + bbox_xmin; + break; + case 1: + val = prior_vec.y + bbox_ymin; + break; + case 2: + val = prior_vec.z + bbox_xmax; + break; + case 3: + val = prior_vec.w + bbox_ymax; + break; + } + + if (clip_bbox) + val = max(min(val, (Dtype)1.), (Dtype)0.); + + bbox_data[index] = val; + } +} + +__kernel void DecodeBBoxesCENTER_SIZE(const int nthreads, + __global const Dtype* loc_data, + __global const Dtype* prior_data, + const int variance_encoded_in_target, + const int num_priors, + const int share_location, + const int num_loc_classes, + const int background_label_id, + const int clip_bbox, + __global Dtype* bbox_data) +{ + for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) + { + Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax; + const int i = index % 4; + const int p = ((index / 4 / num_loc_classes) % num_priors) * 4; + + const int c = (index / 4) % num_loc_classes; + int label = share_location ? -1 : c; + if (label == background_label_id) + return; // Ignore background class. + + Dtype4 loc_vec = vload4(0, loc_data + index - i); + Dtype4 bbox_vec, prior_variance; + if (variance_encoded_in_target) + { + bbox_vec = loc_vec; + } else { + const int start_index = num_priors * 4 + p; + prior_variance = vload4(0, prior_data + start_index); + bbox_vec = loc_vec * prior_variance; + } + + bbox_xmin = bbox_vec.x; + bbox_ymin = bbox_vec.y; + bbox_xmax = bbox_vec.z; + bbox_ymax = bbox_vec.w; + + Dtype4 prior_vec = vload4(0, prior_data + p); + Dtype prior_width = prior_vec.z - prior_vec.x; + Dtype prior_height = prior_vec.w - prior_vec.y; + Dtype prior_center_x = (prior_vec.x + prior_vec.z) * .5; + Dtype prior_center_y = (prior_vec.y + prior_vec.w) * .5; + + Dtype decode_bbox_center_x, decode_bbox_center_y; + Dtype decode_bbox_width, decode_bbox_height; + decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x; + decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y; + decode_bbox_width = exp(bbox_xmax) * prior_width; + decode_bbox_height = exp(bbox_ymax) * prior_height; + + Dtype val; + switch (i) + { + case 0: + val = decode_bbox_center_x - decode_bbox_width * .5; + break; + case 1: + val = decode_bbox_center_y - decode_bbox_height * .5; + break; + case 2: + val = decode_bbox_center_x + decode_bbox_width * .5; + break; + case 3: + val = decode_bbox_center_y + decode_bbox_height * .5; + break; + } + + if (clip_bbox) + val = max(min(val, (Dtype)1.), (Dtype)0.); + + bbox_data[index] = val; + } +}