From ba5e8befa95a72f720cf2d03c1fe04aa222f2847 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Thu, 26 Apr 2018 19:36:19 +0800 Subject: [PATCH] fp16 ocl support for more layers Signed-off-by: Li Peng --- modules/dnn/src/layers/batch_norm_layer.cpp | 13 ++++- modules/dnn/src/layers/blank_layer.cpp | 2 +- .../dnn/src/layers/detection_output_layer.cpp | 50 ++++++++++++++++--- .../dnn/src/layers/normalize_bbox_layer.cpp | 5 +- modules/dnn/src/layers/permute_layer.cpp | 6 ++- modules/dnn/src/layers/prior_box_layer.cpp | 15 ++++-- modules/dnn/src/layers/proposal_layer.cpp | 5 +- modules/dnn/src/layers/region_layer.cpp | 4 +- modules/dnn/src/layers/reorg_layer.cpp | 5 +- modules/dnn/src/layers/reshape_layer.cpp | 2 +- modules/dnn/src/opencl/batchnorm.cl | 29 ++++++----- modules/dnn/src/opencl/permute.cl | 4 +- modules/dnn/src/opencl/prior_box.cl | 27 +++++----- modules/dnn/src/opencl/reorg.cl | 4 ++ 14 files changed, 121 insertions(+), 50 deletions(-) diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index c2906b63f5..2005254cd3 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -120,12 +120,16 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inputs_.depth() == CV_16S); inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); CV_Assert(blobs.size() >= 2); CV_Assert(inputs.size() == 1); + if (use_half && inputs[0].dims == 2) + return false; + if (umat_weight.empty()) { umat_weight = weights_.getUMat(ACCESS_READ); @@ -139,6 +143,7 @@ public: int rows = inpBlob.dims > 2 ? inpBlob.size[2] : 1; int cols = inpBlob.dims > 2 ? inpBlob.size[3] : 1; + String opts = (use_half) ? " -DDtype=half" : " -DDtype=float"; for (size_t ii = 0; ii < outputs.size(); ii++) { if (inpBlob.dims == 2) @@ -154,8 +159,12 @@ public: UMat src = inputs[ii].reshape(1, s.size(), &s[0]); UMat dst = outputs[ii].reshape(1, s.size(), &s[0]); int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1); - String buildopt = format("-DNUM=%d", number); + String buildopt = format("-DNUM=%d", number) + opts; String kname = format("batch_norm%d", number); + if (number == 1) + buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float"); + else + buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number); ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt); if (kernel.empty()) return false; @@ -181,7 +190,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 0794eff9af..847b6228df 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -95,7 +95,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 5c4be180a1..44f7b32853 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -307,8 +307,24 @@ public: std::vector inputs; std::vector outputs; - inps.getUMatVector(inputs); - outs.getUMatVector(outputs); + bool use_half = (inps.depth() == CV_16S); + if (use_half) + { + std::vector orig_inputs; + std::vector orig_outputs; + + inps.getUMatVector(orig_inputs); + outs.getUMatVector(orig_outputs); + + inputs.resize(orig_inputs.size()); + for (size_t i = 0; i < orig_inputs.size(); i++) + convertFp16(orig_inputs[i], inputs[i]); + } + else + { + inps.getUMatVector(inputs); + outs.getUMatVector(outputs); + } std::vector allDecodedBBoxes; std::vector allConfidenceScores; @@ -342,7 +358,13 @@ public: { // Set confidences to zeros. Range ranges[] = {Range::all(), Range::all(), Range::all(), Range(2, 3)}; - outputs[0](ranges).setTo(0); + if (use_half) + { + std::vector orig_outputs; + outs.getUMatVector(orig_outputs); + orig_outputs[0](ranges).setTo(0); + } else + outputs[0](ranges).setTo(0); return true; } int outputShape[] = {1, 1, (int)numKept, 7}; @@ -360,9 +382,23 @@ public: } CV_Assert(count == numKept); } - outputs.clear(); - outputs.push_back(umat); - outs.assign(outputs); + + if (use_half) + { + UMat half_umat; + convertFp16(umat, half_umat); + + std::vector orig_outputs; + outs.getUMatVector(orig_outputs); + orig_outputs.clear(); + orig_outputs.push_back(half_umat); + outs.assign(orig_outputs); + } else { + outputs.clear(); + outputs.push_back(umat); + outs.assign(outputs); + } + return true; } #endif @@ -372,7 +408,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index 5e8ed65157..e2fc2c9b27 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -87,6 +87,9 @@ public: std::vector outputs; std::vector internals; + if (inputs_.depth() == CV_16S) + return false; + inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); internals_.getUMatVector(internals); @@ -162,7 +165,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 85ce8837bc..d4f756ced5 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -288,9 +288,11 @@ public: if (!_needsPermute) return false; + bool use_half = (inps.depth() == CV_16S); + String opts = format("-DDtype=%s", use_half ? "half" : "float"); for (size_t i = 0; i < inputs.size(); i++) { - ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc); + ocl::Kernel kernel("permute", ocl::dnn::permute_oclsrc, opts); kernel.set(0, (int)_count); kernel.set(1, ocl::KernelArg::PtrReadOnly(inputs[i])); @@ -313,7 +315,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index 81a7392376..b854c2602a 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -316,6 +316,7 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); @@ -340,9 +341,15 @@ public: heights.copyTo(umat_heights); } - size_t nthreads = _layerHeight * _layerWidth; + String opts; + if (use_half) + opts = "-DDtype=half -DDtype4=half4 -Dconvert_T=convert_half4"; + else + opts = "-DDtype=float -DDtype4=float4 -Dconvert_T=convert_float4"; + + size_t nthreads = _layerHeight * _layerWidth; + ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc, opts); - ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc); kernel.set(0, (int)nthreads); kernel.set(1, (float)_stepX); kernel.set(2, (float)_stepY); @@ -375,7 +382,7 @@ public: // set the variance. { - ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc); + ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc, opts); int offset = total(shape(outputs[0]), 2); size_t nthreads = _layerHeight * _layerWidth * _numPriors; kernel.set(0, (int)nthreads); @@ -395,7 +402,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp index 7784e700ba..44671268a7 100644 --- a/modules/dnn/src/layers/proposal_layer.cpp +++ b/modules/dnn/src/layers/proposal_layer.cpp @@ -158,6 +158,9 @@ public: std::vector outputs; std::vector internals; + if (inputs_.depth() == CV_16S) + return false; + inputs_.getUMatVector(inputs); outputs_.getUMatVector(outputs); internals_.getUMatVector(internals); @@ -237,7 +240,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp index bcf038ce9f..125fa0d14d 100644 --- a/modules/dnn/src/layers/region_layer.cpp +++ b/modules/dnn/src/layers/region_layer.cpp @@ -127,7 +127,7 @@ public: std::vector outputs; // TODO: implement a logistic activation to classification scores. - if (useLogistic) + if (useLogistic || inps.depth() == CV_16S) return false; inps.getUMatVector(inputs); @@ -191,7 +191,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index a34264f931..f6102c4ef5 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -96,9 +96,10 @@ public: std::vector inputs; std::vector outputs; + bool use_half = (inps.depth() == CV_16S); inps.getUMatVector(inputs); outs.getUMatVector(outputs); - String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0].type()) + String(" "); + String buildopt= format("-DDtype=%s ", use_half ? "half" : "float"); for (size_t i = 0; i < inputs.size(); i++) { @@ -134,7 +135,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index f33ed423de..6b2100cdab 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -219,7 +219,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) && + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()), forward_ocl(inputs_arr, outputs_arr, internals_arr)) diff --git a/modules/dnn/src/opencl/batchnorm.cl b/modules/dnn/src/opencl/batchnorm.cl index e0072c9fea..bdd3d0a443 100644 --- a/modules/dnn/src/opencl/batchnorm.cl +++ b/modules/dnn/src/opencl/batchnorm.cl @@ -40,24 +40,27 @@ // //M*/ -#define Dtype float -#define Dtype4 float4 -#define Dtype8 float8 +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif #if NUM == 8 #define load(src, index) vload8(0, src + index) #define store(vec, dst, index) vstore8(vec, 0, dst + index) - #define vec_type Dtype8 + #define float_type float8 + #define convert_f convert_float8 #define BATCH_NORM batch_norm8 #elif NUM == 4 #define load(src, index) vload4(0, src + index) #define store(vec, dst, index) vstore4(vec, 0, dst + index) - #define vec_type Dtype4 + #define float_type float4 + #define convert_f convert_float4 #define BATCH_NORM batch_norm4 #elif NUM == 1 #define load(src, index) src[index] #define store(vec, dst, index) dst[index] = vec - #define vec_type Dtype + #define float_type float + #define convert_f convert_float #define BATCH_NORM batch_norm1 #endif @@ -65,8 +68,8 @@ __kernel void BATCH_NORM(__global const Dtype* src, const int rows, const int cols, const int channels, - __global const Dtype* weight, - __global const Dtype* bias, + __global const float* weight, + __global const float* bias, __global Dtype* dst) { int x = get_global_id(0); @@ -76,9 +79,9 @@ __kernel void BATCH_NORM(__global const Dtype* src, if (x >= rows || y >= cols) return; - Dtype w = weight[x % channels]; - Dtype b = bias[x % channels]; - vec_type src_vec = load(src, index); - vec_type dst_vec = src_vec * w + (vec_type)b; - store(dst_vec, dst, index); + float w = weight[x % channels]; + float b = bias[x % channels]; + float_type src_vec = convert_f(load(src, index)); + float_type dst_vec = src_vec * w + (float_type)b; + store(convert_T(dst_vec), dst, index); } diff --git a/modules/dnn/src/opencl/permute.cl b/modules/dnn/src/opencl/permute.cl index 38aa7990c1..9e709f201c 100644 --- a/modules/dnn/src/opencl/permute.cl +++ b/modules/dnn/src/opencl/permute.cl @@ -40,7 +40,9 @@ // //M*/ -#define Dtype float +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif __kernel void permute(const int nthreads, __global Dtype* bottom_data, diff --git a/modules/dnn/src/opencl/prior_box.cl b/modules/dnn/src/opencl/prior_box.cl index c51cd43830..6ffbf8df29 100644 --- a/modules/dnn/src/opencl/prior_box.cl +++ b/modules/dnn/src/opencl/prior_box.cl @@ -39,17 +39,18 @@ // //M*/ -#define Dtype float -#define Dtype4 float4 +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif __kernel void prior_box(const int nthreads, - const Dtype stepX, - const Dtype stepY, - __global const Dtype* _offsetsX, - __global const Dtype* _offsetsY, + const float stepX, + const float stepY, + __global const float* _offsetsX, + __global const float* _offsetsY, const int offsetsX_size, - __global const Dtype* _widths, - __global const Dtype* _heights, + __global const float* _widths, + __global const float* _heights, const int widths_size, __global Dtype* dst, const int _layerHeight, @@ -65,7 +66,7 @@ __kernel void prior_box(const int nthreads, outputPtr = dst + index * 4 * offsetsX_size * widths_size; - Dtype _boxWidth, _boxHeight; + float _boxWidth, _boxHeight; Dtype4 vec; for (int i = 0; i < widths_size; ++i) { @@ -73,8 +74,8 @@ __kernel void prior_box(const int nthreads, _boxHeight = _heights[i]; for (int j = 0; j < offsetsX_size; ++j) { - float center_x = (w + _offsetsX[j]) * stepX; - float center_y = (h + _offsetsY[j]) * stepY; + Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX; + Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY; vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin @@ -91,7 +92,7 @@ __kernel void prior_box(const int nthreads, __kernel void set_variance(const int nthreads, const int offset, const int variance_size, - __global const Dtype* variance, + __global const float* variance, __global Dtype* dst) { for (int index = get_global_id(0); index < nthreads; index += get_global_size(0)) @@ -101,7 +102,7 @@ __kernel void set_variance(const int nthreads, if (variance_size == 1) var_vec = (Dtype4)(variance[0]); else - var_vec = vload4(0, variance); + var_vec = convert_T(vload4(0, variance)); vstore4(var_vec, 0, dst + offset + index * 4); } diff --git a/modules/dnn/src/opencl/reorg.cl b/modules/dnn/src/opencl/reorg.cl index a4b9caea84..62df3cceca 100644 --- a/modules/dnn/src/opencl/reorg.cl +++ b/modules/dnn/src/opencl/reorg.cl @@ -39,6 +39,10 @@ // //M*/ +#if defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + __kernel void reorg(const int count, __global const Dtype* src, const int channels,