pooling ocl kernel optimization
set global size with real output size, also optimize max pooling index computation if necessary. Signed-off-by: Li Peng <peng.li@intel.com>
This commit is contained in:
parent
e4b51fa8ad
commit
145eae321e
@ -1446,7 +1446,7 @@ struct Net::Impl
|
||||
// TODO: OpenCL target support more fusion styles.
|
||||
if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
|
||||
(!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
|
||||
ld.layerInstance->type != "MVN")) )
|
||||
ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
|
||||
continue;
|
||||
|
||||
Ptr<Layer>& currLayer = ld.layerInstance;
|
||||
|
||||
@ -165,6 +165,7 @@ public:
|
||||
(type == AVE ? LIBDNN_POOLING_METHOD_AVE :
|
||||
LIBDNN_POOLING_METHOD_STO);
|
||||
config.avePoolPaddedArea = avePoolPaddedArea;
|
||||
config.computeMaxIdx = computeMaxIdx;
|
||||
config.use_half = use_half;
|
||||
poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
|
||||
}
|
||||
|
||||
@ -352,6 +352,7 @@ struct OCL4DNNPoolConfig
|
||||
pool_method(LIBDNN_POOLING_METHOD_MAX),
|
||||
global_pooling(false),
|
||||
avePoolPaddedArea(true),
|
||||
computeMaxIdx(true),
|
||||
use_half(false)
|
||||
{}
|
||||
MatShape in_shape;
|
||||
@ -365,6 +366,7 @@ struct OCL4DNNPoolConfig
|
||||
ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
|
||||
bool global_pooling; // = false;
|
||||
bool avePoolPaddedArea;
|
||||
bool computeMaxIdx;
|
||||
bool use_half;
|
||||
};
|
||||
|
||||
@ -399,6 +401,7 @@ class OCL4DNNPool
|
||||
int32_t pooled_height_;
|
||||
int32_t pooled_width_;
|
||||
bool avePoolPaddedArea;
|
||||
bool computeMaxIdx;
|
||||
bool use_half;
|
||||
};
|
||||
|
||||
|
||||
@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
|
||||
channels_ = config.channels;
|
||||
pool_method_ = config.pool_method;
|
||||
avePoolPaddedArea = config.avePoolPaddedArea;
|
||||
computeMaxIdx = config.computeMaxIdx;
|
||||
use_half = config.use_half;
|
||||
|
||||
for (int i = 0; i < spatial_dims; ++i)
|
||||
@ -97,7 +98,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
|
||||
UMat& top_mask)
|
||||
{
|
||||
bool ret = true;
|
||||
size_t global[] = { 128 * 128 };
|
||||
size_t global[] = { (size_t)count_ };
|
||||
size_t local[] = { 128 };
|
||||
|
||||
// support 2D case
|
||||
@ -105,8 +106,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
|
||||
{
|
||||
case LIBDNN_POOLING_METHOD_MAX:
|
||||
{
|
||||
bool haveMask = !top_mask.empty();
|
||||
String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
|
||||
String kname = computeMaxIdx ? "max_pool_forward_mask" : "max_pool_forward";
|
||||
kname += (use_half) ? "_half" : "_float";
|
||||
ocl::Kernel oclk_max_pool_forward(
|
||||
kname.c_str(),
|
||||
@ -118,7 +118,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
|
||||
kernel_w_, kernel_h_,
|
||||
stride_w_, stride_h_,
|
||||
pad_w_, pad_h_,
|
||||
haveMask ? " -D HAVE_MASK=1" : ""
|
||||
computeMaxIdx ? " -D HAVE_MASK=1" : ""
|
||||
));
|
||||
|
||||
if (oclk_max_pool_forward.empty())
|
||||
|
||||
@ -65,36 +65,40 @@ __kernel void
|
||||
#endif
|
||||
)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads;
|
||||
index += get_global_size(0))
|
||||
int index = get_global_id(0);
|
||||
if (index >= nthreads)
|
||||
return;
|
||||
|
||||
const int pw = index % pooled_width;
|
||||
const int xx = index / pooled_width;
|
||||
const int ph = xx % pooled_height;
|
||||
const int ch = xx / pooled_height;
|
||||
int hstart = ph * STRIDE_H - PAD_H;
|
||||
int wstart = pw * STRIDE_W - PAD_W;
|
||||
Dtype maxval = -FLT_MAX;
|
||||
int maxidx = -1;
|
||||
int in_offset = ch * height * width;
|
||||
for (int h = 0; h < KERNEL_H; ++h)
|
||||
{
|
||||
const int pw = index % pooled_width;
|
||||
const int ph = (index / pooled_width) % pooled_height;
|
||||
const int c = (index / pooled_width / pooled_height) % channels;
|
||||
const int n = index / pooled_width / pooled_height / channels;
|
||||
int hstart = ph * STRIDE_H - PAD_H;
|
||||
int wstart = pw * STRIDE_W - PAD_W;
|
||||
const int hend = min(hstart + KERNEL_H, height);
|
||||
const int wend = min(wstart + KERNEL_W, width);
|
||||
hstart = max(hstart, (int)0);
|
||||
wstart = max(wstart, (int)0);
|
||||
Dtype maxval = -FLT_MAX;
|
||||
int maxidx = -1;
|
||||
__global const Dtype* bottom_slice = bottom_data
|
||||
+ (n * channels + c) * height * width;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
if (bottom_slice[h * width + w] > maxval) {
|
||||
maxidx = h * width + w;
|
||||
maxval = bottom_slice[maxidx];
|
||||
int off_y = hstart + h;
|
||||
if (off_y >= 0 && off_y < height)
|
||||
{
|
||||
for (int w = 0; w < KERNEL_W; ++w)
|
||||
{
|
||||
int off_x = wstart + w;
|
||||
if (off_x >= 0 && off_x < width)
|
||||
{
|
||||
Dtype val = bottom_data[in_offset + off_y * width + off_x];
|
||||
maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx;
|
||||
maxval = fmax(val, maxval);
|
||||
}
|
||||
}
|
||||
}
|
||||
top_data[index] = maxval;
|
||||
#ifdef HAVE_MASK
|
||||
mask[index] = maxidx;
|
||||
#endif
|
||||
}
|
||||
top_data[index] = maxval;
|
||||
#ifdef HAVE_MASK
|
||||
mask[index] = maxidx;
|
||||
#endif
|
||||
}
|
||||
|
||||
#elif defined KERNEL_AVE_POOL
|
||||
@ -105,43 +109,42 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
|
||||
const int pooled_height, const int pooled_width,
|
||||
__global Dtype* top_data)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads;
|
||||
index += get_global_size(0))
|
||||
{
|
||||
{
|
||||
const int pw = index % pooled_width;
|
||||
const int ph = (index / pooled_width) % pooled_height;
|
||||
const int c = (index / pooled_width / pooled_height) % channels;
|
||||
const int n = index / pooled_width / pooled_height / channels;
|
||||
int hstart = ph * STRIDE_H - PAD_H;
|
||||
int wstart = pw * STRIDE_W - PAD_W;
|
||||
int hend = min(hstart + KERNEL_H, height + PAD_H);
|
||||
int wend = min(wstart + KERNEL_W, width + PAD_W);
|
||||
int pool_size;
|
||||
int index = get_global_id(0);
|
||||
if (index >= nthreads)
|
||||
return;
|
||||
|
||||
const int pw = index % pooled_width;
|
||||
const int xx = index / pooled_width;
|
||||
const int ph = xx % pooled_height;
|
||||
const int ch = xx / pooled_height;
|
||||
int hstart = ph * STRIDE_H - PAD_H;
|
||||
int wstart = pw * STRIDE_W - PAD_W;
|
||||
int hend = min(hstart + KERNEL_H, height + PAD_H);
|
||||
int wend = min(wstart + KERNEL_W, width + PAD_W);
|
||||
int pool_size;
|
||||
#ifdef AVE_POOL_PADDING_AREA
|
||||
pool_size = (hend - hstart) * (wend - wstart);
|
||||
hstart = max(hstart, (int)0);
|
||||
wstart = max(wstart, (int)0);
|
||||
hend = min(hend, height);
|
||||
wend = min(wend, width);
|
||||
pool_size = (hend - hstart) * (wend - wstart);
|
||||
hstart = max(hstart, (int)0);
|
||||
wstart = max(wstart, (int)0);
|
||||
hend = min(hend, height);
|
||||
wend = min(wend, width);
|
||||
#else
|
||||
hstart = max(hstart, (int)0);
|
||||
wstart = max(wstart, (int)0);
|
||||
hend = min(hend, height);
|
||||
wend = min(wend, width);
|
||||
pool_size = (hend - hstart) * (wend - wstart);
|
||||
hstart = max(hstart, (int)0);
|
||||
wstart = max(wstart, (int)0);
|
||||
hend = min(hend, height);
|
||||
wend = min(wend, width);
|
||||
pool_size = (hend - hstart) * (wend - wstart);
|
||||
#endif
|
||||
Dtype aveval = 0;
|
||||
__global const Dtype* bottom_slice = bottom_data
|
||||
+ (n * channels + c) * height * width;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
aveval += bottom_slice[h * width + w];
|
||||
}
|
||||
}
|
||||
top_data[index] = aveval / pool_size;
|
||||
Dtype aveval = 0;
|
||||
int in_offset = ch * height * width;
|
||||
for (int h = hstart; h < hend; ++h)
|
||||
{
|
||||
for (int w = wstart; w < wend; ++w)
|
||||
{
|
||||
aveval += bottom_data[in_offset + h * width + w];
|
||||
}
|
||||
}
|
||||
top_data[index] = aveval / pool_size;
|
||||
}
|
||||
|
||||
#elif defined KERNEL_STO_POOL
|
||||
|
||||
Loading…
Reference in New Issue
Block a user