pooling ocl kernel optimization

set global size with real output size, also optimize max pooling index computation if necessary. Signed-off-by: Li Peng <peng.li@intel.com>
2018-06-27 15:15:56 +08:00 · 2018-06-27 15:15:56 +08:00 · 145eae321e
commit 145eae321e
parent e4b51fa8ad
5 changed files with 69 additions and 62 deletions
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@ -1446,7 +1446,7 @@ struct Net::Impl
            // TODO: OpenCL target support more fusion styles.
            if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
                 (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
-                 ld.layerInstance->type != "MVN")) )
+                 ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling")) )
                continue;

            Ptr<Layer>& currLayer = ld.layerInstance;
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@ -165,6 +165,7 @@ public:
                                (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                               LIBDNN_POOLING_METHOD_STO);
            config.avePoolPaddedArea = avePoolPaddedArea;
+            config.computeMaxIdx = computeMaxIdx;
            config.use_half = use_half;
            poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
        }
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@ -352,6 +352,7 @@ struct OCL4DNNPoolConfig
        pool_method(LIBDNN_POOLING_METHOD_MAX),
        global_pooling(false),
        avePoolPaddedArea(true),
+        computeMaxIdx(true),
        use_half(false)
    {}
    MatShape in_shape;
@ -365,6 +366,7 @@ struct OCL4DNNPoolConfig
    ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
    bool global_pooling; // = false;
    bool avePoolPaddedArea;
+    bool computeMaxIdx;
    bool use_half;
 };

@ -399,6 +401,7 @@ class OCL4DNNPool
        int32_t pooled_height_;
        int32_t pooled_width_;
        bool avePoolPaddedArea;
+        bool computeMaxIdx;
        bool use_half;
 };

--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_pool.cpp
@ -56,6 +56,7 @@ OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
    channels_ = config.channels;
    pool_method_ = config.pool_method;
    avePoolPaddedArea = config.avePoolPaddedArea;
+    computeMaxIdx = config.computeMaxIdx;
    use_half = config.use_half;

    for (int i = 0; i < spatial_dims; ++i)
@ -97,7 +98,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                                 UMat& top_mask)
 {
    bool ret = true;
-    size_t global[] = { 128 * 128 };
+    size_t global[] = { (size_t)count_ };
    size_t local[] = { 128 };

    // support 2D case
@ -105,8 +106,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
    {
    case LIBDNN_POOLING_METHOD_MAX:
        {
-            bool haveMask = !top_mask.empty();
-            String kname = haveMask ? "max_pool_forward_mask" : "max_pool_forward";
+            String kname = computeMaxIdx ? "max_pool_forward_mask" : "max_pool_forward";
            kname += (use_half) ? "_half" : "_float";
            ocl::Kernel oclk_max_pool_forward(
                kname.c_str(),
@ -118,7 +118,7 @@ bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_,
                       pad_w_, pad_h_,
-                       haveMask ? " -D HAVE_MASK=1" : ""
+                       computeMaxIdx ? " -D HAVE_MASK=1" : ""
                ));

            if (oclk_max_pool_forward.empty())
--- a/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@ -65,36 +65,40 @@ __kernel void
 #endif
 )
 {
-  for (int index = get_global_id(0); index < nthreads;
-      index += get_global_size(0))
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_H;
+  int wstart = pw * STRIDE_W - PAD_W;
+  Dtype maxval = -FLT_MAX;
+  int maxidx = -1;
+  int in_offset = ch * height * width;
+  for (int h = 0; h < KERNEL_H; ++h)
  {
-    const int pw = index % pooled_width;
-    const int ph = (index / pooled_width) % pooled_height;
-    const int c = (index / pooled_width / pooled_height) % channels;
-    const int n = index / pooled_width / pooled_height / channels;
-    int hstart = ph * STRIDE_H - PAD_H;
-    int wstart = pw * STRIDE_W - PAD_W;
-    const int hend = min(hstart + KERNEL_H, height);
-    const int wend = min(wstart + KERNEL_W, width);
-    hstart = max(hstart, (int)0);
-    wstart = max(wstart, (int)0);
-    Dtype maxval = -FLT_MAX;
-    int maxidx = -1;
-    __global const Dtype* bottom_slice = bottom_data
-        + (n * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (bottom_slice[h * width + w] > maxval) {
-          maxidx = h * width + w;
-          maxval = bottom_slice[maxidx];
+    int off_y = hstart + h;
+    if (off_y >= 0 && off_y < height)
+    {
+      for (int w = 0; w < KERNEL_W; ++w)
+      {
+        int off_x = wstart + w;
+        if (off_x >= 0 && off_x < width)
+        {
+          Dtype val = bottom_data[in_offset + off_y * width + off_x];
+          maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx;
+          maxval = fmax(val, maxval);
        }
      }
    }
-    top_data[index] = maxval;
-#ifdef HAVE_MASK
-    mask[index] = maxidx;
-#endif
  }
+  top_data[index] = maxval;
+#ifdef HAVE_MASK
+  mask[index] = maxidx;
+#endif
 }

 #elif defined KERNEL_AVE_POOL
@ -105,43 +109,42 @@ __kernel void TEMPLATE(ave_pool_forward, Dtype)(
    const int pooled_height, const int pooled_width,
    __global Dtype* top_data)
 {
-  for (int index = get_global_id(0); index < nthreads;
-      index += get_global_size(0))
-  {
-    {
-      const int pw = index % pooled_width;
-      const int ph = (index / pooled_width) % pooled_height;
-      const int c = (index / pooled_width / pooled_height) % channels;
-      const int n = index / pooled_width / pooled_height / channels;
-      int hstart = ph * STRIDE_H - PAD_H;
-      int wstart = pw * STRIDE_W - PAD_W;
-      int hend = min(hstart + KERNEL_H, height + PAD_H);
-      int wend = min(wstart + KERNEL_W, width + PAD_W);
-      int pool_size;
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_H;
+  int wstart = pw * STRIDE_W - PAD_W;
+  int hend = min(hstart + KERNEL_H, height + PAD_H);
+  int wend = min(wstart + KERNEL_W, width + PAD_W);
+  int pool_size;
 #ifdef AVE_POOL_PADDING_AREA
-      pool_size = (hend - hstart) * (wend - wstart);
-      hstart = max(hstart, (int)0);
-      wstart = max(wstart, (int)0);
-      hend = min(hend, height);
-      wend = min(wend, width);
+  pool_size = (hend - hstart) * (wend - wstart);
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
 #else
-      hstart = max(hstart, (int)0);
-      wstart = max(wstart, (int)0);
-      hend = min(hend, height);
-      wend = min(wend, width);
-      pool_size = (hend - hstart) * (wend - wstart);
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
+  pool_size = (hend - hstart) * (wend - wstart);
 #endif
-      Dtype aveval = 0;
-      __global const Dtype* bottom_slice = bottom_data
-          + (n * channels + c) * height * width;
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          aveval += bottom_slice[h * width + w];
-        }
-      }
-      top_data[index] = aveval / pool_size;
+  Dtype aveval = 0;
+  int in_offset = ch * height * width;
+  for (int h = hstart; h < hend; ++h)
+  {
+    for (int w = wstart; w < wend; ++w)
+    {
+      aveval += bottom_data[in_offset + h * width + w];
    }
  }
+  top_data[index] = aveval / pool_size;
 }

 #elif defined KERNEL_STO_POOL