diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl
index e917864d73..140a4d746c 100644
--- a/modules/nonfree/src/opencl/surf.cl
+++ b/modules/nonfree/src/opencl/surf.cl
@@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local  float * data, volatile float* partial_reduc
 #define op(A, B) (*A)+(B)
     data[tid] = *partial_reduction;
     barrier(CLK_LOCAL_MEM_FENCE);
-
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
     if (tid < 16)
+    {
         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
+#if WAVE_SIZE < 16
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
     if (tid < 8)
+    {
+#endif
         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
+#if WAVE_SIZE < 8
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
     if (tid < 4)
+    {
+#endif
         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
+#if WAVE_SIZE < 4
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
     if (tid < 2)
+    {
+#endif
         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
+#if WAVE_SIZE < 2
+    }
     barrier(CLK_LOCAL_MEM_FENCE);
     if (tid < 1)
+    {
+#endif
         data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
+    }
+#undef WAVE_SIZE
 #undef op
 }
 
@@ -1087,44 +1108,67 @@ void reduce_sum25(
     int tid
     )
 {
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
     // first step is to reduce from 25 to 16
-    if (tid < 9) // use 9 threads
+    if (tid < 9)
     {
         sdata1[tid] += sdata1[tid + 16];
         sdata2[tid] += sdata2[tid + 16];
         sdata3[tid] += sdata3[tid + 16];
         sdata4[tid] += sdata4[tid + 16];
+#if WAVE_SIZE < 16
     }
-
-    // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
+    barrier(CLK_LOCAL_MEM_FENCE);
     if (tid < 8)
     {
+#endif
         sdata1[tid] += sdata1[tid + 8];
-        sdata1[tid] += sdata1[tid + 4];
-        sdata1[tid] += sdata1[tid + 2];
-        sdata1[tid] += sdata1[tid + 1];
 
         sdata2[tid] += sdata2[tid + 8];
-        sdata2[tid] += sdata2[tid + 4];
-        sdata2[tid] += sdata2[tid + 2];
-        sdata2[tid] += sdata2[tid + 1];
 
         sdata3[tid] += sdata3[tid + 8];
-        sdata3[tid] += sdata3[tid + 4];
-        sdata3[tid] += sdata3[tid + 2];
-        sdata3[tid] += sdata3[tid + 1];
 
         sdata4[tid] += sdata4[tid + 8];
+#if WAVE_SIZE < 8
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 4)
+    {
+#endif
+        sdata1[tid] += sdata1[tid + 4];
+        sdata2[tid] += sdata2[tid + 4];
+        sdata3[tid] += sdata3[tid + 4];
         sdata4[tid] += sdata4[tid + 4];
+#if WAVE_SIZE < 4
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 2)
+    {
+#endif
+        sdata1[tid] += sdata1[tid + 2];
+        sdata2[tid] += sdata2[tid + 2];
+        sdata3[tid] += sdata3[tid + 2];
         sdata4[tid] += sdata4[tid + 2];
+#if WAVE_SIZE < 2
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 1)
+    {
+#endif
+        sdata1[tid] += sdata1[tid + 1];
+        sdata2[tid] += sdata2[tid + 1];
+        sdata3[tid] += sdata3[tid + 1];
         sdata4[tid] += sdata4[tid + 1];
     }
+#undef WAVE_SIZE
 }
 
 __kernel
     void compute_descriptors64(
     IMAGE_INT8 imgTex,
-    volatile __global float * descriptors,
+    __global float * descriptors,
     __global const float * keypoints,
     int descriptors_step,
     int keypoints_step,
@@ -1158,14 +1202,13 @@ __kernel
         sdyabs[tid] = fabs(sdy[tid]); // |dy| array
     }
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 25)
-    {
+
         reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
-    }
+ 
     barrier(CLK_LOCAL_MEM_FENCE);
     if (tid < 25)
     {
-        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
+        __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
 
         // write dx, dy, |dx|, |dy|
         if (tid == 0)
@@ -1180,7 +1223,7 @@ __kernel
 __kernel
     void compute_descriptors128(
     IMAGE_INT8 imgTex,
-    __global volatile float * descriptors,
+    __global float * descriptors,
     __global float * keypoints,
     int descriptors_step,
     int keypoints_step,
@@ -1229,13 +1272,15 @@ __kernel
             sd2[tid] = sdx[tid];
             sdabs2[tid] = fabs(sdx[tid]);
         }
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 
         reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        //barrier(CLK_LOCAL_MEM_FENCE);
-
-        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
+    __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+    if (tid < 25)
+    {
         // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
         if (tid == 0)
         {
@@ -1259,11 +1304,14 @@ __kernel
             sd2[tid] = sdy[tid];
             sdabs2[tid] = fabs(sdy[tid]);
         }
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 
         reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
 
+    if (tid < 25)
+    {
         // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
         if (tid == 0)
         {
@@ -1274,6 +1322,103 @@ __kernel
         }
     }
 }
+void reduce_sum128(volatile __local  float* smem, int tid)
+{
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+    if (tid < 64)
+    {
+        smem[tid] += smem[tid + 64];
+#if WAVE_SIZE < 64
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 32) 
+    {
+#endif
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) 
+    {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem[tid] += smem[tid + 8];
+#if WAVE_SIZE < 8
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 4)
+    {
+#endif
+        smem[tid] += smem[tid + 4];
+#if WAVE_SIZE < 4
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 2)
+    {
+#endif
+        smem[tid] += smem[tid + 2];
+#if WAVE_SIZE < 2
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 1)
+    {
+#endif
+        smem[tid] += smem[tid + 1];
+    }
+}
+void reduce_sum64(volatile __local  float* smem, int tid)
+{
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) 
+    {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem[tid] += smem[tid + 8];
+#if WAVE_SIZE < 8
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 4)
+    {
+#endif
+        smem[tid] += smem[tid + 4];
+#if WAVE_SIZE < 4
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 2)
+    {
+#endif
+        smem[tid] += smem[tid + 2];
+#if WAVE_SIZE < 2
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 1)
+    {
+#endif
+        smem[tid] += smem[tid + 1];
+    }
+}
 
 __kernel
     void normalize_descriptors128(__global float * descriptors, int descriptors_step)
@@ -1288,22 +1433,10 @@ __kernel
     sqDesc[get_local_id(0)] = lookup * lookup;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (get_local_id(0) < 64)
-        sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64];
+    reduce_sum128(sqDesc, get_local_id(0));
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    // reduction to get total
-    if (get_local_id(0) < 32)
-    {
-        volatile __local  float* smem = sqDesc;
 
-        smem[get_local_id(0)] += smem[get_local_id(0) + 32];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 16];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 8];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 4];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 2];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 1];
-    }
 
     // compute length (square root)
     volatile __local  float len;
@@ -1329,18 +1462,9 @@ __kernel
     sqDesc[get_local_id(0)] = lookup * lookup;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    // reduction to get total
-    if (get_local_id(0) < 32)
-    {
-        volatile __local  float* smem = sqDesc;
 
-        smem[get_local_id(0)] += smem[get_local_id(0) + 32];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 16];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 8];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 4];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 2];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 1];
-    }
+    reduce_sum64(sqDesc, get_local_id(0));
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     // compute length (square root)
     volatile __local  float len;
diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp
index d8336b9387..78864c6f96 100644
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -43,6 +43,7 @@
 //
 //M*/
 #include "precomp.hpp"
+#include <cstdio>
 
 #ifdef HAVE_OPENCV_OCL
 
@@ -57,25 +58,35 @@ namespace cv
         ///////////////////////////OpenCL kernel strings///////////////////////////
         extern const char *surf;
 
-        const char* noImage2dOption = "-D DISABLE_IMAGE2D";
+        const char noImage2dOption [] = "-D DISABLE_IMAGE2D";
 
+        static char SURF_OPTIONS [1024] = ""; 
+        static bool USE_IMAGE2d = false;
         static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
             size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels, int depth)
         {
-            if(support_image2d())
+            char * pSURF_OPTIONS = SURF_OPTIONS;
+            static bool OPTION_INIT = false;
+            if(!OPTION_INIT)
             {
-                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
-            }
-            else
-            {
-                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
+                if( !USE_IMAGE2d )
+                {
+                    strcat(pSURF_OPTIONS, noImage2dOption);
+                    pSURF_OPTIONS += strlen(noImage2dOption);
+                }
+
+                size_t wave_size = 0;
+                queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
+                std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast<int>(wave_size));
+                OPTION_INIT = true;
             }
+            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);
         }
     }
 }
 
 
-static inline int divUp(size_t total, size_t grain)
+static inline size_t divUp(size_t total, size_t grain)
 {
     return (total + grain - 1) / grain;
 }
@@ -152,8 +163,20 @@ public:
         integral(img, surf_.sum);
         if(support_image2d())
         {
-            bindImgTex(img, imgTex);
-            bindImgTex(surf_.sum, sumTex);
+            try
+            {
+                bindImgTex(img, imgTex);
+                bindImgTex(surf_.sum, sumTex);
+                USE_IMAGE2d = true;
+            }
+            catch (const cv::Exception& e)
+            {
+                USE_IMAGE2d = false;
+                if(e.code != CL_IMAGE_FORMAT_NOT_SUPPORTED && e.code != -217)
+                {
+                    throw e;
+                }
+            }
         }
 
         maskSumTex = 0;
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
index 405d92ccd5..62e69a8a24 100644
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -123,6 +123,16 @@ namespace cv
         // returns whether the current context supports image2d_t format or not
         bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
 
+        // the enums are used to query device information
+        // currently only support wavefront size queries
+        enum DEVICE_INFO
+        {
+             WAVEFRONT_SIZE,            //in AMD speak
+             WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak
+        };
+        //info should have been pre-allocated
+        void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info);
+
     }//namespace ocl
 
 }//namespace cv
diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp
index d3fc9c2a2c..9a790f4ee2 100644
--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -353,6 +353,51 @@ namespace cv
         {
             return &(Context::getContext()->impl->clCmdQueue);
         }
+
+        void queryDeviceInfo(DEVICE_INFO info_type, void* info)
+        {
+            static Info::Impl* impl = Context::getContext()->impl;
+            switch(info_type)
+            {
+            case WAVEFRONT_SIZE:
+                {
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+                    try
+                    {
+                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
+                            CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(size_t), info, 0));
+                    }
+                    catch(const cv::Exception&)
+#elif defined (CL_DEVICE_WARP_SIZE_NV)
+                    const int EXT_LEN = 4096 + 1 ;
+                    char extends_set[EXT_LEN];
+                    size_t extends_size;
+                    openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
+                    extends_set[EXT_LEN - 1] = 0;
+                    if(std::string(extends_set).find("cl_nv_device_attribute_query") != std::string::npos)
+                    {
+                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
+                            CL_DEVICE_WARP_SIZE_NV, sizeof(size_t), info, 0));
+                    }
+                    else
+#endif
+                    {
+                        // if no way left for us to query the warp size, we can get it from kernel group info
+                        static const char * _kernel_string = "__kernel void test_func() {}";
+                        cl_kernel kernel;
+                        kernel = openCLGetKernelFromSource(Context::getContext(), &_kernel_string, "test_func");
+                        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
+                            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), info, NULL));
+                    }
+
+                }
+                break;
+            default:
+                CV_Error(-1, "Invalid device info type");
+                break;
+            }
+        }
+
         void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
         {
             cl_int status;