diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index e917864d73..140a4d746c 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc #define op(A, B) (*A)+(B) data[tid] = *partial_reduction; barrier(CLK_LOCAL_MEM_FENCE); - +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif if (tid < 16) + { data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]); +#if WAVE_SIZE < 16 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); +#if WAVE_SIZE < 8 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]); +#if WAVE_SIZE < 4 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 2) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]); +#if WAVE_SIZE < 2 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 1) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); + } +#undef WAVE_SIZE #undef op } @@ -1087,44 +1108,67 @@ void reduce_sum25( int tid ) { +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif // first step is to reduce from 25 to 16 - if (tid < 9) // use 9 threads + if (tid < 9) { sdata1[tid] += sdata1[tid + 16]; sdata2[tid] += sdata2[tid + 16]; sdata3[tid] += sdata3[tid + 16]; sdata4[tid] += sdata4[tid + 16]; +#if WAVE_SIZE < 16 } - - // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) + barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) { +#endif sdata1[tid] += sdata1[tid + 8]; - sdata1[tid] += sdata1[tid + 4]; - sdata1[tid] += sdata1[tid + 2]; - sdata1[tid] += sdata1[tid + 1]; sdata2[tid] += sdata2[tid + 8]; - sdata2[tid] += sdata2[tid + 4]; - sdata2[tid] += sdata2[tid + 2]; - sdata2[tid] += sdata2[tid + 1]; sdata3[tid] += sdata3[tid + 8]; - sdata3[tid] += sdata3[tid + 4]; - sdata3[tid] += sdata3[tid + 2]; - sdata3[tid] += sdata3[tid + 1]; sdata4[tid] += sdata4[tid + 8]; +#if WAVE_SIZE < 8 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif + sdata1[tid] += sdata1[tid + 4]; + sdata2[tid] += sdata2[tid + 4]; + sdata3[tid] += sdata3[tid + 4]; sdata4[tid] += sdata4[tid + 4]; +#if WAVE_SIZE < 4 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif + sdata1[tid] += sdata1[tid + 2]; + sdata2[tid] += sdata2[tid + 2]; + sdata3[tid] += sdata3[tid + 2]; sdata4[tid] += sdata4[tid + 2]; +#if WAVE_SIZE < 2 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif + sdata1[tid] += sdata1[tid + 1]; + sdata2[tid] += sdata2[tid + 1]; + sdata3[tid] += sdata3[tid + 1]; sdata4[tid] += sdata4[tid + 1]; } +#undef WAVE_SIZE } __kernel void compute_descriptors64( IMAGE_INT8 imgTex, - volatile __global float * descriptors, + __global float * descriptors, __global const float * keypoints, int descriptors_step, int keypoints_step, @@ -1158,14 +1202,13 @@ __kernel sdyabs[tid] = fabs(sdy[tid]); // |dy| array } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 25) - { + reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); - } + barrier(CLK_LOCAL_MEM_FENCE); if (tid < 25) { - volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); + __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); // write dx, dy, |dx|, |dy| if (tid == 0) @@ -1180,7 +1223,7 @@ __kernel __kernel void compute_descriptors128( IMAGE_INT8 imgTex, - __global volatile float * descriptors, + __global float * descriptors, __global float * keypoints, int descriptors_step, int keypoints_step, @@ -1229,13 +1272,15 @@ __kernel sd2[tid] = sdx[tid]; sdabs2[tid] = fabs(sdx[tid]); } - //barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - //barrier(CLK_LOCAL_MEM_FENCE); - - volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); + barrier(CLK_LOCAL_MEM_FENCE); + __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); + if (tid < 25) + { // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) if (tid == 0) { @@ -1259,11 +1304,14 @@ __kernel sd2[tid] = sdy[tid]; sdabs2[tid] = fabs(sdy[tid]); } - //barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - //barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 25) + { // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) if (tid == 0) { @@ -1274,6 +1322,103 @@ __kernel } } } +void reduce_sum128(volatile __local float* smem, int tid) +{ +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif + if (tid < 64) + { + smem[tid] += smem[tid + 64]; +#if WAVE_SIZE < 64 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 32) + { +#endif + smem[tid] += smem[tid + 32]; +#if WAVE_SIZE < 32 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { +#endif + smem[tid] += smem[tid + 16]; +#if WAVE_SIZE < 16 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif + smem[tid] += smem[tid + 8]; +#if WAVE_SIZE < 8 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif + smem[tid] += smem[tid + 4]; +#if WAVE_SIZE < 4 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif + smem[tid] += smem[tid + 2]; +#if WAVE_SIZE < 2 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif + smem[tid] += smem[tid + 1]; + } +} +void reduce_sum64(volatile __local float* smem, int tid) +{ +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif + if (tid < 32) + { + smem[tid] += smem[tid + 32]; +#if WAVE_SIZE < 32 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { +#endif + smem[tid] += smem[tid + 16]; +#if WAVE_SIZE < 16 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif + smem[tid] += smem[tid + 8]; +#if WAVE_SIZE < 8 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif + smem[tid] += smem[tid + 4]; +#if WAVE_SIZE < 4 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif + smem[tid] += smem[tid + 2]; +#if WAVE_SIZE < 2 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif + smem[tid] += smem[tid + 1]; + } +} __kernel void normalize_descriptors128(__global float * descriptors, int descriptors_step) @@ -1288,22 +1433,10 @@ __kernel sqDesc[get_local_id(0)] = lookup * lookup; barrier(CLK_LOCAL_MEM_FENCE); - if (get_local_id(0) < 64) - sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64]; + reduce_sum128(sqDesc, get_local_id(0)); barrier(CLK_LOCAL_MEM_FENCE); - // reduction to get total - if (get_local_id(0) < 32) - { - volatile __local float* smem = sqDesc; - smem[get_local_id(0)] += smem[get_local_id(0) + 32]; - smem[get_local_id(0)] += smem[get_local_id(0) + 16]; - smem[get_local_id(0)] += smem[get_local_id(0) + 8]; - smem[get_local_id(0)] += smem[get_local_id(0) + 4]; - smem[get_local_id(0)] += smem[get_local_id(0) + 2]; - smem[get_local_id(0)] += smem[get_local_id(0) + 1]; - } // compute length (square root) volatile __local float len; @@ -1329,18 +1462,9 @@ __kernel sqDesc[get_local_id(0)] = lookup * lookup; barrier(CLK_LOCAL_MEM_FENCE); - // reduction to get total - if (get_local_id(0) < 32) - { - volatile __local float* smem = sqDesc; - smem[get_local_id(0)] += smem[get_local_id(0) + 32]; - smem[get_local_id(0)] += smem[get_local_id(0) + 16]; - smem[get_local_id(0)] += smem[get_local_id(0) + 8]; - smem[get_local_id(0)] += smem[get_local_id(0) + 4]; - smem[get_local_id(0)] += smem[get_local_id(0) + 2]; - smem[get_local_id(0)] += smem[get_local_id(0) + 1]; - } + reduce_sum64(sqDesc, get_local_id(0)); + barrier(CLK_LOCAL_MEM_FENCE); // compute length (square root) volatile __local float len; diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index d8336b9387..78864c6f96 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -43,6 +43,7 @@ // //M*/ #include "precomp.hpp" +#include #ifdef HAVE_OPENCV_OCL @@ -57,25 +58,35 @@ namespace cv ///////////////////////////OpenCL kernel strings/////////////////////////// extern const char *surf; - const char* noImage2dOption = "-D DISABLE_IMAGE2D"; + const char noImage2dOption [] = "-D DISABLE_IMAGE2D"; + static char SURF_OPTIONS [1024] = ""; + static bool USE_IMAGE2d = false; static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], size_t localThreads[3], vector< pair > &args, int channels, int depth) { - if(support_image2d()) + char * pSURF_OPTIONS = SURF_OPTIONS; + static bool OPTION_INIT = false; + if(!OPTION_INIT) { - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth); - } - else - { - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption); + if( !USE_IMAGE2d ) + { + strcat(pSURF_OPTIONS, noImage2dOption); + pSURF_OPTIONS += strlen(noImage2dOption); + } + + size_t wave_size = 0; + queryDeviceInfo(WAVEFRONT_SIZE, &wave_size); + std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast(wave_size)); + OPTION_INIT = true; } + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS); } } } -static inline int divUp(size_t total, size_t grain) +static inline size_t divUp(size_t total, size_t grain) { return (total + grain - 1) / grain; } @@ -152,8 +163,20 @@ public: integral(img, surf_.sum); if(support_image2d()) { - bindImgTex(img, imgTex); - bindImgTex(surf_.sum, sumTex); + try + { + bindImgTex(img, imgTex); + bindImgTex(surf_.sum, sumTex); + USE_IMAGE2d = true; + } + catch (const cv::Exception& e) + { + USE_IMAGE2d = false; + if(e.code != CL_IMAGE_FORMAT_NOT_SUPPORTED && e.code != -217) + { + throw e; + } + } } maskSumTex = 0; diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp index 405d92ccd5..62e69a8a24 100644 --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@ -123,6 +123,16 @@ namespace cv // returns whether the current context supports image2d_t format or not bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext()); + // the enums are used to query device information + // currently only support wavefront size queries + enum DEVICE_INFO + { + WAVEFRONT_SIZE, //in AMD speak + WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak + }; + //info should have been pre-allocated + void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info); + }//namespace ocl }//namespace cv diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index d3fc9c2a2c..9a790f4ee2 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -353,6 +353,51 @@ namespace cv { return &(Context::getContext()->impl->clCmdQueue); } + + void queryDeviceInfo(DEVICE_INFO info_type, void* info) + { + static Info::Impl* impl = Context::getContext()->impl; + switch(info_type) + { + case WAVEFRONT_SIZE: + { +#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD + try + { + openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], + CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(size_t), info, 0)); + } + catch(const cv::Exception&) +#elif defined (CL_DEVICE_WARP_SIZE_NV) + const int EXT_LEN = 4096 + 1 ; + char extends_set[EXT_LEN]; + size_t extends_size; + openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size)); + extends_set[EXT_LEN - 1] = 0; + if(std::string(extends_set).find("cl_nv_device_attribute_query") != std::string::npos) + { + openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], + CL_DEVICE_WARP_SIZE_NV, sizeof(size_t), info, 0)); + } + else +#endif + { + // if no way left for us to query the warp size, we can get it from kernel group info + static const char * _kernel_string = "__kernel void test_func() {}"; + cl_kernel kernel; + kernel = openCLGetKernelFromSource(Context::getContext(), &_kernel_string, "test_func"); + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum], + CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), info, NULL)); + } + + } + break; + default: + CV_Error(-1, "Invalid device info type"); + break; + } + } + void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size) { cl_int status;