diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index e917864d73..140a4d746c 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc #define op(A, B) (*A)+(B) data[tid] = *partial_reduction; barrier(CLK_LOCAL_MEM_FENCE); - +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif if (tid < 16) + { data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]); +#if WAVE_SIZE < 16 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); +#if WAVE_SIZE < 8 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 4) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]); +#if WAVE_SIZE < 4 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 2) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]); +#if WAVE_SIZE < 2 + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 1) + { +#endif data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); + } +#undef WAVE_SIZE #undef op } @@ -1087,44 +1108,67 @@ void reduce_sum25( int tid ) { +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif // first step is to reduce from 25 to 16 - if (tid < 9) // use 9 threads + if (tid < 9) { sdata1[tid] += sdata1[tid + 16]; sdata2[tid] += sdata2[tid + 16]; sdata3[tid] += sdata3[tid + 16]; sdata4[tid] += sdata4[tid + 16]; +#if WAVE_SIZE < 16 } - - // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) + barrier(CLK_LOCAL_MEM_FENCE); if (tid < 8) { +#endif sdata1[tid] += sdata1[tid + 8]; - sdata1[tid] += sdata1[tid + 4]; - sdata1[tid] += sdata1[tid + 2]; - sdata1[tid] += sdata1[tid + 1]; sdata2[tid] += sdata2[tid + 8]; - sdata2[tid] += sdata2[tid + 4]; - sdata2[tid] += sdata2[tid + 2]; - sdata2[tid] += sdata2[tid + 1]; sdata3[tid] += sdata3[tid + 8]; - sdata3[tid] += sdata3[tid + 4]; - sdata3[tid] += sdata3[tid + 2]; - sdata3[tid] += sdata3[tid + 1]; sdata4[tid] += sdata4[tid + 8]; +#if WAVE_SIZE < 8 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif + sdata1[tid] += sdata1[tid + 4]; + sdata2[tid] += sdata2[tid + 4]; + sdata3[tid] += sdata3[tid + 4]; sdata4[tid] += sdata4[tid + 4]; +#if WAVE_SIZE < 4 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif + sdata1[tid] += sdata1[tid + 2]; + sdata2[tid] += sdata2[tid + 2]; + sdata3[tid] += sdata3[tid + 2]; sdata4[tid] += sdata4[tid + 2]; +#if WAVE_SIZE < 2 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif + sdata1[tid] += sdata1[tid + 1]; + sdata2[tid] += sdata2[tid + 1]; + sdata3[tid] += sdata3[tid + 1]; sdata4[tid] += sdata4[tid + 1]; } +#undef WAVE_SIZE } __kernel void compute_descriptors64( IMAGE_INT8 imgTex, - volatile __global float * descriptors, + __global float * descriptors, __global const float * keypoints, int descriptors_step, int keypoints_step, @@ -1158,14 +1202,13 @@ __kernel sdyabs[tid] = fabs(sdy[tid]); // |dy| array } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 25) - { + reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); - } + barrier(CLK_LOCAL_MEM_FENCE); if (tid < 25) { - volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); + __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2); // write dx, dy, |dx|, |dy| if (tid == 0) @@ -1180,7 +1223,7 @@ __kernel __kernel void compute_descriptors128( IMAGE_INT8 imgTex, - __global volatile float * descriptors, + __global float * descriptors, __global float * keypoints, int descriptors_step, int keypoints_step, @@ -1229,13 +1272,15 @@ __kernel sd2[tid] = sdx[tid]; sdabs2[tid] = fabs(sdx[tid]); } - //barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - //barrier(CLK_LOCAL_MEM_FENCE); - - volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); + barrier(CLK_LOCAL_MEM_FENCE); + __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3); + if (tid < 25) + { // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) if (tid == 0) { @@ -1259,11 +1304,14 @@ __kernel sd2[tid] = sdy[tid]; sdabs2[tid] = fabs(sdy[tid]); } - //barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - //barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 25) + { // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) if (tid == 0) { @@ -1274,6 +1322,103 @@ __kernel } } } +void reduce_sum128(volatile __local float* smem, int tid) +{ +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif + if (tid < 64) + { + smem[tid] += smem[tid + 64]; +#if WAVE_SIZE < 64 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 32) + { +#endif + smem[tid] += smem[tid + 32]; +#if WAVE_SIZE < 32 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { +#endif + smem[tid] += smem[tid + 16]; +#if WAVE_SIZE < 16 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif + smem[tid] += smem[tid + 8]; +#if WAVE_SIZE < 8 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif + smem[tid] += smem[tid + 4]; +#if WAVE_SIZE < 4 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif + smem[tid] += smem[tid + 2]; +#if WAVE_SIZE < 2 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif + smem[tid] += smem[tid + 1]; + } +} +void reduce_sum64(volatile __local float* smem, int tid) +{ +#ifndef WAVE_SIZE +#define WAVE_SIZE 1 +#endif + if (tid < 32) + { + smem[tid] += smem[tid + 32]; +#if WAVE_SIZE < 32 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 16) + { +#endif + smem[tid] += smem[tid + 16]; +#if WAVE_SIZE < 16 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 8) + { +#endif + smem[tid] += smem[tid + 8]; +#if WAVE_SIZE < 8 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { +#endif + smem[tid] += smem[tid + 4]; +#if WAVE_SIZE < 4 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 2) + { +#endif + smem[tid] += smem[tid + 2]; +#if WAVE_SIZE < 2 + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 1) + { +#endif + smem[tid] += smem[tid + 1]; + } +} __kernel void normalize_descriptors128(__global float * descriptors, int descriptors_step) @@ -1288,22 +1433,10 @@ __kernel sqDesc[get_local_id(0)] = lookup * lookup; barrier(CLK_LOCAL_MEM_FENCE); - if (get_local_id(0) < 64) - sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64]; + reduce_sum128(sqDesc, get_local_id(0)); barrier(CLK_LOCAL_MEM_FENCE); - // reduction to get total - if (get_local_id(0) < 32) - { - volatile __local float* smem = sqDesc; - smem[get_local_id(0)] += smem[get_local_id(0) + 32]; - smem[get_local_id(0)] += smem[get_local_id(0) + 16]; - smem[get_local_id(0)] += smem[get_local_id(0) + 8]; - smem[get_local_id(0)] += smem[get_local_id(0) + 4]; - smem[get_local_id(0)] += smem[get_local_id(0) + 2]; - smem[get_local_id(0)] += smem[get_local_id(0) + 1]; - } // compute length (square root) volatile __local float len; @@ -1329,18 +1462,9 @@ __kernel sqDesc[get_local_id(0)] = lookup * lookup; barrier(CLK_LOCAL_MEM_FENCE); - // reduction to get total - if (get_local_id(0) < 32) - { - volatile __local float* smem = sqDesc; - smem[get_local_id(0)] += smem[get_local_id(0) + 32]; - smem[get_local_id(0)] += smem[get_local_id(0) + 16]; - smem[get_local_id(0)] += smem[get_local_id(0) + 8]; - smem[get_local_id(0)] += smem[get_local_id(0) + 4]; - smem[get_local_id(0)] += smem[get_local_id(0) + 2]; - smem[get_local_id(0)] += smem[get_local_id(0) + 1]; - } + reduce_sum64(sqDesc, get_local_id(0)); + barrier(CLK_LOCAL_MEM_FENCE); // compute length (square root) volatile __local float len; diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index d8336b9387..b72d132d64 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -75,7 +75,7 @@ namespace cv } -static inline int divUp(size_t total, size_t grain) +static inline size_t divUp(size_t total, size_t grain) { return (total + grain - 1) / grain; } diff --git a/modules/nonfree/test/test_surf.ocl.cpp b/modules/nonfree/test/test_surf.ocl.cpp index 76ed37de45..0d09cc8b93 100644 --- a/modules/nonfree/test/test_surf.ocl.cpp +++ b/modules/nonfree/test/test_surf.ocl.cpp @@ -144,9 +144,10 @@ PARAM_TEST_CASE(SURF, HessianThreshold, Octaves, OctaveLayers, Extended, Upright } }; -TEST_P(SURF, DISABLED_Detector) +TEST_P(SURF, Detector) { - cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE); + // the data path should be opencv/samples + cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "c/fruits.jpg", cv::IMREAD_GRAYSCALE); ASSERT_FALSE(image.empty()); cv::ocl::SURF_OCL surf; @@ -179,7 +180,7 @@ TEST_P(SURF, DISABLED_Detector) TEST_P(SURF, DISABLED_Descriptor) { - cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE); + cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "c/fruits.jpg", cv::IMREAD_GRAYSCALE); ASSERT_FALSE(image.empty()); cv::ocl::SURF_OCL surf;