From eeddda4701042b6708645544b46b086e653f5668 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Fri, 20 Jun 2014 15:38:15 +0400 Subject: [PATCH 1/2] Optimization of cv::pyrDown for 8UC1. --- modules/imgproc/src/opencl/pyr_down.cl | 185 ++++++++++++++++++------- modules/imgproc/src/pyramids.cpp | 21 +-- 2 files changed, 150 insertions(+), 56 deletions(-) diff --git a/modules/imgproc/src/opencl/pyr_down.cl b/modules/imgproc/src/opencl/pyr_down.cl index b8b06b712b..4babf54a13 100644 --- a/modules/imgproc/src/opencl/pyr_down.cl +++ b/modules/imgproc/src/opencl/pyr_down.cl @@ -79,12 +79,16 @@ #define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x))) +#if kercn == 4 +#define SRC4(_x,_y) convert_float4(*(__global const uchar4*)(srcData + mad24(_y, src_step, PIXSIZE * _x))) +#endif + #define noconvert __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) { - const int x = get_global_id(0); + const int x = get_global_id(0)*kercn; const int y = get_group_id(1); __local FT smem[LOCAL_SIZE + 4]; @@ -97,98 +101,185 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, FT co3 = 0.0625f; const int src_y = 2*y; + int col; - if (src_y >= 2 && src_y < src_rows - 2 && x >= 2 && x < src_cols - 2) + if (src_y >= 2 && src_y < src_rows - 2) { - sum = co3 * SRC(x, src_y - 2); - sum = sum + co2 * SRC(x, src_y - 1); - sum = sum + co1 * SRC(x, src_y ); - sum = sum + co2 * SRC(x, src_y + 1); - sum = sum + co3 * SRC(x, src_y + 2); +#if kercn == 1 + col = EXTRAPOLATE(x, src_cols); + + sum = co3* SRC(col, src_y - 2); + sum = fma(co2, SRC(col, src_y - 1), sum); + sum = fma(co1, SRC(col, src_y ), sum); + sum = fma(co2, SRC(col, src_y + 1), sum); + sum = fma(co3, SRC(col, src_y + 2), sum); smem[2 + get_local_id(0)] = sum; +#else + if (x < src_cols-4) + { + float4 sum4; + sum4 = co3* SRC4(x, src_y - 2); + sum4 = fma(co2, SRC4(x, src_y - 1), sum4); + sum4 = fma(co1, SRC4(x, src_y ), sum4); + sum4 = fma(co2, SRC4(x, src_y + 1), sum4); + sum4 = fma(co3, SRC4(x, src_y + 2), sum4); + vstore4(sum4, get_local_id(0), (__local float*) &smem[2]); + } + else + { + for (int i=0; i<4; i++) + { + col = EXTRAPOLATE(x+i, src_cols); + sum = co3* SRC(col, src_y - 2); + sum = fma(co2, SRC(col, src_y - 1), sum); + sum = fma(co1, SRC(col, src_y ), sum); + sum = fma(co2, SRC(col, src_y + 1), sum); + sum = fma(co3, SRC(col, src_y + 2), sum); + + smem[2 + 4*get_local_id(0)+i] = sum; + } + } +#endif if (get_local_id(0) < 2) { - const int left_x = x - 2; + col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); - sum = co3 * SRC(left_x, src_y - 2); - sum = sum + co2 * SRC(left_x, src_y - 1); - sum = sum + co1 * SRC(left_x, src_y ); - sum = sum + co2 * SRC(left_x, src_y + 1); - sum = sum + co3 * SRC(left_x, src_y + 2); + sum = co3* SRC(col, src_y - 2); + sum = fma(co2, SRC(col, src_y - 1), sum); + sum = fma(co1, SRC(col, src_y ), sum); + sum = fma(co2, SRC(col, src_y + 1), sum); + sum = fma(co3, SRC(col, src_y + 2), sum); smem[get_local_id(0)] = sum; } - if (get_local_id(0) > LOCAL_SIZE - 3) + if (get_local_id(0) > 1 && get_local_id(0) < 4) { - const int right_x = x + 2; + col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); - sum = co3 * SRC(right_x, src_y - 2); - sum = sum + co2 * SRC(right_x, src_y - 1); - sum = sum + co1 * SRC(right_x, src_y ); - sum = sum + co2 * SRC(right_x, src_y + 1); - sum = sum + co3 * SRC(right_x, src_y + 2); + sum = co3* SRC(col, src_y - 2); + sum = fma(co2, SRC(col, src_y - 1), sum); + sum = fma(co1, SRC(col, src_y ), sum); + sum = fma(co2, SRC(col, src_y + 1), sum); + sum = fma(co3, SRC(col, src_y + 2), sum); - smem[4 + get_local_id(0)] = sum; + smem[LOCAL_SIZE + get_local_id(0)] = sum; } } - else + else // need extrapolate y { - int col = EXTRAPOLATE(x, src_cols); +#if kercn == 1 + col = EXTRAPOLATE(x, src_cols); - sum = co3 * SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); - sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y - 1, src_rows)); - sum = sum + co1 * SRC(col, EXTRAPOLATE(src_y , src_rows)); - sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y + 1, src_rows)); - sum = sum + co3 * SRC(col, EXTRAPOLATE(src_y + 2, src_rows)); + sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); smem[2 + get_local_id(0)] = sum; +#else + if (x < src_cols-4) + { + float4 sum4; + sum4 = co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows)); + sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4); + sum4 = fma(co1, SRC4(x, EXTRAPOLATE(src_y , src_rows)), sum4); + sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4); + sum4 = fma(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4); + vstore4(sum4, get_local_id(0), (__local float*) &smem[2]); + } + else + { + for (int i=0; i<4; i++) + { + col = EXTRAPOLATE(x+i, src_cols); + sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); + + smem[2 + 4*get_local_id(0)+i] = sum; + } + } +#endif if (get_local_id(0) < 2) { - col = EXTRAPOLATE(x - 2, src_cols); + col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); - sum = co3 * SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); - sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y - 1, src_rows)); - sum = sum + co1 * SRC(col, EXTRAPOLATE(src_y , src_rows)); - sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y + 1, src_rows)); - sum = sum + co3 * SRC(col, EXTRAPOLATE(src_y + 2, src_rows)); + sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); smem[get_local_id(0)] = sum; } - if (get_local_id(0) > LOCAL_SIZE - 3) + if (get_local_id(0) > 1 && get_local_id(0) < 4) { - col = EXTRAPOLATE(x + 2, src_cols); + col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); - sum = co3 * SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); - sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y - 1, src_rows)); - sum = sum + co1 * SRC(col, EXTRAPOLATE(src_y , src_rows)); - sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y + 1, src_rows)); - sum = sum + co3 * SRC(col, EXTRAPOLATE(src_y + 2, src_rows)); + sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); - smem[4 + get_local_id(0)] = sum; + smem[LOCAL_SIZE + get_local_id(0)] = sum; } } barrier(CLK_LOCAL_MEM_FENCE); +#if kercn == 1 if (get_local_id(0) < LOCAL_SIZE / 2) { const int tid2 = get_local_id(0) * 2; - sum = co3 * smem[2 + tid2 - 2]; - sum = sum + co2 * smem[2 + tid2 - 1]; - sum = sum + co1 * smem[2 + tid2 ]; - sum = sum + co2 * smem[2 + tid2 + 1]; - sum = sum + co3 * smem[2 + tid2 + 2]; + sum = co3* smem[2 + tid2 + 2]; +#if cn == 1 + sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2)); +#else + sum = fma(co3, smem[2 + tid2 - 2], sum); + sum = fma(co2, smem[2 + tid2 - 1], sum); + sum = fma(co1, smem[2 + tid2 ], sum); + sum = fma(co2, smem[2 + tid2 + 1], sum); +#endif const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; if (dst_x < dst_cols) storepix(convertToT(sum), dstData + y * dst_step + dst_x * PIXSIZE); } +#else + int tid4 = get_local_id(0) * 4; + sum = co3* smem[2 + tid4 + 2]; + sum = fma(co3, smem[2 + tid4 - 2], sum); + sum = fma(co2, smem[2 + tid4 - 1], sum); + sum = fma(co1, smem[2 + tid4 ], sum); + sum = fma(co2, smem[2 + tid4 + 1], sum); + + int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2; + + if (dst_x < dst_cols) + storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE)); + + tid4 += 2; + dst_x += 1; + + sum = co3* smem[2 + tid4 + 2]; + sum = fma(co3, smem[2 + tid4 - 2], sum); + sum = fma(co2, smem[2 + tid4 - 1], sum); + sum = fma(co1, smem[2 + tid4 ], sum); + sum = fma(co2, smem[2 + tid4 + 1], sum); + + if (dst_x < dst_cols) + storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE)); +#endif } diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index d1ed92d5d9..418d20ff45 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -405,10 +405,10 @@ typedef void (*PyrFunc)(const Mat&, Mat&, int); static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType) { - int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type); + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if (channels > 4 || (depth == CV_64F && !doubleSupport)) + if (cn > 4 || (depth == CV_64F && !doubleSupport)) return false; Size ssize = _src.size(); @@ -423,17 +423,20 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in int float_depth = depth == CV_64F ? CV_64F : CV_32F; const int local_size = 256; + int kercn = 1; + if (depth == CV_8U && cn == 1 && float_depth == CV_32F) + kercn = 4; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }; char cvt[2][50]; String buildOptions = format( "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s " - "-D T1=%s -D cn=%d -D %s -D LOCAL_SIZE=%d", - ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)), - ocl::convertTypeStr(float_depth, depth, channels, cvt[0]), - ocl::convertTypeStr(depth, float_depth, channels, cvt[1]), + "-D T1=%s -D cn=%d -D kercn=%d -D %s -D LOCAL_SIZE=%d", + ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)), + ocl::convertTypeStr(float_depth, depth, cn, cvt[0]), + ocl::convertTypeStr(depth, float_depth, cn, cvt[1]), doubleSupport ? " -D DOUBLE_SUPPORT" : "", - ocl::typeToStr(depth), channels, borderMap[borderType], local_size + ocl::typeToStr(depth), cn, kercn, borderMap[borderType], local_size ); ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions); if (k.empty()) @@ -441,8 +444,8 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst)); - size_t localThreads[2] = { local_size, 1 }; - size_t globalThreads[2] = { src.cols, dst.rows }; + size_t localThreads[2] = { local_size/kercn, 1 }; + size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, dst.rows }; return k.run(2, globalThreads, localThreads, false); } From d5c99a07b673e15fb65f846d04800fb61f8a5d88 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Mon, 23 Jun 2014 10:18:11 +0400 Subject: [PATCH 2/2] Use fma only for Intel devices --- modules/imgproc/src/opencl/pyr_down.cl | 119 ++++++++++++++----------- modules/imgproc/src/pyramids.cpp | 8 +- 2 files changed, 69 insertions(+), 58 deletions(-) diff --git a/modules/imgproc/src/opencl/pyr_down.cl b/modules/imgproc/src/opencl/pyr_down.cl index 4babf54a13..2358775e7a 100644 --- a/modules/imgproc/src/opencl/pyr_down.cl +++ b/modules/imgproc/src/opencl/pyr_down.cl @@ -80,7 +80,13 @@ #define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x))) #if kercn == 4 -#define SRC4(_x,_y) convert_float4(*(__global const uchar4*)(srcData + mad24(_y, src_step, PIXSIZE * _x))) +#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x))) +#endif + +#ifdef INTEL_DEVICE +#define MAD(x,y,z) fma((x),(y),(z)) +#else +#define MAD(x,y,z) mad((x),(y),(z)) #endif #define noconvert @@ -109,10 +115,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, col = EXTRAPOLATE(x, src_cols); sum = co3* SRC(col, src_y - 2); - sum = fma(co2, SRC(col, src_y - 1), sum); - sum = fma(co1, SRC(col, src_y ), sum); - sum = fma(co2, SRC(col, src_y + 1), sum); - sum = fma(co3, SRC(col, src_y + 2), sum); + sum = MAD(co2, SRC(col, src_y - 1), sum); + sum = MAD(co1, SRC(col, src_y ), sum); + sum = MAD(co2, SRC(col, src_y + 1), sum); + sum = MAD(co3, SRC(col, src_y + 2), sum); smem[2 + get_local_id(0)] = sum; #else @@ -120,10 +126,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, { float4 sum4; sum4 = co3* SRC4(x, src_y - 2); - sum4 = fma(co2, SRC4(x, src_y - 1), sum4); - sum4 = fma(co1, SRC4(x, src_y ), sum4); - sum4 = fma(co2, SRC4(x, src_y + 1), sum4); - sum4 = fma(co3, SRC4(x, src_y + 2), sum4); + sum4 = MAD(co2, SRC4(x, src_y - 1), sum4); + sum4 = MAD(co1, SRC4(x, src_y ), sum4); + sum4 = MAD(co2, SRC4(x, src_y + 1), sum4); + sum4 = MAD(co3, SRC4(x, src_y + 2), sum4); vstore4(sum4, get_local_id(0), (__local float*) &smem[2]); } @@ -133,10 +139,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, { col = EXTRAPOLATE(x+i, src_cols); sum = co3* SRC(col, src_y - 2); - sum = fma(co2, SRC(col, src_y - 1), sum); - sum = fma(co1, SRC(col, src_y ), sum); - sum = fma(co2, SRC(col, src_y + 1), sum); - sum = fma(co3, SRC(col, src_y + 2), sum); + sum = MAD(co2, SRC(col, src_y - 1), sum); + sum = MAD(co1, SRC(col, src_y ), sum); + sum = MAD(co2, SRC(col, src_y + 1), sum); + sum = MAD(co3, SRC(col, src_y + 2), sum); smem[2 + 4*get_local_id(0)+i] = sum; } @@ -147,10 +153,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); sum = co3* SRC(col, src_y - 2); - sum = fma(co2, SRC(col, src_y - 1), sum); - sum = fma(co1, SRC(col, src_y ), sum); - sum = fma(co2, SRC(col, src_y + 1), sum); - sum = fma(co3, SRC(col, src_y + 2), sum); + sum = MAD(co2, SRC(col, src_y - 1), sum); + sum = MAD(co1, SRC(col, src_y ), sum); + sum = MAD(co2, SRC(col, src_y + 1), sum); + sum = MAD(co3, SRC(col, src_y + 2), sum); smem[get_local_id(0)] = sum; } @@ -160,10 +166,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); sum = co3* SRC(col, src_y - 2); - sum = fma(co2, SRC(col, src_y - 1), sum); - sum = fma(co1, SRC(col, src_y ), sum); - sum = fma(co2, SRC(col, src_y + 1), sum); - sum = fma(co3, SRC(col, src_y + 2), sum); + sum = MAD(co2, SRC(col, src_y - 1), sum); + sum = MAD(co1, SRC(col, src_y ), sum); + sum = MAD(co2, SRC(col, src_y + 1), sum); + sum = MAD(co3, SRC(col, src_y + 2), sum); smem[LOCAL_SIZE + get_local_id(0)] = sum; } @@ -174,10 +180,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, col = EXTRAPOLATE(x, src_cols); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); - sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); - sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); smem[2 + get_local_id(0)] = sum; #else @@ -185,10 +191,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, { float4 sum4; sum4 = co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows)); - sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4); - sum4 = fma(co1, SRC4(x, EXTRAPOLATE(src_y , src_rows)), sum4); - sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4); - sum4 = fma(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4); + sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4); + sum4 = MAD(co1, SRC4(x, EXTRAPOLATE(src_y , src_rows)), sum4); + sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4); + sum4 = MAD(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4); vstore4(sum4, get_local_id(0), (__local float*) &smem[2]); } @@ -198,10 +204,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, { col = EXTRAPOLATE(x+i, src_cols); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); - sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); - sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); smem[2 + 4*get_local_id(0)+i] = sum; } @@ -212,10 +218,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); - sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); - sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); smem[get_local_id(0)] = sum; } @@ -225,10 +231,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows)); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); - sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); - sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); - sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum); + sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum); + sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum); + sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum); smem[LOCAL_SIZE + get_local_id(0)] = sum; } @@ -241,15 +247,20 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, { const int tid2 = get_local_id(0) * 2; - sum = co3* smem[2 + tid2 + 2]; + sum = 0.f; #if cn == 1 +#if fdepth <= 5 sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2)); #else - sum = fma(co3, smem[2 + tid2 - 2], sum); - sum = fma(co2, smem[2 + tid2 - 1], sum); - sum = fma(co1, smem[2 + tid2 ], sum); - sum = fma(co2, smem[2 + tid2 + 1], sum); + sum = sum + dot(vload4(0, (__local double*) (&smem)+tid2), (double4)(co3, co2, co1, co2)); #endif +#else + sum = MAD(co3, smem[2 + tid2 - 2], sum); + sum = MAD(co2, smem[2 + tid2 - 1], sum); + sum = MAD(co1, smem[2 + tid2 ], sum); + sum = MAD(co2, smem[2 + tid2 + 1], sum); +#endif + sum = MAD(co3, smem[2 + tid2 + 2], sum); const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; @@ -260,10 +271,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int tid4 = get_local_id(0) * 4; sum = co3* smem[2 + tid4 + 2]; - sum = fma(co3, smem[2 + tid4 - 2], sum); - sum = fma(co2, smem[2 + tid4 - 1], sum); - sum = fma(co1, smem[2 + tid4 ], sum); - sum = fma(co2, smem[2 + tid4 + 1], sum); + sum = MAD(co3, smem[2 + tid4 - 2], sum); + sum = MAD(co2, smem[2 + tid4 - 1], sum); + sum = MAD(co1, smem[2 + tid4 ], sum); + sum = MAD(co2, smem[2 + tid4 + 1], sum); int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2; @@ -274,10 +285,10 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, dst_x += 1; sum = co3* smem[2 + tid4 + 2]; - sum = fma(co3, smem[2 + tid4 - 2], sum); - sum = fma(co2, smem[2 + tid4 - 1], sum); - sum = fma(co1, smem[2 + tid4 ], sum); - sum = fma(co2, smem[2 + tid4 + 1], sum); + sum = MAD(co3, smem[2 + tid4 - 2], sum); + sum = MAD(co2, smem[2 + tid4 - 1], sum); + sum = MAD(co1, smem[2 + tid4 ], sum); + sum = MAD(co2, smem[2 + tid4 + 1], sum); if (dst_x < dst_cols) storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE)); diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index 418d20ff45..f82ab10725 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -424,19 +424,19 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in int float_depth = depth == CV_64F ? CV_64F : CV_32F; const int local_size = 256; int kercn = 1; - if (depth == CV_8U && cn == 1 && float_depth == CV_32F) + if (depth == CV_8U && float_depth == CV_32F && cn == 1 && ocl::Device::getDefault().isIntel()) kercn = 4; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }; char cvt[2][50]; String buildOptions = format( "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s " - "-D T1=%s -D cn=%d -D kercn=%d -D %s -D LOCAL_SIZE=%d", + "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d", ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)), ocl::convertTypeStr(float_depth, depth, cn, cvt[0]), ocl::convertTypeStr(depth, float_depth, cn, cvt[1]), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", - ocl::typeToStr(depth), cn, kercn, borderMap[borderType], local_size + doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(depth), + cn, kercn, float_depth, borderMap[borderType], local_size ); ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions); if (k.empty())