From f7620dc7d184a8a26a0cd6317566821035c3abc2 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Wed, 12 Feb 2014 12:18:55 +0400 Subject: [PATCH 1/3] added 3-channel support to arithmetic operations --- modules/core/src/arithm.cpp | 34 +++++-- modules/core/src/opencl/arithm.cl | 164 +++++++++++++++++++----------- 2 files changed, 129 insertions(+), 69 deletions(-) diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index dbf05a3f8c..706d346c39 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -934,16 +934,23 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if( oclop < 0 || ((haveMask || haveScalar) && (cn > 4 || cn == 3)) || + if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || (!doubleSupport && srcdepth == CV_64F)) return false; char opts[1024]; int kercn = haveMask || haveScalar ? cn : 1; - sprintf(opts, "-D %s%s -D %s -D dstT=%s%s", + int scalarcn = kercn == 3 ? 4 : kercn; + + sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), + kercn); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); if( k.empty() ) @@ -960,7 +967,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( haveScalar ) { - size_t esz = CV_ELEM_SIZE(srctype); + size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; double buf[4] = {0,0,0,0}; if( oclop != OCL_OP_NOT ) @@ -1294,7 +1301,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); bool haveMask = !_mask.empty(); - if( ((haveMask || haveScalar) && (cn > 4 || cn == 3)) ) + if( ((haveMask || haveScalar) && cn > 4) ) return false; int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); @@ -1307,21 +1314,26 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, return false; int kercn = haveMask || haveScalar ? cn : 1; + int scalarcn = kercn == 3 ? 4 : kercn; char cvtstr[4][32], opts[1024]; - sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s " - "-D dstT=%s -D workT=%s -D scaleT=%s -D convertToWT1=%s " - "-D convertToWT2=%s -D convertToDT=%s%s", + sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " + "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D convertToWT1=%s " + "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), + ocl::typeToStr(CV_MAKETYPE(depth1, 1)), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), + ocl::typeToStr(CV_MAKETYPE(depth2, 1)), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), + ocl::typeToStr(CV_MAKETYPE(ddepth, 1)), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), + ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), ocl::typeToStr(CV_MAKETYPE(wdepth, 1)), ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn); size_t usrdata_esz = CV_ELEM_SIZE(wdepth); const uchar* usrdata_p = (const uchar*)usrdata; @@ -1352,7 +1364,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( haveScalar ) { - size_t esz = CV_ELEM_SIZE(wtype); + size_t esz = CV_ELEM_SIZE(wtype)*scalarcn; double buf[4]={0,0,0,0}; Mat src2sc = _src2.getMat(); @@ -2621,7 +2633,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" }; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D BINARY_OP -D srcT1=%s -D workT=srcT1" + format("-D BINARY_OP -D srcT1=%s -D workT=srcT1 -D cn=1" " -D OP_CMP -D CMP_OPERATOR=%s%s", ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), operationMap[op], diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index ed08384164..a7dacc4283 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -70,21 +70,47 @@ #define CV_PI M_PI_F #endif -#define dstelem *(__global dstT*)(dstptr + dst_index) -#define dstelem2 *(__global dstT*)(dstptr2 + dst_index2) +#ifndef cn +#define cn 1 +#endif + +#if cn == 1 +#undef srcT1_C1 +#undef srcT2_C1 +#undef dstT_C1 +#define srcT1_C1 srcT1 +#define srcT2_C1 srcT2 +#define dstT_C1 dstT +#endif + +#if cn != 3 + #define storedst(val) *(__global dstT*)(dstptr + dst_index) = val + #define storedst2(val) *(__global dstT*)(dstptr2 + dst_index2) = val +#else + #define storedst(val) vstore3(val, 0, (__global dstT_C1*)(dstptr + dst_index)) + #define storedst2(val) vstore3(val, 0, (__global dstT_C1*)(dstptr2 + dst_index2)) +#endif + #define noconvert #ifndef workT #ifndef srcT1 #define srcT1 dstT + #define srcT1_C1 dstT_C1 #endif #ifndef srcT2 #define srcT2 dstT + #define srcT2_C1 dstT_C1 #endif #define workT dstT - #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index) - #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index) + #if cn != 3 + #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index) + #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index) + #else + #define srcelem1 vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index)) + #define srcelem2 vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index)) + #endif #ifndef convertToDT #define convertToDT noconvert #endif @@ -94,153 +120,168 @@ #ifndef convertToWT2 #define convertToWT2 convertToWT1 #endif - #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index)) - #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index)) + #if cn != 3 + #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index)) + #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index)) + #else + #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index))) + #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index))) + #endif #endif +#ifndef workST +#define workST workT +#endif + #define EXTRA_PARAMS #define EXTRA_INDEX #if defined OP_ADD -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2)) #elif defined OP_SUB -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2)) #elif defined OP_RSUB -#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1) +#define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1)) #elif defined OP_ABSDIFF #define PROCESS_ELEM \ workT v = srcelem1 - srcelem2; \ - dstelem = convertToDT(v >= (workT)(0) ? v : -v); + storedst(convertToDT(v >= (workT)(0) ? v : -v)) #elif defined OP_AND -#define PROCESS_ELEM dstelem = srcelem1 & srcelem2 +#define PROCESS_ELEM storedst(srcelem1 & srcelem2) #elif defined OP_OR -#define PROCESS_ELEM dstelem = srcelem1 | srcelem2 +#define PROCESS_ELEM storedst(srcelem1 | srcelem2) #elif defined OP_XOR -#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2 +#define PROCESS_ELEM storedst(srcelem1 ^ srcelem2) #elif defined OP_NOT -#define PROCESS_ELEM dstelem = ~srcelem1 +#define PROCESS_ELEM storedst(~srcelem1) #elif defined OP_MIN -#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(min(srcelem1, srcelem2)) #elif defined OP_MAX -#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(max(srcelem1, srcelem2)) #elif defined OP_MUL -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2)) #elif defined OP_MUL_SCALE #undef EXTRA_PARAMS #ifdef UNARY_OP -#define EXTRA_PARAMS , workT srcelem2, scaleT scale +#define EXTRA_PARAMS , workST srcelem2_, scaleT scale +#undef srcelem2 +#define srcelem2 srcelem2_ #else #define EXTRA_PARAMS , scaleT scale #endif -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * scale * srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2)) #elif defined OP_DIV #define PROCESS_ELEM \ workT e2 = srcelem2, zero = (workT)(0); \ - dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero) + storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero)) #elif defined OP_DIV_SCALE #undef EXTRA_PARAMS #ifdef UNARY_OP -#define EXTRA_PARAMS , workT srcelem2, scaleT scale +#define EXTRA_PARAMS , workST srcelem2_, scaleT scale +#undef srcelem2 +#define srcelem2 srcelem2_ #else #define EXTRA_PARAMS , scaleT scale #endif #define PROCESS_ELEM \ workT e2 = srcelem2, zero = (workT)(0); \ - dstelem = convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)) + storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2))) #elif defined OP_RDIV_SCALE #undef EXTRA_PARAMS #ifdef UNARY_OP -#define EXTRA_PARAMS , workT srcelem2, scaleT scale +#define EXTRA_PARAMS , workST srcelem2_, scaleT scale +#undef srcelem2 +#define srcelem2 srcelem2_ #else #define EXTRA_PARAMS , scaleT scale #endif #define PROCESS_ELEM \ workT e1 = srcelem1, zero = (workT)(0); \ - dstelem = convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)) + storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1))) #elif defined OP_RECIP_SCALE #undef EXTRA_PARAMS #define EXTRA_PARAMS , scaleT scale #define PROCESS_ELEM \ workT e1 = srcelem1, zero = (workT)(0); \ - dstelem = convertToDT(e1 != zero ? scale / e1 : zero) + storedst(convertToDT(e1 != zero ? scale / e1 : zero)) #elif defined OP_ADDW #undef EXTRA_PARAMS #define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma -#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma) +#define PROCESS_ELEM storedst(convertToDT(srcelem1*alpha + srcelem2*beta + gamma)) #elif defined OP_MAG -#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2)) #elif defined OP_ABS_NOSAT #define PROCESS_ELEM \ dstT v = convertToDT(srcelem1); \ - dstelem = v >= 0 ? v : -v + storedst(v >= 0 ? v : -v) #elif defined OP_PHASE_RADIANS #define PROCESS_ELEM \ workT tmp = atan2(srcelem2, srcelem1); \ if(tmp < 0) tmp += 6.283185307179586232f; \ - dstelem = tmp + storedst(tmp) #elif defined OP_PHASE_DEGREES #define PROCESS_ELEM \ workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465f; \ if(tmp < 0) tmp += 360; \ - dstelem = tmp + storedst(tmp) #elif defined OP_EXP -#define PROCESS_ELEM dstelem = exp(srcelem1) +#define PROCESS_ELEM storedst(exp(srcelem1)) #elif defined OP_POW -#define PROCESS_ELEM dstelem = pow(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2)) #elif defined OP_POWN #undef workT #define workT int -#define PROCESS_ELEM dstelem = pown(srcelem1, srcelem2) +#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2)) #elif defined OP_SQRT -#define PROCESS_ELEM dstelem = sqrt(srcelem1) +#define PROCESS_ELEM storedst(sqrt(srcelem1)) #elif defined OP_LOG #define PROCESS_ELEM \ -dstT v = (dstT)(srcelem1);\ -dstelem = v > (dstT)(0) ? log(v) : log(-v) + dstT v = (dstT)(srcelem1);\ + storedst(v > (dstT)(0) ? log(v) : log(-v)) #elif defined OP_CMP #define dstT uchar #define srcT2 srcT1 #define convertToWT1 -#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0) +#define PROCESS_ELEM storedst(convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)) #elif defined OP_CONVERT_SCALE_ABS #undef EXTRA_PARAMS #define EXTRA_PARAMS , workT alpha, workT beta #define PROCESS_ELEM \ workT value = srcelem1 * alpha + beta; \ - dstelem = convertToDT(value >= 0 ? value : -value) + storedst(convertToDT(value >= 0 ? value : -value)) #elif defined OP_SCALE_ADD #undef EXTRA_PARAMS #define EXTRA_PARAMS , workT alpha -#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * alpha + srcelem2) +#define PROCESS_ELEM storedst(convertToDT(srcelem1 * alpha + srcelem2)) #elif defined OP_CTP_AD || defined OP_CTP_AR #ifdef OP_CTP_AD @@ -257,8 +298,8 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v) dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \ dstT cartToPolar = y2 <= x2 ? x * y / (x2 + 0.28f * y2 + CV_EPSILON) + tmp : (tmp1 - x * y / (y2 + 0.28f * x2 + CV_EPSILON)); \ TO_DEGREE \ - dstelem = magnitude; \ - dstelem2 = cartToPolar + storedst(magnitude); \ + storedst2(cartToPolar) #elif defined OP_PTC_AD || defined OP_PTC_AR #ifdef OP_PTC_AD @@ -272,15 +313,15 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v) #define PROCESS_ELEM \ dstT x = srcelem1, y = srcelem2; \ FROM_DEGREE; \ - dstelem = cos(alpha) * x; \ - dstelem2 = sin(alpha) * x + storedst(cos(alpha) * x); \ + storedst2(sin(alpha) * x) #elif defined OP_PATCH_NANS #undef EXTRA_PARAMS #define EXTRA_PARAMS , int val #define PROCESS_ELEM \ if (( srcelem1 & 0x7fffffff) > 0x7f800000 ) \ - dstelem = val + storedst(val) #else #error "unknown op type" @@ -290,18 +331,26 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v) #undef EXTRA_PARAMS #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2 #undef EXTRA_INDEX - #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT) + dstoffset2) + #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT_C1)*cn + dstoffset2) #endif #if defined UNARY_OP || defined MASK_UNARY_OP -#undef srcelem2 + #if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \ defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \ defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \ defined OP_MUL || defined OP_DIV || defined OP_POWN #undef EXTRA_PARAMS - #define EXTRA_PARAMS , workT srcelem2 + #define EXTRA_PARAMS , workST srcelem2_ + #undef srcelem2 + #define srcelem2 srcelem2_ #endif + +#if cn == 3 +#undef srcelem2 +#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z) +#endif + #endif #if defined BINARY_OP @@ -316,11 +365,11 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, if (x < cols && y < rows) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT)) - int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2); + int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2); #endif - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); EXTRA_INDEX; PROCESS_ELEM; @@ -343,9 +392,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); - int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); + int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); PROCESS_ELEM; } @@ -363,9 +412,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, if (x < cols && y < rows) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); - EXTRA_INDEX; + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); PROCESS_ELEM; } @@ -386,8 +434,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); + int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1); + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset); PROCESS_ELEM; } From 290fbc01211b9785fcf6d531e2fe39fe7c3fa1a2 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Wed, 12 Feb 2014 19:29:18 +0400 Subject: [PATCH 2/3] 3-channel support in OpenCL kernels for setTo, resize, warpAffine and warpPerspective --- modules/core/src/opencl/copyset.cl | 24 +++++-- modules/core/src/umatrix.cpp | 11 ++-- modules/imgproc/src/imgwarp.cpp | 63 ++++++++++++------- modules/imgproc/src/opencl/resize.cl | 47 +++++++------- modules/imgproc/src/opencl/warp_affine.cl | 60 +++++++++++------- .../imgproc/src/opencl/warp_perspective.cl | 60 +++++++++++------- 6 files changed, 166 insertions(+), 99 deletions(-) diff --git a/modules/core/src/opencl/copyset.cl b/modules/core/src/opencl/copyset.cl index 05cde8ee01..cbafe67058 100644 --- a/modules/core/src/opencl/copyset.cl +++ b/modules/core/src/opencl/copyset.cl @@ -87,9 +87,21 @@ __kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_of #else +#ifndef dstST +#define dstST dstT +#endif + +#if cn != 3 +#define value value_ +#define storedst(val) *(__global dstT*)(dstptr + dst_index) = val +#else +#define value (dstT)(value_.x, value_.y, value_.z) +#define storedst(val) vstore3(val, 0, (__global dstT1*)(dstptr + dst_index)) +#endif + __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset, __global uchar* dstptr, int dststep, int dstoffset, - int rows, int cols, dstT value ) + int rows, int cols, dstST value_ ) { int x = get_global_id(0); int y = get_global_id(1); @@ -99,22 +111,22 @@ __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset, int mask_index = mad24(y, maskstep, x + maskoffset); if( mask[mask_index] ) { - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); - *(__global dstT*)(dstptr + dst_index) = value; + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset); + storedst(value); } } } __kernel void set(__global uchar* dstptr, int dststep, int dstoffset, - int rows, int cols, dstT value ) + int rows, int cols, dstST value_ ) { int x = get_global_id(0); int y = get_global_id(1); if (x < cols && y < rows) { - int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); - *(__global dstT*)(dstptr + dst_index) = value; + int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset); + storedst(value); } } diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 1dd7b4df40..c5559213e1 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -744,20 +744,23 @@ UMat& UMat::setTo(InputArray _value, InputArray _mask) { bool haveMask = !_mask.empty(); int tp = type(), cn = CV_MAT_CN(tp); - if( dims <= 2 && cn <= 4 && cn != 3 && ocl::useOpenCL() ) + if( dims <= 2 && cn <= 4 && CV_MAT_DEPTH(tp) < CV_64F && ocl::useOpenCL() ) { Mat value = _value.getMat(); CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) ); - double buf[4]; + double buf[4]={0,0,0,0}; convertAndUnrollScalar(value, tp, (uchar*)buf, 1); + int scalarcn = cn == 3 ? 4 : cn; char opts[1024]; - sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(tp)); + sprintf(opts, "-D dstT=%s -D dstST=%s -D dstT1=%s -D cn=%d", ocl::memopTypeToStr(tp), + ocl::memopTypeToStr(CV_MAKETYPE(tp,scalarcn)), + ocl::memopTypeToStr(CV_MAT_DEPTH(tp)), cn); ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts); if( !setK.empty() ) { - ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(tp)); + ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn); UMat mask; if( haveMask ) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index f30c9b68b2..fb346f342f 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1957,7 +1957,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, double inv_fx = 1. / fx, inv_fy = 1. / fy; float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy; - if( cn == 3 || !(cn <= 4 && + if( !(cn <= 4 && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) ) return false; @@ -1975,15 +1975,18 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, int wtype = CV_MAKETYPE(wdepth, cn); char buf[2][32]; k.create("resizeLN", ocl::imgproc::resize_oclsrc, - format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s", - depth, ocl::typeToStr(type), ocl::typeToStr(wtype), + format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s " + "-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d", + depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), ocl::convertTypeStr(depth, wdepth, cn, buf[0]), - ocl::convertTypeStr(wdepth, depth, cn, buf[1]))); + ocl::convertTypeStr(wdepth, depth, cn, buf[1]), + cn)); } else if (interpolation == INTER_NEAREST) { k.create("resizeNN", ocl::imgproc::resize_oclsrc, - format("-D INTER_NEAREST -D PIXTYPE=%s -D cn", ocl::memopTypeToStr(type), cn)); + format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d", + ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn)); } else if (interpolation == INTER_AREA) { @@ -1995,9 +1998,9 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, int wtype = CV_MAKE_TYPE(wdepth, cn); char cvt[2][40]; - String buildOption = format("-D INTER_AREA -D T=%s -D WTV=%s -D convertToWTV=%s", - ocl::typeToStr(type), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, cvt[0])); + String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d", + ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), + ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn); UMat alphaOcl, tabofsOcl, mapOcl; UMat dmap, smap; @@ -2005,7 +2008,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, if (is_area_fast) { int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); - buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" + buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), @@ -2028,7 +2031,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, } else { - buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); + buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption); if (k.empty()) return false; @@ -3887,7 +3890,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, { CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE); - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = depth; + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); double doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; int interpolation = flags & INTER_MAX; @@ -3896,7 +3899,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, if ( !(borderType == cv::BORDER_CONSTANT && (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) || - (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3) + (!doubleSupport && depth == CV_64F) || cn > 4) return false; const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" }; @@ -3904,28 +3907,40 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc; const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective"; + int scalarcn = cn == 3 ? 4 : cn; + int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth); + int sctype = CV_MAKETYPE(wdepth, scalarcn); + ocl::Kernel k; + String opts; if (interpolation == INTER_NEAREST) { - k.create(kernelName, program, - format("-D INTER_NEAREST -D T=%s%s", ocl::typeToStr(type), - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d", ocl::typeToStr(type), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", + ocl::typeToStr(CV_MAT_DEPTH(type)), + ocl::typeToStr(sctype), + cn); } else { char cvt[2][50]; - wdepth = std::max(CV_32S, depth); - k.create(kernelName, program, - format("-D INTER_%s -D T=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s", - interpolationMap[interpolation], ocl::typeToStr(type), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth, - ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, depth, cn, cvt[1]), - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s cn=%d", + interpolationMap[interpolation], ocl::typeToStr(type), + ocl::typeToStr(CV_MAT_DEPTH(type)), + ocl::typeToStr(sctype), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth, + ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, depth, cn, cvt[1]), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn); } + + k.create(kernelName, program, opts); if (k.empty()) return false; + double borderBuf[] = {0, 0, 0, 0}; + scalarToRawData(borderValue, borderBuf, sctype); + UMat src = _src.getUMat(), M0; _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() ); UMat dst = _dst.getUMat(); @@ -3956,7 +3971,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F); k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0), - ocl::KernelArg::Constant(Mat(1, 1, CV_MAKE_TYPE(wdepth, cn), borderValue))); + ocl::KernelArg(0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype))); size_t globalThreads[2] = { dst.cols, dst.rows }; return k.run(2, globalThreads, NULL, false); diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl index d4f2383586..a142d781cf 100644 --- a/modules/imgproc/src/opencl/resize.cl +++ b/modules/imgproc/src/opencl/resize.cl @@ -52,9 +52,19 @@ #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1) #define INC(x,l) min(x+1,l-1) -#define PIXSIZE ((int)sizeof(PIXTYPE)) + #define noconvert(x) (x) +#if cn != 3 +#define loadpix(addr) *(__global const PIXTYPE*)(addr) +#define storepix(val, addr) *(__global PIXTYPE*)(addr) = val +#define PIXSIZE ((int)sizeof(PIXTYPE)) +#else +#define loadpix(addr) vload3(0, (__global const PIXTYPE1*)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global PIXTYPE1*)(addr)) +#define PIXSIZE ((int)sizeof(PIXTYPE1)*3) +#endif + #if defined INTER_LINEAR __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset, @@ -89,10 +99,10 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset, int U1 = rint(INTER_RESIZE_COEF_SCALE - u); int V1 = rint(INTER_RESIZE_COEF_SCALE - v); - WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); - WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); - WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE))); - WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); + WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); + WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); + WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE))); + WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) + mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3); @@ -102,10 +112,10 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset, #else float u1 = 1.f - u; float v1 = 1.f - v; - WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); - WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); - WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE))); - WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); + WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); + WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); + WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE))); + WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3; @@ -113,8 +123,7 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset, if(dx < dstcols && dy < dstrows) { - __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE)); - dst[0] = uval; + storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE)); } } @@ -136,17 +145,13 @@ __kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset, int sx = min(convert_int_rtz(s1), srccols-1); int sy = min(convert_int_rtz(s2), srcrows-1); - __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE)); - __global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)); - - dst[0] = src[0]; + storepix(loadpix(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)), + dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE)); } } #elif defined INTER_AREA -#define TSIZE ((int)(sizeof(T))) - #ifdef INTER_AREA_FAST __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, @@ -174,10 +179,10 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_ int src_index = mad24(symap_tab[y + sy], src_step, src_offset); #pragma unroll for (int x = 0; x < XSCALE; ++x) - sum += convertToWTV(((__global const T*)(src + src_index))[sxmap_tab[sx + x]]); + sum += convertToWTV(loadpix(src + src_index + sxmap_tab[sx + x]*PIXSIZE)); } - ((__global T*)(dst + dst_index))[dx] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE)); + storepix(convertToPIXTYPE(convertToWT2V(sum) * (WT2V)(SCALE)), dst + dst_index + dx*PIXSIZE); } } @@ -219,12 +224,12 @@ __kernel void resizeAREA(__global const uchar * src, int src_step, int src_offse for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk) { WTV alpha = (WTV)(xalpha_tab[xk]); - buf += convertToWTV(((__global const T*)(src + src_index))[sx]) * alpha; + buf += convertToWTV(loadpix(src + src_index + sx*PIXSIZE)) * alpha; } sum += buf * beta; } - ((__global T*)(dst + dst_index))[dx] = convertToT(sum); + storepix(convertToPIXTYPE(sum), dst + dst_index + dx*PIXSIZE); } } diff --git a/modules/imgproc/src/opencl/warp_affine.cl b/modules/imgproc/src/opencl/warp_affine.cl index 340cfdd8e0..028e8736e2 100644 --- a/modules/imgproc/src/opencl/warp_affine.cl +++ b/modules/imgproc/src/opencl/warp_affine.cl @@ -64,11 +64,31 @@ #define noconvert +#ifndef ST +#define ST T +#endif + +#if cn != 3 +#define loadpix(addr) *(__global const T*)(addr) +#define storepix(val, addr) *(__global T*)(addr) = val +#define scalar scalar_ +#define pixsize (int)sizeof(T) +#else +#define loadpix(addr) vload3(0, (__global const T1*)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr)) +#ifdef INTER_NEAREST +#define scalar (T)(scalar_.x, scalar_.y, scalar_.z) +#else +#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z) +#endif +#define pixsize ((int)sizeof(T1)*3) +#endif + #ifdef INTER_NEAREST __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant CT * M, T scalar) + __constant CT * M, ST scalar_) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -85,17 +105,15 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of short sx = convert_short_sat(X0 >> AB_BITS); short sy = convert_short_sat(Y0 >> AB_BITS); - int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T)); - __global T * dst = (__global T *)(dstptr + dst_index); + int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) { - int src_index = mad24(sy, src_step, src_offset + sx * (int)sizeof(T)); - __global const T * src = (__global const T *)(srcptr + src_index); - dst[0] = src[0]; + int src_index = mad24(sy, src_step, src_offset + sx * pixsize); + storepix(loadpix(srcptr + src_index), dstptr + dst_index); } else - dst[0] = scalar; + storepix(scalar, dstptr + dst_index); } } @@ -103,7 +121,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant CT * M, WT scalar) + __constant CT * M, ST scalar_) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -126,19 +144,18 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of short ay = convert_short(Y0 & (INTER_TAB_SIZE-1)); WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar; WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar; WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar; WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar; float taby = 1.f/INTER_TAB_SIZE*ay; float tabx = 1.f/INTER_TAB_SIZE*ax; - int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T)); - __global T * dst = (__global T *)(dstptr + dst_index); + int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); #if depth <= 4 int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); @@ -147,11 +164,11 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE ); WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; - dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS); + storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index); #else float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby; WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby; - dst[0] = convertToT(val); + storepix(convertToT(val), dstptr + dst_index); #endif } } @@ -170,7 +187,7 @@ inline void interpolateCubic( float x, float* coeffs ) __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant CT * M, WT scalar) + __constant CT * M, ST scalar_) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -198,7 +215,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of #pragma unroll for (int x = 0; x < 4; x++) v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar; float tab1y[4], tab1x[4]; @@ -207,8 +224,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of interpolateCubic(ayy, tab1y); interpolateCubic(axx, tab1x); - int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T)); - __global T * dst = (__global T *)(dstptr + dst_index); + int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); WT sum = (WT)(0); #if depth <= 4 @@ -221,12 +237,12 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of #pragma unroll for (int i = 0; i < 16; i++) sum += v[i] * itab[i]; - dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ); + storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index); #else #pragma unroll for (int i = 0; i < 16; i++) sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)]; - dst[0] = convertToT( sum ); + storepix(convertToT( sum ), dstptr + dst_index); #endif } } diff --git a/modules/imgproc/src/opencl/warp_perspective.cl b/modules/imgproc/src/opencl/warp_perspective.cl index 211433e709..211f45b5b9 100644 --- a/modules/imgproc/src/opencl/warp_perspective.cl +++ b/modules/imgproc/src/opencl/warp_perspective.cl @@ -64,11 +64,31 @@ #define noconvert +#ifndef ST +#define ST T +#endif + +#if cn != 3 +#define loadpix(addr) *(__global const T*)(addr) +#define storepix(val, addr) *(__global T*)(addr) = val +#define scalar scalar_ +#define pixsize (int)sizeof(T) +#else +#define loadpix(addr) vload3(0, (__global const T1*)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr)) +#ifdef INTER_NEAREST +#define scalar (T)(scalar_.x, scalar_.y, scalar_.z) +#else +#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z) +#endif +#define pixsize ((int)sizeof(T1)*3) +#endif + #ifdef INTER_NEAREST __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant CT * M, T scalar) + __constant CT * M, ST scalar_) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -82,17 +102,15 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s short sx = convert_short_sat_rte(X0*W); short sy = convert_short_sat_rte(Y0*W); - int dst_index = mad24(dy, dst_step, dx * (int)sizeof(T) + dst_offset); - __global T * dst = (__global T *)(dstptr + dst_index); + int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset); if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) { - int src_index = mad24(sy, src_step, sx * (int)sizeof(T) + src_offset); - __global const T * src = (__global const T *)(srcptr + src_index); - dst[0] = src[0]; + int src_index = mad24(sy, src_step, sx * pixsize + src_offset); + storepix(loadpix(srcptr + src_index), dstptr + dst_index); } else - dst[0] = scalar; + storepix(scalar, dstptr + dst_index); } } @@ -100,7 +118,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant CT * M, WT scalar) + __constant CT * M, ST scalar_) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -119,19 +137,18 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s short ax = (short)(X & (INTER_TAB_SIZE - 1)); WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar; WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar; WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar; WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar; float taby = 1.f/INTER_TAB_SIZE*ay; float tabx = 1.f/INTER_TAB_SIZE*ax; - int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T)); - __global T * dst = (__global T *)(dstptr + dst_index); + int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); #if depth <= 4 int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ); @@ -140,11 +157,11 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE ); WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; - dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS); + storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index); #else float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby; WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby; - dst[0] = convertToT(val); + storepix(convertToT(val), dstptr + dst_index); #endif } } @@ -163,7 +180,7 @@ inline void interpolateCubic( float x, float* coeffs ) __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant CT * M, WT scalar) + __constant CT * M, ST scalar_) { int dx = get_global_id(0); int dy = get_global_id(1); @@ -187,7 +204,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s #pragma unroll for (int x = 0; x < 4; x++) v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ? - convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar; + convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar; float tab1y[4], tab1x[4]; @@ -196,8 +213,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s interpolateCubic(ayy, tab1y); interpolateCubic(axx, tab1x); - int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T)); - __global T * dst = (__global T *)(dstptr + dst_index); + int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize); WT sum = (WT)(0); #if depth <= 4 @@ -210,12 +226,12 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s #pragma unroll for (int i = 0; i < 16; i++) sum += v[i] * itab[i]; - dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ); + storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index); #else #pragma unroll for (int i = 0; i < 16; i++) sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)]; - dst[0] = convertToT( sum ); + storepix(convertToT( sum ), dstptr + dst_index); #endif } } From 630bdbf42fca7883585ed04ed927a009b9cd51c0 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Thu, 13 Feb 2014 17:37:42 +0400 Subject: [PATCH 3/3] fixed OpenCL kernel flag in resize (thanks to Ilya L) --- modules/imgproc/src/imgwarp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index fb346f342f..a9e16ec4ef 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3924,7 +3924,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, else { char cvt[2][50]; - opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s cn=%d", + opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s -D cn=%d", interpolationMap[interpolation], ocl::typeToStr(type), ocl::typeToStr(CV_MAT_DEPTH(type)), ocl::typeToStr(sctype),