diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 7ca8b4b48d..10d8a1cd46 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -1396,7 +1396,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, char cvtstr[4][32], opts[1024]; sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " - "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d", + "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), @@ -1407,7 +1407,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI); + doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI, + oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ? + ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert"); size_t usrdata_esz = CV_ELEM_SIZE(wdepth); const uchar* usrdata_p = (const uchar*)usrdata; diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index b65f4f0091..7a02bd6d72 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -2513,7 +2513,7 @@ static bool ocl_patchNaNs( InputOutputArray _a, float value ) { int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int -D rowsPerWI=%d", + format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=float -D rowsPerWI=%d", rowsPerWI)); if (k.empty()) return false; diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index 7ff3286d83..3478c1a518 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -65,6 +65,12 @@ #endif #endif +#ifdef INTEL_DEVICE +#pragma OPENCL FP_CONTRACT ON +#pragma OPENCL FP_FAST_FMAF ON +#pragma OPENCL FP_FAST_FMA ON +#endif + #if depth <= 5 #define CV_PI M_PI_F #else @@ -157,9 +163,13 @@ #define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1)) #elif defined OP_ABSDIFF +#if wdepth <= 4 #define PROCESS_ELEM \ - workT v = srcelem1 - srcelem2; \ - storedst(convertToDT(v >= (workT)(0) ? v : -v)) + storedst(convertToDT(convertFromU(abs_diff(srcelem1, srcelem2)))) +#else +#define PROCESS_ELEM \ + storedst(convertToDT(fabs(srcelem1 - srcelem2))) +#endif #elif defined OP_AND #define PROCESS_ELEM storedst(srcelem1 & srcelem2) @@ -237,31 +247,32 @@ #if wdepth <= 4 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma)))) #else -#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, mad(srcelem2, beta, gamma)))) +#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, fma(srcelem2, beta, gamma)))) #endif #elif defined OP_MAG #define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2)) -#elif defined OP_ABS_NOSAT -#define PROCESS_ELEM \ - dstT v = convertToDT(srcelem1); \ - storedst(v >= 0 ? v : -v) - #elif defined OP_PHASE_RADIANS #define PROCESS_ELEM \ - workT tmp = atan2(srcelem2, srcelem1); \ - if(tmp < 0) tmp += 6.283185307179586232f; \ - storedst(tmp) + workT tmp = atan2(srcelem2, srcelem1); \ + if (tmp < 0) \ + tmp += 2 * CV_PI; \ + storedst(tmp) #elif defined OP_PHASE_DEGREES #define PROCESS_ELEM \ - workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465f; \ - if(tmp < 0) tmp += 360; \ + workT tmp = degrees(atan2(srcelem2, srcelem1)); \ + if (tmp < 0) \ + tmp += 360; \ storedst(tmp) #elif defined OP_EXP +#if wdepth == 5 +#define PROCESS_ELEM storedst(native_exp(srcelem1)) +#else #define PROCESS_ELEM storedst(exp(srcelem1)) +#endif #elif defined OP_POW #define PROCESS_ELEM storedst(pow(srcelem1, srcelem2)) @@ -282,12 +293,11 @@ #define PROCESS_ELEM storedst(pown(srcelem1, srcelem2)) #elif defined OP_SQRT -#define PROCESS_ELEM storedst(sqrt(srcelem1)) +#define PROCESS_ELEM storedst(native_sqrt(srcelem1)) #elif defined OP_LOG #define PROCESS_ELEM \ - dstT v = (dstT)(srcelem1);\ - storedst(v > (dstT)(0) ? log(v) : log(-v)) + storedst(log(fabs(srcelem1))) #elif defined OP_CMP #define srcT2 srcT1 @@ -295,9 +305,7 @@ #define convertToWT1 #endif #define PROCESS_ELEM \ - workT __s1 = srcelem1; \ - workT __s2 = srcelem2; \ - storedst(((__s1 CMP_OPERATOR __s2) ? (dstT)(255) : (dstT)(0))) + storedst(srcelem1 CMP_OPERATOR srcelem2 ? (dstT)(255) : (dstT)(0)) #elif defined OP_CONVERT_SCALE_ABS #undef EXTRA_PARAMS @@ -305,11 +313,11 @@ #if wdepth <= 4 #define PROCESS_ELEM \ workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \ - storedst(convertToDT(value >= 0 ? value : -value)) + storedst(convertToDT(abs(value))) #else #define PROCESS_ELEM \ - workT value = mad(srcelem1, (workT)(alpha), (workT)(beta)); \ - storedst(convertToDT(value >= 0 ? value : -value)) + workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \ + storedst(convertToDT(fabs(value))) #endif #elif defined OP_SCALE_ADD @@ -318,7 +326,7 @@ #if wdepth <= 4 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2))) #else -#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, (workT)(alpha), srcelem2))) +#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, (workT)(alpha), srcelem2))) #endif #elif defined OP_CTP_AD || defined OP_CTP_AR @@ -328,7 +336,7 @@ #define CV_EPSILON DBL_EPSILON #endif #ifdef OP_CTP_AD -#define TO_DEGREE cartToPolar *= (180 / CV_PI); +#define TO_DEGREE cartToPolar = degrees(cartToPolar); #elif defined OP_CTP_AR #define TO_DEGREE #endif @@ -346,24 +354,21 @@ #elif defined OP_PTC_AD || defined OP_PTC_AR #ifdef OP_PTC_AD -#define FROM_DEGREE \ - dstT ascale = CV_PI/180.0f; \ - dstT alpha = y * ascale +#define FROM_DEGREE y = radians(y) #else -#define FROM_DEGREE \ - dstT alpha = y +#define FROM_DEGREE #endif #define PROCESS_ELEM \ - dstT x = srcelem1, y = srcelem2; \ + dstT x = srcelem1, y = srcelem2, cosval; \ FROM_DEGREE; \ - storedst(cos(alpha) * x); \ - storedst2(sin(alpha) * x) + storedst2(sincos(y, &cosval) * x); \ + storedst(cosval * x); #elif defined OP_PATCH_NANS #undef EXTRA_PARAMS -#define EXTRA_PARAMS , int val +#define EXTRA_PARAMS , dstT val #define PROCESS_ELEM \ - if (( srcelem1 & 0x7fffffff) > 0x7f800000 ) \ + if (isnan(srcelem1)) \ storedst(val) #else diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl index eb57347a28..a51c5d93a3 100644 --- a/modules/core/src/opencl/minmaxloc.cl +++ b/modules/core/src/opencl/minmaxloc.cl @@ -39,6 +39,14 @@ #define noconvert #define INDEX_MAX UINT_MAX +#if wdepth <= 4 +#define MIN_ABS(a) convertFromU(abs(a)) +#define MIN_ABS2(a, b) convertFromU(abs_diff(a, b)) +#else +#define MIN_ABS(a) fabs(a) +#define MIN_ABS2(a, b) fabs(a - b) +#endif + #if kercn != 3 #define loadpix(addr) *(__global const srcT *)(addr) #define srcTSIZE (int)sizeof(srcT) @@ -182,7 +190,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off #endif temp = convertToDT(loadpix(srcptr + src_index)); #ifdef OP_ABS - temp = temp >= (dstT)(0) ? temp : -temp; + temp = MIN_ABS(temp); #endif #ifdef HAVE_SRC2 @@ -192,9 +200,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE)); #endif temp2 = convertToDT(loadpix(src2ptr + src2_index)); - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); + temp = MIN_ABS2(temp, temp2); #ifdef OP_CALC2 - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; + temp2 = MIN_ABS(temp2); #endif #endif diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 888b5dff8b..f16a742e54 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -108,6 +108,14 @@ #define dstTSIZE ((int)sizeof(dstT1)*3) #endif +#if ddepth <= 4 +#define SUM_ABS(a) convertFromU(abs(a)) +#define SUM_ABS2(a, b) convertFromU(abs_diff(a, b)) +#else +#define SUM_ABS(a) fabs(a) +#define SUM_ABS2(a, b) fabs(a - b) +#endif + #ifdef HAVE_MASK #ifdef HAVE_SRC2 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset @@ -136,7 +144,7 @@ #define FUNC(a, b) a += b #elif defined OP_SUM_ABS -#define FUNC(a, b) a += b >= (dstT)(0) ? b : -b +#define FUNC(a, b) a += SUM_ABS(b) #elif defined OP_SUM_SQR #if ddepth <= 4 @@ -163,15 +171,15 @@ #define PROCESS_ELEMS \ dstT temp = convertToDT(loadpix(srcptr + src_index)); \ dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ + temp = SUM_ABS2(temp, temp2); \ + temp2 = SUM_ABS(temp2); \ FUNC(accumulator2, temp2); \ FUNC(accumulator, temp) #else #define PROCESS_ELEMS \ dstT temp = convertToDT(loadpix(srcptr + src_index)); \ dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp = SUM_ABS2(temp, temp2); \ FUNC(accumulator, temp) #endif #else @@ -255,16 +263,16 @@ #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ + temp = SUM_ABS2(temp, temp2); \ + temp2 = SUM_ABS(temp2); \ FUNC(accumulator, temp); \ FUNC(accumulator2, temp2) #elif kercn == 2 #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ + temp = SUM_ABS2(temp, temp2); \ + temp2 = SUM_ABS(temp2); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator2, temp2.s0); \ @@ -273,8 +281,8 @@ #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ + temp = SUM_ABS2(temp, temp2); \ + temp2 = SUM_ABS(temp2); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -287,8 +295,8 @@ #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ + temp = SUM_ABS2(temp, temp2); \ + temp2 = SUM_ABS(temp2); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -309,8 +317,8 @@ #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ - temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \ + temp = SUM_ABS2(temp, temp2); \ + temp2 = SUM_ABS(temp2); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -349,20 +357,20 @@ #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp = SUM_ABS2(temp, temp2); \ FUNC(accumulator, temp) #elif kercn == 2 #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp = SUM_ABS2(temp, temp2); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1) #elif kercn == 4 #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp = SUM_ABS2(temp, temp2); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -371,7 +379,7 @@ #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp = SUM_ABS2(temp, temp2)); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ @@ -384,7 +392,7 @@ #define REDUCE_GLOBAL \ dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ - temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \ + temp = SUM_ABS2(temp, temp2); \ FUNC(accumulator, temp.s0); \ FUNC(accumulator, temp.s1); \ FUNC(accumulator, temp.s2); \ diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 79da3c623f..3dd042860d 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -499,20 +499,21 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask wgs2_aligned >>= 1; static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; - char cvt[40]; + char cvt[2][40]; String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d" - " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s", + " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s", ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth), ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)), ocl::typeToStr(ddepth), ddepth, cn, - ocl::convertTypeStr(depth, ddepth, mcn, cvt), + ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]), opMap[sum_op], (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", haveMask ? " -D HAVE_MASK" : "", _src.isContinuous() ? " -D HAVE_SRC_CONT" : "", haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn, haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "", - haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); + haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", + depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, mcn, cvt[1]) : "noconvert"); ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts); if (k.empty()) @@ -1468,10 +1469,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* needMaxLoc = true; } - char cvt[40]; + char cvt[2][40]; String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s" " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s" - " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s", + " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s", depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs, ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", @@ -1480,9 +1481,11 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "", needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "", ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)), - ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "", + ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]), + absValues ? " -D OP_ABS" : "", haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "", - haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : ""); + haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth, + depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert"); ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts); if (k.empty())