From b2c2aabd04eef8b76200c231b965a4705693bda4 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Tue, 10 Jun 2014 17:11:08 +0400
Subject: [PATCH 1/4] used built-in functions

---
 modules/core/src/mathfuncs.cpp    |  2 +-
 modules/core/src/opencl/arithm.cl | 52 ++++++++++++++++++-------------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 1893214243..81677c3b35 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -2501,7 +2501,7 @@ static bool ocl_patchNaNs( InputOutputArray _a, float value )
 {
     int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                     format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int -D rowsPerWI=%d",
+                     format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=float -D rowsPerWI=%d",
                             rowsPerWI));
     if (k.empty())
         return false;
diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl
index def115c6e8..a6bdc558b9 100644
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -65,6 +65,12 @@
 #endif
 #endif
 
+#ifdef INTEL_DEVICE
+#pragma OPENCL FP_CONTRACT : on
+#pragma OPENCL FP_FAST_FMAF : on
+#pragma OPENCL FP_FAST_FMA : on
+#endif
+
 #if depth <= 5
 #define CV_PI M_PI_F
 #else
@@ -237,7 +243,7 @@
 #if wdepth <= 4
 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma))))
 #else
-#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, alpha, mad(srcelem2, beta, gamma))))
+#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, mad(srcelem2, beta, gamma))))
 #endif
 
 #elif defined OP_MAG
@@ -251,17 +257,23 @@
 #elif defined OP_PHASE_RADIANS
 #define PROCESS_ELEM \
         workT tmp = atan2(srcelem2, srcelem1); \
-        if(tmp < 0) tmp += 6.283185307179586232f; \
+        if (tmp < 0) \
+            tmp += 6.283185307179586232f; \
         storedst(tmp)
 
 #elif defined OP_PHASE_DEGREES
     #define PROCESS_ELEM \
-    workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465f; \
-    if(tmp < 0) tmp += 360; \
+    workT tmp = degrees(atan2(srcelem2, srcelem1)); \
+    if (tmp < 0) \
+        tmp += 360; \
     storedst(tmp)
 
 #elif defined OP_EXP
+#if wdepth == 5
+#define PROCESS_ELEM storedst(native_exp(srcelem1))
+#else
 #define PROCESS_ELEM storedst(exp(srcelem1))
+#endif
 
 #elif defined OP_POW
 #define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))
@@ -272,11 +284,11 @@
 #define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))
 
 #elif defined OP_SQRT
-#define PROCESS_ELEM storedst(sqrt(srcelem1))
+#define PROCESS_ELEM storedst(native_sqrt(srcelem1))
 
 #elif defined OP_LOG
 #define PROCESS_ELEM \
-    dstT v = (dstT)(srcelem1);\
+    dstT v = (dstT)(srcelem1); \
     storedst(v > (dstT)(0) ? log(v) : log(-v))
 
 #elif defined OP_CMP
@@ -285,9 +297,8 @@
 #define convertToWT1
 #endif
 #define PROCESS_ELEM \
-    workT __s1 = srcelem1; \
-    workT __s2 = srcelem2; \
-    storedst(((__s1 CMP_OPERATOR __s2) ? (dstT)(255) : (dstT)(0)))
+    workT s1 = srcelem1, s2 = srcelem2; \
+    storedst(s1 CMP_OPERATOR s2 ? (dstT)(255) : (dstT)(0))
 
 #elif defined OP_CONVERT_SCALE_ABS
 #undef EXTRA_PARAMS
@@ -298,7 +309,7 @@
     storedst(convertToDT(value >= 0 ? value : -value))
 #else
 #define PROCESS_ELEM \
-    workT value = mad(srcelem1, (workT)(alpha), (workT)(beta)); \
+    workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \
     storedst(convertToDT(value >= 0 ? value : -value))
 #endif
 
@@ -308,7 +319,7 @@
 #if wdepth <= 4
 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2)))
 #else
-#define PROCESS_ELEM storedst(convertToDT(mad(srcelem1, (workT)(alpha), srcelem2)))
+#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, (workT)(alpha), srcelem2)))
 #endif
 
 #elif defined OP_CTP_AD || defined OP_CTP_AR
@@ -318,7 +329,7 @@
 #define CV_EPSILON DBL_EPSILON
 #endif
 #ifdef OP_CTP_AD
-#define TO_DEGREE cartToPolar *= (180 / CV_PI);
+#define TO_DEGREE cartToPolar = degrees(cartToPolar);
 #elif defined OP_CTP_AR
 #define TO_DEGREE
 #endif
@@ -336,24 +347,21 @@
 
 #elif defined OP_PTC_AD || defined OP_PTC_AR
 #ifdef OP_PTC_AD
-#define FROM_DEGREE \
-    dstT ascale = CV_PI/180.0f; \
-    dstT alpha = y * ascale
+#define FROM_DEGREE y = radians(y)
 #else
-#define FROM_DEGREE \
-    dstT alpha = y
+#define FROM_DEGREE
 #endif
 #define PROCESS_ELEM \
-    dstT x = srcelem1, y = srcelem2; \
+    dstT x = srcelem1, y = srcelem2, cosval; \
     FROM_DEGREE; \
-    storedst(cos(alpha) * x); \
-    storedst2(sin(alpha) * x)
+    storedst2(sincos(y, &srcelem2) * x); \
+    storedst(cosval * x);
 
 #elif defined OP_PATCH_NANS
 #undef EXTRA_PARAMS
-#define EXTRA_PARAMS , int val
+#define EXTRA_PARAMS , dstT val
 #define PROCESS_ELEM \
-    if (( srcelem1 & 0x7fffffff) > 0x7f800000 ) \
+    if (isnan(srcelem1)) \
         storedst(val)
 
 #else

From f1e24381d1d4a90c6489dcbf866a48c0f27e73c8 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 11 Jun 2014 18:30:48 +0400
Subject: [PATCH 2/4] used abs

---
 modules/core/src/opencl/arithm.cl | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl
index a6bdc558b9..8945ed422a 100644
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -163,9 +163,13 @@
 #define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1))
 
 #elif defined OP_ABSDIFF
+#if wdepth <= 4
 #define PROCESS_ELEM \
-    workT v = srcelem1 - srcelem2; \
-    storedst(convertToDT(v >= (workT)(0) ? v : -v))
+    storedst(convertToDT(abs_diff(srcelem1, srcelem2)))
+#else
+#define PROCESS_ELEM \
+    storedst(convertToDT(fabs(srcelem1 - srcelem2)))
+#endif
 
 #elif defined OP_AND
 #define PROCESS_ELEM storedst(srcelem1 & srcelem2)
@@ -249,17 +253,12 @@
 #elif defined OP_MAG
 #define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))
 
-#elif defined OP_ABS_NOSAT
-#define PROCESS_ELEM \
-    dstT v = convertToDT(srcelem1); \
-    storedst(v >= 0 ? v : -v)
-
 #elif defined OP_PHASE_RADIANS
 #define PROCESS_ELEM \
-        workT tmp = atan2(srcelem2, srcelem1); \
-        if (tmp < 0) \
-            tmp += 6.283185307179586232f; \
-        storedst(tmp)
+    workT tmp = atan2(srcelem2, srcelem1); \
+    if (tmp < 0) \
+        tmp += 6.283185307179586232f; \
+    storedst(tmp)
 
 #elif defined OP_PHASE_DEGREES
     #define PROCESS_ELEM \
@@ -288,8 +287,7 @@
 
 #elif defined OP_LOG
 #define PROCESS_ELEM \
-    dstT v = (dstT)(srcelem1); \
-    storedst(v > (dstT)(0) ? log(v) : log(-v))
+    storedst(log(fabs(srcelem1)))
 
 #elif defined OP_CMP
 #define srcT2 srcT1
@@ -297,8 +295,7 @@
 #define convertToWT1
 #endif
 #define PROCESS_ELEM \
-    workT s1 = srcelem1, s2 = srcelem2; \
-    storedst(s1 CMP_OPERATOR s2 ? (dstT)(255) : (dstT)(0))
+    storedst(srcelem1 CMP_OPERATOR srcelem2 ? (dstT)(255) : (dstT)(0)))
 
 #elif defined OP_CONVERT_SCALE_ABS
 #undef EXTRA_PARAMS
@@ -306,11 +303,11 @@
 #if wdepth <= 4
 #define PROCESS_ELEM \
     workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \
-    storedst(convertToDT(value >= 0 ? value : -value))
+    storedst(convertToDT(abs(value)))
 #else
 #define PROCESS_ELEM \
     workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \
-    storedst(convertToDT(value >= 0 ? value : -value))
+    storedst(convertToDT(fabs(value)))
 #endif
 
 #elif defined OP_SCALE_ADD

From 316c044e0653a21320136572f2f4bccff7e12525 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 11 Jun 2014 18:54:43 +0400
Subject: [PATCH 3/4] used abs in reduction operations

---
 modules/core/src/opencl/minmaxloc.cl | 14 +++++++--
 modules/core/src/opencl/reduce.cl    | 46 ++++++++++++++++------------
 modules/core/src/stat.cpp            |  4 +--
 3 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index eb57347a28..ca708a89c1 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -39,6 +39,14 @@
 #define noconvert
 #define INDEX_MAX UINT_MAX
 
+#if wdepth <= 4
+#define MIN_ABS(a) convertToDT(abs(a))
+#define MIN_ABS2(a, b) convertToDT(abs_diff(a, b))
+#else
+#define MIN_ABS(a) fabs(a)
+#define MIN_ABS2(a, b) fabs(a - b)
+#endif
+
 #if kercn != 3
 #define loadpix(addr) *(__global const srcT *)(addr)
 #define srcTSIZE (int)sizeof(srcT)
@@ -182,7 +190,7 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
 #endif
             temp = convertToDT(loadpix(srcptr + src_index));
 #ifdef OP_ABS
-            temp = temp >= (dstT)(0) ? temp : -temp;
+            temp = MIN_ABS(temp);
 #endif
 
 #ifdef HAVE_SRC2
@@ -192,9 +200,9 @@ __kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_off
             src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
 #endif
             temp2 = convertToDT(loadpix(src2ptr + src2_index));
-            temp = temp > temp2 ? temp - temp2 : (temp2 - temp);
+            temp = MIN_ABS2(temp, temp2);
 #ifdef OP_CALC2
-            temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2;
+            temp2 = MIN_ABS(temp2);
 #endif
 #endif
 
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 888b5dff8b..8c5193f75f 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -108,6 +108,14 @@
 #define dstTSIZE ((int)sizeof(dstT1)*3)
 #endif
 
+#if ddepth <= 4
+#define SUM_ABS(a) convertToDT(abs(a))
+#define SUM_ABS2(a, b) convertToDT(abs_diff(a, b))
+#else
+#define SUM_ABS(a) fabs(a)
+#define SUM_ABS2(a, b) fabs(a - b)
+#endif
+
 #ifdef HAVE_MASK
 #ifdef HAVE_SRC2
 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset
@@ -136,7 +144,7 @@
 #define FUNC(a, b) a += b
 
 #elif defined OP_SUM_ABS
-#define FUNC(a, b) a += b >= (dstT)(0) ? b : -b
+#define FUNC(a, b) a += SUM_ABS(b)
 
 #elif defined OP_SUM_SQR
 #if ddepth <= 4
@@ -163,15 +171,15 @@
 #define PROCESS_ELEMS \
     dstT temp = convertToDT(loadpix(srcptr + src_index)); \
     dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
-    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
+    temp = SUM_ABS2(temp, temp2); \
+    temp2 = SUM_ABS(temp2); \
     FUNC(accumulator2, temp2); \
     FUNC(accumulator, temp)
 #else
 #define PROCESS_ELEMS \
     dstT temp = convertToDT(loadpix(srcptr + src_index)); \
     dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp = SUM_ABS2(temp, temp2); \
     FUNC(accumulator, temp)
 #endif
 #else
@@ -255,16 +263,16 @@
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
-    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
+    temp = SUM_ABS2(temp, temp2); \
+    temp2 = SUM_ABS(temp2); \
     FUNC(accumulator, temp); \
     FUNC(accumulator2, temp2)
 #elif kercn == 2
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
-    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
+    temp = SUM_ABS2(temp, temp2); \
+    temp2 = SUM_ABS(temp2); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator2, temp2.s0); \
@@ -273,8 +281,8 @@
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
-    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
+    temp = SUM_ABS2(temp, temp2); \
+    temp2 = SUM_ABS(temp2); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -287,8 +295,8 @@
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
-    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
+    temp = SUM_ABS2(temp, temp2); \
+    temp2 = SUM_ABS(temp2); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -309,8 +317,8 @@
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
-    temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
+    temp = SUM_ABS2(temp, temp2); \
+    temp2 = SUM_ABS(temp2); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -349,20 +357,20 @@
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp = SUM_ABS2(temp, temp2); \
     FUNC(accumulator, temp)
 #elif kercn == 2
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp = SUM_ABS2(temp, temp2); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1)
 #elif kercn == 4
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp = SUM_ABS2(temp, temp2); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -371,7 +379,7 @@
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp = SUM_ABS2(temp, temp2)); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
@@ -384,7 +392,7 @@
 #define REDUCE_GLOBAL \
     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
-    temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
+    temp = SUM_ABS2(temp, temp2); \
     FUNC(accumulator, temp.s0); \
     FUNC(accumulator, temp.s1); \
     FUNC(accumulator, temp.s2); \
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 79da3c623f..eb13247c7e 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -1471,7 +1471,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     char cvt[40];
     String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
                          " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
-                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s",
+                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d",
                          depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
@@ -1482,7 +1482,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                          ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
                          ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "",
                          haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
-                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
+                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth);
 
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())

From 0528d2e2b3fb461a1ae81cf3bb02df1e84c038db Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 11 Jun 2014 19:30:10 +0400
Subject: [PATCH 4/4] added 32s to 32u conversion

---
 modules/core/src/arithm.cpp          |  6 ++++--
 modules/core/src/ocl.cpp             |  4 ++--
 modules/core/src/opencl/arithm.cl    | 16 ++++++++--------
 modules/core/src/opencl/minmaxloc.cl |  4 ++--
 modules/core/src/opencl/reduce.cl    |  4 ++--
 modules/core/src/stat.cpp            | 19 +++++++++++--------
 6 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 98d9567271..b969ad4170 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1396,7 +1396,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     char cvtstr[4][32], opts[1024];
     sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
             "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
-            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d",
+            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s",
             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
             oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
             ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
@@ -1407,7 +1407,9 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
             ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
             ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
             ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
-            doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI);
+            doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI,
+            oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ?
+            ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert");
 
     size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
     const uchar* usrdata_p = (const uchar*)usrdata;
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index b580df194d..24ca6ee4d3 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -4591,7 +4591,7 @@ struct Image2D::Impl
         CV_OclDbgAssert(err == CL_SUCCESS);
 
         size_t origin[] = { 0, 0, 0 };
-        size_t region[] = { src.cols, src.rows, 1 };
+        size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
 
         cl_mem devData;
         if (!alias && !src.isContinuous())
@@ -4599,7 +4599,7 @@ struct Image2D::Impl
             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
             CV_OclDbgAssert(err == CL_SUCCESS);
 
-            const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1};
+            const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
             CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl
index 8945ed422a..9dfb5f239e 100644
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -66,9 +66,9 @@
 #endif
 
 #ifdef INTEL_DEVICE
-#pragma OPENCL FP_CONTRACT : on
-#pragma OPENCL FP_FAST_FMAF : on
-#pragma OPENCL FP_FAST_FMA : on
+#pragma OPENCL FP_CONTRACT ON
+#pragma OPENCL FP_FAST_FMAF ON
+#pragma OPENCL FP_FAST_FMA ON
 #endif
 
 #if depth <= 5
@@ -165,7 +165,7 @@
 #elif defined OP_ABSDIFF
 #if wdepth <= 4
 #define PROCESS_ELEM \
-    storedst(convertToDT(abs_diff(srcelem1, srcelem2)))
+    storedst(convertToDT(convertFromU(abs_diff(srcelem1, srcelem2))))
 #else
 #define PROCESS_ELEM \
     storedst(convertToDT(fabs(srcelem1 - srcelem2)))
@@ -247,7 +247,7 @@
 #if wdepth <= 4
 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma))))
 #else
-#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, mad(srcelem2, beta, gamma))))
+#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, fma(srcelem2, beta, gamma))))
 #endif
 
 #elif defined OP_MAG
@@ -257,7 +257,7 @@
 #define PROCESS_ELEM \
     workT tmp = atan2(srcelem2, srcelem1); \
     if (tmp < 0) \
-        tmp += 6.283185307179586232f; \
+        tmp += 2 * CV_PI; \
     storedst(tmp)
 
 #elif defined OP_PHASE_DEGREES
@@ -295,7 +295,7 @@
 #define convertToWT1
 #endif
 #define PROCESS_ELEM \
-    storedst(srcelem1 CMP_OPERATOR srcelem2 ? (dstT)(255) : (dstT)(0)))
+    storedst(srcelem1 CMP_OPERATOR srcelem2 ? (dstT)(255) : (dstT)(0))
 
 #elif defined OP_CONVERT_SCALE_ABS
 #undef EXTRA_PARAMS
@@ -351,7 +351,7 @@
 #define PROCESS_ELEM \
     dstT x = srcelem1, y = srcelem2, cosval; \
     FROM_DEGREE; \
-    storedst2(sincos(y, &srcelem2) * x); \
+    storedst2(sincos(y, &cosval) * x); \
     storedst(cosval * x);
 
 #elif defined OP_PATCH_NANS
diff --git a/modules/core/src/opencl/minmaxloc.cl b/modules/core/src/opencl/minmaxloc.cl
index ca708a89c1..a51c5d93a3 100644
--- a/modules/core/src/opencl/minmaxloc.cl
+++ b/modules/core/src/opencl/minmaxloc.cl
@@ -40,8 +40,8 @@
 #define INDEX_MAX UINT_MAX
 
 #if wdepth <= 4
-#define MIN_ABS(a) convertToDT(abs(a))
-#define MIN_ABS2(a, b) convertToDT(abs_diff(a, b))
+#define MIN_ABS(a) convertFromU(abs(a))
+#define MIN_ABS2(a, b) convertFromU(abs_diff(a, b))
 #else
 #define MIN_ABS(a) fabs(a)
 #define MIN_ABS2(a, b) fabs(a - b)
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 8c5193f75f..f16a742e54 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -109,8 +109,8 @@
 #endif
 
 #if ddepth <= 4
-#define SUM_ABS(a) convertToDT(abs(a))
-#define SUM_ABS2(a, b) convertToDT(abs_diff(a, b))
+#define SUM_ABS(a) convertFromU(abs(a))
+#define SUM_ABS2(a, b) convertFromU(abs_diff(a, b))
 #else
 #define SUM_ABS(a) fabs(a)
 #define SUM_ABS2(a, b) fabs(a - b)
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index eb13247c7e..3dd042860d 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -499,20 +499,21 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
     wgs2_aligned >>= 1;
 
     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
-    char cvt[40];
+    char cvt[2][40];
     String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
-                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s",
+                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
                          ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                          ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                          ocl::typeToStr(ddepth), ddepth, cn,
-                         ocl::convertTypeStr(depth, ddepth, mcn, cvt),
+                         ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]),
                          opMap[sum_op], (int)wgs, wgs2_aligned,
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                          haveMask ? " -D HAVE_MASK" : "",
                          _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                          haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                          haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
-                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "");
+                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
+                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, mcn, cvt[1]) : "noconvert");
 
     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
     if (k.empty())
@@ -1468,10 +1469,10 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
             needMaxLoc = true;
     }
 
-    char cvt[40];
+    char cvt[2][40];
     String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
                          " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
-                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d",
+                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s",
                          depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
@@ -1480,9 +1481,11 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
                          needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
                          needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                          ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
-                         ocl::convertTypeStr(depth, ddepth, kercn, cvt), absValues ? " -D OP_ABS" : "",
+                         ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]),
+                         absValues ? " -D OP_ABS" : "",
                          haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
-                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth);
+                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth,
+                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert");
 
     ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
     if (k.empty())