diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index ab12dc2cbf..c296f57a3d 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -584,7 +584,8 @@ namespace cv
 
         CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
 
-        CV_EXPORTS void setIdentity(oclMat& src, double val);
+        //! initializes a scaled identity matrix
+        CV_EXPORTS void setIdentity(oclMat& src, const Scalar & val = Scalar(1));
 
         //////////////////////////////// Filter Engine ////////////////////////////////
 
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index 0dd695bfa6..2a663b990a 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -64,11 +64,10 @@ namespace cv
     {
         //////////////////////////////// OpenCL kernel strings /////////////////////
 
+        extern const char *arithm_absdiff_nonsaturate;
         extern const char *arithm_nonzero;
         extern const char *arithm_sum;
-        extern const char *arithm_sum_3;
         extern const char *arithm_minMax;
-        extern const char *arithm_minMax_mask;
         extern const char *arithm_minMaxLoc;
         extern const char *arithm_minMaxLoc_mask;
         extern const char *arithm_LUT;
@@ -318,21 +317,28 @@ void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int
 ////////////////////////////////// sum  //////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-//type = 0 sum,type = 1 absSum,type = 2 sqrSum
-static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum, int type = 0)
+enum { SUM = 0, ABS_SUM, SQR_SUM };
+
+static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int groupnum, int type, int ddepth)
 {
-    vector<pair<size_t , const void *> > args;
-    int all_cols = src.step / (vlen * src.elemSize1());
-    int pre_cols = (src.offset % src.step) / (vlen * src.elemSize1());
-    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / (vlen * src.elemSize1()) - 1;
+    int ochannels = src.oclchannels();
+    int all_cols = src.step / src.elemSize();
+    int pre_cols = (src.offset % src.step) / src.elemSize();
+    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
     int invalid_cols = pre_cols + sec_cols;
     int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
-    int offset = src.offset / (vlen * src.elemSize1());
-    int repeat_s = src.offset / src.elemSize1() - offset * vlen;
-    int repeat_e = (offset + cols) * vlen - src.offset / src.elemSize1() - src.cols * src.oclchannels();
-    char build_options[512];
-    CV_Assert(type == 0 || type == 1 || type == 2);
-    sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d -D FUNC_TYPE_%d", src.depth(), repeat_s, repeat_e, type);
+    int offset = src.offset / src.elemSize();
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char * const funcMap[] = { "FUNC_SUM", "FUNC_ABS_SUM", "FUNC_SQR_SUM" };
+    const char * const channelMap[] = { " ", " ", "2", "4", "4" };
+    string buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s -D %s",
+                                 typeMap[src.depth()], channelMap[ochannels],
+                                 typeMap[ddepth], channelMap[ochannels],
+                                 typeMap[ddepth], channelMap[ochannels],
+                                 funcMap[type]);
+
+    vector<pair<size_t , const void *> > args;
     args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
@@ -340,55 +346,63 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen ,
     args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
-    size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-    if (src.oclchannels() != 3)
-        openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", gt, lt, args, -1, -1, build_options);
-    else
-        openCLExecuteKernel(src.clCxt, &arithm_sum_3, "arithm_op_sum_3", gt, lt, args, -1, -1, build_options);
+    size_t globalThreads[3] = { groupnum * 256, 1, 1 };
+    size_t localThreads[3] = { 256, 1, 1 };
+
+    openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
 template <typename T>
-Scalar arithmetic_sum(const oclMat &src, int type = 0)
+Scalar arithmetic_sum(const oclMat &src, int type, int ddepth)
 {
+    CV_Assert(src.step % src.elemSize() == 0);
+
     size_t groupnum = src.clCxt->computeUnits();
     CV_Assert(groupnum != 0);
-    int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen;
+
+    int dbsize = groupnum * src.oclchannels();
     Context *clCxt = src.clCxt;
 
     AutoBuffer<T> _buf(dbsize);
     T *p = (T*)_buf;
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
-    Scalar s = Scalar::all(0.0);
-    arithmetic_sum_buffer_run(src, dstBuffer, vlen, groupnum, type);
-
     memset(p, 0, dbsize * sizeof(T));
-    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T));
-    for (int i = 0; i < dbsize;)
-    {
-        for (int j = 0; j < src.oclchannels(); j++, i++)
-            s.val[j] += p[i];
-    }
 
+    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
+    arithmetic_sum_buffer_run(src, dstBuffer, groupnum, type, ddepth);
+    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T));
     openCLFree(dstBuffer);
+
+    Scalar s = Scalar::all(0.0);
+    for (int i = 0; i < dbsize;)
+         for (int j = 0; j < src.oclchannels(); j++, i++)
+            s.val[j] += p[i];
+
     return s;
 }
 
-typedef Scalar (*sumFunc)(const oclMat &src, int type);
+typedef Scalar (*sumFunc)(const oclMat &src, int type, int ddepth);
+
 Scalar cv::ocl::sum(const oclMat &src)
 {
     if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
-    static sumFunc functab[2] =
+    static sumFunc functab[3] =
     {
+        arithmetic_sum<int>,
         arithmetic_sum<float>,
         arithmetic_sum<double>
     };
 
-    sumFunc func;
-    func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
-    return func(src, 0);
+    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    int ddepth = std::max(src.depth(), CV_32S);
+    if (!hasDouble && ddepth == CV_64F)
+        ddepth = CV_32F;
+
+    sumFunc func = functab[ddepth - CV_32S];
+    return func(src, SUM, ddepth);
 }
 
 Scalar cv::ocl::absSum(const oclMat &src)
@@ -397,15 +411,20 @@ Scalar cv::ocl::absSum(const oclMat &src)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
-    static sumFunc functab[2] =
+    static sumFunc functab[3] =
     {
+        arithmetic_sum<int>,
         arithmetic_sum<float>,
         arithmetic_sum<double>
     };
 
-    sumFunc func;
-    func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
-    return func(src, 1);
+    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    int ddepth = std::max(src.depth(), CV_32S);
+    if (!hasDouble && ddepth == CV_64F)
+        ddepth = CV_32F;
+
+    sumFunc func = functab[ddepth - CV_32S];
+    return func(src, ABS_SUM, ddepth);
 }
 
 Scalar cv::ocl::sqrSum(const oclMat &src)
@@ -414,15 +433,18 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
-    static sumFunc functab[2] =
+    static sumFunc functab[3] =
     {
-        arithmetic_sum<float>,
+        arithmetic_sum<int>,
+        arithmetic_sum<double>,
         arithmetic_sum<double>
     };
 
-    sumFunc func;
-    func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
-    return func(src, 2);
+    bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
+    int ddepth = src.depth() <= CV_32S ? CV_32S : (hasDouble ? CV_64F : CV_32F);
+
+    sumFunc func = functab[ddepth - CV_32S];
+    return func(src, SQR_SUM, ddepth);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -431,23 +453,15 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
 
 void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
 {
-    CV_Assert(src.depth() <= CV_32S);
-    cv::Size sz(1, 1);
-    int channels = src.oclchannels();
-    Mat m1(sz, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(0)),
-        m2(sz, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(0));
-    oclMat dst1(m1), dst2(m2);
+    double total = 1.0 / src.size().area();
 
-    //arithmetic_sum_run(src, dst1,"arithm_op_sum");
-    //arithmetic_sum_run(src, dst2,"arithm_op_squares_sum");
+    mean = sum(src);
+    stddev = sqrSum(src);
 
-    m1 = (Mat)dst1;
-    m2 = (Mat)dst2;
-    int i = 0, *p = (int *)m1.data, *q = (int *)m2.data;
-    for (; i < channels; i++)
+    for (int i = 0; i < 4; ++i)
     {
-        mean.val[i] = (double)p[i] / (src.cols * src.rows);
-        stddev.val[i] = std::sqrt(std::max((double) q[i] / (src.cols * src.rows) - mean.val[i] * mean.val[i] , 0.));
+        mean[i] *= total;
+        stddev[i] = std::sqrt(std::max(stddev[i] * total - mean.val[i] * mean.val[i] , 0.));
     }
 }
 
@@ -455,139 +469,120 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
 //////////////////////////////////// minMax  /////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-static void arithmetic_minMax_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen , int groupnum, string kernelName)
+template <typename T, typename WT>
+static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem &dst, int groupnum, string kernelName)
 {
-    vector<pair<size_t , const void *> > args;
-    int all_cols = src.step / (vlen * src.elemSize1());
-    int pre_cols = (src.offset % src.step) / (vlen * src.elemSize1());
-    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / (vlen * src.elemSize1()) - 1;
+    int all_cols = src.step / src.elemSize();
+    int pre_cols = (src.offset % src.step) / src.elemSize();
+    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
     int invalid_cols = pre_cols + sec_cols;
-    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
-    int offset = src.offset / (vlen * src.elemSize1());
-    int repeat_s = src.offset / src.elemSize1() - offset * vlen;
-    int repeat_e = (offset + cols) * vlen - src.offset / src.elemSize1() - src.cols * src.oclchannels();
-    char build_options[50];
-    sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d", src.depth(), repeat_s, repeat_e);
+    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
+    int offset = src.offset / src.elemSize();
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char * const channelMap[] = { " ", " ", "2", "4", "4" };
+
+    ostringstream stream;
+    stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()];
+    stream << " -D MAX_VAL=" << (WT)numeric_limits<T>::max();
+    stream << " -D MIN_VAL=" << (WT)numeric_limits<T>::min();
+    string buildOptions = stream.str();
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+
+    int minvalid_cols = 0, moffset = 0;
     if (!mask.empty())
     {
-        int mall_cols = mask.step / (vlen * mask.elemSize1());
-        int mpre_cols = (mask.offset % mask.step) / (vlen * mask.elemSize1());
-        int msec_cols = mall_cols - (mask.offset % mask.step + mask.cols * mask.elemSize() - 1) / (vlen * mask.elemSize1()) - 1;
-        int minvalid_cols = mpre_cols + msec_cols;
-        int moffset = mask.offset / (vlen * mask.elemSize1());
+        int mall_cols = mask.step / mask.elemSize();
+        int mpre_cols = (mask.offset % mask.step) / mask.elemSize();
+        int msec_cols = mall_cols - (mask.offset % mask.step + mask.cols * mask.elemSize() - 1) / mask.elemSize() - 1;
+        minvalid_cols = mpre_cols + msec_cols;
+        moffset = mask.offset / mask.elemSize();
 
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
         args.push_back( make_pair( sizeof(cl_int) , (void *)&minvalid_cols ));
         args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset ));
-        args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
+
+        kernelName += "_mask";
     }
-    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
-    size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-    openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, gt, lt, args, -1, -1, build_options);
+
+    size_t globalThreads[3] = {groupnum * 256, 1, 1};
+    size_t localThreads[3] = {256, 1, 1};
+
+    openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
-
-static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen, int groupnum, string kernelName)
-{
-    vector<pair<size_t , const void *> > args;
-    size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-    char build_options[50];
-    if (src.oclchannels() == 1)
-    {
-        int cols = (src.cols - 1) / vlen + 1;
-        int invalid_cols = src.step / (vlen * src.elemSize1()) - cols;
-        int offset = src.offset / src.elemSize1();
-        int repeat_me = vlen - (mask.cols % vlen == 0 ? vlen : mask.cols % vlen);
-        int minvalid_cols = mask.step / (vlen * mask.elemSize1()) - cols;
-        int moffset = mask.offset / mask.elemSize1();
-        int elemnum = cols * src.rows;
-        sprintf(build_options, "-D DEPTH_%d -D REPEAT_E%d", src.depth(), repeat_me);
-        args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
-        args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
-        args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
-        args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
-        args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
-        args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
-        args.push_back( make_pair( sizeof(cl_int) , (void *)&minvalid_cols ));
-        args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset ));
-        args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
-        args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
-        openCLExecuteKernel(src.clCxt, &arithm_minMax_mask, kernelName, gt, lt, args, -1, -1, build_options);
-    }
-}
-
-template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
+template <typename T, typename WT>
+void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
                                              const oclMat &mask, oclMat &buf)
 {
     size_t groupnum = src.clCxt->computeUnits();
     CV_Assert(groupnum != 0);
-    groupnum = groupnum * 2;
-    int vlen = 8;
-    int dbsize = groupnum * 2 * vlen * sizeof(T) ;
 
+    int dbsize = groupnum * 2 * src.elemSize();
     ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
 
     cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
-
-    if (mask.empty())
-    {
-        arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax");
-    }
-    else
-    {
-        arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask");
-    }
+    arithmetic_minMax_run<T, WT>(src, mask, buf_data, groupnum, "arithm_op_minMax");
 
     Mat matbuf = Mat(buf);
     T *p = matbuf.ptr<T>();
     if (minVal != NULL)
     {
         *minVal = std::numeric_limits<double>::max();
-        for (int i = 0; i < vlen * (int)groupnum; i++)
-        {
+        for (int i = 0, end = src.oclchannels() * (int)groupnum; i < end; i++)
             *minVal = *minVal < p[i] ? *minVal : p[i];
-        }
     }
     if (maxVal != NULL)
     {
         *maxVal = -std::numeric_limits<double>::max();
-        for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
-        {
+        for (int i = src.oclchannels() * (int)groupnum, end = i << 1; i < end; i++)
             *maxVal = *maxVal > p[i] ? *maxVal : p[i];
-        }
     }
 }
 
-typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
 void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
 {
     oclMat buf;
     minMax_buf(src, minVal, maxVal, mask, buf);
 }
 
+typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
+
 void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
 {
-    CV_Assert(src.oclchannels() == 1);
+    CV_Assert(src.channels() == 1);
+    CV_Assert(src.size() == mask.size() || mask.empty());
+    CV_Assert(src.step % src.elemSize() == 0);
+
+    if (minVal == NULL && maxVal == NULL)
+        return;
+
     if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
+
     static minMaxFunc functab[8] =
     {
-        arithmetic_minMax<uchar>,
-        arithmetic_minMax<char>,
-        arithmetic_minMax<ushort>,
-        arithmetic_minMax<short>,
-        arithmetic_minMax<int>,
-        arithmetic_minMax<float>,
-        arithmetic_minMax<double>,
+        arithmetic_minMax<uchar, int>,
+        arithmetic_minMax<char, int>,
+        arithmetic_minMax<ushort, int>,
+        arithmetic_minMax<short, int>,
+        arithmetic_minMax<int, int>,
+        arithmetic_minMax<float, float>,
+        arithmetic_minMax<double, double>,
         0
     };
+
     minMaxFunc func;
     func = functab[src.depth()];
     func(src, minVal, maxVal, mask, buf);
@@ -599,57 +594,102 @@ void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, cons
 
 double cv::ocl::norm(const oclMat &src1, int normType)
 {
-    return norm(src1, oclMat(src1.size(), src1.type(), Scalar::all(0)), normType);
+    CV_Assert((normType & NORM_RELATIVE) == 0);
+    return norm(src1, oclMat(), normType);
+}
+
+static void arithm_absdiff_nonsaturate_run(const oclMat & src1, const oclMat & src2, oclMat & diff)
+{
+    CV_Assert(src1.step % src1.elemSize() == 0 && (src2.empty() || src2.step % src2.elemSize() == 0));
+    Context *clCxt = src1.clCxt;
+
+    int ddepth = CV_64F;
+    diff.create(src1.size(), CV_MAKE_TYPE(ddepth, src1.channels()));
+
+    int oclChannels = src1.oclchannels(), sdepth = src1.depth();
+    int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
+    int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
+    int diffstep1 = diff.step / diff.elemSize(), diffoffset1 = diff.offset / diff.elemSize();
+
+    string kernelName = "arithm_absdiff_nonsaturate";
+    size_t localThreads[3]  = { 16, 16, 1 };
+    size_t globalThreads[3] = { diff.cols, diff.rows, 1 };
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char * const channelMap[] = { "", "", "2", "4", "4" };
+
+    std::string buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s",
+                                      typeMap[sdepth], channelMap[oclChannels],
+                                      typeMap[ddepth], channelMap[oclChannels],
+                                      typeMap[ddepth], channelMap[oclChannels]);
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1 ));
+
+    if (!src2.empty())
+    {
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 ));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 ));
+
+        kernelName += "_binary";
+    }
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&diff.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&diffstep1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&diffoffset1 ));
+
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
+
+    openCLExecuteKernel(clCxt, &arithm_absdiff_nonsaturate,
+                        kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
 double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
 {
+    CV_Assert(!src1.empty());
+    CV_Assert(src2.empty() || (src1.type() == src2.type() && src1.size() == src2.size()));
+
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    {
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
+    }
+
     bool isRelative = (normType & NORM_RELATIVE) != 0;
-    normType &= 7;
-    CV_Assert(src1.depth() <= CV_32S && src1.type() == src2.type() && ( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2));
-    int channels = src1.oclchannels(), i = 0, *p;
+    normType &= NORM_TYPE_MASK;
+    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
+
+    Scalar s;
+    int cn = src1.channels();
     double r = 0;
-    oclMat gm1(src1.size(), src1.type());
-    int min_int = (normType == NORM_INF ? CL_INT_MIN : 0);
-    Mat m(1, 1, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(min_int));
-    oclMat gm2(m), emptyMat;
-    switch(normType)
+    oclMat diff;
+    arithm_absdiff_nonsaturate_run(src1, src2, diff);
+
+    switch (normType)
     {
     case NORM_INF:
-        //  arithmetic_run(src1, src2, gm1, "arithm_op_absdiff");
-        //arithmetic_minMax_run(gm1,emptyMat, gm2,"arithm_op_max");
-        m = (gm2);
-        p = (int *)m.data;
-        r = -std::numeric_limits<double>::max();
-        for (i = 0; i < channels; i++)
-        {
-            r = std::max(r, (double)p[i]);
-        }
+        diff = diff.reshape(1);
+        minMax(diff, NULL, &r);
         break;
     case NORM_L1:
-        //arithmetic_run(src1, src2, gm1, "arithm_op_absdiff");
-        //arithmetic_sum_run(gm1, gm2,"arithm_op_sum");
-        m = (gm2);
-        p = (int *)m.data;
-        for (i = 0; i < channels; i++)
-        {
-            r = r + (double)p[i];
-        }
+        s = sum(diff);
+        for (int i = 0; i < cn; ++i)
+            r += s[i];
         break;
     case NORM_L2:
-        //arithmetic_run(src1, src2, gm1, "arithm_op_absdiff");
-        //arithmetic_sum_run(gm1, gm2,"arithm_op_squares_sum");
-        m = (gm2);
-        p = (int *)m.data;
-        for (i = 0; i < channels; i++)
-        {
-            r = r + (double)p[i];
-        }
+        s = sqrSum(diff);
+        for (int i = 0; i < cn; ++i)
+            r += s[i];
         r = std::sqrt(r);
         break;
     }
     if (isRelative)
         r = r / norm(src2, normType);
+
     return r;
 }
 
@@ -923,47 +963,38 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat
         return;
     }
 
-    CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && src1.rows == src2.rows && src2.rows == dst.rows);
-    CV_Assert(src1.type() == src2.type() && src1.type() == dst.type());
-
     Context  *clCxt = src1.clCxt;
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    size_t vector_length = 1;
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
+    int depth = dst.depth(), cols1 = src1.cols * src1.oclchannels();
+    int src1step1 = src1.step / src1.elemSize1(), src1offset1 = src1.offset / src1.elemSize1();
+    int src2step1 = src2.step / src2.elemSize1(), src2offset1 = src2.offset / src2.elemSize1();
+    int dststep1 = dst.step / dst.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1();
 
     size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { cols, dst.rows, 1 };
+    size_t globalThreads[3] = { cols1, dst.rows, 1 };
 
-    int dst_step1 = dst.cols * dst.elemSize();
     vector<pair<size_t , const void *> > args;
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1 ));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 ));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dststep1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
 }
 
-void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angleInDegrees)
+void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleInDegrees)
 {
     CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
+    CV_Assert(x.step % x.elemSize() == 0 && y.step % y.elemSize() == 0);
+
     Angle.create(x.size(), x.type());
-    string kernelName = angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians";
-    if (angleInDegrees)
-        arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase);
-    else
-        arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase);
+    arithmetic_phase_run(x, y, Angle, angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians", &arithm_phase);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1228,21 +1259,22 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
 ///////////////////////////// countNonZero ///////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum, string kernelName)
+static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int groupnum, string kernelName)
 {
-    vector<pair<size_t , const void *> > args;
-    int all_cols = src.step / (vlen * src.elemSize1());
-    int pre_cols = (src.offset % src.step) / (vlen * src.elemSize1());
-    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / (vlen * src.elemSize1()) - 1;
+    int ochannels = src.oclchannels();
+    int all_cols = src.step / src.elemSize();
+    int pre_cols = (src.offset % src.step) / src.elemSize();
+    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
     int invalid_cols = pre_cols + sec_cols;
     int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
-    int offset = src.offset / (vlen * src.elemSize1());
-    int repeat_s = src.offset / src.elemSize1() - offset * vlen;
-    int repeat_e = (offset + cols) * vlen - src.offset / src.elemSize1() - src.cols * src.oclchannels();
+    int offset = src.offset / src.elemSize();
 
-    char build_options[50];
-    sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d", src.depth(), repeat_s, repeat_e);
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char * const channelMap[] = { " ", " ", "2", "4", "4" };
+    string buildOptions = format("-D srcT=%s%s -D dstT=int%s", typeMap[src.depth()], channelMap[ochannels],
+                                 channelMap[ochannels]);
 
+    vector<pair<size_t , const void *> > args;
     args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
@@ -1250,33 +1282,44 @@ static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen
     args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
-    size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-    openCLExecuteKernel(src.clCxt, &arithm_nonzero, kernelName, gt, lt, args, -1, -1, build_options);
+
+    size_t globalThreads[3] = { groupnum * 256, 1, 1 };
+    size_t localThreads[3] = { 256, 1, 1 };
+
+    openCLExecuteKernel(src.clCxt, &arithm_nonzero, kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
 int cv::ocl::countNonZero(const oclMat &src)
 {
-    size_t groupnum = src.clCxt->computeUnits();
+    CV_Assert(src.step % src.elemSize() == 0);
+    CV_Assert(src.channels() == 1);
+
+    Context *clCxt = src.clCxt;
     if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
         CV_Error(CV_GpuNotSupported, "selected device doesn't support double");
     }
+
+    size_t groupnum = src.clCxt->computeUnits();
     CV_Assert(groupnum != 0);
-    int vlen = 8 , dbsize = groupnum * vlen;
-    Context *clCxt = src.clCxt;
+    int dbsize = groupnum;
+
     string kernelName = "arithm_op_nonzero";
 
     AutoBuffer<int> _buf(dbsize);
     int *p = (int*)_buf, nonzero = 0;
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(int));
-    arithmetic_countNonZero_run(src, dstBuffer, vlen, groupnum, kernelName);
-
     memset(p, 0, dbsize * sizeof(int));
+
+    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(int));
+    arithmetic_countNonZero_run(src, dstBuffer, groupnum, kernelName);
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(int));
+
     for (int i = 0; i < dbsize; i++)
         nonzero += p[i];
 
     openCLSafeCall(clReleaseMemObject(dstBuffer));
+
     return nonzero;
 }
 
@@ -1522,8 +1565,8 @@ oclMatExpr::operator oclMat() const
 /////////////////////////////// transpose ////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-#define TILE_DIM      (32)
-#define BLOCK_ROWS    (256/TILE_DIM)
+#define TILE_DIM   (32)
+#define BLOCK_ROWS (256 / TILE_DIM)
 
 static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false)
 {
@@ -1702,63 +1745,35 @@ void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
 /////////////////////////////// setIdentity //////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 
-void cv::ocl::setIdentity(oclMat& src, double scalar)
+void cv::ocl::setIdentity(oclMat& src, const Scalar & scalar)
 {
-    CV_Assert(src.empty() == false && src.rows == src.cols);
-    CV_Assert(src.type() == CV_32SC1 || src.type() == CV_32FC1);
-    int src_step = src.step/src.elemSize();
     Context  *clCxt = Context::getContext();
-    size_t local_threads[] = {16, 16, 1};
-    size_t global_threads[] = {src.cols, src.rows, 1};
-
-    string kernelName = "setIdentityKernel";
-    if (src.type() == CV_32FC1)
-        kernelName += "_F1";
-    else if (src.type() == CV_32SC1)
-        kernelName += "_I1";
-    else
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        kernelName += "_D1";
-        if (!(clCxt->supportsFeature(Context::CL_DOUBLE)))
-        {
-            oclMat temp;
-            src.convertTo(temp, CV_32FC1);
-            temp.copyTo(src);
-        }
-
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
+        return;
     }
 
+    CV_Assert(src.step % src.elemSize() == 0);
+
+    int src_step1 = src.step / src.elemSize(), src_offset1 = src.offset / src.elemSize();
+    size_t local_threads[] = { 16, 16, 1 };
+    size_t global_threads[] = { src.cols, src.rows, 1 };
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char * const channelMap[] = { "", "", "2", "4", "4" };
+    string buildOptions = format("-D T=%s%s", typeMap[src.depth()], channelMap[src.oclchannels()]);
+
     vector<pair<size_t , const void *> > args;
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src_step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset1 ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src_step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
 
-    int scalar_i = 0;
-    float scalar_f = 0.0f;
-    if (clCxt->supportsFeature(Context::CL_DOUBLE))
-    {
-        if (src.type() == CV_32SC1)
-        {
-            scalar_i = (int)scalar;
-            args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i));
-        }
-        else
-            args.push_back(make_pair(sizeof(cl_double), (void*)&scalar));
-    }
-    else
-    {
-        if (src.type() == CV_32SC1)
-        {
-            scalar_i = (int)scalar;
-            args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i));
-        }
-        else
-        {
-            scalar_f = (float)scalar;
-            args.push_back(make_pair(sizeof(cl_float), (void*)&scalar_f));
-        }
-    }
+    oclMat sc(1, 1, src.type(), scalar);
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&sc.data ));
 
-    openCLExecuteKernel(clCxt, &arithm_setidentity, kernelName, global_threads, local_threads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &arithm_setidentity, "setIdentity", global_threads, local_threads,
+                        args, -1, -1, buildOptions.c_str());
 }
diff --git a/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl b/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl
new file mode 100644
index 0000000000..e5d8271394
--- /dev/null
+++ b/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl
@@ -0,0 +1,93 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+__kernel void arithm_absdiff_nonsaturate_binary(__global srcT *src1, int src1_step, int src1_offset,
+                         __global srcT *src2, int src2_step, int src2_offset,
+                         __global dstT *dst, int dst_step, int dst_offset,
+                         int cols, int rows)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int src1_index = mad24(y, src1_step, x + src1_offset);
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);
+
+        dstT t0 = convertToDstT(src1[src1_index]);
+        dstT t1 = convertToDstT(src2[src2_index]);
+        dstT t2 = t0 - t1;
+
+        dst[dst_index] = t2 >= 0 ? t2 : -t2;
+    }
+}
+
+__kernel void arithm_absdiff_nonsaturate(__global srcT *src1, int src1_step, int src1_offset,
+                         __global dstT *dst, int dst_step, int dst_offset,
+                         int cols, int rows)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int src1_index = mad24(y, src1_step, x + src1_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);
+
+        dstT t0 = convertToDstT(src1[src1_index]);
+
+        dst[dst_index] = t0 >= 0 ? t0 : -t0;
+    }
+}
diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl
index 23b2933066..c5d3ec2abd 100644
--- a/modules/ocl/src/opencl/arithm_minMax.cl
+++ b/modules/ocl/src/opencl/arithm_minMax.cl
@@ -53,169 +53,117 @@
 #endif
 #endif
 
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar8
-#define CONVERT_TYPE convert_uchar8
-#define MIN_VAL 0
-#define MAX_VAL 255
-#endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char8
-#define CONVERT_TYPE convert_char8
-#define MIN_VAL -128
-#define MAX_VAL 127
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort8
-#define CONVERT_TYPE convert_ushort8
-#define MIN_VAL 0
-#define MAX_VAL 65535
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short8
-#define CONVERT_TYPE convert_short8
-#define MIN_VAL -32768
-#define MAX_VAL 32767
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int8
-#define CONVERT_TYPE convert_int8
-#define MIN_VAL INT_MIN
-#define MAX_VAL INT_MAX
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float8
-#define CONVERT_TYPE convert_float8
-#define MIN_VAL (-FLT_MAX)
-#define MAX_VAL FLT_MAX
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double8
-#define CONVERT_TYPE convert_double8
-#define MIN_VAL (-DBL_MAX)
-#define MAX_VAL DBL_MAX
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a) a.s0 = a.s1;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a) a.s0 = a.s2;a.s1 = a.s2;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a) a.s0 = a.s3;a.s1 = a.s3;a.s2 = a.s3;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a) a.s0 = a.s4;a.s1 = a.s4;a.s2 = a.s4;a.s3 = a.s4;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a) a.s0 = a.s5;a.s1 = a.s5;a.s2 = a.s5;a.s3 = a.s5;a.s4 = a.s5;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a) a.s0 = a.s6;a.s1 = a.s6;a.s2 = a.s6;a.s3 = a.s6;a.s4 = a.s6;a.s5 = a.s6;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a) a.s0 = a.s7;a.s1 = a.s7;a.s2 = a.s7;a.s3 = a.s7;a.s4 = a.s7;a.s5 = a.s7;a.s6 = a.s7;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_e(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s7 = a.s6;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s7 = a.s5;a.s6 = a.s5;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s7 = a.s4;a.s6 = a.s4;a.s5 = a.s4;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a) a.s7 = a.s3;a.s6 = a.s3;a.s5 = a.s3;a.s4 = a.s3;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a) a.s7 = a.s2;a.s6 = a.s2;a.s5 = a.s2;a.s4 = a.s2;a.s3 = a.s2;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a) a.s7 = a.s1;a.s6 = a.s1;a.s5 = a.s1;a.s4 = a.s1;a.s3 = a.s1;a.s2 = a.s1;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a) a.s7 = a.s0;a.s6 = a.s0;a.s5 = a.s0;a.s4 = a.s0;a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
-#endif
-
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
 
 /**************************************Array minMax**************************************/
-__kernel void arithm_op_minMax (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                  __global VEC_TYPE *src, __global VEC_TYPE *dst)
+
+__kernel void arithm_op_minMax(__global const T * src, __global T * dst,
+    int cols, int invalid_cols, int offset, int elemnum, int groupnum)
 {
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
+   unsigned int id = get_global_id(0);
+
    unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local VEC_TYPE localmem_max[128],localmem_min[128];
-   VEC_TYPE minval,maxval,temp;
-   if(id < elemnum)
-   {
-       temp = src[idx];
-       if(id % cols == 0 )
-       {
-           repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_e(temp);
-       }
-       minval = temp;
-       maxval = temp;
-   }
-   else
-   {
-       minval = MAX_VAL;
-       maxval = MIN_VAL;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
+
+   __local T localmem_max[128], localmem_min[128];
+   T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
+
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
    {
        idx = offset + id + (id / cols) * invalid_cols;
        temp = src[idx];
-       if(id % cols == 0 )
-       {
-               repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-               repeat_e(temp);
-       }
-       minval = min(minval,temp);
-       maxval = max(maxval,temp);
+       minval = min(minval, temp);
+       maxval = max(maxval, temp);
    }
+
    if(lid > 127)
    {
        localmem_min[lid - 128] = minval;
        localmem_max[lid - 128] = maxval;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
+
    if(lid < 128)
    {
-       localmem_min[lid] = min(minval,localmem_min[lid]);
-       localmem_max[lid] = max(maxval,localmem_max[lid]);
+       localmem_min[lid] = min(minval, localmem_min[lid]);
+       localmem_max[lid] = max(maxval, localmem_max[lid]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
    {
-       if(lid < lsize)
+       if (lid < lsize)
        {
            int lid2 = lsize + lid;
-           localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
-           localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
+           localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
+           localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-   if( lid == 0)
+
+   if (lid == 0)
+   {
+       dst[gid] = localmem_min[0];
+       dst[gid + groupnum] = localmem_max[0];
+   }
+}
+
+__kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
+    int cols, int invalid_cols, int offset,
+    int elemnum, int groupnum,
+    const __global uchar * mask, int minvalid_cols, int moffset)
+{
+   unsigned int lid = get_local_id(0);
+   unsigned int gid = get_group_id(0);
+   unsigned int id = get_global_id(0);
+
+   unsigned int idx = offset + id + (id / cols) * invalid_cols;
+   unsigned int midx = moffset + id + (id / cols) * minvalid_cols;
+
+   __local T localmem_max[128], localmem_min[128];
+   T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
+
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
+   {
+       idx = offset + id + (id / cols) * invalid_cols;
+       midx = moffset + id + (id / cols) * minvalid_cols;
+
+       if (mask[midx])
+       {
+           temp = src[idx];
+           minval = min(minval, temp);
+           maxval = max(maxval, temp);
+       }
+   }
+
+   if(lid > 127)
+   {
+       localmem_min[lid - 128] = minval;
+       localmem_max[lid - 128] = maxval;
+   }
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   if(lid < 128)
+   {
+       localmem_min[lid] = min(minval, localmem_min[lid]);
+       localmem_max[lid] = max(maxval, localmem_max[lid]);
+   }
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
+   {
+       if (lid < lsize)
+       {
+           int lid2 = lsize + lid;
+           localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
+           localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+   }
+
+   if (lid == 0)
    {
        dst[gid] = localmem_min[0];
        dst[gid + groupnum] = localmem_max[0];
diff --git a/modules/ocl/src/opencl/arithm_nonzero.cl b/modules/ocl/src/opencl/arithm_nonzero.cl
index e34207de36..921367b3df 100644
--- a/modules/ocl/src/opencl/arithm_nonzero.cl
+++ b/modules/ocl/src/opencl/arithm_nonzero.cl
@@ -41,151 +41,53 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
-///
 
-/**************************************PUBLICFUNC*************************************/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
-
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar8
 #endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char8
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort8
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short8
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int8
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float8
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double8
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a) a.s0 = 0;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_e(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s7 = 0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
-#endif
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
 
 /**************************************Count NonZero**************************************/
-__kernel void arithm_op_nonzero (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                  __global VEC_TYPE *src, __global int8 *dst)
+
+__kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
+                                  __global srcT *src, __global dstT *dst)
 {
-   unsigned int lid = get_local_id(0);
-   unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
-   unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local int8 localmem_nonzero[128];
-   int8 nonzero;
-   VEC_TYPE zero=0,one=1,temp;
-   if(id < elemnum)
-   {
-       temp = src[idx];
-       if(id % cols == 0 )
-       {
-           repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_e(temp);
-       }
-       nonzero = convert_int8(temp == zero ? zero:one);
-   }
-   else
-   {
-       nonzero = 0;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
-   {
-       idx = offset + id + (id / cols) * invalid_cols;
-       temp = src[idx];
-       if(id % cols == 0 )
-       {
-               repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-               repeat_e(temp);
-       }
-       nonzero = nonzero + convert_int8(temp == zero ? zero:one);
-   }
-   if(lid > 127)
-   {
-       localmem_nonzero[lid - 128] = nonzero;
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
-       localmem_nonzero[lid] = nonzero + localmem_nonzero[lid];
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
-   {
-       if(lid < lsize)
-       {
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    unsigned int  id = get_global_id(0);
+
+    unsigned int idx = offset + id + (id / cols) * invalid_cols;
+    __local dstT localmem_nonzero[128];
+    dstT nonzero = (dstT)(0);
+    srcT zero = (srcT)(0), one = (srcT)(1);
+
+    for (int grain = groupnum << 8; id < elemnum; id += grain)
+    {
+        idx = offset + id + (id / cols) * invalid_cols;
+        nonzero += src[idx] == zero ? zero : one;
+    }
+
+    if (lid > 127)
+        localmem_nonzero[lid - 128] = nonzero;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid < 128)
+        localmem_nonzero[lid] = nonzero + localmem_nonzero[lid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int lsize = 64; lsize > 0; lsize >>= 1)
+    {
+        if (lid < lsize)
+        {
            int lid2 = lsize + lid;
            localmem_nonzero[lid] = localmem_nonzero[lid] + localmem_nonzero[lid2];
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-   }
-   if( lid == 0)
-   {
-       dst[gid] = localmem_nonzero[0];
-   }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (lid == 0)
+        dst[gid] = localmem_nonzero[0];
 }
diff --git a/modules/ocl/src/opencl/arithm_phase.cl b/modules/ocl/src/opencl/arithm_phase.cl
index 9dda5e957a..a30eba4310 100644
--- a/modules/ocl/src/opencl/arithm_phase.cl
+++ b/modules/ocl/src/opencl/arithm_phase.cl
@@ -45,110 +45,125 @@
 //
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
-#define CV_PI 3.1415926535898
-/**************************************phase inradians**************************************/
-__kernel void arithm_phase_inradians_D5 (__global float *src1, int src1_step, int src1_offset,
-                                         __global float *src2, int src2_step, int src2_offset,
-                                         __global float *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
-{
 
+#define CV_PI 3.1415926535898
+#define CV_2PI 2*3.1415926535898
+
+/**************************************phase inradians**************************************/
+
+__kernel void arithm_phase_inradians_D5(__global float *src1, int src1_step1, int src1_offset1,
+                                         __global float *src2, int src2_step1, int src2_offset1,
+                                         __global float *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows)
+{
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < cols && y < rows)
+    if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1);
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
 
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = atan2(data2,data1);
+        float data1 = src1[src1_index];
+        float data2 = src2[src2_index];
+        float tmp = atan2(data2, data1);
 
-        *((__global float *)((__global char *)dst + dst_index)) = tmp;
+        if (tmp < 0)
+            tmp += CV_2PI;
+
+        dst[dst_index] = tmp;
     }
-
 }
 
 
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_phase_inradians_D6 (__global double *src1, int src1_step, int src1_offset,
-                                         __global double *src2, int src2_step, int src2_offset,
-                                         __global double *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
+__kernel void arithm_phase_inradians_D6(__global double *src1, int src1_step1, int src1_offset1,
+                                         __global double *src2, int src2_step1, int src2_offset1,
+                                         __global double *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows)
 {
-
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < cols && y < rows)
+    if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1);
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
 
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
+        double data1 = src1[src1_index];
+        double data2 = src2[src2_index];
+        double tmp = atan2(data2, data1);
 
-        *((__global double *)((__global char *)dst + dst_index)) = atan2(data2,data1);
+        if (tmp < 0)
+            tmp += CV_2PI;
+
+        dst[dst_index] = tmp;
     }
-
 }
+
 #endif
 
 /**************************************phase indegrees**************************************/
-__kernel void arithm_phase_indegrees_D5 (__global float *src1, int src1_step, int src1_offset,
-                                         __global float *src2, int src2_step, int src2_offset,
-                                         __global float *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
-{
 
+__kernel void arithm_phase_indegrees_D5(__global float *src1, int src1_step1, int src1_offset1,
+                                         __global float *src2, int src2_step1, int src2_offset1,
+                                         __global float *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows)
+{
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < cols && y < rows)
+    if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1);
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
 
-        float data1 = *((__global float *)((__global char *)src1 + src1_index));
-        float data2 = *((__global float *)((__global char *)src2 + src2_index));
-        float tmp = atan2(data2,data1);
-        float tmp_data = 180*tmp/CV_PI;
+        float data1 = src1[src1_index];
+        float data2 = src2[src2_index];
+        float tmp = atan2(data2, data1);
+        tmp = 180 * tmp / CV_PI;
 
-        *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
+        if (tmp < 0)
+            tmp += 360;
+
+        dst[dst_index] = tmp;
     }
-
 }
 
 
 #if defined (DOUBLE_SUPPORT)
-__kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step, int src1_offset,
-                                         __global double *src2, int src2_step, int src2_offset,
-                                         __global double *dst,  int dst_step,  int dst_offset,
-                                         int rows, int cols, int dst_step1)
+__kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step1, int src1_offset1,
+                                         __global double *src2, int src2_step1, int src2_offset1,
+                                         __global double *dst,  int dst_step1,  int dst_offset1,
+                                         int cols, int rows)
 {
-
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < cols && y < rows)
+    if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 3) + dst_offset);
+        int src1_index = mad24(y, src1_step1, x + src1_offset1);
+        int src2_index = mad24(y, src2_step1, x + src2_offset1);
+        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
 
-        double data1 = *((__global double *)((__global char *)src1 + src1_index));
-        double data2 = *((__global double *)((__global char *)src2 + src2_index));
-        double tmp = atan2(data2,data1);
-        double tmp_data = 180*tmp/CV_PI;
+        double data1 = src1[src1_index];
+        double data2 = src2[src2_index];
+        double tmp = atan2(src2[src2_index], src1[src1_index]);
 
-        *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
+        tmp = 180 * tmp / CV_PI;
+        if (tmp < 0)
+            tmp += 360;
+
+        dst[dst_index] = tmp;
     }
-
 }
 #endif
diff --git a/modules/ocl/src/opencl/arithm_setidentity.cl b/modules/ocl/src/opencl/arithm_setidentity.cl
index 0604ae81dd..921026b40d 100644
--- a/modules/ocl/src/opencl/arithm_setidentity.cl
+++ b/modules/ocl/src/opencl/arithm_setidentity.cl
@@ -42,6 +42,7 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #if defined (DOUBLE_SUPPORT)
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
@@ -50,51 +51,19 @@
 #endif
 #endif
 
-
-#if defined (DOUBLE_SUPPORT)
-#define DATA_TYPE double
-#else
-#define DATA_TYPE float
-#endif
-
-__kernel void setIdentityKernel_F1(__global float* src, int src_row, int src_col, int src_step, DATA_TYPE scalar)
+__kernel void setIdentity(__global T * src, int src_step, int src_offset,
+    int cols, int rows, __global const T * scalar)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < src_col && y < src_row)
+    if (x < cols && y < rows)
     {
-        if(x == y)
-            src[y * src_step + x] = scalar;
+        int src_index = mad24(y, src_step, src_offset + x);
+
+        if (x == y)
+            src[src_index] = *scalar;
         else
-            src[y * src_step + x] = 0 * scalar;
-    }
-}
-
-__kernel void setIdentityKernel_D1(__global DATA_TYPE* src, int src_row, int src_col, int src_step, DATA_TYPE scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < src_col && y < src_row)
-    {
-        if(x == y)
-            src[y * src_step + x] = scalar;
-        else
-            src[y * src_step + x] = 0 * scalar;
-    }
-}
-
-__kernel void setIdentityKernel_I1(__global int* src, int src_row, int src_col, int src_step, int scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < src_col && y < src_row)
-    {
-        if(x == y)
-            src[y * src_step + x] = scalar;
-        else
-            src[y * src_step + x] = 0 * scalar;
+            src[src_index] = 0;
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_sum.cl b/modules/ocl/src/opencl/arithm_sum.cl
index 280b0a5111..4011f03bea 100644
--- a/modules/ocl/src/opencl/arithm_sum.cl
+++ b/modules/ocl/src/opencl/arithm_sum.cl
@@ -43,163 +43,62 @@
 //
 //M*/
 
-/**************************************PUBLICFUNC*************************************/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
-#define RES_TYPE double8
-#define CONVERT_RES_TYPE convert_double8
-#else
-#define RES_TYPE float8
-#define CONVERT_RES_TYPE convert_float8
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar8
+#if defined (FUNC_SUM)
+#define FUNC(a, b) b += a;
 #endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char8
+#if defined (FUNC_ABS_SUM)
+#define FUNC(a, b) b += a >= 0 ? a : -a;
 #endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort8
+#if defined (FUNC_SQR_SUM)
+#define FUNC(a, b) b += a * a;
 #endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short8
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int8
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float8
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double8
-#endif
-
-#if defined (FUNC_TYPE_0)
-#define FUNC(a,b) b += a;
-#endif
-#if defined (FUNC_TYPE_1)
-#define FUNC(a,b) b = b + (a >= 0 ? a : -a);
-#endif
-#if defined (FUNC_TYPE_2)
-#define FUNC(a,b) b = b + a * a;
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a) a.s0 = 0;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;a.s3 = 0;a.s4 = 0;a.s5 = 0;a.s6 = 0;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_e(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s7 = 0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
-#endif
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
 
 /**************************************Array buffer SUM**************************************/
-__kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                __global VEC_TYPE *src, __global RES_TYPE *dst)
+
+__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
+                                __global srcT *src, __global dstT *dst)
 {
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
+   unsigned int id = get_global_id(0);
    unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local RES_TYPE localmem_sum[128];
-   RES_TYPE sum = 0,temp;
-   if(id < elemnum)
-   {
-       temp = CONVERT_RES_TYPE(src[idx]);
-       if(id % cols == 0 )
-       {
-           repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_e(temp);
-       }
-       FUNC(temp,sum);
-   }
-   else
-   {
-       sum = 0;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
+
+   __local dstT localmem_sum[128];
+   dstT sum = (dstT)(0), temp;
+
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
    {
        idx = offset + id + (id / cols) * invalid_cols;
-       temp = CONVERT_RES_TYPE(src[idx]);
-       if(id % cols == 0 )
-       {
-               repeat_s(temp);
-       }
-       if(id % cols == cols - 1)
-       {
-               repeat_e(temp);
-       }
-       FUNC(temp,sum);
+       temp = convertToDstT(src[idx]);
+       FUNC(temp, sum);
    }
-   if(lid > 127)
-   {
+
+   if (lid > 127)
        localmem_sum[lid - 128] = sum;
-   }
    barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
+
+   if (lid < 128)
        localmem_sum[lid] = sum + localmem_sum[lid];
-   }
    barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
    {
-       if(lid < lsize)
+       if (lid < lsize)
        {
            int lid2 = lsize + lid;
            localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-   if( lid == 0)
-   {
+
+   if (lid == 0)
        dst[gid] = localmem_sum[0];
-   }
 }
diff --git a/modules/ocl/src/opencl/arithm_sum_3.cl b/modules/ocl/src/opencl/arithm_sum_3.cl
deleted file mode 100644
index 3f6ed08803..0000000000
--- a/modules/ocl/src/opencl/arithm_sum_3.cl
+++ /dev/null
@@ -1,247 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#define RES_TYPE double4
-#define CONVERT_RES_TYPE convert_double4
-#else
-#define RES_TYPE float4
-#define CONVERT_RES_TYPE convert_float4
-#endif
-
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar4
-#endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char4
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort4
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short4
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int4
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float4
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double4
-#endif
-
-#if defined (FUNC_TYPE_0)
-#define FUNC(a,b) b += a;
-#endif
-#if defined (FUNC_TYPE_1)
-#define FUNC(a,b) b = b + (a >= 0 ? a : -a);
-#endif
-#if defined (FUNC_TYPE_2)
-#define FUNC(a,b) b = b + a * a;
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a,b,c) a=a; b =b; c=c;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a,b,c) a.s0=0; b=b; c=c;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a,b,c) a.s0=0; a.s1=0; b=b; c=c;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a,b,c) a.s0=0; a.s1=0; a.s2=0; b=b; c=c;
-#endif
-#if defined (REPEAT_S4)
-#define repeat_s(a,b,c) a=0;b=b; c=c;
-#endif
-#if defined (REPEAT_S5)
-#define repeat_s(a,b,c) a=0; b.s0=0;c=c;
-#endif
-#if defined (REPEAT_S6)
-#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; c=c;
-#endif
-#if defined (REPEAT_S7)
-#define repeat_s(a,b,c) a=0; b.s0=0; b.s1=0; b.s2=0; c=c;
-#endif
-#if defined (REPEAT_S8)
-#define repeat_s(a,b,c) a=0; b=0; c=c;
-#endif
-#if defined (REPEAT_S9)
-#define repeat_s(a,b,c) a=0; b=0; c.s0=0;
-#endif
-#if defined (REPEAT_S10)
-#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0;
-#endif
-#if defined (REPEAT_S11)
-#define repeat_s(a,b,c) a=0; b=0; c.s0=0; c.s1=0; c.s2=0;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_e(a,b,c) a=a; b =b; c=c;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a,b,c) a=a; b=b; c.s3=0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a,b,c) a=a; b=b; c.s3=0; c.s2=0; c.s1=0;
-#endif
-#if defined (REPEAT_E4)
-#define repeat_e(a,b,c) a=a; b=b; c=0;
-#endif
-#if defined (REPEAT_E5)
-#define repeat_e(a,b,c) a=a; b.s3=0; c=0;
-#endif
-#if defined (REPEAT_E6)
-#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; c=0;
-#endif
-#if defined (REPEAT_E7)
-#define repeat_e(a,b,c) a=a; b.s3=0; b.s2=0; b.s1=0; c=0;
-#endif
-#if defined (REPEAT_E8)
-#define repeat_e(a,b,c) a=a; b=0; c=0;
-#endif
-#if defined (REPEAT_E9)
-#define repeat_e(a,b,c) a.s3=0; b=0; c=0;
-#endif
-#if defined (REPEAT_E10)
-#define repeat_e(a,b,c) a.s3=0; a.s2=0; b=0; c=0;
-#endif
-#if defined (REPEAT_E11)
-#define repeat_e(a,b,c) a.s3=0; a.s2=0; a.s1=0; b=0; c=0;
-#endif
-
-__kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                __global VEC_TYPE *src, __global RES_TYPE *dst)
-{
-   unsigned int lid = get_local_id(0);
-   unsigned int gid = get_group_id(0);
-   unsigned int id = get_global_id(0);
-   unsigned int idx = offset + id + (id  / cols) * invalid_cols;
-   idx = idx * 3;
-   __local RES_TYPE localmem_sum1[128];
-   __local RES_TYPE localmem_sum2[128];
-   __local RES_TYPE localmem_sum3[128];
-   RES_TYPE sum1 = 0,sum2 = 0,sum3 = 0,temp1,temp2,temp3;
-   if(id < elemnum)
-   {
-       temp1 = CONVERT_RES_TYPE(src[idx]);
-       temp2 = CONVERT_RES_TYPE(src[idx+1]);
-       temp3 = CONVERT_RES_TYPE(src[idx+2]);
-       if(id % cols == 0 )
-       {
-           repeat_s(temp1,temp2,temp3);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_e(temp1,temp2,temp3);
-       }
-       FUNC(temp1,sum1);
-       FUNC(temp2,sum2);
-       FUNC(temp3,sum3);
-   }
-   else
-   {
-       sum1 = 0;
-       sum2 = 0;
-       sum3 = 0;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
-   {
-       idx = offset + id + (id / cols) * invalid_cols;
-       idx = idx * 3;
-       temp1 = CONVERT_RES_TYPE(src[idx]);
-       temp2 = CONVERT_RES_TYPE(src[idx+1]);
-       temp3 = CONVERT_RES_TYPE(src[idx+2]);
-       if(id % cols == 0 )
-       {
-               repeat_s(temp1,temp2,temp3);
-       }
-       if(id % cols == cols - 1)
-       {
-               repeat_e(temp1,temp2,temp3);
-       }
-       FUNC(temp1,sum1);
-       FUNC(temp2,sum2);
-       FUNC(temp3,sum3);
-   }
-   if(lid > 127)
-   {
-       localmem_sum1[lid - 128] = sum1;
-       localmem_sum2[lid - 128] = sum2;
-       localmem_sum3[lid - 128] = sum3;
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
-       localmem_sum1[lid] = sum1 + localmem_sum1[lid];
-       localmem_sum2[lid] = sum2 + localmem_sum2[lid];
-       localmem_sum3[lid] = sum3 + localmem_sum3[lid];
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
-   {
-       if(lid < lsize)
-       {
-           int lid2 = lsize + lid;
-           localmem_sum1[lid] = localmem_sum1[lid] + localmem_sum1[lid2];
-           localmem_sum2[lid] = localmem_sum2[lid] + localmem_sum2[lid2];
-           localmem_sum3[lid] = localmem_sum3[lid] + localmem_sum3[lid2];
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-   }
-   if( lid == 0)
-   {
-       dst[gid*3]   = localmem_sum1[0];
-       dst[gid*3+1] = localmem_sum2[0];
-       dst[gid*3+2] = localmem_sum3[0];
-   }
-}
diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp
index 9b20dbf89c..db01d95036 100644
--- a/modules/ocl/test/test_arithm.cpp
+++ b/modules/ocl/test/test_arithm.cpp
@@ -220,8 +220,8 @@ PARAM_TEST_CASE(ArithmTestBase, int, int, bool)
 
         cv::RNG &rng = TS::ptr()->get_rng();
 
-        src1 = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
-        src2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, -15440, 14450, false);
+        src1 = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 2, 11, false);
+        src2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, -1540, 1740, false);
         dst1 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
         dst2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
         mask = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), CV_8UC1, 0, 2, false);
@@ -464,7 +464,6 @@ TEST_P(Mul, Scalar)
     }
 }
 
-
 TEST_P(Mul, Mat_Scalar)
 {
     for (int j = 0; j < LOOP_TIMES; j++)
@@ -507,7 +506,6 @@ TEST_P(Div, Scalar)
     }
 }
 
-
 TEST_P(Div, Mat_Scalar)
 {
     for (int j = 0; j < LOOP_TIMES; j++)
@@ -753,7 +751,7 @@ TEST_P(MinMax, MAT)
     }
 }
 
-TEST_P(MinMax, DISABLED_MASK)
+TEST_P(MinMax, MASK)
 {
     for (int j = 0; j < LOOP_TIMES; j++)
     {
@@ -1022,7 +1020,7 @@ TEST_P(MinMaxLoc, MASK)
 
 typedef ArithmTestBase Sum;
 
-TEST_P(Sum, DISABLED_MAT)
+TEST_P(Sum, MAT)
 {
     for (int j = 0; j < LOOP_TIMES; j++)
     {
@@ -1031,7 +1029,121 @@ TEST_P(Sum, DISABLED_MAT)
         Scalar cpures = cv::sum(src1_roi);
         Scalar gpures = cv::ocl::sum(gsrc1);
 
-        //check results
+        // check results
+        EXPECT_NEAR(cpures[0], gpures[0], 0.1);
+        EXPECT_NEAR(cpures[1], gpures[1], 0.1);
+        EXPECT_NEAR(cpures[2], gpures[2], 0.1);
+        EXPECT_NEAR(cpures[3], gpures[3], 0.1);
+    }
+}
+
+typedef ArithmTestBase SqrSum;
+
+template <typename T, typename WT>
+static Scalar sqrSum(const Mat & src)
+{
+    Scalar sum = Scalar::all(0);
+    int cn = src.channels();
+    WT data[4] = { 0, 0, 0, 0 };
+
+    int cols = src.cols * cn;
+    for (int y = 0; y < src.rows; ++y)
+    {
+        const T * const sdata = src.ptr<T>(y);
+        for (int x = 0; x < cols; )
+            for (int i = 0; i < cn; ++i, ++x)
+            {
+                WT t = static_cast<WT>(sdata[x]);
+                data[i] += t * t;
+            }
+    }
+
+    for (int i = 0; i < cn; ++i)
+        sum[i] = static_cast<double>(data[i]);
+
+    return sum;
+}
+
+typedef Scalar (*sumFunc)(const Mat &);
+
+TEST_P(SqrSum, MAT)
+{
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        static sumFunc funcs[] = { sqrSum<uchar, int>,
+                                 sqrSum<char, int>,
+                                 sqrSum<ushort, int>,
+                                 sqrSum<short, int>,
+                                 sqrSum<int, int>,
+                                 sqrSum<float, double>,
+                                 sqrSum<double, double>,
+                                 0 };
+
+        sumFunc func = funcs[src1_roi.depth()];
+        CV_Assert(func != 0);
+
+        Scalar cpures = func(src1_roi);
+        Scalar gpures = cv::ocl::sqrSum(gsrc1);
+
+        // check results
+        EXPECT_NEAR(cpures[0], gpures[0], 1.0);
+        EXPECT_NEAR(cpures[1], gpures[1], 1.0);
+        EXPECT_NEAR(cpures[2], gpures[2], 1.0);
+        EXPECT_NEAR(cpures[3], gpures[3], 1.0);
+    }
+}
+
+typedef ArithmTestBase AbsSum;
+
+template <typename T, typename WT>
+static Scalar absSum(const Mat & src)
+{
+    Scalar sum = Scalar::all(0);
+    int cn = src.channels();
+    WT data[4] = { 0, 0, 0, 0 };
+
+    int cols = src.cols * cn;
+    for (int y = 0; y < src.rows; ++y)
+    {
+        const T * const sdata = src.ptr<T>(y);
+        for (int x = 0; x < cols; )
+            for (int i = 0; i < cn; ++i, ++x)
+            {
+                WT t = static_cast<WT>(sdata[x]);
+                data[i] += t >= 0 ? t : -t;
+            }
+    }
+
+    for (int i = 0; i < cn; ++i)
+        sum[i] = static_cast<double>(data[i]);
+
+    return sum;
+}
+
+TEST_P(AbsSum, MAT)
+{
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        static sumFunc funcs[] = { absSum<uchar, int>,
+                                 absSum<char, int>,
+                                 absSum<ushort, int>,
+                                 absSum<short, int>,
+                                 absSum<int, int>,
+                                 absSum<float, double>,
+                                 absSum<double, double>,
+                                 0 };
+
+        sumFunc func = funcs[src1_roi.depth()];
+        CV_Assert(func != 0);
+
+        Scalar cpures = func(src1_roi);
+        Scalar gpures = cv::ocl::absSum(gsrc1);
+
+        // check results
         EXPECT_NEAR(cpures[0], gpures[0], 0.1);
         EXPECT_NEAR(cpures[1], gpures[1], 0.1);
         EXPECT_NEAR(cpures[2], gpures[2], 0.1);
@@ -1059,17 +1171,27 @@ TEST_P(CountNonZero, MAT)
 
 typedef ArithmTestBase Phase;
 
-TEST_P(Phase, DISABLED_Mat)
+TEST_P(Phase, angleInDegrees)
 {
-    for (int angelInDegrees = 0; angelInDegrees < 2; angelInDegrees++)
+    for (int j = 0; j < LOOP_TIMES; j++)
     {
-        for (int j = 0; j < LOOP_TIMES; j++)
-        {
-            random_roi();
-            cv::phase(src1_roi, src2_roi, dst1_roi, angelInDegrees ? true : false);
-            cv::ocl::phase(gsrc1, gsrc2, gdst1, angelInDegrees ? true : false);
-            Near(1e-2);
-        }
+        random_roi();
+        cv::phase(src1_roi, src2_roi, dst1_roi, true);
+        cv::ocl::phase(gsrc1, gsrc2, gdst1, true);
+
+        Near(1e-2);
+    }
+}
+
+TEST_P(Phase, angleInRadians)
+{
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+        cv::phase(src1_roi, src2_roi, dst1_roi);
+        cv::ocl::phase(gsrc1, gsrc2, gdst1);
+
+        Near(1e-2);
     }
 }
 
@@ -1301,32 +1423,136 @@ TEST_P(AddWeighted, Mat)
     }
 }
 
+//////////////////////////////// setIdentity /////////////////////////////////////////////////
+
+typedef ArithmTestBase SetIdentity;
+
+TEST_P(SetIdentity, Mat)
+{
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        cv::setIdentity(dst1_roi, val);
+        cv::ocl::setIdentity(gdst1, val);
+
+        Near(0);
+    }
+}
+
+//////////////////////////////// meanStdDev /////////////////////////////////////////////////
+
+typedef ArithmTestBase MeanStdDev;
+
+TEST_P(MeanStdDev, Mat)
+{
+    for (int j = 0; j < LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        Scalar cpu_mean, cpu_stddev;
+        Scalar gpu_mean, gpu_stddev;
+
+        cv::meanStdDev(src1_roi, cpu_mean, cpu_stddev);
+        cv::ocl::meanStdDev(gsrc1, gpu_mean, gpu_stddev);
+
+        for (int i = 0; i < 4; ++i)
+        {
+            EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 0.1);
+            EXPECT_NEAR(cpu_stddev[i], gpu_stddev[i], 0.1);
+        }
+    }
+}
+
+//////////////////////////////// Norm /////////////////////////////////////////////////
+
+typedef ArithmTestBase Norm;
+
+TEST_P(Norm, NORM_INF)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < LOOP_TIMES; j++)
+        {
+            random_roi();
+
+            int type = NORM_INF;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
+            const double gpuRes = cv::ocl::norm(gsrc1, gsrc2, type);
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
+TEST_P(Norm, NORM_L1)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < LOOP_TIMES; j++)
+        {
+            random_roi();
+
+            int type = NORM_L1;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
+            const double gpuRes = cv::ocl::norm(gsrc1, gsrc2, type);
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
+TEST_P(Norm, NORM_L2)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < LOOP_TIMES; j++)
+        {
+            random_roi();
+
+            int type = NORM_L2;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
+            const double gpuRes = cv::ocl::norm(gsrc1, gsrc2, type);
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
 //////////////////////////////////////// Instantiation /////////////////////////////////////////
 
-INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool(), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool(), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(testing::Values(CV_32F, CV_64F), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
 INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
 INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
 INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool()));
 INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool())); // +
-INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool())); // +
+INSTANTIATE_TEST_CASE_P(Arithm, SqrSum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, AbsSum, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(testing::Range(CV_8U, CV_USRTYPE1), Values(1), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32F, CV_64F), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, SetIdentity, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, MeanStdDev, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Norm, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
 
 #endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_norm.cpp b/modules/ocl/test/test_norm.cpp
deleted file mode 100644
index 2bd847068e..0000000000
--- a/modules/ocl/test/test_norm.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-typedef ::testing::TestWithParam<cv::Size> normFixture;
-
-TEST_P(normFixture, DISABLED_accuracy)
-{
-    const cv::Size srcSize = GetParam();
-
-    cv::Mat src1(srcSize, CV_8UC1), src2(srcSize, CV_8UC1);
-    cv::randu(src1, 0, 2);
-    cv::randu(src2, 0, 2);
-
-    cv::ocl::oclMat oclSrc1(src1), oclSrc2(src2);
-
-    double value = cv::norm(src1, src2, cv::NORM_INF);
-    double oclValue = cv::ocl::norm(oclSrc1, oclSrc2, cv::NORM_INF);
-
-    ASSERT_EQ(value, oclValue);
-}
-
-INSTANTIATE_TEST_CASE_P(oclNormTest, normFixture,
-                        ::testing::Values(cv::Size(500, 500), cv::Size(1000, 1000)));