diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake
index 7f258dc25f..4f7b679cbd 100644
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@@ -24,7 +24,8 @@ if(PYTHONINTERP_FOUND)
 
   if(NOT ANDROID AND NOT IOS)
     ocv_check_environment_variables(PYTHON_LIBRARY PYTHON_INCLUDE_DIR)
-    find_host_package(PythonLibs "${PYTHON_VERSION_STRING}" EXACT)
+    # not using PYTHON_VERSION_STRING here, because it might not conform to the CMake version format
+    find_host_package(PythonLibs "${PYTHON_VERSION_MAJOR_MINOR}.${PYTHON_VERSION_PATCH}" EXACT)
   endif()
 
   if(NOT ANDROID AND NOT IOS)
diff --git a/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst b/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst
index 86f448e8a2..55b1eec91d 100644
--- a/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst
+++ b/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst
@@ -33,21 +33,21 @@ To draw a line, you need to pass starting and ending coordinates of line. We wil
     img = np.zeros((512,512,3), np.uint8)
 
     # Draw a diagonal blue line with thickness of 5 px
-    img = cv2.line(img,(0,0),(511,511),(255,0,0),5)
+    cv2.line(img,(0,0),(511,511),(255,0,0),5)
 
 Drawing Rectangle
 -------------------
 To draw a rectangle, you need top-left corner and bottom-right corner of rectangle. This time we will draw a green rectangle at the top-right corner of image.
 ::
 
-    img = cv2.rectangle(img,(384,0),(510,128),(0,255,0),3)
+    cv2.rectangle(img,(384,0),(510,128),(0,255,0),3)
 
 Drawing Circle
 ----------------
 To draw a circle, you need its center coordinates and radius. We will draw a circle inside the rectangle drawn above.
 ::
 
-    img = cv2.circle(img,(447,63), 63, (0,0,255), -1)
+    cv2.circle(img,(447,63), 63, (0,0,255), -1)
 
 Drawing Ellipse
 --------------------
@@ -55,7 +55,7 @@ Drawing Ellipse
 To draw the ellipse, we need to pass several arguments. One argument is the center location (x,y). Next argument is axes lengths (major axis length, minor axis length). ``angle`` is the angle of rotation of ellipse in anti-clockwise direction. ``startAngle`` and ``endAngle`` denotes the starting and ending of ellipse arc measured in clockwise direction from major axis. i.e. giving values 0 and 360 gives the full ellipse. For more details, check the documentation of **cv2.ellipse()**. Below example draws a half ellipse at the center of the image.
 ::
 
-    img = cv2.ellipse(img,(256,256),(100,50),0,0,180,255,-1)
+    cv2.ellipse(img,(256,256),(100,50),0,0,180,255,-1)
 
 
 Drawing Polygon
@@ -65,7 +65,7 @@ To draw a polygon, first you need coordinates of vertices. Make those points int
 
     pts = np.array([[10,5],[20,30],[70,20],[50,10]], np.int32)
     pts = pts.reshape((-1,1,2))
-    img = cv2.polylines(img,[pts],True,(0,255,255))
+    cv2.polylines(img,[pts],True,(0,255,255))
 
 .. Note:: If third argument is ``False``, you will get a polylines joining all the points, not a closed shape.
 
@@ -103,4 +103,4 @@ Additional Resources
 
 Exercises
 ==============
-#. Try to create the logo of OpenCV using drawing functions available in OpenCV
+#. Try to create the logo of OpenCV using drawing functions available in OpenCV.
diff --git a/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst b/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst
index a10d5907c5..e1716eb4ce 100644
--- a/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst
+++ b/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst
@@ -73,7 +73,7 @@ Now we find the faces in the image. If faces are found, it returns the positions
 
     faces = face_cascade.detectMultiScale(gray, 1.3, 5)
     for (x,y,w,h) in faces:
-        img = cv2.rectangle(img,(x,y),(x+w,y+h),(255,0,0),2)
+        cv2.rectangle(img,(x,y),(x+w,y+h),(255,0,0),2)
         roi_gray = gray[y:y+h, x:x+w]
         roi_color = img[y:y+h, x:x+w]
         eyes = eye_cascade.detectMultiScale(roi_gray)
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 405c12c242..8108a61e60 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -459,6 +459,7 @@ CV_INLINE int cvIsInf( double value )
 #    endif
 #  endif
 #elif defined _MSC_VER && !defined RC_INVOKED
+#  include <intrin.h>
 #  define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
 #else
    CV_INLINE CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index f02bf9d446..3079548969 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -60,7 +60,7 @@ inline void _InputArray::init(int _flags, const void* _obj, Size _sz)
 
 inline void* _InputArray::getObj() const { return obj; }
 
-inline _InputArray::_InputArray() { init(0, 0); }
+inline _InputArray::_InputArray() { init(NONE, 0); }
 inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); }
 inline _InputArray::_InputArray(const Mat& m) { init(MAT+ACCESS_READ, &m); }
 inline _InputArray::_InputArray(const std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_READ, &vec); }
diff --git a/modules/core/include/opencv2/core/opencl/runtime/ocl_runtime.hpp b/modules/core/include/opencv2/core/opencl/runtime/ocl_runtime.hpp
deleted file mode 100644
index 8191e235dc..0000000000
--- a/modules/core/include/opencv2/core/opencl/runtime/ocl_runtime.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __OPENCV_CORE_OCL_RUNTIME_HPP__
-#define __OPENCV_CORE_OCL_RUNTIME_HPP__
-
-#ifdef HAVE_OPENCL
-
-#if defined(HAVE_OPENCL_STATIC)
-
-#if defined __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#else // HAVE_OPENCL_STATIC
-
-#include "ocl_runtime_opencl.hpp"
-
-#endif // HAVE_OPENCL_STATIC
-
-#ifndef CL_DEVICE_DOUBLE_FP_CONFIG
-#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
-#endif
-
-#ifndef CL_DEVICE_HALF_FP_CONFIG
-#define CL_DEVICE_HALF_FP_CONFIG 0x1033
-#endif
-
-#ifndef CL_VERSION_1_2
-#define CV_REQUIRE_OPENCL_1_2_ERROR CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "OpenCV compiled without OpenCL v1.2 support, so we can't use functionality from OpenCL v1.2")
-#endif
-
-#endif // HAVE_OPENCL
-
-#endif // __OPENCV_CORE_OCL_RUNTIME_HPP__
diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp
index f6e62da69c..4eae4897e0 100644
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -651,13 +651,13 @@ OCL_PERF_TEST_P(SetIdentityFixture, SetIdentity,
 
 typedef Size_MatType MeanStdDevFixture;
 
-OCL_PERF_TEST_P(MeanStdDevFixture, DISABLED_MeanStdDev,
+OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev,
                 ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
     const int type = get<1>(params);
-    const double eps = 1e-5;
+    const double eps = 2e-5;
 
     checkDeviceMaxMemoryAllocSize(srcSize, type);
 
@@ -687,7 +687,7 @@ CV_ENUM(NormType, NORM_INF, NORM_L1, NORM_L2)
 typedef std::tr1::tuple<Size, MatType, NormType> NormParams;
 typedef TestBaseWithParam<NormParams> NormFixture;
 
-OCL_PERF_TEST_P(NormFixture, DISABLED_Norm,
+OCL_PERF_TEST_P(NormFixture, Norm,
                 ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES, NormType::all()))
 {
     const NormParams params = GetParam();
@@ -703,7 +703,7 @@ OCL_PERF_TEST_P(NormFixture, DISABLED_Norm,
 
     OCL_TEST_CYCLE() res = cv::norm(src1, src2, normType);
 
-    SANITY_CHECK(res, 1e-6, ERROR_RELATIVE);
+    SANITY_CHECK(res, 1e-5, ERROR_RELATIVE);
 }
 
 ///////////// Repeat ////////////////////////
@@ -814,6 +814,104 @@ OCL_PERF_TEST_P(NormalizeFixture, Normalize,
     SANITY_CHECK(dst, 5e-2);
 }
 
+///////////// ConvertScaleAbs ////////////////////////
+
+typedef Size_MatType ConvertScaleAbsFixture;
+
+OCL_PERF_TEST_P(ConvertScaleAbsFixture, ConvertScaleAbs,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), cn = CV_MAT_CN(type);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_8UC(cn));
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::convertScaleAbs(src, dst, 0.5, 2);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// PatchNaNs ////////////////////////
+
+typedef Size_MatType PatchNaNsFixture;
+
+OCL_PERF_TEST_P(PatchNaNsFixture, PatchNaNs,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    Size srcSize = get<0>(params);
+    const int type = get<1>(params), cn = CV_MAT_CN(type);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(src);
+
+    // generating NaNs
+    {
+        Mat src_ = src.getMat(ACCESS_RW);
+        srcSize.width *= cn;
+        for (int y = 0; y < srcSize.height; ++y)
+        {
+            float * const ptr = src_.ptr<float>(y);
+            for (int x = 0; x < srcSize.width; ++x)
+                ptr[x] = (x + y) % 2 == 0 ? std::numeric_limits<float>::quiet_NaN() : ptr[x];
+        }
+    }
+
+    OCL_TEST_CYCLE() cv::patchNaNs(src, 17.7);
+
+    SANITY_CHECK(src);
+}
+
+
+///////////// ScaleAdd ////////////////////////
+
+typedef Size_MatType ScaleAddFixture;
+
+OCL_PERF_TEST_P(ScaleAddFixture, ScaleAdd,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::scaleAdd(src1, 0.6, src2, dst);
+
+    SANITY_CHECK(dst, 1e-6);
+}
+
+///////////// PSNR ////////////////////////
+
+typedef Size_MatType PSNRFixture;
+
+OCL_PERF_TEST_P(PSNRFixture, PSNR,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    double psnr = 0;
+    UMat src1(srcSize, type), src2(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() psnr = cv::PSNR(src1, src2);
+
+    SANITY_CHECK(psnr, 1e-4, ERROR_RELATIVE);
+}
+
 } } // namespace cvtest::ocl
 
 #endif // HAVE_OPENCL
diff --git a/modules/core/perf/opencl/perf_channels.cpp b/modules/core/perf/opencl/perf_channels.cpp
index 958bb73b5d..4856ed4c39 100644
--- a/modules/core/perf/opencl/perf_channels.cpp
+++ b/modules/core/perf/opencl/perf_channels.cpp
@@ -137,10 +137,13 @@ OCL_PERF_TEST_P(MixChannelsFixture, MixChannels,
 
     checkDeviceMaxMemoryAllocSize(srcSize, type);
 
-    UMat templ(srcSize, type);
-    std::vector<UMat> src(n, templ), dst(n, templ);
+    std::vector<UMat> src(n), dst(n);
     for (int i = 0; i < n; ++i)
+    {
+        src[i] = UMat(srcSize, type);
+        dst[i] = UMat(srcSize, type);
         declare.in(src[i], WARMUP_RNG).out(dst[i]);
+    }
 
     int fromTo[] = { 1,2, 2,0, 0,3, 3,1 };
 
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 90e0d74a49..15bd93ef53 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -2375,7 +2375,7 @@ static bool ocl_patchNaNs( InputOutputArray _a, float value )
     int cn = a.channels();
 
     k.args(ocl::KernelArg::ReadOnlyNoSize(a),
-           ocl::KernelArg::WriteOnly(a), (float)value);
+           ocl::KernelArg::WriteOnly(a, cn), (float)value);
 
     size_t globalsize[2] = { a.cols * cn, a.rows };
     return k.run(2, globalsize, NULL, false);
diff --git a/modules/core/src/opencl/mixchannels.cl b/modules/core/src/opencl/mixchannels.cl
index 173421e6ce..7abd60af42 100644
--- a/modules/core/src/opencl/mixchannels.cl
+++ b/modules/core/src/opencl/mixchannels.cl
@@ -44,7 +44,7 @@
 #define DECLARE_INPUT_MAT(i) \
     __global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,
 #define DECLARE_OUTPUT_MAT(i) \
-    __global const uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
+    __global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
 #define PROCESS_ELEM(i) \
     int src##i##_index = mad24(src##i##_step, y, x * (int)sizeof(T) * scn##i + src##i##_offset); \
     __global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 4f0d806708..7a353144c9 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -51,7 +51,12 @@
 #endif
 
 #define noconvert
+
+#ifdef HAVE_MASK
+#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
+#else
 #define EXTRA_PARAMS
+#endif
 
 #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR
 #if OP_SUM
@@ -65,11 +70,19 @@
     __local dstT localmem[WGS2_ALIGNED]
 #define DEFINE_ACCUMULATOR \
     dstT accumulator = (dstT)(0)
+#ifdef HAVE_MASK
+#define REDUCE_GLOBAL \
+    dstT temp = convertToDT(src[0]); \
+    int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
+    if (mask[mask_index]) \
+        FUNC(accumulator, temp)
+#else
 #define REDUCE_GLOBAL \
     dstT temp = convertToDT(src[0]); \
     FUNC(accumulator, temp)
+#endif
 #define SET_LOCAL_1 \
-        localmem[lid] = accumulator
+    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
     localmem[lid - WGS2_ALIGNED] += accumulator
 #define REDUCE_LOCAL_2 \
@@ -88,7 +101,7 @@
 #define REDUCE_GLOBAL \
     accumulator += src[0] == zero ? zero : one
 #define SET_LOCAL_1 \
-        localmem[lid] = accumulator
+    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
     localmem[lid - WGS2_ALIGNED] += accumulator
 #define REDUCE_LOCAL_2 \
diff --git a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp
index 2ea58c220b..8ff3cec90c 100644
--- a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp
+++ b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp
@@ -1,10 +1,6 @@
 //
 // AUTOGENERATED, DO NOT EDIT
 //
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 // generated by parser_clamdblas.py
 enum OPENCLAMDBLAS_FN_ID {
 //    OPENCLAMDBLAS_FN_clAmdBlasAddScratchImage = 0,
@@ -1251,7 +1247,6 @@ static const struct DynamicFnEntry* openclamdblas_fn[] = {
     NULL/*&clAmdBlasiDamax_definition*/,
     NULL/*&clAmdBlasiSamax_definition*/,
     NULL/*&clAmdBlasiZamax_definition*/,
-    ADDITIONAL_FN_DEFINITIONS // macro for custom functions
 };
 
 // number of enabled functions: 6
diff --git a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp
index 1742ab6063..d5bdf7e0b9 100644
--- a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp
+++ b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp
@@ -1,10 +1,6 @@
 //
 // AUTOGENERATED, DO NOT EDIT
 //
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 // generated by parser_clamdfft.py
 enum OPENCLAMDFFT_FN_ID {
     OPENCLAMDFFT_FN_clAmdFftBakePlan = 0,
@@ -393,7 +389,6 @@ static const struct DynamicFnEntry* openclamdfft_fn[] = {
     &clAmdFftSetResultLocation_definition,
     &clAmdFftSetup_definition,
     &clAmdFftTeardown_definition,
-    ADDITIONAL_FN_DEFINITIONS // macro for custom functions
 };
 
 // number of enabled functions: 15
diff --git a/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp b/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp
index a40a5fd918..913b523139 100644
--- a/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp
+++ b/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp
@@ -1,11 +1,6 @@
 //
 // AUTOGENERATED, DO NOT EDIT
 //
-
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 // generated by parser_cl.py
 enum OPENCL_FN_ID {
     OPENCL_FN_clBuildProgram = 0,
@@ -666,7 +661,6 @@ static const struct DynamicFnEntry* opencl_fn_list[] = {
     &clUnloadCompiler_definition,
     &clUnloadPlatformCompiler_definition,
     &clWaitForEvents_definition,
-    ADDITIONAL_FN_DEFINITIONS // macro for custom functions
 };
 
 // number of enabled functions: 88
diff --git a/modules/core/src/opencl/runtime/generator/common.py b/modules/core/src/opencl/runtime/generator/common.py
index ed0face060..80c545295f 100644
--- a/modules/core/src/opencl/runtime/generator/common.py
+++ b/modules/core/src/opencl/runtime/generator/common.py
@@ -161,7 +161,6 @@ def generateListOfDefinitions(fns, name='opencl_fn_list'):
         else:
             print '    NULL/*&%s_definition*/,' % (fn['name'])
         first = False
-    print '    ADDITIONAL_FN_DEFINITIONS // macro for custom functions'
     print '};'
 
 @outputToString
diff --git a/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list b/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list
index 1f9820a1cf..8b78df1756 100644
--- a/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list
+++ b/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list
@@ -5,7 +5,7 @@ clAmdFftDestroyPlan
 clAmdFftEnqueueTransform
 //clAmdFftGetLayout
 //clAmdFftGetPlanBatchSize
-clAmdFftGetPlanContext
+//clAmdFftGetPlanContext
 //clAmdFftGetPlanDim
 //clAmdFftGetPlanDistance
 //clAmdFftGetPlanInStride
@@ -22,7 +22,7 @@ clAmdFftSetPlanBatchSize
 //clAmdFftSetPlanDim
 clAmdFftSetPlanDistance
 clAmdFftSetPlanInStride
-clAmdFftSetPlanLength
+//clAmdFftSetPlanLength
 clAmdFftSetPlanOutStride
 clAmdFftSetPlanPrecision
 clAmdFftSetPlanScale
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.cpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.cpp.in
deleted file mode 100644
index 8492edda9e..0000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.cpp.in
+++ /dev/null
@@ -1,75 +0,0 @@
-#include "precomp.hpp"
-
-#ifdef HAVE_CLAMDBLAS
-
-#include "opencv2/ocl/cl_runtime/cl_runtime.hpp"
-#include "opencv2/ocl/cl_runtime/clamdblas_runtime.hpp"
-
-#if defined(_WIN32)
-    static void* WinGetProcAddress(const char* name)
-    {
-        static HMODULE opencl_module = NULL;
-        if (!opencl_module)
-        {
-            opencl_module = GetModuleHandleA("clAmdBlas.dll");
-            if (!opencl_module)
-            {
-                opencl_module = LoadLibraryA("clAmdBlas.dll");
-                if (!opencl_module)
-                    return NULL;
-            }
-        }
-        return (void*)GetProcAddress(opencl_module, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) WinGetProcAddress(name)
-#endif // _WIN32
-
-#if defined(linux)
-    #include <dlfcn.h>
-    #include <stdio.h>
-
-    static void* GetProcAddress (const char* name)
-    {
-        static void* h = NULL;
-        if (!h)
-        {
-            h = dlopen("libclAmdBlas.so", RTLD_LAZY | RTLD_GLOBAL);
-            if (!h)
-                return NULL;
-        }
-
-        return dlsym(h, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) GetProcAddress(name)
-#endif
-
-#ifndef CV_CL_GET_PROC_ADDRESS
-#define CV_CL_GET_PROC_ADDRESS(name) NULL
-#endif
-
-@CL_FN_ENUMS@
-@CL_FN_NAMES@
-
-static void* openclamdblas_check_fn(int ID)
-{
-    void* func = CV_CL_GET_PROC_ADDRESS(openclamdblas_fn_names[ID]);
-    if (!func)
-    {
-        std::ostringstream msg;
-        msg << "OpenCL AMD BLAS function is not available: [" << openclamdblas_fn_names[ID] << "]";
-        CV_Error(CV_StsBadFunc, msg.str());
-    }
-    extern void* openclamdblas_fn_ptrs[];
-    *(void**)(openclamdblas_fn_ptrs[ID]) = func;
-    return func;
-}
-
-namespace {
-@CL_FN_SWITCH@
-}
-
-@CL_FN_DEFINITIONS@
-
-@CL_FN_PTRS@
-
-#endif
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.hpp.in
deleted file mode 100644
index cbffb0861a..0000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.hpp.in
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __OPENCV_OCL_CLAMDBLAS_RUNTIME_HPP__
-#define __OPENCV_OCL_CLAMDBLAS_RUNTIME_HPP__
-
-#ifdef HAVE_CLAMDBLAS
-
-@CLAMDBLAS_REMAP_ORIGIN@
-
-#include <clAmdBlas.h>
-
-@CLAMDBLAS_REMAP_DYNAMIC@
-
-#ifndef CL_RUNTIME_EXPORT
-#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_OCL_SHARED)) && (defined WIN32 || defined _WIN32 || defined WINCE)
-#define CL_RUNTIME_EXPORT __declspec(dllimport)
-#else
-#define CL_RUNTIME_EXPORT
-#endif
-#endif
-
-
-@CLAMDBLAS_FN_DECLARATIONS@
-
-#endif
-
-#endif // __OPENCV_OCL_CLAMDBLAS_RUNTIME_HPP__
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.cpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.cpp.in
deleted file mode 100644
index aee6bd8ab6..0000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.cpp.in
+++ /dev/null
@@ -1,75 +0,0 @@
-#include "precomp.hpp"
-
-#ifdef HAVE_CLAMDFFT
-
-#include "opencv2/ocl/cl_runtime/cl_runtime.hpp"
-#include "opencv2/ocl/cl_runtime/clamdfft_runtime.hpp"
-
-#if defined(_WIN32)
-    static void* WinGetProcAddress(const char* name)
-    {
-        static HMODULE opencl_module = NULL;
-        if (!opencl_module)
-        {
-            opencl_module = GetModuleHandleA("clAmdFft.Runtime.dll");
-            if (!opencl_module)
-            {
-                opencl_module = LoadLibraryA("clAmdFft.Runtime.dll");
-                if (!opencl_module)
-                    return NULL;
-            }
-        }
-        return (void*)GetProcAddress(opencl_module, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) WinGetProcAddress(name)
-#endif // _WIN32
-
-#if defined(linux)
-    #include <dlfcn.h>
-    #include <stdio.h>
-
-    static void* GetProcAddress (const char* name)
-    {
-        static void* h = NULL;
-        if (!h)
-        {
-            h = dlopen("libclAmdFft.Runtime.so", RTLD_LAZY | RTLD_GLOBAL);
-            if (!h)
-                return NULL;
-        }
-
-        return dlsym(h, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) GetProcAddress(name)
-#endif
-
-#ifndef CV_CL_GET_PROC_ADDRESS
-#define CV_CL_GET_PROC_ADDRESS(name) NULL
-#endif
-
-@CL_FN_ENUMS@
-@CL_FN_NAMES@
-
-static void* openclamdfft_check_fn(int ID)
-{
-    void* func = CV_CL_GET_PROC_ADDRESS(openclamdfft_fn_names[ID]);
-    if (!func)
-    {
-        std::ostringstream msg;
-        msg << "OpenCL AMD FFT function is not available: [" << openclamdfft_fn_names[ID] << "]";
-        CV_Error(CV_StsBadFunc, msg.str());
-    }
-    extern void* openclamdfft_fn_ptrs[];
-    *(void**)(openclamdfft_fn_ptrs[ID]) = func;
-    return func;
-}
-
-namespace {
-@CL_FN_SWITCH@
-}
-
-@CL_FN_DEFINITIONS@
-
-@CL_FN_PTRS@
-
-#endif
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.hpp.in
deleted file mode 100644
index 5e26d0154e..0000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.hpp.in
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __OPENCV_OCL_CLAMDFFT_RUNTIME_HPP__
-#define __OPENCV_OCL_CLAMDFFT_RUNTIME_HPP__
-
-#ifdef HAVE_CLAMDFFT
-
-@CLAMDFFT_REMAP_ORIGIN@
-
-#include <clAmdFft.h>
-
-@CLAMDFFT_REMAP_DYNAMIC@
-
-#ifndef CL_RUNTIME_EXPORT
-#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_OCL_SHARED)) && (defined WIN32 || defined _WIN32 || defined WINCE)
-#define CL_RUNTIME_EXPORT __declspec(dllimport)
-#else
-#define CL_RUNTIME_EXPORT
-#endif
-#endif
-
-
-@CLAMDFFT_FN_DECLARATIONS@
-
-#endif
-
-#endif // __OPENCV_OCL_CLAMDFFT_RUNTIME_HPP__
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_impl_opencl.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_runtime_impl_opencl.hpp.in
deleted file mode 100644
index ff0395dcd3..0000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_impl_opencl.hpp.in
+++ /dev/null
@@ -1,10 +0,0 @@
-@CL_FN_ENUMS@
-@CL_FN_NAMES@
-
-namespace {
-@CL_FN_SWITCH@
-}
-
-@CL_FN_DEFINITIONS@
-
-@CL_FN_PTRS@
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl.hpp.in
deleted file mode 100644
index 86690af868..0000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl.hpp.in
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __OPENCV_OCL_CL_RUNTIME_OPENCL_HPP__
-#define __OPENCV_OCL_CL_RUNTIME_OPENCL_HPP__
-
-@CL_REMAP_ORIGIN@
-
-#if defined __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-@CL_REMAP_DYNAMIC@
-
-#ifndef CL_RUNTIME_EXPORT
-#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_OCL_SHARED)) && (defined WIN32 || defined _WIN32 || defined WINCE)
-#define CL_RUNTIME_EXPORT __declspec(dllimport)
-#else
-#define CL_RUNTIME_EXPORT
-#endif
-#endif
-
-@CL_FN_DECLARATIONS@
-
-#endif // __OPENCV_OCL_CL_RUNTIME_OPENCL_HPP__
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl_wrappers.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl_wrappers.hpp.in
deleted file mode 100644
index d02d4c5ff2..0000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl_wrappers.hpp.in
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
-#define __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
-
-@CL_FN_INLINE_WRAPPERS@
-
-#endif // __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
\ No newline at end of file
diff --git a/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in b/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in
index 11c834f22d..f3d12558e1 100644
--- a/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in
+++ b/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in
@@ -1,7 +1,3 @@
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 @CL_FN_ENUMS@
 
 namespace {
diff --git a/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in b/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in
index 11c834f22d..f3d12558e1 100644
--- a/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in
+++ b/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in
@@ -1,7 +1,3 @@
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 @CL_FN_ENUMS@
 
 namespace {
diff --git a/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in b/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in
index f3adb66471..14586017a4 100644
--- a/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in
+++ b/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in
@@ -1,8 +1,3 @@
-
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 @CL_FN_ENUMS@
 
 namespace {
diff --git a/modules/core/src/opencl/runtime/opencl_clamdblas.cpp b/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
index 6296ef674d..e7ff79e3cd 100644
--- a/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
+++ b/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
@@ -100,8 +100,6 @@ static void* openclamdblas_check_fn(int ID);
 
 #define CUSTOM_FUNCTION_ID 1000
 
-#undef ADDITIONAL_FN_DEFINITIONS
-
 //
 // END OF CUSTOM FUNCTIONS HERE
 //
@@ -110,7 +108,6 @@ static void* openclamdblas_check_fn(int ID);
 
 static void* openclamdblas_check_fn(int ID)
 {
-    ID = (ID <= CUSTOM_FUNCTION_ID) ? ID : ID - CUSTOM_FUNCTION_ID;
     assert(ID >= 0 && ID < (int)(sizeof(openclamdblas_fn)/sizeof(openclamdblas_fn[0])));
     const struct DynamicFnEntry* e = openclamdblas_fn[ID];
     void* func = CV_CL_GET_PROC_ADDRESS(e->fnName);
diff --git a/modules/core/src/opencl/runtime/opencl_clamdfft.cpp b/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
index 2514b0a57f..d42bef9b81 100644
--- a/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
+++ b/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
@@ -100,8 +100,6 @@ static void* openclamdfft_check_fn(int ID);
 
 #define CUSTOM_FUNCTION_ID 1000
 
-#undef ADDITIONAL_FN_DEFINITIONS
-
 //
 // END OF CUSTOM FUNCTIONS HERE
 //
@@ -110,7 +108,6 @@ static void* openclamdfft_check_fn(int ID);
 
 static void* openclamdfft_check_fn(int ID)
 {
-    ID = (ID <= CUSTOM_FUNCTION_ID) ? ID : ID - CUSTOM_FUNCTION_ID;
     assert(ID >= 0 && ID < (int)(sizeof(openclamdfft_fn)/sizeof(openclamdfft_fn[0])));
     const struct DynamicFnEntry* e = openclamdfft_fn[ID];
     void* func = CV_CL_GET_PROC_ADDRESS(e->fnName);
diff --git a/modules/core/src/opencl/runtime/opencl_core.cpp b/modules/core/src/opencl/runtime/opencl_core.cpp
index d8f231b2d9..238a5bdfb6 100644
--- a/modules/core/src/opencl/runtime/opencl_core.cpp
+++ b/modules/core/src/opencl/runtime/opencl_core.cpp
@@ -169,25 +169,30 @@ static void* opencl_check_fn(int ID);
 
 #include "runtime_common.hpp"
 
+#include "autogenerated/opencl_core_impl.hpp"
+
 //
 // BEGIN OF CUSTOM FUNCTIONS
 //
 
 #define CUSTOM_FUNCTION_ID 1000
 
-#undef ADDITIONAL_FN_DEFINITIONS
-
 //
 // END OF CUSTOM FUNCTIONS HERE
 //
 
-#include "autogenerated/opencl_core_impl.hpp"
-
 static void* opencl_check_fn(int ID)
 {
-    ID = (ID <= CUSTOM_FUNCTION_ID) ? ID : ID - CUSTOM_FUNCTION_ID;
-    assert(ID >= 0 && ID < (int)(sizeof(opencl_fn_list)/sizeof(opencl_fn_list[0])));
-    const struct DynamicFnEntry* e = opencl_fn_list[ID];
+    const struct DynamicFnEntry* e = NULL;
+    if (ID < CUSTOM_FUNCTION_ID)
+    {
+        assert(ID >= 0 && ID < (int)(sizeof(opencl_fn_list)/sizeof(opencl_fn_list[0])));
+        e = opencl_fn_list[ID];
+    }
+    else
+    {
+        CV_ErrorNoReturn(cv::Error::StsBadArg, "Invalid function ID");
+    }
     void* func = CV_CL_GET_PROC_ADDRESS(e->fnName);
     if (!func)
     {
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 2806efeb31..eac823995a 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -466,7 +466,7 @@ template <typename T> Scalar ocl_part_sum(Mat m)
 
 enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
 
-static bool ocl_sum( InputArray _src, Scalar & res, int sum_op )
+static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() )
 {
     CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
 
@@ -479,7 +479,10 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op )
     int dbsize = ocl::Device::getDefault().maxComputeUnits();
     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
 
-    int ddepth = std::max(CV_32S, depth), dtype = CV_MAKE_TYPE(ddepth, cn);
+    int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
+            dtype = CV_MAKE_TYPE(ddepth, cn);
+    bool haveMask = _mask.kind() != _InputArray::NONE;
+    CV_Assert(!haveMask || _mask.type() == CV_8UC1);
 
     int wgs2_aligned = 1;
     while (wgs2_aligned < (int)wgs)
@@ -489,19 +492,27 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op )
     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
     char cvt[40];
     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s",
+                  format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
                          ocl::typeToStr(type), ocl::typeToStr(dtype), ocl::convertTypeStr(depth, ddepth, cn, cvt),
                          opMap[sum_op], (int)wgs, wgs2_aligned,
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         haveMask ? " -D HAVE_MASK" : ""));
     if (k.empty())
         return false;
 
-    UMat src = _src.getUMat(), db(1, dbsize, dtype);
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-           dbsize, ocl::KernelArg::PtrWriteOnly(db));
+    UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat();
+
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            dbarg = ocl::KernelArg::PtrWriteOnly(db),
+            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+
+    if (haveMask)
+        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg);
+    else
+        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg);
 
     size_t globalsize = dbsize * wgs;
-    if (k.run(1, &globalsize, &wgs, true))
+    if (k.run(1, &globalsize, &wgs, false))
     {
         typedef Scalar (*part_sum)(Mat m);
         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
@@ -806,15 +817,18 @@ cv::Scalar cv::mean( InputArray _src, InputArray _mask )
 
 namespace cv {
 
-static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv )
+static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 {
+    bool haveMask = _mask.kind() != _InputArray::NONE;
+
     Scalar mean, stddev;
-    if (!ocl_sum(_src, mean, OCL_OP_SUM))
+    if (!ocl_sum(_src, mean, OCL_OP_SUM, _mask))
         return false;
-    if (!ocl_sum(_src, stddev, OCL_OP_SUM_SQR))
+    if (!ocl_sum(_src, stddev, OCL_OP_SUM_SQR, _mask))
         return false;
 
-    double total = 1.0 / _src.total();
+    int nz = haveMask ? countNonZero(_mask) : (int)_src.total();
+    double total = nz != 0 ? 1.0 / nz : 0;
     int k, j, cn = _src.channels();
     for (int i = 0; i < cn; ++i)
     {
@@ -849,7 +863,7 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
 
 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 {
-    if (ocl::useOpenCL() && _src.isUMat() && _mask.empty() && ocl_meanStdDev(_src, _mean, _sdv))
+    if (ocl::useOpenCL() && _src.isUMat() && ocl_meanStdDev(_src, _mean, _sdv, _mask))
         return;
 
     Mat src = _src.getMat(), mask = _mask.getMat();
@@ -1882,13 +1896,14 @@ static NormDiffFunc getNormDiffFunc(int normType, int depth)
 
 namespace cv {
 
-static bool ocl_norm( InputArray _src, int normType, double & result )
+static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            haveMask = _mask.kind() != _InputArray::NONE;
 
-    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2) ||
-         (!doubleSupport && depth == CV_64F))
+    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
+         (!doubleSupport && depth == CV_64F) || (normType == NORM_INF && haveMask && cn != 1))
         return false;
 
     UMat src = _src.getUMat();
@@ -1920,16 +1935,25 @@ static bool ocl_norm( InputArray _src, int normType, double & result )
         else
             abssrc = src;
 
-        cv::minMaxIdx(abssrc.reshape(1), NULL, &result);
+        cv::minMaxIdx(haveMask ? abssrc : abssrc.reshape(1), NULL, &result, NULL, NULL, _mask);
     }
-    else if (normType == NORM_L1 || normType == NORM_L2)
+    else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
     {
-        Scalar s;
+        Scalar sc;
         bool unstype = depth == CV_8U || depth == CV_16U;
 
-        ocl_sum(src.reshape(1), s, normType == NORM_L2 ?
-                    OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS) );
-        result = normType == NORM_L1 ? s[0] : std::sqrt(s[0]);
+        if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
+                    OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
+            return false;
+
+        if (!haveMask)
+            cn = 1;
+
+        double s = 0.0;
+        for (int i = 0; i < cn; ++i)
+            s += sc[i];
+
+        result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
     }
 
     return true;
@@ -1945,7 +1969,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
                ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );
 
     double _result = 0;
-    if (ocl::useOpenCL() && _mask.empty() && _src.isUMat() && _src.dims() <= 2 && ocl_norm(_src, normType, _result))
+    if (ocl::useOpenCL() && _src.isUMat() && _src.dims() <= 2 && ocl_norm(_src, normType, _mask, _result))
         return _result;
 
     Mat src = _src.getMat(), mask = _mask.getMat();
@@ -2237,7 +2261,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, double &
     bool relative = (normType & NORM_RELATIVE) != 0;
     normType &= ~NORM_RELATIVE;
 
-    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2) ||
+    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
          (!doubleSupport && depth == CV_64F))
         return false;
 
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index 03d8422182..ff85e6c248 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -924,6 +924,44 @@ OCL_TEST_P(MeanStdDev, Mat)
     }
 }
 
+OCL_TEST_P(MeanStdDev, Mat_Mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Scalar cpu_mean, cpu_stddev;
+        Scalar gpu_mean, gpu_stddev;
+
+        OCL_OFF(cv::meanStdDev(src1_roi, cpu_mean, cpu_stddev, mask_roi));
+        OCL_ON(cv::meanStdDev(usrc1_roi, gpu_mean, gpu_stddev, umask_roi));
+
+        for (int i = 0; i < cn; ++i)
+        {
+            EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 0.1);
+            EXPECT_NEAR(cpu_stddev[i], gpu_stddev[i], 0.1);
+        }
+    }
+}
+
+OCL_TEST(MeanStdDev_, ZeroMask)
+{
+    Size size(5, 5);
+    UMat um(size, CV_32SC1), umask(size, CV_8UC1, Scalar::all(0));
+    Mat m(size, CV_32SC1), mask(size, CV_8UC1, Scalar::all(0));
+
+    Scalar cpu_mean, cpu_stddev;
+    Scalar gpu_mean, gpu_stddev;
+
+    OCL_OFF(cv::meanStdDev(m, cpu_mean, cpu_stddev, mask));
+    OCL_ON(cv::meanStdDev(um, gpu_mean, gpu_stddev, umask));
+
+    for (int i = 0; i < 4; ++i)
+    {
+        EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 0.1);
+        EXPECT_NEAR(cpu_stddev[i], gpu_stddev[i], 0.1);
+    }
+}
 
 //////////////////////////////////////// Log /////////////////////////////////////////
 
@@ -1124,6 +1162,19 @@ OCL_TEST_P(Norm, NORM_INF_1arg)
     }
 }
 
+OCL_TEST_P(Norm, NORM_INF_1arg_mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_INF, mask_roi));
+        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_INF, umask_roi));
+
+        EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+    }
+}
+
 OCL_TEST_P(Norm, NORM_L1_1arg)
 {
     for (int j = 0; j < test_loop_times; j++)
@@ -1137,6 +1188,19 @@ OCL_TEST_P(Norm, NORM_L1_1arg)
     }
 }
 
+OCL_TEST_P(Norm, NORM_L1_1arg_mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_L1, mask_roi));
+        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_L1, umask_roi));
+
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+    }
+}
+
 OCL_TEST_P(Norm, NORM_L2_1arg)
 {
     for (int j = 0; j < test_loop_times; j++)
@@ -1150,6 +1214,19 @@ OCL_TEST_P(Norm, NORM_L2_1arg)
     }
 }
 
+OCL_TEST_P(Norm, NORM_L2_1arg_mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_L2, mask_roi));
+        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_L2, umask_roi));
+
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+    }
+}
+
 OCL_TEST_P(Norm, NORM_INF_2args)
 {
     for (int relative = 0; relative < 2; ++relative)
@@ -1168,6 +1245,24 @@ OCL_TEST_P(Norm, NORM_INF_2args)
         }
 }
 
+OCL_TEST_P(Norm, NORM_INF_2args_mask)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < test_loop_times; j++)
+        {
+            generateTestData();
+
+            int type = NORM_INF;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type, mask_roi));
+            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type, umask_roi));
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
 OCL_TEST_P(Norm, NORM_L1_2args)
 {
     for (int relative = 0; relative < 2; ++relative)
@@ -1186,6 +1281,24 @@ OCL_TEST_P(Norm, NORM_L1_2args)
         }
 }
 
+OCL_TEST_P(Norm, NORM_L1_2args_mask)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < test_loop_times; j++)
+        {
+            generateTestData();
+
+            int type = NORM_L1;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type, mask_roi));
+            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type, umask_roi));
+
+            EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+        }
+}
+
 OCL_TEST_P(Norm, NORM_L2_2args)
 {
     for (int relative = 0; relative < 2; ++relative)
@@ -1204,6 +1317,24 @@ OCL_TEST_P(Norm, NORM_L2_2args)
         }
 }
 
+OCL_TEST_P(Norm, NORM_L2_2args_mask)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < test_loop_times; j++)
+        {
+            generateTestData();
+
+            int type = NORM_L2;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type, mask_roi));
+            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type, umask_roi));
+
+            EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+        }
+}
+
 //////////////////////////////// Sqrt ////////////////////////////////////////////////
 
 typedef ArithmTestBase Sqrt;
@@ -1355,7 +1486,7 @@ OCL_TEST_P(ScaleAdd, Mat)
         OCL_OFF(cv::scaleAdd(src1_roi, val[0], src2_roi, dst1_roi));
         OCL_ON(cv::scaleAdd(usrc1_roi, val[0], usrc2_roi, udst1_roi));
 
-        Near(depth <= CV_32S ? 1 : 1e-6);
+        Near(depth <= CV_32S ? 1 : 1e-3);
     }
 }
 
@@ -1416,6 +1547,25 @@ OCL_TEST_P(PatchNaNs, Mat)
     }
 }
 
+//////////////////////////////// Psnr ////////////////////////////////////////////////
+
+typedef ArithmTestBase Psnr;
+
+OCL_TEST_P(Psnr, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        double cpuRes = 0, gpuRes = 0;
+
+        OCL_OFF(cpuRes = cv::PSNR(src1_roi, src2_roi));
+        OCL_ON(gpuRes = cv::PSNR(usrc1_roi, usrc2_roi));
+
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+    }
+}
+
 //////////////////////////////////////// Instantiation /////////////////////////////////////////
 
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(::testing::Values(CV_8U, CV_8S), OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
@@ -1455,6 +1605,7 @@ OCL_INSTANTIATE_TEST_CASE_P(Arithm, InRange, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHA
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, ConvertScaleAbs, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, ScaleAdd, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, PatchNaNs, Combine(OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, Psnr, Combine(::testing::Values((MatDepth)CV_8U), OCL_ALL_CHANNELS, Bool()));
 
 } } // namespace cvtest::ocl
 
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index be141b1afa..d451b93b1e 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -224,7 +224,7 @@ CV_EXPORTS Ptr<HoughLinesDetector> createHoughLinesDetector(float rho, float the
 //////////////////////////////////////
 // HoughLinesP
 
-//! finds line segments in the black-n-white image using probabalistic Hough transform
+//! finds line segments in the black-n-white image using probabilistic Hough transform
 class CV_EXPORTS HoughSegmentDetector : public Algorithm
 {
 public:
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 089d9e55cc..46f2fe159f 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -1212,7 +1212,7 @@ bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size proce
                         ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
                         processingRectSize,
                         yStep, (float)factor,
-                        normrect, data.origWinSize, MAX_FACES);
+                        normrect, data.origWinSize, (int)MAX_FACES);
         ok = haarKernel.run(2, globalsize, 0, true);
     }
     else if( featureType == FeatureEvaluator::LBP )
@@ -1245,7 +1245,7 @@ bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size proce
                         ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
                         processingRectSize,
                         yStep, (float)factor,
-                        data.origWinSize, MAX_FACES);
+                        data.origWinSize, (int)MAX_FACES);
         ok = lbpKernel.run(2, globalsize, 0, true);
     }
     //CV_Assert(ok);
diff --git a/modules/stitching/doc/warpers.rst b/modules/stitching/doc/warpers.rst
index 1025ffa0cb..278020f07c 100644
--- a/modules/stitching/doc/warpers.rst
+++ b/modules/stitching/doc/warpers.rst
@@ -14,17 +14,17 @@ Rotation-only model image warper interface. ::
     public:
         virtual ~RotationWarper() {}
 
-        virtual Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R) = 0;
+        virtual Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R) = 0;
 
-        virtual Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap) = 0;
+        virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) = 0;
 
-        virtual Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                           Mat &dst) = 0;
+        virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                           OutputArray dst) = 0;
 
-        virtual void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                  Size dst_size, Mat &dst) = 0;
+        virtual void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                  Size dst_size, OutputArray dst) = 0;
 
-        virtual Rect warpRoi(Size src_size, const Mat &K, const Mat &R) = 0;
+        virtual Rect warpRoi(Size src_size, InputArray K, InputArray R) = 0;
     };
 
 detail::RotationWarper::warpPoint
@@ -32,7 +32,7 @@ detail::RotationWarper::warpPoint
 
 Projects the image point.
 
-.. ocv:function:: Point2f detail::RotationWarper::warpPoint(const Point2f &pt, const Mat &K, const Mat &R)
+.. ocv:function:: Point2f detail::RotationWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R)
 
     :param pt: Source point
 
@@ -47,7 +47,7 @@ detail::RotationWarper::buildMaps
 
 Builds the projection maps according to the given camera data.
 
-.. ocv:function:: Rect detail::RotationWarper::buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+.. ocv:function:: Rect detail::RotationWarper::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
 
     :param src_size: Source image size
 
@@ -66,7 +66,7 @@ detail::RotationWarper::warp
 
 Projects the image.
 
-.. ocv:function:: Point detail::RotationWarper::warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode, Mat &dst)
+.. ocv:function:: Point detail::RotationWarper::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
 
     :param src: Source image
 
@@ -87,7 +87,7 @@ detail::RotationWarper::warpBackward
 
 Projects the image backward.
 
-.. ocv:function:: void detail::RotationWarper::warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode, Size dst_size, Mat &dst)
+.. ocv:function:: void detail::RotationWarper::warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, Size dst_size, OutputArray dst)
 
     :param src: Projected image
 
@@ -106,7 +106,7 @@ Projects the image backward.
 detail::RotationWarper::warpRoi
 -------------------------------
 
-.. ocv:function:: Rect detail::RotationWarper::warpRoi(Size src_size, const Mat &K, const Mat &R)
+.. ocv:function:: Rect detail::RotationWarper::warpRoi(Size src_size, InputArray K, InputArray R)
 
     :param src_size: Source image bounding box
 
@@ -124,9 +124,9 @@ Base class for warping logic implementation. ::
 
     struct CV_EXPORTS ProjectorBase
     {
-        void setCameraParams(const Mat &K = Mat::eye(3, 3, CV_32F),
-                            const Mat &R = Mat::eye(3, 3, CV_32F),
-                            const Mat &T = Mat::zeros(3, 1, CV_32F));
+        void setCameraParams(InputArray K = Mat::eye(3, 3, CV_32F),
+                            InputArray R = Mat::eye(3, 3, CV_32F),
+                            InputArray T = Mat::zeros(3, 1, CV_32F));
 
         float scale;
         float k[9];
@@ -146,17 +146,17 @@ Base class for rotation-based warper using a `detail::ProjectorBase`_ derived cl
     class CV_EXPORTS RotationWarperBase : public RotationWarper
     {
     public:
-        Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R);
+        Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R);
 
-        Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap);
+        Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
 
-        Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                Mat &dst);
+        Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                OutputArray dst);
 
-        void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                        Size dst_size, Mat &dst);
+        void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                        Size dst_size, OutputArray dst);
 
-        Rect warpRoi(Size src_size, const Mat &K, const Mat &R);
+        Rect warpRoi(Size src_size, InputArray K, InputArray R);
 
     protected:
 
@@ -183,14 +183,14 @@ Warper that maps an image onto the z = 1 plane. ::
 
         void setScale(float scale) { projector_.scale = scale; }
 
-        Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R, const Mat &T);
+        Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T);
 
-        Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap);
+        Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
 
-        Point warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-                   Mat &dst);
+        Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+                   OutputArray dst);
 
-        Rect warpRoi(Size src_size, const Mat &K, const Mat &R, const Mat &T);
+        Rect warpRoi(Size src_size, InputArray K, InputArray R, InputArray T);
 
     protected:
         void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br);
diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
index 8906d88a3c..093f07cc1a 100644
--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
@@ -56,17 +56,17 @@ class CV_EXPORTS RotationWarper
 public:
     virtual ~RotationWarper() {}
 
-    virtual Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R) = 0;
+    virtual Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R) = 0;
 
-    virtual Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap) = 0;
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) = 0;
 
-    virtual Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                       Mat &dst) = 0;
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                       OutputArray dst) = 0;
 
-    virtual void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                              Size dst_size, Mat &dst) = 0;
+    virtual void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                              Size dst_size, OutputArray dst) = 0;
 
-    virtual Rect warpRoi(Size src_size, const Mat &K, const Mat &R) = 0;
+    virtual Rect warpRoi(Size src_size, InputArray K, InputArray R) = 0;
 
     virtual float getScale() const { return 1.f; }
     virtual void setScale(float) {}
@@ -75,9 +75,9 @@ public:
 
 struct CV_EXPORTS ProjectorBase
 {
-    void setCameraParams(const Mat &K = Mat::eye(3, 3, CV_32F),
-                         const Mat &R = Mat::eye(3, 3, CV_32F),
-                         const Mat &T = Mat::zeros(3, 1, CV_32F));
+    void setCameraParams(InputArray K = Mat::eye(3, 3, CV_32F),
+                         InputArray R = Mat::eye(3, 3, CV_32F),
+                         InputArray T = Mat::zeros(3, 1, CV_32F));
 
     float scale;
     float k[9];
@@ -92,17 +92,17 @@ template <class P>
 class CV_EXPORTS RotationWarperBase : public RotationWarper
 {
 public:
-    Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R);
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R);
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst);
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst);
 
-    void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                      Size dst_size, Mat &dst);
+    void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                      Size dst_size, OutputArray dst);
 
-    Rect warpRoi(Size src_size, const Mat &K, const Mat &R);
+    Rect warpRoi(Size src_size, InputArray K, InputArray R);
 
     float getScale() const { return projector_.scale; }
     void setScale(float val) { projector_.scale = val; }
@@ -132,14 +132,14 @@ class CV_EXPORTS PlaneWarper : public RotationWarperBase<PlaneProjector>
 public:
     PlaneWarper(float scale = 1.f) { projector_.scale = scale; }
 
-    Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R, const Mat &T);
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T);
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap);
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-               Mat &dst);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               OutputArray dst);
 
-    Rect warpRoi(Size src_size, const Mat &K, const Mat &R, const Mat &T);
+    Rect warpRoi(Size src_size, InputArray K, InputArray R, InputArray T);
 
 protected:
     void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br);
@@ -333,7 +333,7 @@ class CV_EXPORTS PlaneWarperGpu : public PlaneWarper
 public:
     PlaneWarperGpu(float scale = 1.f) : PlaneWarper(scale) {}
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -341,7 +341,7 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, T, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -349,8 +349,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);
@@ -358,8 +358,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, T, interp_mode, border_mode, d_dst_);
@@ -367,15 +367,15 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
 private:
     cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
@@ -387,7 +387,7 @@ class CV_EXPORTS SphericalWarperGpu : public SphericalWarper
 public:
     SphericalWarperGpu(float scale) : SphericalWarper(scale) {}
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -395,8 +395,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);
@@ -404,10 +404,10 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
 private:
     cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
@@ -419,7 +419,7 @@ class CV_EXPORTS CylindricalWarperGpu : public CylindricalWarper
 public:
     CylindricalWarperGpu(float scale) : CylindricalWarper(scale) {}
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -427,8 +427,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);
@@ -436,10 +436,10 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
 private:
     cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
@@ -503,6 +503,45 @@ protected:
     }
 };
 
+/////////////////////////////////////// OpenCL Accelerated Warpers /////////////////////////////////////
+
+class CV_EXPORTS PlaneWarperOcl : public PlaneWarper
+{
+public:
+    PlaneWarperOcl(float scale = 1.f) : PlaneWarper(scale) { }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
+    {
+        return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32FC1), xmap, ymap);
+    }
+
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
+    {
+        return warp(src, K, R, Mat::zeros(3, 1, CV_32FC1), interp_mode, border_mode, dst);
+    }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode, OutputArray dst);
+};
+
+class CV_EXPORTS SphericalWarperOcl :  public SphericalWarper
+{
+public:
+    SphericalWarperOcl(float scale) : SphericalWarper(scale) { }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst);
+};
+
+class CV_EXPORTS CylindricalWarperOcl :  public CylindricalWarper
+{
+public:
+    CylindricalWarperOcl(float scale) : CylindricalWarper(scale) { }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst);
+};
+
 } // namespace detail
 } // namespace cv
 
diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp
index 144e9e32d4..f6eae4fa78 100644
--- a/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp
@@ -51,7 +51,7 @@ namespace cv {
 namespace detail {
 
 template <class P>
-Point2f RotationWarperBase<P>::warpPoint(const Point2f &pt, const Mat &K, const Mat &R)
+Point2f RotationWarperBase<P>::warpPoint(const Point2f &pt, InputArray K, InputArray R)
 {
     projector_.setCameraParams(K, R);
     Point2f uv;
@@ -61,15 +61,17 @@ Point2f RotationWarperBase<P>::warpPoint(const Point2f &pt, const Mat &K, const
 
 
 template <class P>
-Rect RotationWarperBase<P>::buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+Rect RotationWarperBase<P>::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray _xmap, OutputArray _ymap)
 {
     projector_.setCameraParams(K, R);
 
     Point dst_tl, dst_br;
     detectResultRoi(src_size, dst_tl, dst_br);
 
-    xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
-    ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+
+    Mat xmap = _xmap.getMat(), ymap = _ymap.getMat();
 
     float x, y;
     for (int v = dst_tl.y; v <= dst_br.y; ++v)
@@ -87,8 +89,8 @@ Rect RotationWarperBase<P>::buildMaps(Size src_size, const Mat &K, const Mat &R,
 
 
 template <class P>
-Point RotationWarperBase<P>::warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                  Mat &dst)
+Point RotationWarperBase<P>::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                  OutputArray dst)
 {
     Mat xmap, ymap;
     Rect dst_roi = buildMaps(src.size(), K, R, xmap, ymap);
@@ -101,14 +103,16 @@ Point RotationWarperBase<P>::warp(const Mat &src, const Mat &K, const Mat &R, in
 
 
 template <class P>
-void RotationWarperBase<P>::warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                         Size dst_size, Mat &dst)
+void RotationWarperBase<P>::warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                         Size dst_size, OutputArray dst)
 {
     projector_.setCameraParams(K, R);
 
     Point src_tl, src_br;
     detectResultRoi(dst_size, src_tl, src_br);
-    CV_Assert(src_br.x - src_tl.x + 1 == src.cols && src_br.y - src_tl.y + 1 == src.rows);
+
+    Size size = src.size();
+    CV_Assert(src_br.x - src_tl.x + 1 == size.width && src_br.y - src_tl.y + 1 == size.height);
 
     Mat xmap(dst_size, CV_32F);
     Mat ymap(dst_size, CV_32F);
@@ -130,7 +134,7 @@ void RotationWarperBase<P>::warpBackward(const Mat &src, const Mat &K, const Mat
 
 
 template <class P>
-Rect RotationWarperBase<P>::warpRoi(Size src_size, const Mat &K, const Mat &R)
+Rect RotationWarperBase<P>::warpRoi(Size src_size, InputArray K, InputArray R)
 {
     projector_.setCameraParams(K, R);
 
diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp
index da5fe26183..cdcb35c201 100644
--- a/modules/stitching/include/opencv2/stitching/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/warpers.hpp
@@ -167,6 +167,24 @@ public:
 };
 #endif
 
+class PlaneWarperOcl: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const { return makePtr<detail::PlaneWarperOcl>(scale); }
+};
+
+class SphericalWarperOcl: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const { return makePtr<detail::SphericalWarperOcl>(scale); }
+};
+
+class CylindricalWarperOcl: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const { return makePtr<detail::CylindricalWarperOcl>(scale); }
+};
+
 } // namespace cv
 
 #endif // __OPENCV_STITCHING_WARPER_CREATORS_HPP__
diff --git a/modules/stitching/src/opencl/warpers.cl b/modules/stitching/src/opencl/warpers.cl
new file mode 100644
index 0000000000..032ddf3cee
--- /dev/null
+++ b/modules/stitching/src/opencl/warpers.cl
@@ -0,0 +1,148 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void buildWarpPlaneMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset,
+                                 __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols,
+                                 __constant float * ck_rinv, __constant float * ct,
+                                 int tl_u, int tl_v, float scale)
+{
+    int du = get_global_id(0);
+    int dv = get_global_id(1);
+
+    if (du < cols && dv < rows)
+    {
+        __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float)));
+        __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float)));
+
+        float u = tl_u + du;
+        float v = tl_v + dv;
+        float x, y;
+
+        float x_ = u / scale - ct[0];
+        float y_ = v / scale - ct[1];
+
+        float z;
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
+
+        x /= z;
+        y /= z;
+
+        xmap[0] = x;
+        ymap[0] = y;
+    }
+}
+
+__kernel void buildWarpCylindricalMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset,
+                                       __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols,
+                                       __constant float * ck_rinv, int tl_u, int tl_v, float scale)
+{
+    int du = get_global_id(0);
+    int dv = get_global_id(1);
+
+    if (du < cols && dv < rows)
+    {
+        __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float)));
+        __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float)));
+
+        float u = tl_u + du;
+        float v = tl_v + dv;
+        float x, y;
+
+        u /= scale;
+        float x_ = sin(u);
+        float y_ = v / scale;
+        float z_ = cos(u);
+
+        float z;
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
+
+        if (z > 0) x /= z, y /= z;
+        else x = y = -1;
+
+        xmap[0] = x;
+        ymap[0] = y;
+    }
+}
+
+__kernel void buildWarpSphericalMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset,
+                                     __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols,
+                                     __constant float * ck_rinv, int tl_u, int tl_v, float scale)
+{
+    int du = get_global_id(0);
+    int dv = get_global_id(1);
+
+    if (du < cols && dv < rows)
+    {
+        __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float)));
+        __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float)));
+
+        float u = tl_u + du;
+        float v = tl_v + dv;
+        float x, y;
+
+        v /= scale;
+        u /= scale;
+
+        float sinv = sin(v);
+        float x_ = sinv * sin(u);
+        float y_ = -cos(v);
+        float z_ = sinv * cos(u);
+
+        float z;
+        x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
+        y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
+        z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
+
+        if (z > 0) x /= z, y /= z;
+        else x = y = -1;
+
+        xmap[0] = x;
+        ymap[0] = y;
+    }
+}
diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp
index 04445176e0..499202fa0f 100644
--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -53,6 +53,7 @@
 #include <sstream>
 #include <cmath>
 #include "opencv2/core.hpp"
+#include "opencv2/core/ocl.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/stitching.hpp"
 #include "opencv2/stitching/detail/autocalib.hpp"
diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp
index 3f71f20405..eb15d44c04 100644
--- a/modules/stitching/src/warpers.cpp
+++ b/modules/stitching/src/warpers.cpp
@@ -45,8 +45,10 @@
 namespace cv {
 namespace detail {
 
-void ProjectorBase::setCameraParams(const Mat &K, const Mat &R, const Mat &T)
+void ProjectorBase::setCameraParams(InputArray _K, InputArray _R, InputArray _T)
 {
+    Mat K = _K.getMat(), R = _R.getMat(), T = _T.getMat();
+
     CV_Assert(K.size() == Size(3, 3) && K.type() == CV_32F);
     CV_Assert(R.size() == Size(3, 3) && R.type() == CV_32F);
     CV_Assert((T.size() == Size(1, 3) || T.size() == Size(3, 1)) && T.type() == CV_32F);
@@ -76,7 +78,7 @@ void ProjectorBase::setCameraParams(const Mat &K, const Mat &R, const Mat &T)
 }
 
 
-Point2f PlaneWarper::warpPoint(const Point2f &pt, const Mat &K, const Mat &R, const Mat &T)
+Point2f PlaneWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T)
 {
     projector_.setCameraParams(K, R, T);
     Point2f uv;
@@ -85,15 +87,17 @@ Point2f PlaneWarper::warpPoint(const Point2f &pt, const Mat &K, const Mat &R, co
 }
 
 
-Rect PlaneWarper::buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap)
+Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray _xmap, OutputArray _ymap)
 {
     projector_.setCameraParams(K, R, T);
 
     Point dst_tl, dst_br;
     detectResultRoi(src_size, dst_tl, dst_br);
 
-    xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
-    ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+
+    Mat xmap = _xmap.getMat(), ymap = _ymap.getMat();
 
     float x, y;
     for (int v = dst_tl.y; v <= dst_br.y; ++v)
@@ -110,8 +114,8 @@ Rect PlaneWarper::buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat
 }
 
 
-Point PlaneWarper::warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-                        Mat &dst)
+Point PlaneWarper::warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+                        OutputArray dst)
 {
     Mat xmap, ymap;
     Rect dst_roi = buildMaps(src.size(), K, R, T, xmap, ymap);
@@ -123,7 +127,7 @@ Point PlaneWarper::warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T
 }
 
 
-Rect PlaneWarper::warpRoi(Size src_size, const Mat &K, const Mat &R, const Mat &T)
+Rect PlaneWarper::warpRoi(Size src_size, InputArray K, InputArray R, InputArray T)
 {
     projector_.setCameraParams(K, R, T);
 
@@ -211,12 +215,12 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b
 
 
 #ifdef HAVE_OPENCV_CUDAWARPING
-Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap);
 }
 
-Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     projector_.setCameraParams(K, R, T);
 
@@ -229,15 +233,15 @@ Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, const
     return Rect(dst_tl, dst_br);
 }
 
-Point PlaneWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                           cuda::GpuMat &dst)
+Point PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                           cuda::GpuMat & dst)
 {
     return warp(src, K, R, Mat::zeros(3, 1, CV_32F), interp_mode, border_mode, dst);
 }
 
 
-Point PlaneWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-                           cuda::GpuMat &dst)
+Point PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+                           cuda::GpuMat & dst)
 {
     Rect dst_roi = buildMaps(src.size(), K, R, T, d_xmap_, d_ymap_);
     dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
@@ -246,7 +250,7 @@ Point PlaneWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R,
 }
 
 
-Rect SphericalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect SphericalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     projector_.setCameraParams(K, R);
 
@@ -260,8 +264,8 @@ Rect SphericalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cu
 }
 
 
-Point SphericalWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                               cuda::GpuMat &dst)
+Point SphericalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                               cuda::GpuMat & dst)
 {
     Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
     dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
@@ -270,7 +274,7 @@ Point SphericalWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat
 }
 
 
-Rect CylindricalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect CylindricalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     projector_.setCameraParams(K, R);
 
@@ -284,8 +288,8 @@ Rect CylindricalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R,
 }
 
 
-Point CylindricalWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                 cuda::GpuMat &dst)
+Point CylindricalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                 cuda::GpuMat & dst)
 {
     Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
     dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
diff --git a/modules/stitching/src/warpers_ocl.cpp b/modules/stitching/src/warpers_ocl.cpp
new file mode 100644
index 0000000000..ce4b8946d8
--- /dev/null
+++ b/modules/stitching/src/warpers_ocl.cpp
@@ -0,0 +1,187 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
+namespace cv {
+namespace detail {
+
+/////////////////////////////////////////// PlaneWarperOcl ////////////////////////////////////////////
+
+Rect PlaneWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap)
+{
+    projector_.setCameraParams(K, R);
+
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+
+    if (ocl::useOpenCL())
+    {
+        ocl::Kernel k("buildWarpPlaneMaps", ocl::stitching::warpers_oclsrc);
+        if (!k.empty())
+        {
+            Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1);
+            xmap.create(dsize, CV_32FC1);
+            ymap.create(dsize, CV_32FC1);
+
+            Mat r_kinv(1, 9, CV_32FC1, projector_.r_kinv), t(1, 3, CV_32FC1, projector_.t);
+            UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(),
+                    ur_kinv = r_kinv.getUMat(ACCESS_READ), ut = t.getUMat(ACCESS_READ);
+
+            k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
+                   ocl::KernelArg::PtrReadOnly(ur_kinv), ocl::KernelArg::PtrReadOnly(ut),
+                   dst_tl.x, dst_tl.y, projector_.scale);
+
+            size_t globalsize[2] = { dsize.width, dsize.height };
+            if (k.run(2, globalsize, NULL, true))
+                return Rect(dst_tl, dst_br);
+        }
+    }
+
+    return PlaneWarper::buildMaps(src_size, K, R, T, xmap, ymap);
+}
+
+Point PlaneWarperOcl::warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode, OutputArray dst)
+{
+    UMat uxmap, uymap;
+    Rect dst_roi = buildMaps(src.size(), K, R, T, uxmap, uymap);
+
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
+    UMat udst = dst.getUMat();
+    remap(src, udst, uxmap, uymap, interp_mode, border_mode);
+
+    return dst_roi.tl();
+}
+
+/////////////////////////////////////////// SphericalWarperOcl ////////////////////////////////////////
+
+Rect SphericalWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
+{
+    projector_.setCameraParams(K, R);
+
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+
+    if (ocl::useOpenCL())
+    {
+        ocl::Kernel k("buildWarpSphericalMaps", ocl::stitching::warpers_oclsrc);
+        if (!k.empty())
+        {
+            Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1);
+            xmap.create(dsize, CV_32FC1);
+            ymap.create(dsize, CV_32FC1);
+
+            Mat r_kinv(1, 9, CV_32FC1, projector_.r_kinv);
+            UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), ur_kinv = r_kinv.getUMat(ACCESS_READ);
+
+            k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
+                   ocl::KernelArg::PtrReadOnly(ur_kinv), dst_tl.x, dst_tl.y, projector_.scale);
+
+            size_t globalsize[2] = { dsize.width, dsize.height };
+            if (k.run(2, globalsize, NULL, true))
+                return Rect(dst_tl, dst_br);
+        }
+    }
+
+    return SphericalWarper::buildMaps(src_size, K, R, xmap, ymap);
+}
+
+Point SphericalWarperOcl::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
+{
+    UMat uxmap, uymap;
+    Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap);
+
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
+    UMat udst = dst.getUMat();
+    remap(src, udst, uxmap, uymap, interp_mode, border_mode);
+
+    return dst_roi.tl();
+}
+
+/////////////////////////////////////////// CylindricalWarperOcl ////////////////////////////////////////
+
+Rect CylindricalWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
+{
+    projector_.setCameraParams(K, R);
+
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+
+    if (ocl::useOpenCL())
+    {
+        ocl::Kernel k("buildWarpCylindricalMaps", ocl::stitching::warpers_oclsrc);
+        if (!k.empty())
+        {
+            Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1);
+            xmap.create(dsize, CV_32FC1);
+            ymap.create(dsize, CV_32FC1);
+
+            Mat r_kinv(1, 9, CV_32FC1, projector_.r_kinv);
+            UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), ur_kinv = r_kinv.getUMat(ACCESS_READ);
+
+            k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
+                   ocl::KernelArg::PtrReadOnly(ur_kinv), dst_tl.x, dst_tl.y, projector_.scale);
+
+            size_t globalsize[2] = { dsize.width, dsize.height };
+            if (k.run(2, globalsize, NULL, true))
+                return Rect(dst_tl, dst_br);
+        }
+    }
+
+    return CylindricalWarper::buildMaps(src_size, K, R, xmap, ymap);
+}
+
+Point CylindricalWarperOcl::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
+{
+    UMat uxmap, uymap;
+    Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap);
+
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
+    UMat udst = dst.getUMat();
+    remap(src, udst, uxmap, uymap, interp_mode, border_mode);
+
+    return dst_roi.tl();
+}
+
+} // namespace detail
+} // namespace cv
diff --git a/modules/stitching/test/ocl/test_warpers.cpp b/modules/stitching/test/ocl/test_warpers.cpp
new file mode 100644
index 0000000000..d2f5dc0091
--- /dev/null
+++ b/modules/stitching/test/ocl/test_warpers.cpp
@@ -0,0 +1,149 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+#include "opencv2/stitching/warpers.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////////////////// WarperTestBase ///////////////////////////
+
+struct WarperTestBase :
+        public Test, public TestUtils
+{
+    Mat src, dst, xmap, ymap;
+    Mat udst, uxmap, uymap;
+    Mat K, R;
+
+    virtual void generateTestData()
+    {
+        Size size = randomSize(1, MAX_VALUE);
+
+        src = randomMat(size, CV_32FC1, -500, 500);
+
+        K = Mat::eye(3, 3, CV_32FC1);
+        R = Mat::eye(3, 3, CV_32FC1);
+    }
+
+    void Near(double threshold = 0.)
+    {
+        EXPECT_MAT_NEAR(xmap, uxmap, threshold);
+        EXPECT_MAT_NEAR(ymap, uymap, threshold);
+        EXPECT_MAT_NEAR(dst, udst, threshold);
+    }
+};
+
+//////////////////////////////// SphericalWarperOcl /////////////////////////////////////////////////
+
+typedef WarperTestBase SphericalWarperOclTest;
+
+OCL_TEST_F(SphericalWarperOclTest, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Ptr<WarperCreator> creator = makePtr<SphericalWarperOcl>();
+        Ptr<detail::RotationWarper> warper = creator->create(2.0);
+
+        OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap));
+        OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap));
+
+        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
+        OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));
+
+        Near(1e-4);
+    }
+}
+
+//////////////////////////////// CylindricalWarperOcl /////////////////////////////////////////////////
+
+typedef WarperTestBase CylindricalWarperOclTest;
+
+OCL_TEST_F(CylindricalWarperOclTest, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Ptr<WarperCreator> creator = makePtr<CylindricalWarperOcl>();
+        Ptr<detail::RotationWarper> warper = creator->create(2.0);
+
+        OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap));
+        OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap));
+
+        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
+        OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));
+
+        Near(1e-4);
+    }
+}
+
+//////////////////////////////// PlaneWarperOcl /////////////////////////////////////////////////
+
+typedef WarperTestBase PlaneWarperOclTest;
+
+OCL_TEST_F(PlaneWarperOclTest, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Ptr<WarperCreator> creator = makePtr<PlaneWarperOcl>();
+        Ptr<detail::RotationWarper> warper = creator->create(2.0);
+
+        OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap));
+        OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap));
+
+        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
+        OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));
+
+        Near(1e-5);
+    }
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp
index 57220c7a7a..2fea52a95e 100644
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@@ -318,6 +318,8 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
 #endif // IMPLEMENT_PARAM_CLASS
 
 #define OCL_TEST_P TEST_P
+#define OCL_TEST_F(name, ...) typedef name OCL_##name; TEST_F(OCL_##name, __VA_ARGS__)
+#define OCL_TEST(name, ...) TEST(OCL_##name, __VA_ARGS__)
 
 #define OCL_OFF(fn) cv::ocl::setUseOpenCL(false); fn
 #define OCL_ON(fn) cv::ocl::setUseOpenCL(true); fn
diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp
index b6eefc6f99..5eb3df46cc 100644
--- a/samples/cpp/stitching_detailed.cpp
+++ b/samples/cpp/stitching_detailed.cpp
@@ -71,8 +71,11 @@ static void printUsage()
         "  --preview\n"
         "      Run stitching in the preview mode. Works faster than usual mode,\n"
         "      but output image will have lower resolution.\n"
-        "  --try_gpu (yes|no)\n"
-        "      Try to use GPU. The default value is 'no'. All default values\n"
+        "  --try_cuda (yes|no)\n"
+        "      Try to use CUDA. The default value is 'no'. All default values\n"
+        "      are for CPU mode.\n"
+        "  --try_ocl (yes|no)\n"
+        "      Try to use OpenCL. The default value is 'no'. All default values\n"
         "      are for CPU mode.\n"
         "\nMotion Estimation Flags:\n"
         "  --work_megapix <float>\n"
@@ -123,7 +126,8 @@ static void printUsage()
 // Default command line args
 vector<String> img_names;
 bool preview = false;
-bool try_gpu = false;
+bool try_cuda = false;
+bool try_ocl = false;
 double work_megapix = 0.6;
 double seam_megapix = 0.1;
 double compose_megapix = -1;
@@ -161,15 +165,28 @@ static int parseCmdArgs(int argc, char** argv)
         {
             preview = true;
         }
-        else if (string(argv[i]) == "--try_gpu")
+        else if (string(argv[i]) == "--try_cuda")
         {
             if (string(argv[i + 1]) == "no")
-                try_gpu = false;
+                try_cuda = false;
             else if (string(argv[i + 1]) == "yes")
-                try_gpu = true;
+                try_cuda = true;
             else
             {
-                cout << "Bad --try_gpu flag value\n";
+                cout << "Bad --try_cuda flag value\n";
+                return -1;
+            }
+            i++;
+        }
+        else if (string(argv[i]) == "--try_ocl")
+        {
+            if (string(argv[i + 1]) == "no")
+                try_ocl = false;
+            else if (string(argv[i + 1]) == "yes")
+                try_ocl = true;
+            else
+            {
+                cout << "Bad --try_ocl flag value\n";
                 return -1;
             }
             i++;
@@ -357,7 +374,7 @@ int main(int argc, char* argv[])
     if (features_type == "surf")
     {
 #ifdef HAVE_OPENCV_NONFREE
-        if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+        if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
             finder = makePtr<SurfFeaturesFinderGpu>();
         else
 #endif
@@ -430,7 +447,7 @@ int main(int argc, char* argv[])
     t = getTickCount();
 #endif
     vector<MatchesInfo> pairwise_matches;
-    BestOf2NearestMatcher matcher(try_gpu, match_conf);
+    BestOf2NearestMatcher matcher(try_cuda, match_conf);
     matcher(features, pairwise_matches);
     matcher.collectGarbage();
     LOGLN("Pairwise matching, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec");
@@ -552,8 +569,17 @@ int main(int argc, char* argv[])
     // Warp images and their masks
 
     Ptr<WarperCreator> warper_creator;
+    if (try_ocl)
+    {
+        if (warp_type == "plane")
+            warper_creator = makePtr<cv::PlaneWarperOcl>();
+        else if (warp_type == "cylindrical")
+            warper_creator = makePtr<cv::CylindricalWarperOcl>();
+        else if (warp_type == "spherical")
+            warper_creator = makePtr<cv::SphericalWarperOcl>();
+    }
 #ifdef HAVE_OPENCV_CUDAWARPING
-    if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+    else if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
     {
         if (warp_type == "plane")
             warper_creator = makePtr<cv::PlaneWarperGpu>();
@@ -636,7 +662,7 @@ int main(int argc, char* argv[])
     else if (seam_find_type == "gc_color")
     {
 #ifdef HAVE_OPENCV_CUDA
-        if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+        if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
             seam_finder = makePtr<detail::GraphCutSeamFinderGpu>(GraphCutSeamFinderBase::COST_COLOR);
         else
 #endif
@@ -645,7 +671,7 @@ int main(int argc, char* argv[])
     else if (seam_find_type == "gc_colorgrad")
     {
 #ifdef HAVE_OPENCV_CUDA
-        if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+        if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
             seam_finder = makePtr<detail::GraphCutSeamFinderGpu>(GraphCutSeamFinderBase::COST_COLOR_GRAD);
         else
 #endif
@@ -755,11 +781,11 @@ int main(int argc, char* argv[])
 
         if (!blender)
         {
-            blender = Blender::createDefault(blend_type, try_gpu);
+            blender = Blender::createDefault(blend_type, try_cuda);
             Size dst_sz = resultRoi(corners, sizes).size();
             float blend_width = sqrt(static_cast<float>(dst_sz.area())) * blend_strength / 100.f;
             if (blend_width < 1.f)
-                blender = Blender::createDefault(Blender::NO, try_gpu);
+                blender = Blender::createDefault(Blender::NO, try_cuda);
             else if (blend_type == Blender::MULTI_BAND)
             {
                 MultiBandBlender* mb = dynamic_cast<MultiBandBlender*>(blender.get());