diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index 98ec14f5fd..f742558540 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -1585,30 +1585,31 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, st
 {
     dst.create(src.size(), src.type());
 
-    int channels = dst.oclchannels(), depth = dst.depth();
-
-    size_t vector_length = 1;
-    int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-    int rows = dst.rows;
-
     size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { cols, rows, 1 };
+    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
+
+    int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
+    int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
+    int temp1_step = temp1.step / temp1.elemSize(), temp1_offset = temp1.offset / temp1.elemSize();
 
     vector<pair<size_t , const void *> > args;
     args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&temp1.data ));
     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src_step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&temp1_step ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.rows ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.cols ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&temp1_offset ));
 
-    openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
+    openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, -1, dst.depth());
 }
+
 void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
 {
     CV_Assert(x.depth() == CV_32F && t.depth() == CV_32F);
diff --git a/modules/ocl/src/opencl/imgproc_convolve.cl b/modules/ocl/src/opencl/imgproc_convolve.cl
index 76e7cfc55b..db7a7dfc3e 100644
--- a/modules/ocl/src/opencl/imgproc_convolve.cl
+++ b/modules/ocl/src/opencl/imgproc_convolve.cl
@@ -48,9 +48,12 @@
 #elif defined (__NVIDIA__)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
+
 /************************************** convolve **************************************/
-__kernel void convolve_D5 (__global float *src, __global float *temp1, __global float *dst,
-                                  int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight)
+
+__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
+                          int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
+                          int src_offset, int dst_offset, int koffset)
 {
     __local float smem[16 + 2 * 8][16 + 2 * 8];
 
@@ -65,7 +68,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
             // 0 | 0 0 | 0
             // -----------
             // 0 | 0 0 | 0
-    smem[y][x] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)];
+    smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
 
             // 0 | 0 x | x
             // -----------
@@ -73,7 +76,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
             // 0 | 0 0 | 0
             // -----------
             // 0 | 0 0 | 0
-    smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
+    smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
 
             // 0 | 0 0 | 0
             // -----------
@@ -81,7 +84,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
             // x | x 0 | 0
             // -----------
             // x | x 0 | 0
-    smem[y + 16][x] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)];
+    smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
 
             // 0 | 0 0 | 0
             // -----------
@@ -89,21 +92,18 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
             // 0 | 0 x | x
             // -----------
             // 0 | 0 x | x
-    smem[y + 16][x + 16] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)];
+    smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
     if (gx < cols && gy < rows)
     {
-       float res = 0;
+        float res = 0;
 
         for (int i = 0; i < kHeight; ++i)
-        {
             for (int j = 0; j < kWidth; ++j)
-            {
-                res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * (k_step>>2) + j];
-            }
-        }
-        dst[gy*(dst_step >> 2)+gx] = res;
-   }
+                res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset];
+
+        dst[gy * dst_step + gx + dst_offset] = res;
+    }
 }