diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 98ec14f5fd..f742558540 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -1585,30 +1585,31 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, st { dst.create(src.size(), src.type()); - int channels = dst.oclchannels(), depth = dst.depth(); - - size_t vector_length = 1; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1); - int cols = divUp(dst.cols * channels + offset_cols, vector_length); - int rows = dst.rows; - size_t localThreads[3] = { 16, 16, 1 }; - size_t globalThreads[3] = { cols, rows, 1 }; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; + + int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize(); + int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize(); + int temp1_step = temp1.step / temp1.elemSize(), temp1_offset = temp1.offset / temp1.elemSize(); vector<pair<size_t , const void *> > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&temp1.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src_step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&temp1_step )); args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.cols )); + args.push_back( make_pair( sizeof(cl_int), (void 
*)&src_offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset )); + args.push_back( make_pair( sizeof(cl_int), (void *)&temp1_offset )); - openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth); + openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, -1, dst.depth()); } + void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y) { CV_Assert(x.depth() == CV_32F && t.depth() == CV_32F); diff --git a/modules/ocl/src/opencl/imgproc_convolve.cl b/modules/ocl/src/opencl/imgproc_convolve.cl index 76e7cfc55b..db7a7dfc3e 100644 --- a/modules/ocl/src/opencl/imgproc_convolve.cl +++ b/modules/ocl/src/opencl/imgproc_convolve.cl @@ -48,9 +48,12 @@ #elif defined (__NVIDIA__) #pragma OPENCL EXTENSION cl_khr_fp64:enable #endif + /************************************** convolve **************************************/ -__kernel void convolve_D5 (__global float *src, __global float *temp1, __global float *dst, - int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight) + +__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst, + int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight, + int src_offset, int dst_offset, int koffset) { __local float smem[16 + 2 * 8][16 + 2 * 8]; @@ -65,7 +68,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global // 0 | 0 0 | 0 // ----------- // 0 | 0 0 | 0 - smem[y][x] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)]; + smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; // 0 | 0 x | x // ----------- @@ -73,7 +76,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global // 0 | 0 0 | 0 // ----------- // 0 | 0 0 | 0 - smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)]; + smem[y][x + 
16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; // 0 | 0 0 | 0 // ----------- @@ -81,7 +84,7 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global // x | x 0 | 0 // ----------- // x | x 0 | 0 - smem[y + 16][x] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(max(gx - 8, 0), cols - 1)]; + smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; // 0 | 0 0 | 0 // ----------- @@ -89,21 +92,18 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global // 0 | 0 x | x // ----------- // 0 | 0 x | x - smem[y + 16][x + 16] = src[min(gy + 8, rows - 1)*(src_step >> 2) + min(gx + 8, cols - 1)]; + smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; barrier(CLK_LOCAL_MEM_FENCE); if (gx < cols && gy < rows) { - float res = 0; + float res = 0; for (int i = 0; i < kHeight; ++i) - { for (int j = 0; j < kWidth; ++j) - { - res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * (k_step>>2) + j]; - } - } - dst[gy*(dst_step >> 2)+gx] = res; - } + res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset]; + + dst[gy * dst_step + gx + dst_offset] = res; + } }