From fc10ffefb838343d91be2b3252eee11292cc9c9f Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Thu, 3 Apr 2014 12:04:35 +0400 Subject: [PATCH] Enabled integer arithmetic for row_filter_C1_D0 --- modules/imgproc/src/filter.cpp | 13 ++++++------- modules/imgproc/src/opencl/filterSepRow.cl | 20 ++++++++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 7d870d72f8..d9469530f3 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3475,14 +3475,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, int bdepth = CV_32F; bool int_arithm = false; - if( sdepth == CV_8U && - ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ddepth == CV_8U))) + if( sdepth == CV_8U && ddepth == CV_8U && + rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) { bdepth = CV_32S; - _kernelX.getMat().reshape(1,1).convertTo( kernelX, CV_32S, 1 << shift_bits ); - _kernelY.getMat().reshape(1,1).convertTo( kernelY, CV_32S, 1 << shift_bits ); + kernelX.convertTo( kernelX, CV_32S, 1 << shift_bits ); + kernelY.convertTo( kernelY, CV_32S, 1 << shift_bits ); int_arithm = true; } @@ -3500,7 +3499,7 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, src.locateROI(srcWholeSize, srcOffset); bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0 && !int_arithm; + src.cols % 4 == 0 && src.step % 4 == 0; Size srcSize = src.size(); Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); diff --git a/modules/imgproc/src/opencl/filterSepRow.cl b/modules/imgproc/src/opencl/filterSepRow.cl index 890eeb8cbb..472ac4c91e 100644 --- a/modules/imgproc/src/opencl/filterSepRow.cl +++ b/modules/imgproc/src/opencl/filterSepRow.cl @@ -141,6 +141,14 @@ #define DIG(a) a, __constant dstT1 mat_kernel[] = { COEFF }; +#ifndef INTEGER_ARITHMETIC +#define dstT4 float4 +#define convertDstVec convert_float4 +#else +#define dstT4 int4 +#define convertDstVec convert_int4 +#endif + __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, __global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, @@ -156,7 +164,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel int start_y = y + src_offset_y - radiusy; int start_addr = mad24(start_y, src_step_in_pixel, start_x); - float4 sum; + dstT4 sum; uchar4 temp[READ_TIMES_ROW]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; @@ -250,19 +258,23 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel barrier(CLK_LOCAL_MEM_FENCE); // read pixels from lds and calculate the result - sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; + sum = convertDstVec(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; for (int i = 1; i <= RADIUSX; ++i) { temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); - sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]); +#ifndef INTEGER_ARITHMETIC + sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); +#else + sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); +#endif } start_addr = mad24(y, dst_step_in_pixel, x); // write the result to dst if ((x+3