From 405f12fe80541d7357ffd988d5bae6d41795b5db Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 2 Jun 2014 14:21:03 +0400 Subject: [PATCH] optimized cv::flip (CV_8UC1) --- modules/core/src/copy.cpp | 23 +++++++++++----------- modules/core/src/opencl/flip.cl | 34 +++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp index 758a49dab4..6900b51803 100644 --- a/modules/core/src/copy.cpp +++ b/modules/core/src/copy.cpp @@ -610,13 +610,13 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, #ifdef HAVE_OPENCL -#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS }; static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) { - CV_Assert(flipCode >= - 1 && flipCode <= 1); - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), flipType; + CV_Assert(flipCode >= -1 && flipCode <= 1); + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);; if (cn > 4) return false; @@ -631,10 +631,12 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) ocl::Device dev = ocl::Device::getDefault(); int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1; + kercn = std::max(kercn, cn); ocl::Kernel k(kernelName, ocl::core::flip_oclsrc, - format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d", ocl::memopTypeToStr(type), - ocl::memopTypeToStr(depth), cn, pxPerWIy)); + format( "-D T=%s -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d", + ocl::memopTypeToStr(CV_MAKE_TYPE(depth, kercn)), + ocl::memopTypeToStr(depth), cn, pxPerWIy, kercn)); if (k.empty()) return false; @@ -642,20 +644,19 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode ) _dst.create(size, type); UMat src = _src.getUMat(), dst = _dst.getUMat(); - int cols = size.width, rows = size.height; + int cols = size.width * cn / kercn, rows = size.height; cols = flipType == FLIP_COLS ? (cols + 1) >> 1 : cols; rows = flipType & FLIP_ROWS ? (rows + 1) >> 1 : rows; k.args(ocl::KernelArg::ReadOnlyNoSize(src), - ocl::KernelArg::WriteOnly(dst), rows, cols); + ocl::KernelArg::WriteOnly(dst, cn, kercn), rows, cols); size_t maxWorkGroupSize = dev.maxWorkGroupSize(); CV_Assert(maxWorkGroupSize % 4 == 0); - size_t globalsize[2] = { cols, rows }, localsize[2] = { maxWorkGroupSize / 4, 4 }; - globalsize[1] = DIVUP(globalsize[1], pxPerWIy); - - return k.run(2, globalsize, (flipType == FLIP_COLS) && (!dev.isIntel()) ? localsize : NULL, false); + size_t globalsize[2] = { cols, (rows + pxPerWIy - 1) / pxPerWIy }, + localsize[2] = { maxWorkGroupSize / 4, 4 }; + return k.run(2, globalsize, (flipType == FLIP_COLS) && !dev.isIntel() ? localsize : NULL, false); } #endif diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl index cf518826a7..bd670a5b72 100644 --- a/modules/core/src/opencl/flip.cl +++ b/modules/core/src/opencl/flip.cl @@ -39,7 +39,7 @@ // //M*/ -#if cn != 3 +#if kercn != 3 #define loadpix(addr) *(__global const T *)(addr) #define storepix(val, addr) *(__global T *)(addr) = val #define TSIZE (int)sizeof(T) @@ -54,7 +54,7 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int int rows, int cols, int thread_rows, int thread_cols) { int x = get_global_id(0); - int y0 = get_global_id(1)*PIX_PER_WI_Y; + int y0 = get_global_id(1) * PIX_PER_WI_Y; if (x < cols) { @@ -100,6 +100,21 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, T src0 = loadpix(srcptr + src_index0); T src1 = loadpix(srcptr + src_index1); +#if kercn == 2 +#if cn == 1 + src0 = src0.s10; + src1 = src1.s10; +#endif +#elif kercn == 4 +#if cn == 1 + src0 = src0.s3210; + src1 = src1.s3210; +#elif cn == 2 + src0 = src0.s2301; + src1 = src1.s2301; +#endif +#endif + storepix(src1, dstptr + dst_index0); storepix(src0, dstptr + dst_index1); @@ -131,6 +146,21 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int T src0 = loadpix(srcptr + src_index0); T src1 = loadpix(srcptr + src_index1); +#if kercn == 2 +#if cn == 1 + src0 = src0.s10; + src1 = src1.s10; +#endif +#elif kercn == 4 +#if cn == 1 + src0 = src0.s3210; + src1 = src1.s3210; +#elif cn == 2 + src0 = src0.s2301; + src1 = src1.s2301; +#endif +#endif + storepix(src1, dstptr + dst_index0); storepix(src0, dstptr + dst_index1);