diff --git a/modules/core/src/split.dispatch.cpp b/modules/core/src/split.dispatch.cpp new file mode 100644 index 0000000000..ac1b7217bb --- /dev/null +++ b/modules/core/src/split.dispatch.cpp @@ -0,0 +1,413 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + + +#include "precomp.hpp" +#include "opencl_kernels_core.hpp" + +namespace cv { namespace hal { + +#if CV_SIMD +// see the comments for vecmerge_ in merge.cpp +template static void +vecsplit_( const T* src, T** dst, int len, int cn ) +{ + const int VECSZ = VecT::nlanes; + int i, i0 = 0; + T* dst0 = dst[0]; + T* dst1 = dst[1]; + + int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T))); + int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T))); + int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0; + int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0; + + hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE; + if( (r0|r1|r2|r3) != 0 ) + { + mode = hal::STORE_UNALIGNED; + if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % sizeof(T) == 0 && len > VECSZ*2 ) + i0 = VECSZ - (r0 / sizeof(T)); + } + + if( cn == 2 ) + { + for( i = 0; i < len; i += VECSZ ) + { + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } + VecT a, b; + v_load_deinterleave(src + i*cn, a, b); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } + } + } + else if( cn == 3 ) + { + T* dst2 = dst[2]; + for( i = 0; i < len; i += VECSZ ) + { + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } + VecT a, b, c; + v_load_deinterleave(src + i*cn, a, b, c); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + v_store(dst2 + i, c, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } + } + } + else + { + CV_Assert( cn == 4 ); + T* dst2 = dst[2]; + T* dst3 = dst[3]; + for( i = 0; i < len; i += VECSZ ) + { + if( i > len - VECSZ ) + { + i = len - VECSZ; + mode = hal::STORE_UNALIGNED; + } + VecT a, b, c, d; + v_load_deinterleave(src + i*cn, a, b, c, d); + v_store(dst0 + i, a, mode); + v_store(dst1 + i, b, mode); + v_store(dst2 + i, c, mode); + v_store(dst3 + i, d, mode); + if( i < i0 ) + { + i = i0 - VECSZ; + mode = hal::STORE_ALIGNED_NOCACHE; + } + } + } + vx_cleanup(); +} +#endif + +template static void +split_( const T* src, T** dst, int len, int cn ) +{ + int k = cn % 4 ? cn % 4 : 4; + int i, j; + if( k == 1 ) + { + T* dst0 = dst[0]; + + if(cn == 1) + { + memcpy(dst0, src, len * sizeof(T)); + } + else + { + for( i = 0, j = 0 ; i < len; i++, j += cn ) + dst0[i] = src[j]; + } + } + else if( k == 2 ) + { + T *dst0 = dst[0], *dst1 = dst[1]; + i = j = 0; + + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + } + } + else if( k == 3 ) + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; + i = j = 0; + + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + dst2[i] = src[j+2]; + } + } + else + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; + i = j = 0; + + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } + + for( ; k < cn; k += 4 ) + { + T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; + for( i = 0, j = k; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } +} + +void split8u(const uchar* src, uchar** dst, int len, int cn ) +{ + CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn) + +#if CV_SIMD + if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); +} + +void split16u(const ushort* src, ushort** dst, int len, int cn ) +{ + CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn) +#if CV_SIMD + if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); +} + +void split32s(const int* src, int** dst, int len, int cn ) +{ + CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn) +#if CV_SIMD + if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); +} + +void split64s(const int64* src, int64** dst, int len, int cn ) +{ + CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn) +#if CV_SIMD + if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); +} + +}} // cv::hal:: + +/****************************************************************************************\ +* split & merge * +\****************************************************************************************/ + +typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn); + +static SplitFunc getSplitFunc(int depth) +{ + static SplitFunc splitTab[] = + { + (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), + (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0 + }; + + return splitTab[depth]; +} + +#ifdef HAVE_IPP + +namespace cv { +static bool ipp_split(const Mat& src, Mat* mv, int channels) +{ +#ifdef HAVE_IPP_IW_LL + CV_INSTRUMENT_REGION_IPP(); + + if(channels != 3 && channels != 4) + return false; + + if(src.dims <= 2) + { + IppiSize size = ippiSize(src.size()); + void *dstPtrs[4] = {NULL}; + size_t dstStep = mv[0].step; + for(int i = 0; i < channels; i++) + { + dstPtrs[i] = mv[i].ptr(); + if(dstStep != mv[i].step) + return false; + } + + return CV_INSTRUMENT_FUN_IPP(llwiCopySplit, src.ptr(), (int)src.step, dstPtrs, (int)dstStep, size, (int)src.elemSize1(), channels, 0) >= 0; + } + else + { + const Mat *arrays[5] = {NULL}; + uchar *ptrs[5] = {NULL}; + arrays[0] = &src; + + for(int i = 1; i < channels; i++) + { + arrays[i] = &mv[i-1]; + } + + NAryMatIterator it(arrays, ptrs); + IppiSize size = { (int)it.size, 1 }; + + for( size_t i = 0; i < it.nplanes; i++, ++it ) + { + if(CV_INSTRUMENT_FUN_IPP(llwiCopySplit, ptrs[0], 0, (void**)&ptrs[1], 0, size, (int)src.elemSize1(), channels, 0) < 0) + return false; + } + return true; + } +#else + CV_UNUSED(src); CV_UNUSED(mv); CV_UNUSED(channels); + return false; +#endif +} +} +#endif + +void cv::split(const Mat& src, Mat* mv) +{ + CV_INSTRUMENT_REGION(); + + int k, depth = src.depth(), cn = src.channels(); + if( cn == 1 ) + { + src.copyTo(mv[0]); + return; + } + + for( k = 0; k < cn; k++ ) + { + mv[k].create(src.dims, src.size, depth); + } + + CV_IPP_RUN_FAST(ipp_split(src, mv, cn)); + + SplitFunc func = getSplitFunc(depth); + CV_Assert( func != 0 ); + + size_t esz = src.elemSize(), esz1 = src.elemSize1(); + size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; + AutoBuffer _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16); + const Mat** arrays = (const Mat**)_buf.data(); + uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16); + + arrays[0] = &src; + for( k = 0; k < cn; k++ ) + { + arrays[k+1] = &mv[k]; + } + + NAryMatIterator it(arrays, ptrs, cn+1); + size_t total = it.size; + size_t blocksize = std::min((size_t)CV_SPLIT_MERGE_MAX_BLOCK_SIZE(cn), cn <= 4 ? total : std::min(total, blocksize0)); + + for( size_t i = 0; i < it.nplanes; i++, ++it ) + { + for( size_t j = 0; j < total; j += blocksize ) + { + size_t bsz = std::min(total - j, blocksize); + func( ptrs[0], &ptrs[1], (int)bsz, cn ); + + if( j + blocksize < total ) + { + ptrs[0] += bsz*esz; + for( k = 0; k < cn; k++ ) + ptrs[k+1] += bsz*esz1; + } + } + } +} + +#ifdef HAVE_OPENCL + +namespace cv { + +static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv ) +{ + int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), + rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; + + String dstargs, processelem, indexdecl; + for (int i = 0; i < cn; ++i) + { + dstargs += format("DECLARE_DST_PARAM(%d)", i); + indexdecl += format("DECLARE_INDEX(%d)", i); + processelem += format("PROCESS_ELEM(%d)", i); + } + + ocl::Kernel k("split", ocl::core::split_merge_oclsrc, + format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s" + " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s", + ocl::memopTypeToStr(depth), cn, dstargs.c_str(), + processelem.c_str(), indexdecl.c_str())); + if (k.empty()) + return false; + + Size size = _m.size(); + _mv.create(cn, 1, depth); + for (int i = 0; i < cn; ++i) + _mv.create(size, depth, i); + + std::vector dst; + _mv.getUMatVector(dst); + + int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat())); + for (int i = 0; i < cn; ++i) + argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i])); + k.set(argidx, rowsPerWI); + + size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, NULL, false); +} + +} + +#endif + +void cv::split(InputArray _m, OutputArrayOfArrays _mv) +{ + CV_INSTRUMENT_REGION(); + + CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(), + ocl_split(_m, _mv)) + + Mat m = _m.getMat(); + if( m.empty() ) + { + _mv.release(); + return; + } + + CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() ); + + int depth = m.depth(), cn = m.channels(); + _mv.create(cn, 1, depth); + for (int i = 0; i < cn; ++i) + _mv.create(m.dims, m.size.p, depth, i); + + std::vector dst; + _mv.getMatVector(dst); + + split(m, &dst[0]); +}