From 82cd2f8c939751c2a4b4c5bfe46acd7a48907b11 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 22 Feb 2019 09:17:51 +0000 Subject: [PATCH 1/3] core: clone split.dispatch.cpp --- modules/core/src/{split.cpp => split.dispatch.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/core/src/{split.cpp => split.dispatch.cpp} (100%) diff --git a/modules/core/src/split.cpp b/modules/core/src/split.dispatch.cpp similarity index 100% rename from modules/core/src/split.cpp rename to modules/core/src/split.dispatch.cpp From 0311770e8bb7c1e030562b954c8aaee88f006666 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 22 Feb 2019 09:18:27 +0000 Subject: [PATCH 2/3] core: clone split.simd.hpp --- modules/core/src/{split.cpp => split.simd.hpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/core/src/{split.cpp => split.simd.hpp} (100%) diff --git a/modules/core/src/split.cpp b/modules/core/src/split.simd.hpp similarity index 100% rename from modules/core/src/split.cpp rename to modules/core/src/split.simd.hpp From 91d152e2c2296c1ab5533820e94a968528b27087 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 22 Feb 2019 09:35:32 +0000 Subject: [PATCH 3/3] core: dispatch split.cpp --- modules/core/CMakeLists.txt | 1 + modules/core/src/split.dispatch.cpp | 213 +++------------------------ modules/core/src/split.simd.hpp | 218 ++-------------------------- 3 files changed, 35 insertions(+), 397 deletions(-) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 0cf404e805..1895333fed 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -8,6 +8,7 @@ ocv_add_dispatched_file(convert_scale SSE2 AVX2) ocv_add_dispatched_file(count_non_zero SSE2 AVX2) ocv_add_dispatched_file(matmul SSE2 AVX2) ocv_add_dispatched_file(mean SSE2 AVX2) +ocv_add_dispatched_file(split SSE2 AVX2) ocv_add_dispatched_file(sum SSE2 AVX2) # dispatching for accuracy tests diff --git a/modules/core/src/split.dispatch.cpp b/modules/core/src/split.dispatch.cpp index ac1b7217bb..e41eaf1fe7 100644 --- a/modules/core/src/split.dispatch.cpp +++ b/modules/core/src/split.dispatch.cpp @@ -6,213 +6,44 @@ #include "precomp.hpp" #include "opencl_kernels_core.hpp" +#include "split.simd.hpp" +#include "split.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + namespace cv { namespace hal { -#if CV_SIMD -// see the comments for vecmerge_ in merge.cpp -template static void -vecsplit_( const T* src, T** dst, int len, int cn ) -{ - const int VECSZ = VecT::nlanes; - int i, i0 = 0; - T* dst0 = dst[0]; - T* dst1 = dst[1]; - - int r0 = (int)((size_t)(void*)dst0 % (VECSZ*sizeof(T))); - int r1 = (int)((size_t)(void*)dst1 % (VECSZ*sizeof(T))); - int r2 = cn > 2 ? (int)((size_t)(void*)dst[2] % (VECSZ*sizeof(T))) : r0; - int r3 = cn > 3 ? (int)((size_t)(void*)dst[3] % (VECSZ*sizeof(T))) : r0; - - hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE; - if( (r0|r1|r2|r3) != 0 ) - { - mode = hal::STORE_UNALIGNED; - if( r0 == r1 && r0 == r2 && r0 == r3 && r0 % sizeof(T) == 0 && len > VECSZ*2 ) - i0 = VECSZ - (r0 / sizeof(T)); - } - - if( cn == 2 ) - { - for( i = 0; i < len; i += VECSZ ) - { - if( i > len - VECSZ ) - { - i = len - VECSZ; - mode = hal::STORE_UNALIGNED; - } - VecT a, b; - v_load_deinterleave(src + i*cn, a, b); - v_store(dst0 + i, a, mode); - v_store(dst1 + i, b, mode); - if( i < i0 ) - { - i = i0 - VECSZ; - mode = hal::STORE_ALIGNED_NOCACHE; - } - } - } - else if( cn == 3 ) - { - T* dst2 = dst[2]; - for( i = 0; i < len; i += VECSZ ) - { - if( i > len - VECSZ ) - { - i = len - VECSZ; - mode = hal::STORE_UNALIGNED; - } - VecT a, b, c; - v_load_deinterleave(src + i*cn, a, b, c); - v_store(dst0 + i, a, mode); - v_store(dst1 + i, b, mode); - v_store(dst2 + i, c, mode); - if( i < i0 ) - { - i = i0 - VECSZ; - mode = hal::STORE_ALIGNED_NOCACHE; - } - } - } - else - { - CV_Assert( cn == 4 ); - T* dst2 = dst[2]; - T* dst3 = dst[3]; - for( i = 0; i < len; i += VECSZ ) - { - if( i > len - VECSZ ) - { - i = len - VECSZ; - mode = hal::STORE_UNALIGNED; - } - VecT a, b, c, d; - v_load_deinterleave(src + i*cn, a, b, c, d); - v_store(dst0 + i, a, mode); - v_store(dst1 + i, b, mode); - v_store(dst2 + i, c, mode); - v_store(dst3 + i, d, mode); - if( i < i0 ) - { - i = i0 - VECSZ; - mode = hal::STORE_ALIGNED_NOCACHE; - } - } - } - vx_cleanup(); -} -#endif - -template static void -split_( const T* src, T** dst, int len, int cn ) -{ - int k = cn % 4 ? cn % 4 : 4; - int i, j; - if( k == 1 ) - { - T* dst0 = dst[0]; - - if(cn == 1) - { - memcpy(dst0, src, len * sizeof(T)); - } - else - { - for( i = 0, j = 0 ; i < len; i++, j += cn ) - dst0[i] = src[j]; - } - } - else if( k == 2 ) - { - T *dst0 = dst[0], *dst1 = dst[1]; - i = j = 0; - - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - } - } - else if( k == 3 ) - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; - i = j = 0; - - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - dst2[i] = src[j+2]; - } - } - else - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; - i = j = 0; - - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } - - for( ; k < cn; k += 4 ) - { - T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; - for( i = 0, j = k; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } -} - void split8u(const uchar* src, uchar** dst, int len, int cn ) { + CV_INSTRUMENT_REGION(); CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn) - -#if CV_SIMD - if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) - vecsplit_(src, dst, len, cn); - else -#endif - split_(src, dst, len, cn); + CV_CPU_DISPATCH(split8u, (src, dst, len, cn), + CV_CPU_DISPATCH_MODES_ALL); } void split16u(const ushort* src, ushort** dst, int len, int cn ) { + CV_INSTRUMENT_REGION(); CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn) -#if CV_SIMD - if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) - vecsplit_(src, dst, len, cn); - else -#endif - split_(src, dst, len, cn); + CV_CPU_DISPATCH(split16u, (src, dst, len, cn), + CV_CPU_DISPATCH_MODES_ALL); } void split32s(const int* src, int** dst, int len, int cn ) { + CV_INSTRUMENT_REGION(); CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn) -#if CV_SIMD - if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 ) - vecsplit_(src, dst, len, cn); - else -#endif - split_(src, dst, len, cn); + CV_CPU_DISPATCH(split32s, (src, dst, len, cn), + CV_CPU_DISPATCH_MODES_ALL); } void split64s(const int64* src, int64** dst, int len, int cn ) { + CV_INSTRUMENT_REGION(); CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn) -#if CV_SIMD - if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) - vecsplit_(src, dst, len, cn); - else -#endif - split_(src, dst, len, cn); + CV_CPU_DISPATCH(split64s, (src, dst, len, cn), + CV_CPU_DISPATCH_MODES_ALL); } -}} // cv::hal:: +} // namespace cv::hal:: /****************************************************************************************\ * split & merge * @@ -233,7 +64,6 @@ static SplitFunc getSplitFunc(int depth) #ifdef HAVE_IPP -namespace cv { static bool ipp_split(const Mat& src, Mat* mv, int channels) { #ifdef HAVE_IPP_IW_LL @@ -282,10 +112,9 @@ static bool ipp_split(const Mat& src, Mat* mv, int channels) return false; #endif } -} #endif -void cv::split(const Mat& src, Mat* mv) +void split(const Mat& src, Mat* mv) { CV_INSTRUMENT_REGION(); @@ -341,8 +170,6 @@ void cv::split(const Mat& src, Mat* mv) #ifdef HAVE_OPENCL -namespace cv { - static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv ) { int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), @@ -381,11 +208,9 @@ static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv ) return k.run(2, globalsize, NULL, false); } -} - #endif -void cv::split(InputArray _m, OutputArrayOfArrays _mv) +void split(InputArray _m, OutputArrayOfArrays _mv) { CV_INSTRUMENT_REGION(); @@ -411,3 +236,5 @@ void cv::split(InputArray _m, OutputArrayOfArrays _mv) split(m, &dst[0]); } + +} // namespace diff --git a/modules/core/src/split.simd.hpp b/modules/core/src/split.simd.hpp index ac1b7217bb..25e90c0063 100644 --- a/modules/core/src/split.simd.hpp +++ b/modules/core/src/split.simd.hpp @@ -4,9 +4,16 @@ #include "precomp.hpp" -#include "opencl_kernels_core.hpp" namespace cv { namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void split8u(const uchar* src, uchar** dst, int len, int cn); +void split16u(const ushort* src, ushort** dst, int len, int cn); +void split32s(const int* src, int** dst, int len, int cn); +void split64s(const int64* src, int64** dst, int len, int cn); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY #if CV_SIMD // see the comments for vecmerge_ in merge.cpp @@ -169,8 +176,7 @@ split_( const T* src, T** dst, int len, int cn ) void split8u(const uchar* src, uchar** dst, int len, int cn ) { - CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn) - + CV_INSTRUMENT_REGION(); #if CV_SIMD if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); @@ -181,7 +187,7 @@ void split8u(const uchar* src, uchar** dst, int len, int cn ) void split16u(const ushort* src, ushort** dst, int len, int cn ) { - CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn) + CV_INSTRUMENT_REGION(); #if CV_SIMD if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); @@ -192,7 +198,7 @@ void split16u(const ushort* src, ushort** dst, int len, int cn ) void split32s(const int* src, int** dst, int len, int cn ) { - CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn) + CV_INSTRUMENT_REGION(); #if CV_SIMD if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); @@ -203,7 +209,7 @@ void split32s(const int* src, int** dst, int len, int cn ) void split64s(const int64* src, int64** dst, int len, int cn ) { - CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn) + CV_INSTRUMENT_REGION(); #if CV_SIMD if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) vecsplit_(src, dst, len, cn); @@ -212,202 +218,6 @@ void split64s(const int64* src, int64** dst, int len, int cn ) split_(src, dst, len, cn); } -}} // cv::hal:: - -/****************************************************************************************\ -* split & merge * -\****************************************************************************************/ - -typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn); - -static SplitFunc getSplitFunc(int depth) -{ - static SplitFunc splitTab[] = - { - (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), - (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0 - }; - - return splitTab[depth]; -} - -#ifdef HAVE_IPP - -namespace cv { -static bool ipp_split(const Mat& src, Mat* mv, int channels) -{ -#ifdef HAVE_IPP_IW_LL - CV_INSTRUMENT_REGION_IPP(); - - if(channels != 3 && channels != 4) - return false; - - if(src.dims <= 2) - { - IppiSize size = ippiSize(src.size()); - void *dstPtrs[4] = {NULL}; - size_t dstStep = mv[0].step; - for(int i = 0; i < channels; i++) - { - dstPtrs[i] = mv[i].ptr(); - if(dstStep != mv[i].step) - return false; - } - - return CV_INSTRUMENT_FUN_IPP(llwiCopySplit, src.ptr(), (int)src.step, dstPtrs, (int)dstStep, size, (int)src.elemSize1(), channels, 0) >= 0; - } - else - { - const Mat *arrays[5] = {NULL}; - uchar *ptrs[5] = {NULL}; - arrays[0] = &src; - - for(int i = 1; i < channels; i++) - { - arrays[i] = &mv[i-1]; - } - - NAryMatIterator it(arrays, ptrs); - IppiSize size = { (int)it.size, 1 }; - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - if(CV_INSTRUMENT_FUN_IPP(llwiCopySplit, ptrs[0], 0, (void**)&ptrs[1], 0, size, (int)src.elemSize1(), channels, 0) < 0) - return false; - } - return true; - } -#else - CV_UNUSED(src); CV_UNUSED(mv); CV_UNUSED(channels); - return false; #endif -} -} -#endif - -void cv::split(const Mat& src, Mat* mv) -{ - CV_INSTRUMENT_REGION(); - - int k, depth = src.depth(), cn = src.channels(); - if( cn == 1 ) - { - src.copyTo(mv[0]); - return; - } - - for( k = 0; k < cn; k++ ) - { - mv[k].create(src.dims, src.size, depth); - } - - CV_IPP_RUN_FAST(ipp_split(src, mv, cn)); - - SplitFunc func = getSplitFunc(depth); - CV_Assert( func != 0 ); - - size_t esz = src.elemSize(), esz1 = src.elemSize1(); - size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; - AutoBuffer _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16); - const Mat** arrays = (const Mat**)_buf.data(); - uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16); - - arrays[0] = &src; - for( k = 0; k < cn; k++ ) - { - arrays[k+1] = &mv[k]; - } - - NAryMatIterator it(arrays, ptrs, cn+1); - size_t total = it.size; - size_t blocksize = std::min((size_t)CV_SPLIT_MERGE_MAX_BLOCK_SIZE(cn), cn <= 4 ? total : std::min(total, blocksize0)); - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - for( size_t j = 0; j < total; j += blocksize ) - { - size_t bsz = std::min(total - j, blocksize); - func( ptrs[0], &ptrs[1], (int)bsz, cn ); - - if( j + blocksize < total ) - { - ptrs[0] += bsz*esz; - for( k = 0; k < cn; k++ ) - ptrs[k+1] += bsz*esz1; - } - } - } -} - -#ifdef HAVE_OPENCL - -namespace cv { - -static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv ) -{ - int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), - rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; - - String dstargs, processelem, indexdecl; - for (int i = 0; i < cn; ++i) - { - dstargs += format("DECLARE_DST_PARAM(%d)", i); - indexdecl += format("DECLARE_INDEX(%d)", i); - processelem += format("PROCESS_ELEM(%d)", i); - } - - ocl::Kernel k("split", ocl::core::split_merge_oclsrc, - format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s" - " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s", - ocl::memopTypeToStr(depth), cn, dstargs.c_str(), - processelem.c_str(), indexdecl.c_str())); - if (k.empty()) - return false; - - Size size = _m.size(); - _mv.create(cn, 1, depth); - for (int i = 0; i < cn; ++i) - _mv.create(size, depth, i); - - std::vector dst; - _mv.getUMatVector(dst); - - int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat())); - for (int i = 0; i < cn; ++i) - argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i])); - k.set(argidx, rowsPerWI); - - size_t globalsize[2] = { (size_t)size.width, ((size_t)size.height + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); -} - -} - -#endif - -void cv::split(InputArray _m, OutputArrayOfArrays _mv) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(), - ocl_split(_m, _mv)) - - Mat m = _m.getMat(); - if( m.empty() ) - { - _mv.release(); - return; - } - - CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() ); - - int depth = m.depth(), cn = m.channels(); - _mv.create(cn, 1, depth); - for (int i = 0; i < cn; ++i) - _mv.create(m.dims, m.size.p, depth, i); - - std::vector dst; - _mv.getMatVector(dst); - - split(m, &dst[0]); -} +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace \ No newline at end of file