From e3633ec4a2d0ce7158c5336738c6ec262a80fa78 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 13 Feb 2019 18:27:55 +0300 Subject: [PATCH] core: dispatch count_non_zero --- modules/core/CMakeLists.txt | 1 + modules/core/src/count_non_zero.dispatch.cpp | 192 +------------------ modules/core/src/count_non_zero.simd.hpp | 178 ++--------------- 3 files changed, 24 insertions(+), 347 deletions(-) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index bfa2fd1d98..9559783d94 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -5,6 +5,7 @@ ocv_add_dispatched_file(stat SSE4_2 AVX2) ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3) ocv_add_dispatched_file(convert SSE2 AVX2) ocv_add_dispatched_file(convert_scale SSE2 AVX2) +ocv_add_dispatched_file(count_non_zero SSE2 AVX2) ocv_add_dispatched_file(sum SSE2 AVX2) # dispatching for accuracy tests diff --git a/modules/core/src/count_non_zero.dispatch.cpp b/modules/core/src/count_non_zero.dispatch.cpp index 202e7b846d..d0ce1ef989 100644 --- a/modules/core/src/count_non_zero.dispatch.cpp +++ b/modules/core/src/count_non_zero.dispatch.cpp @@ -7,190 +7,18 @@ #include "opencl_kernels_core.hpp" #include "stat.hpp" +#include "count_non_zero.simd.hpp" +#include "count_non_zero.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + namespace cv { -template -static int countNonZero_(const T* src, int len ) -{ - int i=0, nz = 0; - #if CV_ENABLE_UNROLLED - for(; i <= len - 4; i += 4 ) - nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0); - #endif - for( ; i < len; i++ ) - nz += src[i] != 0; - return nz; -} - -static int countNonZero8u( const uchar* src, int len ) -{ - int i=0, nz = 0; -#if CV_SIMD - int len0 = len & -v_uint8::nlanes; - v_uint8 v_zero = vx_setzero_u8(); - v_uint8 v_one = vx_setall_u8(1); - - v_uint32 v_sum32 = vx_setzero_u32(); - while (i < len0) - { - v_uint16 v_sum16 = vx_setzero_u16(); - int j = i; - while (j < std::min(len0, i + 65280 * v_uint16::nlanes)) - { - v_uint8 v_sum8 = vx_setzero_u8(); - int k = j; - for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes) - v_sum8 += v_one & (vx_load(src + k) == v_zero); - v_uint16 part1, part2; - v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; - j = k; - } - v_uint32 part1, part2; - v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; - i = j; - } - nz = i - v_reduce_sum(v_sum32); - v_cleanup(); -#endif - for( ; i < len; i++ ) - nz += src[i] != 0; - return nz; -} - -static int countNonZero16u( const ushort* src, int len ) -{ - int i = 0, nz = 0; -#if CV_SIMD - int len0 = len & -v_int8::nlanes; - v_uint16 v_zero = vx_setzero_u16(); - v_int8 v_one = vx_setall_s8(1); - - v_int32 v_sum32 = vx_setzero_s32(); - while (i < len0) - { - v_int16 v_sum16 = vx_setzero_s16(); - int j = i; - while (j < std::min(len0, i + 32766 * v_int16::nlanes)) - { - v_int8 v_sum8 = vx_setzero_s8(); - int k = j; - for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes) - v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero)); - v_int16 part1, part2; - v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; - j = k; - } - v_int32 part1, part2; - v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; - i = j; - } - nz = i - v_reduce_sum(v_sum32); - v_cleanup(); -#endif - return nz + countNonZero_(src + i, len - i); -} - -static int countNonZero32s( const int* src, int len ) -{ - int i = 0, nz = 0; -#if CV_SIMD - int len0 = len & -v_int8::nlanes; - v_int32 v_zero = vx_setzero_s32(); - v_int8 v_one = vx_setall_s8(1); - - v_int32 v_sum32 = vx_setzero_s32(); - while (i < len0) - { - v_int16 v_sum16 = vx_setzero_s16(); - int j = i; - while (j < std::min(len0, i + 32766 * v_int16::nlanes)) - { - v_int8 v_sum8 = vx_setzero_s8(); - int k = j; - for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes) - v_sum8 += v_one & v_pack( - v_pack(vx_load(src + k ) == v_zero, vx_load(src + k + v_int32::nlanes) == v_zero), - v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero) - ); - v_int16 part1, part2; - v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; - j = k; - } - v_int32 part1, part2; - v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; - i = j; - } - nz = i - v_reduce_sum(v_sum32); - v_cleanup(); -#endif - return nz + countNonZero_(src + i, len - i); -} - -static int countNonZero32f( const float* src, int len ) -{ - int i = 0, nz = 0; -#if CV_SIMD - int len0 = len & -v_int8::nlanes; - v_float32 v_zero = vx_setzero_f32(); - v_int8 v_one = vx_setall_s8(1); - - v_int32 v_sum32 = vx_setzero_s32(); - while (i < len0) - { - v_int16 v_sum16 = vx_setzero_s16(); - int j = i; - while (j < std::min(len0, i + 32766 * v_int16::nlanes)) - { - v_int8 v_sum8 = vx_setzero_s8(); - int k = j; - for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes) - v_sum8 += v_one & v_pack( - v_pack(v_reinterpret_as_s32(vx_load(src + k ) == v_zero), v_reinterpret_as_s32(vx_load(src + k + v_float32::nlanes) == v_zero)), - v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero)) - ); - v_int16 part1, part2; - v_expand(v_sum8, part1, part2); - v_sum16 += part1 + part2; - j = k; - } - v_int32 part1, part2; - v_expand(v_sum16, part1, part2); - v_sum32 += part1 + part2; - i = j; - } - nz = i - v_reduce_sum(v_sum32); - v_cleanup(); -#endif - return nz + countNonZero_(src + i, len - i); -} - -static int countNonZero64f( const double* src, int len ) -{ - return countNonZero_(src, len); -} - -typedef int (*CountNonZeroFunc)(const uchar*, int); - static CountNonZeroFunc getCountNonZeroTab(int depth) { - static CountNonZeroFunc countNonZeroTab[] = - { - (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), - (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), - (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f), - (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0 - }; - - return countNonZeroTab[depth]; + CV_INSTRUMENT_REGION(); + CV_CPU_DISPATCH(getCountNonZeroTab, (depth), + CV_CPU_DISPATCH_MODES_ALL); } - #ifdef HAVE_OPENCL static bool ocl_countNonZero( InputArray _src, int & res ) { @@ -288,9 +116,7 @@ static bool ipp_countNonZero( Mat &src, int &res ) } #endif -} // cv:: - -int cv::countNonZero( InputArray _src ) +int countNonZero(InputArray _src) { CV_INSTRUMENT_REGION(); @@ -324,7 +150,7 @@ int cv::countNonZero( InputArray _src ) return nz; } -void cv::findNonZero( InputArray _src, OutputArray _idx ) +void findNonZero(InputArray _src, OutputArray _idx) { CV_INSTRUMENT_REGION(); @@ -351,3 +177,5 @@ void cv::findNonZero( InputArray _src, OutputArray _idx ) *idx_ptr++ = Point(j, i); } } + +} // namespace diff --git a/modules/core/src/count_non_zero.simd.hpp b/modules/core/src/count_non_zero.simd.hpp index 202e7b846d..4c01c08850 100644 --- a/modules/core/src/count_non_zero.simd.hpp +++ b/modules/core/src/count_non_zero.simd.hpp @@ -2,13 +2,20 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html - #include "precomp.hpp" -#include "opencl_kernels_core.hpp" -#include "stat.hpp" namespace cv { +typedef int (*CountNonZeroFunc)(const uchar*, int); + + +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +CountNonZeroFunc getCountNonZeroTab(int depth); + + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + template static int countNonZero_(const T* src, int len ) { @@ -175,9 +182,7 @@ static int countNonZero64f( const double* src, int len ) return countNonZero_(src, len); } -typedef int (*CountNonZeroFunc)(const uchar*, int); - -static CountNonZeroFunc getCountNonZeroTab(int depth) +CountNonZeroFunc getCountNonZeroTab(int depth) { static CountNonZeroFunc countNonZeroTab[] = { @@ -190,164 +195,7 @@ static CountNonZeroFunc getCountNonZeroTab(int depth) return countNonZeroTab[depth]; } - -#ifdef HAVE_OPENCL -static bool ocl_countNonZero( InputArray _src, int & res ) -{ - int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - - if (depth == CV_64F && !doubleSupport) - return false; - - int dbsize = ocl::Device::getDefault().maxComputeUnits(); - size_t wgs = ocl::Device::getDefault().maxWorkGroupSize(); - - int wgs2_aligned = 1; - while (wgs2_aligned < (int)wgs) - wgs2_aligned <<= 1; - wgs2_aligned >>= 1; - - ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO" - " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s", - ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), - ocl::typeToStr(depth), (int)wgs, kercn, - wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - _src.isContinuous() ? " -D HAVE_SRC_CONT" : "")); - if (k.empty()) - return false; - - UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1); - k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(), - dbsize, ocl::KernelArg::PtrWriteOnly(db)); - - size_t globalsize = dbsize * wgs; - if (k.run(1, &globalsize, &wgs, true)) - return res = saturate_cast(cv::sum(db.getMat(ACCESS_READ))[0]), true; - return false; -} #endif -#if defined HAVE_IPP -static bool ipp_countNonZero( Mat &src, int &res ) -{ - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Poor performance of SSE42 - if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42) - return false; -#endif - - Ipp32s count = 0; - int depth = src.depth(); - - if(src.dims <= 2) - { - IppStatus status; - IppiSize size = {src.cols*src.channels(), src.rows}; - - if(depth == CV_8U) - status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0); - else if(depth == CV_32F) - status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0); - else - return false; - - if(status < 0) - return false; - - res = size.width*size.height - count; - } - else - { - IppStatus status; - const Mat *arrays[] = {&src, NULL}; - Mat planes[1]; - NAryMatIterator it(arrays, planes, 1); - IppiSize size = {(int)it.size*src.channels(), 1}; - res = 0; - for (size_t i = 0; i < it.nplanes; i++, ++it) - { - if(depth == CV_8U) - status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr(), (int)it.planes->step, size, &count, 0, 0); - else if(depth == CV_32F) - status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr(), (int)it.planes->step, size, &count, 0, 0); - else - return false; - - if(status < 0 || (int)it.planes->total()*src.channels() < count) - return false; - - res += (int)it.planes->total()*src.channels() - count; - } - } - - return true; -} -#endif - -} // cv:: - -int cv::countNonZero( InputArray _src ) -{ - CV_INSTRUMENT_REGION(); - - int type = _src.type(), cn = CV_MAT_CN(type); - CV_Assert( cn == 1 ); - -#if defined HAVE_OPENCL || defined HAVE_IPP - int res = -1; -#endif - -#ifdef HAVE_OPENCL - CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, - ocl_countNonZero(_src, res), - res) -#endif - - Mat src = _src.getMat(); - CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res); - - CountNonZeroFunc func = getCountNonZeroTab(src.depth()); - CV_Assert( func != 0 ); - - const Mat* arrays[] = {&src, 0}; - uchar* ptrs[1] = {}; - NAryMatIterator it(arrays, ptrs); - int total = (int)it.size, nz = 0; - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - nz += func( ptrs[0], total ); - - return nz; -} - -void cv::findNonZero( InputArray _src, OutputArray _idx ) -{ - CV_INSTRUMENT_REGION(); - - Mat src = _src.getMat(); - CV_Assert( src.type() == CV_8UC1 ); - int n = countNonZero(src); - if( n == 0 ) - { - _idx.release(); - return; - } - if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() ) - _idx.release(); - _idx.create(n, 1, CV_32SC2); - Mat idx = _idx.getMat(); - CV_Assert(idx.isContinuous()); - Point* idx_ptr = idx.ptr(); - - for( int i = 0; i < src.rows; i++ ) - { - const uchar* bin_ptr = src.ptr(i); - for( int j = 0; j < src.cols; j++ ) - if( bin_ptr[j] ) - *idx_ptr++ = Point(j, i); - } -} +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace