diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index dcb39357a4..52cc010fa7 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -647,12 +647,15 @@ macro(ocv_compiler_optimization_fill_cpu_config) if(NOT DEFINED CPU_${OPT}_FEATURE_ALIAS OR NOT "x${CPU_${OPT}_FEATURE_ALIAS}" STREQUAL "x") set(OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE "${OPENCV_CPU_CONTROL_DEFINITIONS_CONFIGMAKE} #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_${OPT} +# define CV_TRY_${OPT} 1 # define CV_CPU_HAS_SUPPORT_${OPT} 1 # define CV_CPU_CALL_${OPT}(fn, args) return (opt_${OPT}::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_${OPT} +# define CV_TRY_${OPT} 1 # define CV_CPU_HAS_SUPPORT_${OPT} (cv::checkHardwareSupport(CV_CPU_${OPT})) # define CV_CPU_CALL_${OPT}(fn, args) if (CV_CPU_HAS_SUPPORT_${OPT}) return (opt_${OPT}::fn args) #else +# define CV_TRY_${OPT} 0 # define CV_CPU_HAS_SUPPORT_${OPT} 0 # define CV_CPU_CALL_${OPT}(fn, args) #endif diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h index 8bd0457242..6eaed9e661 100644 --- a/modules/core/include/opencv2/core/cv_cpu_helper.h +++ b/modules/core/include/opencv2/core/cv_cpu_helper.h @@ -1,144 +1,180 @@ // AUTOGENERATED, DO NOT EDIT #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE +# define CV_TRY_SSE 1 # define CV_CPU_HAS_SUPPORT_SSE 1 # define CV_CPU_CALL_SSE(fn, args) return (opt_SSE::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE +# define CV_TRY_SSE 1 # define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE)) # define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args) #else +# define CV_TRY_SSE 0 # define 
CV_CPU_HAS_SUPPORT_SSE 0 # define CV_CPU_CALL_SSE(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...) CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2 +# define CV_TRY_SSE2 1 # define CV_CPU_HAS_SUPPORT_SSE2 1 # define CV_CPU_CALL_SSE2(fn, args) return (opt_SSE2::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2 +# define CV_TRY_SSE2 1 # define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2)) # define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args) #else +# define CV_TRY_SSE2 0 # define CV_CPU_HAS_SUPPORT_SSE2 0 # define CV_CPU_CALL_SSE2(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...) CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3 +# define CV_TRY_SSE3 1 # define CV_CPU_HAS_SUPPORT_SSE3 1 # define CV_CPU_CALL_SSE3(fn, args) return (opt_SSE3::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3 +# define CV_TRY_SSE3 1 # define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3)) # define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args) #else +# define CV_TRY_SSE3 0 # define CV_CPU_HAS_SUPPORT_SSE3 0 # define CV_CPU_CALL_SSE3(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...) 
CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3 +# define CV_TRY_SSSE3 1 # define CV_CPU_HAS_SUPPORT_SSSE3 1 # define CV_CPU_CALL_SSSE3(fn, args) return (opt_SSSE3::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3 +# define CV_TRY_SSSE3 1 # define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3)) # define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args) #else +# define CV_TRY_SSSE3 0 # define CV_CPU_HAS_SUPPORT_SSSE3 0 # define CV_CPU_CALL_SSSE3(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...) CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1 +# define CV_TRY_SSE4_1 1 # define CV_CPU_HAS_SUPPORT_SSE4_1 1 # define CV_CPU_CALL_SSE4_1(fn, args) return (opt_SSE4_1::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1 +# define CV_TRY_SSE4_1 1 # define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1)) # define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args) #else +# define CV_TRY_SSE4_1 0 # define CV_CPU_HAS_SUPPORT_SSE4_1 0 # define CV_CPU_CALL_SSE4_1(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...) 
CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2 +# define CV_TRY_SSE4_2 1 # define CV_CPU_HAS_SUPPORT_SSE4_2 1 # define CV_CPU_CALL_SSE4_2(fn, args) return (opt_SSE4_2::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2 +# define CV_TRY_SSE4_2 1 # define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2)) # define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args) #else +# define CV_TRY_SSE4_2 0 # define CV_CPU_HAS_SUPPORT_SSE4_2 0 # define CV_CPU_CALL_SSE4_2(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...) CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT +# define CV_TRY_POPCNT 1 # define CV_CPU_HAS_SUPPORT_POPCNT 1 # define CV_CPU_CALL_POPCNT(fn, args) return (opt_POPCNT::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT +# define CV_TRY_POPCNT 1 # define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT)) # define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args) #else +# define CV_TRY_POPCNT 0 # define CV_CPU_HAS_SUPPORT_POPCNT 0 # define CV_CPU_CALL_POPCNT(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...) 
CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX +# define CV_TRY_AVX 1 # define CV_CPU_HAS_SUPPORT_AVX 1 # define CV_CPU_CALL_AVX(fn, args) return (opt_AVX::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX +# define CV_TRY_AVX 1 # define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX)) # define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args) #else +# define CV_TRY_AVX 0 # define CV_CPU_HAS_SUPPORT_AVX 0 # define CV_CPU_CALL_AVX(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...) CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16 +# define CV_TRY_FP16 1 # define CV_CPU_HAS_SUPPORT_FP16 1 # define CV_CPU_CALL_FP16(fn, args) return (opt_FP16::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16 +# define CV_TRY_FP16 1 # define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16)) # define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args) #else +# define CV_TRY_FP16 0 # define CV_CPU_HAS_SUPPORT_FP16 0 # define CV_CPU_CALL_FP16(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...) 
CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2 +# define CV_TRY_AVX2 1 # define CV_CPU_HAS_SUPPORT_AVX2 1 # define CV_CPU_CALL_AVX2(fn, args) return (opt_AVX2::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2 +# define CV_TRY_AVX2 1 # define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2)) # define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args) #else +# define CV_TRY_AVX2 0 # define CV_CPU_HAS_SUPPORT_AVX2 0 # define CV_CPU_CALL_AVX2(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...) CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3 +# define CV_TRY_FMA3 1 # define CV_CPU_HAS_SUPPORT_FMA3 1 # define CV_CPU_CALL_FMA3(fn, args) return (opt_FMA3::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3 +# define CV_TRY_FMA3 1 # define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3)) # define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args) #else +# define CV_TRY_FMA3 0 # define CV_CPU_HAS_SUPPORT_FMA3 0 # define CV_CPU_CALL_FMA3(fn, args) #endif #define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...) 
CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON +# define CV_TRY_NEON 1 # define CV_CPU_HAS_SUPPORT_NEON 1 # define CV_CPU_CALL_NEON(fn, args) return (opt_NEON::fn args) #elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON +# define CV_TRY_NEON 1 # define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON)) # define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args) #else +# define CV_TRY_NEON 0 # define CV_CPU_HAS_SUPPORT_NEON 0 # define CV_CPU_CALL_NEON(fn, args) #endif diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 9dac3e7072..c165a3a54b 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -315,7 +315,7 @@ public: int inpCnAll = input.size[1], width = input.size[3], height = input.size[2]; int inpCn = inpCnAll / ngroups; p.is1x1_ = kernel == Size(0,0) && pad == Size(0, 0); - p.useAVX2 = checkHardwareSupport(CPU_AVX2); + p.useAVX2 = CV_CPU_HAS_SUPPORT_AVX2; int ncn = std::min(inpCn, (int)BLK_SIZE_CN); p.ofstab_.resize(kernel.width*kernel.height*ncn); @@ -486,7 +486,7 @@ public: // now compute dot product of the weights // and im2row-transformed part of the tensor int bsz = ofs1 - ofs0; - #if CV_DNN_TRY_AVX2 + #if CV_TRY_AVX2 if(useAVX2) fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); @@ -776,7 +776,7 @@ public: b_ = &b; c_ = &c; nstripes_ = nstripes; - useAVX2 = checkHardwareSupport(CPU_AVX2); + useAVX2 = CV_CPU_HAS_SUPPORT_AVX2; } void operator()(const Range& range_) const @@ -794,7 +794,7 @@ public: size_t bstep = b_->step1(); size_t cstep = c_->step1(); - #if CV_DNN_TRY_AVX2 + #if CV_TRY_AVX2 if( useAVX2 ) fastGEMM_avx2( aptr, astep, 
bptr, bstep, cptr, cstep, mmax, kmax, nmax ); else diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index 9f790da63d..6a8b62d0f4 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -127,7 +127,7 @@ public: biasMat_ = &biasMat; dstMat_ = &dstMat; nstripes_ = nstripes; - useAVX2_ = checkHardwareSupport(CPU_AVX2); + useAVX2_ = CV_CPU_HAS_SUPPORT_AVX2; } void operator()(const Range& r) const @@ -161,7 +161,7 @@ public: memcpy(sptr, sptr_, vecsize*sizeof(sptr[0])); - #if CV_DNN_TRY_AVX2 + #if CV_TRY_AVX2 if( useAVX2_ ) fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else diff --git a/modules/dnn/src/layers/layers_common.avx2.cpp b/modules/dnn/src/layers/layers_common.avx2.cpp index 5efa834579..1171e83ff4 100644 --- a/modules/dnn/src/layers/layers_common.avx2.cpp +++ b/modules/dnn/src/layers/layers_common.avx2.cpp @@ -43,10 +43,6 @@ #include "layers_common.hpp" #include "opencv2/core/hal/intrin.hpp" -#if CV_DNN_TRY_AVX2 - -#include <immintrin.h> - namespace cv { namespace dnn { @@ -334,7 +330,6 @@ void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr, _mm256_storeu_ps(cptr3 + n + 8, d31); } } - _mm256_zeroupper(); for( ; n < nb; n++ ) { @@ -350,9 +345,8 @@ void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr, cptr0[n] = d0; } } + _mm256_zeroupper(); } } } - -#endif diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp index 7f46369888..e2d2f42ed0 100644 --- a/modules/dnn/src/layers/layers_common.hpp +++ b/modules/dnn/src/layers/layers_common.hpp @@ -63,9 +63,7 @@ void getConvPoolPaddings(const Size& inp, const Size& out, const Size &kernel, const Size &stride, const String &padMode, Size &pad); -#if CV_SSE2 -#define CV_DNN_TRY_AVX2 1 - +#if CV_TRY_AVX2 void fastConv_avx2(const float* weights, size_t wstep, const float* bias, const float* rowbuf, float* output, 
const int* outShape, int blockSize, int vecsize, int vecsize_aligned, @@ -76,9 +74,6 @@ void fastGEMM1T_avx2( const float* vec, const float* weights, void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0, size_t bstep, float* cptr, size_t cstep, int ma, int na, int nb ); - -#else -#define CV_DNN_TRY_AVX2 0 #endif }