diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index e0d3b959d0..8b335ad67c 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -9,6 +9,8 @@ endif() set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass") +ocv_add_dispatched_file("layers/layers_common" AVX AVX2) + ocv_add_module(dnn opencv_core opencv_imgproc WRAP python matlab java) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninitialized -Wsign-promo -Wmissing-declarations -Wmissing-prototypes diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 12e38c576b..6e09c8ca98 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -506,13 +506,13 @@ public: int bsz = ofs1 - ofs0; #if CV_TRY_AVX2 if(useAVX2) - fastConv_avx2(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); else #endif #if CV_TRY_AVX if(useAVX) - fastConv_avx(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + opt_AVX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, relu, cn0 == 0); else #endif @@ -824,12 +824,12 @@ public: #if CV_TRY_AVX2 if( useAVX2 ) - fastGEMM_avx2( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); + opt_AVX2::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); else #endif #if CV_TRY_AVX if( useAVX ) - fastGEMM_avx( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); + opt_AVX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax ); else #endif for( m = 0; m < mmax; m += 2 ) diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp index f27f39c660..9bec3b086f 100644 --- a/modules/dnn/src/layers/fully_connected_layer.cpp +++ b/modules/dnn/src/layers/fully_connected_layer.cpp @@ -177,12 +177,12 @@ public: #if CV_TRY_AVX2 if( useAVX2 ) - fastGEMM1T_avx2( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); + opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else #endif #if CV_TRY_AVX if( useAVX ) - fastGEMM1T_avx( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); + opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize); else #endif { @@ -191,19 +191,19 @@ public: #if CV_SIMD128 for( ; i <= nw - 4; i += 4, wptr += 4*wstep ) { - vfloat32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f); - vfloat32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f); + v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f); + v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f); for( k = 0; k < vecsize; k += 4 ) { - vfloat32x4 v = v_load_aligned(sptr + k); + v_float32x4 v = v_load_aligned(sptr + k); vs0 += v*v_load_aligned(wptr + k); vs1 += v*v_load_aligned(wptr + wstep + k); vs2 += v*v_load_aligned(wptr + wstep*2 + k); vs3 += v*v_load_aligned(wptr + wstep*3 + k); } - vfloat32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3); + v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3); s += v_load(biasptr + i); v_store(dptr + i, s); } diff --git a/modules/dnn/src/layers/layers_common.avx.cpp b/modules/dnn/src/layers/layers_common.avx.cpp deleted file mode 100644 index 4e0c034eae..0000000000 --- a/modules/dnn/src/layers/layers_common.avx.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2017, Intel Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include "layers_common.hpp" -#include "opencv2/core/hal/intrin.hpp" - -#define fastConv_some_avx fastConv_avx -#define fastGEMM1T_some_avx fastGEMM1T_avx -#define fastGEMM_some_avx fastGEMM_avx - -#undef _mm256_fmadd_ps -#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) - -#include "layers_common.simd.hpp" diff --git a/modules/dnn/src/layers/layers_common.avx2.cpp b/modules/dnn/src/layers/layers_common.avx2.cpp deleted file mode 100644 index ef8108cc25..0000000000 --- a/modules/dnn/src/layers/layers_common.avx2.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Copyright (C) 2017, Intel Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include "layers_common.hpp" -#include "opencv2/core/hal/intrin.hpp" - -#define fastConv_some_avx fastConv_avx2 -#define fastGEMM1T_some_avx fastGEMM1T_avx2 -#define fastGEMM_some_avx fastGEMM_avx2 - -#include "layers_common.simd.hpp" diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp index bbab2756f5..f34646af14 100644 --- a/modules/dnn/src/layers/layers_common.hpp +++ b/modules/dnn/src/layers/layers_common.hpp @@ -45,6 +45,10 @@ #include #include +// dispatched AVX/AVX2 optimizations +#include "layers/layers_common.simd.hpp" +#include "layers/layers_common.simd_declarations.hpp" + namespace cv { namespace dnn @@ -64,32 +68,6 @@ void getConvPoolPaddings(const Size& inp, const Size& out, const Size &kernel, const Size &stride, const String &padMode, Size &pad); -#if CV_TRY_AVX -void fastConv_avx(const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput); -void fastGEMM1T_avx( const float* vec, const float* weights, - size_t wstep, const float* bias, - float* dst, int nvecs, int vecsize ); -void fastGEMM_avx( const float* aptr, size_t astep, const float* bptr0, - size_t bstep, float* cptr, size_t cstep, - int ma, int na, int nb ); -#endif - -#if CV_TRY_AVX2 -void fastConv_avx2(const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput); -void fastGEMM1T_avx2( const float* vec, const float* weights, - size_t wstep, const float* bias, - float* dst, int nvecs, int vecsize ); -void fastGEMM_avx2( const float* aptr, size_t astep, const float* bptr0, - size_t bstep, float* cptr, size_t cstep, - int ma, int na, int nb ); -#endif - } } diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index 1110ed0933..9890587fde 100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -40,16 +40,34 @@ // //M*/ -#ifndef __DNN_LAYERS_COMMON_SIMD_HPP__ -#define __DNN_LAYERS_COMMON_SIMD_HPP__ +#include "opencv2/core/hal/intrin.hpp" namespace cv { namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void fastConv_some_avx( const float* weights, size_t wstep, const float* bias, - const float* rowbuf, float* output, const int* outShape, - int blockSize, int vecsize, int vecsize_aligned, - const float* relu, bool initOutput ) +void fastConv( const float* weights, size_t wstep, const float* bias, + const float* rowbuf, float* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, + const float* relu, bool initOutput ); +void fastGEMM1T( const float* vec, const float* weights, + size_t wstep, const float* bias, + float* dst, int nvecs, int vecsize ); +void fastGEMM( const float* aptr, size_t astep, const float* bptr, + size_t bstep, float* cptr, size_t cstep, + int ma, int na, int nb ); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX + +#if !CV_FMA // AVX workaround +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#endif + +void fastConv( const float* weights, size_t wstep, const float* bias, + const float* rowbuf, float* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, + const float* relu, bool initOutput ) { int outCn = outShape[1]; size_t outPlaneSize = outShape[2]*outShape[3]; @@ -214,9 +232,9 @@ void fastConv_some_avx( const float* weights, size_t wstep, const float* bias, } // dst = vec * weights^t + bias -void fastGEMM1T_some_avx( const float* vec, const float* weights, - size_t wstep, const float* bias, - float* dst, int nvecs, int vecsize ) +void fastGEMM1T( const float* vec, const float* weights, + size_t wstep, const float* bias, + float* dst, int nvecs, int vecsize ) { int i = 0; @@ -276,9 +294,9 @@ void fastGEMM1T_some_avx( const float* vec, const float* weights, _mm256_zeroupper(); } -void fastGEMM_some_avx( const float* aptr, size_t astep, const float* bptr, - size_t bstep, float* cptr, size_t cstep, - int ma, int na, int nb ) +void fastGEMM( const float* aptr, size_t astep, const float* bptr, + size_t bstep, float* cptr, size_t cstep, + int ma, int na, int nb ) { int n = 0; for( ; n <= nb - 16; n += 16 ) @@ -346,7 +364,7 @@ void fastGEMM_some_avx( const float* aptr, size_t astep, const float* bptr, _mm256_zeroupper(); } -} -} +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY -#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace