From 4f558e8b89d76eb5106a6cd57593751c78d20ca2 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 2 Jul 2017 22:43:34 +0000 Subject: [PATCH 1/8] cmake: added "SSE4_2" into default CPU dispatch --- cmake/OpenCVCompilerOptimizations.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 52cc010fa7..45a536bb28 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -238,7 +238,7 @@ if(X86 OR X86_64) endif() if(NOT DEFINED CPU_DISPATCH) - set(CPU_DISPATCH "SSE4_1;AVX;FP16;AVX2" CACHE STRING "${HELP_CPU_DISPATCH}") + set(CPU_DISPATCH "SSE4_1;SSE4_2;AVX;FP16;AVX2" CACHE STRING "${HELP_CPU_DISPATCH}") endif() if(NOT DEFINED CPU_BASELINE) From 03c3e0edcfd9b38f166d382ad7a1643bc02b68bb Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 2 Jul 2017 13:01:29 +0000 Subject: [PATCH 2/8] core(stat): stat.cpp minor refactoring - remove unused code - added: #if CV_ENABLE_UNROLLED in Hamming's functions --- modules/core/src/stat.cpp | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index d40e91af14..5ea3563444 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -53,16 +53,6 @@ namespace cv { -template static inline Scalar rawToScalar(const T& v) -{ - Scalar s; - typedef typename DataType::channel_type T1; - int i, n = DataType::channels; - for( i = 0; i < n; i++ ) - s.val[i] = ((T1*)&v)[i]; - return s; -} - /****************************************************************************************\ * sum * \****************************************************************************************/ @@ -4344,12 +4334,13 @@ int normHamming(const uchar* a, int n) result += v_reduce_sum(t); } #endif // CV_SIMD128 - +#if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4) { result += popCountTable[a[i]] + 
popCountTable[a[i+1]] + popCountTable[a[i+2]] + popCountTable[a[i+3]]; } +#endif for(; i < n; i++) { result += popCountTable[a[i]]; @@ -4415,12 +4406,13 @@ int normHamming(const uchar* a, const uchar* b, int n) result += v_reduce_sum(t); } #endif // CV_SIMD128 - +#if CV_ENABLE_UNROLLED for(; i <= n - 4; i += 4) { result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; } +#endif for(; i < n; i++) { result += popCountTable[a[i] ^ b[i]]; @@ -4463,11 +4455,11 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize) return -1; int i = 0; int result = 0; - #if CV_ENABLE_UNROLLED +#if CV_ENABLE_UNROLLED for( ; i <= n - 4; i += 4 ) result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] + tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]]; - #endif +#endif for( ; i < n; i++ ) result += tab[a[i] ^ b[i]]; return result; From 85afbd409be74e01c74e2d62dabb63944599c140 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 2 Jul 2017 13:07:58 +0000 Subject: [PATCH 3/8] core(stat): move implementations into .hpp file w/o changes --- modules/core/src/stat.cpp | 150 ----------------------------- modules/core/src/stat.simd.hpp | 170 +++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+), 150 deletions(-) create mode 100644 modules/core/src/stat.simd.hpp diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 5ea3563444..fd6e0a2f2c 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -4269,156 +4269,6 @@ static const uchar popCountTable4[] = 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; -#if CV_AVX2 -static inline int _mm256_extract_epi32_(__m256i reg, const int i) -{ - CV_DECL_ALIGNED(32) int reg_data[8]; - CV_DbgAssert(0 <= i && i < 8); - _mm256_store_si256((__m256i*)reg_data, reg); - return reg_data[i]; -} -#endif - -int normHamming(const uchar* a, int n) -{ - int i = 0; - int result = 0; 
-#if CV_AVX2 - if(USE_AVX2) - { - __m256i _r0 = _mm256_setzero_si256(); - __m256i _0 = _mm256_setzero_si256(); - __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); - __m256i _popcnt_mask = _mm256_set1_epi8(0x0F); - - for(; i <= n - 32; i+= 32) - { - __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i)); - - __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask)); - __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table, - _mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask)); - - _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1))); - } - _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); - result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); - } -#endif // CV_AVX2 - -#if CV_POPCNT - if(checkHardwareSupport(CV_CPU_POPCNT)) - { -# if defined CV_POPCNT_U64 - for(; i <= n - 8; i += 8) - { - result += (int)CV_POPCNT_U64(*(uint64*)(a + i)); - } -# endif - for(; i <= n - 4; i += 4) - { - result += CV_POPCNT_U32(*(uint*)(a + i)); - } - } -#endif // CV_POPCNT - -#if CV_SIMD128 - if(hasSIMD128()) - { - v_uint32x4 t = v_setzero_u32(); - for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) - { - t += v_popcount(v_load(a + i)); - } - result += v_reduce_sum(t); - } -#endif // CV_SIMD128 -#if CV_ENABLE_UNROLLED - for(; i <= n - 4; i += 4) - { - result += popCountTable[a[i]] + popCountTable[a[i+1]] + - popCountTable[a[i+2]] + popCountTable[a[i+3]]; - } -#endif - for(; i < n; i++) - { - result += popCountTable[a[i]]; - } - return result; -} - -int normHamming(const uchar* a, const uchar* b, int n) -{ - int i = 0; - int result = 0; -#if CV_AVX2 - if(USE_AVX2) - { - __m256i _r0 = _mm256_setzero_si256(); - __m256i _0 = _mm256_setzero_si256(); - __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 
3, 3, 4); - __m256i _popcnt_mask = _mm256_set1_epi8(0x0F); - - for(; i <= n - 32; i+= 32) - { - __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i)); - __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i)); - - __m256i _xor = _mm256_xor_si256(_a0, _b0); - - __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask)); - __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table, - _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask)); - - _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1))); - } - _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); - result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); - } -#endif // CV_AVX2 - -#if CV_POPCNT - if(checkHardwareSupport(CV_CPU_POPCNT)) - { -# if defined CV_POPCNT_U64 - for(; i <= n - 8; i += 8) - { - result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i)); - } -# endif - for(; i <= n - 4; i += 4) - { - result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); - } - } -#endif // CV_POPCNT - -#if CV_SIMD128 - if(hasSIMD128()) - { - v_uint32x4 t = v_setzero_u32(); - for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) - { - t += v_popcount(v_load(a + i) ^ v_load(b + i)); - } - result += v_reduce_sum(t); - } -#endif // CV_SIMD128 -#if CV_ENABLE_UNROLLED - for(; i <= n - 4; i += 4) - { - result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + - popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; - } -#endif - for(; i < n; i++) - { - result += popCountTable[a[i] ^ b[i]]; - } - return result; -} int normHamming(const uchar* a, int n, int cellSize) { diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp new file mode 100644 index 0000000000..25d909530a --- /dev/null +++ b/modules/core/src/stat.simd.hpp @@ -0,0 +1,170 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { namespace hal { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +// forward declarations +int normHamming(const uchar* a, int n); +int normHamming(const uchar* a, const uchar* b, int n); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if CV_AVX2 +static inline int _mm256_extract_epi32_(__m256i reg, const int i) +{ + CV_DECL_ALIGNED(32) int reg_data[8]; + CV_DbgAssert(0 <= i && i < 8); + _mm256_store_si256((__m256i*)reg_data, reg); + return reg_data[i]; +} +#endif + +int normHamming(const uchar* a, int n) +{ + int i = 0; + int result = 0; +#if CV_AVX2 + if(USE_AVX2) + { + __m256i _r0 = _mm256_setzero_si256(); + __m256i _0 = _mm256_setzero_si256(); + __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + __m256i _popcnt_mask = _mm256_set1_epi8(0x0F); + + for(; i <= n - 32; i+= 32) + { + __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i)); + + __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask)); + __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table, + _mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask)); + + _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1))); + } + _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); + result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); + } +#endif // CV_AVX2 + +#if CV_POPCNT + if(checkHardwareSupport(CV_CPU_POPCNT)) + { +# if defined CV_POPCNT_U64 + for(; i <= n - 8; i += 8) + { + result += (int)CV_POPCNT_U64(*(uint64*)(a + i)); + } +# endif + for(; i <= n - 4; i += 4) + { + result += CV_POPCNT_U32(*(uint*)(a + i)); + } + } +#endif // CV_POPCNT + +#if CV_SIMD128 + if(hasSIMD128()) + { + v_uint32x4 t = 
v_setzero_u32(); + for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) + { + t += v_popcount(v_load(a + i)); + } + result += v_reduce_sum(t); + } +#endif // CV_SIMD128 +#if CV_ENABLE_UNROLLED + for(; i <= n - 4; i += 4) + { + result += popCountTable[a[i]] + popCountTable[a[i+1]] + + popCountTable[a[i+2]] + popCountTable[a[i+3]]; + } +#endif + for(; i < n; i++) + { + result += popCountTable[a[i]]; + } + return result; +} + +int normHamming(const uchar* a, const uchar* b, int n) +{ + int i = 0; + int result = 0; +#if CV_AVX2 + if(USE_AVX2) + { + __m256i _r0 = _mm256_setzero_si256(); + __m256i _0 = _mm256_setzero_si256(); + __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); + __m256i _popcnt_mask = _mm256_set1_epi8(0x0F); + + for(; i <= n - 32; i+= 32) + { + __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i)); + __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i)); + + __m256i _xor = _mm256_xor_si256(_a0, _b0); + + __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask)); + __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table, + _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask)); + + _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1))); + } + _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2)); + result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0); + } +#endif // CV_AVX2 + +#if CV_POPCNT + if(checkHardwareSupport(CV_CPU_POPCNT)) + { +# if defined CV_POPCNT_U64 + for(; i <= n - 8; i += 8) + { + result += (int)CV_POPCNT_U64(*(uint64*)(a + i) ^ *(uint64*)(b + i)); + } +# endif + for(; i <= n - 4; i += 4) + { + result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i)); + } + } +#endif // CV_POPCNT + +#if CV_SIMD128 + if(hasSIMD128()) + { + v_uint32x4 t = v_setzero_u32(); + for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) + { + t += 
v_popcount(v_load(a + i) ^ v_load(b + i)); + } + result += v_reduce_sum(t); + } +#endif // CV_SIMD128 +#if CV_ENABLE_UNROLLED + for(; i <= n - 4; i += 4) + { + result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] + + popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]]; + } +#endif + for(; i < n; i++) + { + result += popCountTable[a[i] ^ b[i]]; + } + return result; +} + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} //cv::hal From 880052d3f3f573ddf8141860f87777e375012274 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 2 Jul 2017 13:13:32 +0000 Subject: [PATCH 4/8] core(stat): create dispatch.cpp file --- modules/core/src/stat.dispatch.cpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 modules/core/src/stat.dispatch.cpp diff --git a/modules/core/src/stat.dispatch.cpp b/modules/core/src/stat.dispatch.cpp new file mode 100644 index 0000000000..025c0929f0 --- /dev/null +++ b/modules/core/src/stat.dispatch.cpp @@ -0,0 +1,28 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "precomp.hpp" + +#include "stat.simd.hpp" +#include "stat.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +namespace cv { namespace hal { + +int normHamming(const uchar* a, int n) +{ + CV_INSTRUMENT_REGION() + + CV_CPU_DISPATCH(normHamming, (a, n), + CV_CPU_DISPATCH_MODES_ALL); +} + +int normHamming(const uchar* a, const uchar* b, int n) +{ + CV_INSTRUMENT_REGION() + + CV_CPU_DISPATCH(normHamming, (a, b, n), + CV_CPU_DISPATCH_MODES_ALL); +} + +}} //cv::hal From 6a6222d21c6d8308f208a0d09b8003afb4919890 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 2 Jul 2017 13:18:22 +0000 Subject: [PATCH 5/8] core(stat): remove useless checks --- modules/core/src/stat.simd.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp index 25d909530a..206817c5e3 100644 --- a/modules/core/src/stat.simd.hpp +++ b/modules/core/src/stat.simd.hpp @@ -28,7 +28,6 @@ int normHamming(const uchar* a, int n) int i = 0; int result = 0; #if CV_AVX2 - if(USE_AVX2) { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); @@ -52,7 +51,6 @@ int normHamming(const uchar* a, int n) #endif // CV_AVX2 #if CV_POPCNT - if(checkHardwareSupport(CV_CPU_POPCNT)) { # if defined CV_POPCNT_U64 for(; i <= n - 8; i += 8) @@ -68,7 +66,6 @@ int normHamming(const uchar* a, int n) #endif // CV_POPCNT #if CV_SIMD128 - if(hasSIMD128()) { v_uint32x4 t = v_setzero_u32(); for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) @@ -97,7 +94,6 @@ int normHamming(const uchar* a, const uchar* b, int n) int i = 0; int result = 0; #if CV_AVX2 - if(USE_AVX2) { __m256i _r0 = _mm256_setzero_si256(); __m256i _0 = _mm256_setzero_si256(); @@ -124,7 +120,6 @@ int normHamming(const uchar* a, const uchar* b, int n) #endif // CV_AVX2 #if CV_POPCNT - if(checkHardwareSupport(CV_CPU_POPCNT)) { # if defined CV_POPCNT_U64 for(; i <= n - 8; i += 8) @@ -140,7 +135,6 @@ int 
normHamming(const uchar* a, const uchar* b, int n) #endif // CV_POPCNT #if CV_SIMD128 - if(hasSIMD128()) { v_uint32x4 t = v_setzero_u32(); for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes) From c45d3568ae81d63ba0280122e6d5c7dde65fd323 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 2 Jul 2017 13:23:13 +0000 Subject: [PATCH 6/8] core(stat): register dispatched code, fix build --- modules/core/CMakeLists.txt | 1 + modules/core/src/stat.cpp | 2 +- modules/core/src/stat.simd.hpp | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 6de15ba6d1..9793e7181c 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,6 +1,7 @@ set(the_description "The Core Functionality") ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) +ocv_add_dispatched_file(stat SSE4_2 AVX AVX2) ocv_add_module(core "${OPENCV_HAL_LINKER_LIBS}" diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index fd6e0a2f2c..b802d0a637 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -4233,7 +4233,7 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr ) namespace cv { namespace hal { -static const uchar popCountTable[] = +extern const uchar popCountTable[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp index 206817c5e3..43867731e3 100644 --- a/modules/core/src/stat.simd.hpp +++ b/modules/core/src/stat.simd.hpp @@ -5,6 +5,9 @@ #include "opencv2/core/hal/intrin.hpp" namespace cv { namespace hal { + +extern const uchar popCountTable[256]; + CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN // forward declarations From b66c349bba98a3ff0c6d0339ad277844d1fb5fda Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 
2 Jul 2017 15:33:05 +0000 Subject: [PATCH 7/8] core(stat): add required CV_AVX_GUARD Added guard with 'vzeroupper' instruction --- modules/core/include/opencv2/core/cv_cpu_dispatch.h | 6 ++++++ modules/core/src/stat.simd.hpp | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index c41c21da60..779039de79 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -111,6 +111,12 @@ struct VZeroUpperGuard { #define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard; #endif +#ifdef __CV_AVX_GUARD +#define CV_AVX_GUARD __CV_AVX_GUARD +#else +#define CV_AVX_GUARD +#endif + #endif // __OPENCV_BUILD diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp index 43867731e3..b75100d3f4 100644 --- a/modules/core/src/stat.simd.hpp +++ b/modules/core/src/stat.simd.hpp @@ -28,6 +28,8 @@ static inline int _mm256_extract_epi32_(__m256i reg, const int i) int normHamming(const uchar* a, int n) { + CV_AVX_GUARD; + int i = 0; int result = 0; #if CV_AVX2 @@ -94,6 +96,8 @@ int normHamming(const uchar* a, int n) int normHamming(const uchar* a, const uchar* b, int n) { + CV_AVX_GUARD; + int i = 0; int result = 0; #if CV_AVX2 From b3f5e3bf94511aa01960da2bb3d60afdffa2d9fd Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Sun, 2 Jul 2017 15:49:38 +0000 Subject: [PATCH 8/8] core(stat): optimize size of binaries, drop AVX dispatch (no improvements) --- modules/core/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 9793e7181c..cd10920167 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,7 +1,7 @@ set(the_description "The Core Functionality") ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2) -ocv_add_dispatched_file(stat SSE4_2 AVX AVX2) +ocv_add_dispatched_file(stat 
SSE4_2 AVX2) ocv_add_module(core "${OPENCV_HAL_LINKER_LIBS}"